//! Interface for single-file readers

pub mod builder;
+pub mod capabilities;
pub mod output;

use async_trait::async_trait;
use output::FileReaderOutputRecv;
use polars_core::schema::SchemaRef;
use polars_error::PolarsResult;
+use polars_io::RowIndex;
+use polars_io::predicates::ScanIOPredicate;
use polars_utils::IdxSize;
use polars_utils::slice_enum::Slice;

-use super::extra_ops::apply::ApplyExtraOps;
+use super::extra_ops::cast_columns::CastColumnsPolicy;
+use super::extra_ops::missing_columns::MissingColumnsPolicy;
use crate::async_executor::JoinHandle;

/// Interface to read a single file
@@ -19,24 +23,12 @@ pub trait FileReader: Send + Sync {
    /// Initialize this FileReader. Intended to allow the reader to pre-fetch metadata.
    ///
    /// This must be called before calling any other functions of the FileReader.
-    ///
-    /// Returns the schema of the morsels that this FileReader will return.
    async fn initialize(&mut self) -> PolarsResult<()>;

    /// Begin reading the file into morsels.
    fn begin_read(
        &self,
-        // Note: This may contain more columns that what exist in the file. The reader should project
-        // the ones that it finds. The remaining ones will be handled in post.
-        projected_schema: &SchemaRef,
-        extra_ops: ApplyExtraOps,
-
-        num_pipelines: usize,
-        callbacks: FileReaderCallbacks,
-        // TODO
-        // We could introduce dynamic `Option<Box<dyn Any>>` for the reader to use. That would help
-        // with e.g. synchronizing row group prefetches across multiple files in Parquet. Currently
-        // every reader started concurrently will prefetch up to the row group prefetch limit.
+        args: BeginReadArgs,
    ) -> PolarsResult<(FileReaderOutputRecv, JoinHandle<PolarsResult<()>>)>;

    /// This FileReader must be initialized before calling this.
@@ -46,15 +38,16 @@ pub trait FileReader: Send + Sync {
    async fn n_rows_in_file(&self) -> PolarsResult<IdxSize> {
        let (tx, rx) = tokio::sync::oneshot::channel();

-        let (morsel_receivers, handle) = self.begin_read(
-            &Default::default(), // pass empty schema
-            ApplyExtraOps::Noop,
-            1,
-            FileReaderCallbacks {
+        let (morsel_receivers, handle) = self.begin_read(BeginReadArgs {
+            // Passing a 0-0 slice indicates to the reader that we want the full row count,
+            // but it can skip actually reading the data if it is able to.
+            pre_slice: Some(Slice::Positive { offset: 0, len: 0 }),
+            callbacks: FileReaderCallbacks {
                n_rows_in_file_tx: Some(tx),
                ..Default::default()
            },
-        )?;
+            ..Default::default()
+        })?;

        drop(morsel_receivers);

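// Illustrative sketch, not part of this diff: driving a reader through the
// initialize-then-query flow described above. `print_row_count` is a hypothetical
// helper; any `FileReader` implementation could be passed in.
async fn print_row_count(reader: &mut dyn FileReader) -> PolarsResult<()> {
    // Must be called before any other FileReader method.
    reader.initialize().await?;
    // Uses the default implementation above, which starts a read with a 0-0 slice
    // and waits on the `n_rows_in_file_tx` callback.
    let n_rows = reader.n_rows_in_file().await?;
    eprintln!("rows in file: {n_rows}");
    Ok(())
}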
@@ -75,15 +68,14 @@ pub trait FileReader: Send + Sync {

        let (tx, rx) = tokio::sync::oneshot::channel();

-        let (mut morsel_receivers, handle) = self.begin_read(
-            &Default::default(), // pass empty schema
-            ApplyExtraOps::Noop,
-            1,
-            FileReaderCallbacks {
+        let (mut morsel_receivers, handle) = self.begin_read(BeginReadArgs {
+            pre_slice,
+            callbacks: FileReaderCallbacks {
                row_position_on_end_tx: Some(tx),
                ..Default::default()
            },
-        )?;
+            ..Default::default()
+        })?;

        // We are using the `row_position_on_end` callback, which means we must fully consume all
        // of the morsels sent by the reader.
@@ -102,7 +94,54 @@ pub trait FileReader: Send + Sync {
    }
}

-#[derive(Default)]
+#[derive(Debug)]
+pub struct BeginReadArgs {
+    /// Columns to project from the file.
+    pub projected_schema: SchemaRef,
+
+    pub row_index: Option<RowIndex>,
+    pub pre_slice: Option<Slice>,
+    pub predicate: Option<ScanIOPredicate>,
+
+    /// User-configured policy for when datatypes do not match.
+    ///
+    /// A reader may wish to use this if it is applying predicates.
+    ///
+    /// This can be ignored by the reader, as the policy is also applied in post.
+    #[expect(unused)]
+    pub cast_columns_policy: CastColumnsPolicy,
+    /// User-configured policy for when columns are not found in the file.
+    ///
+    /// A reader may wish to use this if it is applying predicates.
+    ///
+    /// This can be ignored by the reader, as the policy is also applied in post.
+    pub missing_columns_policy: MissingColumnsPolicy,
+
+    pub num_pipelines: usize,
+    pub callbacks: FileReaderCallbacks,
+    // TODO
+    // We could introduce dynamic `Option<Box<dyn Any>>` for the reader to use. That would help
+    // with e.g. synchronizing row group prefetches across multiple files in Parquet. Currently
+    // every reader started concurrently will prefetch up to the row group prefetch limit.
+}
+
+impl Default for BeginReadArgs {
+    fn default() -> Self {
+        BeginReadArgs {
+            projected_schema: SchemaRef::default(),
+            row_index: None,
+            pre_slice: None,
+            predicate: None,
+            // TODO: Use less restrictive default
+            cast_columns_policy: CastColumnsPolicy::ErrorOnMismatch,
+            missing_columns_policy: MissingColumnsPolicy::Insert,
+            num_pipelines: 1,
+            callbacks: FileReaderCallbacks::default(),
+        }
+    }
+}
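// Illustrative sketch, not part of this diff: the `Default` impl above is what lets
// call sites (like the default trait methods) fill in only the fields they care about
// via struct-update syntax. `example_args` and its parameters are hypothetical.
fn example_args(projected_schema: SchemaRef, num_pipelines: usize) -> BeginReadArgs {
    BeginReadArgs {
        projected_schema,
        // Only read the first 100 rows of the file.
        pre_slice: Some(Slice::Positive { offset: 0, len: 100 }),
        num_pipelines,
        // Leave row_index, predicate, the policies and callbacks at their defaults.
        ..Default::default()
    }
}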
+
+#[derive(Debug, Default)]
/// We have this to avoid a footgun of accidentally swapping the arguments.
pub struct FileReaderCallbacks {
    /// Full file schema
@@ -112,10 +151,10 @@ pub struct FileReaderCallbacks {
    /// on the source. Prefer instead to use `row_position_on_end`, which can be much faster.
    ///
    /// Notes:
+    /// * All readers must ensure that this count is sent if requested, even if the output port
+    ///   closes prematurely, or a slice is sent.
    /// * Some readers will only send this after their output morsels have been fully consumed (or if
    ///   their output port is dropped), so you should not block morsel consumption on waiting for this.
-    /// * All readers must ensure that this count is sent if requested, even if the output port
-    ///   closes prematurely.
    pub n_rows_in_file_tx: Option<tokio::sync::oneshot::Sender<IdxSize>>,

    /// Returns the row position reached by this reader.
@@ -139,19 +178,7 @@ pub fn calc_row_position_after_slice(n_rows_in_file: IdxSize, pre_slice: Option<

    let out = match pre_slice {
        None => n_rows_in_file,
-
-        Some(Slice::Positive { offset, len }) => {
-            let slice_end = offset.saturating_add(len);
-            n_rows_in_file.min(slice_end)
-        },
-
-        Some(Slice::Negative {
-            offset_from_end,
-            len,
-        }) => {
-            let n_from_end = offset_from_end.saturating_sub(len);
-            n_rows_in_file.saturating_sub(n_from_end)
-        },
+        Some(v) => v.restrict_to_bounds(n_rows_in_file).end_position(),
    };

    IdxSize::try_from(out).unwrap_or(IdxSize::MAX)
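// Illustrative sketch, not part of this diff: a few worked values for
// `calc_row_position_after_slice`, assuming `restrict_to_bounds(..).end_position()`
// preserves the arithmetic of the removed match arms (positive: min(n, offset + len);
// negative: n - (offset_from_end - len), saturating). The test module name is hypothetical.
#[cfg(test)]
mod row_position_after_slice_examples {
    use super::*;

    #[test]
    fn worked_examples() {
        // No slice: the reader ends at the last row of the file.
        assert_eq!(calc_row_position_after_slice(100, None), 100);
        // Positive slice of rows [10, 15): the reader stops at row 15.
        assert_eq!(
            calc_row_position_after_slice(100, Some(Slice::Positive { offset: 10, len: 5 })),
            15
        );
        // Negative slice covering rows [80, 85) of a 100-row file: the reader stops at row 85.
        assert_eq!(
            calc_row_position_after_slice(
                100,
                Some(Slice::Negative {
                    offset_from_end: 20,
                    len: 5
                })
            ),
            85
        );
    }
}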