use core::fmt::Debug;

-use columnar::{ColumnType, DynamicColumn};
+use columnar::{ColumnIndex, DynamicColumn};

-use super::range_query::VecCursor;
-use super::ConstScorer;
+use super::{ConstScorer, EmptyScorer};
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
-use crate::schema::FieldType;
use crate::{DocId, Score, TantivyError};

/// Query that matches all documents with a non-null value in the specified field.
///
/// All of the matching documents get the score 1.0.
#[derive(Clone, Debug)]
pub struct ExistsQuery {
-    field: String,
+    field_name: String,
}

impl ExistsQuery {
@@ -25,62 +23,55 @@ impl ExistsQuery {
    /// If the value type is not correct, something may go terribly wrong when
    /// the `Weight` object is created.
    pub fn new_exists_query(field: String) -> ExistsQuery {
-        ExistsQuery { field }
+        ExistsQuery { field_name: field }
    }
}

impl Query for ExistsQuery {
    fn weight(&self, enable_scoring: EnableScoring) -> crate::Result<Box<dyn Weight>> {
        let schema = enable_scoring.schema();
-        let field_type = schema
-            .get_field_entry(schema.get_field(&self.field)?)
-            .field_type();
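+        // Resolve the field name, which may address a path inside a JSON field
+        // (e.g. "json.all"), into the schema field and the remaining JSON path.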
+        let Some((field, _path)) = schema.find_field(&self.field_name) else {
+            return Err(TantivyError::FieldNotFound(self.field_name.clone()));
+        };
+        let field_type = schema.get_field_entry(field).field_type();
        if !field_type.is_fast() {
            return Err(TantivyError::SchemaError(format!(
-                "Field {:?} is not a fast field.",
-                self.field
+                "Field {} is not a fast field.",
+                self.field_name
            )));
        }
        Ok(Box::new(ExistsWeight {
-            field: self.field.clone(),
-            field_type: field_type.clone(),
+            field_name: self.field_name.clone(),
        }))
    }
}

/// Weight associated with the `ExistsQuery` query.
pub struct ExistsWeight {
-    field: String,
-    field_type: FieldType,
+    field_name: String,
}

impl ExistsWeight {}

impl Weight for ExistsWeight {
    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
        let fast_field_reader = reader.fast_fields();
-        let column_type = match self.field_type {
-            FieldType::Str(_) => Some(ColumnType::Str),
-            FieldType::U64(_) => Some(ColumnType::U64),
-            FieldType::I64(_) => Some(ColumnType::I64),
-            FieldType::F64(_) => Some(ColumnType::F64),
-            FieldType::Bool(_) => Some(ColumnType::Bool),
-            FieldType::Date(_) => Some(ColumnType::DateTime),
-            FieldType::Bytes(_) => Some(ColumnType::Bytes),
-            FieldType::IpAddr(_) => Some(ColumnType::IpAddr),
-            _ => None,
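+        // A single field (in particular a JSON field) can be backed by several dynamic
+        // columns, e.g. one per value type; open them all and keep only the non-empty ones.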
+        let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader
+            .dynamic_column_handles(&self.field_name)?
+            .into_iter()
+            .map(|handle| handle.open().map_err(|io_error| io_error.into()))
+            .collect();
+        let mut non_empty_columns = Vec::new();
+        for column in dynamic_columns? {
+            if !matches!(column.column_index(), ColumnIndex::Empty { .. }) {
+                non_empty_columns.push(column)
+            }
        }
-        .expect("Should be here");
-        if let Some(dynamic_column) =
-            fast_field_reader.dynamic_column_handle(&self.field, column_type)?
-        {
-            let docset = ExistsDocSet::new(dynamic_column.open()?);
+        if !non_empty_columns.is_empty() {
+            let docset = ExistsDocSet::new(non_empty_columns, reader.max_doc());
            return Ok(Box::new(ConstScorer::new(docset, boost)));
        } else {
-            return Err(TantivyError::SchemaError(format!(
-                "Field {:?} with type {:?} is not supported by exists query.",
-                self.field, self.field_type,
-            )));
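+            // The field has no value in this segment: match nothing instead of
+            // returning an error as before.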
+            return Ok(Box::new(EmptyScorer));
        }
    }

@@ -94,147 +85,64 @@ impl Weight for ExistsWeight {
}

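+/// A `DocSet` over all documents that have a value in at least one of the given columns.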
pub(crate) struct ExistsDocSet {
-    column: DynamicColumn,
-    /// The next docid start range to fetch (inclusive).
-    next_fetch_start: u32,
-    /// Number of docs range checked in a batch.
-    ///
-    /// There are two patterns.
-    /// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
-    ///   will come, so we start with small chunks
-    /// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
-    ///   should load small chunks. When the seeks are small, we can employ the same strategy as
-    ///   on a full scan.
-    fetch_horizon: u32,
-    /// Current batch of loaded docs.
-    loaded_docs: VecCursor,
-    last_seek_pos_opt: Option<u32>,
+    columns: Vec<DynamicColumn>,
+    doc: DocId,
+    max_doc: DocId,
}

-const DEFAULT_FETCH_HORIZON: u32 = 128;
impl ExistsDocSet {
-    pub(crate) fn new(column: DynamicColumn) -> Self {
-        let mut exists_doc_set = Self {
-            column,
-            loaded_docs: VecCursor::new(),
-            next_fetch_start: 0,
-            fetch_horizon: DEFAULT_FETCH_HORIZON,
-            last_seek_pos_opt: None,
+    pub(crate) fn new(columns: Vec<DynamicColumn>, max_doc: DocId) -> Self {
+        let mut set = Self {
+            columns,
+            doc: 0u32,
+            max_doc,
        };
-        exists_doc_set.reset_fetch_range();
-        exists_doc_set.fetch_block();
-        exists_doc_set
+        set.find_next();
+        set
    }

-    /// Returns true if more data could be fetched
-    fn fetch_block(&mut self) {
-        const MAX_HORIZON: u32 = 100_000;
-        while self.loaded_docs.is_empty() {
-            let finished_to_end = self.fetch_horizon(self.fetch_horizon);
-            if finished_to_end {
-                break;
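+    /// Advances `self.doc` to the next document (starting at the current position, inclusive)
+    /// that has a value in at least one column and returns it, or `TERMINATED` if none is left.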
+    fn find_next(&mut self) -> DocId {
+        // TODO: can this be optimized?
+        while self.doc < self.max_doc {
+            if self
+                .columns
+                .iter()
+                .find(|col| col.column_index().has_value(self.doc))
+                .is_some()
+            {
+                return self.doc;
            }
-            // Fetch more data, increase horizon. Horizon only gets reset when doing a seek.
-            self.fetch_horizon = (self.fetch_horizon * 2).min(MAX_HORIZON);
+            self.doc += 1;
        }
-    }
-
-    /// Fetches a block for docid range [next_fetch_start .. next_fetch_start + HORIZON]
-    fn fetch_horizon(&mut self, horizon: u32) -> bool {
-        let mut finished_to_end = false;
-
-        let limit = self.column.num_docs();
-        let mut end = self.next_fetch_start + horizon;
-        if end >= limit {
-            end = limit;
-            finished_to_end = true;
-        }
-
-        let last_value = self.loaded_docs.last_value();
-        let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
-        self.column
-            .column_index()
-            .get_docids_with_existing_value(self.next_fetch_start..end, doc_buffer);
-        if let Some(last_value) = last_value {
-            while self.loaded_docs.current() == Some(last_value) {
-                self.loaded_docs.next();
-            }
-        }
-        self.next_fetch_start = end;
-
-        finished_to_end
-    }
-
-    /// check if the distance between the seek calls is large
-    fn is_last_seek_distance_large(&self, new_seek: DocId) -> bool {
-        if let Some(last_seek_pos) = self.last_seek_pos_opt {
-            (new_seek - last_seek_pos) >= 128
-        } else {
-            true
-        }
-    }
-
-    fn reset_fetch_range(&mut self) {
-        self.fetch_horizon = DEFAULT_FETCH_HORIZON;
+        self.doc = TERMINATED;
+        return TERMINATED;
    }
}

impl DocSet for ExistsDocSet {
    fn advance(&mut self) -> DocId {
-        if let Some(docid) = self.loaded_docs.next() {
-            return docid;
-        }
-        if self.next_fetch_start >= self.column.num_values() {
-            return TERMINATED;
-        }
-        self.fetch_block();
-        self.loaded_docs.current().unwrap_or(TERMINATED)
-    }
-
-    #[inline]
-    fn doc(&self) -> DocId {
-        self.loaded_docs.current().unwrap_or(TERMINATED)
-    }
-
-    /// Advances the `DocSet` forward until reaching the target, or going to the
-    /// lowest [`DocId`] greater than the target.
-    ///
-    /// If the end of the `DocSet` is reached, [`TERMINATED`] is returned.
-    ///
-    /// Calling `.seek(target)` on a terminated `DocSet` is legal. Implementation
-    /// of `DocSet` should support it.
-    ///
-    /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
-    fn seek(&mut self, target: DocId) -> DocId {
-        if self.is_last_seek_distance_large(target) {
-            self.reset_fetch_range();
-        }
-        if target > self.next_fetch_start {
-            self.next_fetch_start = target;
-        }
-        let mut doc = self.doc();
-        debug_assert!(doc <= target);
-        while doc < target {
-            doc = self.advance();
-        }
-        self.last_seek_pos_opt = Some(target);
-        doc
+        self.doc += 1;
+        self.find_next()
    }

    fn size_hint(&self) -> u32 {
        0 // heuristic possible by checking number of hits when fetching a block
    }
+
+    fn doc(&self) -> DocId {
+        self.doc
+    }
}

#[cfg(test)]
mod tests {
    use crate::collector::Count;
    use crate::query::exist_query::ExistsQuery;
-    use crate::schema::{Schema, FAST, INDEXED, STRING};
+    use crate::schema::{Schema, FAST, INDEXED, STRING, TEXT};
    use crate::{doc, Index};
+    use serde_json::json;

    #[test]
-    fn test_range_query_simple() -> crate::Result<()> {
+    fn test_exists_query_simple() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let all_field = schema_builder.add_u64_field("all", INDEXED | FAST);
        let even_field = schema_builder.add_u64_field("even", INDEXED | FAST);
@@ -284,4 +192,41 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn test_exists_query_json() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let json = schema_builder.add_json_field("json", TEXT | FAST);
+        let schema = schema_builder.build();
+
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_for_tests()?;
+            for i in 0u64..100u64 {
+                if i % 2 == 0 {
+                    index_writer.add_document(doc!(json => json!({"all": i, "even": true})))?;
+                } else {
+                    index_writer
+                        .add_document(doc!(json => json!({"all": i.to_string(), "odd": true})))?;
+                }
+            }
+            index_writer.commit()?;
+        }
+        let reader = index.reader()?;
+        let searcher = reader.searcher();
+
+        let alldocs = ExistsQuery::new_exists_query("json.all".to_string());
+        let count = searcher.search(&alldocs, &Count)?;
+        assert_eq!(count, 100);
+
+        let even_docs = ExistsQuery::new_exists_query("json.even".to_string());
+        let count = searcher.search(&even_docs, &Count)?;
+        assert_eq!(count, 50);
+
+        let odd_docs = ExistsQuery::new_exists_query("json.odd".to_string());
+        let count = searcher.search(&odd_docs, &Count)?;
+        assert_eq!(count, 50);
+
+        Ok(())
+    }
}
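
For orientation, here is a condensed, self-contained sketch of how the query behaves after this change, based on the new test_exists_query_json test above. It is illustrative only: it assumes ExistsQuery is re-exported under tantivy::query and that serde_json is available for the json! macro; the memory budget and field names are arbitrary.

use serde_json::json;
use tantivy::collector::Count;
use tantivy::query::ExistsQuery;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // A JSON fast field: ExistsQuery can now target paths inside it, e.g. "json.even".
    let mut schema_builder = Schema::builder();
    let json_field = schema_builder.add_json_field("json", TEXT | FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(json_field => json!({"even": true})))?;
    writer.add_document(doc!(json_field => json!({"odd": true})))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    // Counts only the documents that have a value under the "json.even" path.
    let query = ExistsQuery::new_exists_query("json.even".to_string());
    assert_eq!(searcher.search(&query, &Count)?, 1);
    // A path present in no document now matches nothing instead of returning an error.
    let missing = ExistsQuery::new_exists_query("json.missing".to_string());
    assert_eq!(searcher.search(&missing, &Count)?, 0);
    Ok(())
}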