@@ -82,21 +82,12 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
82
82
}
83
83
84
84
/// Only records the doc ids
85
- #[ derive( Clone , Copy ) ]
85
+ #[ derive( Clone , Copy , Default ) ]
86
86
pub struct DocIdRecorder {
87
87
stack : ExpUnrolledLinkedList ,
88
88
current_doc : DocId ,
89
89
}
90
90
91
- impl Default for DocIdRecorder {
92
- fn default ( ) -> Self {
93
- DocIdRecorder {
94
- stack : ExpUnrolledLinkedList :: default ( ) ,
95
- current_doc : u32:: MAX ,
96
- }
97
- }
98
- }
99
-
100
91
impl Recorder for DocIdRecorder {
101
92
#[ inline]
102
93
fn current_doc ( & self ) -> DocId {
@@ -105,8 +96,9 @@ impl Recorder for DocIdRecorder {
105
96
106
97
#[ inline]
107
98
fn new_doc ( & mut self , doc : DocId , arena : & mut MemoryArena ) {
99
+ let delta = doc - self . current_doc ;
108
100
self . current_doc = doc;
109
- self . stack . writer ( arena) . write_u32_vint ( doc ) ;
101
+ self . stack . writer ( arena) . write_u32_vint ( delta ) ;
110
102
}
111
103
112
104
#[ inline]
@@ -123,21 +115,20 @@ impl Recorder for DocIdRecorder {
123
115
buffer_lender : & mut BufferLender ,
124
116
) {
125
117
let ( buffer, doc_ids) = buffer_lender. lend_all ( ) ;
126
- self . stack . read_to_end ( arena, buffer) ;
127
118
// TODO avoid reading twice.
119
+ self . stack . read_to_end ( arena, buffer) ;
128
120
if let Some ( doc_id_map) = doc_id_map {
129
- doc_ids. extend (
130
- VInt32Reader :: new ( & buffer[ ..] )
131
- . map ( |old_doc_id| doc_id_map. get_new_doc_id ( old_doc_id) ) ,
132
- ) ;
121
+ let iter = get_sum_reader ( VInt32Reader :: new ( & buffer[ ..] ) ) ;
122
+ doc_ids. extend ( iter. map ( |old_doc_id| doc_id_map. get_new_doc_id ( old_doc_id) ) ) ;
133
123
doc_ids. sort_unstable ( ) ;
134
124
135
125
for doc in doc_ids {
136
126
serializer. write_doc ( * doc, 0u32 , & [ ] [ ..] ) ;
137
127
}
138
128
} else {
139
- for doc in VInt32Reader :: new ( & buffer[ ..] ) {
140
- serializer. write_doc ( doc, 0u32 , & [ ] [ ..] ) ;
129
+ let iter = get_sum_reader ( VInt32Reader :: new ( & buffer[ ..] ) ) ;
130
+ for doc_id in iter {
131
+ serializer. write_doc ( doc_id, 0u32 , & [ ] [ ..] ) ;
141
132
}
142
133
}
143
134
}
@@ -147,6 +138,15 @@ impl Recorder for DocIdRecorder {
147
138
}
148
139
}
149
140
141
/// Turns an iterator of delta-encoded values back into absolute values.
///
/// The returned iterator yields the running total of all deltas consumed so
/// far, starting from `0`: for the input `3, 0, 4` it yields `3, 3, 7`.
/// An empty input yields nothing.
fn get_sum_reader(iter: impl Iterator<Item = u32>) -> impl Iterator<Item = u32> {
    // Accumulator owned by the closure; it carries the total across items.
    let mut running_total = 0u32;
    iter.map(move |delta| {
        running_total += delta;
        running_total
    })
}
149
+
150
150
/// Recorder encoding document ids, and term frequencies
151
151
#[ derive( Clone , Copy , Default ) ]
152
152
pub struct TermFrequencyRecorder {
@@ -164,9 +164,10 @@ impl Recorder for TermFrequencyRecorder {
164
164
165
165
#[ inline]
166
166
fn new_doc ( & mut self , doc : DocId , arena : & mut MemoryArena ) {
167
+ let delta = doc - self . current_doc ;
167
168
self . term_doc_freq += 1 ;
168
169
self . current_doc = doc;
169
- self . stack . writer ( arena) . write_u32_vint ( doc ) ;
170
+ self . stack . writer ( arena) . write_u32_vint ( delta ) ;
170
171
}
171
172
172
173
#[ inline]
@@ -193,19 +194,25 @@ impl Recorder for TermFrequencyRecorder {
193
194
let mut u32_it = VInt32Reader :: new ( & buffer[ ..] ) ;
194
195
if let Some ( doc_id_map) = doc_id_map {
195
196
let mut doc_id_and_tf = vec ! [ ] ;
196
- while let Some ( old_doc_id) = u32_it. next ( ) {
197
+ let mut prev_doc = 0 ;
198
+ while let Some ( delta_doc_id) = u32_it. next ( ) {
199
+ let doc_id = prev_doc + delta_doc_id;
200
+ prev_doc = doc_id;
197
201
let term_freq = u32_it. next ( ) . unwrap_or ( self . current_tf ) ;
198
- doc_id_and_tf. push ( ( doc_id_map. get_new_doc_id ( old_doc_id ) , term_freq) ) ;
202
+ doc_id_and_tf. push ( ( doc_id_map. get_new_doc_id ( doc_id ) , term_freq) ) ;
199
203
}
200
204
doc_id_and_tf. sort_unstable_by_key ( |& ( doc_id, _) | doc_id) ;
201
205
202
206
for ( doc_id, tf) in doc_id_and_tf {
203
207
serializer. write_doc ( doc_id, tf, & [ ] [ ..] ) ;
204
208
}
205
209
} else {
206
- while let Some ( doc) = u32_it. next ( ) {
210
+ let mut prev_doc = 0 ;
211
+ while let Some ( delta_doc_id) = u32_it. next ( ) {
212
+ let doc_id = prev_doc + delta_doc_id;
213
+ prev_doc = doc_id;
207
214
let term_freq = u32_it. next ( ) . unwrap_or ( self . current_tf ) ;
208
- serializer. write_doc ( doc , term_freq, & [ ] [ ..] ) ;
215
+ serializer. write_doc ( doc_id , term_freq, & [ ] [ ..] ) ;
209
216
}
210
217
}
211
218
}
@@ -216,23 +223,13 @@ impl Recorder for TermFrequencyRecorder {
216
223
}
217
224
218
225
/// Recorder encoding term frequencies as well as positions.
219
- #[ derive( Clone , Copy ) ]
226
+ #[ derive( Clone , Copy , Default ) ]
220
227
pub struct TfAndPositionRecorder {
221
228
stack : ExpUnrolledLinkedList ,
222
229
current_doc : DocId ,
223
230
term_doc_freq : u32 ,
224
231
}
225
232
226
- impl Default for TfAndPositionRecorder {
227
- fn default ( ) -> Self {
228
- TfAndPositionRecorder {
229
- stack : ExpUnrolledLinkedList :: default ( ) ,
230
- current_doc : u32:: MAX ,
231
- term_doc_freq : 0u32 ,
232
- }
233
- }
234
- }
235
-
236
233
impl Recorder for TfAndPositionRecorder {
237
234
#[ inline]
238
235
fn current_doc ( & self ) -> DocId {
@@ -241,9 +238,10 @@ impl Recorder for TfAndPositionRecorder {
241
238
242
239
#[ inline]
243
240
fn new_doc ( & mut self , doc : DocId , arena : & mut MemoryArena ) {
241
+ let delta = doc - self . current_doc ;
244
242
self . current_doc = doc;
245
243
self . term_doc_freq += 1u32 ;
246
- self . stack . writer ( arena) . write_u32_vint ( doc ) ;
244
+ self . stack . writer ( arena) . write_u32_vint ( delta ) ;
247
245
}
248
246
249
247
#[ inline]
@@ -269,7 +267,10 @@ impl Recorder for TfAndPositionRecorder {
269
267
self . stack . read_to_end ( arena, buffer_u8) ;
270
268
let mut u32_it = VInt32Reader :: new ( & buffer_u8[ ..] ) ;
271
269
let mut doc_id_and_positions = vec ! [ ] ;
272
- while let Some ( doc) = u32_it. next ( ) {
270
+ let mut prev_doc = 0 ;
271
+ while let Some ( delta_doc_id) = u32_it. next ( ) {
272
+ let doc_id = prev_doc + delta_doc_id;
273
+ prev_doc = doc_id;
273
274
let mut prev_position_plus_one = 1u32 ;
274
275
buffer_positions. clear ( ) ;
275
276
loop {
@@ -287,9 +288,9 @@ impl Recorder for TfAndPositionRecorder {
287
288
if let Some ( doc_id_map) = doc_id_map {
288
289
// this simple remapping variant may consume too much memory
289
290
doc_id_and_positions
290
- . push ( ( doc_id_map. get_new_doc_id ( doc ) , buffer_positions. to_vec ( ) ) ) ;
291
+ . push ( ( doc_id_map. get_new_doc_id ( doc_id ) , buffer_positions. to_vec ( ) ) ) ;
291
292
} else {
292
- serializer. write_doc ( doc , buffer_positions. len ( ) as u32 , buffer_positions) ;
293
+ serializer. write_doc ( doc_id , buffer_positions. len ( ) as u32 , buffer_positions) ;
293
294
}
294
295
}
295
296
if doc_id_map. is_some ( ) {
0 commit comments