Skip to content

Commit c11ddec

Browse files
committed
docid deltas while indexing
storing deltas is especially helpful for repetitive data like logs. In those cases, recording a doc on a term costed 4 bytes instead of 1 byte now. HDFS Indexing 1.1GB Total memory consumption: Before: 760 MB Now: 590 MB
1 parent 4837c78 commit c11ddec

File tree

1 file changed

+43
-15
lines changed

1 file changed

+43
-15
lines changed

src/postings/recorder.rs

+43-15
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,14 @@ impl Recorder for DocIdRecorder {
105105

106106
#[inline]
107107
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
108+
let delta = if self.current_doc == u32::MAX {
109+
doc
110+
} else {
111+
doc - self.current_doc
112+
};
113+
108114
self.current_doc = doc;
109-
self.stack.writer(arena).write_u32_vint(doc);
115+
self.stack.writer(arena).write_u32_vint(delta);
110116
}
111117

112118
#[inline]
@@ -125,18 +131,22 @@ impl Recorder for DocIdRecorder {
125131
let (buffer, doc_ids) = buffer_lender.lend_all();
126132
self.stack.read_to_end(arena, buffer);
127133
// TODO avoid reading twice.
134+
let mut prev_doc = 0;
128135
if let Some(doc_id_map) = doc_id_map {
129-
doc_ids.extend(
130-
VInt32Reader::new(&buffer[..])
131-
.map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id)),
132-
);
136+
doc_ids.extend(VInt32Reader::new(&buffer[..]).map(|delta_doc_id| {
137+
let old_doc_id = prev_doc + delta_doc_id;
138+
prev_doc = old_doc_id;
139+
doc_id_map.get_new_doc_id(old_doc_id)
140+
}));
133141
doc_ids.sort_unstable();
134142

135143
for doc in doc_ids {
136144
serializer.write_doc(*doc, 0u32, &[][..]);
137145
}
138146
} else {
139-
for doc in VInt32Reader::new(&buffer[..]) {
147+
for delta_doc_id in VInt32Reader::new(&buffer[..]) {
148+
let doc = prev_doc + delta_doc_id;
149+
prev_doc = doc;
140150
serializer.write_doc(doc, 0u32, &[][..]);
141151
}
142152
}
@@ -164,9 +174,14 @@ impl Recorder for TermFrequencyRecorder {
164174

165175
#[inline]
166176
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
177+
let delta = if self.current_doc == u32::MAX {
178+
doc
179+
} else {
180+
doc - self.current_doc
181+
};
167182
self.term_doc_freq += 1;
168183
self.current_doc = doc;
169-
self.stack.writer(arena).write_u32_vint(doc);
184+
self.stack.writer(arena).write_u32_vint(delta);
170185
}
171186

172187
#[inline]
@@ -191,21 +206,26 @@ impl Recorder for TermFrequencyRecorder {
191206
let buffer = buffer_lender.lend_u8();
192207
self.stack.read_to_end(arena, buffer);
193208
let mut u32_it = VInt32Reader::new(&buffer[..]);
209+
let mut prev_doc = 0;
194210
if let Some(doc_id_map) = doc_id_map {
195211
let mut doc_id_and_tf = vec![];
196-
while let Some(old_doc_id) = u32_it.next() {
212+
while let Some(delta_doc_id) = u32_it.next() {
213+
let doc_id = prev_doc + delta_doc_id;
214+
prev_doc = doc_id;
197215
let term_freq = u32_it.next().unwrap_or(self.current_tf);
198-
doc_id_and_tf.push((doc_id_map.get_new_doc_id(old_doc_id), term_freq));
216+
doc_id_and_tf.push((doc_id_map.get_new_doc_id(doc_id), term_freq));
199217
}
200218
doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id);
201219

202220
for (doc_id, tf) in doc_id_and_tf {
203221
serializer.write_doc(doc_id, tf, &[][..]);
204222
}
205223
} else {
206-
while let Some(doc) = u32_it.next() {
224+
while let Some(delta_doc_id) = u32_it.next() {
225+
let doc_id = prev_doc + delta_doc_id;
226+
prev_doc = doc_id;
207227
let term_freq = u32_it.next().unwrap_or(self.current_tf);
208-
serializer.write_doc(doc, term_freq, &[][..]);
228+
serializer.write_doc(doc_id, term_freq, &[][..]);
209229
}
210230
}
211231
}
@@ -241,9 +261,14 @@ impl Recorder for TfAndPositionRecorder {
241261

242262
#[inline]
243263
fn new_doc(&mut self, doc: DocId, arena: &mut MemoryArena) {
264+
let delta = if self.current_doc == u32::MAX {
265+
doc
266+
} else {
267+
doc - self.current_doc
268+
};
244269
self.current_doc = doc;
245270
self.term_doc_freq += 1u32;
246-
self.stack.writer(arena).write_u32_vint(doc);
271+
self.stack.writer(arena).write_u32_vint(delta);
247272
}
248273

249274
#[inline]
@@ -269,7 +294,10 @@ impl Recorder for TfAndPositionRecorder {
269294
self.stack.read_to_end(arena, buffer_u8);
270295
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
271296
let mut doc_id_and_positions = vec![];
272-
while let Some(doc) = u32_it.next() {
297+
let mut prev_doc = 0;
298+
while let Some(delta_doc_id) = u32_it.next() {
299+
let doc_id = prev_doc + delta_doc_id;
300+
prev_doc = doc_id;
273301
let mut prev_position_plus_one = 1u32;
274302
buffer_positions.clear();
275303
loop {
@@ -287,9 +315,9 @@ impl Recorder for TfAndPositionRecorder {
287315
if let Some(doc_id_map) = doc_id_map {
288316
// this simple variant to remap may consume to much memory
289317
doc_id_and_positions
290-
.push((doc_id_map.get_new_doc_id(doc), buffer_positions.to_vec()));
318+
.push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec()));
291319
} else {
292-
serializer.write_doc(doc, buffer_positions.len() as u32, buffer_positions);
320+
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
293321
}
294322
}
295323
if doc_id_map.is_some() {

0 commit comments

Comments
 (0)