Commit 6834424

use iter api
1 parent 6adf461 commit 6834424

8 files changed: +69 -157 lines changed

fastfield_codecs/benches/bench.rs (+2 -1)

@@ -100,7 +100,8 @@ mod tests {

     fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> {
         let mut out = vec![];
-        serialize_u128(VecColumn::from(&data), &mut out).unwrap();
+        let iter = || data.iter().cloned();
+        serialize_u128(iter, data.len() as u64, &mut out).unwrap();
         let out = OwnedBytes::new(out);
         open_u128::<u128>(out).unwrap()
     }

fastfield_codecs/src/compact_space/mod.rs (+11 -6)

@@ -171,10 +171,10 @@ pub struct IPCodecParams {

 impl CompactSpaceCompressor {
     /// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
-    pub fn train_from(column: &impl Column<u128>) -> Self {
+    pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u64) -> Self {
         let mut values_sorted = BTreeSet::new();
-        values_sorted.extend(column.iter());
-        let total_num_values = column.num_vals();
+        values_sorted.extend(iter);
+        let total_num_values = num_vals;

         let compact_space =
             get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);

@@ -443,7 +443,7 @@ impl CompactSpaceDecompressor {
 mod tests {

     use super::*;
-    use crate::{open_u128, serialize_u128, VecColumn};
+    use crate::{open_u128, serialize_u128};

     #[test]
     fn compact_space_test() {

@@ -513,7 +513,12 @@ mod tests {

     fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
         let mut out = Vec::new();
-        serialize_u128(VecColumn::from(u128_vals), &mut out).unwrap();
+        serialize_u128(
+            || u128_vals.iter().cloned(),
+            u128_vals.len() as u64,
+            &mut out,
+        )
+        .unwrap();

         let data = OwnedBytes::new(out);
         test_all(data.clone(), u128_vals);

@@ -603,7 +608,7 @@ mod tests {
             5_000_000_000,
         ];
         let mut out = Vec::new();
-        serialize_u128(VecColumn::from(vals), &mut out).unwrap();
+        serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap();
         let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();

         assert_eq!(decomp.get_between_vals(199..=200), vec![0]);

fastfield_codecs/src/main.rs (+5 -2)

@@ -90,7 +90,7 @@ fn bench_ip() {
     {
         let mut data = vec![];
         for dataset in dataset.chunks(500_000) {
-            serialize_u128(VecColumn::from(dataset), &mut data).unwrap();
+            serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
         }
         let compression = data.len() as f64 / (dataset.len() * 16) as f64;
         println!("Compression 50_000 chunks {:.4}", compression);

@@ -101,7 +101,10 @@ fn bench_ip() {
     }

     let mut data = vec![];
-    serialize_u128(VecColumn::from(&dataset), &mut data).unwrap();
+    {
+        print_time!("creation");
+        serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
+    }

     let compression = data.len() as f64 / (dataset.len() * 16) as f64;
     println!("Compression {:.2}", compression);

fastfield_codecs/src/serialize.rs (+5 -6)

@@ -142,15 +142,14 @@ pub fn estimate<T: MonotonicallyMappableToU64>(
     }
 }

-pub fn serialize_u128(
-    typed_column: impl Column<u128>,
+pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
+    iter_gen: F,
+    num_vals: u64,
     output: &mut impl io::Write,
 ) -> io::Result<()> {
     // TODO write header, to later support more codecs
-    let compressor = CompactSpaceCompressor::train_from(&typed_column);
-    compressor
-        .compress_into(typed_column.iter(), output)
-        .unwrap();
+    let compressor = CompactSpaceCompressor::train_from(iter_gen(), num_vals);
+    compressor.compress_into(iter_gen(), output).unwrap();

     Ok(())
 }
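
Note on the new API: serialize_u128 now takes an iterator factory plus an explicit value count instead of a Column<u128>. The factory is invoked twice, once by CompactSpaceCompressor::train_from to build the compact space and once by compress_into to emit the compressed values, so the closure must be callable more than once. A minimal round-trip sketch of the call shape (the crate paths, the Column import, and the OwnedBytes import are assumptions based on the test code above, not part of this commit):

use fastfield_codecs::{open_u128, serialize_u128, Column};
use ownedbytes::OwnedBytes;

fn roundtrip(vals: &[u128]) {
    let mut out = Vec::new();
    // The closure is called once for training and once for compression.
    serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap();
    let column = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
    assert_eq!(column.num_vals(), vals.len() as u64);
}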

src/fastfield/multivalued/writer.rs (+2 -9)

@@ -409,15 +409,8 @@ impl MultiValueU128FastFieldWriter {
         {
             let field_write = serializer.get_field_writer(self.field, 1);

-            let mut values = Vec::with_capacity(self.vals.len());
-            for vals in self.get_ordered_values(doc_id_map) {
-                for &val in vals {
-                    values.push(val);
-                }
-            }
-            let col = VecColumn::from(&values[..]);
-
-            serialize_u128(col, field_write)?;
+            let iter = || self.get_ordered_values(doc_id_map).flatten().cloned();
+            serialize_u128(iter, self.vals.len() as u64, field_write)?;
         }
         Ok(())
     }

src/fastfield/writer.rs (+23 -57)

@@ -364,66 +364,32 @@ impl U128FastFieldWriter {
             }
         }

-        struct RemappedFFWriter<'a> {
-            doc_id_map: Option<&'a DocIdMapping>,
-            null_values: &'a RoaringBitmap,
-            vals: &'a [u128],
-            idx_to_val_idx: Vec<u32>,
-            val_count: u32,
-        }
-        impl<'a> Column<u128> for RemappedFFWriter<'a> {
-            fn get_val(&self, _idx: u64) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn min_value(&self) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn max_value(&self) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
+        let field_write = serializer.get_field_writer(self.field, 0);

-            fn num_vals(&self) -> u64 {
-                self.val_count as u64
-            }
-            fn iter(&self) -> Box<dyn Iterator<Item = u128> + '_> {
-                if let Some(doc_id_map) = self.doc_id_map {
-                    let iter = doc_id_map.iter_old_doc_ids().map(|idx| {
-                        if self.null_values.contains(idx as u32) {
-                            0 // TODO properly handle nulls
-                        } else {
-                            self.vals[self.idx_to_val_idx[idx as usize] as usize]
-                        }
-                    });
-                    Box::new(iter)
-                } else {
-                    let iter = (0..self.val_count).map(|idx| {
-                        if self.null_values.contains(idx as u32) {
-                            0 // TODO properly handle nulls
-                        } else {
-                            self.vals[self.idx_to_val_idx[idx as usize] as usize]
-                        }
-                    });
-                    Box::new(iter)
-                }
-            }
+        if let Some(doc_id_map) = doc_id_map {
+            let iter = || {
+                doc_id_map.iter_old_doc_ids().map(|idx| {
+                    if self.null_values.contains(idx as u32) {
+                        0 // TODO properly handle nulls
+                    } else {
+                        self.vals[idx_to_val_idx[idx as usize] as usize]
+                    }
+                })
+            };
+            serialize_u128(iter, self.val_count as u64, field_write)?;
+        } else {
+            let iter = || {
+                (0..self.val_count).map(|idx| {
+                    if self.null_values.contains(idx as u32) {
+                        0 // TODO properly handle nulls
+                    } else {
+                        self.vals[idx_to_val_idx[idx as usize] as usize]
+                    }
+                })
+            };
+            serialize_u128(iter, self.val_count as u64, field_write)?;
         }

-        let column = RemappedFFWriter {
-            doc_id_map,
-            null_values: &self.null_values,
-            vals: &self.vals,
-            idx_to_val_idx,
-            val_count: self.val_count,
-        };
-
-        let field_write = serializer.get_field_writer(self.field, 0);
-        serialize_u128(column, field_write)?;
-
         Ok(())
     }
 }

src/indexer/merger.rs (+15 -76)

@@ -354,49 +354,16 @@ impl IndexMerger {
             .map(|(_, ff_reader)| ff_reader)
             .collect::<Vec<_>>();

-        struct RemappedFFReader<'a> {
-            doc_id_mapping: &'a SegmentDocIdMapping,
-            fast_field_readers: Vec<MultiValuedU128FastFieldReader<u128>>,
-        }
-        impl<'a> Column<u128> for RemappedFFReader<'a> {
-            fn get_val(&self, _idx: u64) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn min_value(&self) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn max_value(&self) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn num_vals(&self) -> u64 {
-                self.doc_id_mapping.len() as u64
-            }
-            fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u128> + 'b> {
-                Box::new(
-                    self.doc_id_mapping
-                        .iter_old_doc_addrs()
-                        .flat_map(|doc_addr| {
-                            let fast_field_reader =
-                                &self.fast_field_readers[doc_addr.segment_ord as usize];
-                            let mut out = vec![];
-                            fast_field_reader.get_vals(doc_addr.doc_id, &mut out);
-                            out.into_iter()
-                        }),
-                )
-            }
-        }
-        let column = RemappedFFReader {
-            doc_id_mapping,
-            fast_field_readers,
+        let iter = || {
+            doc_id_mapping.iter_old_doc_addrs().flat_map(|doc_addr| {
+                let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize];
+                let mut out = vec![];
+                fast_field_reader.get_vals(doc_addr.doc_id, &mut out);
+                out.into_iter()
+            })
         };
         let field_write = fast_field_serializer.get_field_writer(field, 1);
-        serialize_u128(column, field_write)?;
+        serialize_u128(iter, doc_id_mapping.len() as u64, field_write)?;

         Ok(())
     }

@@ -420,42 +387,14 @@ impl IndexMerger {
             })
             .collect::<Vec<_>>();

-        struct RemappedFFReader<'a> {
-            doc_id_mapping: &'a SegmentDocIdMapping,
-            fast_field_readers: Vec<Arc<dyn Column<u128>>>,
-        }
-        impl<'a> Column<u128> for RemappedFFReader<'a> {
-            fn get_val(&self, _idx: u64) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn min_value(&self) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn max_value(&self) -> u128 {
-                // unused by codec
-                unreachable!()
-            }
-
-            fn num_vals(&self) -> u64 {
-                self.doc_id_mapping.len() as u64
-            }
-            fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u128> + 'b> {
-                Box::new(self.doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
-                    let fast_field_reader = &self.fast_field_readers[doc_addr.segment_ord as usize];
-                    fast_field_reader.get_val(doc_addr.doc_id as u64)
-                }))
-            }
-        }
-        let column = RemappedFFReader {
-            doc_id_mapping,
-            fast_field_readers,
-        };
         let field_write = fast_field_serializer.get_field_writer(field, 0);
-        serialize_u128(column, field_write)?;
+        let iter = || {
+            doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
+                let fast_field_reader = &fast_field_readers[doc_addr.segment_ord as usize];
+                fast_field_reader.get_val(doc_addr.doc_id as u64)
+            })
+        };
+        fastfield_codecs::serialize_u128(iter, doc_id_mapping.len() as u64, field_write)?;
         Ok(())
     }
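
Because serialize_u128 is now generic over an iterator-producing closure, the merge path can simply borrow doc_id_mapping and fast_field_readers from the enclosing scope instead of wrapping them in a RemappedFFReader that implements Column<u128>. A standalone sketch of the same remapping pattern, using illustrative names and plain slices rather than the merger's actual types:

// Illustrative only: serialize values in a remapped order without
// materializing an intermediate Vec<u128>.
fn serialize_remapped(
    new_to_old: &[usize],
    vals: &[u128],
    out: &mut Vec<u8>,
) -> std::io::Result<()> {
    // Each call to the closure yields a fresh iterator over the remapped values.
    let iter = || new_to_old.iter().map(|&old_idx| vals[old_idx]);
    fastfield_codecs::serialize_u128(iter, new_to_old.len() as u64, out)
}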

src/schema/document.rs (+6)

@@ -1,6 +1,7 @@
 use std::collections::{HashMap, HashSet};
 use std::io::{self, Read, Write};
 use std::mem;
+use std::net::IpAddr;

 use common::{BinarySerializable, VInt};


@@ -97,6 +98,11 @@ impl Document {
         self.add_field_value(field, value);
     }

+    /// Add an IP address field
+    pub fn add_ip(&mut self, field: Field, value: IpAddr) {
+        self.add_field_value(field, value);
+    }
+
     /// Add a i64 field
     pub fn add_i64(&mut self, field: Field, value: i64) {
         self.add_field_value(field, value);
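
A short usage sketch for the new helper; the field is hypothetical and would come from a schema builder, and Document::default() is assumed to be available as in the rest of the crate:

use std::net::{IpAddr, Ipv4Addr};
use tantivy::schema::{Document, Field};

// Hypothetical helper: attach a single ip value to the given field.
fn make_ip_doc(ip_field: Field) -> Document {
    let mut doc = Document::default();
    doc.add_ip(ip_field, IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)));
    doc
}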
