Skip to content

Commit 3650d1f

Browse files
authored
Merge pull request #1553 from quickwit-oss/ip_field
ip field
2 parents e443ca6 + 5c9cbee commit 3650d1f

30 files changed

+1285
-124
lines changed

common/src/serialize.rs

+13
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,19 @@ impl FixedSize for u64 {
107107
const SIZE_IN_BYTES: usize = 8;
108108
}
109109

110+
impl BinarySerializable for u128 {
111+
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
112+
writer.write_u128::<Endianness>(*self)
113+
}
114+
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
115+
reader.read_u128::<Endianness>()
116+
}
117+
}
118+
119+
impl FixedSize for u128 {
120+
const SIZE_IN_BYTES: usize = 16;
121+
}
122+
110123
impl BinarySerializable for f32 {
111124
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
112125
writer.write_f32::<Endianness>(*self)

fastfield_codecs/benches/bench.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,10 @@ mod tests {
100100

101101
fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> {
102102
let mut out = vec![];
103-
serialize_u128(VecColumn::from(&data), &mut out).unwrap();
103+
let iter_gen = || data.iter().cloned();
104+
serialize_u128(iter_gen, data.len() as u64, &mut out).unwrap();
104105
let out = OwnedBytes::new(out);
105-
open_u128(out).unwrap()
106+
open_u128::<u128>(out).unwrap()
106107
}
107108

108109
#[bench]

fastfield_codecs/src/column.rs

+74-34
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ use std::ops::RangeInclusive;
33

44
use tantivy_bitpacker::minmax;
55

6+
use crate::monotonic_mapping::StrictlyMonotonicFn;
7+
68
pub trait Column<T: PartialOrd = u64>: Send + Sync {
79
/// Return the value associated with the given idx.
810
///
@@ -143,16 +145,30 @@ struct MonotonicMappingColumn<C, T, Input> {
143145
_phantom: PhantomData<Input>,
144146
}
145147

146-
/// Creates a view of a column transformed by a monotonic mapping.
147-
pub fn monotonic_map_column<C, T, Input: PartialOrd, Output: PartialOrd>(
148+
/// Creates a view of a column transformed by a strictly monotonic mapping. See
149+
/// [`StrictlyMonotonicFn`].
150+
///
151+
/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
152+
/// monotonic_mapping.mapping() is expected to be injective, and we should always have
153+
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
154+
///
155+
/// The inverse of the mapping is required for:
156+
/// `fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> `
157+
/// The user provides the original value range and we need to monotonic map them in the same way the
158+
/// serialization does before calling the underlying column.
159+
///
160+
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
161+
/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
162+
/// monotonic_mapping during serialization.
163+
pub fn monotonic_map_column<C, T, Input, Output>(
148164
from_column: C,
149165
monotonic_mapping: T,
150166
) -> impl Column<Output>
151167
where
152168
C: Column<Input>,
153-
T: Fn(Input) -> Output + Send + Sync,
154-
Input: Send + Sync,
155-
Output: Send + Sync,
169+
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
170+
Input: PartialOrd + Send + Sync + Clone,
171+
Output: PartialOrd + Send + Sync + Clone,
156172
{
157173
MonotonicMappingColumn {
158174
from_column,
@@ -161,36 +177,46 @@ where
161177
}
162178
}
163179

164-
impl<C, T, Input: PartialOrd, Output: PartialOrd> Column<Output>
165-
for MonotonicMappingColumn<C, T, Input>
180+
impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
166181
where
167182
C: Column<Input>,
168-
T: Fn(Input) -> Output + Send + Sync,
169-
Input: Send + Sync,
170-
Output: Send + Sync,
183+
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
184+
Input: PartialOrd + Send + Sync + Clone,
185+
Output: PartialOrd + Send + Sync + Clone,
171186
{
172187
#[inline]
173188
fn get_val(&self, idx: u64) -> Output {
174189
let from_val = self.from_column.get_val(idx);
175-
(self.monotonic_mapping)(from_val)
190+
self.monotonic_mapping.mapping(from_val)
176191
}
177192

178193
fn min_value(&self) -> Output {
179194
let from_min_value = self.from_column.min_value();
180-
(self.monotonic_mapping)(from_min_value)
195+
self.monotonic_mapping.mapping(from_min_value)
181196
}
182197

183198
fn max_value(&self) -> Output {
184199
let from_max_value = self.from_column.max_value();
185-
(self.monotonic_mapping)(from_max_value)
200+
self.monotonic_mapping.mapping(from_max_value)
186201
}
187202

188203
fn num_vals(&self) -> u64 {
189204
self.from_column.num_vals()
190205
}
191206

192207
fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
193-
Box::new(self.from_column.iter().map(&self.monotonic_mapping))
208+
Box::new(
209+
self.from_column
210+
.iter()
211+
.map(|el| self.monotonic_mapping.mapping(el)),
212+
)
213+
}
214+
215+
fn get_between_vals(&self, range: RangeInclusive<Output>) -> Vec<u64> {
216+
self.from_column.get_between_vals(
217+
self.monotonic_mapping.inverse(range.start().clone())
218+
..=self.monotonic_mapping.inverse(range.end().clone()),
219+
)
194220
}
195221

196222
// We voluntarily do not implement get_range as it yields a regression,
@@ -236,19 +262,22 @@ where
236262
#[cfg(test)]
237263
mod tests {
238264
use super::*;
239-
use crate::MonotonicallyMappableToU64;
265+
use crate::monotonic_mapping::{
266+
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval,
267+
StrictlyMonotonicMappingToInternalGCDBaseval,
268+
};
240269

241270
#[test]
242271
fn test_monotonic_mapping() {
243-
let vals = &[1u64, 3u64][..];
272+
let vals = &[3u64, 5u64][..];
244273
let col = VecColumn::from(vals);
245-
let mapped = monotonic_map_column(col, |el| el + 4);
246-
assert_eq!(mapped.min_value(), 5u64);
247-
assert_eq!(mapped.max_value(), 7u64);
274+
let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2));
275+
assert_eq!(mapped.min_value(), 1u64);
276+
assert_eq!(mapped.max_value(), 3u64);
248277
assert_eq!(mapped.num_vals(), 2);
249278
assert_eq!(mapped.num_vals(), 2);
250-
assert_eq!(mapped.get_val(0), 5);
251-
assert_eq!(mapped.get_val(1), 7);
279+
assert_eq!(mapped.get_val(0), 1);
280+
assert_eq!(mapped.get_val(1), 3);
252281
}
253282

254283
#[test]
@@ -260,31 +289,42 @@ mod tests {
260289

261290
#[test]
262291
fn test_monotonic_mapping_iter() {
263-
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
292+
let vals: Vec<u64> = (10..110u64).map(|el| el * 10).collect();
264293
let col = VecColumn::from(&vals);
265-
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64);
266-
let val_i64s: Vec<i64> = mapped.iter().collect();
294+
let mapped = monotonic_map_column(
295+
col,
296+
StrictlyMonotonicMappingInverter::from(
297+
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100),
298+
),
299+
);
300+
let val_i64s: Vec<u64> = mapped.iter().collect();
267301
for i in 0..100 {
268302
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
269303
}
270304
}
271305

272306
#[test]
273307
fn test_monotonic_mapping_get_range() {
274-
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
308+
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
275309
let col = VecColumn::from(&vals);
276-
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64);
277-
assert_eq!(mapped.min_value(), -10i64);
278-
assert_eq!(mapped.max_value(), 980i64);
310+
let mapped = monotonic_map_column(
311+
col,
312+
StrictlyMonotonicMappingInverter::from(
313+
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0),
314+
),
315+
);
316+
317+
assert_eq!(mapped.min_value(), 0u64);
318+
assert_eq!(mapped.max_value(), 9900u64);
279319
assert_eq!(mapped.num_vals(), 100);
280-
let val_i64s: Vec<i64> = mapped.iter().collect();
281-
assert_eq!(val_i64s.len(), 100);
320+
let val_u64s: Vec<u64> = mapped.iter().collect();
321+
assert_eq!(val_u64s.len(), 100);
282322
for i in 0..100 {
283-
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
284-
assert_eq!(val_i64s[i as usize], i64::from_u64(vals[i as usize]) * 10);
323+
assert_eq!(val_u64s[i as usize], mapped.get_val(i));
324+
assert_eq!(val_u64s[i as usize], vals[i as usize] * 10);
285325
}
286-
let mut buf = [0i64; 20];
326+
let mut buf = [0u64; 20];
287327
mapped.get_range(7, &mut buf[..]);
288-
assert_eq!(&val_i64s[7..][..20], &buf);
328+
assert_eq!(&val_u64s[7..][..20], &buf);
289329
}
290330
}

fastfield_codecs/src/compact_space/mod.rs

+12-7
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,10 @@ pub struct IPCodecParams {
171171

172172
impl CompactSpaceCompressor {
173173
/// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
174-
pub fn train_from(column: &impl Column<u128>) -> Self {
174+
pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u64) -> Self {
175175
let mut values_sorted = BTreeSet::new();
176-
values_sorted.extend(column.iter());
177-
let total_num_values = column.num_vals();
176+
values_sorted.extend(iter);
177+
let total_num_values = num_vals;
178178

179179
let compact_space =
180180
get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
@@ -443,7 +443,7 @@ impl CompactSpaceDecompressor {
443443
mod tests {
444444

445445
use super::*;
446-
use crate::{open_u128, serialize_u128, VecColumn};
446+
use crate::{open_u128, serialize_u128};
447447

448448
#[test]
449449
fn compact_space_test() {
@@ -513,7 +513,12 @@ mod tests {
513513

514514
fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
515515
let mut out = Vec::new();
516-
serialize_u128(VecColumn::from(u128_vals), &mut out).unwrap();
516+
serialize_u128(
517+
|| u128_vals.iter().cloned(),
518+
u128_vals.len() as u64,
519+
&mut out,
520+
)
521+
.unwrap();
517522

518523
let data = OwnedBytes::new(out);
519524
test_all(data.clone(), u128_vals);
@@ -603,8 +608,8 @@ mod tests {
603608
5_000_000_000,
604609
];
605610
let mut out = Vec::new();
606-
serialize_u128(VecColumn::from(vals), &mut out).unwrap();
607-
let decomp = open_u128(OwnedBytes::new(out)).unwrap();
611+
serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap();
612+
let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();
608613

609614
assert_eq!(decomp.get_between_vals(199..=200), vec![0]);
610615
assert_eq!(decomp.get_between_vals(199..=201), vec![0, 1]);

fastfield_codecs/src/lib.rs

+35-7
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ use std::sync::Arc;
1313

1414
use common::BinarySerializable;
1515
use compact_space::CompactSpaceDecompressor;
16+
use monotonic_mapping::{
17+
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
18+
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
19+
};
1620
use ownedbytes::OwnedBytes;
1721
use serialize::Header;
1822

@@ -22,6 +26,7 @@ mod compact_space;
2226
mod line;
2327
mod linear;
2428
mod monotonic_mapping;
29+
mod monotonic_mapping_u128;
2530

2631
mod column;
2732
mod gcd;
@@ -31,7 +36,8 @@ use self::bitpacked::BitpackedCodec;
3136
use self::blockwise_linear::BlockwiseLinearCodec;
3237
pub use self::column::{monotonic_map_column, Column, VecColumn};
3338
use self::linear::LinearCodec;
34-
pub use self::monotonic_mapping::MonotonicallyMappableToU64;
39+
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
40+
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
3541
pub use self::serialize::{
3642
estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
3743
};
@@ -73,8 +79,13 @@ impl FastFieldCodecType {
7379
}
7480

7581
/// Returns the correct codec reader wrapped in the `Arc` for the data.
76-
pub fn open_u128(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<u128>>> {
77-
Ok(Arc::new(CompactSpaceDecompressor::open(bytes)?))
82+
pub fn open_u128<Item: MonotonicallyMappableToU128>(
83+
bytes: OwnedBytes,
84+
) -> io::Result<Arc<dyn Column<Item>>> {
85+
let reader = CompactSpaceDecompressor::open(bytes)?;
86+
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
87+
StrictlyMonotonicMappingToInternal::<Item>::new().into();
88+
Ok(Arc::new(monotonic_map_column(reader, inverted)))
7889
}
7990

8091
/// Returns the correct codec reader wrapped in the `Arc` for the data.
@@ -99,11 +110,15 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
99110
let reader = C::open_from_bytes(bytes, normalized_header)?;
100111
let min_value = header.min_value;
101112
if let Some(gcd) = header.gcd {
102-
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val * gcd.get());
103-
Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping)))
113+
let mapping = StrictlyMonotonicMappingInverter::from(
114+
StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
115+
);
116+
Ok(Arc::new(monotonic_map_column(reader, mapping)))
104117
} else {
105-
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val);
106-
Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping)))
118+
let mapping = StrictlyMonotonicMappingInverter::from(
119+
StrictlyMonotonicMappingToInternalBaseval::new(min_value),
120+
);
121+
Ok(Arc::new(monotonic_map_column(reader, mapping)))
107122
}
108123
}
109124

@@ -143,6 +158,7 @@ pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
143158

144159
#[cfg(test)]
145160
mod tests {
161+
146162
use proptest::prelude::*;
147163
use proptest::strategy::Strategy;
148164
use proptest::{prop_oneof, proptest};
@@ -177,6 +193,18 @@ mod tests {
177193
`{data:?}`",
178194
);
179195
}
196+
197+
if !data.is_empty() {
198+
let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
199+
let expected_positions: Vec<u64> = data
200+
.iter()
201+
.enumerate()
202+
.filter(|(_, el)| **el == data[test_rand_idx])
203+
.map(|(pos, _)| pos as u64)
204+
.collect();
205+
let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]);
206+
assert_eq!(expected_positions, positions);
207+
}
180208
Some((estimation, actual_compression))
181209
}
182210

fastfield_codecs/src/main.rs

+6-3
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ fn bench_ip() {
9090
{
9191
let mut data = vec![];
9292
for dataset in dataset.chunks(500_000) {
93-
serialize_u128(VecColumn::from(dataset), &mut data).unwrap();
93+
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
9494
}
9595
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
9696
println!("Compression 50_000 chunks {:.4}", compression);
@@ -101,7 +101,10 @@ fn bench_ip() {
101101
}
102102

103103
let mut data = vec![];
104-
serialize_u128(VecColumn::from(&dataset), &mut data).unwrap();
104+
{
105+
print_time!("creation");
106+
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
107+
}
105108

106109
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
107110
println!("Compression {:.2}", compression);
@@ -110,7 +113,7 @@ fn bench_ip() {
110113
(data.len() * 8) as f32 / dataset.len() as f32
111114
);
112115

113-
let decompressor = open_u128(OwnedBytes::new(data)).unwrap();
116+
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
114117
// Sample some ranges
115118
for value in dataset.iter().take(1110).skip(1100).cloned() {
116119
print_time!("get range");

0 commit comments

Comments
 (0)