Skip to content

Commit 7c6cc81

Browse files
authored
enable range query on fast field for u64 compatible types (#1762)
* enable range query on fast field for u64 compatible types * rename, update benches
1 parent 514d23a commit 7c6cc81

File tree

7 files changed

+799
-36
lines changed

7 files changed

+799
-36
lines changed

fastfield_codecs/benches/bench.rs

+87-22
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ extern crate test;
44

55
#[cfg(test)]
66
mod tests {
7-
use std::iter;
7+
use std::ops::RangeInclusive;
88
use std::sync::Arc;
99

1010
use common::OwnedBytes;
@@ -71,27 +71,24 @@ mod tests {
7171
});
7272
}
7373

74-
fn get_exp_data() -> Vec<u64> {
74+
const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50;
75+
const SINGLE_ITEM: u64 = 90;
76+
const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
77+
const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49;
78+
fn get_data_50percent_item() -> Vec<u128> {
79+
let mut rng = StdRng::from_seed([1u8; 32]);
80+
7581
let mut data = vec![];
76-
for i in 0..100 {
77-
let num = i * i;
78-
data.extend(iter::repeat(i as u64).take(num));
82+
for _ in 0..300_000 {
83+
let val = rng.gen_range(1..=100);
84+
data.push(val);
7985
}
80-
data.shuffle(&mut StdRng::from_seed([1u8; 32]));
86+
data.push(SINGLE_ITEM);
8187

82-
// lengt = 328350
88+
data.shuffle(&mut rng);
89+
let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
8390
data
8491
}
85-
86-
fn get_data_50percent_item() -> (u128, u128, Vec<u128>) {
87-
let mut permutation = get_exp_data();
88-
let major_item = 20;
89-
let minor_item = 10;
90-
permutation.extend(iter::repeat(major_item).take(permutation.len()));
91-
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
92-
let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
93-
(major_item as u128, minor_item as u128, permutation)
94-
}
9592
fn get_u128_column_random() -> Arc<dyn Column<u128>> {
9693
let permutation = generate_random();
9794
let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
@@ -106,15 +103,82 @@ mod tests {
106103
open_u128::<u128>(out).unwrap()
107104
}
108105

106+
// U64 RANGE START
107+
#[bench]
108+
fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
109+
let data = get_data_50percent_item();
110+
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
111+
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
112+
113+
b.iter(|| {
114+
let mut positions = Vec::new();
115+
column.get_docids_for_value_range(
116+
FIFTY_PERCENT_RANGE,
117+
0..data.len() as u32,
118+
&mut positions,
119+
);
120+
positions
121+
});
122+
}
123+
124+
#[bench]
125+
fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) {
126+
let data = get_data_50percent_item();
127+
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
128+
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
129+
130+
b.iter(|| {
131+
let mut positions = Vec::new();
132+
column.get_docids_for_value_range(
133+
ONE_PERCENT_ITEM_RANGE,
134+
0..data.len() as u32,
135+
&mut positions,
136+
);
137+
positions
138+
});
139+
}
140+
141+
#[bench]
142+
fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
143+
let data = get_data_50percent_item();
144+
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
145+
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
146+
147+
b.iter(|| {
148+
let mut positions = Vec::new();
149+
column.get_docids_for_value_range(
150+
SINGLE_ITEM_RANGE,
151+
0..data.len() as u32,
152+
&mut positions,
153+
);
154+
positions
155+
});
156+
}
157+
158+
#[bench]
159+
fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
160+
let data = get_data_50percent_item();
161+
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
162+
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
163+
164+
b.iter(|| {
165+
let mut positions = Vec::new();
166+
column.get_docids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
167+
positions
168+
});
169+
}
170+
// U64 RANGE END
171+
172+
// U128 RANGE START
109173
#[bench]
110174
fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
111-
let (major_item, _minor_item, data) = get_data_50percent_item();
175+
let data = get_data_50percent_item();
112176
let column = get_u128_column_from_data(&data);
113177

114178
b.iter(|| {
115179
let mut positions = Vec::new();
116180
column.get_docids_for_value_range(
117-
major_item..=major_item,
181+
*FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128,
118182
0..data.len() as u32,
119183
&mut positions,
120184
);
@@ -124,13 +188,13 @@ mod tests {
124188

125189
#[bench]
126190
fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
127-
let (_major_item, minor_item, data) = get_data_50percent_item();
191+
let data = get_data_50percent_item();
128192
let column = get_u128_column_from_data(&data);
129193

130194
b.iter(|| {
131195
let mut positions = Vec::new();
132196
column.get_docids_for_value_range(
133-
minor_item..=minor_item,
197+
*SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128,
134198
0..data.len() as u32,
135199
&mut positions,
136200
);
@@ -140,7 +204,7 @@ mod tests {
140204

141205
#[bench]
142206
fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
143-
let (_major_item, _minor_item, data) = get_data_50percent_item();
207+
let data = get_data_50percent_item();
144208
let column = get_u128_column_from_data(&data);
145209

146210
b.iter(|| {
@@ -149,6 +213,7 @@ mod tests {
149213
positions
150214
});
151215
}
216+
// U128 RANGE END
152217

153218
#[bench]
154219
fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {

src/fastfield/multivalued/reader.rs

+70
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,78 @@ impl<T: PartialOrd + MakeZero + Clone> MultiValuedFastFieldReader<T> {
122122
#[cfg(test)]
123123
mod tests {
124124

125+
use time::{Duration, OffsetDateTime};
126+
127+
use crate::collector::Count;
125128
use crate::core::Index;
129+
use crate::query::RangeQuery;
126130
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
131+
use crate::{DateOptions, DateTime};
132+
133+
#[test]
134+
fn test_multivalued_date_docids_for_value_range() -> crate::Result<()> {
135+
let mut schema_builder = Schema::builder();
136+
let date_field = schema_builder.add_date_field(
137+
"multi_date_field",
138+
DateOptions::default()
139+
.set_fast(Cardinality::MultiValues)
140+
.set_indexed()
141+
.set_fieldnorm()
142+
.set_stored(),
143+
);
144+
let schema = schema_builder.build();
145+
let index = Index::create_in_ram(schema);
146+
let mut index_writer = index.writer_for_tests()?;
147+
let first_time_stamp = OffsetDateTime::now_utc();
148+
index_writer.add_document(doc!(
149+
date_field => DateTime::from_utc(first_time_stamp),
150+
date_field => DateTime::from_utc(first_time_stamp),
151+
))?;
152+
index_writer.add_document(doc!())?;
153+
// add one second
154+
index_writer.add_document(doc!(
155+
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
156+
))?;
157+
// add another second
158+
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
159+
index_writer.add_document(doc!(
160+
date_field => DateTime::from_utc(two_secs_ahead),
161+
date_field => DateTime::from_utc(two_secs_ahead),
162+
date_field => DateTime::from_utc(two_secs_ahead),
163+
))?;
164+
// add three seconds
165+
index_writer.add_document(doc!(
166+
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
167+
))?;
168+
index_writer.commit()?;
169+
170+
let reader = index.reader()?;
171+
let searcher = reader.searcher();
172+
let reader = searcher.segment_reader(0);
173+
assert_eq!(reader.num_docs(), 5);
174+
175+
let date_ff_reader = reader.fast_fields().dates(date_field).unwrap();
176+
let mut docids = vec![];
177+
date_ff_reader.get_docids_for_value_range(
178+
DateTime::from_utc(first_time_stamp)..=DateTime::from_utc(two_secs_ahead),
179+
0..5,
180+
&mut docids,
181+
);
182+
assert_eq!(docids, vec![0, 2, 3]);
183+
184+
let count_multiples =
185+
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
186+
187+
assert_eq!(
188+
count_multiples(RangeQuery::new_date(
189+
date_field,
190+
DateTime::from_utc(first_time_stamp)..DateTime::from_utc(two_secs_ahead)
191+
)),
192+
2
193+
);
194+
195+
Ok(())
196+
}
127197

128198
#[test]
129199
fn test_multifastfield_reader() -> crate::Result<()> {

src/query/range_query/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
mod fast_field_range_query;
22
mod range_query;
33
mod range_query_ip_fastfield;
4+
mod range_query_u64_fastfield;
45

56
pub use self::range_query::RangeQuery;

src/query/range_query/range_query.rs

+77-8
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
use std::io;
22
use std::ops::{Bound, Range};
33

4-
use common::BitSet;
4+
use common::{BinarySerializable, BitSet};
55

6+
use super::range_query_u64_fastfield::FastFieldRangeWeight;
67
use crate::core::SegmentReader;
78
use crate::error::TantivyError;
89
use crate::query::explanation::does_not_match;
910
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
1011
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
1112
use crate::schema::{Field, IndexRecordOption, Term, Type};
1213
use crate::termdict::{TermDictionary, TermStreamer};
13-
use crate::{DocId, Score};
14+
use crate::{DateTime, DocId, Score};
1415

1516
pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
1617
bound: &Bound<TFrom>,
@@ -203,6 +204,40 @@ impl RangeQuery {
203204
)
204205
}
205206

207+
/// Create a new `RangeQuery` over a `date` field.
208+
///
209+
/// The two `Bound` arguments make it possible to create more complex
210+
/// ranges than a semi-inclusive range.
211+
///
212+
/// If the field is not of the type `date`, tantivy
213+
/// will panic when the `Weight` object is created.
214+
pub fn new_date_bounds(
215+
field: Field,
216+
left_bound: Bound<DateTime>,
217+
right_bound: Bound<DateTime>,
218+
) -> RangeQuery {
219+
let make_term_val =
220+
|val: &DateTime| Term::from_field_date(field, *val).value_bytes().to_owned();
221+
RangeQuery {
222+
field,
223+
value_type: Type::Date,
224+
left_bound: map_bound(&left_bound, &make_term_val),
225+
right_bound: map_bound(&right_bound, &make_term_val),
226+
}
227+
}
228+
229+
/// Create a new `RangeQuery` over a `date` field.
230+
///
231+
/// If the field is not of the type `date`, tantivy
232+
/// will panic when the `Weight` object is created.
233+
pub fn new_date(field: Field, range: Range<DateTime>) -> RangeQuery {
234+
RangeQuery::new_date_bounds(
235+
field,
236+
Bound::Included(range.start),
237+
Bound::Excluded(range.end),
238+
)
239+
}
240+
206241
/// Create a new `RangeQuery` over a `Str` field.
207242
///
208243
/// The two `Bound` arguments make it possible to create more complex
@@ -252,6 +287,23 @@ impl RangeQuery {
252287
}
253288
}
254289

290+
fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
291+
match typ {
292+
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
293+
Type::IpAddr => true,
294+
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
295+
}
296+
}
297+
298+
/// Returns true if the type maps to a u64 fast field
299+
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
300+
match typ {
301+
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
302+
Type::IpAddr => false,
303+
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
304+
}
305+
}
306+
255307
impl Query for RangeQuery {
256308
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
257309
let schema = enable_scoring.schema();
@@ -265,12 +317,29 @@ impl Query for RangeQuery {
265317
return Err(TantivyError::SchemaError(err_msg));
266318
}
267319

268-
if field_type.is_ip_addr() && field_type.is_fast() {
269-
Ok(Box::new(IPFastFieldRangeWeight::new(
270-
self.field,
271-
&self.left_bound,
272-
&self.right_bound,
273-
)))
320+
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
321+
if field_type.is_ip_addr() {
322+
Ok(Box::new(IPFastFieldRangeWeight::new(
323+
self.field,
324+
&self.left_bound,
325+
&self.right_bound,
326+
)))
327+
} else {
328+
// We run the range query on u64 value space for performance reasons and simplicity
329+
// assert the type maps to u64
330+
assert!(maps_to_u64_fastfield(self.value_type));
331+
let parse_from_bytes = |data: &Vec<u8>| {
332+
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
333+
};
334+
335+
let left_bound = map_bound(&self.left_bound, &parse_from_bytes);
336+
let right_bound = map_bound(&self.right_bound, &parse_from_bytes);
337+
Ok(Box::new(FastFieldRangeWeight::new(
338+
self.field,
339+
left_bound,
340+
right_bound,
341+
)))
342+
}
274343
} else {
275344
Ok(Box::new(RangeWeight {
276345
field: self.field,

src/query/range_query/range_query_ip_fastfield.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ pub struct IPFastFieldRangeWeight {
2323

2424
impl IPFastFieldRangeWeight {
2525
pub fn new(field: Field, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self {
26-
let ip_from_bound_raw_data = |data: &Vec<u8>| {
27-
let left_ip_u128: u128 =
26+
let parse_ip_from_bytes = |data: &Vec<u8>| {
27+
let ip_u128: u128 =
2828
u128::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap());
29-
Ipv6Addr::from_u128(left_ip_u128)
29+
Ipv6Addr::from_u128(ip_u128)
3030
};
31-
let left_bound = map_bound(left_bound, &ip_from_bound_raw_data);
32-
let right_bound = map_bound(right_bound, &ip_from_bound_raw_data);
31+
let left_bound = map_bound(left_bound, &parse_ip_from_bytes);
32+
let right_bound = map_bound(right_bound, &parse_ip_from_bytes);
3333
Self {
3434
field,
3535
left_bound,

0 commit comments

Comments
 (0)