Skip to content

Commit ac53335

Browse files
committed
enable range query on fast field for u64 compatible types
1 parent 07a51eb commit ac53335

File tree

7 files changed

+773
-14
lines changed

7 files changed

+773
-14
lines changed

fastfield_codecs/benches/bench.rs

+55
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ mod tests {
7979
}
8080
data.shuffle(&mut StdRng::from_seed([1u8; 32]));
8181

82+
assert_eq!(data.len(), 328350);
83+
8284
// length = 328350
8385
data
8486
}
@@ -106,6 +108,58 @@ mod tests {
106108
open_u128::<u128>(out).unwrap()
107109
}
108110

111+
// U64 RANGE START
112+
#[bench]
113+
fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
114+
let (major_item, _minor_item, data) = get_data_50percent_item();
115+
let major_item = major_item as u64;
116+
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
117+
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
118+
119+
b.iter(|| {
120+
let mut positions = Vec::new();
121+
column.get_docids_for_value_range(
122+
major_item..=major_item,
123+
0..data.len() as u32,
124+
&mut positions,
125+
);
126+
positions
127+
});
128+
}
129+
130+
#[bench]
131+
fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
132+
let (_major_item, minor_item, data) = get_data_50percent_item();
133+
let minor_item = minor_item as u64;
134+
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
135+
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
136+
137+
b.iter(|| {
138+
let mut positions = Vec::new();
139+
column.get_docids_for_value_range(
140+
minor_item..=minor_item,
141+
0..data.len() as u32,
142+
&mut positions,
143+
);
144+
positions
145+
});
146+
}
147+
148+
#[bench]
149+
fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
150+
let (_major_item, _minor_item, data) = get_data_50percent_item();
151+
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
152+
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
153+
154+
b.iter(|| {
155+
let mut positions = Vec::new();
156+
column.get_docids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
157+
positions
158+
});
159+
}
160+
// U64 RANGE END
161+
162+
// U128 RANGE START
109163
#[bench]
110164
fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
111165
let (major_item, _minor_item, data) = get_data_50percent_item();
@@ -149,6 +203,7 @@ mod tests {
149203
positions
150204
});
151205
}
206+
// U128 RANGE END
152207

153208
#[bench]
154209
fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {

src/fastfield/multivalued/reader.rs

+70
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,78 @@ impl<T: PartialOrd + MakeZero + Clone> MultiValuedFastFieldReader<T> {
122122
#[cfg(test)]
123123
mod tests {
124124

125+
use time::{Duration, OffsetDateTime};
126+
127+
use crate::collector::Count;
125128
use crate::core::Index;
129+
use crate::query::RangeQuery;
126130
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
131+
use crate::{DateOptions, DateTime};
132+
133+
#[test]
134+
fn test_multivalued_date_docids_for_value_range() -> crate::Result<()> {
135+
let mut schema_builder = Schema::builder();
136+
let date_field = schema_builder.add_date_field(
137+
"multi_date_field",
138+
DateOptions::default()
139+
.set_fast(Cardinality::MultiValues)
140+
.set_indexed()
141+
.set_fieldnorm()
142+
.set_stored(),
143+
);
144+
let schema = schema_builder.build();
145+
let index = Index::create_in_ram(schema);
146+
let mut index_writer = index.writer_for_tests()?;
147+
let first_time_stamp = OffsetDateTime::now_utc();
148+
index_writer.add_document(doc!(
149+
date_field => DateTime::from_utc(first_time_stamp),
150+
date_field => DateTime::from_utc(first_time_stamp),
151+
))?;
152+
index_writer.add_document(doc!())?;
153+
// add one second
154+
index_writer.add_document(doc!(
155+
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
156+
))?;
157+
// add another second
158+
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
159+
index_writer.add_document(doc!(
160+
date_field => DateTime::from_utc(two_secs_ahead),
161+
date_field => DateTime::from_utc(two_secs_ahead),
162+
date_field => DateTime::from_utc(two_secs_ahead),
163+
))?;
164+
// add three seconds
165+
index_writer.add_document(doc!(
166+
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
167+
))?;
168+
index_writer.commit()?;
169+
170+
let reader = index.reader()?;
171+
let searcher = reader.searcher();
172+
let reader = searcher.segment_reader(0);
173+
assert_eq!(reader.num_docs(), 5);
174+
175+
let date_ff_reader = reader.fast_fields().dates(date_field).unwrap();
176+
let mut docids = vec![];
177+
date_ff_reader.get_docids_for_value_range(
178+
DateTime::from_utc(first_time_stamp)..=DateTime::from_utc(two_secs_ahead),
179+
0..5,
180+
&mut docids,
181+
);
182+
assert_eq!(docids, vec![0, 2, 3]);
183+
184+
let count_multiples =
185+
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
186+
187+
assert_eq!(
188+
count_multiples(RangeQuery::new_date(
189+
date_field,
190+
DateTime::from_utc(first_time_stamp)..DateTime::from_utc(two_secs_ahead)
191+
)),
192+
2
193+
);
194+
195+
Ok(())
196+
}
127197

128198
#[test]
129199
fn test_multifastfield_reader() -> crate::Result<()> {

src/query/range_query/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
mod fast_field_range_query;
22
mod range_query;
33
mod range_query_ip_fastfield;
4+
mod range_query_u64_fastfield;
45

56
pub use self::range_query::RangeQuery;

src/query/range_query/range_query.rs

+77-8
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
11
use std::io;
22
use std::ops::{Bound, Range};
33

4-
use common::BitSet;
4+
use common::{BinarySerializable, BitSet};
55

6+
use super::range_query_u64_fastfield::FastFieldRangeWeight;
67
use crate::core::SegmentReader;
78
use crate::error::TantivyError;
89
use crate::query::explanation::does_not_match;
910
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
1011
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
1112
use crate::schema::{Field, IndexRecordOption, Term, Type};
1213
use crate::termdict::{TermDictionary, TermStreamer};
13-
use crate::{DocId, Score};
14+
use crate::{DateTime, DocId, Score};
1415

1516
pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
1617
bound: &Bound<TFrom>,
@@ -203,6 +204,40 @@ impl RangeQuery {
203204
)
204205
}
205206

207+
/// Create a new `RangeQuery` over a `date` field.
208+
///
209+
/// The two `Bound` arguments make it possible to create more complex
210+
/// ranges than semi-inclusive range.
211+
///
212+
/// If the field is not of the type `date`, tantivy
213+
/// will panic when the `Weight` object is created.
214+
pub fn new_date_bounds(
215+
field: Field,
216+
left_bound: Bound<DateTime>,
217+
right_bound: Bound<DateTime>,
218+
) -> RangeQuery {
219+
let make_term_val =
220+
|val: &DateTime| Term::from_field_date(field, *val).value_bytes().to_owned();
221+
RangeQuery {
222+
field,
223+
value_type: Type::Date,
224+
left_bound: map_bound(&left_bound, &make_term_val),
225+
right_bound: map_bound(&right_bound, &make_term_val),
226+
}
227+
}
228+
229+
/// Create a new `RangeQuery` over a `date` field.
230+
///
231+
/// If the field is not of the type `date`, tantivy
232+
/// will panic when the `Weight` object is created.
233+
pub fn new_date(field: Field, range: Range<DateTime>) -> RangeQuery {
234+
RangeQuery::new_date_bounds(
235+
field,
236+
Bound::Included(range.start),
237+
Bound::Excluded(range.end),
238+
)
239+
}
240+
206241
/// Create a new `RangeQuery` over a `Str` field.
207242
///
208243
/// The two `Bound` arguments make it possible to create more complex
@@ -252,6 +287,23 @@ impl RangeQuery {
252287
}
253288
}
254289

290+
fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
291+
match typ {
292+
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
293+
Type::IpAddr => true,
294+
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
295+
}
296+
}
297+
298+
/// Returns true if the type maps to a u64 fast field
299+
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
300+
match typ {
301+
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
302+
Type::IpAddr => false,
303+
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
304+
}
305+
}
306+
255307
impl Query for RangeQuery {
256308
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
257309
let schema = enable_scoring.schema();
@@ -265,12 +317,29 @@ impl Query for RangeQuery {
265317
return Err(TantivyError::SchemaError(err_msg));
266318
}
267319

268-
if field_type.is_ip_addr() && field_type.is_fast() {
269-
Ok(Box::new(IPFastFieldRangeWeight::new(
270-
self.field,
271-
&self.left_bound,
272-
&self.right_bound,
273-
)))
320+
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
321+
if field_type.is_ip_addr() {
322+
Ok(Box::new(IPFastFieldRangeWeight::new(
323+
self.field,
324+
&self.left_bound,
325+
&self.right_bound,
326+
)))
327+
} else {
328+
// We run the range query on u64 value space for performance reasons and simplicity
329+
// assert the type maps to u64
330+
assert!(maps_to_u64_fastfield(self.value_type));
331+
let parse_from_bytes = |data: &Vec<u8>| {
332+
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
333+
};
334+
335+
let left_bound = map_bound(&self.left_bound, &parse_from_bytes);
336+
let right_bound = map_bound(&self.right_bound, &parse_from_bytes);
337+
Ok(Box::new(FastFieldRangeWeight::new(
338+
self.field,
339+
&left_bound,
340+
&right_bound,
341+
)))
342+
}
274343
} else {
275344
Ok(Box::new(RangeWeight {
276345
field: self.field,

src/query/range_query/range_query_ip_fastfield.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ pub struct IPFastFieldRangeWeight {
2323

2424
impl IPFastFieldRangeWeight {
2525
pub fn new(field: Field, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self {
26-
let ip_from_bound_raw_data = |data: &Vec<u8>| {
27-
let left_ip_u128: u128 =
26+
let parse_ip_from_bytes = |data: &Vec<u8>| {
27+
let ip_u128: u128 =
2828
u128::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap());
29-
Ipv6Addr::from_u128(left_ip_u128)
29+
Ipv6Addr::from_u128(ip_u128)
3030
};
31-
let left_bound = map_bound(left_bound, &ip_from_bound_raw_data);
32-
let right_bound = map_bound(right_bound, &ip_from_bound_raw_data);
31+
let left_bound = map_bound(left_bound, &parse_ip_from_bytes);
32+
let right_bound = map_bound(right_bound, &parse_ip_from_bytes);
3333
Self {
3434
field,
3535
left_bound,

0 commit comments

Comments
 (0)