Skip to content

Commit 833fe37

Browse files
committed
Fix range query
Fix range query end check in advance Rename vars to reduce ambiguity add tests Fixes #2225
1 parent 4feeb23 commit 833fe37

File tree

2 files changed

+145
-37
lines changed

2 files changed

+145
-37
lines changed

src/indexer/index_writer.rs

+88-31
Original file line numberDiff line numberDiff line change
@@ -1706,7 +1706,8 @@ mod tests {
17061706

17071707
let old_reader = index.reader()?;
17081708

1709-
let id_exists = |id| id % 3 != 0; // 0 does not exist
1709+
// Every 3rd doc has only id field
1710+
let id_is_full_doc = |id| id % 3 != 0;
17101711

17111712
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
17121713
// rotate left
@@ -1722,7 +1723,7 @@ mod tests {
17221723
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
17231724
let ip = ip_from_id(id);
17241725

1725-
if !id_exists(id) {
1726+
if !id_is_full_doc(id) {
17261727
// every 3rd doc has no ip field
17271728
index_writer.add_document(doc!(
17281729
id_field=>id,
@@ -1842,7 +1843,7 @@ mod tests {
18421843

18431844
let num_docs_with_values = expected_ids_and_num_occurrences
18441845
.iter()
1845-
.filter(|(id, _id_occurrences)| id_exists(**id))
1846+
.filter(|(id, _id_occurrences)| id_is_full_doc(**id))
18461847
.map(|(_, id_occurrences)| *id_occurrences as usize)
18471848
.sum::<usize>();
18481849

@@ -1866,7 +1867,7 @@ mod tests {
18661867
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
18671868
let mut expected_multi_ips: Vec<_> = id_list
18681869
.iter()
1869-
.filter(|id| id_exists(**id))
1870+
.filter(|id| id_is_full_doc(**id))
18701871
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
18711872
.collect();
18721873
assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1904,7 +1905,7 @@ mod tests {
19041905
let expected_ips = expected_ids_and_num_occurrences
19051906
.keys()
19061907
.flat_map(|id| {
1907-
if !id_exists(*id) {
1908+
if !id_is_full_doc(*id) {
19081909
None
19091910
} else {
19101911
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1916,7 +1917,7 @@ mod tests {
19161917
let expected_ips = expected_ids_and_num_occurrences
19171918
.keys()
19181919
.filter_map(|id| {
1919-
if !id_exists(*id) {
1920+
if !id_is_full_doc(*id) {
19201921
None
19211922
} else {
19221923
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1951,7 +1952,7 @@ mod tests {
19511952
let id = id_reader.first(doc).unwrap();
19521953

19531954
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
1954-
if id_exists(id) {
1955+
if id_is_full_doc(id) {
19551956
assert_eq!(vals.len(), 2);
19561957
assert_eq!(vals[0], vals[1]);
19571958
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1961,7 +1962,7 @@ mod tests {
19611962
}
19621963

19631964
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
1964-
if id_exists(id) {
1965+
if id_is_full_doc(id) {
19651966
assert_eq!(bool_vals.len(), 2);
19661967
assert_ne!(bool_vals[0], bool_vals[1]);
19671968
} else {
@@ -1990,7 +1991,7 @@ mod tests {
19901991
.as_u64()
19911992
.unwrap();
19921993
assert!(expected_ids_and_num_occurrences.contains_key(&id));
1993-
if id_exists(id) {
1994+
if id_is_full_doc(id) {
19941995
let id2 = store_reader
19951996
.get::<TantivyDocument>(doc_id)
19961997
.unwrap()
@@ -2037,7 +2038,7 @@ mod tests {
20372038
let (existing_id, count) = (*id, *count);
20382039
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
20392040
assert_eq!(get_num_hits(id_field), count);
2040-
if !id_exists(existing_id) {
2041+
if !id_is_full_doc(existing_id) {
20412042
continue;
20422043
}
20432044
assert_eq!(get_num_hits(text_field), count);
@@ -2087,7 +2088,7 @@ mod tests {
20872088
//
20882089
for (existing_id, count) in &expected_ids_and_num_occurrences {
20892090
let (existing_id, count) = (*existing_id, *count);
2090-
if !id_exists(existing_id) {
2091+
if !id_is_full_doc(existing_id) {
20912092
continue;
20922093
}
20932094
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2104,34 +2105,84 @@ mod tests {
21042105
}
21052106
}
21062107

2107-
// assert data is like expected
2108+
// Range query
21082109
//
2109-
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
2110-
let (existing_id, count) = (*existing_id, *count);
2111-
if !id_exists(existing_id) {
2112-
continue;
2113-
}
2114-
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
2115-
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
2110+
// Take half as sample
2111+
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
2112+
sample.sort_by_key(|(k, _num_occurences)| *k);
2113+
// sample.truncate(sample.len() / 2);
2114+
if !sample.is_empty() {
2115+
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
2116+
2117+
let expected_count = |sample: &[(&u64, &u64)]| {
2118+
sample
2119+
.iter()
2120+
.filter(|(id, _)| id_is_full_doc(**id))
2121+
.map(|(_id, num_occurences)| **num_occurences)
2122+
.sum::<u64>()
21162123
};
2117-
let ip = ip_from_id(existing_id);
2118-
2119-
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
2120-
// Range query on single value field
2121-
let query = gen_query_inclusive("ip", ip, ip);
2122-
assert_eq!(do_search_ip_field(&query), count);
2123-
2124-
// Range query on multi value field
2125-
let query = gen_query_inclusive("ips", ip, ip);
2124+
fn gen_query_inclusive<T1: ToString, T2: ToString>(
2125+
field: &str,
2126+
from: T1,
2127+
to: T2,
2128+
) -> String {
2129+
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
2130+
}
21262131

2127-
assert_eq!(do_search_ip_field(&query), count);
2132+
// Query first half
2133+
if left_sample.len() >= 1 {
2134+
let expected_count = expected_count(left_sample);
2135+
2136+
let start_range = *left_sample[0].0;
2137+
let end_range = *left_sample.last().unwrap().0;
2138+
let query = gen_query_inclusive("id_opt", start_range, end_range);
2139+
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
2140+
2141+
// Range query on ip field
2142+
let ip1 = ip_from_id(start_range);
2143+
let ip2 = ip_from_id(end_range);
2144+
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
2145+
let query = gen_query_inclusive("ip", ip1, ip2);
2146+
assert_eq!(do_search_ip_field(&query), expected_count);
2147+
let query = gen_query_inclusive("ip", "*", ip2);
2148+
assert_eq!(do_search_ip_field(&query), expected_count);
2149+
// Range query on multi value field
2150+
let query = gen_query_inclusive("ips", ip1, ip2);
2151+
assert_eq!(do_search_ip_field(&query), expected_count);
2152+
let query = gen_query_inclusive("ips", "*", ip2);
2153+
assert_eq!(do_search_ip_field(&query), expected_count);
2154+
}
2155+
// Query second half
2156+
if right_sample.len() >= 1 {
2157+
let expected_count = expected_count(right_sample);
2158+
let start_range = *right_sample[0].0;
2159+
let end_range = *right_sample.last().unwrap().0;
2160+
// Range query on id opt field
2161+
let query =
2162+
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
2163+
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
2164+
2165+
// Range query on ip field
2166+
let ip1 = ip_from_id(start_range);
2167+
let ip2 = ip_from_id(end_range);
2168+
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
2169+
let query = gen_query_inclusive("ip", ip1, ip2);
2170+
assert_eq!(do_search_ip_field(&query), expected_count);
2171+
let query = gen_query_inclusive("ip", ip1, "*");
2172+
assert_eq!(do_search_ip_field(&query), expected_count);
2173+
// Range query on multi value field
2174+
let query = gen_query_inclusive("ips", ip1, ip2);
2175+
assert_eq!(do_search_ip_field(&query), expected_count);
2176+
let query = gen_query_inclusive("ips", ip1, "*");
2177+
assert_eq!(do_search_ip_field(&query), expected_count);
2178+
}
21282179
}
21292180

21302181
// ip range query on fast field
21312182
//
21322183
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
21332184
let (existing_id, count) = (*existing_id, *count);
2134-
if !id_exists(existing_id) {
2185+
if !id_is_full_doc(existing_id) {
21352186
continue;
21362187
}
21372188
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2159,7 +2210,7 @@ mod tests {
21592210
.first_or_default_col(9999);
21602211
for doc_id in segment_reader.doc_ids_alive() {
21612212
let id = ff_reader.get_val(doc_id);
2162-
if !id_exists(id) {
2213+
if !id_is_full_doc(id) {
21632214
continue;
21642215
}
21652216
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2197,6 +2248,12 @@ mod tests {
21972248
Ok(index)
21982249
}
21992250

2251+
#[test]
2252+
fn test_fast_field_range() {
2253+
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
2254+
assert!(test_operation_strategy(&ops, false, true).is_ok());
2255+
}
2256+
22002257
#[test]
22012258
fn test_sort_index_on_opt_field_regression() {
22022259
assert!(test_operation_strategy(

src/query/range_query/fast_field_range_query.rs

+57-6
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ impl VecCursor {
3131
self.current_pos = 0;
3232
&mut self.docs
3333
}
34-
fn last_value(&self) -> Option<u32> {
35-
self.docs.iter().last().cloned()
34+
fn last_doc(&self) -> Option<u32> {
35+
self.docs.last().cloned()
3636
}
3737
fn is_empty(&self) -> bool {
3838
self.current().is_none()
@@ -112,15 +112,15 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
112112
finished_to_end = true;
113113
}
114114

115-
let last_value = self.loaded_docs.last_value();
115+
let last_doc = self.loaded_docs.last_doc();
116116
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
117117
self.column.get_docids_for_value_range(
118118
self.value_range.clone(),
119119
self.next_fetch_start..end,
120120
doc_buffer,
121121
);
122-
if let Some(last_value) = last_value {
123-
while self.loaded_docs.current() == Some(last_value) {
122+
if let Some(last_doc) = last_doc {
123+
while self.loaded_docs.current() == Some(last_doc) {
124124
self.loaded_docs.next();
125125
}
126126
}
@@ -136,7 +136,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
136136
if let Some(docid) = self.loaded_docs.next() {
137137
return docid;
138138
}
139-
if self.next_fetch_start >= self.column.values.num_vals() {
139+
if self.next_fetch_start >= self.column.num_docs() {
140140
return TERMINATED;
141141
}
142142
self.fetch_block();
@@ -177,3 +177,54 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
177177
0 // heuristic possible by checking number of hits when fetching a block
178178
}
179179
}
180+
181+
#[cfg(test)]
182+
mod tests {
183+
use crate::collector::Count;
184+
use crate::directory::RamDirectory;
185+
use crate::query::RangeQuery;
186+
use crate::{schema, IndexBuilder, TantivyDocument};
187+
188+
#[test]
189+
fn range_query_fast_optional_field_minimum() {
190+
let mut schema_builder = schema::SchemaBuilder::new();
191+
let id_field = schema_builder.add_text_field("id", schema::STRING);
192+
let score_field = schema_builder.add_u64_field("score", schema::FAST | schema::INDEXED);
193+
194+
let dir = RamDirectory::default();
195+
let index = IndexBuilder::new()
196+
.schema(schema_builder.build())
197+
.open_or_create(dir)
198+
.unwrap();
199+
200+
{
201+
let mut writer = index.writer(15_000_000).unwrap();
202+
203+
let count = 1000;
204+
for i in 0..count {
205+
let mut doc = TantivyDocument::new();
206+
doc.add_text(id_field, format!("doc{i}"));
207+
208+
let nb_scores = i % 2; // 0 or 1 scores
209+
for _ in 0..nb_scores {
210+
doc.add_u64(score_field, 80);
211+
}
212+
213+
writer.add_document(doc).unwrap();
214+
}
215+
writer.commit().unwrap();
216+
}
217+
218+
let reader = index.reader().unwrap();
219+
let searcher = reader.searcher();
220+
221+
let query = RangeQuery::new_u64_bounds(
222+
"score".to_string(),
223+
std::ops::Bound::Included(70),
224+
std::ops::Bound::Unbounded,
225+
);
226+
227+
let count = searcher.search(&query, &Count).unwrap();
228+
assert_eq!(count, 500);
229+
}
230+
}

0 commit comments

Comments
 (0)