Skip to content

Commit 21aabf9

Browse files
committed
Fix range query (#2226)
Fix range query end check in advance Rename vars to reduce ambiguity add tests Fixes #2225
1 parent 49448b3 commit 21aabf9

File tree

2 files changed

+145
-37
lines changed

2 files changed

+145
-37
lines changed

src/indexer/index_writer.rs

+88-31
Original file line numberDiff line numberDiff line change
@@ -1688,7 +1688,8 @@ mod tests {
16881688

16891689
let old_reader = index.reader()?;
16901690

1691-
let id_exists = |id| id % 3 != 0; // 0 does not exist
1691+
// Every 3rd doc has only id field
1692+
let id_is_full_doc = |id| id % 3 != 0;
16921693

16931694
let multi_text_field_text1 = "test1 test2 test3 test1 test2 test3";
16941695
// rotate left
@@ -1704,7 +1705,7 @@ mod tests {
17041705
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
17051706
let ip = ip_from_id(id);
17061707

1707-
if !id_exists(id) {
1708+
if !id_is_full_doc(id) {
17081709
// every 3rd doc has no ip field
17091710
index_writer.add_document(doc!(
17101711
id_field=>id,
@@ -1824,7 +1825,7 @@ mod tests {
18241825

18251826
let num_docs_with_values = expected_ids_and_num_occurrences
18261827
.iter()
1827-
.filter(|(id, _id_occurrences)| id_exists(**id))
1828+
.filter(|(id, _id_occurrences)| id_is_full_doc(**id))
18281829
.map(|(_, id_occurrences)| *id_occurrences as usize)
18291830
.sum::<usize>();
18301831

@@ -1848,7 +1849,7 @@ mod tests {
18481849
if force_end_merge && num_segments_before_merge > 1 && num_segments_after_merge == 1 {
18491850
let mut expected_multi_ips: Vec<_> = id_list
18501851
.iter()
1851-
.filter(|id| id_exists(**id))
1852+
.filter(|id| id_is_full_doc(**id))
18521853
.flat_map(|id| vec![ip_from_id(*id), ip_from_id(*id)])
18531854
.collect();
18541855
assert_eq!(num_ips, expected_multi_ips.len() as u32);
@@ -1886,7 +1887,7 @@ mod tests {
18861887
let expected_ips = expected_ids_and_num_occurrences
18871888
.keys()
18881889
.flat_map(|id| {
1889-
if !id_exists(*id) {
1890+
if !id_is_full_doc(*id) {
18901891
None
18911892
} else {
18921893
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1898,7 +1899,7 @@ mod tests {
18981899
let expected_ips = expected_ids_and_num_occurrences
18991900
.keys()
19001901
.filter_map(|id| {
1901-
if !id_exists(*id) {
1902+
if !id_is_full_doc(*id) {
19021903
None
19031904
} else {
19041905
Some(Ipv6Addr::from_u128(*id as u128))
@@ -1933,7 +1934,7 @@ mod tests {
19331934
let id = id_reader.first(doc).unwrap();
19341935

19351936
let vals: Vec<u64> = ff_reader.values_for_doc(doc).collect();
1936-
if id_exists(id) {
1937+
if id_is_full_doc(id) {
19371938
assert_eq!(vals.len(), 2);
19381939
assert_eq!(vals[0], vals[1]);
19391940
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
@@ -1943,7 +1944,7 @@ mod tests {
19431944
}
19441945

19451946
let bool_vals: Vec<bool> = bool_ff_reader.values_for_doc(doc).collect();
1946-
if id_exists(id) {
1947+
if id_is_full_doc(id) {
19471948
assert_eq!(bool_vals.len(), 2);
19481949
assert_ne!(bool_vals[0], bool_vals[1]);
19491950
} else {
@@ -1972,7 +1973,7 @@ mod tests {
19721973
.as_u64()
19731974
.unwrap();
19741975
assert!(expected_ids_and_num_occurrences.contains_key(&id));
1975-
if id_exists(id) {
1976+
if id_is_full_doc(id) {
19761977
let id2 = store_reader
19771978
.get(doc_id)
19781979
.unwrap()
@@ -2019,7 +2020,7 @@ mod tests {
20192020
let (existing_id, count) = (*id, *count);
20202021
let get_num_hits = |field| do_search(&existing_id.to_string(), field).len() as u64;
20212022
assert_eq!(get_num_hits(id_field), count);
2022-
if !id_exists(existing_id) {
2023+
if !id_is_full_doc(existing_id) {
20232024
continue;
20242025
}
20252026
assert_eq!(get_num_hits(text_field), count);
@@ -2069,7 +2070,7 @@ mod tests {
20692070
//
20702071
for (existing_id, count) in &expected_ids_and_num_occurrences {
20712072
let (existing_id, count) = (*existing_id, *count);
2072-
if !id_exists(existing_id) {
2073+
if !id_is_full_doc(existing_id) {
20732074
continue;
20742075
}
20752076
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
@@ -2086,34 +2087,84 @@ mod tests {
20862087
}
20872088
}
20882089

2089-
// assert data is like expected
2090+
// Range query
20902091
//
2091-
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
2092-
let (existing_id, count) = (*existing_id, *count);
2093-
if !id_exists(existing_id) {
2094-
continue;
2095-
}
2096-
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
2097-
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
2092+
// Take half as sample
2093+
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
2094+
sample.sort_by_key(|(k, _num_occurences)| *k);
2095+
// sample.truncate(sample.len() / 2);
2096+
if !sample.is_empty() {
2097+
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
2098+
2099+
let expected_count = |sample: &[(&u64, &u64)]| {
2100+
sample
2101+
.iter()
2102+
.filter(|(id, _)| id_is_full_doc(**id))
2103+
.map(|(_id, num_occurences)| **num_occurences)
2104+
.sum::<u64>()
20982105
};
2099-
let ip = ip_from_id(existing_id);
2100-
2101-
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
2102-
// Range query on single value field
2103-
let query = gen_query_inclusive("ip", ip, ip);
2104-
assert_eq!(do_search_ip_field(&query), count);
2105-
2106-
// Range query on multi value field
2107-
let query = gen_query_inclusive("ips", ip, ip);
2106+
fn gen_query_inclusive<T1: ToString, T2: ToString>(
2107+
field: &str,
2108+
from: T1,
2109+
to: T2,
2110+
) -> String {
2111+
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
2112+
}
21082113

2109-
assert_eq!(do_search_ip_field(&query), count);
2114+
// Query first half
2115+
if !left_sample.is_empty() {
2116+
let expected_count = expected_count(left_sample);
2117+
2118+
let start_range = *left_sample[0].0;
2119+
let end_range = *left_sample.last().unwrap().0;
2120+
let query = gen_query_inclusive("id_opt", start_range, end_range);
2121+
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
2122+
2123+
// Range query on ip field
2124+
let ip1 = ip_from_id(start_range);
2125+
let ip2 = ip_from_id(end_range);
2126+
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
2127+
let query = gen_query_inclusive("ip", ip1, ip2);
2128+
assert_eq!(do_search_ip_field(&query), expected_count);
2129+
let query = gen_query_inclusive("ip", "*", ip2);
2130+
assert_eq!(do_search_ip_field(&query), expected_count);
2131+
// Range query on multi value field
2132+
let query = gen_query_inclusive("ips", ip1, ip2);
2133+
assert_eq!(do_search_ip_field(&query), expected_count);
2134+
let query = gen_query_inclusive("ips", "*", ip2);
2135+
assert_eq!(do_search_ip_field(&query), expected_count);
2136+
}
2137+
// Query second half
2138+
if !right_sample.is_empty() {
2139+
let expected_count = expected_count(right_sample);
2140+
let start_range = *right_sample[0].0;
2141+
let end_range = *right_sample.last().unwrap().0;
2142+
// Range query on id opt field
2143+
let query =
2144+
gen_query_inclusive("id_opt", start_range.to_string(), end_range.to_string());
2145+
assert_eq!(do_search(&query, id_opt_field).len() as u64, expected_count);
2146+
2147+
// Range query on ip field
2148+
let ip1 = ip_from_id(start_range);
2149+
let ip2 = ip_from_id(end_range);
2150+
let do_search_ip_field = |term: &str| do_search(term, ip_field).len() as u64;
2151+
let query = gen_query_inclusive("ip", ip1, ip2);
2152+
assert_eq!(do_search_ip_field(&query), expected_count);
2153+
let query = gen_query_inclusive("ip", ip1, "*");
2154+
assert_eq!(do_search_ip_field(&query), expected_count);
2155+
// Range query on multi value field
2156+
let query = gen_query_inclusive("ips", ip1, ip2);
2157+
assert_eq!(do_search_ip_field(&query), expected_count);
2158+
let query = gen_query_inclusive("ips", ip1, "*");
2159+
assert_eq!(do_search_ip_field(&query), expected_count);
2160+
}
21102161
}
21112162

21122163
// ip range query on fast field
21132164
//
21142165
for (existing_id, count) in expected_ids_and_num_occurrences.iter().take(10) {
21152166
let (existing_id, count) = (*existing_id, *count);
2116-
if !id_exists(existing_id) {
2167+
if !id_is_full_doc(existing_id) {
21172168
continue;
21182169
}
21192170
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
@@ -2141,7 +2192,7 @@ mod tests {
21412192
.first_or_default_col(9999);
21422193
for doc_id in segment_reader.doc_ids_alive() {
21432194
let id = ff_reader.get_val(doc_id);
2144-
if !id_exists(id) {
2195+
if !id_is_full_doc(id) {
21452196
continue;
21462197
}
21472198
let facet_ords: Vec<u64> = facet_reader.facet_ords(doc_id).collect();
@@ -2179,6 +2230,12 @@ mod tests {
21792230
Ok(index)
21802231
}
21812232

2233+
#[test]
2234+
fn test_fast_field_range() {
2235+
let ops: Vec<_> = (0..1000).map(|id| IndexingOp::AddDoc { id }).collect();
2236+
assert!(test_operation_strategy(&ops, false, true).is_ok());
2237+
}
2238+
21822239
#[test]
21832240
fn test_sort_index_on_opt_field_regression() {
21842241
assert!(test_operation_strategy(

src/query/range_query/fast_field_range_query.rs

+57-6
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ impl VecCursor {
3131
self.current_pos = 0;
3232
&mut self.docs
3333
}
34-
fn last_value(&self) -> Option<u32> {
35-
self.docs.iter().last().cloned()
34+
fn last_doc(&self) -> Option<u32> {
35+
self.docs.last().cloned()
3636
}
3737
fn is_empty(&self) -> bool {
3838
self.current().is_none()
@@ -112,15 +112,15 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
112112
finished_to_end = true;
113113
}
114114

115-
let last_value = self.loaded_docs.last_value();
115+
let last_doc = self.loaded_docs.last_doc();
116116
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
117117
self.column.get_docids_for_value_range(
118118
self.value_range.clone(),
119119
self.next_fetch_start..end,
120120
doc_buffer,
121121
);
122-
if let Some(last_value) = last_value {
123-
while self.loaded_docs.current() == Some(last_value) {
122+
if let Some(last_doc) = last_doc {
123+
while self.loaded_docs.current() == Some(last_doc) {
124124
self.loaded_docs.next();
125125
}
126126
}
@@ -136,7 +136,7 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
136136
if let Some(docid) = self.loaded_docs.next() {
137137
return docid;
138138
}
139-
if self.next_fetch_start >= self.column.values.num_vals() {
139+
if self.next_fetch_start >= self.column.num_docs() {
140140
return TERMINATED;
141141
}
142142
self.fetch_block();
@@ -177,3 +177,54 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
177177
0 // heuristic possible by checking number of hits when fetching a block
178178
}
179179
}
180+
181+
#[cfg(test)]
182+
mod tests {
183+
use crate::collector::Count;
184+
use crate::directory::RamDirectory;
185+
use crate::query::RangeQuery;
186+
use crate::{schema, Document, IndexBuilder};
187+
188+
#[test]
189+
fn range_query_fast_optional_field_minimum() {
190+
let mut schema_builder = schema::SchemaBuilder::new();
191+
let id_field = schema_builder.add_text_field("id", schema::STRING);
192+
let score_field = schema_builder.add_u64_field("score", schema::FAST | schema::INDEXED);
193+
194+
let dir = RamDirectory::default();
195+
let index = IndexBuilder::new()
196+
.schema(schema_builder.build())
197+
.open_or_create(dir)
198+
.unwrap();
199+
200+
{
201+
let mut writer = index.writer(15_000_000).unwrap();
202+
203+
let count = 1000;
204+
for i in 0..count {
205+
let mut doc = Document::new();
206+
doc.add_text(id_field, format!("doc{i}"));
207+
208+
let nb_scores = i % 2; // 0 or 1 scores
209+
for _ in 0..nb_scores {
210+
doc.add_u64(score_field, 80);
211+
}
212+
213+
writer.add_document(doc).unwrap();
214+
}
215+
writer.commit().unwrap();
216+
}
217+
218+
let reader = index.reader().unwrap();
219+
let searcher = reader.searcher();
220+
221+
let query = RangeQuery::new_u64_bounds(
222+
"score".to_string(),
223+
std::ops::Bound::Included(70),
224+
std::ops::Bound::Unbounded,
225+
);
226+
227+
let count = searcher.search(&query, &Count).unwrap();
228+
assert_eq!(count, 500);
229+
}
230+
}

0 commit comments

Comments
 (0)