Skip to content

Commit b325d56

Browse files
authored
Expose phrase-prefix queries via the built-in query parser (#2044)
* Expose phrase-prefix queries via the built-in query parser. This proposes the less-than-imaginative syntax `field:"phrase ter"*` to perform a phrase prefix query against `field` using `phrase` and `ter` as the terms. The aim of this is to make this type of query more discoverable and simplify manual testing. I did consider exposing the `max_expansions` parameter similar to how slop is handled, but I think that this is rather something that should be configured via the query parser (similar to `set_field_boost` and `set_field_fuzzy`) as choosing it requires rather intimate knowledge of the backing index. * Prevent construction of zero or one term phrase-prefix queries via the query parser. * Add example using phrase-prefix search via surface API to improve feature discoverability.
1 parent 7ee78bd commit b325d56

File tree

6 files changed

+232
-46
lines changed

6 files changed

+232
-46
lines changed

examples/phrase_prefix_search.rs

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
use tantivy::collector::TopDocs;
2+
use tantivy::query::QueryParser;
3+
use tantivy::schema::*;
4+
use tantivy::{doc, Index, ReloadPolicy, Result};
5+
use tempfile::TempDir;
6+
7+
fn main() -> Result<()> {
8+
let index_path = TempDir::new()?;
9+
10+
let mut schema_builder = Schema::builder();
11+
schema_builder.add_text_field("title", TEXT | STORED);
12+
schema_builder.add_text_field("body", TEXT);
13+
let schema = schema_builder.build();
14+
15+
let title = schema.get_field("title").unwrap();
16+
let body = schema.get_field("body").unwrap();
17+
18+
let index = Index::create_in_dir(&index_path, schema)?;
19+
20+
let mut index_writer = index.writer(50_000_000)?;
21+
22+
index_writer.add_document(doc!(
23+
title => "The Old Man and the Sea",
24+
body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
25+
eighty-four days now without taking a fish.",
26+
))?;
27+
28+
index_writer.add_document(doc!(
29+
title => "Of Mice and Men",
30+
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
31+
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
32+
over the yellow sands in the sunlight before reaching the narrow pool. On one \
33+
side of the river the golden foothill slopes curve up to the strong and rocky \
34+
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
35+
fresh and green with every spring, carrying in their lower leaf junctures the \
36+
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
37+
limbs and branches that arch over the pool"
38+
))?;
39+
40+
// Multivalued fields just need to be repeated.
41+
index_writer.add_document(doc!(
42+
title => "Frankenstein",
43+
title => "The Modern Prometheus",
44+
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
45+
enterprise which you have regarded with such evil forebodings. I arrived here \
46+
yesterday, and my first task is to assure my dear sister of my welfare and \
47+
increasing confidence in the success of my undertaking."
48+
))?;
49+
50+
index_writer.commit()?;
51+
52+
let reader = index
53+
.reader_builder()
54+
.reload_policy(ReloadPolicy::OnCommit)
55+
.try_into()?;
56+
57+
let searcher = reader.searcher();
58+
59+
let query_parser = QueryParser::for_index(&index, vec![title, body]);
60+
// This will match documents containing the phrase "in the"
61+
// followed by some word starting with "su",
62+
// i.e. it will match "in the sunlight" and "in the success",
63+
// but not "in the Gulf Stream".
64+
let query = query_parser.parse_query("\"in the su\"*")?;
65+
66+
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
67+
let mut titles = top_docs
68+
.into_iter()
69+
.map(|(_score, doc_address)| {
70+
let doc = searcher.doc(doc_address)?;
71+
let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
72+
Ok(title)
73+
})
74+
.collect::<Result<Vec<_>>>()?;
75+
titles.sort_unstable();
76+
assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);
77+
78+
Ok(())
79+
}

query-grammar/src/query_grammar.rs

+30-9
Original file line numberDiff line numberDiff line change
@@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
162162
}
163163

164164
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
165-
(field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
166-
UserInputLiteral {
165+
(field_name(), term_val(), slop_or_prefix_val()).map(
166+
|(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
167167
field_name: Some(field_name),
168168
phrase,
169169
delimiter,
170170
slop,
171-
}
172-
})
171+
prefix,
172+
},
173+
)
174+
}
175+
176+
fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
177+
let prefix_val = char('*').map(|_ast| (0, true));
178+
let slop_val = slop_val().map(|slop| (slop, false));
179+
180+
prefix_val.or(slop_val)
173181
}
174182

175183
fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
@@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
186194

187195
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
188196
let term_default_field =
189-
(term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
190-
field_name: None,
191-
phrase,
192-
delimiter,
193-
slop,
197+
(term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
198+
UserInputLiteral {
199+
field_name: None,
200+
phrase,
201+
delimiter,
202+
slop,
203+
prefix,
204+
}
194205
});
195206

196207
attempt(term_query())
@@ -872,6 +883,16 @@ mod test {
872883
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
873884
}
874885

886+
#[test]
887+
fn test_phrase_prefix() {
888+
test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
889+
test_parse_query_to_ast_helper("\"a\"*", "\"a\"*");
890+
test_parse_query_to_ast_helper("\"\"*", "\"\"*");
891+
test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
892+
test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*");
893+
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
894+
}
895+
875896
#[test]
876897
fn test_not_queries_are_consistent() {
877898
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");

query-grammar/src/user_input_ast.rs

+3
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ pub struct UserInputLiteral {
6666
pub phrase: String,
6767
pub delimiter: Delimiter,
6868
pub slop: u32,
69+
pub prefix: bool,
6970
}
7071

7172
impl fmt::Debug for UserInputLiteral {
@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {
8687
}
8788
if self.slop > 0 {
8889
write!(formatter, "~{}", self.slop)?;
90+
} else if self.prefix {
91+
write!(formatter, "*")?;
8992
}
9093
Ok(())
9194
}

src/query/phrase_prefix_query/phrase_prefix_query.rs

-3
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,6 @@ impl PhrasePrefixQuery {
8888
/// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
8989
/// If the query was only one term long, this returns `None` whereas [`Query::weight`]
9090
/// returns a boxed [`RangeWeight`]
91-
///
92-
/// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
93-
/// built with a single term.
9491
pub(crate) fn phrase_prefix_query_weight(
9592
&self,
9693
enable_scoring: EnableScoring<'_>,

src/query/query_parser/logical_ast.rs

+12-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@ use crate::Score;
88
#[derive(Clone)]
99
pub enum LogicalLiteral {
1010
Term(Term),
11-
Phrase(Vec<(usize, Term)>, u32),
11+
Phrase {
12+
terms: Vec<(usize, Term)>,
13+
slop: u32,
14+
prefix: bool,
15+
},
1216
Range {
1317
field: String,
1418
value_type: Type,
@@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral {
7983
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
8084
match *self {
8185
LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"),
82-
LogicalLiteral::Phrase(ref terms, slop) => {
86+
LogicalLiteral::Phrase {
87+
ref terms,
88+
slop,
89+
prefix,
90+
} => {
8391
write!(formatter, "\"{terms:?}\"")?;
8492
if slop > 0 {
8593
write!(formatter, "~{slop:?}")
94+
} else if prefix {
95+
write!(formatter, "*")
8696
} else {
8797
Ok(())
8898
}

0 commit comments

Comments
 (0)