Expose phrase-prefix queries via the built-in query parser

adamreichold · adamreichold · commit 557af61b793c · 2023-05-19T07:49:23.000+02:00
This proposes the less-than-imaginative syntax `field:"phrase ter"*` to
perform a phrase prefix query against `field` using `phrase` and `ter` as the
terms. The aim of this is to make this type of query more discoverable and
simplify manual testing.

I did consider exposing the `max_expansions` parameter similar to how slop is
handled, but I think that this is rather something that should be configured via
the query parser (similar to `set_field_boost` and `set_field_fuzzy`) as
choosing it requires rather intimate knowledge of the backing index.
diff --git a/query-grammar/src/query_grammar.rs b/query-grammar/src/query_grammar.rs
@@ -162,14 +162,22 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
 }
 
 fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
-    (field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
-        UserInputLiteral {
+    (field_name(), term_val(), slop_or_prefix_val()).map(
+        |(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
             field_name: Some(field_name),
             phrase,
             delimiter,
             slop,
-        }
-    })
+            prefix,
+        },
+    )
+}
+
+fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
+    let prefix_val = char('*').map(|_ast| (0, true));
+    let slop_val = slop_val().map(|slop| (slop, false));
+
+    prefix_val.or(slop_val)
 }
 
 fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
@@ -186,11 +194,14 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
 
 fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
     let term_default_field =
-        (term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
-            field_name: None,
-            phrase,
-            delimiter,
-            slop,
+        (term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
+            UserInputLiteral {
+                field_name: None,
+                phrase,
+                delimiter,
+                slop,
+                prefix,
+            }
         });
 
     attempt(term_query())
@@ -872,6 +883,12 @@ mod test {
         test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
     }
 
+    #[test]
+    fn test_phrase_prefix() {
+        test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
+        test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
+    }
+
     #[test]
     fn test_not_queries_are_consistent() {
         test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
diff --git a/query-grammar/src/user_input_ast.rs b/query-grammar/src/user_input_ast.rs
@@ -66,6 +66,7 @@ pub struct UserInputLiteral {
     pub phrase: String,
     pub delimiter: Delimiter,
     pub slop: u32,
+    pub prefix: bool,
 }
 
 impl fmt::Debug for UserInputLiteral {
@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {
         }
         if self.slop > 0 {
             write!(formatter, "~{}", self.slop)?;
+        } else if self.prefix {
+            write!(formatter, "*")?;
         }
         Ok(())
     }
diff --git a/src/query/query_parser/logical_ast.rs b/src/query/query_parser/logical_ast.rs
@@ -8,7 +8,11 @@ use crate::Score;
 #[derive(Clone)]
 pub enum LogicalLiteral {
     Term(Term),
-    Phrase(Vec<(usize, Term)>, u32),
+    Phrase {
+        terms: Vec<(usize, Term)>,
+        slop: u32,
+        prefix: bool,
+    },
     Range {
         field: String,
         value_type: Type,
@@ -79,10 +83,16 @@ impl fmt::Debug for LogicalLiteral {
     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
         match *self {
             LogicalLiteral::Term(ref term) => write!(formatter, "{term:?}"),
-            LogicalLiteral::Phrase(ref terms, slop) => {
+            LogicalLiteral::Phrase {
+                ref terms,
+                slop,
+                prefix,
+            } => {
                 write!(formatter, "\"{terms:?}\"")?;
                 if slop > 0 {
                     write!(formatter, "~{slop:?}")
+                } else if prefix {
+                    write!(formatter, "*")
                 } else {
                     Ok(())
                 }
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
@@ -15,17 +15,8 @@ use crate::core::json_utils::{
 use crate::core::Index;
 use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
 use crate::query::{
-    AllQuery,
-    BooleanQuery,
-    BoostQuery,
-    EmptyQuery,
-    FuzzyTermQuery,
-    Occur,
-    PhraseQuery,
-    Query,
-    // RangeQuery,
-    TermQuery,
-    TermSetQuery,
+    AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhrasePrefixQuery,
+    PhraseQuery, Query, TermQuery, TermSetQuery,
 };
 use crate::schema::{
     Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -194,6 +185,10 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 ///
 /// Phrase terms support the `~` slop operator which allows to set the phrase's matching
 /// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
+///
+/// Phrase terms also support the `*` prefix operator which switches the phrase's matching
+/// to consider all documents which contain the last term as a prefix, e.g. `"big bad wo"*` will
+/// match `"big bad wolf"`.
 #[derive(Clone)]
 pub struct QueryParser {
     schema: Schema,
@@ -446,6 +441,7 @@ impl QueryParser {
         json_path: &str,
         phrase: &str,
         slop: u32,
+        prefix: bool,
     ) -> Result<Vec<LogicalLiteral>, QueryParserError> {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
@@ -503,6 +499,7 @@ impl QueryParser {
                     field,
                     phrase,
                     slop,
+                    prefix,
                     &text_analyzer,
                     index_record_option,
                 )?
@@ -661,9 +658,13 @@ impl QueryParser {
                     self.compute_path_triplets_for_literal(&literal)?;
                 let mut asts: Vec<LogicalAst> = Vec::new();
                 for (field, json_path, phrase) in term_phrases {
-                    for ast in
-                        self.compute_logical_ast_for_leaf(field, json_path, phrase, literal.slop)?
-                    {
+                    for ast in self.compute_logical_ast_for_leaf(
+                        field,
+                        json_path,
+                        phrase,
+                        literal.slop,
+                        literal.prefix,
+                    )? {
                         // Apply some field specific boost defined at the query parser level.
                         let boost = self.field_boost(field);
                         asts.push(LogicalAst::Leaf(Box::new(ast)).boost(boost));
@@ -753,9 +754,17 @@ fn convert_literal_to_query(
                 Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
             }
         }
-        LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
-            PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
-        ),
+        LogicalLiteral::Phrase {
+            terms,
+            slop,
+            prefix,
+        } => {
+            if prefix {
+                Box::new(PhrasePrefixQuery::new_with_offset(terms))
+            } else {
+                Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
+            }
+        }
         LogicalLiteral::Range {
             field,
             value_type,
@@ -774,6 +783,7 @@ fn generate_literals_for_str(
     field: Field,
     phrase: &str,
     slop: u32,
+    prefix: bool,
     text_analyzer: &TextAnalyzer,
     index_record_option: IndexRecordOption,
 ) -> Result<Option<LogicalLiteral>, QueryParserError> {
@@ -795,7 +805,11 @@ fn generate_literals_for_str(
             field_name.to_string(),
         ));
     }
-    Ok(Some(LogicalLiteral::Phrase(terms, slop)))
+    Ok(Some(LogicalLiteral::Phrase {
+        terms,
+        slop,
+        prefix,
+    }))
 }
 
 fn generate_literals_for_json_object(
@@ -841,7 +855,11 @@ fn generate_literals_for_json_object(
             field_name.to_string(),
         ));
     }
-    logical_literals.push(LogicalLiteral::Phrase(terms, 0));
+    logical_literals.push(LogicalLiteral::Phrase {
+        terms,
+        slop: 0,
+        prefix: false,
+    });
     Ok(logical_literals)
 }
 
@@ -1643,6 +1661,27 @@ mod test {
         );
     }
 
+    #[test]
+    pub fn test_phrase_prefix() {
+        test_parse_query_to_logical_ast_helper(
+            "\"big bad wo\"*",
+            r#"("[(0, Term(field=0, type=Str, "big")), (1, Term(field=0, type=Str, "bad")), (2, Term(field=0, type=Str, "wo"))]"* "[(0, Term(field=1, type=Str, "big")), (1, Term(field=1, type=Str, "bad")), (2, Term(field=1, type=Str, "wo"))]"*)"#,
+            false,
+        );
+
+        let query_parser = make_query_parser();
+        let query = query_parser.parse_query("\"big bad wo\"*").unwrap();
+        assert_eq!(
+            format!("{query:?}"),
+            "BooleanQuery { subqueries: [(Should, PhrasePrefixQuery { field: Field(0), \
+             phrase_terms: [(0, Term(field=0, type=Str, \"big\")), (1, Term(field=0, type=Str, \
+             \"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
+             (Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
+             type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
+             Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
+        );
+    }
+
     #[test]
     pub fn test_term_set_query() {
         test_parse_query_to_logical_ast_helper(

Original file line number	Diff line number	Diff line change
`@@ -66,6 +66,7 @@ pub struct UserInputLiteral {`
`66`	`66`	`pub phrase: String,`
`67`	`67`	`pub delimiter: Delimiter,`
`68`	`68`	`pub slop: u32,`
	`69`	`+ pub prefix: bool,`
`69`	`70`	`}`
`70`	`71`
`71`	`72`	`impl fmt::Debug for UserInputLiteral {`
`@@ -86,6 +87,8 @@ impl fmt::Debug for UserInputLiteral {`
`86`	`87`	`}`
`87`	`88`	`if self.slop > 0 {`
`88`	`89`	`write!(formatter, "~{}", self.slop)?;`
	`90`	`+ } else if self.prefix {`
	`91`	`+ write!(formatter, "*")?;`
`89`	`92`	`}`
`90`	`93`	`Ok(())`
`91`	`94`	`}`