Skip to content

Commit 62709b8

Browse files
authored May 19, 2023
Change in the query grammar. (#2050)
* Change in the query grammar. Quotation marks can now be used for phrase queries. The delimiter is part of the `UserInputLeaf`. That information is meant to be used in Quickwit to solve #3364. This PR also adds support for escaping quotation marks in phrase queries.
* Apply suggestions from code review
1 parent 04562c0 commit 62709b8

File tree

4 files changed

+148
-78
lines changed

4 files changed

+148
-78
lines changed
 

‎query-grammar/src/lib.rs

+3-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,9 @@ use combine::parser::Parser;
77

88
pub use crate::occur::Occur;
99
use crate::query_grammar::parse_to_ast;
10-
pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
10+
pub use crate::user_input_ast::{
11+
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
12+
};
1113

1214
pub struct Error;
1315

‎query-grammar/src/query_grammar.rs

+107-56
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,14 @@ use combine::parser::range::{take_while, take_while1};
55
use combine::parser::repeat::escaped;
66
use combine::parser::Parser;
77
use combine::{
8-
attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by,
8+
any, attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by,
99
skip_many1, value,
1010
};
1111
use once_cell::sync::Lazy;
1212
use regex::Regex;
1313

1414
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
15+
use crate::user_input_ast::Delimiter;
1516
use crate::Occur;
1617

1718
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
@@ -133,16 +134,41 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
133134
recognize((date, char('T'), time))
134135
}
135136

136-
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
137-
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
138-
negative_number().or(phrase.or(word()))
137+
fn escaped_character<'a>() -> impl Parser<&'a str, Output = char> {
138+
(char('\\'), any()).map(|(_, x)| x)
139+
}
140+
141+
fn escaped_string<'a>(delimiter: char) -> impl Parser<&'a str, Output = String> {
142+
(
143+
char(delimiter),
144+
many(choice((
145+
escaped_character(),
146+
satisfy(move |c: char| c != delimiter),
147+
))),
148+
char(delimiter),
149+
)
150+
.map(|(_, s, _)| s)
151+
}
152+
153+
fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
154+
let double_quotes = escaped_string('"').map(|phrase| (Delimiter::DoubleQuotes, phrase));
155+
let single_quotes = escaped_string('\'').map(|phrase| (Delimiter::SingleQuotes, phrase));
156+
let text_no_delimiter = word().map(|text| (Delimiter::None, text));
157+
negative_number()
158+
.map(|negative_number_str| (Delimiter::None, negative_number_str))
159+
.or(double_quotes)
160+
.or(single_quotes)
161+
.or(text_no_delimiter)
139162
}
140163

141164
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
142-
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
143-
field_name: Some(field_name),
144-
phrase,
145-
slop,
165+
(field_name(), term_val(), slop_val()).map(|(field_name, (delimiter, phrase), slop)| {
166+
UserInputLiteral {
167+
field_name: Some(field_name),
168+
phrase,
169+
delimiter,
170+
slop,
171+
}
146172
})
147173
}
148174

@@ -159,11 +185,13 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
159185
}
160186

161187
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
162-
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
163-
field_name: None,
164-
phrase,
165-
slop,
166-
});
188+
let term_default_field =
189+
(term_val(), slop_val()).map(|((delimiter, phrase), slop)| UserInputLiteral {
190+
field_name: None,
191+
phrase,
192+
delimiter,
193+
slop,
194+
});
167195

168196
attempt(term_query())
169197
.or(term_default_field)
@@ -268,7 +296,11 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
268296
/// Function that parses a set out of a Stream
269297
/// Supports ranges like: `IN [val1 val2 val3]`
270298
fn set<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
271-
let term_list = between(char('['), char(']'), sep_by(term_val(), spaces()));
299+
let term_list = between(
300+
char('['),
301+
char(']'),
302+
sep_by(term_val().map(|(_delimiter, text)| text), spaces()),
303+
);
272304

273305
let set_content = ((string("IN"), spaces()), term_list).map(|(_, elements)| elements);
274306

@@ -486,6 +518,7 @@ mod test {
486518
assert_eq!(remaining, "");
487519
}
488520

521+
#[track_caller]
489522
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
490523
let query = parse_to_ast().parse(query).unwrap().0;
491524
let query_str = format!("{query:?}");
@@ -504,8 +537,9 @@ mod test {
504537
#[test]
505538
fn test_parse_query_to_ast_hyphen() {
506539
test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\"");
507-
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
508-
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
540+
test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'");
541+
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
542+
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
509543
}
510544

511545
#[test]
@@ -514,25 +548,25 @@ mod test {
514548
format!("{:?}", parse_to_ast().parse("NOT")),
515549
"Err(UnexpectedParse)"
516550
);
517-
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
518-
test_parse_query_to_ast_helper("NOT a", "(-\"a\")");
551+
test_parse_query_to_ast_helper("NOTa", "NOTa");
552+
test_parse_query_to_ast_helper("NOT a", "(-a)");
519553
}
520554

521555
#[test]
522556
fn test_boosting() {
523557
assert!(parse_to_ast().parse("a^2^3").is_err());
524558
assert!(parse_to_ast().parse("a^2^").is_err());
525-
test_parse_query_to_ast_helper("a^3", "(\"a\")^3");
526-
test_parse_query_to_ast_helper("a^3 b^2", "(*(\"a\")^3 *(\"b\")^2)");
527-
test_parse_query_to_ast_helper("a^1", "\"a\"");
559+
test_parse_query_to_ast_helper("a^3", "(a)^3");
560+
test_parse_query_to_ast_helper("a^3 b^2", "(*(a)^3 *(b)^2)");
561+
test_parse_query_to_ast_helper("a^1", "a");
528562
}
529563

530564
#[test]
531565
fn test_parse_query_to_ast_binary_op() {
532-
test_parse_query_to_ast_helper("a AND b", "(+\"a\" +\"b\")");
533-
test_parse_query_to_ast_helper("a OR b", "(?\"a\" ?\"b\")");
534-
test_parse_query_to_ast_helper("a OR b AND c", "(?\"a\" ?(+\"b\" +\"c\"))");
535-
test_parse_query_to_ast_helper("a AND b AND c", "(+\"a\" +\"b\" +\"c\")");
566+
test_parse_query_to_ast_helper("a AND b", "(+a +b)");
567+
test_parse_query_to_ast_helper("a OR b", "(?a ?b)");
568+
test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))");
569+
test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)");
536570
assert_eq!(
537571
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
538572
"Err(UnexpectedParse)"
@@ -574,7 +608,7 @@ mod test {
574608
fn test_occur_leaf() {
575609
let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap();
576610
assert_eq!(occur, Some(Occur::Must));
577-
assert_eq!(format!("{ast:?}"), "\"abc\"");
611+
assert_eq!(format!("{ast:?}"), "abc");
578612
}
579613

580614
#[test]
@@ -728,56 +762,62 @@ mod test {
728762

729763
#[test]
730764
fn test_parse_query_to_triming_spaces() {
731-
test_parse_query_to_ast_helper(" abc", "\"abc\"");
732-
test_parse_query_to_ast_helper("abc ", "\"abc\"");
733-
test_parse_query_to_ast_helper("( a OR abc)", "(?\"a\" ?\"abc\")");
734-
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
735-
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
736-
test_parse_query_to_ast_helper("a OR abc ", "(?\"a\" ?\"abc\")");
737-
test_parse_query_to_ast_helper("(a OR abc )", "(?\"a\" ?\"abc\")");
738-
test_parse_query_to_ast_helper("(a OR abc) ", "(?\"a\" ?\"abc\")");
765+
test_parse_query_to_ast_helper(" abc", "abc");
766+
test_parse_query_to_ast_helper("abc ", "abc");
767+
test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");
768+
test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)");
769+
test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)");
770+
test_parse_query_to_ast_helper("a OR abc ", "(?a ?abc)");
771+
test_parse_query_to_ast_helper("(a OR abc )", "(?a ?abc)");
772+
test_parse_query_to_ast_helper("(a OR abc) ", "(?a ?abc)");
739773
}
740774

741775
#[test]
742776
fn test_parse_query_single_term() {
743-
test_parse_query_to_ast_helper("abc", "\"abc\"");
777+
test_parse_query_to_ast_helper("abc", "abc");
744778
}
745779

746780
#[test]
747781
fn test_parse_query_default_clause() {
748-
test_parse_query_to_ast_helper("a b", "(*\"a\" *\"b\")");
782+
test_parse_query_to_ast_helper("a b", "(*a *b)");
749783
}
750784

751785
#[test]
752786
fn test_parse_query_must_default_clause() {
753-
test_parse_query_to_ast_helper("+(a b)", "(*\"a\" *\"b\")");
787+
test_parse_query_to_ast_helper("+(a b)", "(*a *b)");
754788
}
755789

756790
#[test]
757791
fn test_parse_query_must_single_term() {
758-
test_parse_query_to_ast_helper("+d", "\"d\"");
792+
test_parse_query_to_ast_helper("+d", "d");
759793
}
760794

761795
#[test]
762796
fn test_single_term_with_field() {
763-
test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\"");
797+
test_parse_query_to_ast_helper("abc:toto", "\"abc\":toto");
798+
}
799+
800+
#[test]
801+
fn test_phrase_with_field() {
802+
test_parse_query_to_ast_helper("abc:\"happy tax payer\"", "\"abc\":\"happy tax payer\"");
803+
test_parse_query_to_ast_helper("abc:'happy tax payer'", "\"abc\":'happy tax payer'");
764804
}
765805

766806
#[test]
767807
fn test_single_term_with_float() {
768-
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\"");
769-
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\"");
770-
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\"");
808+
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":1.1");
809+
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":1.1");
810+
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":1.1");
771811
}
772812

773813
#[test]
774814
fn test_must_clause() {
775-
test_parse_query_to_ast_helper("(+a +b)", "(+\"a\" +\"b\")");
815+
test_parse_query_to_ast_helper("(+a +b)", "(+a +b)");
776816
}
777817

778818
#[test]
779819
fn test_parse_test_query_plus_a_b_plus_d() {
780-
test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")");
820+
test_parse_query_to_ast_helper("+(a b) +d", "(+(*a *b) +d)");
781821
}
782822

783823
#[test]
@@ -790,13 +830,13 @@ mod test {
790830

791831
#[test]
792832
fn test_parse_test_query_other() {
793-
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
794-
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\"");
795-
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\"");
796-
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")");
797-
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")");
833+
test_parse_query_to_ast_helper("(+a +b) d", "(*(+a +b) *d)");
834+
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":toto");
835+
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":toto");
836+
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":toto -titi)");
837+
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":toto)");
798838
test_is_parse_err("--abc:toto");
799-
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")");
839+
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)");
800840
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
801841
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
802842
}
@@ -821,11 +861,10 @@ mod test {
821861
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
822862
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
823863
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());
824-
825-
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
864+
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *~4)");
826865
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
827-
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
828-
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
866+
test_parse_query_to_ast_helper("~Document", "~Document");
867+
test_parse_query_to_ast_helper("a~2", "a~2");
829868
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
830869
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
831870
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
@@ -835,7 +874,19 @@ mod test {
835874

836875
#[test]
837876
fn test_not_queries_are_consistent() {
838-
test_parse_query_to_ast_helper("tata -toto", "(*\"tata\" -\"toto\")");
839-
test_parse_query_to_ast_helper("tata NOT toto", "(*\"tata\" -\"toto\")");
877+
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
878+
test_parse_query_to_ast_helper("tata NOT toto", "(*tata -toto)");
879+
}
880+
881+
#[test]
882+
fn test_escaping() {
883+
test_parse_query_to_ast_helper(
884+
r#"myfield:"hello\"happy\'tax""#,
885+
r#""myfield":"hello"happy'tax""#,
886+
);
887+
test_parse_query_to_ast_helper(
888+
r#"myfield:'hello\"happy\'tax'"#,
889+
r#""myfield":'hello"happy'tax'"#,
890+
);
840891
}
841892
}

‎query-grammar/src/user_input_ast.rs

+23-5
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ pub enum UserInputLeaf {
1919
}
2020

2121
impl Debug for UserInputLeaf {
22-
fn fmt(&self, formatter: &mut Formatter<'_>) -> Result<(), fmt::Error> {
22+
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
2323
match self {
2424
UserInputLeaf::Literal(literal) => literal.fmt(formatter),
2525
UserInputLeaf::Range {
@@ -40,11 +40,11 @@ impl Debug for UserInputLeaf {
4040
write!(formatter, "\"{field}\": ")?;
4141
}
4242
write!(formatter, "IN [")?;
43-
for (i, element) in elements.iter().enumerate() {
43+
for (i, text) in elements.iter().enumerate() {
4444
if i != 0 {
4545
write!(formatter, " ")?;
4646
}
47-
write!(formatter, "\"{element}\"")?;
47+
write!(formatter, "\"{text}\"")?;
4848
}
4949
write!(formatter, "]")
5050
}
@@ -53,19 +53,37 @@ impl Debug for UserInputLeaf {
5353
}
5454
}
5555

56+
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
57+
pub enum Delimiter {
58+
SingleQuotes,
59+
DoubleQuotes,
60+
None,
61+
}
62+
5663
#[derive(PartialEq)]
5764
pub struct UserInputLiteral {
5865
pub field_name: Option<String>,
5966
pub phrase: String,
67+
pub delimiter: Delimiter,
6068
pub slop: u32,
6169
}
6270

6371
impl fmt::Debug for UserInputLiteral {
64-
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
72+
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
6573
if let Some(ref field) = self.field_name {
6674
write!(formatter, "\"{field}\":")?;
6775
}
68-
write!(formatter, "\"{}\"", self.phrase)?;
76+
match self.delimiter {
77+
Delimiter::SingleQuotes => {
78+
write!(formatter, "'{}'", self.phrase)?;
79+
}
80+
Delimiter::DoubleQuotes => {
81+
write!(formatter, "\"{}\"", self.phrase)?;
82+
}
83+
Delimiter::None => {
84+
write!(formatter, "{}", self.phrase)?;
85+
}
86+
}
6987
if self.slop > 0 {
7088
write!(formatter, "~{}", self.slop)?;
7189
}

‎src/query/query_parser/query_parser.rs

+15-16
Original file line number | Diff line number | Diff line change
@@ -129,24 +129,22 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
129129
///
130130
/// The language covered by the current parser is extremely simple.
131131
///
132-
/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using tantivy's
133-
/// [`SimpleTokenizer`](crate::tokenizer::SimpleTokenizer), hence becoming `["barack", "obama"]`.
134-
/// The terms are then searched within the default terms of the query parser.
132+
/// * simple terms: "e.g.: `Barack Obama` will be seen as a sequence of two tokens Barack and Obama.
133+
/// By default, the query parser will interpret this as a disjunction (see
134+
/// `.set_conjunction_by_default()`) and will match all documents that contains either "Barack" or
135+
/// "Obama" or both. Since we did not target a specific field, the query parser will look into the
136+
/// so-called default fields (as set up in the constructor).
135137
///
136-
/// e.g. If `body` and `title` are default fields, our example terms are
137-
/// `["title:barack", "body:barack", "title:obama", "body:obama"]`.
138+
/// Assuming that the default fields are `body` and `title`, and the query parser is set with
139+
/// conjunction as a default, our query will be interpreted as.
140+
/// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`.
138141
/// By default, all tokenized and indexed fields are default fields.
139142
///
140-
/// Multiple terms are handled as an `OR` : any document containing at least
141-
/// one of the term will go through the scoring.
142-
///
143-
/// This behavior is slower, but is not a bad idea if the user is sorting
144-
/// by relevance : The user typically just scans through the first few
145-
/// documents in order of decreasing relevance and will stop when the documents
146-
/// are not relevant anymore.
147-
///
148-
/// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`.
149-
///
143+
/// It is possible to explicitly target a field by prefixing the text by the `fieldname:`.
144+
/// Note this only applies to the term directly following.
145+
/// For instance, assuming the query parser is configured to use conjunction by default,
146+
/// `body:Barack Obama` is not interpreted as `body:Barack AND body:Obama` but as
147+
/// `body:Barack OR (body:Barack OR text:Obama)` .
150148
///
151149
/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
152150
/// interpreted
@@ -165,7 +163,8 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
165163
///
166164
/// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. e.g.,
167165
/// `title:"Barack Obama"` will only find documents that have "barack" immediately followed by
168-
/// "obama".
166+
/// "obama". Single quotes can also be used. If the text to be searched contains quotation mark,
167+
/// it is possible to escape them with a \.
169168
///
170169
/// * range terms: Range searches can be done by specifying the start and end bound. These can be
171170
/// inclusive or exclusive. e.g., `title:[a TO c}` will find all documents whose title contains a

0 commit comments

Comments (0)
Please sign in to comment.