Skip to content

Commit 55b2d08

Browse files
ochafik and HanClinto authored Jun 6, 2024
grammars: x{min,max} repetition operator (#6640)
* grammars: x{min,max} repetition operator + tweak +/*/? to avoid duplication of original over alternates * grammars: handle `x{n}` and fix `x{n,n}` * grammars: document new repetition operators * grammars: uniform use of int for min & max * grammars: refactor parser test * grammar: parsing tests w/ natural pretty print of updated expectations * grammars: much prettier print of expectations (+ TEST_GRAMMAR_PARSER_PRINT_ALL=1 to force all) * grammars: improve test pretty print again * grammars: pretty print rules and chars * grammars: fix copy rule skipping * grammars: disallow `a{,}` (not allowed in regexps) * Update common/grammar-parser.cpp Co-authored-by: Clint Herron <[email protected]> * grammars: fix copy rule skipping (again) & display of expectations * grammars: more test cases * grammars: update reps parsing to bring ? / * / + closer to before * json: use new GBNF repetitions{m,n} syntax * grammars: update performance gotchas w/ repetition advice * Update examples/json_schema_to_grammar.py Co-authored-by: Clint Herron <[email protected]> * Update examples/server/public/json-schema-to-grammar.mjs Co-authored-by: Clint Herron <[email protected]> * grammars: comment on rule repetitions * grammars: ensure unambiguous number alternatives * grammar: nit typo switched error msgs * grammar: nit numbering in comment * json: update numeric rule to be unambiguous * Apply suggestions from code review Co-authored-by: Clint Herron <[email protected]> * Update examples/server/public/json-schema-to-grammar.mjs Co-authored-by: Clint Herron <[email protected]> * json: fix integral-part * grammar: add repetition tests --------- Co-authored-by: Clint Herron <[email protected]>
1 parent f5d7b26 commit 55b2d08

9 files changed

+726
-408
lines changed
 

‎common/grammar-parser.cpp

+107-31
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,12 @@ namespace grammar_parser {
4646
state.rules[rule_id] = rule;
4747
}
4848

49+
static bool is_digit_char(char c) {
50+
return '0' <= c && c <= '9';
51+
}
52+
4953
static bool is_word_char(char c) {
50-
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
54+
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
5155
}
5256

5357
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -99,6 +103,17 @@ namespace grammar_parser {
99103
return pos;
100104
}
101105

106+
static const char * parse_int(const char * src) {
107+
const char * pos = src;
108+
while (is_digit_char(*pos)) {
109+
pos++;
110+
}
111+
if (pos == src) {
112+
throw std::runtime_error(std::string("expecting integer at ") + src);
113+
}
114+
return pos;
115+
}
116+
102117
static std::pair<uint32_t, const char *> parse_char(const char * src) {
103118
if (*src == '\\') {
104119
switch (src[1]) {
@@ -137,6 +152,60 @@ namespace grammar_parser {
137152
bool is_nested) {
138153
size_t last_sym_start = out_elements.size();
139154
const char * pos = src;
155+
156+
auto handle_repetitions = [&](int min_times, int max_times) {
157+
158+
if (last_sym_start == out_elements.size()) {
159+
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
160+
}
161+
162+
// apply transformation to previous symbol (last_sym_start to end) according to
163+
// the following rewrite rules:
164+
// S{m,n} --> S S S (m times) S'(n-m)
165+
// S'(x) ::= S S'(x-1) |
166+
// (... n-m definitions of these S' rules ...)
167+
// S'(1) ::= S |
168+
// S{m,} --> S S S (m times) S'
169+
// S' ::= S S' |
170+
// S* --> S{0,}
171+
// --> S' ::= S S' |
172+
// S+ --> S{1,}
173+
// --> S S'
174+
// S' ::= S S' |
175+
// S? --> S{0,1}
176+
// --> S'
177+
// S' ::= S |
178+
179+
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
180+
if (min_times == 0) {
181+
out_elements.resize(last_sym_start);
182+
} else {
183+
// Repeat the previous elements (min_times - 1) times
184+
for (int i = 1; i < min_times; i++) {
185+
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
186+
}
187+
}
188+
189+
uint32_t last_rec_rule_id = 0;
190+
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
191+
192+
std::vector<llama_grammar_element> rec_rule(previous_elements);
193+
for (int i = 0; i < n_opt; i++) {
194+
rec_rule.resize(previous_elements.size());
195+
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
196+
if (i > 0 || max_times < 0) {
197+
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
198+
}
199+
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
200+
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
201+
add_rule(state, rec_rule_id, rec_rule);
202+
last_rec_rule_id = rec_rule_id;
203+
}
204+
if (n_opt > 0) {
205+
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
206+
}
207+
};
208+
140209
while (*pos) {
141210
if (*pos == '"') { // literal string
142211
pos++;
@@ -197,40 +266,47 @@ namespace grammar_parser {
197266
throw std::runtime_error(std::string("expecting ')' at ") + pos);
198267
}
199268
pos = parse_space(pos + 1, is_nested);
200-
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
201-
if (last_sym_start == out_elements.size()) {
202-
throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
203-
}
269+
} else if (*pos == '*') {
270+
pos = parse_space(pos + 1, is_nested);
271+
handle_repetitions(0, -1);
272+
} else if (*pos == '+') {
273+
pos = parse_space(pos + 1, is_nested);
274+
handle_repetitions(1, -1);
275+
} else if (*pos == '?') {
276+
pos = parse_space(pos + 1, is_nested);
277+
handle_repetitions(0, 1);
278+
} else if (*pos == '{') {
279+
pos = parse_space(pos + 1, is_nested);
204280

205-
// apply transformation to previous symbol (last_sym_start to end) according to
206-
// rewrite rules:
207-
// S* --> S' ::= S S' |
208-
// S+ --> S' ::= S S' | S
209-
// S? --> S' ::= S |
210-
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
211-
std::vector<llama_grammar_element> sub_rule;
212-
// add preceding symbol to generated rule
213-
sub_rule.insert(
214-
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
215-
if (*pos == '*' || *pos == '+') {
216-
// cause generated rule to recurse
217-
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
218-
}
219-
// mark start of alternate def
220-
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
221-
if (*pos == '+') {
222-
// add preceding symbol as alternate only for '+' (otherwise empty)
223-
sub_rule.insert(
224-
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
281+
if (!is_digit_char(*pos)) {
282+
throw std::runtime_error(std::string("expecting an int at ") + pos);
225283
}
226-
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
227-
add_rule(state, sub_rule_id, sub_rule);
284+
const char * int_end = parse_int(pos);
285+
int min_times = std::stoul(std::string(pos, int_end - pos));
286+
pos = parse_space(int_end, is_nested);
228287

229-
// in original rule, replace previous symbol with reference to generated rule
230-
out_elements.resize(last_sym_start);
231-
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
288+
int max_times = -1;
232289

233-
pos = parse_space(pos + 1, is_nested);
290+
if (*pos == '}') {
291+
max_times = min_times;
292+
pos = parse_space(pos + 1, is_nested);
293+
} else if (*pos == ',') {
294+
pos = parse_space(pos + 1, is_nested);
295+
296+
if (is_digit_char(*pos)) {
297+
const char * int_end = parse_int(pos);
298+
max_times = std::stoul(std::string(pos, int_end - pos));
299+
pos = parse_space(int_end, is_nested);
300+
}
301+
302+
if (*pos != '}') {
303+
throw std::runtime_error(std::string("expecting '}' at ") + pos);
304+
}
305+
pos = parse_space(pos + 1, is_nested);
306+
} else {
307+
throw std::runtime_error(std::string("expecting ',' at ") + pos);
308+
}
309+
handle_repetitions(min_times, max_times);
234310
} else {
235311
break;
236312
}

‎common/json-schema-to-grammar.cpp

+20-58
Original file line numberDiff line numberDiff line change
@@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
1616

1717
static std::string repeat(const std::string & str, size_t n);
1818

19-
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
20-
if (separator_rule.empty()) {
21-
if (min_items == 0 && max_items == 1) {
22-
return item_rule + "?";
23-
} else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
24-
return item_rule + "+";
25-
}
26-
}
19+
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
20+
auto has_max = max_items != std::numeric_limits<int>::max();
2721

28-
std::string result;
29-
if (min_items > 0) {
30-
if (item_rule_is_literal && separator_rule.empty()) {
31-
result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
32-
} else {
33-
std::vector<std::string> items(min_items, item_rule);
34-
result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
35-
}
22+
if (min_items == 0 && max_items == 1) {
23+
return item_rule + "?";
3624
}
3725

38-
std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
39-
auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
40-
41-
if (up_to_n == 0) {
42-
return "";
43-
} else if (up_to_n == 1) {
44-
return "(" + content + ")?";
45-
} else if (!separator_rule.empty() && !prefix_with_sep) {
46-
return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
26+
if (separator_rule.empty()) {
27+
if (min_items == 1 && !has_max) {
28+
return item_rule + "+";
29+
} else if (min_items == 0 && !has_max) {
30+
return item_rule + "*";
4731
} else {
48-
std::string res = repeat("(" + content + " ", up_to_n);
49-
// strip trailing space
50-
res = res.substr(0, res.length() - 1);
51-
res += repeat(")?", up_to_n);
52-
return res;
32+
return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
5333
}
54-
};
55-
56-
if (min_items > 0 && max_items != min_items) {
57-
result += " ";
5834
}
5935

60-
if (max_items != std::numeric_limits<int>::max()) {
61-
result += opt_repetitions(max_items - min_items, min_items > 0);
62-
} else {
63-
std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
64-
if (min_items == 0 && !separator_rule.empty()) {
65-
result = "(" + item_rule + " " + item_operator + "*)?";
66-
} else {
67-
result += item_operator + "*";
68-
}
36+
auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
37+
if (min_items == 0) {
38+
result = "(" + result + ")?";
6939
}
70-
7140
return result;
7241
}
7342

@@ -78,30 +47,24 @@ struct BuiltinRule {
7847
std::vector<std::string> deps;
7948
};
8049

81-
const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
82-
8350
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
8451
{"boolean", {"(\"true\" | \"false\") space", {}}},
85-
{"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
86-
{"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
52+
{"decimal-part", {"[0-9]{1,16}", {}}},
53+
{"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
8754
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
8855
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
8956
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
9057
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
9158
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
92-
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
93-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
94-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
95-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
96-
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
97-
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
59+
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
60+
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
9861
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
9962
{"null", {"\"null\" space", {}}},
10063
};
10164

10265
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
103-
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
104-
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
66+
{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
67+
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
10568
{"date-time", {"date \"T\" time", {"date", "time"}}},
10669
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
10770
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -385,8 +348,7 @@ class SchemaConverter {
385348
sub_is_literal ? "\"" + sub + "\"" : sub,
386349
min_times,
387350
max_times,
388-
"",
389-
sub_is_literal
351+
""
390352
);
391353
seq.back().second = false;
392354
} else {

‎examples/json_schema_to_grammar.py

+18-50
Original file line numberDiff line numberDiff line change
@@ -6,84 +6,52 @@
66
import sys
77
from typing import Any, Dict, List, Set, Tuple, Union
88

9-
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
10-
if not separator_rule:
11-
if min_items == 0 and max_items == 1:
12-
return f'{item_rule}?'
13-
elif min_items == 1 and max_items is None:
14-
return f'{item_rule}+'
15-
16-
result = ''
17-
18-
if min_items > 0:
19-
if item_rule_is_literal and separator_rule is None:
20-
result = '"' + (item_rule[1:-1] * min_items) + '"'
21-
else:
22-
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
23-
24-
def opt_repetitions(up_to_n, prefix_with_sep=False):
25-
'''
26-
- n=4, no sep: '(a (a (a (a)?)?)?)?'
27-
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
28-
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
29-
'''
30-
31-
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
32-
if up_to_n == 0:
33-
return ''
34-
elif up_to_n == 1:
35-
return f'({content})?'
36-
elif separator_rule and not prefix_with_sep:
37-
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
38-
else:
39-
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
409

41-
if min_items > 0 and max_items != min_items:
42-
result += ' '
10+
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
4311

44-
if max_items is not None:
45-
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
46-
else:
47-
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
12+
if min_items == 0 and max_items == 1:
13+
return f'{item_rule}?'
4814

49-
if min_items == 0 and separator_rule:
50-
result = f'({item_rule} {item_operator}*)?'
15+
if not separator_rule:
16+
if min_items == 1 and max_items is None:
17+
return f'{item_rule}+'
18+
elif min_items == 0 and max_items is None:
19+
return f'{item_rule}*'
5120
else:
52-
result += f'{item_operator}*'
21+
return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
5322

54-
return result
23+
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
24+
return f'({result})?' if min_items == 0 else result
5525

5626

5727
class BuiltinRule:
5828
def __init__(self, content: str, deps: list = None):
5929
self.content = content
6030
self.deps = deps or []
6131

62-
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
63-
6432
# whitespace is constrained to a single space char to prevent model "running away" in
6533
# whitespace. Also maybe improves generation quality?
6634
SPACE_RULE = '" "?'
6735

6836
PRIMITIVE_RULES = {
6937
'boolean' : BuiltinRule('("true" | "false") space', []),
70-
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
71-
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
38+
'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
39+
'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
7240
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
7341
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
7442
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
7543
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
7644
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
77-
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
78-
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
45+
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
46+
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
7947
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
8048
'null' : BuiltinRule('"null" space', []),
8149
}
8250

8351
# TODO: support "uri", "email" string formats
8452
STRING_FORMAT_RULES = {
85-
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
86-
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
53+
'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
54+
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
8755
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
8856
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
8957
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
@@ -333,7 +301,7 @@ def join_seq():
333301
sub_rule_ids[sub] = id
334302
sub = id
335303

336-
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
304+
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
337305
else:
338306
literal = ''
339307
while i < length:

‎examples/pydantic_models_to_grammar.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -624,7 +624,7 @@ def get_primitive_grammar(grammar):
624624
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
625625
)* "\"" ws
626626
ws ::= ([ \t\n] ws)?
627-
float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
627+
float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
628628
629629
integer ::= [0-9]+"""
630630

‎examples/server/public/json-schema-to-grammar.mjs

+17-50
Original file line numberDiff line numberDiff line change
@@ -2,57 +2,26 @@
22
const SPACE_RULE = '" "?';
33

44
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
5+
if (minItems === 0 && maxItems === 1) {
6+
return `${itemRule}?`;
7+
}
8+
9+
510
const separatorRule = opts.separatorRule ?? '';
611
const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
712

813
if (separatorRule === '') {
9-
if (minItems === 0 && maxItems === 1) {
10-
return `${itemRule}?`;
11-
} else if (minItems === 1 && maxItems === undefined) {
14+
if (minItems === 1 && maxItems === undefined) {
1215
return `${itemRule}+`;
13-
}
14-
}
15-
16-
let result = '';
17-
if (minItems > 0) {
18-
if (itemRuleIsLiteral && separatorRule === '') {
19-
result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
16+
} else if (minItems === 0 && maxItems === undefined) {
17+
return `${itemRule}*`;
2018
} else {
21-
result = Array.from({ length: minItems }, () => itemRule)
22-
.join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
19+
return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`;
2320
}
2421
}
2522

26-
const optRepetitions = (upToN, prefixWithSep=false) => {
27-
const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
28-
if (upToN === 0) {
29-
return '';
30-
} else if (upToN === 1) {
31-
return `(${content})?`;
32-
} else if (separatorRule !== '' && !prefixWithSep) {
33-
return `(${content} ${optRepetitions(upToN - 1, true)})?`;
34-
} else {
35-
return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
36-
}
37-
};
38-
39-
if (minItems > 0 && maxItems !== minItems) {
40-
result += ' ';
41-
}
42-
43-
if (maxItems !== undefined) {
44-
result += optRepetitions(maxItems - minItems, minItems > 0);
45-
} else {
46-
const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
47-
48-
if (minItems === 0 && separatorRule !== '') {
49-
result = `(${itemRule} ${itemOperator}*)?`;
50-
} else {
51-
result += `${itemOperator}*`;
52-
}
53-
}
54-
55-
return result;
23+
const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined);
24+
return minItems === 0 ? `(${result})?` : result;
5625
}
5726

5827
class BuiltinRule {
@@ -62,27 +31,25 @@ class BuiltinRule {
6231
}
6332
}
6433

65-
const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
66-
6734
const PRIMITIVE_RULES = {
6835
boolean : new BuiltinRule('("true" | "false") space', []),
69-
'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
70-
'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
36+
'decimal-part' : new BuiltinRule('[0-9]{1,16}', []),
37+
'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
7138
number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
7239
integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
7340
value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
7441
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
7542
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
76-
uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
77-
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
43+
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
44+
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
7845
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
7946
null : new BuiltinRule('"null" space', []),
8047
};
8148

8249
// TODO: support "uri", "email" string formats
8350
const STRING_FORMAT_RULES = {
84-
'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
85-
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
51+
'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
52+
'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
8653
'date-time' : new BuiltinRule('date "T" time', ['date', 'time']),
8754
'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']),
8855
'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']),

‎grammars/README.md

+8-4
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,13 @@ Parentheses `()` can be used to group sequences, which allows for embedding alte
5959

6060
## Repetition and Optional Symbols
6161

62-
- `*` after a symbol or sequence means that it can be repeated zero or more times.
63-
- `+` denotes that the symbol or sequence should appear one or more times.
64-
- `?` makes the preceding symbol or sequence optional.
62+
- `*` after a symbol or sequence means that it can be repeated zero or more times (equivalent to `{0,}`).
63+
- `+` denotes that the symbol or sequence should appear one or more times (equivalent to `{1,}`).
64+
- `?` makes the preceding symbol or sequence optional (equivalent to `{0,1}`).
65+
- `{m}` repeats the preceding symbol or sequence exactly `m` times
66+
- `{m,}` repeats the preceding symbol or sequence at least `m` times
67+
- `{m,n}` repeats the preceding symbol or sequence between `m` and `n` times (inclusive)
68+
- `{0,n}` repeats the preceding symbol or sequence at most `n` times (inclusive)
6569

6670
## Comments and newlines
6771

@@ -98,4 +102,4 @@ Grammars currently have performance gotchas (see https://github.com/ggerganov/ll
98102

99103
A common pattern is to allow repetitions of a pattern `x` up to N times.
100104

101-
While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) will result in extremely slow inference. Instead, you can write `(x (x (x ... (x)?...)?)?)?` (w/ N-deep nesting)
105+
While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) may result in extremely slow sampling. Instead, you can write `x{0,N}` (or `(x (x (x ... (x)?...)?)?)?` w/ N-deep nesting in earlier llama.cpp versions).

‎tests/test-grammar-integration.cpp

+76
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,82 @@ static void test_quantifiers() {
292292
"catyyy",
293293
}
294294
);
295+
test_grammar(
296+
"simple exact repetition",
297+
// Grammar
298+
R"""(
299+
root ::= [ab]{4}
300+
)""",
301+
// Passing strings
302+
{
303+
"aaaa",
304+
"bbbb",
305+
"abab",
306+
},
307+
// Failing strings
308+
{
309+
"a",
310+
"b",
311+
"aaaaa",
312+
}
313+
);
314+
test_grammar(
315+
"simple min repetition",
316+
// Grammar
317+
R"""(
318+
root ::= [ab]{4,}
319+
)""",
320+
// Passing strings
321+
{
322+
"aaaa",
323+
"aaaaab",
324+
"bbbb",
325+
"ababab",
326+
},
327+
// Failing strings
328+
{
329+
"",
330+
"aba",
331+
}
332+
);
333+
test_grammar(
334+
"simple max repetition",
335+
// Grammar
336+
R"""(
337+
root ::= [ab]{0,4}
338+
)""",
339+
// Passing strings
340+
{
341+
"",
342+
"a",
343+
"aa",
344+
"aaa",
345+
"aaab",
346+
},
347+
// Failing strings
348+
{
349+
"aaaaa",
350+
}
351+
);
352+
test_grammar(
353+
"min / max repetition",
354+
// Grammar
355+
R"""(
356+
root ::= ("0x" [A-F0-9]{2} " "?){3,5}
357+
)""",
358+
// Passing strings
359+
{
360+
"0xFF 0x12 0xAB",
361+
"0xFF 0x12 0xAB 0x00 0x00",
362+
},
363+
// Failing strings
364+
{
365+
"",
366+
"0xFF",
367+
"0xFF 0x12",
368+
"0xFF 0x12 0xAB 0x00 0x00 0x00",
369+
}
370+
);
295371
}
296372

297373
static void test_failure_missing_root() {

‎tests/test-grammar-parser.cpp

+423-158
Large diffs are not rendered by default.

‎tests/test-json-schema-to-grammar.cpp

+56-56
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.