Skip to content

Commit ad675e1

Browse files
authored
Added support for . (any character) token in grammar engine. (#6467)
* Added support for . (any character) token in grammar engine. * Add integration tests for any-character symbol.
1 parent a143c04 commit ad675e1

File tree

4 files changed

+52
-2
lines changed

4 files changed

+52
-2
lines changed

common/grammar-parser.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,10 @@ namespace grammar_parser {
266266
throw std::runtime_error(std::string("expecting ')' at ") + pos);
267267
}
268268
pos = parse_space(pos + 1, is_nested);
269+
} else if (*pos == '.') { // any char
270+
last_sym_start = out_elements.size();
271+
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
272+
pos = parse_space(pos + 1, is_nested);
269273
} else if (*pos == '*') {
270274
pos = parse_space(pos + 1, is_nested);
271275
handle_repetitions(0, -1);
@@ -401,6 +405,7 @@ namespace grammar_parser {
401405
case LLAMA_GRETYPE_CHAR_NOT: return true;
402406
case LLAMA_GRETYPE_CHAR_ALT: return true;
403407
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
408+
case LLAMA_GRETYPE_CHAR_ANY: return true;
404409
default: return false;
405410
}
406411
}
@@ -415,6 +420,7 @@ namespace grammar_parser {
415420
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
416421
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
417422
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
423+
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
418424
}
419425
switch (elem.type) {
420426
case LLAMA_GRETYPE_END:
@@ -426,6 +432,7 @@ namespace grammar_parser {
426432
case LLAMA_GRETYPE_CHAR_NOT:
427433
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
428434
case LLAMA_GRETYPE_CHAR_ALT:
435+
case LLAMA_GRETYPE_CHAR_ANY:
429436
fprintf(file, "(\"");
430437
print_grammar_char(file, elem.value);
431438
fprintf(file, "\") ");
@@ -483,11 +490,15 @@ namespace grammar_parser {
483490
}
484491
print_grammar_char(file, elem.value);
485492
break;
493+
case LLAMA_GRETYPE_CHAR_ANY:
494+
fprintf(file, ".");
495+
break;
486496
}
487497
if (is_char_element(elem)) {
488498
switch (rule[i + 1].type) {
489499
case LLAMA_GRETYPE_CHAR_ALT:
490500
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
501+
case LLAMA_GRETYPE_CHAR_ANY:
491502
break;
492503
default:
493504
fprintf(file, "] ");

llama.cpp

+10-2
Original file line numberDiff line numberDiff line change
@@ -13640,7 +13640,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
1364013640
const uint32_t chr) {
1364113641

1364213642
bool found = false;
13643-
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
13643+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
1364413644

1364513645
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
1364613646

@@ -13649,6 +13649,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
1364913649
// inclusive range, e.g. [a-z]
1365013650
found = found || (pos->value <= chr && chr <= pos[1].value);
1365113651
pos += 2;
13652+
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
13653+
// Any character matches "."
13654+
found = true;
13655+
pos += 1;
1365213656
} else {
1365313657
// exact char match, e.g. [a] or "a"
1365413658
found = found || pos->value == chr;
@@ -13666,7 +13670,7 @@ static bool llama_grammar_match_partial_char(
1366613670
const llama_grammar_element * pos,
1366713671
const llama_partial_utf8 partial_utf8) {
1366813672

13669-
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
13673+
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
1367013674
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
1367113675

1367213676
uint32_t partial_value = partial_utf8.value;
@@ -13696,6 +13700,9 @@ static bool llama_grammar_match_partial_char(
1369613700
return is_positive_char;
1369713701
}
1369813702
pos += 2;
13703+
} else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
13704+
// Any character matches "."
13705+
return true;
1369913706
} else {
1370013707
// exact char match, e.g. [a] or "a"
1370113708
if (low <= pos->value && pos->value <= high) {
@@ -13756,6 +13763,7 @@ static void llama_grammar_advance_stack(
1375613763
}
1375713764
case LLAMA_GRETYPE_CHAR:
1375813765
case LLAMA_GRETYPE_CHAR_NOT:
13766+
case LLAMA_GRETYPE_CHAR_ANY:
1375913767
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
1376013768
// only add the stack if it's not a duplicate of one we already have
1376113769
new_stacks.emplace_back(stack);

llama.h

+3
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,9 @@ extern "C" {
365365
// modifies a preceding LLAMA_GRETYPE_CHAR or
366366
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
367367
LLAMA_GRETYPE_CHAR_ALT = 6,
368+
369+
// any character (.)
370+
LLAMA_GRETYPE_CHAR_ANY = 7,
368371
};
369372

370373
typedef struct llama_grammar_element {

tests/test-grammar-integration.cpp

+28
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,33 @@ static void test_complex_grammar() {
205205
);
206206
}
207207

208+
// Integration test for grammar symbols with special meaning, currently the
// "." (any character) token.
//
// The grammar accepts exactly three arbitrary characters, the literal "abc",
// then exactly three more arbitrary characters. Every entry in the failing
// list must be its own comma-separated literal: adjacent string literals are
// concatenated by the compiler and would silently merge two cases into one.
static void test_special_chars() {
    test_grammar(
        "special characters",
        // Grammar
        R"""(
            root ::= ... "abc" ...
            )""",
        // Passing strings
        {
            "abcabcabc",
            "aaaabcccc",
            // NOTE: Also ensures that multi-byte characters still count as a single character
            "🔵🟠✅abc❌🟠🔵"
        },
        // Failing strings
        {
            // too few leading characters
            "aaabcccc",
            // too many leading characters
            "aaaaabcccc",
            // too few trailing characters
            "aaaabccc",
            // too many trailing characters
            "aaaabccccc",
            // four multi-byte characters on each side (one too many)
            "🔵🟠✅❌abc❌✅🟠🔵",
            // two multi-byte characters on each side (one too few)
            "🔵🟠abc🟠🔵"
        }
    );
}
234+
208235
static void test_quantifiers() {
209236
// A collection of tests to exercise * + and ? quantifiers
210237

@@ -445,6 +472,7 @@ int main() {
445472
fprintf(stdout, "Running grammar integration tests...\n");
446473
test_simple_grammar();
447474
test_complex_grammar();
475+
test_special_chars();
448476
test_quantifiers();
449477
test_failure_missing_root();
450478
test_failure_missing_reference();

0 commit comments

Comments
(0)