Skip to content

Commit 375f85d

Browse files
committed
json: improved repetitions & builtin rule deps
1 parent 8451cdb commit 375f85d

File tree

2 files changed

+119
-71
lines changed

2 files changed

+119
-71
lines changed

examples/json_schema_to_grammar.py

+64-43
Original file line numberDiff line numberDiff line change
@@ -6,45 +6,57 @@
66
import sys
77
from typing import Any, Dict, List, Set, Tuple, Union
88

9+
def _build_repetition(content, up_to_n):
10+
# return ' '.join([content] * n)
11+
if up_to_n == 0:
12+
return ''
13+
return f'({content}{" " + _build_repetition(content, up_to_n-1) if up_to_n > 1 else ""})?'
14+
15+
class BuiltinRule:
    """A predefined grammar rule together with the names of rules it references.

    Attributes are plain instance fields:
      content -- the rule body as grammar text.
      deps    -- names of other builtin rules that ``content`` refers to.
    """

    def __init__(self, content: str, deps: Union[list[str], None] = None):
        self.content = content
        # Fall back to a fresh list so instances never share a default.
        self.deps = deps or []

    def __repr__(self):
        return f'{type(self).__name__}({self.content!r}, {self.deps!r})'

    def __str__(self):
        # Converting a rule to text implicitly is treated as a programming
        # error; callers are expected to read ``.content`` explicitly.
        # Raised directly (instead of ``assert``) so it also fires under -O.
        raise AssertionError('BuiltinRule must not be converted to str; use .content')
22+
23+
_up_to_15_digits = _build_repetition('[0-9]', 15)
24+
925
# whitespace is constrained to a single space char to prevent model "running away" in
1026
# whitespace. Also maybe improves generation quality?
1127
SPACE_RULE = '" "?'
12-
28+
1329
PRIMITIVE_RULES = {
14-
'boolean': '("true" | "false") space',
15-
'decimal-part': '[0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] [0-9]?)?)?)?)?)?)?)?)?)?',
16-
'integral-part': '[0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] [0-9]?)?)?)?)?)?)?)?)?)?',
17-
18-
# 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
19-
# 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
20-
'number': '("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space',
21-
'integer': '("-"? integral-part) space',
22-
'value' : 'object | array | string | number | boolean',
23-
'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
24-
'array' : '"[" space ( value ("," space value)* )? "]" space',
25-
'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
26-
'string': r''' "\"" (
30+
'boolean': BuiltinRule('("true" | "false") space', []),
31+
'decimal-part': BuiltinRule('[0-9] ' + _up_to_15_digits, []),
32+
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
33+
'number': BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
34+
'integer': BuiltinRule('("-"? integral-part) space', ['integral-part']),
35+
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
36+
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
37+
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
38+
'uuid' : BuiltinRule('"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space', []),
39+
'string': BuiltinRule(r''' "\"" (
2740
[^"\\] |
2841
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
29-
)* "\"" space''',
30-
'null': '"null" space',
42+
)* "\"" space''', []),
43+
'null': BuiltinRule('"null" space', []),
3144
}
32-
OBJECT_RULE_NAMES = ['object', 'array', 'string', 'integral-part', 'decimal-part', 'number', 'boolean', 'null', 'value']
3345

3446
# TODO: support "uri", "email" string formats
35-
DATE_RULES = {
36-
'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
37-
'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
38-
'date-time': 'date "T" time',
39-
'date-string': '"\\"" date "\\"" space',
40-
'time-string': '"\\"" time "\\"" space',
41-
'date-time-string': '"\\"" date-time "\\"" space',
47+
STRING_FORMAT_RULES = {
48+
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
49+
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
50+
'date-time': BuiltinRule('date "T" time', ['date', 'time']),
51+
'date-string': BuiltinRule('"\\"" date "\\"" space', ['date']),
52+
'time-string': BuiltinRule('"\\"" time "\\"" space', ['time']),
53+
'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
4254
}
4355

4456
DOTALL = '[\\U00000000-\\U0010FFFF]'
4557
DOT = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
4658

47-
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
59+
RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
4860

4961
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
5062
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -54,8 +66,6 @@
5466
NON_LITERAL_SET = set('|.()[]{}*+?')
5567
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
5668

57-
DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
58-
TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
5969

6070
class SchemaConverter:
6171
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -65,8 +75,6 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
6575
self._raw_pattern = raw_pattern
6676
self._rules = {
6777
'space': SPACE_RULE,
68-
'integral-part': PRIMITIVE_RULES['integral-part'],
69-
'decimal-part': PRIMITIVE_RULES['decimal-part'],
7078
}
7179
self._refs = {}
7280
self._refs_being_resolved = set()
@@ -420,7 +428,9 @@ def add_component(comp_schema, is_required):
420428
successive_items = list_item_operator * (min_items - 1)
421429
min_items -= 1
422430
if max_items is not None and max_items > min_items:
423-
successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
431+
# TODO: avoid grammar branch explosion here
432+
successive_items += _build_repetition(list_item_operator, max_items - min_items - 1)
433+
# successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
424434
else:
425435
successive_items += list_item_operator + "*"
426436
if min_items == 0:
@@ -433,28 +443,39 @@ def add_component(comp_schema, is_required):
433443
return self._visit_pattern(schema['pattern'], rule_name)
434444

435445
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
436-
return self._add_rule(
446+
return self._add_primitive(
437447
'root' if rule_name == 'root' else schema_format,
438448
PRIMITIVE_RULES['uuid']
439449
)
440450

441-
elif schema_type in (None, 'string') and schema_format in DATE_RULES:
442-
for t, r in DATE_RULES.items():
443-
self._add_rule(t, r)
444-
return schema_format + '-string'
451+
elif schema_type in (None, 'string') and schema_format in STRING_FORMAT_RULES:
452+
return self._add_rule(rule_name, self._add_primitive(schema_format, STRING_FORMAT_RULES[schema_format]))
445453

446454
elif (schema_type == 'object') or (len(schema) == 0):
447-
for n in OBJECT_RULE_NAMES:
448-
self._add_rule(n, PRIMITIVE_RULES[n])
449-
return self._add_rule(rule_name, 'object')
455+
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
450456

451457
else:
452458
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
453459
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
454-
return self._add_rule(
455-
'root' if rule_name == 'root' else schema_type,
456-
PRIMITIVE_RULES[schema_type]
457-
)
460+
return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
461+
462+
def _add_primitive(self, name: str, rule: BuiltinRule):
    """Register a builtin rule under ``name`` and pull in its dependencies.

    The rule's content is added through ``self._add_rule`` first; every
    dependency that is not yet a key of ``self._rules`` is then registered
    recursively. Dependencies are looked up in ``PRIMITIVE_RULES`` and then
    ``STRING_FORMAT_RULES``.

    Returns whatever ``self._add_rule`` returned for ``name``
    (presumably the registered rule name -- confirm against ``_add_rule``).
    """
    assert isinstance(rule, BuiltinRule), f'rule: {rule}'
    assert isinstance(rule.content, str), f'{name}: {rule.content}'
    # Register the rule itself before walking its deps, so that a dependency
    # cycle finds the earlier entry in self._rules instead of recursing again.
    registered = self._add_rule(name, rule.content)

    for dep_name in rule.deps:
        builtin = PRIMITIVE_RULES.get(dep_name) or STRING_FORMAT_RULES.get(dep_name)
        assert builtin, f'Rule {dep_name} not known'
        if dep_name in self._rules:
            continue
        self._add_primitive(dep_name, builtin)
    return registered
473+
474+
def _build_number_rule(self):
    """Register the 'decimal-part', 'integral-part' and 'number' rules.

    Each part is one mandatory digit followed by up to 15 optional digits
    (the integral part alternatively starts with a non-zero digit). The
    value returned is the result of the final ``self._add_rule`` call for
    'number'.
    """
    optional_digits = _build_repetition('[0-9]', 15)
    decimal_rule = self._add_rule('decimal-part', '[0-9] ' + optional_digits)
    integral_rule = self._add_rule('integral-part', '[0-9] | [1-9] ' + optional_digits)
    # sign + integral part, optional fraction, optional exponent, trailing space
    number_body = ' '.join([
        f'("-"? {integral_rule})',
        f'("." {decimal_rule})?',
        f'([eE] [-+]? {integral_rule})?',
        'space',
    ])
    return self._add_rule('number', number_body)
458479

459480
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
460481
prop_order = self._prop_order
@@ -476,7 +497,7 @@ def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[st
476497
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
477498
prop_kv_rule_names["*"] = self._add_rule(
478499
f'{sub_name}-kv',
479-
self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
500+
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
480501
)
481502
optional_props.append("*")
482503

0 commit comments

Comments
 (0)