Skip to content

Commit 8183159

Browse files
authoredAug 3, 2023
examples : generate JSON according to schema (#1887)
* examples : add JSON schema grammars
* complete JSON grammar
* ensure primitive types can be used as root of schema
* support integer type and adjust usage text
1 parent 468ea24 commit 8183159

File tree

2 files changed

+138
-10
lines changed

2 files changed

+138
-10
lines changed
 

‎examples/json-schema-to-grammar.py

+132
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import argparse
2+
import json
3+
import re
4+
import sys
5+
6+
# Whitespace is constrained to at most a single space character to prevent the
# model from "running away" in whitespace. It may also improve generation
# quality.
SPACE_RULE = '" "?'

# GBNF rule bodies for the JSON schema primitive types, keyed by the schema's
# `type` value. Every rule ends with `space` so optional trailing whitespace is
# consumed right after the value.
# NOTE: the continuation lines of the 'string' rule are part of the raw string
# literal itself, so their leading whitespace is emitted into the grammar.
PRIMITIVE_RULES = {
    'boolean': '("true" | "false") space',
    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
    'string': r''' "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space ''',
    'null': '"null" space',
}

# Any run of characters that is not valid in a grammar rule name; such runs are
# replaced with "-" when rule names are derived from schema property names.
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')

# Characters that must be escaped inside a double-quoted grammar literal, and
# their replacements. The backslash is included because the literals are built
# from JSON-encoded text, which uses backslash escapes (e.g. `\"`, `\\`,
# `\u00e9`); left unescaped, a JSON `\"` would become `\\"` in the grammar,
# i.e. an escaped backslash followed by a quote that ends the literal early.
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '\\': '\\\\'}
24+
25+
26+
class SchemaConverter:
    """Converts a JSON schema into a set of named GBNF grammar rules.

    Call `visit(schema, '')` on the top-level schema, then `format_grammar()`
    to obtain the grammar text. Only a subset of JSON schema is handled:
    `oneOf`/`anyOf`, `const`, `enum`, objects with `properties`, arrays with
    `items`, and the primitive types listed in `PRIMITIVE_RULES`.
    """

    def __init__(self, prop_order):
        """Create a converter.

        prop_order: mapping of property name -> sort position, used to order
            object properties in the generated rules.
        """
        self._prop_order = prop_order
        # rule name -> rule body; `space` is referenced by every other rule
        self._rules = {'space': SPACE_RULE}

    def _format_literal(self, literal):
        """Return a quoted grammar literal matching the JSON encoding of `literal`.

        The value is JSON-encoded, every character matched by
        GRAMMAR_LITERAL_ESCAPE_RE is replaced using GRAMMAR_LITERAL_ESCAPES,
        and the result is wrapped in double quotes.
        """
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
        )
        return f'"{escaped}"'

    def _add_rule(self, name, rule):
        """Register `rule` under a name derived from `name`; return the name used.

        Characters not allowed in rule names are replaced with "-". If the
        resulting name is already taken by a different rule, the first free
        numeric suffix (0, 1, 2, ...) is appended instead of overwriting.
        """
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
            key = esc_name
        else:
            i = 0
            while f'{esc_name}{i}' in self._rules:
                i += 1
            key = f'{esc_name}{i}'
        self._rules[key] = rule
        return key

    def visit(self, schema, name):
        """Add the rule(s) for `schema` and return the name of its rule.

        schema: a JSON schema (dict).
        name: rule-name prefix for this schema; empty for the top-level schema,
            which is registered as `root`.

        Raises AssertionError if the schema is not one of the supported forms.
        """
        schema_type = schema.get('type')
        is_root = not name
        if is_root:
            rule_name = 'root'
        elif name == 'root':
            # `root` is reserved for the top-level schema. Child schemas are
            # visited (and registered) before their parent, so a top-level
            # property literally named "root" would otherwise claim the `root`
            # rule and become the grammar's entry point.
            rule_name = 'root-'
        else:
            rule_name = name

        if 'oneOf' in schema or 'anyOf' in schema:
            rule = ' | '.join((
                self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
                for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
            ))
            return self._add_rule(rule_name, rule)

        elif 'const' in schema:
            return self._add_rule(rule_name, self._format_literal(schema['const']))

        elif 'enum' in schema:
            rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
            return self._add_rule(rule_name, rule)

        elif schema_type == 'object' and 'properties' in schema:
            # TODO: `required` keyword
            prop_order = self._prop_order
            prop_pairs = sorted(
                schema['properties'].items(),
                # sort by position in prop_order (if specified) then by key
                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
            )

            rule = '"{" space'
            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
                prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
                if i > 0:
                    rule += ' "," space'
                rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
            rule += ' "}" space'

            return self._add_rule(rule_name, rule)

        elif schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
            return self._add_rule(rule_name, rule)

        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # Primitive rules are shared under the type's own name, except when
            # the primitive is the top-level schema and must be named `root`.
            return self._add_rule(
                'root' if is_root else schema_type,
                PRIMITIVE_RULES[schema_type]
            )

    def format_grammar(self):
        """Return all registered rules as `name ::= rule` lines, in insertion order."""
        return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
101+
102+
103+
def main(args_in = None):
    """Command-line entry point: read a JSON schema and print a grammar for it.

    args_in: optional list of argument strings; when None, argparse reads the
        process command line.
    """
    parser = argparse.ArgumentParser(
        description='''
            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
            given JSON schema. Only a subset of JSON schema features are supported; more may be
            added in the future.
        ''',
    )
    parser.add_argument(
        '--prop-order',
        default=[],
        type=lambda s: s.split(','),
        help='''
            comma-separated property names defining the order of precedence for object properties;
            properties not specified here are given lower precedence than those that are, and are
            sorted alphabetically
        '''
    )
    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)

    if args.schema == '-':
        schema = json.load(sys.stdin)
    else:
        # Close the file deterministically, and decode as UTF-8 (the standard
        # JSON encoding) rather than whatever the platform locale happens to be.
        with open(args.schema, encoding='utf-8') as schema_file:
            schema = json.load(schema_file)

    # property name -> position on the command line, used as the sort key
    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
    converter = SchemaConverter(prop_order)
    converter.visit(schema, '')
    print(converter.format_grammar())


if __name__ == '__main__':
    main()

‎grammars/json.gbnf

+6-10
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,25 @@
1-
# Grammar for subset of JSON - doesn't support full string or number syntax
2-
3-
root ::= object
4-
value ::= object | array | string | number | boolean | "null"
1+
root ::= object
2+
value ::= object | array | string | number | ("true" | "false" | "null") ws
53

64
object ::=
75
"{" ws (
86
string ":" ws value
97
("," ws string ":" ws value)*
10-
)? "}"
8+
)? "}" ws
119

1210
array ::=
1311
"[" ws (
1412
value
1513
("," ws value)*
16-
)? "]"
14+
)? "]" ws
1715

18-
string ::=
16+
string ::=
1917
"\"" (
2018
[^"\\] |
2119
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
2220
)* "\"" ws
2321

24-
# Only plain integers currently
25-
number ::= "-"? [0-9]+ ws
26-
boolean ::= ("true" | "false") ws
22+
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
2723

2824
# Optional space: by convention, applied in this grammar after literal chars when allowed
2925
ws ::= ([ \t\n] ws)?

0 commit comments

Comments
 (0)
Please sign in to comment.