6
6
import sys
7
7
from typing import Any , Dict , List , Set , Tuple , Union
8
8
9
+ def _build_repetition (content , up_to_n ):
10
+ # return ' '.join([content] * n)
11
+ if up_to_n == 0 :
12
+ return ''
13
+ return f'({ content } { " " + _build_repetition (content , up_to_n - 1 ) if up_to_n > 1 else "" } )?'
14
+
15
class BuiltinRule:
    """A built-in grammar rule plus the names of the rules its body refers to."""

    def __init__(self, content: str, deps: Union[List[str], None] = None):
        """Store the rule body and its dependency names.

        Args:
            content: Text of the rule body.
            deps: Names of other rules referenced by ``content``. ``None`` or
                an empty list is stored as a fresh empty list.
        """
        self.content = content
        self.deps = deps or []

    def __str__(self):
        """Always raise: a rule object must not be rendered as a string.

        Presumably this guards against interpolating the object where its
        ``content`` text was intended -- confirm against callers.

        Raises:
            AssertionError: On every call.
        """
        # Raised explicitly instead of via `assert`: the original asserted on
        # an undefined name (NameError), and assert statements are stripped
        # entirely when Python runs with -O.
        raise AssertionError('BuiltinRule must not be converted to str; use its content attribute')
22
+
23
+ _up_to_15_digits = _build_repetition ('[0-9]' , 15 )
24
+
9
25
# Whitespace is constrained to at most a single space char to prevent the model from "running away" in
10
26
# whitespace. This may also improve generation quality.
11
27
SPACE_RULE = '" "?'
12
-
28
+
13
29
PRIMITIVE_RULES = {
14
- 'boolean' : '("true" | "false") space' ,
15
- 'decimal-part' : '[0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] [0-9]?)?)?)?)?)?)?)?)?)?' ,
16
- 'integral-part' : '[0-9] | [1-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] ([0-9] [0-9]?)?)?)?)?)?)?)?)?)?' ,
17
-
18
- # 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
19
- # 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
20
- 'number' : '("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space' ,
21
- 'integer' : '("-"? integral-part) space' ,
22
- 'value' : 'object | array | string | number | boolean' ,
23
- 'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space' ,
24
- 'array' : '"[" space ( value ("," space value)* )? "]" space' ,
25
- 'uuid' : '"\\ "" ' + ' "-" ' .join ('[0-9a-fA-F]' * n for n in [8 , 4 , 4 , 4 , 12 ]) + ' "\\ "" space' ,
26
- 'string' : r''' "\"" (
30
+ 'boolean' : BuiltinRule ('("true" | "false") space' , []),
31
+ 'decimal-part' : BuiltinRule ('[0-9] ' + _up_to_15_digits , []),
32
+ 'integral-part' : BuiltinRule ('[0-9] | [1-9] ' + _up_to_15_digits , []),
33
+ 'number' : BuiltinRule ('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space' , ['integral-part' , 'decimal-part' ]),
34
+ 'integer' : BuiltinRule ('("-"? integral-part) space' , ['integral-part' ]),
35
+ 'value' : BuiltinRule ('object | array | string | number | boolean | null' , ['object' , 'array' , 'string' , 'number' , 'boolean' , 'null' ]),
36
+ 'object' : BuiltinRule ('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space' , ['string' , 'value' ]),
37
+ 'array' : BuiltinRule ('"[" space ( value ("," space value)* )? "]" space' , ['value' ]),
38
+ 'uuid' : BuiltinRule ('"\\ "" ' + ' "-" ' .join ('[0-9a-fA-F]' * n for n in [8 , 4 , 4 , 4 , 12 ]) + ' "\\ "" space' , []),
39
+ 'string' : BuiltinRule (r''' "\"" (
27
40
[^"\\] |
28
41
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
29
- )* "\"" space''' ,
30
- 'null' : '"null" space' ,
42
+ )* "\"" space''' , []),
43
+ 'null' : BuiltinRule ( '"null" space' , []) ,
31
44
}
32
- OBJECT_RULE_NAMES = ['object' , 'array' , 'string' , 'integral-part' , 'decimal-part' , 'number' , 'boolean' , 'null' , 'value' ]
33
45
34
46
# TODO: support "uri", "email" string formats
35
- DATE_RULES = {
36
- 'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \" 0\" [1-9] | [1-2] [0-9] | "3" [0-1] )' ,
37
- 'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )' ,
38
- 'date-time' : 'date "T" time' ,
39
- 'date-string' : '"\\ "" date "\\ "" space' ,
40
- 'time-string' : '"\\ "" time "\\ "" space' ,
41
- 'date-time-string' : '"\\ "" date-time "\\ "" space' ,
47
+ STRING_FORMAT_RULES = {
48
+ 'date' : BuiltinRule ( '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \" 0\" [1-9] | [1-2] [0-9] | "3" [0-1] )' , []) ,
49
+ 'time' : BuiltinRule ( '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )' , []) ,
50
+ 'date-time' : BuiltinRule ( 'date "T" time' , [ 'date' , 'time' ]) ,
51
+ 'date-string' : BuiltinRule ( '"\\ "" date "\\ "" space' , [ 'date' ]) ,
52
+ 'time-string' : BuiltinRule ( '"\\ "" time "\\ "" space' , [ 'time' ]) ,
53
+ 'date-time-string' : BuiltinRule ( '"\\ "" date-time "\\ "" space' , [ 'date-time' ]) ,
42
54
}
43
55
44
56
# Character class spanning the full code point range U+00000000 through U+0010FFFF.
DOTALL = '[\\ U00000000-\\ U0010FFFF]'
45
57
# Same range as DOTALL but with \x0A and \x0D (line feed, carriage return) left out.
DOT = '[\\ U00000000-\\ x09\\ x0B\\ x0C\\ x0E-\\ U0010FFFF]'
46
58
47
- RESERVED_NAMES = set (["root" , * PRIMITIVE_RULES .keys (), * DATE_RULES .keys ()])
59
+ RESERVED_NAMES = set (["root" , * PRIMITIVE_RULES .keys (), * STRING_FORMAT_RULES .keys ()])
48
60
49
61
INVALID_RULE_CHARS_RE = re .compile (r'[^a-zA-Z0-9-]+' )
50
62
GRAMMAR_LITERAL_ESCAPE_RE = re .compile (r'[\r\n"]' )
54
66
NON_LITERAL_SET = set ('|.()[]{}*+?' )
55
67
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set ('[]()|{}*+?' )
56
68
57
- DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
58
- TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\ .[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits
59
69
60
70
class SchemaConverter :
61
71
def __init__ (self , * , prop_order , allow_fetch , dotall , raw_pattern ):
@@ -65,8 +75,6 @@ def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
65
75
self ._raw_pattern = raw_pattern
66
76
self ._rules = {
67
77
'space' : SPACE_RULE ,
68
- 'integral-part' : PRIMITIVE_RULES ['integral-part' ],
69
- 'decimal-part' : PRIMITIVE_RULES ['decimal-part' ],
70
78
}
71
79
self ._refs = {}
72
80
self ._refs_being_resolved = set ()
@@ -420,7 +428,9 @@ def add_component(comp_schema, is_required):
420
428
successive_items = list_item_operator * (min_items - 1 )
421
429
min_items -= 1
422
430
if max_items is not None and max_items > min_items :
423
- successive_items += (list_item_operator + "?" ) * (max_items - min_items - 1 )
431
+ # TODO: avoid grammar branch explosion here
432
+ successive_items += _build_repetition (list_item_operator , max_items - min_items - 1 )
433
+ # successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
424
434
else :
425
435
successive_items += list_item_operator + "*"
426
436
if min_items == 0 :
@@ -433,28 +443,39 @@ def add_component(comp_schema, is_required):
433
443
return self ._visit_pattern (schema ['pattern' ], rule_name )
434
444
435
445
elif schema_type in (None , 'string' ) and re .match (r'^uuid[1-5]?$' , schema_format or '' ):
436
- return self ._add_rule (
446
+ return self ._add_primitive (
437
447
'root' if rule_name == 'root' else schema_format ,
438
448
PRIMITIVE_RULES ['uuid' ]
439
449
)
440
450
441
- elif schema_type in (None , 'string' ) and schema_format in DATE_RULES :
442
- for t , r in DATE_RULES .items ():
443
- self ._add_rule (t , r )
444
- return schema_format + '-string'
451
+ elif schema_type in (None , 'string' ) and schema_format in STRING_FORMAT_RULES :
452
+ return self ._add_rule (rule_name , self ._add_primitive (schema_format , STRING_FORMAT_RULES [schema_format ]))
445
453
446
454
elif (schema_type == 'object' ) or (len (schema ) == 0 ):
447
- for n in OBJECT_RULE_NAMES :
448
- self ._add_rule (n , PRIMITIVE_RULES [n ])
449
- return self ._add_rule (rule_name , 'object' )
455
+ return self ._add_rule (rule_name , self ._add_primitive ('object' , PRIMITIVE_RULES ['object' ]))
450
456
451
457
else :
452
458
assert schema_type in PRIMITIVE_RULES , f'Unrecognized schema: { schema } '
453
459
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
454
- return self ._add_rule (
455
- 'root' if rule_name == 'root' else schema_type ,
456
- PRIMITIVE_RULES [schema_type ]
457
- )
460
+ return self ._add_primitive ('root' if rule_name == 'root' else schema_type , PRIMITIVE_RULES [schema_type ])
461
+
462
def _add_primitive(self, name: str, rule: BuiltinRule):
    """Register a built-in rule under ``name`` along with every rule it depends on.

    The rule body is registered first through ``self._add_rule``. Each name in
    ``rule.deps`` is then looked up in ``PRIMITIVE_RULES`` (falling back to
    ``STRING_FORMAT_RULES``) and registered recursively unless ``self._rules``
    already has an entry for it.

    Args:
        name: Name to register the rule under.
        rule: The built-in rule to register.

    Returns:
        Whatever ``self._add_rule`` returned for ``name``.

    Raises:
        AssertionError: If ``rule`` is not a ``BuiltinRule``, its content is
            not a string, or a dependency name is unknown.
    """
    assert isinstance(rule, BuiltinRule), f'rule: {rule} '
    assert isinstance(rule.content, str), f'{name} : {rule.content} '
    rule_key = self._add_rule(name, rule.content)

    for dep_name in rule.deps:
        builtin = PRIMITIVE_RULES.get(dep_name) or STRING_FORMAT_RULES.get(dep_name)
        assert builtin, f'Rule {dep_name} not known'
        # Already-registered dependencies are skipped; since this rule itself
        # was registered above, mutually dependent rules do not recurse forever.
        if dep_name in self._rules:
            continue
        self._add_primitive(dep_name, builtin)

    return rule_key
473
+
474
def _build_number_rule(self):
    """Register the ``decimal-part``, ``integral-part`` and ``number`` rules.

    The two digit rules are registered first (decimal, then integral), and the
    ``number`` rule body is assembled from the names ``self._add_rule``
    returned for them.

    Returns:
        Whatever ``self._add_rule`` returned for the ``number`` rule.
    """
    # Optional run of up to 15 further digits following the leading digit.
    digits_tail = _build_repetition('[0-9]', 15)

    decimal_key = self._add_rule('decimal-part', f'[0-9] {digits_tail} ')
    integral_key = self._add_rule('integral-part', f'[0-9] | [1-9] {digits_tail} ')

    number_body = f'("-"? {integral_key} ) ("." {decimal_key} )? ([eE] [-+]? {integral_key} )? space'
    return self._add_rule('number', number_body)
458
479
459
480
def _build_object_rule (self , properties : List [Tuple [str , Any ]], required : Set [str ], name : str , additional_properties : Union [bool , Any ]):
460
481
prop_order = self ._prop_order
@@ -476,7 +497,7 @@ def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[st
476
497
value_rule = self .visit ({} if additional_properties == True else additional_properties , f'{ sub_name } -value' )
477
498
prop_kv_rule_names ["*" ] = self ._add_rule (
478
499
f'{ sub_name } -kv' ,
479
- self ._add_rule ('string' , PRIMITIVE_RULES ['string' ]) + f' ":" space { value_rule } '
500
+ self ._add_primitive ('string' , PRIMITIVE_RULES ['string' ]) + f' ":" space { value_rule } '
480
501
)
481
502
optional_props .append ("*" )
482
503
0 commit comments