1
+ import logging
1
2
import re
2
3
3
4
from regexploit .found_regex import FoundRegex
@@ -10,8 +11,9 @@ def make_token_regex(*token_specification):
10
11
TOKENS_BASE = make_token_regex (
11
12
("LINE_COMMENT" , r"//" ),
12
13
("MULTILINE_COMMENT" , r"/\*" ),
14
+ ("INDENT_OR_PREPROCESSOR" , r"(?:^|\n)\s*#?" ),
13
15
("SEMI_COLON" , r";" ),
14
- ("DOUBLE_QUOTE_CHAR_LITERAL" , "' \" '" ),
16
+ ("DOUBLE_QUOTE_CHAR_LITERAL" , r"'\\? \"'" ),
15
17
("NEW_REGEX" , r"new\s+[\w.]*?Regex\(" ),
16
18
("BEGIN_VERBATIM_STRING" , r'(\$@|@\$?)"' ),
17
19
("BEGIN_STRING" , r'\$?"' ),
@@ -31,14 +33,15 @@ def make_token_regex(*token_specification):
31
33
32
34
33
35
def find_regexes (code ):
34
- code = code .decode ()
36
+ code = code .decode ("utf-8" , "replace" )
35
37
cursor : int = 0
36
38
mode : re .Pattern = TOKENS_BASE
37
39
reached_end : bool = False
38
40
inside_new_regex : bool = False
39
41
buffered_regex = None
40
42
interpolated : bool = False # TODO: interpolated $ strings
41
43
newline_positions = make_lines (code )
44
+ seen_line = 0
42
45
43
46
while not reached_end :
44
47
for mo in mode .finditer (code , cursor ):
@@ -53,6 +56,11 @@ def find_regexes(code):
53
56
mode = TOKENS_LINE_COMMENT
54
57
cursor = mo .end ()
55
58
break
59
+ elif kind == "INDENT_OR_PREPROCESSOR" :
60
+ if value and value [- 1 ] == "#" : # Preprocessor
61
+ mode = TOKENS_LINE_COMMENT
62
+ cursor = mo .end ()
63
+ break
56
64
elif kind == "MULTILINE_COMMENT" :
57
65
mode = TOKENS_MULTILINE_COMMENT
58
66
cursor = mo .end ()
@@ -93,16 +101,24 @@ def find_regexes(code):
93
101
elif kind in ["END_VERBATIM_STRING" , "END_STRING" ]:
94
102
string = code [cursor : mo .start ()]
95
103
if kind == "END_STRING" :
96
- string = string .encode ().decode (
97
- "unicode_escape"
98
- ) # not verbatim, might error?
99
- line = line_of (cursor , newline_positions )
104
+ try :
105
+ string = string .encode ().decode ("unicode_escape" )
106
+ except UnicodeDecodeError :
107
+ logging .warning (f"Unable to process: { string } " )
108
+ string = string .encode ().decode ("utf-8" , "replace" )
109
+ else :
110
+ string = string .replace ('""' , '"' )
111
+ line = line_of (cursor , newline_positions , seen_line )
112
+ seen_line = line - 1
100
113
cursor = mo .end ()
101
114
if inside_new_regex :
102
115
buffered_regex = (cursor , line , string )
103
116
mode = TOKENS_END_NEW_REGEX
104
117
else :
105
- yield FoundRegex (line , string , 0 , False )
118
+ flags = (
119
+ re .X if kind == "END_VERBATIM_STRING" and "\n " in string else 0
120
+ )
121
+ yield FoundRegex (line , string , flags , False )
106
122
mode = TOKENS_BASE
107
123
break
108
124
else :
@@ -113,10 +129,10 @@ def make_lines(code):
113
129
return [m .start () for m in re .finditer ("\n " , code )]
114
130
115
131
116
def line_of(character_index: int, newline_positions, seen_line: int = 0):
    """Return the 1-based line number containing ``character_index``.

    ``newline_positions`` is the sorted list of offsets of ``"\\n"``
    characters (as produced by ``make_lines``).  ``seen_line`` lets a
    caller that scans the text forward skip newlines it has already
    passed, turning repeated lookups into an amortised single pass;
    it defaults to 0 so existing two-argument callers keep working.
    """
    if not newline_positions:
        # No newlines at all: everything is on line 1.
        return 1
    for line_index, newline_position in enumerate(newline_positions[seen_line:]):
        if character_index < newline_position:
            # Absolute newline index is line_index + seen_line; the
            # character sits on the line that newline terminates.
            return line_index + seen_line + 1
    # character_index lies beyond the last newline, so it is on the final
    # line.  (The previous fallthrough returned an off-by-two value and
    # raised NameError when the sliced list was empty.)
    return len(newline_positions) + 1
0 commit comments