Skip to content

Commit 55fa729

Browse files
b-c-ds authored and bcaller committed
Testing, fix C# with pragmas, rearrange
1 parent e0c28e9 commit 55fa729

13 files changed

+130
-20
lines changed

MANIFEST.in

-1
Original file line number | Diff line number | Diff line change
@@ -2,4 +2,3 @@ include README.md
22
include LICENSE
33
include regexploit/bin/javascript/*.js
44
include regexploit/bin/javascript/*.json
5-
include regexploit/bin/javascript/.eslintrc.yml

regexploit/bin/regexploit.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
import traceback
88

99
from regexploit.ast.sre import SreOpParser
10-
from regexploit.javascript import fix_js_regex
10+
from regexploit.languages.javascript import fix_js_regex
1111
from regexploit.output.text import TextOutput
1212
from regexploit.redos import find
1313

regexploit/bin/regexploit_csharp.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -7,8 +7,8 @@
77

88
from regexploit.ast.sre import SreOpParser
99
from regexploit.bin.files import file_generator
10-
from regexploit.csharp_string_extractor import find_regexes
11-
from regexploit.javascript import fix_js_regex
10+
from regexploit.languages.csharp_string_extractor import find_regexes
11+
from regexploit.languages.javascript import fix_js_regex
1212
from regexploit.output.text import TextOutput
1313
from regexploit.redos import find
1414

@@ -36,7 +36,7 @@ def handle_file(filename: str, output: TextOutput):
3636
)
3737
continue
3838
try:
39-
parsed = SreOpParser().parse_sre(fixed)
39+
parsed = SreOpParser().parse_sre(fixed, regex.flags)
4040
except:
4141
print(f"Error in regexploit parsing: {pattern} from {filename}")
4242
print(traceback.format_exc())

regexploit/bin/regexploit_js.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
from regexploit.ast.sre import SreOpParser
1414
from regexploit.bin.files import file_generator
15-
from regexploit.javascript import fix_js_regex
15+
from regexploit.languages.javascript import fix_js_regex
1616
from regexploit.output.text import TextOutput
1717
from regexploit.redos import find
1818

regexploit/bin/regexploit_python_ast.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,8 @@
88

99
from regexploit.ast.sre import SreOpParser
1010
from regexploit.bin.files import file_generator
11+
from regexploit.languages.python_node_visitor import PythonNodeVisitor
1112
from regexploit.output.text import TextOutput
12-
from regexploit.python_node_visitor import PythonNodeVisitor
1313
from regexploit.redos import find
1414

1515

regexploit/languages/__init__.py

Whitespace-only changes.

regexploit/csharp_string_extractor.py → regexploit/languages/csharp_string_extractor.py

+27-11
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import logging
12
import re
23

34
from regexploit.found_regex import FoundRegex
@@ -10,8 +11,9 @@ def make_token_regex(*token_specification):
1011
TOKENS_BASE = make_token_regex(
1112
("LINE_COMMENT", r"//"),
1213
("MULTILINE_COMMENT", r"/\*"),
14+
("INDENT_OR_PREPROCESSOR", r"(?:^|\n)\s*#?"),
1315
("SEMI_COLON", r";"),
14-
("DOUBLE_QUOTE_CHAR_LITERAL", "'\"'"),
16+
("DOUBLE_QUOTE_CHAR_LITERAL", r"'\\?\"'"),
1517
("NEW_REGEX", r"new\s+[\w.]*?Regex\("),
1618
("BEGIN_VERBATIM_STRING", r'(\$@|@\$?)"'),
1719
("BEGIN_STRING", r'\$?"'),
@@ -31,14 +33,15 @@ def make_token_regex(*token_specification):
3133

3234

3335
def find_regexes(code):
34-
code = code.decode()
36+
code = code.decode("utf-8", "replace")
3537
cursor: int = 0
3638
mode: re.Pattern = TOKENS_BASE
3739
reached_end: bool = False
3840
inside_new_regex: bool = False
3941
buffered_regex = None
4042
interpolated: bool = False # TODO: interpolated $ strings
4143
newline_positions = make_lines(code)
44+
seen_line = 0
4245

4346
while not reached_end:
4447
for mo in mode.finditer(code, cursor):
@@ -53,6 +56,11 @@ def find_regexes(code):
5356
mode = TOKENS_LINE_COMMENT
5457
cursor = mo.end()
5558
break
59+
elif kind == "INDENT_OR_PREPROCESSOR":
60+
if value and value[-1] == "#": # Preprocessor
61+
mode = TOKENS_LINE_COMMENT
62+
cursor = mo.end()
63+
break
5664
elif kind == "MULTILINE_COMMENT":
5765
mode = TOKENS_MULTILINE_COMMENT
5866
cursor = mo.end()
@@ -93,16 +101,24 @@ def find_regexes(code):
93101
elif kind in ["END_VERBATIM_STRING", "END_STRING"]:
94102
string = code[cursor : mo.start()]
95103
if kind == "END_STRING":
96-
string = string.encode().decode(
97-
"unicode_escape"
98-
) # not verbatim, might error?
99-
line = line_of(cursor, newline_positions)
104+
try:
105+
string = string.encode().decode("unicode_escape")
106+
except UnicodeDecodeError:
107+
logging.warning(f"Unable to process: {string}")
108+
string = string.encode().decode("utf-8", "replace")
109+
else:
110+
string = string.replace('""', '"')
111+
line = line_of(cursor, newline_positions, seen_line)
112+
seen_line = line - 1
100113
cursor = mo.end()
101114
if inside_new_regex:
102115
buffered_regex = (cursor, line, string)
103116
mode = TOKENS_END_NEW_REGEX
104117
else:
105-
yield FoundRegex(line, string, 0, False)
118+
flags = (
119+
re.X if kind == "END_VERBATIM_STRING" and "\n" in string else 0
120+
)
121+
yield FoundRegex(line, string, flags, False)
106122
mode = TOKENS_BASE
107123
break
108124
else:
@@ -113,10 +129,10 @@ def make_lines(code):
113129
return [m.start() for m in re.finditer("\n", code)]
114130

115131

116-
def line_of(character_index: int, newline_positions):
132+
def line_of(character_index: int, newline_positions, seen_line: int):
117133
if not newline_positions:
118134
return 1
119-
for line_index, newline_position in enumerate(newline_positions):
135+
for line_index, newline_position in enumerate(newline_positions[seen_line:]):
120136
if character_index < newline_position:
121-
return line_index + 1
122-
return line_index
137+
return line_index + seen_line + 1
138+
return line_index + seen_line

regexploit/javascript.py → regexploit/languages/javascript.py

+1
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33

44
# There's quite a lot wrong here, but it'll do for now.
5+
# Wow, looking back on this, this is still horrific.
56
CARAT_FIX = re.compile(r"(?<!\\)\[\^\]")
67
NAMED_GROUP_FIX = re.compile(r"(?<!\\)\(\?<(\w+)>")
78
HYPHEN_FIX_1 = re.compile(r"(?<!\\)(\[[^\]]*(?<!\\)\\[wsdWSD])-")
File renamed without changes.

tests/test.cs

+32
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,32 @@
1+
using System;
2+
using System.Text.RegularExpressions;
3+
4+
public class Example
5+
{
6+
#line 1 "C:\Users\test"
7+
public static void Main()
8+
{
9+
/****"@"
10+
; " @ '"\
11+
*/
12+
string input = "Not a regex*****";
13+
string regex = "\\w+_[\\w\"]+_\\w+w";
14+
/**/
15+
string pattern = @"x""\d+.\d+.\d+!";
16+
char c = '"';
17+
char d = '\"';
18+
Regex r = new Regex(@"\b(?<word>\w+)\s+x\b", RegexOptions.IgnoreCase);
19+
Regex r = new Regex(
20+
"\\b(?<word>\\w+)\\s+\\b",
21+
// What?
22+
/**/
23+
RegexOptions.IgnoreCase
24+
);
25+
Something(@"
26+
(a # An a
27+
* # starred
28+
) # bracket
29+
* # starred again
30+
x", x);
31+
}
32+
}

tests/test_csharp.py

+27
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,27 @@
1+
import re
2+
from unittest.mock import Mock
3+
4+
from regexploit.bin.regexploit_csharp import handle_file
5+
from regexploit.languages.csharp_string_extractor import find_regexes
6+
7+
8+
def test_csharp():
9+
with open("tests/test.cs", "rb") as f:
10+
code = f.read()
11+
found = list(find_regexes(code))
12+
assert len(found) == 6
13+
assert found[0].pattern == "Not a regex*****"
14+
assert found[1].pattern == '\\w+_[\\w"]+_\\w+w'
15+
assert found[2].pattern == r'x"\d+.\d+.\d+!'
16+
assert found[2].lineno == 15
17+
assert not found[2].definitely_regex
18+
assert found[3].definitely_regex
19+
assert found[4].flags == re.I
20+
assert found[5].flags == re.X
21+
22+
23+
def test_handle_file():
24+
output = Mock(spec=["next", "record"])
25+
handle_file("tests/test.cs", output)
26+
assert output.next.call_count == 5
27+
assert output.record.call_count == 3

tests/test_javascript.py

+28-1
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,11 @@
11
import re
2+
from json import dumps
3+
from unittest.mock import Mock
24

35
import pytest
46

5-
from regexploit.javascript import fix_js_regex
7+
from regexploit.bin.regexploit_js import handle_line_from_node
8+
from regexploit.languages.javascript import fix_js_regex
69

710

811
@pytest.mark.parametrize(
@@ -26,3 +29,27 @@ def test_fixes(r, f):
2629
fixed = fix_js_regex(r)
2730
assert fixed == f
2831
re.compile(fixed)
32+
33+
34+
@pytest.mark.parametrize(
35+
"pat,next_called,recorded",
36+
[
37+
("ab*cdef", False, False), # too few stars
38+
("ab+c+def", True, False),
39+
("ab*b+b*c", True, True),
40+
("a[^](?<xyz>c*)*d", True, True),
41+
("a[^](?<xyz>c*)d*", True, False),
42+
],
43+
)
44+
def test_handle_line_from_node(pat, next_called, recorded):
45+
output = Mock(spec=["next", "record"])
46+
line_json = dict(pattern=pat, lineno=1, filename="testfile")
47+
handle_line_from_node(dumps(line_json), output)
48+
if next_called:
49+
output.next.assert_called_once()
50+
else:
51+
output.next.assert_not_called()
52+
if recorded:
53+
output.record.assert_called_once()
54+
else:
55+
output.record.assert_not_called()

tests/test_python_ast.py

+9-1
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,11 @@
11
import ast
22
import re
33
import textwrap
4+
from unittest.mock import Mock
45

6+
from regexploit.bin.regexploit_python_ast import handle_file
57
from regexploit.found_regex import FoundRegex
6-
from regexploit.python_node_visitor import PythonNodeVisitor
8+
from regexploit.languages.python_node_visitor import PythonNodeVisitor
79

810

911
def patterns_from_code(code: str):
@@ -27,3 +29,9 @@ def x():
2729
assert patterns[0] == FoundRegex(2, "abc+d+", 0, False)
2830
assert patterns[1] == FoundRegex(6, "aregex", re.A, True)
2931
assert patterns[2] == FoundRegex(7, "x*y*z", re.X | re.MULTILINE, True)
32+
33+
34+
def test_file():
35+
output = Mock(spec=["next"])
36+
handle_file(__file__, output)
37+
assert output.next.call_count == 2 # abc+d+, x*y*z, code string errors

0 commit comments

Comments
 (0)