1
+ import logging
1
2
import re
2
3
3
4
from regexploit .found_regex import FoundRegex
@@ -10,8 +11,9 @@ def make_token_regex(*token_specification):
10
11
TOKENS_BASE = make_token_regex (
11
12
("LINE_COMMENT" , r"//" ),
12
13
("MULTILINE_COMMENT" , r"/\*" ),
14
+ ("INDENT_OR_PREPROCESSOR" , r"(?:^|\n)\s*#?" ),
13
15
("SEMI_COLON" , r";" ),
14
- ("DOUBLE_QUOTE_CHAR_LITERAL" , "' \" '" ),
16
+ ("DOUBLE_QUOTE_CHAR_LITERAL" , r"'\\? \"'" ),
15
17
("NEW_REGEX" , r"new\s+[\w.]*?Regex\(" ),
16
18
("BEGIN_VERBATIM_STRING" , r'(\$@|@\$?)"' ),
17
19
("BEGIN_STRING" , r'\$?"' ),
@@ -31,14 +33,15 @@ def make_token_regex(*token_specification):
31
33
32
34
33
35
def find_regexes (code ):
34
- code = code .decode ()
36
+ code = code .decode ("utf-8" , "replace" )
35
37
cursor : int = 0
36
38
mode : re .Pattern = TOKENS_BASE
37
39
reached_end : bool = False
38
40
inside_new_regex : bool = False
39
41
buffered_regex = None
40
42
interpolated : bool = False # TODO: interpolated $ strings
41
43
newline_positions = make_lines (code )
44
+ seen_line = 0
42
45
43
46
while not reached_end :
44
47
for mo in mode .finditer (code , cursor ):
@@ -53,6 +56,11 @@ def find_regexes(code):
53
56
mode = TOKENS_LINE_COMMENT
54
57
cursor = mo .end ()
55
58
break
59
+ elif kind == "INDENT_OR_PREPROCESSOR" :
60
+ if value and value [- 1 ] == "#" : # Preprocessor
61
+ mode = TOKENS_LINE_COMMENT
62
+ cursor = mo .end ()
63
+ break
56
64
elif kind == "MULTILINE_COMMENT" :
57
65
mode = TOKENS_MULTILINE_COMMENT
58
66
cursor = mo .end ()
@@ -93,16 +101,24 @@ def find_regexes(code):
93
101
elif kind in ["END_VERBATIM_STRING" , "END_STRING" ]:
94
102
string = code [cursor : mo .start ()]
95
103
if kind == "END_STRING" :
96
- string = string .encode ().decode (
97
- "unicode_escape"
98
- ) # not verbatim, might error?
99
- line = line_of (cursor , newline_positions )
104
+ try :
105
+ string = string .encode ().decode ("unicode_escape" )
106
+ except UnicodeDecodeError :
107
+ logging .warning (f"Unable to process: { string } " )
108
+ string = string .encode ().decode ("utf-8" , "replace" )
109
+ else :
110
+ string = string .replace ('""' , '"' )
111
+ line = line_of (cursor , newline_positions , seen_line )
112
+ seen_line = line - 1
100
113
cursor = mo .end ()
101
114
if inside_new_regex :
102
115
buffered_regex = (cursor , line , string )
103
116
mode = TOKENS_END_NEW_REGEX
104
117
else :
105
- yield FoundRegex (line , string , 0 , False )
118
+ flags = (
119
+ re .X if kind == "END_VERBATIM_STRING" and "\n " in string else 0
120
+ )
121
+ yield FoundRegex (line , string , flags , False )
106
122
mode = TOKENS_BASE
107
123
break
108
124
else :
@@ -113,10 +129,10 @@ def make_lines(code):
113
129
return [m .start () for m in re .finditer ("\n " , code )]
114
130
115
131
116
def line_of(character_index: int, newline_positions, seen_line: int = 0):
    """Return the 1-based line number containing ``character_index``.

    ``newline_positions`` is the sorted list of offsets of ``"\\n"``
    characters (as produced by ``make_lines``).  ``seen_line`` lets a
    caller that scans the text forward skip newlines it has already
    passed, turning repeated lookups into an amortised single pass;
    it defaults to 0 so existing two-argument callers keep working.
    """
    if not newline_positions:
        # No newlines at all: everything is on line 1.
        return 1
    for line_index, newline_position in enumerate(newline_positions[seen_line:]):
        if character_index < newline_position:
            # Absolute newline index is line_index + seen_line; the
            # character sits on the line that newline terminates.
            return line_index + seen_line + 1
    # character_index lies beyond the last newline, so it is on the final
    # line.  (The previous fallthrough returned an off-by-two value and
    # raised NameError when the sliced list was empty.)
    return len(newline_positions) + 1
0 commit comments