fix(sql): unable to filter text with quotes (apache#17881)

stephenLYZ · ofekisr · commit ebde45abf1de · 2022-02-08T20:26:12.000+02:00
diff --git a/superset/connectors/base/models.py b/superset/connectors/base/models.py
@@ -398,7 +398,7 @@ def handle_single_value(value: Optional[FilterValue]) -> Optional[FilterValue]:
             ):
                 return datetime.utcfromtimestamp(value / 1000)
             if isinstance(value, str):
-                value = value.strip("\t\n'\"")
+                value = value.strip("\t\n")
 
                 if target_column_type == utils.GenericDataType.NUMERIC:
                     # For backwards compatibility and edge cases
diff --git a/tests/integration_tests/druid_func_tests.py b/tests/integration_tests/druid_func_tests.py
@@ -359,7 +359,7 @@ def test_get_filters_extracts_values_in_quotes(self):
         col = DruidColumn(column_name="A")
         column_dict = {"A": col}
         res = DruidDatasource.get_filters([filtr], [], column_dict)
-        self.assertEqual("a", res.filter["filter"]["value"])
+        self.assertEqual('"a"', res.filter["filter"]["value"])
 
     @unittest.skipUnless(
         SupersetTestCase.is_module_installed("pydruid"), "pydruid not installed"
diff --git a/tests/integration_tests/druid_func_tests_sip38.py b/tests/integration_tests/druid_func_tests_sip38.py
@@ -364,7 +364,7 @@ def test_get_filters_extracts_values_in_quotes(self):
         col = DruidColumn(column_name="A")
         column_dict = {"A": col}
         res = DruidDatasource.get_filters([filtr], [], column_dict)
-        self.assertEqual("a", res.filter["filter"]["value"])
+        self.assertEqual('"a"', res.filter["filter"]["value"])
 
     @unittest.skipUnless(
         SupersetTestCase.is_module_installed("pydruid"), "pydruid not installed"
diff --git a/tests/integration_tests/query_context_tests.py b/tests/integration_tests/query_context_tests.py
@@ -380,7 +380,7 @@ def test_query_response_type(self):
         assert re.search(r'[`"\[]?num[`"\]]? IS NOT NULL', sql_text)
         assert re.search(
             r"""NOT \([`"\[]?name[`"\]]? IS NULL[\s\n]* """
-            r"""OR [`"\[]?name[`"\]]? IN \('abc'\)\)""",
+            r"""OR [`"\[]?name[`"\]]? IN \('"abc"'\)\)""",
             sql_text,
         )
 
diff --git a/tests/integration_tests/sqla_models_tests.py b/tests/integration_tests/sqla_models_tests.py
@@ -47,6 +47,7 @@
     load_birth_names_dashboard_with_slices,
     load_birth_names_data,
 )
+from tests.integration_tests.test_app import app
 
 from .base_tests import SupersetTestCase
 
@@ -475,80 +476,133 @@ def test_labels_expected_on_mutated_query(self):
         db.session.delete(database)
         db.session.commit()
 
-    def test_values_for_column(self):
+
+@pytest.fixture
+def text_column_table():
+    with app.app_context():
         table = SqlaTable(
-            table_name="test_null_in_column",
+            table_name="text_column_table",
             sql=(
                 "SELECT 'foo' as foo "
                 "UNION SELECT '' "
                 "UNION SELECT NULL "
-                "UNION SELECT 'null'"
+                "UNION SELECT 'null' "
+                "UNION SELECT '\"text in double quotes\"' "
+                "UNION SELECT '''text in single quotes''' "
+                "UNION SELECT 'double quotes \" in text' "
+                "UNION SELECT 'single quotes '' in text' "
             ),
             database=get_example_database(),
         )
         TableColumn(column_name="foo", type="VARCHAR(255)", table=table)
         SqlMetric(metric_name="count", expression="count(*)", table=table)
+        yield table
 
-        # null value, empty string and text should be retrieved
-        with_null = table.values_for_column(column_name="foo", limit=10000)
-        assert None in with_null
-        assert len(with_null) == 4
 
-        # null value should be replaced
-        result_object = table.query(
-            {
-                "metrics": ["count"],
-                "filter": [{"col": "foo", "val": [NULL_STRING], "op": "IN"}],
-                "is_timeseries": False,
-            }
-        )
-        assert result_object.df["count"][0] == 1
+def test_values_for_column_on_text_column(text_column_table):
+    # null value, empty string and text should be retrieved
+    with_null = text_column_table.values_for_column(column_name="foo", limit=10000)
+    assert None in with_null
+    assert len(with_null) == 8
 
-        # also accept None value
-        result_object = table.query(
-            {
-                "metrics": ["count"],
-                "filter": [{"col": "foo", "val": [None], "op": "IN"}],
-                "is_timeseries": False,
-            }
-        )
-        assert result_object.df["count"][0] == 1
 
-        # empty string should be replaced
-        result_object = table.query(
-            {
-                "metrics": ["count"],
-                "filter": [{"col": "foo", "val": [EMPTY_STRING], "op": "IN"}],
-                "is_timeseries": False,
-            }
-        )
-        assert result_object.df["count"][0] == 1
+def test_filter_on_text_column(text_column_table):
+    table = text_column_table
+    # null value should be replaced
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": [NULL_STRING], "op": "IN"}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
 
-        # also accept "" string
-        result_object = table.query(
-            {
-                "metrics": ["count"],
-                "filter": [{"col": "foo", "val": [""], "op": "IN"}],
-                "is_timeseries": False,
-            }
-        )
-        assert result_object.df["count"][0] == 1
+    # also accept None value
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": [None], "op": "IN"}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
 
-        # both replaced
-        result_object = table.query(
-            {
-                "metrics": ["count"],
-                "filter": [
-                    {
-                        "col": "foo",
-                        "val": [EMPTY_STRING, NULL_STRING, "null", "foo"],
-                        "op": "IN",
-                    }
-                ],
-                "is_timeseries": False,
-            }
-        )
-        assert result_object.df["count"][0] == 4
+    # empty string should be replaced
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": [EMPTY_STRING], "op": "IN"}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
+
+    # also accept "" string
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": [""], "op": "IN"}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
+
+    # both replaced
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [
+                {
+                    "col": "foo",
+                    "val": [EMPTY_STRING, NULL_STRING, "null", "foo"],
+                    "op": "IN",
+                }
+            ],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 4
+
+    # should filter text in double quotes
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": ['"text in double quotes"'], "op": "IN",}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
+
+    # should filter text in single quotes
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": ["'text in single quotes'"], "op": "IN",}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
+
+    # should filter text with double quote
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": ['double quotes " in text'], "op": "IN",}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
+
+    # should filter text with single quote
+    result_object = table.query(
+        {
+            "metrics": ["count"],
+            "filter": [{"col": "foo", "val": ["single quotes ' in text"], "op": "IN",}],
+            "is_timeseries": False,
+        }
+    )
+    assert result_object.df["count"][0] == 1
 
 
 @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -380,7 +380,7 @@ def test_query_response_type(self):`
`380`	`380`	assert re.search(r'[`"\[]?num[`"\]]? IS NOT NULL', sql_text)
`381`	`381`	`assert re.search(`
`382`	`382`	r"""NOT \([`"\[]?name[`"\]]? IS NULL[\s\n]* """
`383`		- r"""OR [`"\[]?name[`"\]]? IN \('abc'\)\)""",
	`383`	+ r"""OR [`"\[]?name[`"\]]? IN \('"abc"'\)\)""",
`384`	`384`	`sql_text,`
`385`	`385`	`)`
`386`	`386`