Skip to content

Commit 01cfc55

Browse files
committed
[fix] HTMLParser: undocumented, unimplemented error() method
In Python versions < 3.10 there is an issue with the undocumented method HTMLParser.error() [1][2], which was deprecated in Python 3.4 and removed in Python 3.5. To be compatible with newer versions (>= 3.10), an error() method is implemented that raises an AssertionError, as the newer Python versions do [3]. [1] python/cpython#76025 [2] https://bugs.python.org/issue31844 [3] python/cpython#8562 Signed-off-by: Markus Heiser <[email protected]>
1 parent b013cbb commit 01cfc55

File tree

1 file changed

+13
-1
lines changed

1 file changed

+13
-1
lines changed

searx/utils.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from os.path import splitext, join
1616
from random import choice
1717
from html.parser import HTMLParser
18+
from html import escape
1819
from urllib.parse import urljoin, urlparse
1920
from markdown_it import MarkdownIt
2021

@@ -88,7 +89,7 @@ class _HTMLTextExtractorException(Exception):
8889
"""Internal exception raised when the HTML is invalid"""
8990

9091

91-
class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://bugs.python.org/issue31844)
92+
class _HTMLTextExtractor(HTMLParser):
9293
"""Internal class to extract text from HTML"""
9394

9495
def __init__(self):
@@ -137,6 +138,11 @@ def handle_entityref(self, name):
137138
def get_text(self):
138139
return ''.join(self.result).strip()
139140

141+
def error(self, message):
142+
# an error handler is needed in Python < 3.10
143+
# https://github.com/python/cpython/pull/8562/files
144+
raise AssertionError(message)
145+
140146

141147
def html_to_text(html_str: str) -> str:
142148
"""Extract text from a HTML string
@@ -153,12 +159,18 @@ def html_to_text(html_str: str) -> str:
153159
154160
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
155161
'Example'
162+
163+
>>> html_to_text(r'regexp: (?<![a-zA-Z]')
164+
'regexp: (?<![a-zA-Z]'
156165
"""
157166
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
158167
html_str = ' '.join(html_str.split())
159168
s = _HTMLTextExtractor()
160169
try:
161170
s.feed(html_str)
171+
except AssertionError:
172+
s = _HTMLTextExtractor()
173+
s.feed(escape(html_str, quote=True))
162174
except _HTMLTextExtractorException:
163175
logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
164176
return s.get_text()

0 commit comments

Comments
 (0)