Skip to content

Commit 3148d9f

Browse files
committed
adds language extractor feature
1 parent f9e35aa commit 3148d9f

File tree

3 files changed

+31
-1
lines changed

3 files changed

+31
-1
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ docs/_template/
2929

3030
# pipenv
3131
Pipfile*
32+
venv/
3233

3334
# older stuff
3435
old/

trafilatura/core.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
# own
2828
from .external import justext_rescue, sanitize_tree, SANITIZED_XPATH, try_readability
2929
from .filters import (LANGID_FLAG, check_html_lang, content_fingerprint, duplicate_test,
30-
language_filter, text_chars_test)
30+
language_filter, text_chars_test, extract_lang)
3131
from .htmlprocessing import (convert_tags, handle_textnode, process_node,
3232
delete_by_link_density, link_density_test_tables,
3333
prune_unwanted_nodes, tree_cleaning)
@@ -950,6 +950,10 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
950950
LOGGER.error('duplicate document for URL %s', url)
951951
raise ValueError
952952

953+
954+
# extract lang
955+
document.language = extract_lang(tree_backup_2, temp_text, temp_comments)
956+
953957
# sanity check on language
954958
if target_language is not None:
955959
is_not_target_lang, document = language_filter(temp_text, temp_comments, target_language, document)

trafilatura/filters.py

+25
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,31 @@ def duplicate_test(element, config):
6060
put_in_cache(teststring)
6161
return False
6262

63+
def _first_lang_code(elements, attribute):
    """Return the first non-empty two-letter code found in *attribute*, or None."""
    for element in elements:
        # Values look like "en", "en-US", "en_US" or "de, en": keep the first
        # two characters only, as the original implementation did.
        code = (element.get(attribute) or '').strip().lower()[:2]
        if code:
            return code
    return None


def extract_lang(tree, temp_text, temp_comments, strict=False):
    """Extract the document language from HTML metadata or the classifier.

    Sources are consulted in a fixed priority order and the first language
    code found is returned:

    1. ``<meta http-equiv="content-language" content="...">``
    2. ``<meta property="og:locale" content="...">``
    3. the ``lang`` attribute of ``<html>`` (only if *strict* is set, since
       this attribute is sometimes a wrong indication)
    4. the result of ``language_classifier(temp_text, temp_comments)``

    Args:
        tree: parsed HTML tree supporting ``findall()`` and ``xpath()``
            (presumably an lxml element -- confirm against the caller).
        temp_text: passed through unchanged to ``language_classifier``.
        temp_comments: passed through unchanged to ``language_classifier``.
        strict: also trust the ``lang`` attribute of the ``<html>`` element.

    Returns:
        The first language found: a two-letter lower-cased code when it
        comes from the markup, or whatever ``language_classifier`` returns.
        ``None`` if no source yields a language.
    """
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Language
    lang = _first_lang_code(
        tree.findall('.//meta[@http-equiv="content-language"][@content]'),
        'content',
    )
    if lang:
        return lang
    # locale
    lang = _first_lang_code(
        tree.findall('.//meta[@property="og:locale"][@content]'),
        'content',
    )
    if lang:
        return lang
    # HTML lang attribute: sometimes a wrong indication
    if strict:
        # the language lives in the "lang" attribute here, not in "content"
        lang = _first_lang_code(tree.xpath('//html[@lang]'), 'lang')
        if lang:
            return lang
    # external tool: only consulted when the markup gives no answer
    classified_lang = language_classifier(temp_text, temp_comments)
    if classified_lang:
        return classified_lang
    # nothing found: return None instead of raising IndexError
    return None
87+
6388

6489
def check_html_lang(tree, target_language, strict=False):
6590
'''Check HTML meta-elements for language information and split

0 commit comments

Comments
 (0)