Skip to content

Commit 90629ea

Browse files
authored
Merge pull request #551 from alephdata/bug/aleph-2957
Emit a verbose error when processing a password-protected XLS / XLSX file
2 parents 14118ec + a0aef89 commit 90629ea

File tree

10 files changed

+38
-10
lines changed

10 files changed

+38
-10
lines changed

ingestors/documents/pdf.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from ingestors.ingestor import Ingestor
44
from ingestors.support.pdf import PDFSupport
5-
from ingestors.exc import ProcessingException, UnauthorizedError
5+
from ingestors.exc import ProcessingException, UnauthorizedError, ENCRYPTED_MSG
66

77
log = logging.getLogger(__name__)
88

@@ -24,9 +24,7 @@ def ingest(self, file_path, entity):
2424
try:
2525
self.parse_and_ingest(file_path, entity, self.manager)
2626
except UnauthorizedError as pwe:
27-
raise ProcessingException(
28-
"Could not extract PDF file. The PDF is protected with a password. Try removing the password protection and re-uploading the documents."
29-
) from pwe
27+
raise ProcessingException(ENCRYPTED_MSG) from pwe
3028
except Exception as ex:
3129
raise ProcessingException("Could not extract PDF file: %r" % ex) from ex
3230

ingestors/exc.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
ENCRYPTED_MSG = "The document might be protected with a password. Try removing the password protection and re-uploading the documents."
2+
3+
14
class ProcessingException(Exception):
25
"A data-related error occurring during file processing."
36
pass

ingestors/manager.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from followthemoney.namespace import Namespace
1818

1919
from ingestors.directory import DirectoryIngestor
20-
from ingestors.exc import ProcessingException
20+
from ingestors.exc import ProcessingException, ENCRYPTED_MSG
2121
from ingestors.util import filter_text, remove_directory
2222
from ingestors import settings
2323

@@ -94,6 +94,9 @@ def auction(self, file_path, entity):
9494
return DirectoryIngestor
9595
entity.add("mimeType", self.MAGIC.from_file(file_path.as_posix()))
9696

97+
if "application/encrypted" in entity.get("mimeType"):
98+
raise ProcessingException(ENCRYPTED_MSG)
99+
97100
best_score, best_cls = 0, None
98101
for cls in get_extensions("ingestors"):
99102
score = cls.match(file_path, entity)

ingestors/support/ooxml.py

+4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from ingestors.support.xml import XMLSupport
66
from ingestors.support.timestamp import TimestampSupport
7+
from ingestors.exc import ProcessingException, ENCRYPTED_MSG
78

89
# from ingestors.exc import ProcessingException
910

@@ -53,6 +54,9 @@ def get(ns, name):
5354
@classmethod
5455
def inspect_ooxml_manifest(cls, file_path):
5556
if not zipfile.is_zipfile(file_path):
57+
# password-protected Excel files are not detected as zipfiles
58+
if "Excel" in cls.__name__:
59+
raise ProcessingException(ENCRYPTED_MSG)
5660
return False
5761
try:
5862
with zipfile.ZipFile(file_path, "r") as zf:

ingestors/tabular/xls.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from ingestors.ingestor import Ingestor
88
from ingestors.support.table import TableSupport
99
from ingestors.support.ole import OLESupport
10-
from ingestors.exc import ProcessingException
10+
from ingestors.exc import ProcessingException, ENCRYPTED_MSG
1111

1212
log = logging.getLogger(__name__)
1313

@@ -49,6 +49,8 @@ def ingest(self, file_path, entity):
4949
self.extract_ole_metadata(file_path, entity)
5050
try:
5151
book = xlrd.open_workbook(file_path, formatting_info=False)
52+
except XLRDError:
53+
raise ProcessingException(ENCRYPTED_MSG)
5254
except Exception as err:
5355
raise ProcessingException("Invalid Excel file: %s" % err) from err
5456

ingestors/tabular/xlsx.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
23
from followthemoney import model
34
from openpyxl import load_workbook
45
from xml.etree.ElementTree import ParseError

tests/fixtures/password_protected.xls

25.5 KB
Binary file not shown.
tests/fixtures/password_protected.xlsx

15.5 KB
Binary file not shown.

tests/test_pdf.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22

33
from .support import TestCase
4+
from ingestors.exc import ENCRYPTED_MSG
45

56

67
class PDFIngestorTest(TestCase):
@@ -84,10 +85,7 @@ def test_ingest_protected(self):
8485
text = self.manager.entities[0].first("bodyText")
8586
self.assertEqual(None, text)
8687
err = self.manager.entities[0].first("processingError")
87-
self.assertIn(
88-
"Could not extract PDF file. The PDF is protected with a password. Try removing the password protection and re-uploading the documents.",
89-
err,
90-
)
88+
self.assertIn(ENCRYPTED_MSG, err)
9189
status = self.manager.entities[0].first("processingStatus")
9290
self.assertEqual("failure", status)
9391

tests/test_tabular.py

+19
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# -*- coding: utf-8 -*-
22
from .support import TestCase
3+
from ingestors.exc import ENCRYPTED_MSG
34

45

56
class TabularIngestorTest(TestCase):
@@ -34,3 +35,21 @@ def test_unicode_ods(self):
3435
tables = [t.first("title") for t in tables]
3536
self.assertIn("Лист1", tables)
3637
self.assertEqual(entity.schema.name, "Workbook")
38+
39+
def test_password_protected_xlsx(self):
40+
fixture_path, entity = self.fixture("password_protected.xlsx")
41+
self.manager.ingest(fixture_path, entity)
42+
self.assertEqual(len(self.get_emitted()), 1)
43+
err = self.manager.entities[0].first("processingError")
44+
self.assertIn(ENCRYPTED_MSG, err)
45+
status = self.manager.entities[0].first("processingStatus")
46+
self.assertEqual("failure", status)
47+
48+
def test_password_protected_xls(self):
49+
fixture_path, entity = self.fixture("password_protected.xls")
50+
self.manager.ingest(fixture_path, entity)
51+
self.assertEqual(len(self.get_emitted()), 1)
52+
err = self.manager.entities[0].first("processingError")
53+
self.assertIn(ENCRYPTED_MSG, err)
54+
status = self.manager.entities[0].first("processingStatus")
55+
self.assertEqual("failure", status)

0 commit comments

Comments
 (0)