Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emit a verbose error when processing a password-protected XLS / XLSX file #551

Merged
merged 4 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions ingestors/documents/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from ingestors.ingestor import Ingestor
from ingestors.support.pdf import PDFSupport
from ingestors.exc import ProcessingException, UnauthorizedError
from ingestors.exc import ProcessingException, UnauthorizedError, ENCRYPTED_MSG

log = logging.getLogger(__name__)

Expand All @@ -24,9 +24,7 @@ def ingest(self, file_path, entity):
try:
self.parse_and_ingest(file_path, entity, self.manager)
except UnauthorizedError as pwe:
raise ProcessingException(
"Could not extract PDF file. The PDF is protected with a password. Try removing the password protection and re-uploading the documents."
) from pwe
raise ProcessingException(ENCRYPTED_MSG) from pwe
except Exception as ex:
raise ProcessingException("Could not extract PDF file: %r" % ex) from ex

Expand Down
3 changes: 3 additions & 0 deletions ingestors/exc.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
ENCRYPTED_MSG = "The document might be protected with a password. Try removing the password protection and re-uploading the documents."


class ProcessingException(Exception):
    """A data-related error occurring during file processing."""
Expand Down
5 changes: 4 additions & 1 deletion ingestors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from followthemoney.namespace import Namespace

from ingestors.directory import DirectoryIngestor
from ingestors.exc import ProcessingException
from ingestors.exc import ProcessingException, ENCRYPTED_MSG
from ingestors.util import filter_text, remove_directory
from ingestors import settings

Expand Down Expand Up @@ -94,6 +94,9 @@ def auction(self, file_path, entity):
return DirectoryIngestor
entity.add("mimeType", self.MAGIC.from_file(file_path.as_posix()))

if "application/encrypted" in entity.get("mimeType"):
raise ProcessingException(ENCRYPTED_MSG)

best_score, best_cls = 0, None
for cls in get_extensions("ingestors"):
score = cls.match(file_path, entity)
Expand Down
4 changes: 4 additions & 0 deletions ingestors/support/ooxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ingestors.support.xml import XMLSupport
from ingestors.support.timestamp import TimestampSupport
from ingestors.exc import ProcessingException, ENCRYPTED_MSG

# from ingestors.exc import ProcessingException

Expand Down Expand Up @@ -53,6 +54,9 @@ def get(ns, name):
@classmethod
def inspect_ooxml_manifest(cls, file_path):
if not zipfile.is_zipfile(file_path):
# password-protected Excel files are not detected as zipfiles
if "Excel" in cls.__name__:
raise ProcessingException(ENCRYPTED_MSG)
return False
try:
with zipfile.ZipFile(file_path, "r") as zf:
Expand Down
4 changes: 3 additions & 1 deletion ingestors/tabular/xls.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ingestors.ingestor import Ingestor
from ingestors.support.table import TableSupport
from ingestors.support.ole import OLESupport
from ingestors.exc import ProcessingException
from ingestors.exc import ProcessingException, ENCRYPTED_MSG

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -49,6 +49,8 @@ def ingest(self, file_path, entity):
self.extract_ole_metadata(file_path, entity)
try:
book = xlrd.open_workbook(file_path, formatting_info=False)
except XLRDError:
raise ProcessingException(ENCRYPTED_MSG)
except Exception as err:
raise ProcessingException("Invalid Excel file: %s" % err) from err

Expand Down
1 change: 1 addition & 0 deletions ingestors/tabular/xlsx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging

from followthemoney import model
from openpyxl import load_workbook
from xml.etree.ElementTree import ParseError
Expand Down
Binary file added tests/fixtures/password_protected.xls
Binary file not shown.
Binary file added tests/fixtures/password_protected.xlsx
Binary file not shown.
6 changes: 2 additions & 4 deletions tests/test_pdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

from .support import TestCase
from ingestors.exc import ENCRYPTED_MSG


class PDFIngestorTest(TestCase):
Expand Down Expand Up @@ -84,10 +85,7 @@ def test_ingest_protected(self):
text = self.manager.entities[0].first("bodyText")
self.assertEqual(None, text)
err = self.manager.entities[0].first("processingError")
self.assertIn(
"Could not extract PDF file. The PDF is protected with a password. Try removing the password protection and re-uploading the documents.",
err,
)
self.assertIn(ENCRYPTED_MSG, err)
status = self.manager.entities[0].first("processingStatus")
self.assertEqual("failure", status)

Expand Down
19 changes: 19 additions & 0 deletions tests/test_tabular.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from .support import TestCase
from ingestors.exc import ENCRYPTED_MSG


class TabularIngestorTest(TestCase):
Expand Down Expand Up @@ -34,3 +35,21 @@ def test_unicode_ods(self):
tables = [t.first("title") for t in tables]
self.assertIn("Лист1", tables)
self.assertEqual(entity.schema.name, "Workbook")

def test_password_protected_xlsx(self):
    # Ingest the encrypted XLSX fixture; exactly one entity should be emitted.
    path, doc = self.fixture("password_protected.xlsx")
    self.manager.ingest(path, doc)
    emitted = self.get_emitted()
    self.assertEqual(len(emitted), 1)
    # The entity must carry the shared "encrypted" message as its error ...
    error_text = self.manager.entities[0].first("processingError")
    self.assertIn(ENCRYPTED_MSG, error_text)
    # ... and be flagged as a failed ingest.
    outcome = self.manager.entities[0].first("processingStatus")
    self.assertEqual("failure", outcome)

def test_password_protected_xls(self):
    # Ingest the encrypted legacy XLS fixture; exactly one entity should be emitted.
    path, doc = self.fixture("password_protected.xls")
    self.manager.ingest(path, doc)
    emitted = self.get_emitted()
    self.assertEqual(len(emitted), 1)
    # The entity must carry the shared "encrypted" message as its error ...
    error_text = self.manager.entities[0].first("processingError")
    self.assertIn(ENCRYPTED_MSG, error_text)
    # ... and be flagged as a failed ingest.
    outcome = self.manager.entities[0].first("processingStatus")
    self.assertEqual("failure", outcome)