Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix TIFF processing. Add tests to prevent regression in OCR for gif, jpg, jp2, tiff, webp #587

Merged
merged 6 commits into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions ingestors/media/tiff.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import logging
from followthemoney import model

from ingestors.ingestor import Ingestor
from ingestors.support.pdf import PDFSupport
from ingestors.support.shell import ShellSupport
from ingestors.support.temp import TempFileSupport

log = logging.getLogger(__name__)
from ingestors.exc import ProcessingException


class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
Expand All @@ -22,8 +20,24 @@ class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
def ingest(self, file_path, entity):
    """Convert a TIFF file to PDF and extract the result into ``entity``.

    The entity's schema is set to ``Pages``, the TIFF is converted with the
    ``tiff2pdf`` command-line tool into a work file, and the resulting PDF
    is handed to ``pdf_alternative_extract``.

    Args:
        file_path: Path of the TIFF file to convert.
        entity: Entity that receives the ``Pages`` schema and is passed on
            to the PDF extraction step.

    Raises:
        ProcessingException: Presumably raised by ``exec_command`` when the
            fallback conversion fails as well; it is not caught here.
    """
    entity.schema = model.get("Pages")
    pdf_path = self.make_work_file("tiff.pdf")

    # Arguments common to both conversion attempts: x/y resolution of 300
    # and the output path.
    shared_args = ("-x", "300", "-y", "300", "-o", pdf_path)

    try:
        # Preferred attempt. Per the tiff2pdf manual, "-n" disables
        # compressed-data passthrough and "-j" compresses with JPEG.
        self.exec_command("tiff2pdf", file_path, "-n", "-j", *shared_args)
    except ProcessingException:
        # Fall back to the plain invocation without "-n"/"-j". This is the
        # only place the plain invocation runs, so a failure of the
        # preferred attempt always gets a second chance.
        self.exec_command("tiff2pdf", file_path, *shared_args)

    self.assert_outfile(pdf_path)

    self.pdf_alternative_extract(entity, pdf_path, self.manager)
Binary file added tests/fixtures/regression_gif.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes
Binary file added tests/fixtures/regression_openjpeg.jp2
Binary file not shown.
Binary file added tests/fixtures/regression_tiff.tiff
Binary file not shown.
Binary file added tests/fixtures/regression_webp.webp
Binary file not shown.
Binary file removed tests/fixtures/some hand wirtten veird text.jpg
Binary file not shown.
82 changes: 76 additions & 6 deletions tests/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,80 @@ def test_ingest_on_svg(self):
self.assertIn("TEST", entity.first("bodyText"))
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)

def test_ingest_on_jpeg(self):
    """Ingest the JPEG fixture and check its body text and MIME type."""
    jpeg_path, jpeg_entity = self.fixture("jpegtest.jpg")
    self.manager.ingest(jpeg_path, jpeg_entity)

    # The body text must mention "Debian".
    body_text = jpeg_entity.first("bodyText")
    self.assertIn("Debian", body_text)

    # The file must have been recognised as a JPEG image.
    detected_mime = jpeg_entity.first("mimeType")
    self.assertEqual(detected_mime, "image/jpeg")
def test_tesseract_ocr_regression(self):
    """This test is meant to catch a regression in the OCR behaviour
    described in this PR: https://github.com/alephdata/ingest-file/pull/585

    For each image format listed in ``test_data`` the fixture is ingested,
    and the most recently emitted entity whose MIME type contains "image"
    must have the expected MIME type, a SUCCESS processing status and the
    expected text in either ``bodyText`` or ``indexText``.
    """
    # One entry per image format: fixture file name, a text fragment that
    # must be found in the entity, and the MIME type the entity must carry.
    test_data = {
        "jpeg": {
            "file": "regression_jpg.jpg",
            "content": "Debian -- Packages",
            "mime_type": "image/jpeg",
        },
        "gif": {
            "file": "regression_gif.gif",
            "content": "This is text inside a GIF image",
            "mime_type": "image/gif",
        },
        "tiff": {
            "file": "regression_tiff.tiff",
            "content": "Debian -- Packages",
            "mime_type": "image/tiff",
        },
        "webp": {
            "file": "regression_webp.webp",
            "content": "Debian -- Packages",
            "mime_type": "image/webp",
        },
        "openjpeg": {
            "file": "regression_openjpeg.jp2",
            "content": "Debian -- Packages",
            "mime_type": "image/jp2",
        },
    }

    for case in test_data.values():
        # Same failure message for every assertion of this case, so a
        # failing format can be identified from the test output.
        failure_msg = f"Test failed for {case['file']}"

        fixture_path, entity = self.fixture(case["file"])
        self.manager.ingest(fixture_path, entity)

        emitted_image_entities = [
            x
            for x in self.get_emitted()
            if "mimeType" in x.properties and "image" in x.first("mimeType")
        ]

        # Have entities been emitted with a mime type that contains "image"?
        self.assertTrue(len(emitted_image_entities) != 0, failure_msg)
        # NOTE(review): pop() takes the last matching entity, presumably
        # the one emitted for the file ingested just above; confirm that
        # get_emitted() preserves emission order.
        image_entity = emitted_image_entities.pop()

        # Is the mimeType correct?
        self.assertEqual(
            image_entity.first("mimeType"), case["mime_type"], failure_msg
        )

        # Is the processing status of the entity == SUCCESS?
        self.assertEqual(
            image_entity.first("processingStatus"),
            self.manager.STATUS_SUCCESS,
            failure_msg,
        )

        # Does either the bodyText prop or the indexText prop contain
        # the text resulting from OCR? indexText is only consulted when
        # bodyText is missing (None), not when bodyText lacks the text.
        ocr_text = image_entity.first("bodyText")
        if ocr_text is None:
            ocr_text = image_entity.first("indexText")
        self.assertIn(case["content"], ocr_text, failure_msg)
41 changes: 36 additions & 5 deletions tests/test_tiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,45 @@ class TIFFIngestorTest(TestCase):
def test_match(self):
    """Ingest the multi-page TIFF fixture and check what was emitted.

    Checks the MIME type and processing status of both the input entity
    and the last emitted image entity, and that 11 entities were emitted.
    """
    tiff_path, tiff_entity = self.fixture("multipage_tiff_example.tif")
    self.manager.ingest(tiff_path, tiff_entity)

    # NOTE(review): these two assertions inspect the entity object passed
    # to ingest(), while the ones further down inspect an emitted entity.
    # Confirm that both sets are meant to coexist.
    self.assertEqual(tiff_entity.first("mimeType"), "image/tiff")
    self.assertEqual(
        tiff_entity.first("processingStatus"), self.manager.STATUS_SUCCESS
    )

    # Collect every emitted entity whose MIME type contains "image".
    image_candidates = []
    for emitted in self.get_emitted():
        if "mimeType" not in emitted.properties:
            continue
        if "image" in emitted.first("mimeType"):
            image_candidates.append(emitted)

    # Have entities been emitted with a mime type that contains "image"?
    self.assertTrue(
        len(image_candidates) > 0,
        "Test failed for multipage_tiff_example.tif",
    )
    last_image = image_candidates.pop()

    self.assertEqual(last_image.first("mimeType"), "image/tiff")
    self.assertEqual(
        last_image.first("processingStatus"), self.manager.STATUS_SUCCESS
    )

    # The ingest is expected to have emitted 11 entities in total.
    all_emitted = self.get_emitted()
    self.assertEqual(len(all_emitted), 11)

def test_ingest_tiff_format(self):
    """Ingest the single-page "hello world" TIFF and check its text.

    Both the entity looked up by id and the last emitted image entity must
    carry "HELLO WORLD" as ``indexText``; processing must have succeeded.
    """
    fixture_path, entity = self.fixture("hello_world_tiff.tif")
    self.manager.ingest(fixture_path, entity)

    # NOTE(review): this first group checks the input entity and its
    # emitted counterpart looked up by id; the group below checks the last
    # emitted image entity. Confirm both groups are meant to coexist.
    self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
    entity = self.get_emitted_by_id(entity.id)
    self.assertEqual(entity.first("indexText"), "HELLO WORLD")

    emitted_image_entities = [
        x
        for x in self.get_emitted()
        if "mimeType" in x.properties and "image" in x.first("mimeType")
    ]

    # Have entities been emitted with a mime type that contains "image"?
    # The message names the fixture this test actually ingests.
    self.assertTrue(
        len(emitted_image_entities) != 0,
        "Test failed for hello_world_tiff.tif",
    )
    image_entity = emitted_image_entities.pop()

    self.assertEqual(
        image_entity.first("processingStatus"), self.manager.STATUS_SUCCESS
    )
    self.assertEqual(image_entity.first("indexText"), "HELLO WORLD")
Loading