
Commit c014ac9

Merge pull request #657 from alephdata/bug/3894

Add Workbook metadata to Table entities

2 parents: 97050ac + 325eced

6 files changed: +68 -2 lines

ingestors/cli.py (+15 -1)

@@ -8,7 +8,12 @@
 from ftmstore import get_dataset
 from servicelayer.cache import get_redis
 from servicelayer.logs import configure_logging
-from servicelayer.taskqueue import Dataset, Task
+from servicelayer.taskqueue import (
+    Dataset,
+    Task,
+    get_rabbitmq_channel,
+    declare_rabbitmq_queue,
+)
 from servicelayer import settings as sl_settings
 from servicelayer.archive.util import ensure_path
 from servicelayer import settings as sls

@@ -78,6 +83,7 @@ def _ingest_path(db, dataset, path, languages=[]):
         entity.make_id(checksum)
         entity.set("fileName", path.name)
         log.info("Queue: %r", entity.to_dict())
+
         manager.queue_entity(entity)
     if path.is_dir():
         DirectoryIngestor.crawl(manager, path)

@@ -116,6 +122,7 @@ def analyze(dataset):
 def debug(path, languages=None):
     """Debug the ingest for the given path."""
     settings.fts.DATABASE_URI = "sqlite:////tmp/debug.sqlite3"
+    settings.TESTING = True

     # collection ID that is meant for testing purposes only
     debug_datatset_id = 100

@@ -126,6 +133,13 @@ def debug(path, languages=None):
         database_uri=settings.fts.DATABASE_URI,
     )
     db.delete()
+    channel = get_rabbitmq_channel()
+    qos_mapping = {
+        settings.STAGE_INGEST: settings.RABBITMQ_QOS_INGEST_QUEUE,
+        settings.STAGE_ANALYZE: settings.RABBITMQ_QOS_ANALYZE_QUEUE,
+    }
+    for queue_name in qos_mapping.keys():
+        declare_rabbitmq_queue(channel, queue_name, qos_mapping[queue_name])
     _ingest_path(db, debug_datatset_id, path, languages=languages)
     worker = get_worker()
     worker.process(blocking=False)
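The debug command now declares the ingest and analyze RabbitMQ queues before queueing any work, presumably because nothing else has declared them in this standalone code path. As a rough illustration of what that declaration amounts to (a sketch only, assuming get_rabbitmq_channel() hands back a pika BlockingChannel and that the QoS settings are prefetch counts; the real servicelayer helpers may differ):

    import pika

    def declare_queues(channel):
        # channel is assumed to be a pika BlockingChannel.
        # Hypothetical queue names and prefetch values standing in for
        # settings.STAGE_INGEST / settings.RABBITMQ_QOS_INGEST_QUEUE etc.
        qos_mapping = {"ingest": 1, "analyze": 1}
        for queue_name, prefetch_count in qos_mapping.items():
            # Durable queues survive a broker restart; declaring is idempotent.
            channel.queue_declare(queue=queue_name, durable=True)
            # Limit how many unacknowledged messages a consumer holds at once.
            channel.basic_qos(prefetch_count=prefetch_count)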

ingestors/tabular/ods.py (+10)

@@ -63,6 +63,16 @@ def ingest(self, file_path, entity):
             table = self.manager.make_entity("Table", parent=entity)
             table.make_id(entity.id, name)
             table.set("title", name)
+            # add workbook metadata to individual tables
+            for metadatum in [
+                "authoredAt",
+                "author",
+                "summary",
+                "generator",
+                "date",
+                "processingAgent",
+            ]:
+                table.set(metadatum, entity.get(metadatum))
             # Emit a partial table fragment with parent reference and name
             # early, so that we don't have orphan fragments in case of an error
             # in the middle of processing.
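The copy works because followthemoney properties are multi-valued: entity.get() returns a list of values (possibly empty) and table.set() replaces the table's values with that list, so metadata missing from the workbook simply stays missing on the table. A standalone sketch outside ingest-file, with made-up sample values:

    from followthemoney import model

    # Build a throwaway Workbook proxy carrying some assumed metadata.
    workbook = model.make_entity("Workbook")
    workbook.make_id("example-workbook")
    workbook.add("author", "Jane Doe")
    workbook.add("author", "John Doe")
    workbook.add("generator", "LibreOffice Calc")

    table = model.make_entity("Table")
    table.make_id(workbook.id, "Sheet1")
    table.set("title", "Sheet1")
    for metadatum in ["author", "summary", "generator"]:
        # get() returns every value of the property; set() copies them over.
        table.set(metadatum, workbook.get(metadatum))

    print(table.get("author"))   # both workbook authors
    print(table.get("summary"))  # [] - nothing to inherit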

ingestors/tabular/xls.py (+11)

@@ -59,6 +59,17 @@ def ingest(self, file_path, entity):
             table = self.manager.make_entity("Table", parent=entity)
             table.make_id(entity.id, sheet.name)
             table.set("title", sheet.name)
+            # add workbook metadata to individual tables
+            for metadatum in [
+                "authoredAt",
+                "modifiedAt",
+                "author",
+                "summary",
+                "generator",
+                "language",
+                "processingAgent",
+            ]:
+                table.set(metadatum, entity.get(metadatum))
             # Emit a partial table fragment with parent reference and name
             # early, so that we don't have orphan fragments in case of an error
             # in the middle of processing.

ingestors/tabular/xlsx.py (+11 -1)

@@ -48,12 +48,22 @@ def ingest(self, file_path, entity):
             table = self.manager.make_entity("Table", parent=entity)
             table.make_id(entity.id, name)
             table.set("title", name)
+            # add workbook metadata to individual tables
+            for metadatum in [
+                "authoredAt",
+                "modifiedAt",
+                "author",
+                "summary",
+                "generator",
+                "language",
+                "processingAgent",
+            ]:
+                table.set(metadatum, entity.get(metadatum))
             # Emit a partial table fragment with parent reference and name
             # early, so that we don't have orphan fragments in case of an error
             # in the middle of processing.
             # See https://github.com/alephdata/ingest-file/issues/171
             self.manager.emit_entity(table, fragment="initial")
-            log.debug("Sheet: %s", name)
             self.emit_row_tuples(table, self.generate_rows(sheet))
             if table.has("csvHash"):
                 self.manager.emit_entity(table)
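One practical effect, shown with a hypothetical helper rather than code from this repository: a consumer holding only the Table entity can now read workbook-level metadata directly, without first resolving the parent Workbook.

    def table_author(table_entity):
        # first() returns a single value (or None) from a multi-valued property,
        # the same accessor the test below uses for "processingStatus".
        return table_entity.first("author")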

tests/fixtures/staff_list.xlsx (8.2 KB, binary file not shown)

tests/test_tabular.py (+21)

@@ -53,3 +53,24 @@ def test_password_protected_xls(self):
         self.assertIn(ENCRYPTED_MSG, err)
         status = self.manager.entities[0].first("processingStatus")
         self.assertEqual("failure", status)
+
+    def test_metadata_inheritance(self):
+        fixture_path, entity = self.fixture("staff_list.xlsx")
+        self.manager.ingest(fixture_path, entity)
+        table_entities = self.get_emitted("Table")
+        parent_entity = self.get_emitted("Workbook").pop()
+        self.assertEqual(len(table_entities), 3)
+
+        for metadatum in [
+            "authoredAt",
+            "modifiedAt",
+            "author",
+            "summary",
+            "generator",
+            "language",
+            "processingAgent",
+        ]:
+            for table_entity in table_entities:
+                self.assertEqual(
+                    table_entity.get(metadatum), parent_entity.get(metadatum)
+                )
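To run only the new test locally, something along these lines should work, assuming the project's usual pytest setup and that the test dependencies and fixtures are available in your environment:

    pytest tests/test_tabular.py -k test_metadata_inheritance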
