-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract.py
108 lines (93 loc) · 5.31 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Copyright Jiaqi Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from wiktionary.wiktextract.german import get_gender_modifier
from wiktionary.wiktextract.german import get_german_inflection
def extract_data(wiktextract_data_path: str):
    """
    Extract data from raw-wiktextract-data.jsonl useful for wilhelmlang.com.

    Data of each language is written in a dedicated .jsonl file. Each line of the JSONL file has the following fields:

    - term: the word of the language; German terms are prefixed with the result of get_gender_modifier()
    - part of speech: the Part of Speech of this word, or "Unknown" if the entry has no "pos" field
    - definitions: an array of definitions, each element of the array is a string
    - audios: the value returned by get_audios() for the entry
    - inflection: (German only) the value returned by get_german_inflection() for the entry

    Blank lines, entries without a "lang" field, and entries of a language that has no output file are skipped.

    :param wiktextract_data_path: the path of the wiktextract jsonl file. Can be downloaded from https://kaikki.org/dictionary/rawdata.html
    """
    import json
    from contextlib import ExitStack

    from wiktionary.wiktextract.parse import get_audios
    from wiktionary.wiktextract.parse import get_definitions

    # Value of the entry's "lang" field -> output file for that language.
    # NOTE(review): this lists the same languages as the module-level LANGUAGES constant; keep the two in sync.
    output_paths = {
        "German": "german-wiktextract-data.jsonl",
        "Latin": "latin-wiktextract-data.jsonl",
        "Ancient Greek": "ancient-greek-wiktextract-data.jsonl",
        "Korean": "korean-wiktextract-data.jsonl",
        "Old Persian": "old-persian-wiktextract-data.jsonl",
        "Akkadian": "akkadian-wiktextract-data.jsonl",
        "Elamite": "elamite-wiktextract-data.jsonl",
        "Sanskrit": "sanskrit-wiktextract-data.jsonl",
    }

    with ExitStack() as stack:
        # The encoding is explicit so that reading does not depend on the locale's default encoding.
        # The input is opened first so that a bad path fails before any output file is created or truncated.
        data = stack.enter_context(open(wiktextract_data_path, encoding="utf-8"))
        outputs = {
            language: stack.enter_context(open(path, "w", encoding="utf-8"))
            for language, path in output_paths.items()
        }

        for line in data:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            vocabulary = json.loads(line)
            if "lang" not in vocabulary:
                continue
            language = vocabulary["lang"]
            # Skip other languages before doing any per-entry extraction work.
            if language not in outputs:
                continue

            term = vocabulary["word"]
            record = {
                "term": term,
                "part of speech": vocabulary.get("pos", "Unknown"),
                "definitions": get_definitions(vocabulary),
                "audios": get_audios(vocabulary),
            }
            if language == "German":
                # Re-assigning "term" keeps it as the first key; "inflection" is appended last.
                record["term"] = get_gender_modifier(vocabulary) + term
                record["inflection"] = get_german_inflection(vocabulary)

            outputs[language].write(json.dumps(record) + "\n")
# Languages whose entries extract_graph() keeps; entries of any other language are skipped.
LANGUAGES = [
    "German",
    "Latin",
    "Ancient Greek",
    "Korean",
    "Old Persian",
    "Akkadian",
    "Elamite",
    "Sanskrit",
]
def extract_graph(wiktextract_data_path: str):
    """
    Extract a word-to-definition graph from raw-wiktextract-data.jsonl.

    For every entry whose language is listed in LANGUAGES, one line is written to word-definition-graph-data.jsonl per
    definition returned by get_definitions(). Each line of the JSONL file has the following fields:

    - source: an object with the "term" (German terms are prefixed with the result of get_gender_modifier()) and its
      "language"
    - target: one definition of the term
    - label: always "definition"

    Blank lines, entries without a "lang" field, and entries of other languages are skipped.

    :param wiktextract_data_path: the path of the wiktextract jsonl file
    """
    import json
    from wiktionary.wiktextract.parse import get_definitions

    # The encoding is explicit so that reading does not depend on the locale's default encoding.
    with (
        open(wiktextract_data_path, encoding="utf-8") as data,
        open("word-definition-graph-data.jsonl", "w", encoding="utf-8") as graph,
    ):
        for line in data:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue

            vocabulary = json.loads(line)
            if "lang" not in vocabulary or vocabulary["lang"] not in LANGUAGES:
                continue

            term = vocabulary["word"]
            if vocabulary["lang"] == "German":
                term = get_gender_modifier(vocabulary) + term

            source_node = {"term": term, "language": vocabulary["lang"]}
            for definition in get_definitions(vocabulary):
                edge = {"source": source_node, "target": definition, "label": "definition"}
                graph.write(json.dumps(edge) + "\n")