-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathgenerate_data.py
96 lines (82 loc) · 2.55 KB
/
generate_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import glob
import shutil
import argparse
from tqdm import tqdm
def get_lc(path):
    """Return the number of lines in the text file at *path*.

    An empty file yields 0; a final line without a trailing newline still
    counts as a line.

    The file is decoded as UTF-8 with undecodable bytes replaced rather than
    with the locale's default encoding: only the line count matters here, so
    a stray invalid byte (or a non-UTF-8 locale) must not abort the count
    with ``UnicodeDecodeError``. Text mode is kept so that line boundaries
    are still recognised with universal newlines, as before.
    """
    with open(path, encoding="utf-8", errors="replace") as f:
        return sum(1 for _ in f)
def run(args):
    """Copy the training file pairs selected by *args* into the output directory.

    Reads the following attributes from *args*:

    * ``dataset_dir`` -- directory searched for ``<src>-<tgt>_train*.target``
      files and used as the copy source.
    * ``output_dir`` -- destination directory; created if it does not exist.
    * ``training_type`` -- ``"m2m"``, ``"m2o"`` or ``"o2m"``; selects which
      language-pair prefix is globbed for.
    * ``pivot_lang`` -- language fixed on the target side (``m2o``) or on the
      source side (``o2m``).
    * ``exclude_native`` -- when equal to the string ``"yes"``, pairs whose
      source and target language are the same are skipped.
    * ``min_example_count`` -- a pair is copied only if its ``.target`` file
      has at least this many lines.

    For every qualifying pair both ``<src>-<tgt>_train.source`` and
    ``<src>-<tgt>_train.target`` are copied (not moved, despite the progress
    bar label) from ``dataset_dir`` to ``output_dir``.
    """
    split_name = "train"
    pivot = args.pivot_lang
    pair_patterns = {
        "m2m": "*",
        "m2o": f"*-{pivot}",
        "o2m": f"{pivot}-*",
    }
    target_pattern = f"{pair_patterns[args.training_type]}_{split_name}*.target"
    target_files = glob.glob(os.path.join(args.dataset_dir, target_pattern))

    os.makedirs(args.output_dir, exist_ok=True)

    for target_path in tqdm(target_files, desc="Moving data files for training"):
        # File names look like "<src>-<tgt>_<split>.target"; everything before
        # the last underscore is the language pair.
        pair_name = os.path.basename(target_path).rsplit("_", 1)[0]
        src_lang, tgt_lang = pair_name.split("-")

        is_native_pair = src_lang == tgt_lang
        if is_native_pair and args.exclude_native == "yes":
            continue

        # The example count is taken from the target side only.
        if get_lc(target_path) < args.min_example_count:
            continue

        stem = src_lang + "-" + tgt_lang + "_" + split_name
        for extension in (".source", ".target"):
            filename = stem + extension
            shutil.copy(
                os.path.join(args.dataset_dir, filename),
                os.path.join(args.output_dir, filename),
            )
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to run().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dataset_dir',
        # Required: run() joins this path unconditionally, so a missing value
        # would otherwise surface as a TypeError from os.path.join(None, ...).
        required=True,
        metavar='PATH',
        help="Input directory")
    parser.add_argument(
        '--output_dir',
        required=True,
        metavar='PATH',
        help="Output directory")
    parser.add_argument(
        '--pivot_lang', type=str,
        required=True,
        help="Pivot language")
    parser.add_argument(
        '--training_type', type=str,
        choices=["m2m", "m2o", "o2m"],
        required=True,
        help='Training type (many-to-many/many-to-one/one-to-many)'
    )
    # run() compares this value against the literal string "yes"; any other
    # value (including the default) leaves native-to-native pairs included.
    parser.add_argument(
        '--exclude_native', type=str,
        default=False,
        help='Exclude the native-to-native filepairs during training '
             '(pass "yes" to enable)'
    )
    parser.add_argument(
        '--min_example_count', type=int,
        default=32,
        help='Minimum example count for a training pair to be included in training'
    )
    args = parser.parse_args()
    run(args)