Skip to content

Commit 9d0060f

Browse files
committed
update paracrawl download scripts
1 parent 6d9f7b6 commit 9d0060f

File tree

4 files changed

+14
-5
lines changed

configs/opusfilter/paracrawl_ga-en.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
common:
2-
output_directory: data/ga/paracrawl
2+
output_directory: data/ga/paracrawl/raw
33

44
steps:
55
- type: opus_read

scripts/download_handler.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def argparser():
2323
'gdrive',
2424
'NCI',
2525
'oscar',
26+
'paracrawl',
2627
'sampleNCI',
2728
},
2829
nargs='+',
@@ -35,7 +36,7 @@ def main(argv):
3536

3637
for dataset in args.datasets:
3738

38-
if dataset in ("conll17", "NCI", "oscar", "sampleNCI"):
39+
if dataset in ("conll17", "NCI", "oscar", "paracrawl", "sampleNCI"):
3940
print(f"Downloading {dataset} data.")
4041
cmd = script_dir + f"download_{dataset}_data.sh"
4142
rcmd = subprocess.call(cmd)
@@ -44,7 +45,7 @@ def main(argv):
4445
print(f"Downloading {dataset} data.")
4546
# download all files from gdrive
4647
cmd = script_dir + f"download_{dataset}_data.sh"
47-
rcmd = subprocess.call(cmd)
48+
rcmd = subprocess.call(cmd, shell=True)
4849

4950
# gather the various files into a common directory
5051
fcmd = "find data/ga/gdrive/ -maxdepth 3 -type f | python3 scripts/gather_gdrive_data_by_filelist.py"

scripts/download_scripts/download_NCI_data.sh

+8-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
echo "Downloading data from Google Drive ..."
99

1010
OUTDIR=data/ga/NCI/raw
11+
OUTFILE=${OUTDIR}/NCI_v2.txt
12+
1113
mkdir -p $OUTDIR
1214

1315
if [[ -n $(rclone lsf "gdrive:Theme A DCU" 2> /dev/null) ]]; then
@@ -21,6 +23,11 @@ else
2123
fi
2224
fi
2325

24-
rclone copy "${THEME_A_DCU}/Irish_Data/ForasNaGaeilge/new-extraction/NCI_extracted_v2.txt" $OUTDIR --bwlimit 1000M --transfers 1
26+
rclone cat \
27+
"${THEME_A_DCU}/Irish_Data/ForasNaGaeilge/9MqDsdf834ms2NfS8L2joi7u_NCIv2.vert" \
28+
--bwlimit 1000M --transfers 1 | \
29+
scripts/extract_text_from_nci_vert.py --document-newline > ${OUTFILE}
30+
31+
bzip2 ${OUTFILE}
2532

2633
echo "Done"

scripts/download_scripts/download_paracrawl_data.sh

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#/bin/bash
1+
#!/bin/bash
22

33
echo "Running OpusFilter to collect Opus data ..."
44

@@ -7,6 +7,7 @@ TRG=en
77

88
# OPUS corpus name, e.g. 'paracrawl'
99
CORPUS=paracrawl
10+
1011
echo "Using corpus $CORPUS"
1112
mkdir -p data/ga/$CORPUS
1213

0 commit comments

Comments
 (0)