forked from lbechberger/MLinPractice
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.sge
41 lines (31 loc) · 1.21 KB
/
preprocessing.sge
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/bin/bash
# Adapted from the course repository by Yesid Cano, Nov. 2nd 2021.
#
# Grid Engine job script that runs the preprocessing pipeline:
#   1. create labels from data/raw/
#   2. download the NLTK resources used during preprocessing
#   3. run the general preprocessing (punctuation removal, tokenization)
#   4. split the preprocessed data set
#
# All command-line arguments given to this script are forwarded unchanged to
# every python step below.
#
# Scheduler directives (lines starting with "#" immediately followed by "$"):
#   -N   job name
#   -l   resource requests (memory; host name pattern)
#   -cwd run the job from the submission directory, so the relative data/
#        paths below resolve against it
#   -pe  parallel environment "default" with 2 slots
#   -o/-e  locations for the job's stdout / stderr
#$ -N preprocessing_task
#$ -l mem=2G
#$ -cwd
#$ -pe default 2
#$ -o $HOME
#$ -e $HOME
#$ -l h=*cippy*

# Print a message to stderr and abort the job with a non-zero status, so the
# scheduler records the failure instead of the status of the last command.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

export PATH="$HOME/miniconda/bin:$PATH"

# Capture the hook first: `eval "$(cmd)"` alone would hide a failing `conda`,
# because eval of an empty string returns success.
conda_hook="$(conda shell.bash hook)" || die "conda shell hook could not be generated"
eval "$conda_hook"

# create directory if not yet existing
mkdir -p data/preprocessing/split/ || die "could not create data/preprocessing/split/"
echo " folder to store the split has been created and about to activate enviroment"
echo "$PWD"
conda activate MLinPractice || die "could not activate conda environment MLinPractice"

# add labels
echo " creating labels"
python -m script.preprocessing.create_labels data/raw/ data/preprocessing/labeled.csv "$@" \
  || die "create_labels failed"

# download NLTK data needed for the preprocessing phase
# These stay non-fatal on purpose: the resources may already be present
# locally even when a download attempt fails.
# NOTE(review): the script arguments are forwarded to nltk.downloader as in
# the original; that looks unintentional (they are meant for the project
# scripts) -- confirm and drop "$@" here if so.
echo "downloading NLTK stopwords, punk and wordnet"
python -m nltk.downloader stopwords "$@"
python -m nltk.downloader punkt "$@"
python -m nltk.downloader wordnet "$@"

# other preprocessing (removing punctuation etc.)
echo " general preprocessing"
python -m script.preprocessing.run_preprocessing data/preprocessing/labeled.csv data/preprocessing/preprocessed.csv --punctuation --tokenize -e data/preprocessing/pipeline.pickle "$@" \
  || die "run_preprocessing failed"

# split the data set
echo " splitting the data set"
python -m script.preprocessing.split_data data/preprocessing/preprocessed.csv data/preprocessing/split/ -s 42 "$@" \
  || die "split_data failed"

conda deactivate