# text_batch_process.py
import utilities as u

TEXT_PATH = "Texts/"
ML_PATH = "ML/"
#text_extension = ".txt"

# Names of the two pickle objects that will be used for the ML part
x_ML_name = "x"  # x is a matrix of features
y_ML_name = "y"  # y is a list of difficulty tags
csv_name = "all_data.csv"

def get_difficulty(text):
    """
    Takes a document name that starts with its difficulty tag and returns that tag.
    Example:
        A1_cine2 --> A (or A1 if the two-character tagging scheme is used)
    Args:
        text: name of the document
    Returns:
        Difficulty tag, such as A, B, C (or A1, A2, B1, B2, C1, C2)
    """
    #tag = text[:2]  # If difficulty is tagged like A1, A2, B1, B2, C1, C2
    tag = text[:1]   # If difficulty is tagged like A, B, C
    return tag

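# A minimal usage sketch of get_difficulty (the second file name below is
# hypothetical; only the leading difficulty tag matters with the active
# one-character scheme):
#
#   >>> get_difficulty("A1_cine2")
#   'A'
#   >>> get_difficulty("B2_noticias7")
#   'B'
#
# Enabling the commented text[:2] line would return 'A1' and 'B2' instead.
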
def store_results(x, y, file_names, change_decimal_separator=True):
    """
    Stores all the variables needed for the ML part.
    It saves the metrics (x) and the difficulty labels (y) as pickle objects.
    It also stores a CSV with all the information.
    """
    # First store x, y as pickle objects
    u.save_pickle(x, x_ML_name, path=ML_PATH)
    u.save_pickle(y, y_ML_name, path=ML_PATH)
    import process_text as pt
    import numpy as np
    header = ["File"] + pt.get_metrics_header() + ["Difficulty"]
    matrix = np.c_[file_names, x, y]
    # Then save a CSV with all the data
    u.save_to_csv(csv_name, ML_PATH, matrix, header)
    # Locally a decimal is written like 3,14 instead of 3.14
    # By default the separator is converted
    if change_decimal_separator:
        u.change_decimal_separator(csv_name, ML_PATH, [".txt"])

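# Illustrative sketch of what store_results produces (the metric values and file
# names below are made up; the real metric columns come from
# process_text.get_metrics_header()):
#
#   >>> store_results([[12.5, 0.31], [48.0, 0.27]],
#   ...               ["A", "B"],
#   ...               ["A/A1_cine2.txt", "B/B2_noticias7.txt"])
#
# writes ML/x and ML/y as pickle objects, plus ML/all_data.csv with a header like
#   File, <metric 1>, ..., <metric m>, Difficulty
# and one row per processed text. With change_decimal_separator=True the numeric
# values in the CSV are rewritten with the local decimal separator ("," instead of ".").
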
def process_all_texts():
    """
    Reads all the texts present in the folders inside TEXT_PATH.
    It retrieves some metrics that will be stored and used in the ML part.
    Returns:
        x: Matrix of metrics with size n*m, where =>
            n = num of texts processed
            m = num of different characteristics
        y: Array with difficulty tags
    """
    # Used to measure how long the preprocessing part lasts
    timer = u.Timer()
    print "Starting to process all the texts"
    import os
    import process_text as pt
    # For the ML part, it will try to solve something like a*x = y
    x = []
    y = []
    file_names = []
    # Explore every folder inside TEXT_PATH
    for folder in os.listdir(os.getcwd() + "/" + TEXT_PATH):
        # Check that it is really a folder, not a file
        if "." not in folder:
            # Get every document
            documents = os.listdir(os.getcwd() + "/" + TEXT_PATH + folder)
            actual = 1
            for doc_name in documents:
                print "\n\nProcessing text", actual, "/", len(documents), "inside", folder
                actual += 1
                x.append(pt.process_text_from_document(doc_name, TEXT_PATH + folder + "/"))
                y.append(get_difficulty(doc_name))
                file_names.append(folder + "/" + doc_name)
    print len(y), "texts processed in", timer.get_time(), "seconds"
    store_results(x, y, file_names)
    return x, y

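# Expected corpus layout for process_all_texts (folder and file names below are
# hypothetical; only the structure matters):
#
#   Texts/
#       A/
#           A1_cine2.txt
#           A2_carta5.txt
#       B/
#           B1_noticias7.txt
#
# Every subfolder of Texts/ is scanned, and each document contributes one row of
# metrics to x and one tag to y, e.g. x of size 3*m and y = ['A', 'A', 'B'] for
# the layout above.
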
def load_ML_variables(process=False):
    """
    Retrieves the x, y objects to be used in the ML part. It will try to load them
    from pickle objects and, if that fails, it will calculate them.
    It is possible to force the machine to calculate x and y instead of reading the pickle objects.
    Args:
        process: if True, it will process all the texts instead of loading x and y
    Returns:
        x, y: the metrics matrix and the difficulty tags
    """
    # If asked by the user, process anyway
    if process:
        x, y = process_all_texts()
    # If not, try to load previously processed data
    else:
        try:
            x = u.load_pickle(x_ML_name, path=ML_PATH)
            y = u.load_pickle(y_ML_name, path=ML_PATH)
        except IOError:
            print "Pickle objects not found, starting batch process"
            x, y = process_all_texts()
    return x, y

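# Typical use from the ML side (a sketch, assuming this module is importable as
# text_batch_process):
#
#   >>> import text_batch_process as tbp
#   >>> x, y = tbp.load_ML_variables()              # loads ML/x and ML/y if they exist
#   >>> x, y = tbp.load_ML_variables(process=True)  # forces a full re-run of the batch process
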
# If the module is run directly (not imported), process all the texts
if __name__ == '__main__':
    process_all_texts()