-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_analisis.py
153 lines (111 loc) · 2.95 KB
/
data_analisis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Relative directory (with trailing slash) where the exported .png plots are saved.
GRAPH_PATH = "ML/Graphs/"
import utilities as u
def show_difficulties_distribution(y):
    """
    Print how many texts there are of each difficulty.

    Only the labels "A", "B" and "C" are counted; any other value found in
    y is ignored. The summary is written to stdout in the form
    "A: <n> / B: <n> / C: <n>".

    Args:
        y: iterable of difficulty labels
    """
    labels = ("A", "B", "C")
    totals = dict.fromkeys(labels, 0)
    for difficulty in y:
        # tuple membership compares with ==, like the original if/elif chain
        if difficulty in labels:
            totals[difficulty] += 1
    # A single pre-formatted string in parentheses prints the same text under
    # both the Python 2 print statement and the Python 3 print function.
    print("A: %s / B: %s / C: %s" % (totals["A"], totals["B"], totals["C"]))
def scatter_plot_from_lists(var1, var2, title="graph_temp", path=GRAPH_PATH, xlabel=None, ylabel=None):
    """
    Plot var2 against var1 and save the figure as "<title>.png" inside path.

    NOTE(review): despite the function name, the data is drawn with
    plt.plot (matplotlib's line plot call), not plt.scatter. Confirm which
    one is intended before changing it.

    Args:
        var1: first list of values (x axis)
        var2: second list of values (y axis)
        title: title of the plot, also used as the file name (without ".png")
        path: directory where the plot is stored
        xlabel: label for the x axis (skipped when None)
        ylabel: label for the y axis (skipped when None)
    """
    import os
    import matplotlib.pyplot as plt

    try:
        plt.plot(var1, var2)
        if xlabel is not None:
            plt.xlabel(xlabel)
        if ylabel is not None:
            plt.ylabel(ylabel)
        plt.title(title)
        # os.path.join keeps the file inside `path` even when the caller
        # passes a directory without a trailing separator
        plt.savefig(os.path.join(path, title + ".png"))
    finally:
        # always reset the shared pyplot figure so that a failure here does
        # not leak half-drawn content into the next plot
        plt.clf()
def scatter_plot(index1, index2):
    """
    Export a scatter plot that compares two metric columns.

    Axis labels come from process_text.get_metrics_header() and the data
    from text_batch_process.load_ML_variables(). Every point is coloured
    by its difficulty label ('A' -> 'g', 'B' -> 'b', 'C' -> 'r'). The
    figure is saved under GRAPH_PATH as "<index1> vs <index2>.png" and the
    current pyplot figure is cleared afterwards.

    Args:
        index1: column index for the x values of the scatter plot
        index2: column index for the y values of the scatter plot
    """
    # metric names, used for the axis labels and the title
    import process_text as pt
    metric_names = pt.get_metrics_header()

    # data matrix plus one difficulty label per row
    import text_batch_process as tbp
    matrix, tag = tbp.load_ML_variables()

    # replace each difficulty label with its colour code, in place;
    # labels other than A/B/C are left as they are
    palette = (('A', 'g'), ('B', 'b'), ('C', 'r'))
    for position, label in enumerate(tag):
        for difficulty, colour in palette:
            if label == difficulty:
                tag[position] = colour
                break

    # pick the two requested columns and their names
    import numpy as np
    data = np.array(matrix)
    x_values = data[:, index1]
    y_values = data[:, index2]
    x_name = metric_names[index1]
    y_name = metric_names[index2]

    # draw
    import matplotlib.pyplot as plt
    plt.scatter(x_values, y_values, color=tag)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title(x_name + " vs " + y_name)

    # export the figure, then clear it for the next plot
    file_name = " vs ".join((str(index1), str(index2))) + ".png"
    plt.savefig(GRAPH_PATH + file_name)
    plt.clf()
def plot_all():
    """
    Plot scatter plots of all the combinations of the current metrics.

    Existing ".png" files in GRAPH_PATH are deleted first. Then
    scatter_plot(i, j) is called for every ordered pair of distinct metric
    indexes, so both "i vs j" and "j vs i" are produced. Progress and the
    elapsed time reported by u.Timer are printed to stdout.
    """
    from itertools import permutations

    timer = u.Timer()
    # Each message is one pre-formatted string in parentheses, so the output
    # is identical under the Python 2 print statement and Python 3 print().
    print("\nPlotting all the combinations")

    # delete existing png files
    u.delete_files(GRAPH_PATH, ".png")

    import process_text as pt
    # read the header once instead of once per outer-loop iteration
    metric_count = len(pt.get_metrics_header())

    # ordered pairs with i != j, in the same order as two nested loops
    for i, j in permutations(range(metric_count), 2):
        print("ploting %s vs %s" % (i, j))
        scatter_plot(i, j)

    print("\nAll graphs created in %s" % (timer.get_time(),))
def get_correlation_matrix():
    """
    Save a csv with the correlation matrix of the variables used in ML.

    Reads "ML/all_data.csv" through u.csv_to_df, computes the pairwise
    correlations with .corr() and writes them to
    "ML/Correlation_matrix.csv" through u.df_to_csv.
    """
    path_ML = "ML/"
    input_doc = "all_data.csv"
    output_doc = "Correlation_matrix.csv"

    df = u.csv_to_df(path_ML + input_doc)
    # NOTE(review): assuming u.csv_to_df returns a pandas DataFrame, this
    # slice drops the first row before correlating - confirm why that row
    # must be excluded.
    df = df[1:]
    u.df_to_csv(path_ML + output_doc, df.corr())
# Script entry point: runs only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    plot_all()
    get_correlation_matrix()