# preprocessing.py
import ast
import json
import os
import sys

import numpy as np
import progressbar
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# from tokenizer.custom_tokenizer import CustomToken, SpacyCustomTokenizer


def files_list(folder):
    """Return the names of all entries in `folder`."""
    # os.listdir already returns a list, so no extra wrapping is needed.
    return os.listdir(folder)


def load(files, path=''):
    """Yield the parsed (ast.literal_eval) contents of each file in `files`."""
    for filename in files:
        # os.path.join handles an empty `path`, unlike f'{path}/{filename}',
        # and the `with` block already closes the file.
        with open(os.path.join(path, filename), 'r') as f:
            text = f.read()
        yield ast.literal_eval(text)


def pb(len_, name=""):
    """Create and start a progress bar over `len_` steps."""
    # `maxval` is the classic progressbar keyword; passing it positionally
    # would be misread as `min_value` by progressbar2.
    bar = progressbar.ProgressBar(maxval=len_, widgets=[
        progressbar.Bar('=', '[', ']'), name, progressbar.Percentage()])
    bar.start()
    return bar
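
# Example usage of pb (a sketch; `items` stands in for any sized sequence):
#     bar = pb(len(items), ' processing ')
#     for i, item in enumerate(items):
#         ...  # do the per-item work
#         bar.update(i + 1)
#     bar.finish()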


# def data_to_token(data):
#     token = CustomToken(data["text"], lex=data['lemma'],
#                         is_stop=data['is_stop'], is_sy=data['is_symbol'])
#     return token
#
#
# def doc2vec(files):
#     document = []
#     matrix = []
#     vocabulary = set()
#     nlp = SpacyCustomTokenizer()
#     _len_ = len(files)
#     bar = pb(_len_, f' tokenizer {_len_} ')
#     for i, data in enumerate(load(files, 'tokens')):
#         matrix.append(nlp.nlp(data[0]).vector)
#         s = set()
#         for token in data[1]:
#             token = data_to_token(token)
#
#             if (token.is_stop
#                     or token.is_symbol
#                     or token.space()
#                     or token.is_emoji()
#                     or token.is_url()
#                     # or token.is_date()
#                     or token.is_digit()):
#                 continue
#             lemma = token.lemma.lower()
#             vocabulary.add(lemma)
#             s.add(lemma)
#             # s += lemma + " "
#         document.append(s)
#         bar.update(i + 1)
#     bar.finish()
#
#     # tf = TfidfVectorizer()
#     # matrix = tf.fit_transform(document)
#
#     # if matrix.shape > (len(document), 96):
#     #     print("SVD decomposition")
#     #     truncatedSVD = TruncatedSVD(96)
#     #     matrix = truncatedSVD.fit_transform(matrix)
#
#     return matrix, document
#
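
# The doc2vec pipeline above depends on the commented-out custom spaCy
# tokenizer. The sketch below is a minimal, hypothetical stand-in that keeps
# only the TF-IDF + SVD path hinted at in the comments: it assumes each entry
# of `document` is an iterable of lemmas (as built above) and reduces the
# TF-IDF matrix to 96 dimensions, mirroring the commented-out SVD step.
def tfidf_doc2vec(document, n_components=96):
    # TfidfVectorizer expects raw strings, so join each document's lemmas.
    corpus = [' '.join(lemmas) for lemmas in document]
    matrix = TfidfVectorizer().fit_transform(corpus)
    # Only reduce when the vocabulary exceeds the target dimensionality.
    if matrix.shape[1] > n_components:
        matrix = TruncatedSVD(n_components).fit_transform(matrix)
    else:
        matrix = matrix.toarray()
    return matrix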


def save(vectors, document):
    """Persist vectors and documents under results/."""
    np.save('results/vectors.npy', vectors)
    with open('results/document.json', 'w+') as f:
        # Sets are not JSON serializable, so convert them to lists first.
        json.dump([sorted(d) if isinstance(d, set) else d for d in document], f)


def loads():
    """Load saved vectors and documents, dropping rows of inconsistent length."""
    vectors = np.load('results/vectors.npy', allow_pickle=True)
    dim = len(vectors[0])
    vectors = [v for v in vectors if len(v) == dim]
    with open('results/document.json', 'r') as f:
        document = json.load(f)
    return vectors, document


def view_points(vectors, tags=None):
    """Project vectors to 2D with truncated SVD and scatter-plot them."""
    points = TruncatedSVD(n_components=2).fit_transform(vectors)
    x = [point[0] for point in points]
    y = [point[1] for point in points]
    plt.scatter(x, y, c=tags)
    plt.show()


# if __name__ == '__main__':
#     cmd = sys.argv[1] if len(sys.argv) == 2 else ''
#     if cmd == 'plot':
#         print("ONLY PLOT")
#         v, _ = loads()
#     else:
#         files = files_list('tokens')
#         v, d = doc2vec(files)
#         save(v, d)
#         v, _ = loads()
#     view_points(v)
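
# A hypothetical variant of the entry point above, kept commented out like
# the original; it swaps the unavailable doc2vec pipeline for the
# `tfidf_doc2vec` sketch and assumes a `results/` directory exists and that
# `document` has been built as a list of lemma iterables.
# if __name__ == '__main__':
#     if len(sys.argv) == 2 and sys.argv[1] == 'plot':
#         v, _ = loads()
#     else:
#         document = ...  # build one iterable of lemmas per input file
#         v = tfidf_doc2vec(document)
#         save(v, document)
#         v, _ = loads()
#     view_points(v)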