-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassify_temp_pinned.py
253 lines (221 loc) · 9.53 KB
/
classify_temp_pinned.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import sklearn.metrics
import sys
import datetime
from train_graph import Train_Graph
from test_graph import Test_Graph
from feature_extractors import *
import snap
import math
from get_examples import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from weight_evolution import EvolModel
import sklearn.preprocessing
def print_metrics(gt, pred):
    """Print accuracy, precision, recall and F1 score for a set of predictions.

    gt   -- ground-truth labels
    pred -- predicted labels, in the same order as gt
    """
    report = (
        ('Accuracy:', sklearn.metrics.accuracy_score),
        ('Precision:', sklearn.metrics.precision_score),
        ('Recall:', sklearn.metrics.recall_score),
        ('F1 Score:', sklearn.metrics.f1_score),
    )
    for label, scorer in report:
        # One pre-formatted argument prints "label value", exactly like
        # the two-item print statement it replaces.
        print('%s %s' % (label, scorer(gt, pred)))
def is_board(n, board_ids):
    """Return True when n lies in the inclusive range board_ids[0]..board_ids[1].

    n         -- node id to test
    board_ids -- indexable pair; presumably the (first, last) board node ids
    """
    return board_ids[0] <= n <= board_ids[1]
def is_pin(n, board_ids):
    """Return True when n is strictly greater than board_ids[1].

    n         -- node id to test
    board_ids -- indexable pair; only its second element is consulted
    """
    upper_board_id = board_ids[1]
    return upper_board_id < n
def add_edges_from_int(graph, interval_edges, intervals):
    """Add to graph every edge stored in the selected intervals.

    graph          -- object exposing AddEdge(src_id, dst_id)
    interval_edges -- sequence of edge lists, one per interval
    intervals      -- indices into interval_edges, processed in order
    """
    for interval_index in intervals:
        for edge in interval_edges[interval_index]:
            src_id, dst_id = edge
            graph.AddEdge(src_id, dst_id)
def get_feat_vals(graph, examples, feature_funcs):
    """Evaluate every feature function on every example pair.

    graph         -- graph handed unchanged to each feature function
    examples      -- sequence of (src_id, dst_id) pairs
    feature_funcs -- callables invoked as func(graph, src_id, dst_id)

    Returns a len(examples) x len(feature_funcs) array whose entry [i][j]
    is feature_funcs[j] evaluated on the i-th example.
    """
    values = np.zeros([len(examples), len(feature_funcs)])
    for row, pair in enumerate(examples):
        # Progress indicator every 500 examples.
        if row % 500 == 0:
            print(row)
        src_id, dst_id = pair
        for col, extractor in enumerate(feature_funcs):
            # On the very first example the eigenvector-centrality and
            # PageRank extractors are called with reset=True (presumably
            # to drop scores cached for a previous graph state -- confirm
            # in feature_extractors).
            needs_reset = row == 0 and (
                extractor == get_ev_centr_sum or extractor == get_page_rank_sum
            )
            if needs_reset:
                values[row][col] = extractor(graph, src_id, dst_id, reset=True)
            else:
                values[row][col] = extractor(graph, src_id, dst_id)
    return values
def test_classifiers(train_examples, train_labels, test_examples, test_labels):
    """Fit a fixed set of classifiers and print test and training metrics.

    Each model is fitted on (train_examples, train_labels); metrics are then
    printed first for the test set and then for the training set.
    """
    candidates = [
        KNeighborsClassifier(),
        LogisticRegression(),
        RandomForestClassifier(n_estimators=100),
        MLPClassifier(hidden_layer_sizes=(100, 50, 50)),
        EvolModel(),
    ]
    for clf in candidates:
        print('%s %s' % ('Training model', clf))
        clf.fit(train_examples, train_labels)

        test_preds = clf.predict(test_examples)
        test_truth = list(test_labels)
        print('')
        print('Evaluating Testing Set:')
        print_metrics(test_truth, test_preds)

        print('')
        print('Evaluating Training Set:')
        train_preds = clf.predict(train_examples)
        train_truth = list(train_labels)
        print_metrics(train_truth, train_preds)
def get_train_features(train_examples, graph, interval_edges, feature_funcs):
    """Build per-interval feature deltas for the training examples.

    Every edge is removed from graph and interval 0 is re-added as the
    baseline. Intervals 1 .. num_intervals - 2 are then added one at a time;
    after each, the change in every feature since the previous step is
    stored. The last interval's edges are added at the end without being
    measured.

    Returns a num_examples x (num_intervals - 2) * num_features array.
    Note that graph is mutated in place.
    """
    print('%s %s %s' % ('Nodes, Edges before:', graph.GetNodes(), graph.GetEdges()))
    # Materialise the edge list before deleting anything from the graph.
    existing = [(e.GetSrcNId(), e.GetDstNId()) for e in graph.Edges()]
    for src_id, dst_id in existing:
        graph.DelEdge(src_id, dst_id)
    add_edges_from_int(graph, interval_edges, [0])

    n_intervals = len(interval_edges)
    n_feats = len(feature_funcs)
    n_examples = len(train_examples)

    # Baseline: a num_examples x num_features array measured on interval 0.
    print('Getting init values')
    prev_vals = get_feat_vals(graph, train_examples, feature_funcs)

    feats = np.zeros([n_examples, (n_intervals - 2) * n_feats])
    for step in range(1, n_intervals - 1):
        print('%s %s' % ('Evaluating Interval', step))
        add_edges_from_int(graph, interval_edges, [step])
        cur_vals = get_feat_vals(graph, train_examples, feature_funcs)
        # Interval `step` fills the (step - 1)-th group of n_feats columns.
        first_col = (step - 1) * n_feats
        feats[:, first_col:first_col + n_feats] = cur_vals - prev_vals
        prev_vals = cur_vals

    add_edges_from_int(graph, interval_edges, [n_intervals - 1])
    print('%s %s %s' % ('Nodes, edges afterwards:', graph.GetNodes(), graph.GetEdges()))
    return feats
def get_test_features(test_examples, graph, interval_edges, feature_funcs):
    """Build per-interval feature deltas for the test examples.

    Every edge is removed from graph and intervals 0 and 1 are re-added as
    the baseline. Intervals 2 .. num_intervals - 1 are then added one at a
    time; after each, the change in every feature since the previous step is
    stored.

    Returns a num_examples x (num_intervals - 2) * num_features array.
    Note that graph is mutated in place.
    """
    print('%s %s %s' % ('Nodes, Edges before:', graph.GetNodes(), graph.GetEdges()))
    # Materialise the edge list before deleting anything from the graph.
    existing = [(e.GetSrcNId(), e.GetDstNId()) for e in graph.Edges()]
    print('Deleting all edges...')
    for src_id, dst_id in existing:
        graph.DelEdge(src_id, dst_id)
    add_edges_from_int(graph, interval_edges, [0, 1])

    n_intervals = len(interval_edges)
    n_feats = len(feature_funcs)
    n_examples = len(test_examples)

    # Baseline: a num_examples x num_features array measured on intervals 0-1.
    prev_vals = get_feat_vals(graph, test_examples, feature_funcs)

    feats = np.zeros([n_examples, (n_intervals - 2) * n_feats])
    for step in range(2, n_intervals):
        print('%s %s' % ('Evaluating Interval', step))
        add_edges_from_int(graph, interval_edges, [step])
        cur_vals = get_feat_vals(graph, test_examples, feature_funcs)
        # Interval `step` fills the (step - 2)-th group of n_feats columns.
        first_col = (step - 2) * n_feats
        feats[:, first_col:first_col + n_feats] = cur_vals - prev_vals
        prev_vals = cur_vals

    print(feats.shape)
    print('%s %s %s' % ('Nodes, edges afterwards:', graph.GetNodes(), graph.GetEdges()))
    return feats
def get_train_set(train_pgraph, interval_edges, board_ids, num_pos=10000, num_neg=10000):
    """Assemble labelled training pairs from the last interval.

    Positive examples (label 1) are sampled without replacement from the
    edges of interval_edges[-1] that have at least one endpoint satisfying
    is_pin. Negative examples (label -1) come from get_neg_pin_edges.

    train_pgraph   -- graph passed through to get_neg_pin_edges
    interval_edges -- list of per-interval edge lists; only the last is read
    board_ids      -- id bounds forwarded to is_pin / get_neg_pin_edges
    num_pos        -- positives requested; if the last interval holds fewer
                      candidate edges, all of them are used instead of
                      raising ValueError, so the result may hold fewer than
                      num_pos positives
    num_neg        -- negatives requested from get_neg_pin_edges

    Returns (all_pairs, all_labels): positives first, then negatives.
    """
    # `random` is not imported explicitly at the top of this file, so bind
    # it locally rather than depend on a star-import supplying it.
    import random

    candidates = [
        tuple(sorted([src_id, dst_id]))
        for src_id, dst_id in interval_edges[-1]
        if is_pin(src_id, board_ids) or is_pin(dst_id, board_ids)
    ]

    # random.sample raises ValueError when asked for more items than the
    # population contains; clamp so a sparse last interval does not crash.
    sample_size = min(num_pos, len(candidates))
    pos_edges = random.sample(candidates, sample_size)
    neg_edges = get_neg_pin_edges(train_pgraph, num_neg, board_ids)

    all_pairs = pos_edges + neg_edges
    all_labels = [1] * len(pos_edges) + [-1] * len(neg_edges)
    return all_pairs, all_labels
def get_time_limits(graph_obj):
    """Return the (earliest, latest) edge timestamp in graph_obj.attributes.

    Only entries whose key is a tuple are considered. For each one the
    timestamp is read from the first key present among 'pin_time',
    'follow_time' and 'create_time' -- the same precedence get_intervals
    uses, so the limits bound the times that function will bucket. Entries
    carrying none of those keys are skipped.

    Timestamps are converted with datetime.fromtimestamp (local time).
    If no entry qualifies, the sentinels are returned unchanged, i.e.
    (datetime(3000, 12, 31), datetime(1, 1, 1)).
    """
    time_keys = ('pin_time', 'follow_time', 'create_time')
    attributes = graph_obj.attributes
    max_time = datetime.datetime(1, 1, 1)
    min_time = datetime.datetime(3000, 12, 31)
    for edge in attributes:
        if not isinstance(edge, tuple):
            continue
        edge_attrs = attributes[edge]
        key_val = next((key for key in time_keys if key in edge_attrs), None)
        if key_val is None:
            # No timestamp on this edge; previously this reused the key from
            # an earlier iteration (or raised NameError on the first edge).
            continue
        time_val = datetime.datetime.fromtimestamp(edge_attrs[key_val])
        if time_val < min_time:
            min_time = time_val
        if time_val > max_time:
            max_time = time_val
    print('%s %s' % (min_time, max_time))
    return min_time, max_time
'''
Return a list of num_intervals lists, where each sub-list holds
the edges formed during that interval.
'''
def get_intervals(min_time, max_time, graph, attributes, num_intervals, board_ids):
    """Bucket the graph's edges into num_intervals equal-width time slices.

    min_time, max_time -- datetime bounds of the whole time range
    graph              -- object whose Edges() yields items exposing
                          GetSrcNId() / GetDstNId()
    attributes         -- mapping keyed by edge tuple; the key is
                          (src, dst) when src lies within board_ids,
                          otherwise (dst, src)
    num_intervals      -- number of slices to produce
    board_ids          -- inclusive (low, high) id bounds used to orient
                          the attribute key

    The timestamp is read from the first key present among 'pin_time',
    'follow_time' and 'create_time'; edges with none are reported with
    'Here!' and left out. Slices are upper-inclusive: a time exactly on a
    boundary falls into the earlier slice. Indices are clamped to
    0 .. num_intervals - 1, so an edge stamped exactly min_time lands in
    the first slice.

    Returns a list of num_intervals lists of (src_id, dst_id) tuples.
    """
    int_edges = [[] for _ in range(num_intervals)]
    time_delta = (max_time - min_time) / num_intervals
    print(time_delta)
    delta_seconds = time_delta.total_seconds()
    last_index = num_intervals - 1

    all_edges = [(edge.GetSrcNId(), edge.GetDstNId()) for edge in graph.Edges()]
    for src_id, dst_id in all_edges:
        if board_ids[0] <= src_id <= board_ids[1]:
            key_val = (src_id, dst_id)
        else:
            key_val = (dst_id, src_id)
        edge_attrs = attributes[key_val]

        for time_key in ('pin_time', 'follow_time', 'create_time'):
            if time_key in edge_attrs:
                break
        else:
            print('Here!')
            continue

        time_val = datetime.datetime.fromtimestamp(edge_attrs[time_key])
        if delta_seconds > 0:
            elapsed = (time_val - min_time).total_seconds()
            index = int(math.ceil(elapsed / delta_seconds)) - 1
        else:
            # Zero-length time range: everything belongs to the first slice.
            index = 0
        # Clamp both ends. Without the lower bound an edge at exactly
        # min_time gets index -1, which Python treats as the LAST slice.
        index = max(0, min(last_index, index))
        int_edges[index].append((src_id, dst_id))

    for interval in int_edges:
        print(len(interval))
    return int_edges
def add_init_edges(train_pgraph, int_edges):
    """Add every edge of the first interval, int_edges[0], to train_pgraph."""
    first_interval = int_edges[0]
    for edge in first_interval:
        src_id, dst_id = edge
        train_pgraph.AddEdge(src_id, dst_id)
def main(input_train, input_test, num_intervals):
    """Run the temporal pin-edge classification experiment end to end.

    input_train   -- graph_file_root handed to Train_Graph
    input_test    -- graph_file_root handed to Test_Graph
    num_intervals -- number of time slices the training edges are split into

    Steps: load the training graph, split its edges into time intervals,
    pick training and test example pairs, extract per-interval feature
    deltas for both (saving them with np.save), standardise the features
    and evaluate the classifiers.
    """
    # Read in the graph
    train_graph_obj = Train_Graph(graph_file_root=input_train)
    train_pgraph = train_graph_obj.pgraph
    # (Get max SCC?)

    # Get limits on the time range
    print('Getting time limits')
    min_time, max_time = get_time_limits(train_graph_obj)

    # Divide into intervals based on time range
    print('Dividing into intervals')
    interval_edges = get_intervals(
        min_time, max_time, train_pgraph,
        train_graph_obj.attributes, num_intervals,
        train_graph_obj.board_node_ids)
    # Sanity check: every edge of the graph must land in exactly one interval.
    assert sum([len(interval) for interval in interval_edges]) == train_pgraph.GetEdges()

    # Extract positive and negative training examples in the last frame
    print('Getting training examples/labels')
    train_examples, train_labels = get_train_set(
        train_pgraph, interval_edges,
        train_graph_obj.board_node_ids, num_pos=5000, num_neg=5000)

    # Construct our testing set
    test_graph_obj = Test_Graph(graph_file_root=input_test)
    test_pgraph = test_graph_obj.pgraph
    print('Getting testing examples/labels')
    test_examples, test_labels = get_pin_tst_ex(
        train_pgraph, test_pgraph,
        train_examples, 2500, 2500, test_graph_obj.board_node_ids)

    feature_funcs = [
        get_graph_distance, get_ev_centr_sum, get_page_rank_sum,
        preferential_attachment, get_2_hops, get_degree_sum,
        std_nbr_degree_sum, mean_nbr_deg_sum, adamic_adar_2,
        common_neighbors_2,
    ]

    print('Extracting Training features...')
    train_features = get_train_features(train_examples, train_pgraph, interval_edges, feature_funcs)
    # Saving is best-effort: a failure is reported and the run continues.
    try:
        np.save('train_temp_pin_features_3', train_features)
        np.save('train_temp_pin_examples_3', zip(train_examples, train_labels))
    except Exception as err:
        print(str(err))
    train_features = sklearn.preprocessing.scale(train_features)

    print('Extracting Testing features...')
    # The test features are computed on train_pgraph as well, which
    # get_train_features has already mutated in place.
    test_features = get_test_features(test_examples, train_pgraph, interval_edges, feature_funcs)
    try:
        np.save('test_temp_pin_features_3', test_features)
        np.save('test_temp_pin_examples_3', zip(test_examples, test_labels))
    except Exception as err:
        print(str(err))
    # NOTE(review): the test features are standardised with their own
    # mean/std rather than the training statistics -- confirm intended.
    test_features = sklearn.preprocessing.scale(test_features)

    test_classifiers(train_features, train_labels, test_features, test_labels)
if __name__ == '__main__':
    # Usage: <script> <train_graph_root> <test_graph_root> <num_intervals>
    input_train, input_test = sys.argv[1], sys.argv[2]
    num_intervals = int(sys.argv[3])
    main(input_train, input_test, num_intervals)