import os
import gc
import os.path as osp
import warnings
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Sequential, Linear, ReLU, GRU
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
from torch_geometric.data import InMemoryDataset
from torch_geometric.nn import NNConv, Set2Set, AGNNConv
from torch_geometric.nn import GCNConv, SGConv, ChebConv, DynamicEdgeConv, XConv
import torch_geometric.transforms as T
from torch_geometric.utils import remove_self_loops
# from torch_geometric.datasets import QM9
# from torch_geometric.nn import MessagePassing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from dataset import ChampsDataset
from model import Net
from lr import LocalCyclicLR
warnings.filterwarnings('ignore')


# train_eval_epoch below reads `device` and `criterion`, which the original
# snippet only defined inside main(); define them at module level so the
# training loop can actually see them.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss().to(device)

def eval_df(gb, scaler):
    """Evaluate predictions on the validation set, given as a DataFrame.

    Args:
        gb : DataFrame with columns 'type', 'pred' and 'true'.
        scaler : the scikit-learn transformer that scaled the training
            targets, used here to invert that scaling; pass None if the
            targets were not scaled.
    Returns:
        the log-mean-MAE score: mean over coupling types of log(mean |pred - true|).
    """
    if scaler:
        gb[['pred']] = scaler.inverse_transform(gb[['pred']])
        gb[['true']] = scaler.inverse_transform(gb[['true']])
    gb['abs_dif'] = (gb['pred'] - gb['true']).abs()
    ss = gb.groupby('type').abs_dif.mean()
    lb_score = np.log(ss).mean()
    return lb_score
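

# Minimal usage sketch for eval_df (the values below are made up purely for
# illustration, not taken from the dataset):
#   df = pd.DataFrame({'type': ['1JHC', '1JHC', '2JHN'],
#                      'pred': [84.1, 90.2, 3.3],
#                      'true': [84.0, 91.0, 3.0]})
#   eval_df(df, None)  # -> mean over types of log(mean |pred - true|)
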
def train_eval_epoch(model_, optimizer_, scheduler_, train_loader_, val_loader_,
                     number_epoch, PRINT_EACH_=1000, scaler_=None):
    """Full pipeline for this problem: train, validate, repeat.

    Args:
        The arguments follow standard PyTorch naming: model, optimizer,
        scheduler, train/val loaders, number of epochs, print frequency,
        and an optional scaler used to undo target scaling during evaluation.
    Returns:
        model_ : the trained model
        train_losses : per-epoch training losses
        val_losses : per-epoch validation losses
        lb_scores : per-epoch CHAMPS leaderboard scores, i.e. the mean over
            coupling types of log(mean absolute error).
    """
    train_losses = []
    val_losses = []
    lb_scores = []
    for ep in range(number_epoch):
        model_.train()
        loss_all = 0
        i = 0
        for data in train_loader_:
            data = data.to(device)
            optimizer_.zero_grad()
            cls_, precs_ = model_(data)
            # Multi-task loss: the target is split into an integer class part
            # (y_cls) and a fractional residual (y_precs); weight the
            # classification term 0.8 and the regression term 0.2.
            loss = 0.8 * criterion(cls_, data.y_cls.view(-1).long()) + 0.2 * F.mse_loss(precs_, data.y_precs)
            loss.backward()
            loss_all += loss.item() * data.num_graphs
            optimizer_.step()
            if scheduler_:
                scheduler_.step()
            if i % PRINT_EACH_ == 0:
                print(f" Loss item : {loss.item()}")
            i += 1
        train_l = loss_all / len(train_loader_.dataset)
        print(f"avg train loss at epoch {ep} : {train_l}")
        train_losses.append(train_l)
        model_.eval()
        error = []
        mega_type = []
        mega_pred = []
        mega_true = []
        with torch.no_grad():
            i = 0
            for data in val_loader_:
                data = data.to(device)
                o_cls, o_precs = model_(data)
                loss = 0.8 * F.cross_entropy(o_cls, data.y_cls.view(-1).long()) + 0.2 * F.mse_loss(o_precs, data.y_precs)
                _, predicted = torch.max(o_cls, 1)
                # Reconstruct the scalar coupling constant: the predicted
                # integer class plus the predicted fractional residual.
                pred = predicted.float() + o_precs.view(predicted.size())
                pred = np.reshape(pred.detach().cpu().numpy(), (-1, 1))
                tp = data.edge_atr.cpu().numpy()
                gt = np.reshape((data.y_cls.float() + data.y_precs).cpu().view(-1).numpy(), (-1, 1))
                mega_type.append(tp)
                mega_pred.append(pred)
                mega_true.append(gt)
                error.append(loss.item() * data.num_graphs)
                if i % PRINT_EACH_ == 0:
                    print(f' cur_loss_avg: {loss}')
                i += 1
        types = np.concatenate(mega_type, axis=0)
        pred = np.concatenate(mega_pred, axis=0)
        trues = np.concatenate(mega_true, axis=0)
        dataset = pd.DataFrame({'type': types[:, 0], 'pred': pred[:, 0], 'true': trues[:, 0]})
        val_l = np.sum([x / len(val_loader_.dataset) for x in error])
        print(f"avg val loss at epoch {ep} : {val_l}")
        val_losses.append(val_l)
        l = eval_df(dataset, scaler_)  # pass the scaler through instead of hard-coding None
        lb_scores.append(l)
        print(f"Epoch : {ep} val lb loss = {l}")
        del dataset
        gc.collect()
    return model_, train_losses, val_losses, lb_scores
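

# Usage sketch for train_eval_epoch (names are illustrative and assume the
# loaders are built as in main() below):
#   model, train_hist, val_hist, lb_hist = train_eval_epoch(
#       model, optimizer, None, train_loader, val_loader,
#       number_epoch=5, PRINT_EACH_=500)
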
def cycle_batch(train_set_, model_, optimizer_, scheduler_, val_loader_, number_epoch, PRINT_EACH_=1000, scaler_=None):
    """Experiment: change the batch size across epochs.

    The batch size follows batch_scheduler for the first len(batch_scheduler)
    epochs; after that, the last loader is reused. Not that effective on this
    dataset.
    """
    batch_scheduler = [4, 8, 16]
    train_loader_ = None
    train_losses = []
    val_losses = []
    lb_scores = []
    for e in range(number_epoch):
        if e < len(batch_scheduler):
            train_loader_ = DataLoader(train_set_, batch_size=batch_scheduler[e], shuffle=True)
        model_, train_loss, val_loss, lb_score = train_eval_epoch(model_, optimizer_, scheduler_,
                                                                  train_loader_, val_loader_, 1,
                                                                  PRINT_EACH_, scaler_)
        train_losses.append(train_loss[0])
        val_losses.append(val_loss[0])
        lb_scores.append(lb_score[0])
    return model_, train_losses, val_losses, lb_scores
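

# Usage sketch for cycle_batch: the batch size ramps 4 -> 8 -> 16 over the
# first three epochs, then the batch-16 loader is reused for the rest:
#   model, train_hist, val_hist, lb_hist = cycle_batch(
#       train_set, model, optimizer, None, val_loader, number_epoch=20)
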
def main():
    ## train_test.parquet is the combination of train.csv and test.csv.
    ## The original CSVs consume a lot of memory and are slow to load, so
    ## they were saved (with some memory reduction) as parquet for faster loading.
    train_val_test_df = pd.read_parquet('train_test.parquet', engine='fastparquet',
                                        columns=['atom_index_0', 'atom_index_1', 'molecule_name',
                                                 'scalar_coupling_constant', 'type', 'dist', 'inv_dist',
                                                 'inv_dist_p_2', 'inv_dist_p_3', 'coulomb_0_1'])
    structures_df = pd.read_csv('structures.csv')
    ## The original snippet referenced train_df / val_df without defining them.
    ## ASSUMED reconstruction: keep only labeled rows (test rows carry no
    ## scalar_coupling_constant) and hold out 20% for validation. A
    ## molecule-level split would avoid leakage; a simple row split is used
    ## here for the sketch.
    labeled_df = train_val_test_df[train_val_test_df['scalar_coupling_constant'].notnull()]
    train_df, val_df = train_test_split(labeled_df, test_size=0.2, random_state=42)
    train_set = ChampsDataset(train_df, structures_df, './processed_node', './', debug=2560, add_ele=False,
                              saved_name='train_data.pt', save_id=False, train=True)
    val_set = ChampsDataset(val_df, structures_df, './processed_node', './', debug=1280, add_ele=False,
                            saved_name='val_data.pt', save_id=False, train=True)
    ## There is no test set here on purpose: this experiment only targets the
    ## local validation score, so the test set is kept separate.
    ## Parameters that control the network (some are kept for reference even
    ## though this experiment does not use them all).
    dim = 256
    edge_attr_size = 8
    input_size = 10 if train_set.add_ele else 8
    OUT_SIZE = 1
    D_LR = 0.001
    NB_EPOCH = 300
    BATCH_SZ = 64
    PRINT_EACH = 1000
    ## device and criterion are defined at module level so that
    ## train_eval_epoch can see them.
    model = Net().to(device)  # in_size = 1, out_size=OUT_SIZE, num_cls = 242
    train_loader = DataLoader(train_set, batch_size=256, shuffle=True)  # unused here; cycle_batch builds its own loaders
    val_loader = DataLoader(val_set, batch_size=32, shuffle=False)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    ## the scheduler is ignored in this experiment
    scheduler = None
    model, train_losses, val_losses, lb_scores = cycle_batch(train_set, model, optimizer, scheduler,
                                                             val_loader, 20, PRINT_EACH)
    #######
    """
    Example of output:
    Loss item : 4.466595649719238
    avg train loss at epoch 0 : 3.0191433942676382
    cur_loss_avg: 3.1088061332702637
    avg val loss at epoch 0 : 2.936697326722692
    Epoch : 0 val lb loss = 1.440756916999817
    ....
    """
    torch.save(model.state_dict(), './trained_model.pkl')


if __name__ == "__main__":
    main()