model.py
import tensorflow as tf
from .func import CudnnGRU, NativeGRU, dot_attention, summ, dropout, PointerNet
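

# R-Net-style reading-comprehension model: word + character embeddings, a
# multi-layer BiGRU encoder, gated question-context attention, self-matching
# attention, and a pointer network that predicts the answer start and end
# positions. The graph is assembled in Model.ready() below.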
class Model(object):
    def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True):
        self.config = config
        self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
                                           initializer=tf.constant_initializer(0), trainable=False)
        # Tensors delivered by the input iterator:
        # self.c: context (paragraph) word indices
        # self.q: question word indices
        # self.ch: context char indices
        # self.qh: question char indices
        # self.y1: answer start point (distribution over context positions)
        # self.y2: answer end point (distribution over context positions)
        # self.qa_id: qa_id (not used by the graph)
        self.c, self.q, self.ch, self.qh, self.y1, self.y2, self.qa_id = batch.get_next()
        self.is_train = tf.get_variable("is_train", shape=[], dtype=tf.bool, trainable=False)
        self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(word_mat, dtype=tf.float32),
                                        trainable=False)  # frozen word embeddings
        self.char_mat = tf.get_variable("char_mat",
                                        initializer=tf.constant(char_mat, dtype=tf.float32))  # trainable char embeddings

        self.c_mask = tf.cast(self.c, tf.bool)  # True for non-padding paragraph tokens
        self.q_mask = tf.cast(self.q, tf.bool)  # True for non-padding question tokens
        self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
        self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

        if opt:
            # Trim the padded batch tensors down to the longest sequence actually
            # present in this batch, instead of the global para/ques limits.
            N, CL = config.batch_size, config.char_limit
            self.c_maxlen = tf.reduce_max(self.c_len)
            self.q_maxlen = tf.reduce_max(self.q_len)
            self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen])
            self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen])
            self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen])
            self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen])
            self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL])
            self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL])
            self.y1 = tf.slice(self.y1, [0, 0], [N, self.c_maxlen])
            self.y2 = tf.slice(self.y2, [0, 0], [N, self.c_maxlen])
        else:
            self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit

        # Per-token character-sequence lengths, flattened to [batch * seq_len].
        self.ch_len = tf.reshape(tf.reduce_sum(
            tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1])
        self.qh_len = tf.reshape(tf.reduce_sum(
            tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1])

        self.ready()

        if trainable:
            self.lr = tf.get_variable("lr", shape=[], dtype=tf.float32, trainable=False)
            self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.lr, epsilon=1e-6, rho=0.95)
            grads = self.opt.compute_gradients(self.loss)
            gradients, variables = zip(*grads)
            capped_grads, _ = tf.clip_by_global_norm(gradients, config.grad_clip)
            self.train_op = self.opt.apply_gradients(
                zip(capped_grads, variables), global_step=self.global_step)

    def ready(self):
        config = self.config
        # N: batch size, PL/QL: paragraph/question length, CL: chars per word,
        # d: hidden size, dc: char embedding dim, dg: char-level GRU hidden size.
        N, PL, QL, CL, d, dc, dg = \
            config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, \
            config.char_dim, config.char_hidden
        gru = CudnnGRU if config.use_cudnn else NativeGRU
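
        # Embedding layer: word embeddings are looked up from the frozen word_mat;
        # character-level embeddings are produced by running each token's characters
        # through a BiGRU and concatenating the final forward/backward states
        # (2 * dg per token). The same GRU cells are called for both context and
        # question, so the character encoder shares its weights between them.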
with tf.variable_scope("emb"):
with tf.variable_scope("char"):
ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch), [N * PL, CL, dc])
qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh), [N * QL, CL, dc])
ch_emb = dropout(ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
qh_emb = dropout(qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
cell_fw = tf.contrib.rnn.GRUCell(dg)
cell_bw = tf.contrib.rnn.GRUCell(dg)
_, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, ch_emb, self.ch_len,
dtype=tf.float32)
ch_emb = tf.concat([state_fw, state_bw], axis=1)
_, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, qh_emb, self.qh_len,
dtype=tf.float32)
qh_emb = tf.concat([state_fw, state_bw], axis=1)
qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
with tf.name_scope("word"):
c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)
c_emb = tf.concat([c_emb, ch_emb], axis=2)
q_emb = tf.concat([q_emb, qh_emb], axis=2)
with tf.variable_scope("encoding"):
rnn = gru(num_layers=3, num_units=d, batch_size=N, input_size=c_emb.get_shape().as_list()[-1],
keep_prob=config.keep_prob, is_train=self.is_train)
c = rnn(c_emb, seq_len=self.c_len) # representation of paragraph
q = rnn(q_emb, seq_len=self.q_len) # representation of question
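
        # Note: the encoder appears to concatenate the outputs of all three BiGRU
        # layers, so c and q are wider than 2 * d; the pointer-network init below
        # uses q[:, :, -2 * d:], i.e. only the topmost layer of the question encoding.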
with tf.variable_scope("attention"): # gated att rnn (using dot att from Attention is All You Need actually)
qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train)
rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=qc_att.get_shape().as_list()[-1],
keep_prob=config.keep_prob, is_train=self.is_train)
att = rnn(qc_att, seq_len=self.c_len)
with tf.variable_scope("match"): # self-matching rnn
self_att = dot_attention(att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob,
is_train=self.is_train)
rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape().as_list()[-1],
keep_prob=config.keep_prob, is_train=self.is_train)
match = rnn(self_att, seq_len=self.c_len)
with tf.variable_scope("pointer"):
init = summ(q[:, :, -2 * d:], d, mask=self.q_mask, keep_prob=config.ptr_keep_prob, is_train=self.is_train)
pointer = PointerNet(batch=N, hidden=init.get_shape().as_list()[-1], keep_prob=config.ptr_keep_prob,
is_train=self.is_train)
logits1, logits2 = pointer(init, match, d, self.c_mask)
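
        # Answer-span prediction: the outer product of the start and end
        # distributions gives P(start = i) * P(end = j) for every (i, j) pair;
        # matrix_band_part(outer, 0, 15) keeps only spans whose end is at most 15
        # positions after the start, and the row/column argmax of that banded
        # matrix gives the predicted start (yp1) and end (yp2).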
with tf.variable_scope("predict"):
outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
tf.expand_dims(tf.nn.softmax(logits2), axis=1))
outer = tf.matrix_band_part(outer, 0, 15)
self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1, labels=self.y1)
losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=self.y2)
self.loss = tf.reduce_mean(losses + losses2)

    def get_loss(self):
        return self.loss

    def get_global_step(self):
        return self.global_step
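

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original training pipeline): it
    # builds the Model on random data just to check that the graph compiles and
    # one training step runs. The config field values, the 300-d word vectors,
    # and the tf.data input pipeline below are illustrative assumptions; the real
    # project supplies its own config and preprocessed records. Run this via
    # `python -m <package>.model` so the relative import above resolves.
    import collections

    import numpy as np

    Config = collections.namedtuple("Config", [
        "batch_size", "char_limit", "para_limit", "ques_limit", "hidden",
        "char_dim", "char_hidden", "use_cudnn", "keep_prob", "ptr_keep_prob",
        "grad_clip"])
    config = Config(batch_size=2, char_limit=16, para_limit=100, ques_limit=20,
                    hidden=75, char_dim=8, char_hidden=50, use_cudnn=False,
                    keep_prob=0.7, ptr_keep_prob=0.7, grad_clip=5.0)

    vocab_size, char_vocab_size = 1000, 100
    word_mat = np.random.randn(vocab_size, 300).astype(np.float32)
    char_mat = np.random.randn(char_vocab_size, config.char_dim).astype(np.float32)

    def random_example():
        # One fake (context, question, answer) example; ids start at 1 so that
        # nothing is treated as padding.
        c = np.random.randint(1, vocab_size, size=config.para_limit).astype(np.int32)
        q = np.random.randint(1, vocab_size, size=config.ques_limit).astype(np.int32)
        ch = np.random.randint(1, char_vocab_size,
                               size=(config.para_limit, config.char_limit)).astype(np.int32)
        qh = np.random.randint(1, char_vocab_size,
                               size=(config.ques_limit, config.char_limit)).astype(np.int32)
        y1 = np.zeros(config.para_limit, dtype=np.float32)
        y2 = np.zeros(config.para_limit, dtype=np.float32)
        y1[3], y2[7] = 1.0, 1.0  # arbitrary answer span
        return c, q, ch, qh, y1, y2, np.int32(0)

    dataset = tf.data.Dataset.from_generator(
        lambda: (random_example() for _ in range(8)),
        (tf.int32, tf.int32, tf.int32, tf.int32, tf.float32, tf.float32, tf.int32))
    batch = dataset.batch(config.batch_size).make_one_shot_iterator()

    model = Model(config, batch, word_mat=word_mat, char_mat=char_mat)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.assign(model.is_train, tf.constant(True, dtype=tf.bool)))
        sess.run(tf.assign(model.lr, tf.constant(0.5, dtype=tf.float32)))
        loss, _ = sess.run([model.get_loss(), model.train_op])
        print("smoke-test loss:", loss)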