@@ -1,4 +1,4 @@
-# author - Richard Liao
+# author - Richard Liao
 # Dec 26 2016
 import numpy as np
 import pandas as pd
@@ -11,7 +11,6 @@
 import sys
 import os
 
-os.environ['KERAS_BACKEND']='theano'
 
 from keras.preprocessing.text import Tokenizer, text_to_word_sequence
 from keras.preprocessing.sequence import pad_sequences
@@ -24,25 +23,27 @@
 
 from keras import backend as K
 from keras.engine.topology import Layer, InputSpec
-from keras import initializations
+from keras import initializers
 
 MAX_SENT_LENGTH = 100
 MAX_SENTS = 15
 MAX_NB_WORDS = 20000
 EMBEDDING_DIM = 100
 VALIDATION_SPLIT = 0.2
 
+
 def clean_str(string):
     """
     Tokenization/string cleaning for dataset
     Every dataset is lower cased except
     """
-    string = re.sub(r"\\", "", string)
-    string = re.sub(r"\'", "", string)
-    string = re.sub(r"\"", "", string)
+    string = re.sub(r"\\", "", string)
+    string = re.sub(r"\'", "", string)
+    string = re.sub(r"\"", "", string)
    return string.strip().lower()
 
-data_train = pd.read_csv('~/Testground/data/imdb/labeledTrainData.tsv', sep='\t')
+
+data_train = pd.read_csv('labeledTrainData.tsv', sep='\t')
 print data_train.shape
 
 from nltk import tokenize
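
The import swap in the hunk above tracks the Keras 1 -> 2 rename of keras.initializations to keras.initializers; initializers.get('normal') resolves to a zero-mean RandomNormal initializer (stddev 0.05 by default in Keras 2). A minimal sanity check of the pattern the new AttLayer uses below, assuming a Keras 2.x install; the 200 -> 100 shape is illustrative only:

    from keras import backend as K
    from keras import initializers

    init = initializers.get('normal')  # zero-mean RandomNormal
    w = K.variable(init((200, 100)))   # K.variable(init(shape)), as in AttLayer.build
    print(K.int_shape(w))              # (200, 100)
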
@@ -53,11 +54,11 @@ def clean_str(string):
 
 for idx in range(data_train.review.shape[0]):
     text = BeautifulSoup(data_train.review[idx])
-    text = clean_str(text.get_text().encode('ascii','ignore'))
+    text = clean_str(text.get_text().encode('ascii', 'ignore'))
     texts.append(text)
     sentences = tokenize.sent_tokenize(text)
     reviews.append(sentences)
-
+
     labels.append(data_train.sentiment[idx])
 
 tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
@@ -67,14 +68,14 @@ def clean_str(string):
 
 for i, sentences in enumerate(reviews):
     for j, sent in enumerate(sentences):
-        if j < MAX_SENTS:
+        if j < MAX_SENTS:
             wordTokens = text_to_word_sequence(sent)
-            k = 0
+            k = 0
             for _, word in enumerate(wordTokens):
-                if k < MAX_SENT_LENGTH and tokenizer.word_index[word]< MAX_NB_WORDS:
-                    data[i,j,k] = tokenizer.word_index[word]
-                    k = k + 1
-
+                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
+                    data[i, j, k] = tokenizer.word_index[word]
+                    k = k + 1
+
 word_index = tokenizer.word_index
 print('Total %s unique tokens.' % len(word_index))
 
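
The nested loops above pack each review into a fixed (MAX_SENTS, MAX_SENT_LENGTH) grid inside the zero-initialized data tensor: sentences beyond MAX_SENTS and words beyond MAX_SENT_LENGTH are truncated, and any word whose tokenizer rank is not below MAX_NB_WORDS is skipped. A toy run of the same indexing rule, with hypothetical ranks and sizes in place of the script's constants:

    import numpy as np

    toy = np.zeros((1, 2, 4))                       # 1 review, 2 sentences, 4 words max
    ranks = {'good': 1, 'movie': 2, 'rare': 25000}  # stand-in for tokenizer.word_index
    for j, sent in enumerate([['good', 'movie', 'rare'], ['good']]):
        k = 0
        for word in sent:
            if k < 4 and ranks[word] < 20000:       # same guard as the hunk above
                toy[0, j, k] = ranks[word]
                k = k + 1
    # toy[0] -> [[1, 2, 0, 0], [1, 0, 0, 0]]; 'rare' is dropped, the rest stays zero-padded
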
@@ -97,7 +98,7 @@ def clean_str(string):
 print y_train.sum(axis=0)
 print y_val.sum(axis=0)
 
-GLOVE_DIR = "/ext/home/analyst/Testground/data/glove"
+GLOVE_DIR = "."
 embeddings_index = {}
 f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
 for line in f:
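
glove.6B.100d.txt stores one token per line followed by its 100 float coordinates, so building embeddings_index is a plain split-and-parse; the loop body itself falls in the context elided between this hunk and the next. A self-contained sketch of that standard parse:

    import numpy as np

    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            # first field is the token, the remaining 100 fields are its vector
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors.' % len(embeddings_index))
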
@@ -115,32 +116,7 @@ def clean_str(string):
     if embedding_vector is not None:
         # words not found in embedding index will be all-zeros.
         embedding_matrix[i] = embedding_vector
-
-embedding_layer = Embedding(len(word_index) + 1,
-                            EMBEDDING_DIM,
-                            weights=[embedding_matrix],
-                            input_length=MAX_SENT_LENGTH,
-                            trainable=True)
-
-sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
-embedded_sequences = embedding_layer(sentence_input)
-l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
-sentEncoder = Model(sentence_input, l_lstm)
-
-review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
-review_encoder = TimeDistributed(sentEncoder)(review_input)
-l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
-preds = Dense(2, activation='softmax')(l_lstm_sent)
-model = Model(review_input, preds)
-
-model.compile(loss='categorical_crossentropy',
-              optimizer='rmsprop',
-              metrics=['acc'])
 
-print("model fitting - Hierarchical LSTM")
-print model.summary()
-model.fit(x_train, y_train, validation_data=(x_val, y_val),
-          nb_epoch=10, batch_size=50)
 
 # building Hierarchical Attention network
 embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
@@ -149,51 +125,67 @@ def clean_str(string):
     if embedding_vector is not None:
         # words not found in embedding index will be all-zeros.
         embedding_matrix[i] = embedding_vector
-
+
 embedding_layer = Embedding(len(word_index) + 1,
                             EMBEDDING_DIM,
                             weights=[embedding_matrix],
                             input_length=MAX_SENT_LENGTH,
-                            trainable=True)
+                            trainable=True,
+                            mask_zero=True)
+
 
 class AttLayer(Layer):
-    def __init__(self, **kwargs):
-        self.init = initializations.get('normal')
-        #self.input_spec = [InputSpec(ndim=3)]
-        super(AttLayer, self).__init__(**kwargs)
+    def __init__(self, attention_dim):
+        self.init = initializers.get('normal')
+        self.supports_masking = True
+        self.attention_dim = attention_dim
+        super(AttLayer, self).__init__()
 
     def build(self, input_shape):
-        assert len(input_shape)==3
-        #self.W = self.init((input_shape[-1],1))
-        self.W = self.init((input_shape[-1],))
-        #self.input_spec = [InputSpec(shape=input_shape)]
-        self.trainable_weights = [self.W]
-        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!
+        assert len(input_shape) == 3
+        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
+        self.b = K.variable(self.init((self.attention_dim,)))
+        self.u = K.variable(self.init((self.attention_dim, 1)))
+        self.trainable_weights = [self.W, self.b, self.u]
+        super(AttLayer, self).build(input_shape)
+
+    def compute_mask(self, inputs, mask=None):
+        return mask
 
     def call(self, x, mask=None):
-        eij = K.tanh(K.dot(x, self.W))
-
-        ai = K.exp(eij)
-        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
-
-        weighted_input = x*weights.dimshuffle(0,1,'x')
-        return weighted_input.sum(axis=1)
-
-    def get_output_shape_for(self, input_shape):
+        # x: [batch_size, seq_len, input_dim]
+        # u: [attention_dim, 1], the learned context vector
+        # uit = tanh(x.W + b)
+        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
+        ait = K.dot(uit, self.u)
+        ait = K.squeeze(ait, -1)
+
+        ait = K.exp(ait)
+
+        if mask is not None:
+            # cast the mask to floatX to avoid float64 upcasting in theano
+            ait *= K.cast(mask, K.floatx())
+        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        ait = K.expand_dims(ait)
+        weighted_input = x * ait
+        output = K.sum(weighted_input, axis=1)
+
+        return output
+
+    def compute_output_shape(self, input_shape):
         return (input_shape[0], input_shape[-1])
 
+
 sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
 embedded_sequences = embedding_layer(sentence_input)
 l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
-l_dense = TimeDistributed(Dense(200))(l_lstm)
-l_att = AttLayer()(l_dense)
+l_att = AttLayer(100)(l_lstm)
 sentEncoder = Model(sentence_input, l_att)
 
-review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
+review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
 review_encoder = TimeDistributed(sentEncoder)(review_input)
 l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
-l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
-l_att_sent = AttLayer()(l_dense_sent)
+l_att_sent = AttLayer(100)(l_lstm_sent)
 preds = Dense(2, activation='softmax')(l_att_sent)
 model = Model(review_input, preds)
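
The new AttLayer implements the attention step of Yang et al.'s Hierarchical Attention Networks: uit = tanh(W.hit + b), ait = softmax(uit.u), output = sum_t ait * hit, with padded timesteps excluded from the softmax via the propagated mask. A NumPy sketch of the same forward pass for checking shapes and math offline; the function name and sizes are illustrative, not part of the commit:

    import numpy as np

    def att_forward(x, W, b, u, mask=None):
        # x: (batch, seq_len, input_dim), as produced by Bidirectional(GRU(100))
        uit = np.tanh(np.dot(x, W) + b)               # (batch, seq_len, attention_dim)
        ait = np.exp(np.squeeze(np.dot(uit, u), -1))  # (batch, seq_len) raw scores
        if mask is not None:
            ait *= mask                               # zero weight for padded steps
        ait /= ait.sum(axis=1, keepdims=True) + 1e-8  # epsilon guards all-masked rows
        return (x * ait[..., None]).sum(axis=1)       # (batch, input_dim)

    rng = np.random.RandomState(0)
    x = rng.randn(2, 5, 200)                          # 2 docs, 5 steps, 200-dim encodings
    out = att_forward(x, rng.randn(200, 100), rng.randn(100), rng.randn(100, 1))
    assert out.shape == (2, 200)

Exponentiating before masking and only then normalizing matches the layer exactly: masked positions contribute zero weight, and the added epsilon keeps an all-masked row from dividing by zero.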