
Commit 2947737

update attention layer
1 parent 0e16186 commit 2947737

3 files changed: +122 −68 lines changed


README.md

+26-1
@@ -7,4 +7,29 @@ textClassifierConv has implemented [Convolutional Neural Networks for Sentence C
 textClassifierRNN has implemented bidirectional LSTM and one level attentional RNN. Please see the [my blog](https://richliao.github.io/supervised/classification/2016/12/26/textclassifier-RNN/) for full detail.
 
 ## update on 6/22/2017 ##
-To derive the attention weight which can be useful to identify important words for the classification. Please see my latest update on the post. All you need to do is run a forward pass right before attention layer output. The result is not very promising. I will update the post once I have further result.
+To derive the attention weights, which can be useful for identifying the words that matter most for the classification, please see my latest update on the post. All you need to do is run a forward pass that stops right before the attention layer output. The result is not very promising; I will update the post once I have further results.
+
+---
+This repo is forked from [https://github.com/richliao/textClassifier](https://github.com/richliao/textClassifier). We found some issues with the original code (see [this issue](https://github.com/richliao/textClassifier/issues/28)), so we have updated textClassifierHATT to run with `python 2.7` and `keras 2.0.8`.
+
+```
+# clone the repo
+git clone {repo address}
+
+# install the dependencies
+cd textClassifier
+pip install -r req.txt
+
+# download the IMDB training data from Kaggle
+wget https://www.kaggle.com/c/word2vec-nlp-tutorial/download/labeledTrainData.tsv
+# download the GloVe word vectors
+wget http://nlp.stanford.edu/data/glove.6B.zip
+unzip glove.6B.zip
+
+# train the model
+python textClassifierHATT.py
+```
+
+Enjoy!
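
As a side note on the README text above: the "forward pass right before the attention layer output" idea can be prototyped with a small helper. This is only a sketch under assumptions: it presumes the trained `sentEncoder` and `AttLayer` from textClassifierHATT.py are in scope, the helper name `word_attention_weights` is made up for illustration, and masking of padded positions is ignored.

```python
import numpy as np
from keras.models import Model
from keras import backend as K

def word_attention_weights(sent_encoder, encoded_sentence):
    """Return per-word attention weights for one encoded sentence (sketch only)."""
    att_layer = sent_encoder.layers[-1]          # the trained AttLayer instance
    # forward pass that stops right before the attention layer output
    hidden_model = Model(sent_encoder.input, sent_encoder.layers[-2].output)
    h = hidden_model.predict(encoded_sentence)   # (1, MAX_SENT_LENGTH, 200) Bi-GRU states

    # replay the attention scoring with the trained weights W, b, u
    W, b, u = (K.get_value(w) for w in (att_layer.W, att_layer.b, att_layer.u))
    uit = np.tanh(np.dot(h, W) + b)              # (1, MAX_SENT_LENGTH, attention_dim)
    ait = np.exp(np.dot(uit, u).squeeze(-1))     # (1, MAX_SENT_LENGTH)
    return ait / ait.sum(axis=1, keepdims=True)  # softmax over the words
```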

req.txt

+37
@@ -0,0 +1,37 @@
+backports.weakref==1.0.post1
+beautifulsoup4==4.6.3
+bleach==1.5.0
+bs4==0.0.1
+Cython==0.27.3
+enum34==1.1.6
+funcsigs==1.0.2
+h5py==2.7.1
+html5lib==0.9999999
+Keras==2.0.8
+laspy==1.5.0
+lda==1.0.5
+Markdown==2.6.9
+mock==2.0.0
+nltk==3.3
+numpy==1.13.3
+olefile==0.44
+pandas==0.20.3
+pbr==3.1.1
+Pillow==4.3.0
+protobuf==3.4.0
+pypinyin==0.29.0
+python-dateutil==2.6.1
+pytz==2017.2
+PyYAML==3.12
+scikit-learn==0.19.1
+scipy==0.19.1
+six==1.11.0
+sklearn==0.0
+tensorflow==1.3.0
+tensorflow-gpu==1.3.0
+tensorflow-tensorboard==0.1.8
+Theano==1.0.2
+thulac==0.2.0
+typing==3.6.2
+Werkzeug==0.12.2
+word2vec==0.9.2

textClassifierHATT.py

+59-67
@@ -1,4 +1,4 @@
-# author - Richard Liao
+# author - Richard Liao
 # Dec 26 2016
 import numpy as np
 import pandas as pd
@@ -11,7 +11,6 @@
 import sys
 import os
 
-os.environ['KERAS_BACKEND']='theano'
 
 from keras.preprocessing.text import Tokenizer, text_to_word_sequence
 from keras.preprocessing.sequence import pad_sequences
@@ -24,25 +23,27 @@
 
 from keras import backend as K
 from keras.engine.topology import Layer, InputSpec
-from keras import initializations
+from keras import initializers
 
 MAX_SENT_LENGTH = 100
 MAX_SENTS = 15
 MAX_NB_WORDS = 20000
 EMBEDDING_DIM = 100
 VALIDATION_SPLIT = 0.2
 
+
 def clean_str(string):
     """
     Tokenization/string cleaning for dataset
     Every dataset is lower cased except
     """
-    string = re.sub(r"\\", "", string)
-    string = re.sub(r"\'", "", string)
-    string = re.sub(r"\"", "", string)
+    string = re.sub(r"\\", "", string)
+    string = re.sub(r"\'", "", string)
+    string = re.sub(r"\"", "", string)
     return string.strip().lower()
 
-data_train = pd.read_csv('~/Testground/data/imdb/labeledTrainData.tsv', sep='\t')
+
+data_train = pd.read_csv('labeledTrainData.tsv', sep='\t')
 print data_train.shape
 
 from nltk import tokenize
@@ -53,11 +54,11 @@ def clean_str(string):
 
 for idx in range(data_train.review.shape[0]):
     text = BeautifulSoup(data_train.review[idx])
-    text = clean_str(text.get_text().encode('ascii','ignore'))
+    text = clean_str(text.get_text().encode('ascii', 'ignore'))
     texts.append(text)
     sentences = tokenize.sent_tokenize(text)
     reviews.append(sentences)
-
+
     labels.append(data_train.sentiment[idx])
 
 tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
@@ -67,14 +68,14 @@ def clean_str(string):
 
 for i, sentences in enumerate(reviews):
     for j, sent in enumerate(sentences):
-        if j< MAX_SENTS:
+        if j < MAX_SENTS:
             wordTokens = text_to_word_sequence(sent)
-            k=0
+            k = 0
             for _, word in enumerate(wordTokens):
-                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
-                    data[i,j,k] = tokenizer.word_index[word]
-                    k=k+1
-
+                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
+                    data[i, j, k] = tokenizer.word_index[word]
+                    k = k + 1
+
 word_index = tokenizer.word_index
 print('Total %s unique tokens.' % len(word_index))
 
@@ -97,7 +98,7 @@ def clean_str(string):
 print y_train.sum(axis=0)
 print y_val.sum(axis=0)
 
-GLOVE_DIR = "/ext/home/analyst/Testground/data/glove"
+GLOVE_DIR = "."
 embeddings_index = {}
 f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
 for line in f:
@@ -115,32 +116,7 @@ def clean_str(string):
     if embedding_vector is not None:
         # words not found in embedding index will be all-zeros.
         embedding_matrix[i] = embedding_vector
-
-embedding_layer = Embedding(len(word_index) + 1,
-                            EMBEDDING_DIM,
-                            weights=[embedding_matrix],
-                            input_length=MAX_SENT_LENGTH,
-                            trainable=True)
-
-sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
-embedded_sequences = embedding_layer(sentence_input)
-l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
-sentEncoder = Model(sentence_input, l_lstm)
-
-review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
-review_encoder = TimeDistributed(sentEncoder)(review_input)
-l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
-preds = Dense(2, activation='softmax')(l_lstm_sent)
-model = Model(review_input, preds)
-
-model.compile(loss='categorical_crossentropy',
-              optimizer='rmsprop',
-              metrics=['acc'])
 
-print("model fitting - Hierachical LSTM")
-print model.summary()
-model.fit(x_train, y_train, validation_data=(x_val, y_val),
-          nb_epoch=10, batch_size=50)
 
 # building Hierachical Attention network
 embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
@@ -149,51 +125,67 @@ def clean_str(string):
     if embedding_vector is not None:
         # words not found in embedding index will be all-zeros.
         embedding_matrix[i] = embedding_vector
-
+
 embedding_layer = Embedding(len(word_index) + 1,
                             EMBEDDING_DIM,
                             weights=[embedding_matrix],
                             input_length=MAX_SENT_LENGTH,
-                            trainable=True)
+                            trainable=True,
+                            mask_zero=True)
+
 
 class AttLayer(Layer):
-    def __init__(self, **kwargs):
-        self.init = initializations.get('normal')
-        #self.input_spec = [InputSpec(ndim=3)]
-        super(AttLayer, self).__init__(**kwargs)
+    def __init__(self, attention_dim):
+        self.init = initializers.get('normal')
+        self.supports_masking = True
+        self.attention_dim = attention_dim
+        super(AttLayer, self).__init__()
 
     def build(self, input_shape):
-        assert len(input_shape)==3
-        #self.W = self.init((input_shape[-1],1))
-        self.W = self.init((input_shape[-1],))
-        #self.input_spec = [InputSpec(shape=input_shape)]
-        self.trainable_weights = [self.W]
-        super(AttLayer, self).build(input_shape) # be sure you call this somewhere!
+        assert len(input_shape) == 3
+        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
+        self.b = K.variable(self.init((self.attention_dim, )))
+        self.u = K.variable(self.init((self.attention_dim, 1)))
+        self.trainable_weights = [self.W, self.b, self.u]
+        super(AttLayer, self).build(input_shape)
+
+    def compute_mask(self, inputs, mask=None):
+        return mask
 
     def call(self, x, mask=None):
-        eij = K.tanh(K.dot(x, self.W))
-
-        ai = K.exp(eij)
-        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
-
-        weighted_input = x*weights.dimshuffle(0,1,'x')
-        return weighted_input.sum(axis=1)
-
-    def get_output_shape_for(self, input_shape):
+        # size of x :[batch_size, sel_len, attention_dim]
+        # size of u :[batch_size, attention_dim]
+        # uit = tanh(xW+b)
+        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
+        ait = K.dot(uit, self.u)
+        ait = K.squeeze(ait, -1)
+
+        ait = K.exp(ait)
+
+        if mask is not None:
+            # Cast the mask to floatX to avoid float64 upcasting in theano
+            ait *= K.cast(mask, K.floatx())
+        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        ait = K.expand_dims(ait)
+        weighted_input = x * ait
+        output = K.sum(weighted_input, axis=1)
+
+        return output
+
+    def compute_output_shape(self, input_shape):
         return (input_shape[0], input_shape[-1])
 
+
 sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
 embedded_sequences = embedding_layer(sentence_input)
 l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
-l_dense = TimeDistributed(Dense(200))(l_lstm)
-l_att = AttLayer()(l_dense)
+l_att = AttLayer(100)(l_lstm)
 sentEncoder = Model(sentence_input, l_att)
 
-review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
+review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
 review_encoder = TimeDistributed(sentEncoder)(review_input)
 l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
-l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
-l_att_sent = AttLayer()(l_dense_sent)
+l_att_sent = AttLayer(100)(l_lstm_sent)
 preds = Dense(2, activation='softmax')(l_att_sent)
 model = Model(review_input, preds)
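
For context (not part of the hunks shown): the compile and fit step for the attention model presumably sits further down the file. A minimal sketch of how the updated model would typically be trained, mirroring the removed Hierarchical-LSTM block earlier in this diff; the hyperparameter values and variable names (`x_train`, `y_train`, `x_val`, `y_val`) are assumptions, not lines taken from the file.

```python
# Sketch only: compile and train the updated hierarchical attention model.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Hierarchical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=10, batch_size=50)
```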
199191

