@@ -1,4 +1,4 @@
-# author - Richard Liao
+# author - Richard Liao
 # Dec 26 2016
 import numpy as np
 import pandas as pd
@@ -11,7 +11,6 @@
 import sys
 import os
 
-os.environ['KERAS_BACKEND']='theano'
 
 from keras.preprocessing.text import Tokenizer, text_to_word_sequence
 from keras.preprocessing.sequence import pad_sequences
@@ -24,25 +23,27 @@
 
 from keras import backend as K
 from keras.engine.topology import Layer, InputSpec
-from keras import initializations
+from keras import initializers
 
 MAX_SENT_LENGTH = 100
 MAX_SENTS = 15
 MAX_NB_WORDS = 20000
 EMBEDDING_DIM = 100
 VALIDATION_SPLIT = 0.2
 
+
 def clean_str(string):
     """
     Tokenization/string cleaning for dataset
     Every dataset is lower cased except
     """
-    string = re.sub(r"\\", "", string)
-    string = re.sub(r"\'", "", string)
-    string = re.sub(r"\"", "", string)
+    string = re.sub(r"\\", "", string)
+    string = re.sub(r"\'", "", string)
+    string = re.sub(r"\"", "", string)
    return string.strip().lower()
 
-data_train = pd.read_csv('~/Testground/data/imdb/labeledTrainData.tsv', sep='\t')
+
+data_train = pd.read_csv('labeledTrainData.tsv', sep='\t')
 print data_train.shape
 
 from nltk import tokenize
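
The import swap in the hunk above tracks the Keras 1 -> 2 rename of keras.initializations to keras.initializers; initializers.get('normal') resolves to a zero-mean RandomNormal initializer (stddev 0.05 by default in Keras 2). A minimal sanity check of the pattern the new AttLayer uses below, assuming a Keras 2.x install; the 200 -> 100 shape is illustrative only:

    from keras import backend as K
    from keras import initializers

    init = initializers.get('normal')  # zero-mean RandomNormal
    w = K.variable(init((200, 100)))   # K.variable(init(shape)), as in AttLayer.build
    print(K.int_shape(w))              # (200, 100)
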
@@ -53,11 +54,11 @@ def clean_str(string):
 
 for idx in range(data_train.review.shape[0]):
     text = BeautifulSoup(data_train.review[idx])
-    text = clean_str(text.get_text().encode('ascii','ignore'))
+    text = clean_str(text.get_text().encode('ascii', 'ignore'))
     texts.append(text)
     sentences = tokenize.sent_tokenize(text)
     reviews.append(sentences)
-
+
     labels.append(data_train.sentiment[idx])
 
 tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
@@ -67,14 +68,14 @@ def clean_str(string):
 
 for i, sentences in enumerate(reviews):
     for j, sent in enumerate(sentences):
-        if j < MAX_SENTS:
+        if j < MAX_SENTS:
             wordTokens = text_to_word_sequence(sent)
-            k = 0
+            k = 0
             for _, word in enumerate(wordTokens):
-                if k < MAX_SENT_LENGTH and tokenizer.word_index[word]< MAX_NB_WORDS:
-                    data[i,j,k] = tokenizer.word_index[word]
-                    k = k + 1
-
+                if k < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
+                    data[i, j, k] = tokenizer.word_index[word]
+                    k = k + 1
+
 word_index = tokenizer.word_index
 print('Total %s unique tokens.' % len(word_index))
 
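
The nested loops above pack each review into a fixed (MAX_SENTS, MAX_SENT_LENGTH) grid inside the zero-initialized data tensor: sentences beyond MAX_SENTS and words beyond MAX_SENT_LENGTH are truncated, and any word whose tokenizer rank is not below MAX_NB_WORDS is skipped. A toy run of the same indexing rule, with hypothetical ranks and sizes in place of the script's constants:

    import numpy as np

    toy = np.zeros((1, 2, 4))                       # 1 review, 2 sentences, 4 words max
    ranks = {'good': 1, 'movie': 2, 'rare': 25000}  # stand-in for tokenizer.word_index
    for j, sent in enumerate([['good', 'movie', 'rare'], ['good']]):
        k = 0
        for word in sent:
            if k < 4 and ranks[word] < 20000:       # same guard as the hunk above
                toy[0, j, k] = ranks[word]
                k = k + 1
    # toy[0] -> [[1, 2, 0, 0], [1, 0, 0, 0]]; 'rare' is dropped, the rest stays zero-padded
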
@@ -97,7 +98,7 @@ def clean_str(string):
 print y_train.sum(axis=0)
 print y_val.sum(axis=0)
 
-GLOVE_DIR = "/ext/home/analyst/Testground/data/glove"
+GLOVE_DIR = "."
 embeddings_index = {}
 f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
 for line in f:
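
glove.6B.100d.txt stores one token per line followed by its 100 float coordinates, so building embeddings_index is a plain split-and-parse; the loop body itself falls in the context elided between this hunk and the next. A self-contained sketch of that standard parse:

    import numpy as np

    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            # first field is the token, the remaining 100 fields are its vector
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors.' % len(embeddings_index))
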
@@ -115,32 +116,7 @@ def clean_str(string):
     if embedding_vector is not None:
         # words not found in embedding index will be all-zeros.
         embedding_matrix[i] = embedding_vector
-
-embedding_layer = Embedding(len(word_index) + 1,
-                            EMBEDDING_DIM,
-                            weights=[embedding_matrix],
-                            input_length=MAX_SENT_LENGTH,
-                            trainable=True)
-
-sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
-embedded_sequences = embedding_layer(sentence_input)
-l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
-sentEncoder = Model(sentence_input, l_lstm)
-
-review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
-review_encoder = TimeDistributed(sentEncoder)(review_input)
-l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
-preds = Dense(2, activation='softmax')(l_lstm_sent)
-model = Model(review_input, preds)
-
-model.compile(loss='categorical_crossentropy',
-              optimizer='rmsprop',
-              metrics=['acc'])
 
-print("model fitting - Hierarchical LSTM")
-print model.summary()
-model.fit(x_train, y_train, validation_data=(x_val, y_val),
-          nb_epoch=10, batch_size=50)
 
 # building Hierarchical Attention network
 embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
@@ -149,51 +125,67 @@ def clean_str(string):
     if embedding_vector is not None:
         # words not found in embedding index will be all-zeros.
         embedding_matrix[i] = embedding_vector
-
+
 embedding_layer = Embedding(len(word_index) + 1,
                             EMBEDDING_DIM,
                             weights=[embedding_matrix],
                             input_length=MAX_SENT_LENGTH,
-                            trainable=True)
+                            trainable=True,
+                            mask_zero=True)
+
 
 class AttLayer(Layer):
-    def __init__(self, **kwargs):
-        self.init = initializations.get('normal')
-        #self.input_spec = [InputSpec(ndim=3)]
-        super(AttLayer, self).__init__(**kwargs)
+    def __init__(self, attention_dim):
+        self.init = initializers.get('normal')
+        self.supports_masking = True
+        self.attention_dim = attention_dim
+        super(AttLayer, self).__init__()
 
     def build(self, input_shape):
-        assert len(input_shape)==3
-        #self.W = self.init((input_shape[-1],1))
-        self.W = self.init((input_shape[-1],))
-        #self.input_spec = [InputSpec(shape=input_shape)]
-        self.trainable_weights = [self.W]
-        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!
+        assert len(input_shape) == 3
+        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
+        self.b = K.variable(self.init((self.attention_dim,)))
+        self.u = K.variable(self.init((self.attention_dim, 1)))
+        self.trainable_weights = [self.W, self.b, self.u]
+        super(AttLayer, self).build(input_shape)
+
+    def compute_mask(self, inputs, mask=None):
+        return mask
 
     def call(self, x, mask=None):
-        eij = K.tanh(K.dot(x, self.W))
-
-        ai = K.exp(eij)
-        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
-
-        weighted_input = x*weights.dimshuffle(0,1,'x')
-        return weighted_input.sum(axis=1)
-
-    def get_output_shape_for(self, input_shape):
+        # x: [batch_size, seq_len, input_dim]
+        # u: [attention_dim, 1], the learned context vector
+        # uit = tanh(x.W + b)
+        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
+        ait = K.dot(uit, self.u)
+        ait = K.squeeze(ait, -1)
+
+        ait = K.exp(ait)
+
+        if mask is not None:
+            # cast the mask to floatX to avoid float64 upcasting in theano
+            ait *= K.cast(mask, K.floatx())
+        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        ait = K.expand_dims(ait)
+        weighted_input = x * ait
+        output = K.sum(weighted_input, axis=1)
+
+        return output
+
+    def compute_output_shape(self, input_shape):
         return (input_shape[0], input_shape[-1])
 
+
 sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
 embedded_sequences = embedding_layer(sentence_input)
 l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
-l_dense = TimeDistributed(Dense(200))(l_lstm)
-l_att = AttLayer()(l_dense)
+l_att = AttLayer(100)(l_lstm)
 sentEncoder = Model(sentence_input, l_att)
 
-review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
+review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
 review_encoder = TimeDistributed(sentEncoder)(review_input)
 l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
-l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
-l_att_sent = AttLayer()(l_dense_sent)
+l_att_sent = AttLayer(100)(l_lstm_sent)
 preds = Dense(2, activation='softmax')(l_att_sent)
 model = Model(review_input, preds)
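
The new AttLayer implements the attention step of Yang et al.'s Hierarchical Attention Networks: uit = tanh(W.hit + b), ait = softmax(uit.u), output = sum_t ait * hit, with padded timesteps excluded from the softmax via the propagated mask. A NumPy sketch of the same forward pass for checking shapes and math offline; the function name and sizes are illustrative, not part of the commit:

    import numpy as np

    def att_forward(x, W, b, u, mask=None):
        # x: (batch, seq_len, input_dim), as produced by Bidirectional(GRU(100))
        uit = np.tanh(np.dot(x, W) + b)               # (batch, seq_len, attention_dim)
        ait = np.exp(np.squeeze(np.dot(uit, u), -1))  # (batch, seq_len) raw scores
        if mask is not None:
            ait *= mask                               # zero weight for padded steps
        ait /= ait.sum(axis=1, keepdims=True) + 1e-8  # epsilon guards all-masked rows
        return (x * ait[..., None]).sum(axis=1)       # (batch, input_dim)

    rng = np.random.RandomState(0)
    x = rng.randn(2, 5, 200)                          # 2 docs, 5 steps, 200-dim encodings
    out = att_forward(x, rng.randn(200, 100), rng.randn(100), rng.randn(100, 1))
    assert out.shape == (2, 200)

Exponentiating before masking and only then normalizing matches the layer exactly: masked positions contribute zero weight, and the added epsilon keeps an all-masked row from dividing by zero.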