# see companion handout for a theoretical intro: http://www.lix.polytechnique.fr/~anti5662/intro_cnn_lstm_tixier.pdf
# gets to ~0.895 accuracy on the test set usually within 2 to 4 epochs (~160s per epoch on NVidia TITAN)
# tested on Ubuntu with Python 3, Keras version 1.1.0, TensorFlow backend
import csv
import json
import numpy as np
from gensim.models.word2vec import Word2Vec
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.constraints import maxnorm
from keras.layers import Convolution1D, GlobalMaxPooling1D, Dense, Embedding, Input, Merge, Dropout
print('packages loaded')
path_to_IMDB = ''
use_pretrained = True # when using pre-trained embeddings, convergence is faster, and absolute accuracy is slightly greater (the margin is bigger when max_size is small)
do_early_stopping = True
model_save = False
if model_save:
    name_save = 'imdb_cnn_pre_400_04_04_17'
    print('model will be saved under name:', name_save)
# ========== parameter values ==========
max_features = int(2e4)
stpwd_thd = 10 # stopword threshold: words with index below this value are excluded (values greater than 1 are supported)
max_size = int(4e2)
word_vector_dim = int(3e2)
do_non_static = True
nb_filters = 200
drop_rate = 0.3
batch_size = 32
nb_epoch = 10 # increasing the number of epochs may lead to overfitting when max_size is small (especially since the dataset is small in the first place)
my_optimizer = 'adam' # proved better than SGD and Adadelta
my_patience = 2 # for early stopping strategy
if not use_pretrained:
    # if the embeddings are initialized randomly, using static mode doesn't make sense
    do_non_static = True
    print("not using pre-trained embeddings, overwriting 'do_non_static' argument")
print('=== parameter values: ===')
print('top',max_features,'words used as features')
print('top',stpwd_thd,'words excluded')
print('max size of doc (in words):',max_size)
print('dim of word vectors:',word_vector_dim)
print('non-static:',do_non_static)
print('number of filters applied to each region:',nb_filters)
print('dropout rate:',drop_rate)
print('batch size:',batch_size)
print('number of epochs:',nb_epoch)
print('optimizer:',my_optimizer)
print('patience:',my_patience)
print('=== end parameter values ===')
# ========== read pre-processed data ==========
# dictionary of word indexes (sorted by decreasing frequency across the corpus)
# this is a 1-based index - 0 is reserved for zero-padding
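# illustrative sketch of the mapping (entries are hypothetical): {'the': 1, 'and': 2, 'a': 3, ...}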
with open(path_to_IMDB + 'word_to_index.json', 'r') as my_file:
    word_to_index = json.load(my_file)
with open(path_to_IMDB + 'training.csv', 'r') as my_file:
    reader = csv.reader(my_file, delimiter=',')
    x_train = list(reader)
with open(path_to_IMDB + 'test.csv', 'r') as my_file:
    reader = csv.reader(my_file, delimiter=',')
    x_test = list(reader)
with open(path_to_IMDB + 'training_labels.txt', 'r') as my_file:
    y_train = my_file.read().splitlines()
with open(path_to_IMDB + 'test_labels.txt', 'r') as my_file:
    y_test = my_file.read().splitlines()
# turn lists of strings into lists of integers
x_train = [[int(elt) for elt in sublist] for sublist in x_train]
x_test = [[int(elt) for elt in sublist] for sublist in x_test]
y_train = [int(elt) for elt in y_train]
y_test = [int(elt) for elt in y_test]
print('data loaded')
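# quick size check (the IMDB sentiment corpus ships with 25k training and 25k test reviews)
print(len(x_train), 'training reviews -', len(x_test), 'test reviews')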
# ========== pruning ==========
# only take into account the 'max_features' most frequent words
# disregard the most frequent words: those with index below 'stpwd_thd' are treated as stopwords
x_train = [[elt for elt in rev if elt<=max_features and elt>=stpwd_thd] for rev in x_train]
x_test = [[elt for elt in rev if elt<=max_features and elt>=stpwd_thd] for rev in x_test]
print('pruning done')
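# illustrative example (hypothetical indexes), with stpwd_thd=10 and max_features=2e4:
# [3, 57, 25000, 12] -> [57, 12] (3 falls in the stopword range, 25000 is out of the vocabulary)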
# ========== truncation and padding ==========
# truncate reviews of size larger than 'max_size' to their 'max_size' first words
x_train = [rev[:max_size] for rev in x_train]
x_test = [rev[:max_size] for rev in x_test]
# pad reviews shorter than 'max_size' with zeroes
# the vector of the 0th index will be set to all zeroes (zero padding strategy)
print('padding', len([elt for elt in x_train if len(elt) < max_size]), 'reviews from the training set')
x_train = [rev + [0]*(max_size-len(rev)) if len(rev) < max_size else rev for rev in x_train]
# sanity check: all reviews should now be of size 'max_size'
if set(len(rev) for rev in x_train) == {max_size}:
    print('1st sanity check passed')
else:
    print('1st sanity check failed!')
print('padding', len([elt for elt in x_test if len(elt) < max_size]), 'reviews from the test set')
x_test = [rev + [0]*(max_size-len(rev)) if len(rev) < max_size else rev for rev in x_test]
if set(len(rev) for rev in x_test) == {max_size}:
    print('2nd sanity check passed')
else:
    print('2nd sanity check failed!')
print('truncation and padding done')
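# illustrative example (hypothetical indexes), with max_size=5: [7, 42, 13] -> [7, 42, 13, 0, 0]
# i.e., zeroes are appended at the end of the review ('post' padding)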
# ========== loading pre-trained word vectors ==========
# invert mapping
index_to_word = dict((v,k) for k, v in word_to_index.items())
# to display the 'stopwords'
print('stopwords are:',[index_to_word[idx] for idx in range(1,stpwd_thd)])
# convert integer reviews into word reviews
x_full = x_train + x_test
x_full_words = [[index_to_word[idx] for idx in rev if idx!=0] for rev in x_full]
all_words = [word for rev in x_full_words for word in rev]
print(len(all_words),'words')
print(len(list(set(all_words))),'unique words')
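# note that, given the pruning above, the number of unique words is at most max_features - stpwd_thd + 1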
if use_pretrained:
    # initialize word vectors
    word_vectors = Word2Vec(size=word_vector_dim, min_count=1)
    # create entries for the words in our vocabulary
    word_vectors.build_vocab(x_full_words)
    # sanity check
    if len(list(set(all_words))) == len(word_vectors.wv.vocab):
        print('3rd sanity check passed')
    else:
        print('3rd sanity check failed!')
    # fill entries with the pre-trained word vectors
    path_to_pretrained_wv = ''
    word_vectors.intersect_word2vec_format(path_to_pretrained_wv + 'GoogleNews-vectors-negative300.bin', binary=True)
    print('pre-trained word vectors loaded')
    # NOTE: in-vocab words without an entry in the binary file are not removed from the vocabulary
    # instead, their vectors silently keep the random values they were initialized with
    # if necessary, those vectors can be detected via their norms, which are close to zero
    #norms = [np.linalg.norm(word_vectors[word]) for word in word_vectors.wv.vocab.keys()]
    #idxs_zero_norms = [idx for idx, norm in enumerate(norms) if norm < 0.05]
    # most of those words are proper nouns, like patton, deneuve, etc.
    # they don't have an entry in the word vectors because we lowercased the text
    #no_entry_words = [list(word_vectors.wv.vocab.keys())[idx] for idx in idxs_zero_norms]
    # create numpy array of embeddings
    embeddings = np.zeros((max_features + 1, word_vector_dim))
    for word in word_vectors.wv.vocab.keys():
        idx = word_to_index[word]
        # word_to_index is 1-based! the 0-th row, used for padding, stays at zero
        embeddings[idx,] = word_vectors[word]
    print('embeddings created')
else:
    print('not using pre-trained embeddings')
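# convert lists to numpy arrays before training; a defensive step, as some Keras
# versions expect arrays rather than nested lists in model.fit
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)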
# ========== training CNN ==========
#max([max(elt) for elt in x_full]) # (optional check) largest word index actually present in the data
my_input = Input(shape=(max_size,), dtype='int32') # shape is the 1-tuple (max_size,): the second dimension is deliberately left unspecified
# NOTE: the embedding table dimensions are based on max_features, ignoring stopword removal and truncation
# for instance, if initially max_features = 2e4, reviews are composed of integers from 1 to 2e4
# after truncation and stopword removal, the actual vocabulary size (number of unique integer values) may be much smaller
# however, if input_dim were based on this final vocabulary size, some integers still present in the reviews (anywhere in [1, 2e4]) would no longer have a row
# so we create the embedding lookup table from the original max_features value, knowing that the removed words simply never get looked up
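# e.g., if max_features = 2e4 but only ~15k distinct indexes survive pruning (a hypothetical figure),
# the lookup table still needs rows up to index 2e4 so that every surviving integer maps to a row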
if use_pretrained:
    embedding = Embedding(input_dim=max_features + 1, # vocab size, including the 0-th word used for padding
                          output_dim=word_vector_dim,
                          #input_length=max_size, # length of input sequences
                          dropout=drop_rate, # see http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf
                          #embeddings_constraint=maxnorm(3.),
                          weights=[embeddings], # we pass our pre-trained embeddings
                          trainable=do_non_static
                          )(my_input)
else:
    embedding = Embedding(input_dim=max_features + 1, # vocab size, including the 0-th word used for padding
                          output_dim=word_vector_dim,
                          #input_length=max_size, # length of input sequences
                          dropout=drop_rate, # see http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf
                          #embeddings_constraint=maxnorm(3.),
                          trainable=do_non_static
                          )(my_input)
# size of the feature map should be equal to max_size-filter_length+1
conv_1 = Convolution1D(nb_filter=nb_filters,
                       filter_length=4, # region size
                       activation='relu',
                       )(embedding)
pooled_conv_1 = GlobalMaxPooling1D()(conv_1)
conv_2 = Convolution1D(nb_filter=nb_filters,
                       filter_length=5, # region size
                       activation='relu',
                       )(embedding)
pooled_conv_2 = GlobalMaxPooling1D()(conv_2)
conv_3 = Convolution1D(nb_filter=nb_filters,
                       filter_length=6, # region size
                       activation='relu',
                       )(embedding)
pooled_conv_3 = GlobalMaxPooling1D()(conv_3)
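# with the default max_size=400, the three feature maps have lengths 397, 396 and 395
# (max_size - filter_length + 1); global max pooling then turns each into a vector of nb_filters values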
merge = Merge(mode='concat')([pooled_conv_1, pooled_conv_2, pooled_conv_3])
merge_dropped = Dropout(drop_rate)(merge) # adding this layer improved test set accuracy by almost 2%
# we finally project onto a single-unit output layer with sigmoid activation
prob = Dense(output_dim=1, # dimensionality of the output space
             activation='sigmoid'#,
             #W_constraint=maxnorm(3.) # constrain the L2 norm of the weights. Slows down convergence (more epochs needed) but does not improve performance. In the most recent versions of Keras this argument has been renamed 'kernel_constraint'
             )(merge_dropped)
model = Model(my_input, prob)
model.compile(loss='binary_crossentropy',
optimizer=my_optimizer,
metrics=['accuracy'])
print('model compiled')
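# optional: inspect the resulting architecture layer by layer
#model.summary()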
early_stopping = EarlyStopping(monitor='val_loss', # go through epochs as long as the loss on the validation set decreases
                               patience=my_patience,
                               mode='min')
if do_early_stopping:
    print('using early stopping strategy')
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              nb_epoch=nb_epoch,
              validation_data=(x_test, y_test),
              callbacks=[early_stopping])
else:
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              nb_epoch=nb_epoch,
              validation_data=(x_test, y_test))
# persist model to disk
if model_save:
    model.save(path_to_IMDB + name_save)
    print('model saved to disk')
#loss, acc = model.evaluate(x_test, y_test, batch_size = batch_size)
#print('final accuracy on test set:', acc)
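# sketch of inference with the trained model (x_test is reused here purely for illustration):
#probs = model.predict(x_test[:10], batch_size=batch_size) # predicted probability of the positive class
#preds = (probs > 0.5).astype(int) # 1 = positive review, 0 = negative review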