# bert.py
import re
import math
import torch
import numpy as np
from random import random, randrange, randint, shuffle  # explicit imports instead of a wildcard
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
text = (
'Hello, how are you? I am Romeo.\n' # R
'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
'Nice meet you too. How are you today?\n' # R
'Great. My baseball team won the competition.\n' # J
'Oh Congratulations, Juliet\n' # R
'Thank you Romeo\n' # J
'Where are you going today?\n' # R
'I am going shopping. What about you?\n' # J
'I am going to visit my grandmother. she is not very well' # R
)
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')  # strip '.', ',', '!', '?', '-' and split into a list of sentences
word_list = list(set(" ".join(sentences).split()))  # vocabulary, e.g. ['hello', 'how', 'are', 'you', ...] (order is arbitrary)
word2idx = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}
for i, w in enumerate(word_list):
    word2idx[w] = i + 4
idx2word = {i: w for i, w in enumerate(word2idx)}  # index -> word
vocab_size = len(word2idx)  # vocabulary size, including the special tokens
token_list = list()
for sentence in sentences:
    arr = [word2idx[s] for s in sentence.split()]
    token_list.append(arr)
'''
Example token_list (exact indices vary from run to run, since word_list is built from an unordered set):
[[4, 9, 28, 25, 22, 7, 5],
[4, 5, 23, 21, 36, 17, 10, 18, 34, 25],
[10, 34, 25, 12, 9, 28, 25, 13],
[38, 23, 39, 6, 33, 27, 14],
[19, 32, 17],
[8, 25, 5],
[26, 28, 25, 20, 13],
[22, 7, 20, 30, 11, 35, 25],
[22, 7, 20, 18, 31, 23, 29, 24, 36, 37, 15, 16]]
'''
'''
maxlen     : every sequence in a batch is fixed to 30 tokens, padded with [PAD] where needed
             (my implementation here is deliberately crude: the same fixed length is used for every batch)
max_pred   : the maximum number of tokens to predict, i.e. BERT's masked-LM (cloze) task
n_layers   : number of Encoder layers
n_heads    : number of attention heads
d_model    : dimension of the Token, Segment and Position embeddings
d_ff       : hidden dimension of the fully connected (feed-forward) layer in each Encoder layer
n_segments : how many sentences one model input consists of
'''
maxlen = 30
batch_size = 6
max_pred = 5 # max tokens of prediction
n_layers = 6
n_heads = 12
d_model = 768
d_ff = 768 * 4 # 4*d_model, FeedForward dimension
d_k = d_v = 64 # dimension of K(=Q), V
n_segments = 2
'''
In make_data() below, `positive` counts sentence pairs that really are consecutive and `negative`
counts pairs that are not; within one batch the two classes are kept balanced 1:1.
Whether the two randomly chosen sentences are consecutive is checked simply with
tokens_a_index + 1 == tokens_b_index.
Some tokens are then masked at random: n_pred is the number of tokens to mask, and cand_maked_pos
holds the candidate positions that may be masked ([CLS] and [SEP] are excluded, since masking them
would be meaningless). The candidates are shuffled, and the value of random() decides whether a
chosen token is replaced by [MASK] or by some other token.
Finally there are two rounds of zero padding. The first pads every sequence to the same length so
that all sequences in a batch match. The second pads the mask bookkeeping: sentences of different
lengths mask different numbers of tokens, but the batch needs a fixed number of mask slots, so the
remainder is filled with meaningless zeros (e.g. [0]).
'''
# sample IsNext and NotNext pairs so that they are balanced within the (small) batch
def make_data():
    batch = []
    positive = negative = 0
    while positive != batch_size / 2 or negative != batch_size / 2:  # keep IsNext and NotNext balanced 1:1
        tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))  # pick two sentences at random
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        input_ids = [word2idx['[CLS]']] + tokens_a + [word2idx['[SEP]']] + tokens_b + [word2idx['[SEP]']]
        # everything up to and including the middle [SEP] belongs to the first sentence
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)  # segment 0 for sentence A, segment 1 for sentence B

        # MASK LM: mask 15% of the tokens in this pair (at least 1, at most max_pred)
        n_pred = min(max_pred, max(1, int(len(input_ids) * 0.15)))
        cand_maked_pos = [i for i, token in enumerate(input_ids)  # candidate masked positions: ordinary words only,
                          if token != word2idx['[CLS]'] and token != word2idx['[SEP]']]  # [CLS] and [SEP] are never masked
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:  # the shuffle above makes this a random choice of positions
            masked_pos.append(pos)  # position of the masked word within input_ids
            masked_tokens.append(input_ids[pos])  # original vocabulary index of the masked word
            r = random()  # draw once so the 80/10/10 split is exact
            if r < 0.8:  # 80%: replace with [MASK], e.g. my dog is hairy -> my dog is [MASK]
                input_ids[pos] = word2idx['[MASK]']
            elif r < 0.9:  # 10%: replace with a random ordinary token, e.g. my dog is hairy -> my dog is apple
                index = randint(0, vocab_size - 1)
                while index < 4:  # never substitute [PAD], [CLS], [SEP] or [MASK]
                    index = randint(0, vocab_size - 1)
                input_ids[pos] = index
            # remaining 10%: leave the token unchanged, e.g. my dog is hairy -> my dog is hairy

        # Zero padding: pad token ids and segment ids up to maxlen
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero padding of the mask slots: pairs mask different numbers of tokens,
        # so pad masked_pos / masked_tokens up to max_pred with zeros
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_pos.extend([0] * n_pad)
            masked_tokens.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])  # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])  # NotNext
            negative += 1
    return batch
# Preprocessing finished
batch = make_data()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
torch.LongTensor(input_ids), torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens), \
torch.LongTensor(masked_pos), torch.LongTensor(isNext)
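# Optional peek at the generated data (an illustrative addition, not required by the rest of the
# pipeline): decode the first padded sample back into tokens to verify the
# [CLS] ... [SEP] ... [SEP] layout and the inserted [MASK] tokens.
print('sample 0:', [idx2word[int(i)] for i in input_ids[0] if int(i) != word2idx['[PAD]']])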
class MyDataSet(Data.Dataset):
    def __init__(self, input_ids, segment_ids, masked_tokens, masked_pos, isNext):
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_tokens = masked_tokens
        self.masked_pos = masked_pos
        self.isNext = isNext

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.segment_ids[idx], self.masked_tokens[idx],
                self.masked_pos[idx], self.isNext[idx])
# input_ids [6, 30]
# segment_ids [6, 30]
# masked_tokens [6, 5]
# masked_pos [6, 5]
# isNext [6]
loader = Data.DataLoader(MyDataSet(input_ids, segment_ids, masked_tokens, masked_pos, isNext), batch_size, shuffle=True)
# model components
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, seq_len = seq_q.size()
    # eq(0) marks [PAD] tokens (id 0); they must not receive any attention
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # [batch_size, 1, seq_len]
    return pad_attn_mask.expand(batch_size, seq_len, seq_len)  # [batch_size, seq_len, seq_len]
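# Illustrative sanity check (an added sketch, assuming [PAD] has id 0 as defined above):
# for a 1x4 sequence whose last token is [PAD], every query row should mask column 3 only.
_pad_demo = torch.LongTensor([[5, 7, 9, 0]])
_pad_mask = get_attn_pad_mask(_pad_demo, _pad_demo)  # [1, 4, 4]
assert _pad_mask.shape == (1, 4, 4)
assert _pad_mask[0, :, 3].all() and not _pad_mask[0, :, :3].any()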
def gelu(x):
    """
    Implementation of the gelu activation function.
    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
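# Quick check (an added sketch): the erf-based gelu above should agree with PyTorch's
# built-in torch.nn.functional.gelu, whose default is the same exact-erf formulation.
_x_check = torch.randn(8)
assert torch.allclose(gelu(_x_check), torch.nn.functional.gelu(_x_check), atol=1e-6)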
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding (learned, not sinusoidal)
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment (token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        """
        Args:
            x: input_ids, [batch_size, seq_len]
            seg: segment_ids, [batch_size, seq_len]
        Returns:
            the summed and layer-normalised embeddings, [batch_size, seq_len, d_model]
        """
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
        pos = pos.unsqueeze(0).expand_as(x)  # [seq_len] -> [batch_size, seq_len]
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)
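# Shape check (an added sketch): token ids and segment ids of shape [batch_size, seq_len]
# should embed to [batch_size, seq_len, d_model]; a freshly initialised Embedding suffices here.
_ids_check = torch.zeros(2, 5, dtype=torch.long)
_segs_check = torch.zeros(2, 5, dtype=torch.long)
assert Embedding()(_ids_check, _segs_check).shape == (2, 5, d_model)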
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)  # scores: [batch_size, n_heads, seq_len, seq_len]
        scores.masked_fill_(attn_mask, -1e9)  # fill masked (padded) positions with -1e9 so softmax assigns them ~0 weight
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)  # [batch_size, n_heads, seq_len, d_v]
        return context
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        # output projection and layer norm are defined here (not inside forward)
        # so that their parameters are registered and actually trained
        self.fc = nn.Linear(n_heads * d_v, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask):
        # Q, K, V: [batch_size, seq_len, d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # q_s: [batch_size, n_heads, seq_len, d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # k_s: [batch_size, n_heads, seq_len, d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # v_s: [batch_size, n_heads, seq_len, d_v]
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # attn_mask: [batch_size, n_heads, seq_len, seq_len]
        # context: [batch_size, n_heads, seq_len, d_v]
        context = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)  # [batch_size, seq_len, n_heads*d_v]
        output = self.fc(context)
        return self.norm(output + residual)  # output: [batch_size, seq_len, d_model]
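# Shape check (an added sketch): multi-head self-attention should map
# [batch_size, seq_len, d_model] back to [batch_size, seq_len, d_model];
# an all-False mask means no position is treated as padding.
_q_check = torch.randn(2, 4, d_model)
_mask_check = torch.zeros(2, 4, 4, dtype=torch.bool)
assert MultiHeadAttention()(_q_check, _q_check, _q_check, _mask_check).shape == (2, 4, d_model)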
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)  # gelu activation is applied between fc1 and fc2

    def forward(self, x):
        # (batch_size, seq_len, d_model) -> (batch_size, seq_len, d_ff) -> (batch_size, seq_len, d_model)
        return self.fc2(gelu(self.fc1(x)))
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        # enc_inputs: [batch_size, seq_len, d_model]
        enc_outputs = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)  # enc_outputs: [batch_size, seq_len, d_model]
        return enc_outputs
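# Shape check (an added sketch): a single encoder layer preserves [batch_size, seq_len, d_model].
_enc_in_check = torch.randn(2, 4, d_model)
_enc_mask_check = torch.zeros(2, 4, 4, dtype=torch.bool)
assert EncoderLayer()(_enc_in_check, _enc_mask_check).shape == (2, 4, d_model)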
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.embedding = Embedding()
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
        # pooler on top of the [CLS] embedding, used by the next-sentence prediction head
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(d_model, 2)
        # transform applied to the masked positions before the LM head
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        # the LM decoder fc2 shares its weight with the token embedding
        embed_weight = self.embedding.tok_embed.weight
        self.fc2 = nn.Linear(d_model, vocab_size, bias=False)
        self.fc2.weight = embed_weight

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)  # [batch_size, seq_len, d_model]
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)  # [batch_size, maxlen, maxlen]
        for layer in self.layers:
            output = layer(output, enc_self_attn_mask)  # output: [batch_size, seq_len, d_model]
        # next-sentence prediction: pool the [CLS] embedding and classify
        output_cls = output[:, 0]  # [batch_size, d_model], the [CLS] embedding
        h_pooled = self.fc(output_cls)  # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled)  # [batch_size, 2], predicts isNext
        # masked LM: gather the embeddings at the masked positions and decode them
        masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model)  # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos)  # [batch_size, max_pred, d_model]
        h_masked = self.activ2(self.linear(h_masked))  # [batch_size, max_pred, d_model]
        logits_lm = self.fc2(h_masked)  # [batch_size, max_pred, vocab_size]
        return logits_lm, logits_clsf
model = BERT()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.001)
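# Quick forward-pass sketch before training (added for illustration): on the prepared batch,
# logits_lm should be [batch_size, max_pred, vocab_size] and logits_clsf should be [batch_size, 2].
with torch.no_grad():
    _lm_logits, _clsf_logits = model(input_ids, segment_ids, masked_pos)
assert _lm_logits.shape == (batch_size, max_pred, vocab_size)
assert _clsf_logits.shape == (batch_size, 2)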
# train
for epoch in range(200):
    for input_ids, segment_ids, masked_tokens, masked_pos, isNext in loader:
        logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)  # [batch_size, 2] for logits_clsf
        loss_lm = criterion(logits_lm.view(-1, vocab_size), masked_tokens.view(-1))  # masked-LM loss; note that padded mask slots (target 0) are also counted
        loss_lm = (loss_lm.float()).mean()
        loss_clsf = criterion(logits_clsf, isNext)  # next-sentence classification loss
        loss = loss_lm + loss_clsf  # optimise both objectives jointly
        if (epoch + 1) % 10 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# test
# Predict the masked tokens and isNext for one sample
model.eval()  # switch off dropout for inference
input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[0]
print(text)
print([idx2word[w] for w in input_ids if idx2word[w] != '[PAD]'])
logits_lm, logits_clsf = model(torch.LongTensor([input_ids]),
                               torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ', [pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ', [pos for pos in logits_lm if pos != 0])
logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ', True if logits_clsf else False)