# models_bert.py
import torch
from torch import nn
from transformers import BertForSequenceClassification
class outputclass(nn.Module):
    '''
    Lightweight output container so that loss and logits can be accessed as
    attributes of the output of our BERT model + classifier, mirroring the
    object returned by BertForSequenceClassification.
    This is so we can use the same train function for both
    BertForSequenceClassification and BertForSequenceClassification with our
    custom classifier.
    Example use:
        outputs = model(tokens, token_type_ids, attention_mask, labels)
        outputs.loss
        >>> tensor
    '''
    def __init__(self, loss=None, logits=None):
        super().__init__()
        self.loss = loss
        self.logits = logits
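
# A minimal sketch (assuming a `model` of either type and a tensor batch
# already exist) of how outputclass makes the two model types interchangeable
# in a train step:
#
#     outputs = model(tokens, token_type_ids, attention_mask, labels)
#     outputs.loss.backward()               # works for both model types
#     preds = outputs.logits.argmax(dim=-1)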
class BertWithCustomClassifier(nn.Module):
    """
    PyTorch nn.Module composed of a BERT model which has its classifier layers
    replaced with our own classifier. The classifier layers are initialised in
    the same way as the BERT layers. Using outputclass, this model outputs the
    same .logits and .loss attributes as the unmodified
    BertForSequenceClassification model.
    Instantiation:
        nb_hidden (int) : number of hidden units in the hidden layer of the
                          classification head
    Methods:
        forward : PyTorch forward pass as usually done in an nn.Module
            tokens (tensor) : batch of input token ids
            token_type_ids (None) : unused; only kept so the inputs stay
                                    consistent between models (the same train
                                    function then also works with the
                                    unmodified BertForSequenceClassification)
            attention_mask (tensor) : batch of attention masks
            labels (tensor) : batch of labels
        freeze_bert : freezes all parameters that are not in the classification head
            freeze (bool) : True = freeze the parameters ; False = don't
    """
    def __init__(self, nb_hidden=500):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
        # Using the same dropout probability as BertForSequenceClassification
        # https://forums.pytorchlightning.ai/t/difference-between-bertforsequenceclassification-and-bert-nn-linear/470/2
        dropout_p = self.bert.config.hidden_dropout_prob
        # Replacing the default classifier layer with our own:
        # https://github.com/huggingface/transformers/issues/1001
        # inspired by RoBERTa's classifier https://github.com/huggingface/transformers/blob/19e5ed736611227b004c6f55679ce3536db3c28d/src/transformers/models/roberta/modeling_roberta.py#L1443
        self.bert.classifier = nn.Sequential(
            nn.Dropout(p=dropout_p),
            nn.Linear(768, nb_hidden),  # 768 = hidden size of bert-base-uncased
            nn.Tanh(),
            nn.Dropout(p=dropout_p),
            nn.Linear(nb_hidden, 2),    # one output logit per label
        )
        # Initialise the classifier weights the same way that BERT does it
        # from: https://forums.pytorchlightning.ai/t/difference-between-bertforsequenceclassification-and-bert-nn-linear/470/2
        for layer in self.bert.classifier:
            self.bert._init_weights(layer)
        self.loss = nn.CrossEntropyLoss()
        self.outputclass = outputclass(loss=None, logits=None)
    def forward(self, tokens, token_type_ids, attention_mask, labels):
        # token_type_ids is ignored; it is only accepted so the signature
        # matches the unmodified BertForSequenceClassification
        self.outputclass.logits = self.bert(input_ids=tokens,
                                            attention_mask=attention_mask).logits
        self.outputclass.loss = self.loss(self.outputclass.logits, labels)
        # our outputclass formats the output the same way as BertForSequenceClassification
        return self.outputclass
    def freeze_bert(self, freeze=True):
        """
        Freezes only the BERT layers so that the classifier can be trained.
        Inputs:
            freeze (bool) : True = freeze the parameters; False = don't
        Outputs:
            None
        """
        # self.bert.bert is the BERT encoder; the classifier head is left untouched
        # from: https://github.com/huggingface/transformers/issues/400
        for param in self.bert.bert.parameters():
            param.requires_grad = (not freeze)
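
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original training pipeline): assumes
# the `transformers` BertTokenizer and a dummy two-sentence batch; the label
# values and shapes below are illustrative only.
if __name__ == "__main__":
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    batch = tokenizer(["a short example sentence", "another one"],
                      padding=True, return_tensors="pt")
    labels = torch.tensor([0, 1])

    model = BertWithCustomClassifier(nb_hidden=500)
    model.freeze_bert(True)  # train only the classification head

    outputs = model(tokens=batch["input_ids"],
                    token_type_ids=None,
                    attention_mask=batch["attention_mask"],
                    labels=labels)
    print(outputs.loss, outputs.logits.shape)  # scalar loss, (2, 2) logits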