-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGED_Main.py
464 lines (369 loc) · 19.8 KB
/
GED_Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
# -*- coding: utf-8 -*-
"""GED_SHL.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1fZJx2kRQGb8hGQLeFpUrP9JX1lDLgGiW
# Installing packages and using GPU
"""
import torch
# Pick the compute device once; the model and every batch below are moved to
# this module-level `device`.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # No GPU: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
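# Notebook-only setup: `!` shell magics are not valid in a plain .py file.
# Install the dependencies from a shell before running this script:
#   pip install git+https://github.com/huggingface/transformers
#   pip install tqdm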
"""# Data Pre-processing and Dataset Loaders"""
# importing required libraries and modules
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from tqdm import tqdm
# defining the class for DebertaDataset
class DebertaDataset:
    """Tokenize a list of sentences and expose DataLoaders over the result.

    The sentences are tokenized once in ``__init__``; the same tensors back
    three loaders:

    * ``train_dataloader`` - (input_ids, attention_mask, labels), shuffled
    * ``val_dataloader``   - (input_ids, attention_mask, labels), in order
    * ``test_dataloader``  - (input_ids, attention_mask), in order

    Args:
        sentences: iterable of strings to encode.
        labels: optional sequence of integer class labels, one per sentence.
            When omitted, an all-zero ``torch.long`` tensor is used as a
            placeholder.
        tokenizer_name: Hugging Face tokenizer identifier.
        batch_size: batch size used by all three loaders.
        max_length: every sentence is padded/truncated to this many tokens.
    """

    def __init__(self, sentences, labels=None, tokenizer_name='sileod/deberta-v3-base-tasksource-nli', batch_size=16, max_length=128):
        # instantiating a tokenizer object
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.batch_size = batch_size
        self.max_length = max_length

        # creating input ids, attention masks and labels for the given sentences
        self.input_ids, self.attention_masks, self.labels = self._prepare_data(sentences, labels)

        # creating dataloaders for training, validation and testing
        self.train_dataloader = self._create_train_dataloader()
        self.val_dataloader = self._create_val_dataloader()
        self.test_dataloader = self._create_test_dataloader()

    def _prepare_data(self, sentences, labels):
        """Encode ``sentences`` and return ``(input_ids, attention_masks, labels)``.

        ``input_ids`` and ``attention_masks`` are the tokenizer's PyTorch
        tensors, one row per sentence, each row ``self.max_length`` long.
        """
        texts = list(sentences)

        # One batched tokenizer call replaces the per-sentence encode_plus loop.
        # `padding='max_length'` + `truncation=True` is the supported spelling
        # of the deprecated `pad_to_max_length=True`, and makes the truncation
        # to `max_length` explicit instead of relying on a warning fallback.
        encoded = self.tokenizer(
            texts,
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
            max_length=self.max_length,    # Pad & truncate all sentences.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,    # Construct attn. masks.
            return_tensors='pt',           # Return pytorch tensors.
        )
        input_ids = encoded['input_ids']
        attention_masks = encoded['attention_mask']

        if labels is not None:
            labels = torch.tensor(labels)
        else:
            # Unlabelled data: placeholder labels so the labelled loaders can
            # still be constructed.
            labels = torch.zeros(len(texts), dtype=torch.long)

        return input_ids, attention_masks, labels

    def _create_train_dataloader(self):
        """Return a shuffled loader yielding (input_ids, attention_mask, labels)."""
        dataset = TensorDataset(self.input_ids, self.attention_masks, self.labels)
        return DataLoader(
            dataset,                         # The samples.
            sampler=RandomSampler(dataset),  # Select batches randomly
            batch_size=self.batch_size,      # Trains with this batch size.
        )

    def _create_val_dataloader(self):
        """Return an in-order loader yielding (input_ids, attention_mask, labels)."""
        dataset = TensorDataset(self.input_ids, self.attention_masks, self.labels)
        return DataLoader(
            dataset,                             # The samples.
            sampler=SequentialSampler(dataset),  # Select batches sequentially
            batch_size=self.batch_size,          # Validates with this batch size.
        )

    def _create_test_dataloader(self):
        """Return an in-order loader yielding (input_ids, attention_mask) only."""
        dataset = TensorDataset(self.input_ids, self.attention_masks)
        return DataLoader(
            dataset,                             # The samples.
            sampler=SequentialSampler(dataset),  # Select batches sequentially
            batch_size=self.batch_size,          # Tests with this batch size.
        )
"""References"""
# @article{sileo2023tasksource,
# title={tasksource: Structured Dataset Preprocessing Annotations for Frictionless Extreme Multi-Task Learning and Evaluation},
# author={Sileo, Damien},
# url= {https://arxiv.org/abs/2301.05948},
# journal={arXiv preprint arXiv:2301.05948},
# year={2023}
# }
"""# DeBERTaV3 + CNN Classifier Model"""
import torch.nn as nn
from transformers import AutoModel
num_classes = 2
# Define a class DebertaClassifier that inherits from nn.Module
class DebertaClassifier(nn.Module):
    """DeBERTa encoder followed by a Conv1d + max-pool classification head.

    ``forward`` returns a tuple: ``(probs,)`` without labels, or
    ``(loss, probs)`` when ``labels`` is given, where ``probs`` is the softmax
    over the classes for each input row.

    Args:
        num_classes: number of output classes of the final linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Load the pre-trained DeBERTa model
        self.deberta = AutoModel.from_pretrained('yevheniimaslov/deberta-v3-base-cola')
        # Dropout with 10% probability, applied to the encoder output
        self.dropout = nn.Dropout(0.1)
        # Conv over the token axis. The input width is read from the encoder's
        # config rather than hard-coded (it was a literal 768 before).
        hidden_size = self.deberta.config.hidden_size
        self.conv1d = nn.Conv1d(in_channels=hidden_size, out_channels=256, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        # Max over the token axis -> one 256-d vector per sentence
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(256, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def _compute_logits(self, last_hidden_state):
        """Map encoder states (batch, seq, hidden) to raw logits (batch, classes)."""
        # Conv1d expects channels second: (batch, hidden, seq)
        x = last_hidden_state.transpose(1, 2)
        x = self.dropout(x)
        x = self.relu(self.conv1d(x))
        # squeeze(-1) removes only the pooled length-1 axis. A bare squeeze()
        # would also drop the batch axis for a batch of one and break the
        # dim=1 softmax that follows.
        x = self.pooling(x).squeeze(-1)
        return self.fc1(x)

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Run the encoder and head.

        Args:
            input_ids: token id tensor passed to the encoder.
            attention_mask: attention mask tensor passed to the encoder.
            token_type_ids: optional segment ids passed to the encoder.
            labels: optional class-index tensor; when given, the
                cross-entropy loss is prepended to the returned tuple.

        Returns:
            ``(probs,)`` or ``(loss, probs)``.
        """
        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = self._compute_logits(encoder_out.last_hidden_state)

        # Callers only ever argmax this, so probabilities are still returned.
        probs = self.softmax(logits)
        outputs = (probs,)

        if labels is not None:
            # CrossEntropyLoss applies log-softmax itself, so it must be fed
            # the raw logits. Feeding it `probs` (as before) is a double
            # softmax that flattens the loss and its gradients.
            loss_fct = nn.CrossEntropyLoss()
            # Use this instance's class count, not the module-level global.
            loss = loss_fct(logits.view(-1, self.fc1.out_features), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs
"""# Training Validation and Testing"""
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from tqdm import tqdm
import csv
class DebertaTrainer:
    """Fine-tune, validate and run inference with a ``DebertaClassifier``.

    Args:
        dataset_train: object exposing ``train_dataloader``.
        dataset_val: object exposing ``val_dataloader``.
        dataset_test: object exposing ``test_dataloader``.
        num_classes: number of classes for the classifier head.
        learning_rate, eps, weight_decay, betas: AdamW hyperparameters.
        num_epochs: number of passes over the training loader.
        warmup_prop: fraction of all training steps used for linear warm-up.
    """

    def __init__(self, dataset_train, dataset_val, dataset_test, num_classes=2, learning_rate=2e-5, eps=1e-8, weight_decay=0.01, betas=(0.9, 0.999), num_epochs=2, warmup_prop=0.1):
        # Initialize DebertaClassifier model with specified number of classes
        self.model = DebertaClassifier(num_classes)
        # Move to the selected device. `.cuda()` would raise on a CPU-only
        # machine even though `device` already falls back to the CPU.
        self.model.to(device)

        # Initialize hyperparameters
        self.learning_rate = learning_rate
        self.eps = eps
        self.weight_decay = weight_decay
        self.betas = betas
        self.num_epochs = num_epochs
        self.warmup_prop = warmup_prop

        # Dataloaders for train, validation, and test datasets
        self.train_dataloader = dataset_train.train_dataloader
        self.validation_dataloader = dataset_val.val_dataloader
        self.test_dataloader = dataset_test.test_dataloader

        # torch.optim.AdamW takes the same keyword arguments and replaces the
        # deprecated AdamW re-exported by transformers.
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.learning_rate,
            eps=self.eps,
            weight_decay=self.weight_decay,
            betas=self.betas,
        )
        total_steps = len(self.train_dataloader) * self.num_epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(total_steps * self.warmup_prop),  # step counts are integers
            num_training_steps=total_steps,
        )

    def _train_epoch(self, epoch):
        """Train for one pass over the training loader and print epoch metrics.

        Args:
            epoch: zero-based epoch index (only used in progress/log text).
        """
        self.model.train()

        train_loss = 0
        num_train_steps = 0
        # Predictions and labels are gathered for the whole epoch so that F1
        # and precision are computed once over all samples. Averaging
        # per-batch macro scores is not the same number and is unstable when
        # a batch lacks a class.
        epoch_predictions = []
        epoch_labels = []

        train_iterator = tqdm(self.train_dataloader, desc="Training")
        for batch in train_iterator:
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            # Clear the gradients
            self.model.zero_grad()

            # Forward pass: loss plus per-class scores
            loss, scores = self.model(input_ids, attention_mask=attention_masks, labels=labels)

            loss.backward()
            # Clip the gradients to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()

            train_loss += loss.item()
            num_train_steps += 1
            epoch_predictions.extend(torch.argmax(scores, dim=1).cpu().numpy())
            epoch_labels.extend(labels.cpu().numpy())

            # Update the progress bar with the current epoch and training loss
            train_iterator.set_description(f"Epoch {epoch+1} - Train loss: {train_loss/num_train_steps:.3f}")

        train_loss /= num_train_steps
        # sklearn metrics take (y_true, y_pred). With the arguments swapped,
        # the value printed as "precision" was really macro recall.
        train_f1 = f1_score(epoch_labels, epoch_predictions, average='macro')
        train_precision = precision_score(epoch_labels, epoch_predictions, average='macro')

        print(f"Epoch {epoch+1} - Training loss: {train_loss:.3f}, Training F1 score: {train_f1:.3f}, Training Precision: {train_precision:.3f}")

    def evaluate(self):
        """Score the current model on the validation loader.

        Returns:
            ``(val_loss, val_f1, val_precision, cm)``: mean batch loss, macro
            F1 and macro precision over the whole validation set, and the
            confusion matrix of labels vs. predictions.
        """
        self.model.eval()

        val_loss = 0
        num_val_steps = 0
        all_predictions = []
        all_labels = []

        for batch in tqdm(self.validation_dataloader, desc="Validation"):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)
            labels = batch[2].to(device)

            # Disable gradient computation
            with torch.no_grad():
                loss, scores = self.model(input_ids, attention_mask=attention_masks, labels=labels)

            val_loss += loss.item()
            num_val_steps += 1
            all_predictions.extend(torch.argmax(scores, dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        val_loss /= num_val_steps
        # Metrics over the full validation set, in sklearn's (y_true, y_pred)
        # order, consistent with the confusion matrix below.
        val_f1 = f1_score(all_labels, all_predictions, average='macro')
        val_precision = precision_score(all_labels, all_predictions, average='macro')
        cm = confusion_matrix(all_labels, all_predictions)

        return (val_loss, val_f1, val_precision, cm)

    def train(self):
        """Train for ``num_epochs`` epochs, checkpointing the best model.

        After each epoch the model is validated; whenever the validation loss
        improves, the state dict is written to ``model.pt``.
        """
        best_val_loss = float('inf')

        for epoch in range(self.num_epochs):
            self._train_epoch(epoch)

            # `cm`, not `confusion_matrix`: avoid shadowing the sklearn function.
            val_loss, val_f1, val_precision, cm = self.evaluate()

            # Save the model if the validation loss has improved
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.model.state_dict(), "model.pt")
                print(f"Saved model at epoch {epoch + 1}:")

            print(f"Epoch {epoch + 1}:")
            print(f"Validation loss: {val_loss:.3f}, Validation F1 score: {val_f1:.3f}, Validation Precision: {val_precision:.3f}")
            print(f"Confusion matrix:\n{cm}")

    def test(self):
        """Load ``model.pt`` and predict a label for every test sentence.

        Returns:
            A list with one predicted class index per test sample, in loader
            order.
        """
        # map_location lets a checkpoint saved on GPU be loaded on a CPU host.
        self.model.load_state_dict(torch.load("model.pt", map_location=device))
        self.model.eval()

        predictions = []
        for batch in tqdm(self.test_dataloader, desc="Testing"):
            input_ids = batch[0].to(device)
            attention_masks = batch[1].to(device)

            # Disable gradient computation
            with torch.no_grad():
                scores, = self.model(input_ids, attention_mask=attention_masks)

            # torch.argmax keeps this method independent of `np`, which the
            # file only imports further down. The loader is sequential, so
            # the list is already in input order and needs no second pass.
            predictions.extend(torch.argmax(scores, dim=1).cpu().numpy())

        return predictions
"""References"""
# https://arxiv.org/abs/2111.09543
# https://huggingface.co/yevheniimaslov/deberta-v3-base-cola/
"""# Runner Code"""
# import necessary libraries
import pandas as pd
import random
import numpy as np
from sklearn.metrics import confusion_matrix
# load training, validation, and testing data
train_df = pd.read_csv('/content/sample_data/train_data.csv')
val_df = pd.read_csv('/content/sample_data/val_data.csv')
test_df = pd.read_excel("/content/sample_data/test_data.xlsx")

# convert input column in test data to string
test_df['input'] = test_df['input'].astype(str)

# get sentences and labels from training and validation data
sentences_train = train_df['input'].tolist()
labels_train = train_df['labels'].tolist()
sentences_val = val_df['input'].tolist()
labels_val = val_df['labels'].tolist()

# get sentences from test data
sentences_test = test_df['input'].tolist()

# # Set the seed value all over the place to make this reproducible.
# seed_val = 42
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)

# create DebertaDataset objects for the train, validation, and test datasets
train_dataset = DebertaDataset(sentences_train, labels_train, tokenizer_name='sileod/deberta-v3-base-tasksource-nli', batch_size=16)
val_dataset = DebertaDataset(sentences_val, labels_val, tokenizer_name='sileod/deberta-v3-base-tasksource-nli', batch_size=16)
test_dataset = DebertaDataset(sentences_test, tokenizer_name='sileod/deberta-v3-base-tasksource-nli', batch_size=16)

# create a DebertaTrainer object with specified parameters
deberta_model = DebertaTrainer(dataset_train=train_dataset, dataset_val=val_dataset, dataset_test=test_dataset,
                               num_classes=2,
                               learning_rate=2e-5,
                               eps=1e-8,
                               weight_decay=0.01,
                               betas=(0.9, 0.999),
                               num_epochs=2,
                               warmup_prop=0.1)

# train the Deberta model and save the best one
deberta_model.train()

# Final precision and F1-scores on validation set along with the confusion matrix.
# The matrix is bound to `val_confusion_matrix`: assigning it to `confusion_matrix`
# at module level would overwrite the imported sklearn function that
# DebertaTrainer.evaluate() calls, breaking every later evaluate().
val_loss, val_f1, val_precision, val_confusion_matrix = deberta_model.evaluate()
print(f"Validation loss: {val_loss:.3f}, Validation F1 score: {val_f1:.3f}, Validation Precision: {val_precision:.3f}")
print(f"Confusion matrix:\n{val_confusion_matrix}")

# Test the model
test_preds = deberta_model.test()

"""# Saving the predictions of test data"""

print(len(test_preds))

# Write the test_sentence and corresponding predicted-label pairs to a CSV file
test_results = pd.DataFrame({"input": sentences_test, "labels": test_preds})
test_results.to_csv('anannyo_dey_submission.csv')