preprocessing_bert.py
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer

def get_text_label_values(df):
    '''
    Return the text and labels separately from the dataframe containing both.
    Inputs:
        df (DataFrame) : dataframe containing the tweets and the labels
    Outputs:
        tweets (np.ndarray) : array holding one tweet string per cell
        labels (np.ndarray) : the labels corresponding to each tweet
    '''
    tweets = df.text.values
    labels = df.label.values
    return tweets, labels
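
# Illustrative usage (toy data; assumes the 'text'/'label' columns this module expects):
#   df = pd.DataFrame({'text': ['good day', 'awful day'], 'label': [1, 0]})
#   tweets, labels = get_text_label_values(df)  # two numpy arrays of equal length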

# Tokenizers
def tokenize_with_encode_plus(df, max_len=140):
    '''
    Tokenize tweets using the pretrained BERT tokenizer.
    Uses the encode_plus function for tokenization.
    Inputs:
        df (DataFrame) : dataframe containing the tweets and labels
        max_len : maximum sequence length in tokens; only max_len-2 tokens of each tweet
                  are kept, because two tokens mark the beginning and end of the sequence
    Outputs:
        input_ids (tensor) : tensor containing the input ids
        attention_masks (tensor) : tensor containing the attention masks
        labels (tensor) : tensor containing the labels
    '''
    # take the text and labels from the dataframe
    tweets, labels = get_text_label_values(df)
    # initialize container variables
    input_ids, attention_masks = [], []
    # load the BERT tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    for tweet in tqdm(tweets):
        # tokenize the tweet and map each token to the integer id BERT understands,
        # adding the special [CLS] token (id 101) at the start and the [SEP] token (id 102) at the end
        # tweets longer than max_len-2 tokens are truncated ([CLS] and [SEP] take two slots);
        # shorter tweets are padded with 0 up to max_len
        # the attention mask is 1 over real tokens and 0 over padding
        encoded_tweet = tokenizer.encode_plus(tweet, add_special_tokens=True, max_length=max_len,
                                              padding='max_length', truncation=True,
                                              return_attention_mask=True, return_tensors='pt')
        # collect the input ids and attention masks
        input_ids.append(encoded_tweet['input_ids'])
        attention_masks.append(encoded_tweet['attention_mask'])
    # stack everything into torch tensors of shape (n_tweets, max_len)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels
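
# A minimal sketch (not part of the original pipeline) of how these three aligned
# tensors are typically wrapped for training; the default batch_size of 32 is an
# assumption, as is the helper name build_dataloader.
def build_dataloader(input_ids, attention_masks, labels, batch_size=32):
    from torch.utils.data import TensorDataset, DataLoader, RandomSampler
    # bundle the tensors so each batch yields (ids, masks, labels) triples
    dataset = TensorDataset(input_ids, attention_masks, labels)
    # the random sampler reshuffles the tweets at every epoch
    return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)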

def tokenize_with_autoencoder(df, max_len=140):
    '''
    Tokenize tweets using the pretrained BERT tokenizer.
    Calls the tokenizer directly instead of encode_plus, which is slightly faster.
    Inputs:
        df (DataFrame) : dataframe containing the tweets and labels
        max_len : maximum sequence length in tokens; only max_len-2 tokens of each tweet
                  are kept, because two tokens mark the beginning and end of the sequence
    Outputs:
        input_ids (tensor) : tensor containing the input ids
        attention_masks (tensor) : tensor containing the attention masks
        labels (tensor) : tensor containing the labels
    '''
    # take the text and labels from the dataframe
    tweets, labels = get_text_label_values(df)
    # initialize container variables
    input_ids, attention_masks = [], []
    # load the BERT tokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    for tweet in tqdm(tweets):
        # same encoding scheme as above: add [CLS]/[SEP], truncate to max_len,
        # pad with 0, and build the attention mask over the real tokens
        encoded_tweet = tokenizer(tweet, add_special_tokens=True, max_length=max_len,
                                  padding='max_length', truncation=True,
                                  return_attention_mask=True, return_tensors='pt')
        # collect the input ids and attention masks
        input_ids.append(encoded_tweet['input_ids'])
        attention_masks.append(encoded_tweet['attention_mask'])
    # stack everything into torch tensors of shape (n_tweets, max_len)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels
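
# An alternative sketch: Hugging Face tokenizers also accept the whole list of
# tweets in a single call, which skips the Python loop and returns the stacked
# tensors directly. tokenize_batched is a hypothetical helper, not a function
# from the original module.
def tokenize_batched(df, max_len=140):
    tweets, labels = get_text_label_values(df)
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    # one call over the full list; the 'pt' tensors come back already batched
    encoded = tokenizer(list(tweets), add_special_tokens=True, max_length=max_len,
                        padding='max_length', truncation=True,
                        return_attention_mask=True, return_tensors='pt')
    return encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels)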

def tokenize_with_autotokenizer_test(df, max_len=140):
    '''
    Tokenize tweets using the pretrained BERT tokenizer for the test dataset, which has no labels.
    Calls the tokenizer directly instead of encode_plus, which is slightly faster.
    Inputs:
        df (DataFrame) : dataframe containing the tweets
        max_len : maximum sequence length in tokens; only max_len-2 tokens of each tweet
                  are kept, because two tokens mark the beginning and end of the sequence
    Outputs:
        input_ids (tensor) : tensor containing the input ids
        attention_masks (tensor) : tensor containing the attention masks
    '''
    tweets = df.text.values
    input_ids, attention_masks = [], []
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    for tweet in tqdm(tweets):
        encoded_tweet = tokenizer(tweet, add_special_tokens=True, max_length=max_len,
                                  padding='max_length', truncation=True,
                                  return_attention_mask=True, return_tensors='pt')
        input_ids.append(encoded_tweet['input_ids'])
        attention_masks.append(encoded_tweet['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks
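
# A minimal smoke test, assuming only the 'text'/'label' column layout used above;
# the toy tweets are illustrative, and running this downloads bert-base-uncased.
if __name__ == '__main__':
    train_df = pd.DataFrame({'text': ['what a great day', 'worst service ever'],
                             'label': [1, 0]})
    ids, masks, labels = tokenize_with_encode_plus(train_df, max_len=16)
    print(ids.shape, masks.shape, labels.shape)  # (2, 16), (2, 16), (2,)

    test_df = pd.DataFrame({'text': ['no label here']})
    test_ids, test_masks = tokenize_with_autotokenizer_test(test_df, max_len=16)
    print(test_ids.shape, test_masks.shape)  # (1, 16), (1, 16)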