roberta_classifer.py
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import RobertaModel
from transformers import RobertaTokenizer
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


class RobertaClassifier(nn.Module):
    def __init__(self, freeze=False):
        super(RobertaClassifier, self).__init__()
        # Hidden size of RoBERTa, hidden size of our classifier head, and number of labels
        D_in, H, D_out = 768, 50, 3
        # Instantiate the pre-trained RoBERTa encoder
        self.bert = RobertaModel.from_pretrained("roberta-base")
        # Instantiate a one-hidden-layer feed-forward classifier head
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )
        # Optionally freeze the encoder so only the classifier head is trained
        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False
        # Move the full model to the GPU when one is available; doing this after the
        # classifier is created ensures both the encoder and the head are moved
        if torch.cuda.is_available():
            self.to('cuda')
    def get_optimizer(self, lr=1e-3):
        optimizer = torch.optim.AdamW(self.parameters(), lr=lr)
        return optimizer

    def get_scheduler(self, optimizer, data_loader, epochs):
        total_steps = len(data_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        return scheduler

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first token (<s>, RoBERTa's [CLS] equivalent) as the sequence representation
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        return logits
    def preprocess_for_roberta(self, data):
        # Initialize lists to store the input_ids and attention_masks
        input_ids = []
        attention_masks = []
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
        for text in tqdm(data):
            # Use tokenizer.encode_plus to tokenize and encode the text,
            # padding and truncating every example to a fixed length
            encoded = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=360,  # Determined from the processed text
                return_attention_mask=True,
            )
            # Add the input_ids and attention_mask to the lists
            input_ids.append(encoded.get('input_ids'))
            attention_masks.append(encoded.get('attention_mask'))
        # Convert the lists to tensors
        input_ids = torch.tensor(input_ids)
        attention_masks = torch.tensor(attention_masks)
        return input_ids, attention_masks
    def create_test_dataloader(self, inputs, masks, batch_size):
        test_data = TensorDataset(inputs, masks)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
        return test_dataloader

    def create_train_dataloader(self, inputs, masks, labels, batch_size):
        train_data = TensorDataset(inputs, masks, labels)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
        return train_dataloader