SentimentAnalysis/helper.py at master · b-rad15/SentimentAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
from keras.preprocessing.text import Tokenizer
import re
from nltk.tokenize import word_tokenize
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, GRU, Dense
import pickle
import tensorflow as tf


def prep_data(filename: str):
    # training
    x_data = []
    y_data = []
    with open(filename, 'r') as f:
        for i in f:
            line_split = i.split(';')
            y_data.append(line_split[1].strip())
            x_data.append(line_split[0])
    data = pd.DataFrame({'Text': x_data, 'Emotion': y_data})
    return data


def make_tokenizer(texts):
    # tokenization
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer


regex1 = re.compile(r"(#[\d\w\.]+)")
regex2 = re.compile(r"(@[\d\w\.]+)")


def clean_text(data):
    data = regex1.sub('', data)
    data = regex2.sub('', data)
    data = word_tokenize(data)
    return data


# pre-trained word vectors
def create_embedding_matrix(word_index, embedding_dim, filepath: str = 'models/crawl-300d-2M.vec'):
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, 'r', encoding='utf-8') as vec_file:
        for line in vec_file:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix


def create_model(cuda_optimized_gru: bool, tokenizer: Tokenizer, embed_num_dims: int, embed_filepath: str, max_seq_len: int,
                 num_classes: int):
    embedd_matrix = create_embedding_matrix(tokenizer.word_index, embed_num_dims, embed_filepath)

    # adding layers
    embedded_layer = Embedding(len(tokenizer.word_index) + 1, embed_num_dims, input_length=max_seq_len,
                               weights=[embedd_matrix],
                               trainable=False)
    gru_output_size = 150
    bidirectional = True
    model = Sequential()
    model.add(embedded_layer)
    # Cudnn Compatibility using reference https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU#used-in-the-notebooks
    gru_layer = GRU(units=gru_output_size, dropout=0.2, recurrent_dropout=0.2) if not cuda_optimized_gru \
        else GRU(units=gru_output_size, activation="tanh", recurrent_activation='sigmoid', recurrent_dropout=0,
                 unroll=False, use_bias=True, reset_after=True, dropout=0.2)
    model.add(Bidirectional(gru_layer))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def pickle_save_object(object_to_pickle: object, filename: str):
    with open(filename, 'wb') as f:
        pickle.dump(object_to_pickle, f, protocol=pickle.HIGHEST_PROTOCOL)


def pickle_read_object(filename: str):
    with open(filename, 'rb') as f:
        return pickle.load(f)


def save_model(model_to_save: object, foldername: str, overwrite: bool = True):
    # save the model for later use
    tf.keras.models.save_model(model_to_save, foldername, overwrite=overwrite, include_optimizer=True, save_format=None,
                               signatures=None, options=None)


def print_confidences(prediction, encoding_dict: dict):
    for k, v in encoding_dict.items():
        print(f"{k}: {prediction[v]:.1%}")