# util.py (forked from jitkapat/thaipostagger)
import tensorflow as tf
import tokenization as tok
import re
import numpy as np
from string import punctuation
from pythainlp import thai_digits
from tensorflow.keras.layers import concatenate
from transformers import TFBertModel
class PaddingInputExample(object):
    """
    Fake example so that the number of input examples is a multiple of the batch size.
    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Must only be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
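
# Illustrative construction (hypothetical values): in this tagger, `text_a`
# holds a list of word tokens rather than a raw string.
#
#     example = InputExample(guid=None, text_a=['สวัสดี', 'ครับ'])
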
def load_bert():
    # Load the pretrained Thai BERT weights from the local 'bert_base_th' directory.
    bert = TFBertModel.from_pretrained('bert_base_th')
    return bert


def load_tokenizer():
    # TF1 -> TF2 compatibility shim: the tokenization module expects tf.gfile.
    tf.gfile = tf.io.gfile
    tokenizer = tok.ThaiTokenizer('th.wiki.bpe.op25000.vocab', 'th.wiki.bpe.op25000.model')
    return tokenizer

def build_model(max_seq_length):
    n_tags = 14
    bert = load_bert()
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids", dtype='int64')
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks", dtype='int64')
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids", dtype='int64')
    # Six hand-crafted orthographic features per token (see orthog() below).
    orthog_in = tf.keras.layers.Input(shape=(max_seq_length, 6))
    bert_inputs = [in_id, in_mask, in_segment]
    all_inputs = bert_inputs + [orthog_in]
    # [0] selects the per-token hidden states (last_hidden_state).
    bert_output = bert(bert_inputs)[0]
    x = concatenate([bert_output, orthog_in], axis=-1)
    x = tf.keras.layers.SpatialDropout1D(0.3)(x)
    # Per-token softmax over the 14 POS tags.
    out = tf.keras.layers.Dense(n_tags, activation=tf.keras.activations.softmax)(x)
    model = tf.keras.models.Model(inputs=all_inputs, outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=4e-5),
                  loss=tf.keras.losses.categorical_crossentropy,
                  metrics=['accuracy'])
    return model
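
# Shape sketch (illustrative): with max_seq_length = 112, the model maps
#   input_ids / input_masks / segment_ids : (batch, 112) int64
#   orthographic features                 : (batch, 112, 6)
# to per-token tag probabilities          : (batch, 112, 14)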

def load_model(max_seq_length):
    # +2 reserves positions for the [CLS] and [SEP] tokens.
    model = build_model(max_seq_length + 2)
    model.load_weights('bert_base_th/AACL_BERT_TH.hdf5')
    return model

def convert_examples_to_features(tokenizer, examples, max_seq_length=110):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    input_ids, input_masks, segment_ids = [], [], []
    for example in examples:
        input_id, input_mask, segment_id = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
    )

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        return input_ids, input_mask, segment_ids
    tokens_a = example.text_a
    # Truncate to leave room for [CLS] and [SEP].
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]
    # Token map will be an int -> int mapping between the `orig_tokens` index and
    # the `bert_tokens` index, e.g.
    # bert_tokens == ["[CLS]", "john", "johan", "##son", "'", "s", "house", "[SEP]"]
    # orig_to_tok_map == [1, 2, 4, 6]
    orig_to_tok_map = []
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)
    for token in tokens_a:
        # Map each original word to the index of its first BPE sub-token.
        orig_to_tok_map.append(len(tokens))
        tokens.extend(tokenizer.tokenize(token))
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    orig_to_tok_map.append(len(tokens) - 1)
    # One input id per original word: keep only the first sub-token of each word.
    input_ids = tokenizer.convert_tokens_to_ids([tokens[i] for i in orig_to_tok_map])
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    return input_ids, input_mask, segment_ids
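
# Illustrative result (hypothetical ids): for a 3-word example with
# max_seq_length = 6, convert_single_example() returns
#   input_ids   = [id(CLS), id(w1), id(w2), id(w3), id(SEP), 0]
#   input_mask  = [1, 1, 1, 1, 1, 0]
#   segment_ids = [0, 0, 0, 0, 0, 0]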

def convert_text_to_examples(texts):
    """Create InputExamples from a list of token lists."""
    input_examples = []
    for text in texts:
        input_examples.append(
            InputExample(guid=None, text_a=text, text_b=None)
        )
    return input_examples

def is_thai_alpha(word):
    # ก-๏ covers Thai consonants, vowels, and tone marks (U+0E01 to U+0E4F).
    return int(bool(re.match(r'^[ก-๏\s]+$', word)))


def is_roman_alpha(word):
    return int(bool(re.match(r'^[a-zA-Z\s]+$', word)))


def is_punct(word):
    # ASCII punctuation plus the Thai abbreviation mark ฯ (paiyannoi).
    return int(bool(re.match(r'^[{}\s]+$'.format(punctuation + 'ฯ '), word)))


def is_num(word):
    # Thai digits ๐-๙ or Arabic digits 0-9.
    return int(bool(re.match(r'^[๐-๙0-9]+$', word)))

def orthog(word):
    all_thai_digits = list(thai_digits) + list('1234567890')
    # Keys are prefixed with digits so that sorted() yields a stable feature order.
    return {
        '1is_roman': is_roman_alpha(word),
        '2is_thai': is_thai_alpha(word),
        '3all_digit': is_num(word),
        '4has_digit': int(any(char in word for char in all_thai_digits)),
        '5all_punct': is_punct(word),
        '6has_punct': int(any(char in word for char in punctuation)),
    }
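
# Illustrative output: orthog('สวัสดี') returns
#   {'1is_roman': 0, '2is_thai': 1, '3all_digit': 0,
#    '4has_digit': 0, '5all_punct': 0, '6has_punct': 0}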

def orthog_to_vector(feature_dict):
    # Sort keys so the vector order matches the digit-prefixed feature names.
    feature_vec = np.zeros(len(feature_dict))
    for i, k in enumerate(sorted(feature_dict.keys())):
        feature_vec[i] = feature_dict[k]
    return feature_vec

def all_orthog_vec(X, max_seq_length):
    # +2 matches the [CLS]/[SEP] positions added by convert_single_example().
    vec = np.zeros((len(X), max_seq_length + 2, 6))
    for i, seq in enumerate(X):
        # Truncate to max_seq_length words, mirroring the BERT-side truncation.
        for j, word in enumerate(seq[:max_seq_length]):
            feature = orthog(word)
            for k, feat in enumerate(sorted(feature.keys())):
                # j + 1 skips the [CLS] slot at position 0.
                vec[i][j + 1][k] = feature[feat]
    return vec
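
# Illustrative shape: all_orthog_vec([['สวัสดี', 'ครับ']], 110).shape == (1, 112, 6)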

def preprocess_one_text(text):
    """
    text: a list of word tokens.
    Spaces must be removed because they can interfere with the tokenizer,
    so any space inside a token is replaced with an underscore.
    """
    return [word.replace(' ', '_') for word in text]


def preprocess_text_list(text_list):
    return [preprocess_one_text(text) for text in text_list]

def text_list_to_feature(text_list, tokenizer, max_seq_length):
    """
    text_list: a list of tokenized, preprocessed text samples,
    e.g. [['สวัสดี', 'ครับ'], ['สวัสดี', 'ค่ะ']]
    """
    examples = convert_text_to_examples(text_list)
    input_ids, input_masks, segment_ids = convert_examples_to_features(
        tokenizer, examples, max_seq_length=max_seq_length + 2
    )
    # Named orthog_vec to avoid shadowing the orthog() function above.
    orthog_vec = all_orthog_vec(text_list, max_seq_length)
    return input_ids, input_masks, segment_ids, orthog_vec
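

# End-to-end usage sketch. This is a minimal, hypothetical example assuming the
# pretrained files referenced above ('bert_base_th', the BPE vocab/model, and
# the .hdf5 weights) are present locally; the tag index -> tag name mapping is
# not defined in this module, so raw argmax indices are printed instead.
if __name__ == '__main__':
    MAX_SEQ_LENGTH = 110
    tokenizer = load_tokenizer()
    model = load_model(MAX_SEQ_LENGTH)

    sentences = [['สวัสดี', 'ครับ']]
    sentences = preprocess_text_list(sentences)
    input_ids, input_masks, segment_ids, orthog_vec = text_list_to_feature(
        sentences, tokenizer, MAX_SEQ_LENGTH
    )

    # probs has shape (batch, MAX_SEQ_LENGTH + 2, 14): per-token tag probabilities.
    probs = model.predict([input_ids, input_masks, segment_ids, orthog_vec])
    # Position 0 is [CLS]; word i sits at position i + 1.
    for i, word in enumerate(sentences[0]):
        print(word, np.argmax(probs[0][i + 1]))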