-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake_json2.py
More file actions
71 lines (52 loc) · 1.74 KB
/
make_json2.py
File metadata and controls
71 lines (52 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import json
# Load the word list, one word per line, keeping the file's order.
# NOTE(review): later stages treat this order as a frequency ranking
# (presumably most frequent first) -- confirm against the data file.
with open("reduced_unigram_freq.txt", "r", encoding="utf8") as s:
    content = s.read()

# split("\n") yields a trailing empty string when the file ends with a
# newline. Drop only that element, so the last word is kept even when the
# file has no final newline.
words = content.split("\n")
if words[-1] == "":
    words.pop()
# words = words[:len(words) // 10]  # limit size for testing
# Letters carried by the keypad buttons, in button order; the first group
# belongs to button 2, the last to button 9.
button_groups = ["abc", "def", "ghi", "jkl", "mno", "pqrs", "tuv", "wxyz"]

_abc = [chr(i) for i in range(97, 97 + 26)]  # 'a' .. 'z'
_ABC = [ch.upper() for ch in _abc]

# Lowercase letter -> digit (as a string) of the button that carries it,
# e.g. 'a' -> '2', 'z' -> '9'.
let2num = {}
for digit, group in enumerate(button_groups, start=2):
    for letter in group:
        let2num[letter] = str(digit)


def word2num(word):
    """Return the keypad digit sequence that spells *word*.

    Each character is looked up in ``let2num``. Characters without an entry
    (anything other than a lowercase ASCII letter, e.g. an apostrophe or a
    digit) are skipped instead of raising ``KeyError``.
    """
    digits = (let2num.get(ch) for ch in word)
    return "".join(d for d in digits if d is not None)
# One "digits*word" entry per word, in the same order as ``words``,
# e.g. '222*aaa'.
nums_by_freq = [f"{word2num(word)}*{word}" for word in words]

# Group the words by the digit sequence that types them; within a group the
# words keep the order they have in ``words``.
num2words = {}
for word in words:
    num2words.setdefault(word2num(word), []).append(word)
# Rank the suggestions for every digit sequence.
#
# Each key of ``num2words`` ends up mapped to:
#   1. up to three distinct words whose digit sequence *starts with* the key,
#      taken in ``nums_by_freq`` order (predictions for a partially typed
#      word), followed by
#   2. every word that matches the key exactly and is not already listed.
#
# NOTE(review): ``nums_by_freq`` is presumably ordered most-frequent first;
# confirm against the input data.
#
# The predictions are collected in a single pass over the words (each word
# is offered to every prefix of its digit sequence) rather than rescanning
# the whole word list once per key.
dict_len = len(num2words)

predictions = {num_key: [] for num_key in num2words}
for num_word_pair in nums_by_freq:
    # ex num_word_pair: '222*aaa'
    # partition() splits on the first '*' only, so the word part is kept
    # whole even if it contains a '*' itself.
    num_string, _, word = num_word_pair.partition("*")
    # end == 0 covers the empty key, which exists only if an empty word
    # was present in the input.
    for end in range(len(num_string) + 1):
        top_words = predictions.get(num_string[:end])
        if top_words is None or len(top_words) >= 3 or word in top_words:
            continue
        top_words.append(word)

for key_index, num_key in enumerate(num2words):
    most_freq = predictions[num_key]
    # Append the exact matches that did not make it into the predictions.
    for word in num2words[num_key]:
        if word not in most_freq:
            most_freq.append(word)
    # Replacing a value (not adding/removing keys) is safe while iterating.
    num2words[num_key] = most_freq
    if key_index % 1000 == 0:
        print(f"Processed {key_index} of {dict_len}")
# from pprint import pprint
# pprint(num2words)
# with open("words.json", "w", encoding="utf8") as o:
# o.write(json.dumps(words))
# Persist the digit-sequence -> ranked-words table as one JSON object.
with open("num_words_superior.json", "w", encoding="utf8") as out_file:
    json.dump(num2words, out_file)