-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathutility.py
More file actions
133 lines (98 loc) · 3.06 KB
/
utility.py
File metadata and controls
133 lines (98 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import gc
import os
import psutil
from pypinyin import Style, pinyin, load_phrases_dict, load_single_dict
import json
import plistlib
from opencc import OpenCC
from res import pinyin_data
import re
from zhon import hanzi
import hanlp
# Shared OpenCC converter built once at import time with the 't2s' profile
# (presumably Traditional -> Simplified Chinese; confirm against OpenCC docs).
# Used by t2s().
cc = OpenCC('t2s')
# NOTE(review): a `global` statement at module level has no effect and does not
# create the name. `tok_fine` only exists after init_hanlp() has been called.
global tok_fine
def load_user_data_pypinyin():
    """Register the project's custom readings with pypinyin.

    Passes ``res.pypinyinDict.datas`` to ``load_phrases_dict`` and then
    installs a single-character entry for '豉' through ``load_single_dict``.
    """
    # Function-scope import: the data module is only loaded when this runs.
    from res import pypinyinDict as user_phrases

    single_char_readings = {ord('豉'): 'chǐ,shì'}
    load_phrases_dict(user_phrases.datas)
    load_single_dict(single_char_readings)
def init_hanlp():
    """Load the HanLP tokenizer and store it in the module global ``tok_fine``.

    ``cut_line`` reads ``tok_fine``, so this has to run before ``cut_line``
    is used.
    """
    global tok_fine
    model_id = hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH
    tok_fine = hanlp.load(model_id)
# NOTE(review): not referenced in the visible part of this file; presumably
# read by modules that import this one -- confirm before changing.
special_py_list = ['ao', 'ai', 'ie', 'ue', 'an']
# Initials (shengmu): the consonant onsets recognised by is_shengmu() and
# get_shengmu().
__shengmu = {'b','p','m','f','d','t','n','l','g','k','h','j','q','x','zh','ch','sh','r','z','c','s','w','y'}
def is_shengmu(v):
    """Return True when ``v`` is one of the initials in ``__shengmu``."""
    found = v in __shengmu
    return found
def get_shengmu(one_py):
    """Return the initial (shengmu) that ``one_py`` starts with, or None.

    The two-character prefix is tested before the one-character prefix, so
    'zh', 'ch' and 'sh' take precedence over 'z', 'c' and 's'. An empty
    input, or one that starts with no known initial, yields None.
    """
    if len(one_py) == 0:
        return None
    # Widest prefix first; for a one-character input both slices are the
    # input itself, so the outcome matches a single direct lookup.
    for width in (2, 1):
        candidate = one_py[:width]
        if is_shengmu(candidate):
            return candidate
    return None
def is_chinese(s: str) -> bool:
    """Return True when ``zhon.hanzi.sentence`` matches at least once in ``s``.

    NOTE(review): ``hanzi.sentence`` looks like a sentence-level pattern
    rather than an "any Chinese character" pattern -- confirm that this is
    the intended test.
    """
    matches = re.findall(hanzi.sentence, s)
    return bool(matches)
def is_pinyin(py: str) -> bool:
    """Return True when ``py`` is contained in ``pinyin_data.s2i_dict``."""
    known_syllables = pinyin_data.s2i_dict
    return py in known_syllables
def t2s(s: str) -> str:
    """Convert ``s`` with the module-level OpenCC converter ``cc``."""
    converted = cc.convert(s)
    return converted
def cut_line(s: "str | list[str]") -> "list":
    """Tokenize ``s`` with the HanLP tokenizer stored in ``tok_fine``.

    ``init_hanlp()`` must have been called beforehand; until then
    ``tok_fine`` is unbound and this call raises ``NameError``.

    The annotations are written as strings so they are never evaluated at
    runtime. The previous ``[str] or str`` was a boolean expression that
    evaluated to ``[str]`` and silently dropped the ``str`` alternative.

    Args:
        s: One string, or a list of strings to tokenize together.

    Returns:
        Whatever ``tok_fine`` returns. Presumably a list of tokens for a
        single string and a list of token lists for a list of strings --
        verify against the HanLP tokenizer documentation.
    """
    return tok_fine(s)
def get_pinyin_list(word):
    """Return the pinyin of ``word`` as a flat list of strings.

    ``pinyin`` is called with ``Style.NORMAL`` and ``strict=False``; it yields
    one list per item, and only the first entry of each list is kept.
    """
    readings = pinyin(word, style=Style.NORMAL, strict=False)
    return [candidates[0] for candidates in readings]
def get_pinyin_str(word):
    """Return the entries of ``get_pinyin_list(word)`` joined by apostrophes."""
    syllables = get_pinyin_list(word)
    separator = "'"
    return separator.join(syllables)
def writePlist2File(obj, filename):
    """Serialize ``obj`` to ``filename`` as a binary property list.

    The file is opened in binary write mode, so existing content is replaced.
    """
    binary_format = plistlib.FMT_BINARY
    with open(filename, 'wb') as plist_file:
        plistlib.dump(obj, plist_file, fmt=binary_format)
def writejson2file(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON.

    Output is indented by four spaces, keys are sorted, and non-ASCII
    characters are written as-is instead of being escaped. The encoder
    streams its output to the file chunk by chunk rather than building one
    large string first.
    """
    with open(filename, 'w', encoding='utf8') as json_file:
        json.dump(data, json_file, indent=4, sort_keys=True, ensure_ascii=False)
def readjsondatafromfile(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is read as UTF-8, matching what ``writejson2file`` produces
    (UTF-8 with unescaped non-ASCII characters). Relying on the locale
    default encoding would garble or reject such files on systems whose
    default is not UTF-8.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filename, encoding='utf8') as infile:
        return json.load(infile)
def _count_lines(path, encoding, errors='strict'):
    """Count the lines of ``path`` decoded as ``encoding``, printing progress every 1,000,000 lines."""
    count = 0
    with open(path, encoding=encoding, errors=errors) as f:
        for _ in f:
            count += 1
            if count % 1000000 == 0:
                print('\r{}'.format(count), end='', flush=True)
    return count


def read_lines_from(path: str) -> int:
    """Return the number of lines in the text file at ``path``.

    The file is decoded as gb18030 first and as UTF-8 if that fails. Each
    attempt starts from zero, so a file that decodes under both encodings is
    counted once, and a partial count from a failed attempt is discarded.
    If neither strict decode succeeds, a last UTF-8 pass with undecodable
    bytes replaced still produces a count, keeping the function best-effort.

    Args:
        path: Path of the file to count.

    Returns:
        The line count of the first attempt that decodes the whole file.

    Raises:
        OSError: if the file cannot be opened.
    """
    attempts = (
        ('gb18030', 'strict'),
        ('utf8', 'strict'),
        # Final fallback: never raises a decode error.
        ('utf8', 'replace'),
    )
    num = 0
    for encoding, errors in attempts:
        try:
            num = _count_lines(path, encoding, errors)
        except UnicodeDecodeError:
            # Wrong guess for this file; try the next encoding from scratch.
            continue
        break
    # Kept from the original implementation: collect garbage after the scan.
    gc.collect()
    return num
def read_bytes_from(path: str) -> int:
    """Return the size in bytes of the file at ``path``, as given by ``os.path.getsize``."""
    size_in_bytes = os.path.getsize(path)
    return size_in_bytes
def get_current_memory_gb() -> float:
    """Return the current process's ``uss`` memory figure scaled by 1024**3.

    The value is read from ``psutil.Process.memory_full_info().uss`` for this
    process's PID. The result is a float (it was previously annotated as
    ``int``). psutil reports ``uss`` in bytes, so the result is in GiB.
    """
    process = psutil.Process(os.getpid())
    # Current process memory usage; callers use it to decide when memory is
    # nearly full and data should be written out to disk.
    info = process.memory_full_info()
    # Dividing by a power of two is exact in floating point, so this matches
    # three successive divisions by 1024.
    return info.uss / (1024. ** 3)