-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathformat.py
More file actions
136 lines (122 loc) · 5.19 KB
/
format.py
File metadata and controls
136 lines (122 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import jieba
import string
import json
import Levenshtein
import re
class Format2Subtitle:
def __init__(self, asr_result, ori_text):
self.asr_result = asr_result
self.ori_text = ori_text
def ms_to_srt_time(self, ms):
"""
将毫秒转换为SRT时间格式 (HH:MM:SS,mmm)
例如:150毫秒 -> 00:00:00,150
"""
hours = ms // 3600000 # 1小时 = 3,600,000毫秒
ms %= 3600000
minutes = ms // 60000 # 1分钟 = 60,000毫秒
ms %= 60000
seconds = ms // 1000 # 1秒 = 1,000毫秒
milliseconds = ms % 1000
return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
# 查询标点符号位置
def find_punctuation_indices(self, words):
punctuation = """,。;!?!,;.?:"""
for i, word in enumerate(words):
if word in punctuation:
return i
return -1
# 清除句子中的断句符号
def remove_sentence_punctuation(self, sentence):
"""去除句子中的断句符号"""
punctuation = """,。;!?!,;.?:()()"""
for p in punctuation:
sentence = sentence.replace(p, ' ')
sentence = sentence.strip()
return sentence
def align_char_timestamps(self, original_sentence, recognized_text, word_timestamps, text_length=8):
"""
将词语级时间戳对齐到字符级时间戳
参数:
original_sentence (str): 带标点的原始句子
recognized_text (str): 空格分隔的无标点识别文本
word_timestamps (list): 词语级时间戳列表 [[start1, end1], [start2, end2], ...]
返回:
list: 原始字符级时间戳列表 [[char_start, char_end], ...]
"""
if not original_sentence:
original_sentence = "".join(recognized_text.split())
words = jieba.lcut(original_sentence)
# print(len(original_sentence), len(word_timestamps), len(recognized_text))
recognized_words = recognized_text.split()
# 4. 为每个字符分配时间戳
char_timestamps = []
while True:
if len(words) < 10:
break
# 查询列表中的标点符号的位置
punctuation_indices = self.find_punctuation_indices(words)
# print("标点符号位置:",punctuation_indices)
# 如果位置小于10,怎使用这个位置分割
if punctuation_indices > 10 or punctuation_indices <= 2:
_words = words[:text_length]
words = words[text_length:]
else:
_words = words[:punctuation_indices]
words = words[punctuation_indices:]
text = ''.join(_words)
# 计算词语持续时间
best_index = -1
best_score = 9999
for index in range(text_length*4):
_recognized_words = recognized_words[:index]
_recognized_text = ''.join(_recognized_words)
if text == _recognized_text:
break
# 计算编辑距离
distance = Levenshtein.distance(text, _recognized_text)
if distance < best_score:
best_index = index
best_score = distance
_recognized_words = recognized_words[:best_index]
recognized_words = recognized_words[best_index:]
timestamp = word_timestamps[:best_index]
word_timestamps = word_timestamps[best_index:]
word_start = timestamp[0][0]
word_end = timestamp[-1][1]
char_timestamps.append([word_start, word_end, self.remove_sentence_punctuation(text)])
# 处理最后一段
if len(word_timestamps) > 0:
text = ''.join(words)
timestamp = word_timestamps
word_start = timestamp[0][0]
word_end = timestamp[-1][1]
char_timestamps.append([word_start, word_end, self.remove_sentence_punctuation(text)])
return char_timestamps
def format_subtitle(self, char_timestamps):
"""
[230, 2210, '一个能够从音频中生成面部']
[230, 5430, '运动系数的工具']
转为字幕文件
"""
srt_content = ""
counter = 1
for item in char_timestamps:
start_time = self.ms_to_srt_time(item[0])
end_time = self.ms_to_srt_time(item[1])
text = item[2]
srt_content += f"{counter}\n{start_time} --> {end_time}\n{text}\n\n"
# print(srt_content)
counter += 1
return srt_content
def pipeline(self, output=None):
# print(rec_result)
recognized_text = self.asr_result['text']
word_timestamps = self.asr_result['timestamp']
original_sentence = self.ori_text
char_timestamps = self.align_char_timestamps(original_sentence, recognized_text, word_timestamps)
subtitle_text = self.format_subtitle(char_timestamps)
if output is not None:
with open(output, 'w', encoding='utf-8') as f:
f.write(subtitle_text)
return subtitle_text