diff --git a/.gitignore b/.gitignore
index 3348575..b9f4cb4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,5 +209,6 @@ __marimo__/
 # Local
 cache/
 checkpoints/
+dumps/
 examples/
 register_db/
diff --git a/Dockerfile b/Dockerfile
index b11b76e..d8ce818 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM megrez:pytorch-2.8.0_cuda-12.8_python-3.12_ubuntu-22.04
+FROM megrez:pytorch-2.6.0_cuda-12.6_python-3.12_ubuntu-22.04
 
 WORKDIR /app
diff --git a/README.md b/README.md
index eb22c1e..810c01c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# faster-whisper Real-Time Speech Transcription System
+# funasr Real-Time Speech Transcription System
 
-This project is a real-time speech transcription system built on `faster-whisper` and `Silero VAD`, supporting streaming audio input and low-latency transcription. It consists of a backend service and a frontend client, and suits scenarios such as meeting minutes and live captioning.
+This project is a real-time speech transcription system built on `funasr`, supporting streaming audio input and low-latency transcription. It consists of a backend service and a frontend client, and suits scenarios such as meeting minutes and live captioning.
 
 ## Project Structure
 
@@ -33,26 +33,17 @@
 > **Note**: Audio shorter than 0.4 seconds only receives simple peak normalization, to keep processing stable.
 
-### 2. Voice Activity Detection (VAD)
+### 2. Real-Time Transcription
 
-Uses the Silero VAD model to detect voice activity, filtering out silent segments to improve transcription efficiency and accuracy.
+Implements streaming transcription on funasr models, with the following features (a usage sketch follows this README diff):
 
-**Configuration** (`config.py`):
-- `vad_threshold`: VAD detection threshold (default 0.1)
-- `min_silence_duration`: minimum silence duration (default 12 frames ≈ 375 ms)
-- `min_voice_duration`: minimum voice duration (default 8 frames ≈ 250 ms)
-- `silence_reserve`: silent samples kept before and after each voice segment (default 6 frames ≈ 187.5 ms)
-
-### 3. Real-Time Transcription
-
-Implements streaming transcription on the faster-whisper model, with the following features:
+- **Automatic language detection**: identifies the language of the audio automatically
+- **Inverse text normalization**: converts numbers, dates, etc. into standard written form
+- **Smart VAD merging**: merges adjacent speech segments via the `merge_vad` and `merge_length_s` parameters, improving accuracy on long sentences
+- **Sentence timestamps**: provides start and end times for every sentence, enabling precise audio positioning
+- **Context continuity**: audio buffer management preserves context across streaming transcription calls
 
-- **Context awareness**: uses the previous passage as prompt or hotwords to improve transcription coherence
-- **Hallucination suppression**: reduces model hallucinations via the `suppress_blank` and `repetition_penalty` parameters
-- **Multi-temperature sampling**: supports the temperature schedule `[0.0, 0.2, 0.6, 1.0]`, balancing generation quality and diversity
-- **Traditional-to-Simplified conversion**: optional conversion of Traditional Chinese output to Simplified Chinese
-
-### 4. Speaker Identification
+### 3. Speaker Identification
 
 Implements speaker verification with ModelScope's ERes2NetV2 model, supporting automatic identification in multi-speaker scenarios.
 
@@ -85,17 +76,14 @@ pip install -r requirements.txt
 
 ## Model Preparation
 
-Download the `faster-whisper`, `ERes2NetV2`, `MossFormer2_SE_48K`, and `silero-vad` models into the `checkpoints/` directory
+Download the `ERes2NetV2` and `MossFormer2_SE_48K` models into the `checkpoints/` directory
 
 ```bash
 cd checkpoints
-modelscope download --model mobiuslabsgmbh/faster-whisper-large-v3-turbo --local_dir ./faster-whisper-large-v3-turbo
 modelscope download --model iic/ClearerVoice-Studio MossFormer2_SE_48K/last_best_checkpoint --local_dir .
 modelscope download --model iic/ClearerVoice-Studio MossFormer2_SE_48K/last_best_checkpoint.pt --local_dir .
 modelscope download --model iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common --local_dir ./ERes2NetV2_w24s4ep4
-
-git clone https://github.com/snakers4/silero-vad.git
 ```
 
 ## Running
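For reference, the funasr pipeline this PR adopts can be exercised standalone roughly as below. A minimal sketch: the model names come from `config.py` and the `generate()` arguments from `transcriptor.py`; the input file name is a placeholder.

```python
# Minimal sketch of the funasr pipeline wired up by this PR.
from funasr import AutoModel

model = AutoModel(
    model="paraformer-zh",       # ASR model
    vad_model="fsmn-vad",        # voice activity detection
    punc_model="ct-punc-c",      # punctuation restoration
    vad_kwargs={"max_single_segment_time": 20000},  # ms; Config.max_speech_duration * 1000
    device="cuda",
    disable_update=True,
)

res = model.generate(
    input="example.wav",         # placeholder: any 16 kHz mono recording
    cache={},
    language="auto",             # automatic language detection
    use_itn=True,                # inverse text normalization (numbers, dates, ...)
    batch_size_s=60,
    merge_vad=True,              # merge adjacent VAD segments ...
    merge_length_s=15,           # ... up to 15 s, for long-sentence accuracy
    sentence_timestamp=True,     # per-sentence start/end times
)
print(res[0]["text"])
```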
diff --git a/cache/dump_save_here b/cache/modelscope_cache_here
similarity index 100%
rename from cache/dump_save_here
rename to cache/modelscope_cache_here
diff --git a/config.py b/config.py
index e0f8143..7215e2e 100644
--- a/config.py
+++ b/config.py
@@ -6,15 +6,15 @@ class Config:
     models = {
         "asr": {
-            "name": "faster-whisper",
-            "path": os.path.join(model_path, "faster-whisper-large-v3-turbo"),
-            "compute_type": "float16",
+            "name": "paraformer-zh",
             "device": "cuda"
         },
         "vad": {
-            "name": "silero",
-            "path": os.path.join(model_path, "silero-vad"),
-            "compute_type": "float16",
+            "name": "fsmn-vad",
+            "device": "cuda"
+        },
+        "punc": {
+            "name": "ct-punc-c",
             "device": "cuda"
         },
         "speaker_verifier": {
@@ -28,63 +28,21 @@ class Config:
         }
     }
 
+    samplerate = 16000
     preheat_audio = "./preheat_audio.wav"
+    max_silence_interval = 2   # maximum silence gap in seconds; a longer gap is treated as an interruption
+    max_speech_duration = 20   # maximum speech duration in seconds; longer audio is force-split
 
     dump = {
-        "audio_save": "none",  # all: save all audio; final: save only finalized audio; none: save nothing
-        "audio_dir": "./cache"
+        "audio_save": "none",  # all: save all audio; final: save only finalized audio; none: save nothing
+        "audio_dir": "./dumps"
     }
 
     speech_enhance = {
-        "enable": True,
+        "enable": False,
         "model_name": "MossFormer2_SE_48K",
         "target_lufs": -16.0,
         "true_peak_limit": -1.0,
         "mute_if_too_quiet": True,
         "threshold_dbfs": -50,
     }
-
-    vad = {
-        "enable": True,
-        "vad_threshold": 0.2,
-        "sampling_rate": 16000,
-        "sampling_per_chunk": 512,
-        "min_silence_duration": 12,  # 12 * 31.25ms = 375ms
-        "min_voice_duration": 8,     # 8 * 31.25ms = 250ms
-        "silence_reserve": 6,        # 6 * 31.25ms = 187.5ms
-    }
-
-    filter_match = {
-        "enable": True,
-        "find_match": ["谢谢大家", "简体中文", "优独播剧场", "大家好,这是一段会议录音。"],
-        "cos_match": [
-            "请不吝点赞 订阅 转发 打赏支持明镜与点栏目",
-            "志愿者 李宗盛",
-            "大家好,这是一段会议录音。",
-            "字幕志愿者 杨栋梁",
-            "明镜需要您的支持 欢迎订阅明镜",
-            "优优独播剧场——YoYo Television Series Exclusive",
-            "中文字幕——Yo Television Series Exclusive"
-        ],
-        "cos_sim": 0.02
-    }
-
-    whisper_config = {
-        "tradition_to_simple": False,
-        "interruption_duration": 20,  # maximum interruption duration, in seconds
-        "beam_size": 8,   # option 1: beam_size 8 with best_of 4 for better quality
-        "best_of": 4,     # option 2: beam_size 4 with best_of 1 for more speed
-        "patience": 1.0,
-        "suppress_blank": True,        # hallucination suppression
-        "repetition_penalty": 1.2,     # repetition penalty, though it costs some quality
-        "log_prob_threshold": -1.0,
-        "no_speech_threshold": 0.8,
-        "condition_on_previous_text": True,
-        "previous_text_prompt": False,
-        "previous_text_hotwords": True,  # use the previous sentence as hotwords; more conservative segmentation, better results
-        "previous_text_prefix": False,
-        "initial_prompt": "大家好,这是一段会议录音。",
-        "hotwords_text": "",
-        "temperature": [0.0, 0.2, 0.6, 1.0],
-        "avg_logprob_score": -1.0  # filter threshold; results scoring below it are not emitted
-    }
diff --git a/docker-compose.yml b/docker-compose.yml
index 0fbabaa..640c626 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,7 +1,6 @@
 services:
   api:
     image: transcriptor:latest
-    runtime: nvidia
     deploy:
       resources:
         reservations:
@@ -12,8 +11,9 @@ services:
     environment:
       - PYTHONUNBUFFERED=1
     volumes:
-      - ./cache:/app/cache
+      - ./cache:/root/.cache/modelscope
       - ./checkpoints:/app/checkpoints
+      - ./dumps:/app/dumps
       - ./examples:/app/examples
       - ./register_db:/app/register_db
       - ./config.py:/app/config.py
diff --git a/dumps/dump_save_here b/dumps/dump_save_here
new file mode 100644
index 0000000..e69de29
diff --git a/requirements-server.txt b/requirements-server.txt
index 28664b0..a6f2fa7 100644
--- a/requirements-server.txt
+++ b/requirements-server.txt
@@ -1,15 +1,10 @@
-faster-whisper==1.2.0
-librosa==0.10.2.post1
-OpenCC==1.1.9
-opuslib_next==1.1.5
-scikit-learn==1.7.2
-websockets==14.1
+modelscope==1.32.0
+modelscope[framework]==1.32.0
+funasr==1.2.7
+torch==2.6.0+cu126
+torchaudio==2.6.0+cu126
 pydub==0.25.1
-modelscope==1.31.0
-addict==2.4.0
-datasets==3.6.0
-pillow==12.0.0
-simplejson==3.20.2
-sortedcontainers==2.4.0
 pyloudnorm==0.1.1
 clearvoice==0.1.2
+websockets==14.1
+opuslib_next==1.1.5
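Note that the `+cu126` local-version pins above will not resolve from PyPI alone; presumably the server environment pulls them from PyTorch's CUDA 12.6 wheel index (an assumption, since the install command itself is not part of this diff):

```bash
# Assumed install invocation for the +cu126 wheels; not specified by this PR.
pip install -r requirements-server.txt --extra-index-url https://download.pytorch.org/whl/cu126
```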
diff --git a/requirements.txt b/requirements.txt
index b4103d4..38832a3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,19 +1,12 @@
-faster-whisper==1.2.0
-librosa==0.10.2.post1
-OpenCC==1.1.9
-opuslib_next==1.1.5
-scikit-learn==1.7.2
-torch==2.9.0
-torchaudio==2.9.0
-websocket-client==1.9.0
-websockets==14.1
+modelscope==1.32.0
+modelscope[framework]==1.32.0
+funasr==1.2.7
+torch==2.6.0
+torchaudio==2.6.0
 pydub==0.25.1
-pyaudio==0.2.14
-modelscope==1.31.0
-addict==2.4.0
-datasets==3.6.0
-pillow==12.0.0
-simplejson==3.20.2
-sortedcontainers==2.4.0
 pyloudnorm==0.1.1
 clearvoice==0.1.2
+websockets==14.1
+opuslib_next==1.1.5
+websocket-client==1.9.0
+pyaudio==0.2.14
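The `transcriptor.py` diff below leaves `SpeakerVerifier` (from `speaker_recognize.py`) untouched and still calls its `match_speaker()` on finalized buffers. For context, driving the ERes2NetV2 checkpoint through ModelScope's standard pipeline looks roughly like this; a sketch of the usual ModelScope API, not this repo's implementation:

```python
# Sketch only: the repo's SpeakerVerifier wraps ERes2NetV2 in its own way.
from modelscope.pipelines import pipeline

sv = pipeline(
    task="speaker-verification",
    model="iic/speech_eres2netv2w24s4ep4_sv_zh-cn_16k-common",
)

# Compare two 16 kHz utterances; the result carries a similarity score.
result = sv(["speaker_a.wav", "speaker_b.wav"])
print(result)
```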
diff --git a/transcriptor.py b/transcriptor.py
index 22a9a0e..ea143d8 100644
--- a/transcriptor.py
+++ b/transcriptor.py
@@ -1,49 +1,41 @@
 import os
-import torch
 import scipy
-from itertools import groupby
 import numpy as np
 from pydub import AudioSegment
-import librosa
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-from faster_whisper import WhisperModel
+from funasr import AutoModel
 
 from config import Config
 from speaker_recognize import SpeakerVerifier
 from speech_enhance import SpeechEnhance
 
 
+class Segment:
+    """A transcribed sentence with start/end times in seconds."""
+    def __init__(self, text, start, end):
+        self.text = text
+        self.start = start
+        self.end = end
+
+
 class Transcriptor:
     def __init__(self):
-        self.samplerate = 16000
+        self.samplerate = Config.samplerate
        self.epoch = 0
         self.load_models(Config.models)
         self.preheat(Config.preheat_audio)
 
     def load_models(self, models):
-        asr_config = models.get("asr")
-        vad_config = models.get("vad")
-
-        self.asr_model = WhisperModel(
-            model_size_or_path = asr_config["path"],
-            device = asr_config["device"],
-            local_files_only = False,
-            compute_type = asr_config["compute_type"]
+        asr_config = models["asr"]
+        self.asr_model = AutoModel(
+            model=asr_config["name"],
+            vad_model=models["vad"]["name"],
+            punc_model=models["punc"]["name"],
+            vad_kwargs={"max_single_segment_time": Config.max_speech_duration * 1000},  # s -> ms
+            device=asr_config["device"],
+            disable_update=True
         )
 
         self.speaker_verifier = SpeakerVerifier()
 
-        if Config.vad.get("enable"):
-            self.vad_model, _ = torch.hub.load(
-                repo_or_dir = vad_config["path"],
-                model = 'silero_vad',
-                trust_repo = None,
-                source = 'local',
-            )
-        else:
-            self.vad_model = None
-
         se_config = Config.speech_enhance
         if se_config.get("enable"):
             self.speech_enhance = SpeechEnhance(
@@ -56,34 +48,17 @@ def load_models(self, models):
         else:
             self.speech_enhance = None
 
-        if Config.filter_match.get("enable"):
-            self.vectorizer = TfidfVectorizer()
-        else:
-            self.vectorizer = None
-
-        self.whisper_config = Config.whisper_config
-        if self.whisper_config.get("tradition_to_simple"):
-            import opencc
-            self.cc_model = opencc.OpenCC('t2s.json')
-        else:
-            self.cc_model = None
-
     def preheat(self, preheat_audio):
-        preheat_audio_, _ = librosa.load(preheat_audio, sr=self.samplerate, dtype=np.float32)
-        self.asr_model.transcribe(
-            preheat_audio_,
-            beam_size = self.whisper_config.get("beam_size"),
-            best_of = self.whisper_config.get("best_of"),
-            patience = self.whisper_config.get("patience"),
-            suppress_blank = self.whisper_config.get("suppress_blank"),
-            repetition_penalty = self.whisper_config.get("repetition_penalty"),
-            log_prob_threshold = self.whisper_config.get("log_prob_threshold"),
-            no_speech_threshold = self.whisper_config.get("no_speech_threshold"),
-            condition_on_previous_text = self.whisper_config.get("condition_on_previous_text"),
-            initial_prompt = self.whisper_config.get("initial_prompt"),
-            hotwords = self.whisper_config.get("hotwords_text"),
-            prefix = self.whisper_config.get("previous_text_prefix"),
-            temperature = self.whisper_config.get("temperature"),
+        # Warm up the model once so the first real request is not slowed by lazy initialization.
+        self.asr_model.generate(
+            input=preheat_audio,
+            cache={},
+            language="zh",
+            use_itn=True,
+            batch_size_s=60,
+            merge_vad=True,
+            merge_length_s=15,
+            sentence_timestamp=True,
+            disable_pbar=True
         )
 
     def dump(self, final, audio_buffer):
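With `sentence_timestamp=True`, funasr attaches per-sentence fragments that the new `parse_to_segments` in the hunk below consumes via `res[0]["sentence_info"]`. The shape, inferred from how the code indexes it (times in milliseconds; unrelated keys omitted), is roughly:

```python
# Result shape assumed from the way the new code reads it; other keys omitted.
res = [{
    "text": "大家好,这是一段会议录音。今天我们讨论项目进度",
    "sentence_info": [
        {"text": "大家好,", "start": 0, "end": 1200},
        {"text": "这是一段会议录音。", "start": 1200, "end": 3100},
        {"text": "今天我们讨论项目进度", "start": 3400, "end": 5600},
    ],
}]
```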
whisper_config.get("hotwords_text") - if whisper_config.get("previous_text_hotwords"): - hotwords += last_sentence - - prefix_text = None - if whisper_config.get("previous_text_prefix"): - prefix_text = last_sentence - - interruption_duration = whisper_config.get("interruption_duration") - - segments, info = self.asr_model.transcribe( - audio_buffer, - beam_size = whisper_config.get("beam_size"), - best_of = whisper_config.get("best_of"), - patience = whisper_config.get("patience"), - suppress_blank = whisper_config.get("suppress_blank"), - repetition_penalty = whisper_config.get("repetition_penalty"), - log_prob_threshold = whisper_config.get("log_prob_threshold"), - no_speech_threshold = whisper_config.get("no_speech_threshold"), - condition_on_previous_text = whisper_config.get("condition_on_previous_text"), - initial_prompt = initial_prompt, - hotwords = hotwords, - prefix = prefix_text, - temperature = whisper_config.get("temperature"), + def parse_to_segments(self, funasr_res): + segments = [] + if "sentence_info" in funasr_res: + new_sentence = True + sentence = "" + for res in funasr_res["sentence_info"]: + if new_sentence: + start = res["start"] / 1000 + new_sentence = False + sentence += res["text"] + end = res["end"] / 1000 + if any(punct in sentence for punct in ["。", "?", "!", ".", "!", "?"]): + segments.append(Segment(sentence, start, end)) + new_sentence = True + sentence = "" + if sentence != "": + segments.append(Segment(sentence, start, end)) + return segments + + def transcript(self, audio_buffer, last_speaker, last_sentence, last_transcript): + res = self.asr_model.generate( + input=audio_buffer, + cache={}, + language="auto", + use_itn=True, + batch_size_s=60, + merge_vad=True, + merge_length_s=15, ) - # print("transcript info: ", info) final = False speaker = last_speaker sentence = last_sentence - transcript = "" + transcript = last_transcript new_buffer = audio_buffer - # 计算音频时长 - audio_duration = len(audio_buffer) / self.samplerate - # 获取转录结果 - generated_segments = [] - for segment in segments: - generated_segments.append(segment) - num_segments = len(generated_segments) + if not res or len(res) == 0: + print("No result from asr model") + return final, speaker, sentence, transcript, new_buffer + + segments = self.parse_to_segments(res[0]) + num_segments = len(segments) + + audio_duration = len(audio_buffer) / self.samplerate if num_segments == 0: - # 如果转录结果为空,则直接返回 - return False, speaker, sentence, transcript, new_buffer + if audio_duration > Config.max_silence_interval: + new_buffer = np.array([],dtype=np.float32) elif num_segments == 1: - # 如果只有一段,则记录转录信息 - # print("log: ", generated_segments[0].avg_logprob) - if generated_segments[0].avg_logprob > whisper_config.get("log_prob_threshold"): - transcript = generated_segments[0].text - else: - transcript = "" - - # 如果音频时长超过最大中断时长,则认为中断结束 - if audio_duration > interruption_duration: - print(f"Warning: audio buffer over {interruption_duration} seconds, interrupt") - speaker = self.speaker_verifier.match_speaker(audio_buffer) - sentence = transcript + # 只有一段 + if audio_duration - segments[0].end > Config.max_silence_interval: + # 音频尾段过长,则认为结束 + final = True + sentence = segments[0].text transcript = "" new_buffer = np.array([],dtype=np.float32) - final = True else: - final = False + # 音频尾段不长,则认为继续 + transcript = segments[0].text self.dump(final, audio_buffer) elif num_segments >= 2: # 如果有多段,则截取最后一段 sentence = "" for i in range(num_segments - 1): - sentence += generated_segments[i].text - # print("log: ", 
@@ -281,31 +164,11 @@ def inference(self, audio_data, last_speaker, last_sentence, last_transcript, la
             # speech enhancement
             audio_data = self.speech_enhance.enhance(audio_data, self.samplerate)
 
-        if Config.vad.get("enable"):
-            # VAD: strip silence
-            audio_data = self.vad_rm_silence(audio_data)
-
-        # If audio_data is empty, skip transcription
-        if audio_data is None:
-            if len(last_buffer) > 0 and len(last_transcript) > 0:
-                # A non-empty last_buffer means the utterance just ended: the full sentence is
-                # last_transcript, and the new transcript and audio buffer both become empty
-                self.dump(True, last_buffer)
-                speaker = self.speaker_verifier.match_speaker(last_buffer)
-                new_buffer = np.array([],dtype=np.float32)
-                return True, speaker, last_transcript, "", new_buffer
-            else:
-                # An empty last_buffer means the utterance is still in progress
-                return False, last_speaker, last_sentence, last_transcript, last_buffer
-
         # Concatenate last_buffer and the incoming chunk
         audio_buffer = np.concatenate([last_buffer, audio_data])
 
         # Transcribe; last_sentence is the full sentence from the previous round
-        final, speaker, sentence, transcript, new_buffer = self.transcript(audio_buffer, last_speaker, last_sentence)
-
-        # Filter hallucinated phrases
-        sentence = self.filter(sentence)
-        transcript = self.filter(transcript)
+        final, speaker, sentence, transcript, new_buffer = self.transcript(audio_buffer, last_speaker, last_sentence, last_transcript)
 
         return final, speaker, sentence, transcript, new_buffer
 
@@ -336,10 +199,9 @@ def inference(self, audio_data, last_speaker, last_sentence, last_transcript, la
     last_transcript = ""
     last_buffer = np.array([],dtype=np.float32)
 
-    # Read data one second at a time
-    audio_size = 16384  # samples per second
-    for i in range(0, len(samples), audio_size):
-        audio_data = samples[i:i + audio_size]
+    chunk_size = int(Config.samplerate * 0.5)  # 0.5 s of samples (8000 at 16 kHz)
+    for i in range(0, len(samples), chunk_size):
+        audio_data = samples[i:i + chunk_size]
         audio_f32 = audio_data.astype(np.float32) / 32768.0
 
         final, speaker, sentence, transcript, new_buffer = transcriptor.inference(