Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions runtime/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,21 @@ def _import_operators():
from . import remove_duplicate_sentences
from . import knowledge_relation_slice
from . import pii_ner_detection
from . import audio_format_convert
from . import audio_anomaly_filter
from . import audio_dc_offset_removal
from . import audio_pre_emphasis
from . import audio_simple_agc
from . import audio_noise_gate
from . import audio_soft_peak_limiter
from . import audio_trim_silence_edges
from . import audio_rms_loudness_normalize
from . import audio_hum_notch
from . import audio_telephony_bandpass
from . import audio_gtcrn_denoise
from . import audio_quantize_encode
from . import audio_fast_lang_id
from . import audio_asr_pipeline


_import_operators()
40 changes: 40 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# AudioAnomalyFilter 异常语音检测与过滤算子

## 概述

AudioAnomalyFilter 用于对音频做快速质量检测,计算时长与静音帧比例,并给出 `quality_flag`。当判定为异常时,可选择直接“过滤”(清空 `text/data`)或“保留但打标”(仅写入报告)。

## 功能特性

- **时长检测**:支持最小时长/最大时长阈值
- **静音比例检测**:基于短时 RMS 统计静音帧占比
- **过滤策略可控**:支持保留异常文件(仅打标)或直接过滤
- **结果结构化输出**:报告写入 `ext_params.audio_quality`

## 参数说明

| 参数 | 类型 | 默认值 | 说明 |
|---|---|---:|---|
| minDur | inputNumber | 1.0 | 最小时长(秒),小于该值视为异常 |
| maxDur | inputNumber | 20000.0 | 最大时长(秒),大于该值视为异常 |
| silenceRatioTh | slider | 0.8 | 静音帧比例阈值(0~1),>= 阈值视为异常 |
| silenceRmsRatioTh | slider | 0.05 | 静音判定阈值 = global_rms * 该比例 |
| keepInvalid | switch | false | true=保留异常文件仅打标;false=异常则清空 text/data 便于过滤 |

## 输入输出

- **输入**:`sample["filePath"]`(音频文件路径)
- **输出**:
- `sample["ext_params"]["audio_quality"]`:
- `quality_flag`: `ok/invalid`
- `duration/silence_ratio/global_rms/reason`
- 若 `keepInvalid=false` 且 `quality_flag=invalid`:清空 `sample["text"]` 与 `sample["data"]`
- 其余情况(`quality_flag=ok`,或 `keepInvalid=true`):若 `sample["text"]` 为空,则写入占位文本 `"ok"`

## 依赖说明

- **Python 依赖**:优先 `torchaudio`,兜底 `soundfile`

## 版本历史

- **v1.0.0**:首次发布,支持时长/静音比例检测与过滤策略配置

9 changes: 9 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

# Register the operator under the name "AudioAnomalyFilter" and point the
# registry at the module path that contains its implementation (process.py).
# NOTE(review): presumably the registry imports module_path when the operator
# is first requested -- confirm against datamate.core.base_op.
OPERATORS.register_module(
module_name="AudioAnomalyFilter",
module_path="ops.mapper.audio_anomaly_filter.process",
)

60 changes: 60 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
name: '异常语音检测与过滤'
name_en: 'Audio Anomaly Detect & Filter'
description: '对音频做快速异常检测:时长范围与静音帧比例。若判定为异常,将清空 sample 的 text/data 以便后续过滤,并在 ext_params 中写入报告字段。'
description_en: 'Fast audio anomaly detection (duration and silence ratio). If invalid, clears text/data so downstream can filter, and writes report to ext_params.'
language: 'python'
vendor: 'huawei'
raw_id: 'AudioAnomalyFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'audio'
inputs: 'audio'
outputs: 'audio'
settings:
minDur:
name: '最小时长(秒)'
type: 'inputNumber'
description: '小于该值视为异常。'
defaultVal: 1.0
min: 0
max: 36000
step: 0.1
maxDur:
name: '最大时长(秒)'
type: 'inputNumber'
description: '大于该值视为异常。'
defaultVal: 20000.0
min: 0
max: 360000
step: 1
silenceRatioTh:
name: '静音帧比例阈值'
type: 'slider'
description: '静音帧比例 >= 阈值 时视为异常。'
defaultVal: 0.8
min: 0
max: 1
step: 0.01
silenceRmsRatioTh:
name: '静音判定比例'
type: 'slider'
description: '静音判定阈值 = global_rms * 该比例。'
defaultVal: 0.05
min: 0
max: 1
step: 0.01
keepInvalid:
name: '保留异常文件'
description: '开启后不清空 text/data,仅打标 quality_flag=invalid。'
type: 'switch'
defaultVal: 'false'
required: false
checkedLabel: '保留'
unCheckedLabel: '过滤'
runtime:
memory: 104857600
cpu: 0.2
gpu: 0
npu: 0

136 changes: 136 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# -- encoding: utf-8 --

import math
import time
from pathlib import Path
from typing import Dict, Any, List, Tuple

from loguru import logger

from datamate.core.base_op import Mapper


def _as_bool(v: object) -> bool:
    """Interpret a loosely typed switch value as a boolean.

    Genuine ``bool`` values are returned unchanged. Any other value is
    converted to text, stripped of surrounding whitespace and lower-cased;
    it counts as true only when the result is one of ``1``, ``true``,
    ``yes``, ``y`` or ``on``.
    """
    if not isinstance(v, bool):
        normalized = str(v).strip().lower()
        return normalized in {"1", "true", "yes", "y", "on"}
    return v


def _load_wave_mono(path: Path) -> Tuple[List[float], int]:
    """Load an audio file as a mono waveform.

    ``torchaudio`` is tried first; if importing it or decoding with it fails
    for any reason, ``soundfile`` is used as a fallback. Multi-channel audio
    is down-mixed by averaging the channels.

    Args:
        path: Location of the audio file.

    Returns:
        A ``(samples, sample_rate)`` tuple where ``samples`` is a plain list
        of floats and ``sample_rate`` is an int.

    Raises:
        RuntimeError: If both back-ends fail. The message carries the
            soundfile error and the earlier torchaudio error, so the reason
            the preferred loader was skipped is not lost.
    """
    try:
        import torchaudio  # type: ignore

        wav, sr = torchaudio.load(str(path))
        if wav.ndim > 1:
            # Average over the first axis to collapse the channels.
            wav = wav.mean(dim=0, keepdim=True)
        mono = wav.squeeze(0).float().tolist()
        return mono, int(sr)
    except Exception as torchaudio_err:
        # Best-effort fallback: remember why torchaudio failed instead of
        # discarding it, then try soundfile below.
        primary_error = torchaudio_err

    try:
        import soundfile as sf  # type: ignore

        data, sr = sf.read(str(path), always_2d=False)
        if getattr(data, "ndim", 1) > 1:
            # Average over the second axis to collapse the channels.
            data = data.mean(axis=1)
        return data.tolist(), int(sr)
    except Exception as e:
        raise RuntimeError(
            f"读取音频失败: {path}, error={e}, torchaudio_error={primary_error}"
        ) from e


def _frame_rms(x: List[float], sr: int, frame_ms: float, hop_ms: float) -> Tuple[List[float], float]:
    """Compute short-time RMS values and the RMS of the whole signal.

    The signal is cut into windows of ``frame_ms`` milliseconds that advance
    by ``hop_ms`` milliseconds (each at least one sample long). Windows near
    the end are truncated at the signal boundary rather than dropped.

    Returns:
        ``(frame_rms_values, global_rms)``; ``([], 0.0)`` when the signal is
        empty or the sample rate is not positive.
    """
    if not x or sr <= 0:
        return [], 0.0

    def _energy(samples: List[float]) -> float:
        # Plain left-to-right accumulation of squared samples.
        acc = 0.0
        for sample in samples:
            acc += float(sample) * float(sample)
        return acc

    total = len(x)
    window = max(1, int(sr * frame_ms / 1000.0))
    step = max(1, int(sr * hop_ms / 1000.0))
    overall = math.sqrt(_energy(x) / max(1, total))

    per_frame: List[float] = []
    offset = 0
    while offset < total:
        chunk = x[offset:min(offset + window, total)]
        size = len(chunk)
        per_frame.append(math.sqrt(_energy(chunk) / size) if size else 0.0)
        offset += step
    return per_frame, overall


class AudioAnomalyFilter(Mapper):
    """Flag audio samples whose duration or silence ratio is out of range.

    For each sample the audio file is loaded, its duration and the share of
    near-silent frames are measured, and a report is stored under
    ``ext_params["audio_quality"]``. When a sample is judged invalid and
    ``keepInvalid`` is off, its text and data are blanked.

    Settings read from ``kwargs``:
        minDur: Minimum duration in seconds (default 1.0).
        maxDur: Maximum duration in seconds (default 20000.0).
        silenceRatioTh: A sample is invalid when its silent-frame ratio is
            greater than or equal to this value (default 0.8).
        silenceRmsRatioTh: A frame is silent when its RMS is below
            ``global_rms`` times this value (default 0.05).
        keepInvalid: Keep invalid samples and only tag them (default False).
    """

    # Analysis window and hop, in milliseconds, used for the short-time RMS.
    _FRAME_MS = 25.0
    _HOP_MS = 10.0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.min_dur = float(kwargs.get("minDur", 1.0))
        self.max_dur = float(kwargs.get("maxDur", 20000.0))
        self.silence_ratio_th = float(kwargs.get("silenceRatioTh", 0.8))
        self.silence_rms_ratio_th = float(kwargs.get("silenceRmsRatioTh", 0.05))
        # The switch value may arrive as a string such as 'false'.
        self.keep_invalid = _as_bool(kwargs.get("keepInvalid", False))

    def _resolve_audio_path(self, sample: Dict[str, Any]) -> Path:
        """Return the sample's audio path, or raise FileNotFoundError."""
        raw_path = sample.get(self.filepath_key, "")
        # An empty path must be rejected up front: Path("") resolves to the
        # current working directory, which exists and would slip through.
        if not raw_path:
            raise FileNotFoundError(f"输入音频不存在: {raw_path}")
        wav_path = Path(raw_path).resolve()
        # is_file() also rejects directories, which cannot be decoded.
        if not wav_path.is_file():
            raise FileNotFoundError(f"输入音频不存在: {wav_path}")
        return wav_path

    def _silence_ratio(self, rms_frames: List[float], global_rms: float) -> float:
        """Return the fraction of frames whose RMS is below the silence threshold."""
        if not rms_frames or global_rms <= 0.0:
            # No frames or zero overall energy: treat as fully silent.
            return 1.0
        # The 1e-8 floor keeps the threshold positive for very quiet signals.
        th = max(1e-8, global_rms * float(self.silence_rms_ratio_th))
        silent = sum(1 for r in rms_frames if r < th)
        return float(silent) / float(len(rms_frames))

    def _collect_reasons(self, duration: float, silence_ratio: float) -> List[str]:
        """Return the reasons a sample is invalid; an empty list means it is ok."""
        reasons: List[str] = []
        if duration <= 0.0:
            reasons.append("duration_le_zero")
        elif duration < self.min_dur:
            reasons.append("too_short")
        elif duration > self.max_dur:
            reasons.append("too_long")
        # The silence check is independent of the duration checks, so a
        # sample can carry one duration reason plus "too_much_silence".
        if silence_ratio >= self.silence_ratio_th:
            reasons.append("too_much_silence")
        return reasons

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Analyse one sample, attach the quality report and return the sample.

        Raises:
            FileNotFoundError: If the sample has no audio path or the path
                is not an existing file.
            RuntimeError: Propagated from ``_load_wave_mono`` when the audio
                cannot be decoded.
        """
        start = time.time()
        wav_path = self._resolve_audio_path(sample)

        wav, sr = _load_wave_mono(wav_path)
        duration = float(len(wav)) / float(sr) if sr > 0 else 0.0
        rms_frames, global_rms = _frame_rms(
            wav, sr, frame_ms=self._FRAME_MS, hop_ms=self._HOP_MS
        )
        silence_ratio = self._silence_ratio(rms_frames, global_rms)

        reasons = self._collect_reasons(duration, silence_ratio)
        quality_flag = "invalid" if reasons else "ok"

        report = {
            "quality_flag": quality_flag,
            "duration": round(duration, 3),
            "silence_ratio": round(silence_ratio, 4),
            "global_rms": round(global_rms, 6),
            "reason": ",".join(reasons),
        }
        ext = sample.get(self.ext_params_key, {})
        if not isinstance(ext, dict):
            # Preserve a non-dict payload instead of discarding it.
            ext = {"_raw": ext}
        ext["audio_quality"] = report
        sample[self.ext_params_key] = ext

        if quality_flag == "invalid" and not self.keep_invalid:
            # Blank the content so the sample can be filtered out later; the
            # original author's note attributes this to the Mapper's
            # empty-content filtering.
            sample[self.text_key] = ""
            sample[self.data_key] = b""
        else:
            # Give kept samples a non-empty placeholder text.
            if not sample.get(self.text_key):
                sample[self.text_key] = "ok"
            # NOTE(review): indentation was lost in the reviewed source; this
            # assignment is assumed to sit at the else level, not inside the
            # if above -- confirm against the original file.
            sample[self.data_key] = b""

        logger.info(
            f"fileName: {sample.get(self.filename_key)}, method: AudioAnomalyFilter costs {time.time() - start:6f} s"
        )
        return sample

2 changes: 2 additions & 0 deletions runtime/ops/mapper/audio_anomaly_filter/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
torchaudio
soundfile
52 changes: 52 additions & 0 deletions runtime/ops/mapper/audio_asr_pipeline/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# AudioAsrPipeline 音频预处理与中英ASR流水线算子

## 概述

AudioAsrPipeline 将 `audio_preprocessor` 的推荐流水线封装为一个 DataMate Mapper 算子:标准化、(可选)降噪、(可选)异常过滤、语言识别、切分、ASR 识别与合并。最终合并文本写入 `sample["text"]`,并在 `ext_params` 中记录中间产物路径,便于排查与验收。

## 功能特性

- **端到端流水线**:normalization →(可选)GTCRN →(可选)异常过滤 → LID → split → ASR → merge
- **可配置**:每个关键步骤参数化(降噪开关、过滤阈值、LID 截断秒数、切分长度、ASR 设备等)
- **结果可追溯**:中间产物路径记录在 `ext_params.audio_asr.artifacts`
- **面向验收**:输出合并转写文本到 `sample["text"]`

## 参数说明

| 参数 | 类型 | 默认值 | 说明 |
|---|---|---:|---|
| doDenoise | switch | false | 是否启用 GTCRN 降噪 |
| denoiseModelPath | input | (空) | GTCRN ONNX 模型绝对路径(启用降噪时必填) |
| doAnomalyFilter | switch | true | 是否启用异常语音检测与过滤 |
| minDur | inputNumber | 1.0 | 最小时长(秒) |
| maxDur | inputNumber | 20000.0 | 最大时长(秒) |
| silenceRatioTh | slider | 0.8 | 静音帧比例阈值(0~1) |
| silenceRmsRatioTh | slider | 0.05 | 静音判定阈值比例 |
| lidModelSource | input | (空) | SpeechBrain LID 模型 source(本地目录或 HF repo) |
| lidDevice | select | cpu | LID 推理设备(cpu/cuda/npu) |
| lidMaxSeconds | inputNumber | 3.0 | LID 只取前 N 秒,0=全长 |
| maxSegmentSeconds | inputNumber | 120 | 切分最大秒数 |
| asrDevice | select | auto | ASR 设备参数(auto/cpu/npu) |

## 输入输出

- **输入**:`sample["filePath"]`(音频文件路径)
- **输出**:
- `sample["text"]`:合并后的转写文本(来自 `merged_text.txt`)
- `sample["ext_params"]["audio_asr"]`:
- `lang`:LID 结果(zh/en)
- `artifacts`:中间产物路径(normalized/denoise/lid/split/asr/merged_text)

## 依赖说明

- **Python 依赖**(按启用功能而定):
- normalization/切分:`pydub`、`soundfile`、`numpy`
- LID:`torch`、`torchaudio`、`speechbrain`
- 降噪:`onnxruntime`(以及 GTCRN 模型文件)
- **系统依赖**:
- `pydub` 通常需要 `ffmpeg`

## 版本历史

- **v1.0.0**:首次发布,支持音频标准化/(可选)降噪/过滤/LID/切分/ASR/合并

9 changes: 9 additions & 0 deletions runtime/ops/mapper/audio_asr_pipeline/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

# Register the operator under the name "AudioAsrPipeline" and point the
# registry at the module path that contains its implementation (process.py).
# NOTE(review): presumably the registry imports module_path when the operator
# is first requested -- confirm against datamate.core.base_op.
OPERATORS.register_module(
module_name="AudioAsrPipeline",
module_path="ops.mapper.audio_asr_pipeline.process",
)

Loading