Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion preload.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ async def preload_embedding():

# preload kokoro tts model if enabled
async def preload_kokoro():
if set["tts_kokoro"]:
if set["tts_enabled"] and set["tts_engine"] == "kokoro":
try:
return await kokoro_tts.preload()
except Exception as e:
Expand Down
104 changes: 24 additions & 80 deletions python/api/synthesize.py
Original file line number Diff line number Diff line change
@@ -1,96 +1,40 @@
# api/synthesize.py
# api/synthesize.py - Unified TTS API supporting multiple engines

from python.helpers.api import ApiHandler, Request, Response

from python.helpers import runtime, settings, kokoro_tts
from python.helpers import settings as settings_helper, kokoro_tts, edge_tts

class Synthesize(ApiHandler):
async def process(self, input: dict, request: Request) -> dict | Response:
text = input.get("text", "")
ctxid = input.get("ctxid", "")

if ctxid:
context = self.use_context(ctxid)

# if not await kokoro_tts.is_downloaded():
# context.log.log(type="info", content="Kokoro TTS model is currently being initialized, please wait...")
# Read TTS settings
current_settings = settings_helper.get_settings()
engine = current_settings.get("tts_engine", "kokoro")
voice = current_settings.get("tts_voice", "")
rate = current_settings.get("tts_rate", "+0%")

try:
# # Clean and chunk text for long responses
# cleaned_text = self._clean_text(text)
# chunks = self._chunk_text(cleaned_text)

# if len(chunks) == 1:
# # Single chunk - return as before
# audio = await kokoro_tts.synthesize_sentences(chunks)
# return {"audio": audio, "success": True}
# else:
# # Multiple chunks - return as sequence
# audio_parts = []
# for chunk in chunks:
# chunk_audio = await kokoro_tts.synthesize_sentences([chunk])
# audio_parts.append(chunk_audio)
# return {"audio_parts": audio_parts, "success": True}
if engine == "edge":
audio = await edge_tts.synthesize(text, voice=voice or "en-US-AriaNeural", rate=rate)
else:
# kokoro engine (default)
# Convert rate string like "+20%" to float speed multiplier
speed = self._rate_to_speed(rate)
audio = await kokoro_tts.synthesize_sentences([text], voice=voice, speed=speed)

# audio is chunked on the frontend for better flow
audio = await kokoro_tts.synthesize_sentences([text])
return {"audio": audio, "success": True}
except Exception as e:
return {"error": str(e), "success": False}

# def _clean_text(self, text: str) -> str:
# """Clean text by removing markdown, tables, code blocks, and other formatting"""
# # Remove code blocks
# text = re.sub(r'```[\s\S]*?```', '', text)
# text = re.sub(r'`[^`]*`', '', text)

# # Remove markdown links
# text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

# # Remove markdown formatting
# text = re.sub(r'[*_#]+', '', text)

# # Remove tables (basic cleanup)
# text = re.sub(r'\|[^\n]*\|', '', text)

# # Remove extra whitespace and newlines
# text = re.sub(r'\n+', ' ', text)
# text = re.sub(r'\s+', ' ', text)

# # Remove URLs
# text = re.sub(r'https?://[^\s]+', '', text)

# # Remove email addresses
# text = re.sub(r'\S+@\S+', '', text)

# return text.strip()

# def _chunk_text(self, text: str) -> list[str]:
# """Split text into manageable chunks for TTS"""
# # If text is short enough, return as single chunk
# if len(text) <= 300:
# return [text]

# # Split into sentences first
# sentences = re.split(r'(?<=[.!?])\s+', text)

# chunks = []
# current_chunk = ""

# for sentence in sentences:
# sentence = sentence.strip()
# if not sentence:
# continue

# # If adding this sentence would make chunk too long, start new chunk
# if current_chunk and len(current_chunk + " " + sentence) > 300:
# chunks.append(current_chunk.strip())
# current_chunk = sentence
# else:
# current_chunk += (" " if current_chunk else "") + sentence

# # Add the last chunk if it has content
# if current_chunk.strip():
# chunks.append(current_chunk.strip())

# return chunks if chunks else [text]

@staticmethod
def _rate_to_speed(rate: str) -> float:
    """Convert a percentage rate string into a Kokoro speed multiplier.

    The rate is a signed percentage relative to normal speed, e.g.
    ``"+20%"`` -> ``1.2``, ``"-10%"`` -> ``0.9``, ``"+0%"`` -> ``1.0``.
    Fractional percentages such as ``"+12.5%"`` are accepted.

    Args:
        rate: Rate string; the ``%`` and ``+`` characters are optional.

    Returns:
        A finite, strictly positive speed multiplier. Returns the fallback
        ``1.1`` when ``rate`` cannot be parsed or would produce a zero,
        negative, or non-finite speed.
    """
    fallback = 1.1
    try:
        # float() rather than int() so fractional percentages are honoured
        # instead of silently dropping to the fallback.
        pct = float(rate.replace("%", "").replace("+", ""))
    except (ValueError, AttributeError, TypeError):
        # Unparseable text, or a non-string value such as None or bytes.
        return fallback

    speed = 1.0 + pct / 100.0
    # A rate of -100% or lower gives a speed <= 0, and "nan"/"inf" parse as
    # floats; none of these is a usable multiplier. The chained comparison
    # is False for NaN as well, so one check covers every bad case.
    if not 0.0 < speed < float("inf"):
        return fallback
    return speed
51 changes: 51 additions & 0 deletions python/api/tts_voices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# api/tts_voices.py - Returns available voices for the selected TTS engine

from python.helpers.api import ApiHandler, Request, Response
from python.helpers import settings as settings_helper


# Built-in Kokoro voice list
# Voice ids follow "<prefix>_<stem>"; within this list each prefix maps to one
# locale/gender pair and the display name is the capitalised stem plus gender.
KOKORO_VOICES = [
    {
        "id": f"{prefix}_{stem}",
        "name": f"{stem.capitalize()} ({gender})",
        "language": locale,
        "gender": gender,
    }
    for prefix, locale, gender, stems in (
        (
            "af",
            "en-US",
            "Female",
            (
                "heart", "alloy", "aoede", "bella", "jessica", "kore",
                "nicole", "nova", "river", "sarah", "sky",
            ),
        ),
        (
            "am",
            "en-US",
            "Male",
            ("adam", "echo", "eric", "liam", "michael", "onyx", "puck"),
        ),
        ("bf", "en-GB", "Female", ("emma", "isabella")),
        ("bm", "en-GB", "Male", ("daniel", "fable", "george", "lewis")),
    )
    for stem in stems
]


class TtsVoices(ApiHandler):
    """API handler that lists the voices offered by a requested TTS engine."""

    async def process(self, input: dict, request: Request) -> dict | Response:
        """Return the voice list for ``input["engine"]``.

        ``"edge"`` is resolved through ``edge_tts.get_voices()``, ``"kokoro"``
        returns the built-in ``KOKORO_VOICES`` list, and any other value
        (including a missing key) yields an empty list.

        Returns:
            ``{"voices": [...], "success": True}`` on success, or
            ``{"error": <message>, "success": False}`` if anything raised
            while resolving the voices.
        """
        requested = input.get("engine", "")

        try:
            if requested == "edge":
                # Imported lazily so a failing import is reported through the
                # same error payload as any other failure.
                from python.helpers import edge_tts

                available = await edge_tts.get_voices()
            elif requested == "kokoro":
                available = KOKORO_VOICES
            else:
                available = []
        except Exception as exc:
            return {"error": str(exc), "success": False}

        return {"voices": available, "success": True}
75 changes: 75 additions & 0 deletions python/helpers/edge_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# edge_tts.py - Edge TTS engine wrapper

import base64
import io
import asyncio
import soundfile as sf
from python.helpers.print_style import PrintStyle

# Cached voice list
_voices_cache: list[dict] | None = None


async def get_voices() -> list[dict]:
    """Return the Edge TTS voice catalogue, fetching it at most once.

    The first successful fetch is stored in the module-level
    ``_voices_cache`` and reused by later calls. Each entry has the keys
    ``id`` (short name), ``name`` (last dash-separated part of the short
    name without "Neural", followed by the gender), ``language`` (locale)
    and ``gender``.

    Returns:
        The cached list of voice dicts, or an empty list if the fetch
        failed (the failure is logged and nothing is cached).
    """
    global _voices_cache

    if _voices_cache is None:
        try:
            import edge_tts as et

            listing = await et.list_voices()
            converted = []
            for entry in listing:
                short_name = entry["ShortName"]
                gender = entry["Gender"]
                label = short_name.split("-")[-1].replace("Neural", "").strip()
                converted.append(
                    {
                        "id": short_name,
                        "name": f"{label} ({gender})",
                        "language": entry["Locale"],
                        "gender": gender,
                    }
                )
            # Assigned only after every entry converted cleanly, so a
            # partial result is never cached.
            _voices_cache = converted
        except Exception as e:
            PrintStyle.error(f"Error fetching Edge TTS voices: {e}")
            return []

    return _voices_cache


async def synthesize(text: str, voice: str = "en-US-AriaNeural", rate: str = "+0%") -> str:
    """Speak ``text`` with Edge TTS and return the audio as base64 WAV.

    Args:
        text: Text to synthesize.
        voice: Edge TTS voice short name (e.g. "zh-CN-XiaoxiaoNeural").
        rate: Speed rate string (e.g. "+20%", "-10%", "+0%").

    Returns:
        Base64-encoded WAV audio string.

    Raises:
        ValueError: If the stream produced no audio bytes.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        import edge_tts as et

        stream = et.Communicate(text, voice=voice, rate=rate).stream()

        # Keep only the audio payloads; other chunk types are ignored.
        mp3_bytes = b"".join(
            [part["data"] async for part in stream if part["type"] == "audio"]
        )
        if not mp3_bytes:
            raise ValueError("Edge TTS returned empty audio")

        # Decode the MP3 from an in-memory file, then re-encode it as WAV.
        samples, sample_rate = sf.read(io.BytesIO(mp3_bytes), dtype="float32")

        wav_out = io.BytesIO()
        sf.write(wav_out, samples, sample_rate, format="WAV")

        encoded = base64.b64encode(wav_out.getvalue())
        return encoded.decode("utf-8")

    except Exception as e:
        PrintStyle.error(f"Error in Edge TTS synthesis: {e}")
        raise
18 changes: 10 additions & 8 deletions python/helpers/kokoro_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
warnings.filterwarnings("ignore", category=UserWarning)

_pipeline = None
_voice = "am_puck,am_onyx"
_speed = 1.1
_default_voice = "am_puck,am_onyx"
_default_speed = 1.1
is_updating_model = False


Expand Down Expand Up @@ -86,27 +86,29 @@ def _is_downloaded():
return _pipeline is not None


async def synthesize_sentences(sentences: list[str]):
async def synthesize_sentences(sentences: list[str], voice: str = "", speed: float = 0):
"""Generate audio for multiple sentences and return concatenated base64 audio"""
try:
# return await runtime.call_development_function(_synthesize_sentences, sentences)
return await _synthesize_sentences(sentences)
# return await runtime.call_development_function(_synthesize_sentences, sentences, voice, speed)
return await _synthesize_sentences(sentences, voice, speed)
except Exception as e:
# if not runtime.is_development():
raise e
# Fallback to direct execution if RFC fails in development
# return await _synthesize_sentences(sentences)
# return await _synthesize_sentences(sentences, voice, speed)


async def _synthesize_sentences(sentences: list[str]):
async def _synthesize_sentences(sentences: list[str], voice: str = "", speed: float = 0):
await _preload()

tts_voice = voice or _default_voice
tts_speed = speed or _default_speed
combined_audio = []

try:
for sentence in sentences:
if sentence.strip():
segments = _pipeline(sentence.strip(), voice=_voice, speed=_speed) # type: ignore
segments = _pipeline(sentence.strip(), voice=tts_voice, speed=tts_speed) # type: ignore
segment_list = list(segments)

for segment in segment_list:
Expand Down
10 changes: 8 additions & 2 deletions python/helpers/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,10 @@ class Settings(TypedDict):
stt_silence_duration: int
stt_waiting_timeout: int

tts_kokoro: bool
tts_enabled: bool
tts_engine: str
tts_voice: str
tts_rate: str

mcp_servers: str
mcp_client_init_timeout: int
Expand Down Expand Up @@ -588,7 +591,10 @@ def get_default_settings() -> Settings:
stt_silence_threshold=get_default_value("stt_silence_threshold", 0.3),
stt_silence_duration=get_default_value("stt_silence_duration", 1000),
stt_waiting_timeout=get_default_value("stt_waiting_timeout", 2000),
tts_kokoro=get_default_value("tts_kokoro", True),
tts_enabled=get_default_value("tts_enabled", True),
tts_engine=get_default_value("tts_engine", "kokoro"),
tts_voice=get_default_value("tts_voice", ""),
tts_rate=get_default_value("tts_rate", "+0%"),
mcp_servers=get_default_value("mcp_servers", '{\n "mcpServers": {}\n}'),
mcp_client_init_timeout=get_default_value("mcp_client_init_timeout", 10),
mcp_client_tool_timeout=get_default_value("mcp_client_tool_timeout", 120),
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ flaredantic==0.1.5
GitPython==3.1.43
inputimeout==1.0.4
kokoro>=0.9.2
edge-tts>=7.2.7
simpleeval==1.0.3
langchain-core==0.3.49
langchain-community==0.3.19
Expand Down
Loading