# ComfyUI-FishAudioS2/__init__.py (siliconflow/ComfyUI-FishAudioS2, main branch)

"""ComfyUI custom nodes for Fish Audio S2-Pro TTS.

Provides four nodes:
  - FishS2TTS             — text → speech, 80+ languages, inline emotion tags
  - FishS2VoiceCloneTTS   — reference audio + text → cloned-voice speech
  - FishS2MultiSpeakerTTS — multi-speaker conversation synthesis in one pass
  - FishS2MultiSpeakerSplitTTS — multi-speaker with per-speaker audio outputs

Required pip packages are auto-installed on startup.
Model weights are auto-downloaded from HuggingFace on first inference.
"""

__version__ = "0.4.4"

import importlib
import logging
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict

# ---------------------------------------------------------------------------
# Bundled fish-speech source.
# fish_speech_src/ is placed at the front of sys.path so that
# `import fish_speech` and `import tools` resolve to the copy shipped with
# this node instead of a pip-installed one (pip cannot install it reliably
# into embedded Python).
# ---------------------------------------------------------------------------
_HERE = Path(__file__).parent.resolve()
_FISH_SRC = _HERE.joinpath("fish_speech_src")
# Only touch sys.path when the bundled directory exists, and never add the
# same entry twice.
if _FISH_SRC.is_dir():
    _fish_src_str = str(_FISH_SRC)
    if sys.path.count(_fish_src_str) == 0:
        sys.path.insert(0, _fish_src_str)

# Package logger. Propagation is switched off so records are emitted only by
# the handler attached below, with the "[FishAudioS2]" prefix.
logger = logging.getLogger("FishAudioS2")
logger.propagate = False

# Configure the logger only if it has no handler yet; an already-configured
# logger is left exactly as it is.
if len(logger.handlers) == 0:
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter("[FishAudioS2] %(message)s"))
    logger.setLevel(logging.INFO)
    logger.addHandler(_handler)


# ---------------------------------------------------------------------------
# pip helper — works for portable embedded Python, venv, conda, system Python
# ---------------------------------------------------------------------------

def _find_pip() -> list[str]:
    """
    Build the pip invocation bound to the interpreter that is running now.

    The command is always ``<sys.executable> -m pip``, which covers:

      * portable embedded Python  (python_embeded/python.exe -m pip)
      * venv / conda              (<venv>/bin/python -m pip)
      * system Python             (python -m pip)

    Going through the interpreter rather than a ``pip`` executable matters
    because it targets the active environment instead of whichever pip is
    first on PATH, it works when pip exists only as a module, and it works
    in embedded Python where Scripts/ may not be on PATH.

    Returns:
        The argv prefix to which pip sub-commands and arguments are appended.
    """
    interpreter = sys.executable
    return [interpreter, "-m", "pip"]


def _pip_install(spec: str) -> bool:
    """
    Run ``pip install`` for *spec* in the current interpreter's environment.

    Args:
        spec: Requirement text, optionally followed by pip flags such as
            ``--no-deps``. It is split on whitespace so each token reaches
            pip as a separate argument.

    Returns:
        True when pip exits with status 0; False on a non-zero exit, on
        timeout (300 seconds), or on any other error while running pip.
        Failures are logged, never raised.
    """
    cmd = [*_find_pip(), "install", *spec.split()]
    logger.info(f"Running: {' '.join(cmd)}")
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=300,
        )
        if result.returncode != 0:
            logger.error(f"pip install failed for '{spec}':\n{result.stderr.strip()}")
            return False

        logger.info(f"Successfully installed: {spec}")
        # Drop the import system's cached directory listings so the freshly
        # installed package can be imported without restarting the process.
        importlib.invalidate_caches()
        return True
    except subprocess.TimeoutExpired:
        logger.error(f"pip install timed out for: {spec}")
        return False
    except Exception as e:
        # Best-effort helper: report the problem and let the caller decide.
        logger.error(f"pip install error for '{spec}': {e}")
        return False


# Runtime dependencies checked at startup, as (import_name, pip_install_spec).
# A pip spec may carry extra pip flags after the requirement (e.g. --no-deps).
# fish_speech itself is bundled in fish_speech_src/ and is NOT installed via
# pip; only its runtime dependencies are listed. torch is intentionally absent.
_REQUIRED = [
    # --- general runtime dependencies --------------------------------------
    ("numpy", "numpy"),
    ("tqdm", "tqdm"),
    ("soundfile", "soundfile"),
    ("loguru", "loguru"),
    ("transformers", "transformers>=4.45.2"),
    ("einops", "einops>=0.7.0"),
    ("librosa", "librosa>=0.10.1"),
    ("rich", "rich>=13.5.3"),
    ("ormsgpack", "ormsgpack"),
    ("pydantic", "pydantic==2.9.2"),
    ("tiktoken", "tiktoken>=0.8.0"),
    ("cachetools", "cachetools"),
    ("zstandard", "zstandard>=0.22.0"),
    ("resampy", "resampy>=0.4.3"),
    ("safetensors", "safetensors>=0.4.0"),
    ("pyrootutils", "pyrootutils>=1.0.4"),
    ("natsort", "natsort>=8.4.0"),
    ("loralib", "loralib>=0.1.2"),
    ("hydra", "hydra-core>=1.3.2"),
    # einx ("einx==0.2.2") is deliberately left out: it is an optional
    # dependency used only by certain attention backends, is not hard-required
    # for TTS inference, and can fail to import in some environments (e.g. jax
    # namespace conflicts).
    #
    # --- direct runtime imports of dac / audiotools -------------------------
    # These are not training-only dependencies, and they must be present
    # before dac/audiotools because those two are installed with --no-deps:
    #   flatten_dict:        audiotools/core/util.py
    #   importlib_resources: audiotools/core/playback.py
    #   julius:              audiotools/core/dsp.py, loudness.py
    #   randomname:          audiotools/ml/experiment.py
    #   ffmpy:               audiotools/core/ffmpeg.py (via audio_signal.py FFMPEGMixin)
    #   argbind:             dac/utils/__init__.py (imported by dac/__init__.py)
    #   tensorboard:         audiotools/ml/__init__.py (imported at module load)
    ("flatten_dict", "flatten-dict"),
    ("importlib_resources", "importlib-resources"),
    ("julius", "julius"),
    ("randomname", "randomname"),
    ("ffmpy", "ffmpy"),
    ("argbind", "argbind"),
    ("tensorboard", "tensorboard"),
    # --- dac / audiotools ----------------------------------------------------
    # Installed with --no-deps so their protobuf<5 upper bound is not enforced
    # on the environment. The runtime dependencies that matter for inference
    # (numpy, torch, einops, etc.) are covered by the entries above. protobuf
    # is not needed for TTS inference; it is only used by fish-speech's
    # training dataset tooling, which is never called here.
    ("dac", "descript-audio-codec --no-deps"),
    ("audiotools", "descript-audiotools>=0.7.2 --no-deps"),
    ("bitsandbytes", "bitsandbytes"),
]


def _ensure_fish_source() -> bool:
    """
    Make the bundled fish-speech source importable and confirm that it is.

    The source ships inside the node folder (fish_speech_src/); neither git
    nor pip is involved for fish_speech itself.

    Returns:
        True when ``fish_speech.models`` imports successfully. False when the
        bundled directory is missing or the import raises ImportError; the
        reason is logged in both cases.
    """
    if not _FISH_SRC.is_dir():
        logger.error(
            f"fish_speech_src/ not found at {_FISH_SRC}\n"
            "The bundled fish-speech source is missing from the node folder."
        )
        return False

    # Put the bundled copy first on sys.path, unless it is already listed.
    bundled_path = str(_FISH_SRC)
    if bundled_path not in sys.path:
        sys.path.insert(0, bundled_path)

    # Importing the subpackage is the actual importability check.
    try:
        importlib.import_module("fish_speech.models")
    except ImportError as e:
        logger.error(f"fish_speech not importable from {_FISH_SRC}: {e}")
        return False
    return True

# pip can replace the CUDA-enabled torch with a different build while
# resolving the dependencies auto-installed by _ensure_dependencies().
# fish_speech itself is bundled and never pip-installed, so its own torch pin
# is not what triggers this. When torch is found to have lost CUDA support it
# is re-installed from the PyTorch CUDA wheel index.
def _restore_torch() -> None:
    """
    Re-install torch/torchaudio from the CUDA wheel index if torch is not a CUDA build.

    Nothing is done when:
      * torch cannot be imported (a warning is logged),
      * torch is already a CUDA build: its version string carries a ``+cu``
        local tag or ``torch.version.cuda`` is set,
      * torch is a ROCm build (``torch.version.hip`` is set); replacing it
        with CUDA wheels would break an AMD setup.

    Otherwise ``torch`` and ``torchaudio`` are installed from the cu128 index
    via _pip_install(); that helper logs the outcome, and this function
    ignores its return value.
    """
    try:
        import torch
    except ImportError:
        logger.warning("torch not found — skipping restore.")
        return

    version = torch.__version__
    build_info = getattr(torch, "version", None)

    # The "+cu" substring test alone misses CUDA builds whose version string
    # has no local tag, so the build metadata is consulted as well.
    if "+cu" in version or getattr(build_info, "cuda", None):
        logger.info(f"torch {version} is a CUDA build — no restore needed.")
        return

    # ROCm wheels report e.g. "+rocm6.0": no "+cu", yet nothing is broken.
    if getattr(build_info, "hip", None):
        logger.info(f"torch {version} is a ROCm build — leaving it untouched.")
        return

    logger.warning(
        f"torch {version} is NOT a CUDA build — a dependency install may have "
        "replaced it. Restoring CUDA torch..."
    )

    # The wheel index is fixed. The earlier nvidia-smi probe was removed
    # because its output was never read and the tag was "cu128" on every path.
    cuda_tag = "cu128"
    index_url = f"https://download.pytorch.org/whl/{cuda_tag}"
    logger.info(f"Restoring torch with: --index-url {index_url}")
    _pip_install(f"torch torchaudio --index-url {index_url}")


def _ensure_dependencies() -> bool:
    """
    Import every package in _REQUIRED, pip-installing the ones that are missing.

    For each (import_name, pip_spec) entry the import is attempted; on
    ImportError the spec is installed with _pip_install() and the import is
    retried once. If at least one install succeeded, _restore_torch() runs
    afterwards.

    Returns:
        True when every entry ended up importable, False otherwise. On
        failure the manual ``pip install`` commands are logged.
    """
    failed_specs: list[str] = []
    any_installed = False

    for import_name, pip_spec in _REQUIRED:
        try:
            __import__(import_name)
        except ImportError as first_err:
            logger.warning(
                f"'{import_name}' not found — auto-installing from: {pip_spec}\n"
                f"  ImportError: {first_err}\n"
                f"  sys.path: {sys.path}\n"
                f"  sys.modules entry: {sys.modules.get(import_name, '<not in sys.modules>')}"
            )
        else:
            # Already importable: nothing to install.
            continue

        if not _pip_install(pip_spec):
            failed_specs.append(pip_spec)
            continue

        any_installed = True
        # Retry once; a package may install fine yet stay unimportable until
        # the process restarts.
        try:
            __import__(import_name)
        except ImportError as retry_err:
            logger.error(
                f"Installed '{pip_spec}' but '{import_name}' still "
                f"cannot be imported. Please restart ComfyUI.\n"
                f"  ImportError: {retry_err}\n"
                f"  sys.path: {sys.path}\n"
                f"  sys.modules entry: {sys.modules.get(import_name, '<not in sys.modules>')}"
            )
            failed_specs.append(pip_spec)

    # An auto-install can silently pull in a CPU torch as a transitive
    # dependency (e.g. of transformers or bitsandbytes), especially in
    # embedded Python where the CUDA torch has no pip metadata record.
    # Check that torch is still a CUDA build.
    if any_installed:
        _restore_torch()

    all_ok = not failed_specs
    if not all_ok:
        manual_lines = [
            f"  {sys.executable} -m pip install {s}" for s in failed_specs
        ]
        logger.error(
            "Auto-install failed for some packages. "
            "Install them manually then restart ComfyUI:\n"
            + "\n".join(manual_lines)
        )
    return all_ok


# ---------------------------------------------------------------------------
# Node registration
# ---------------------------------------------------------------------------

# Both mappings start empty, so the module still exposes them when
# registration is skipped or fails.
NODE_CLASS_MAPPINGS: Dict[str, Any] = {}
NODE_DISPLAY_NAME_MAPPINGS: Dict[str, str] = {}

# Nodes are registered only when the bundled source imports AND every
# dependency is available; the dependency check does not run if the source
# check fails (short-circuit).
if not (_ensure_fish_source() and _ensure_dependencies()):
    logger.warning(
        "FishAudioS2 nodes not registered — "
        "fix dependency errors above and restart ComfyUI."
    )
else:
    try:
        # NOTE(review): _register_folder is defined in nodes/loader.py, which
        # is not visible here — confirm what it registers.
        from .nodes.loader import _register_folder
        _register_folder()

        from .nodes.tts_node import FishS2TTS
        from .nodes.voice_clone_node import FishS2VoiceCloneTTS
        from .nodes.multi_speaker_node import FishS2MultiSpeakerTTS
        from .nodes.multi_speaker_split_node import FishS2MultiSpeakerSplitTTS

        # (node id, node class, display name), in registration order.
        _node_specs = (
            ("FishS2TTS", FishS2TTS, "Fish S2 TTS"),
            ("FishS2VoiceCloneTTS", FishS2VoiceCloneTTS, "Fish S2 Voice Clone TTS"),
            ("FishS2MultiSpeakerTTS", FishS2MultiSpeakerTTS, "Fish S2 Multi-Speaker TTS"),
            (
                "FishS2MultiSpeakerSplitTTS",
                FishS2MultiSpeakerSplitTTS,
                "Fish S2 Multi-Speaker Split TTS",
            ),
        )
        NODE_CLASS_MAPPINGS = {node_id: node_cls for node_id, node_cls, _ in _node_specs}
        NODE_DISPLAY_NAME_MAPPINGS = {node_id: title for node_id, _, title in _node_specs}

        logger.info(
            f"Registered {len(NODE_CLASS_MAPPINGS)} nodes "
            f"(v{__version__}): {', '.join(NODE_DISPLAY_NAME_MAPPINGS.values())}"
        )

    except Exception as e:
        # Any failure while importing or registering nodes is logged with its
        # traceback instead of propagating out of the package import.
        logger.error(f"Failed to register nodes: {e}", exc_info=True)

__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "__version__"]