From aeadb5833475ca2927ab04b484565d1e6c89c25d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 12:49:07 +0000 Subject: [PATCH 1/4] Initial plan From 3829b26413b671a2861bfc2f30aaabb9940db622 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 12:54:41 +0000 Subject: [PATCH 2/4] Fix MP3 decoding failure in COVER mode by converting non-WAV audio to WAV before passing to dit-vae Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com> --- server/src/services/acestep.ts | 49 ++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/server/src/services/acestep.ts b/server/src/services/acestep.ts index 53663a9..2fd33af 100644 --- a/server/src/services/acestep.ts +++ b/server/src/services/acestep.ts @@ -254,6 +254,49 @@ function resolveAudioPath(audioUrl: string): string { return audioUrl; } +/** + * Ensures the audio file at the given path is in PCM WAV format, which is + * required by the dit-vae binary for the --src-audio argument. + * + * If the file is already a WAV it is returned as-is. Any other format + * (MP3, FLAC, M4A, AAC, …) is converted with ffmpeg and the resulting WAV + * file is placed in tmpDir. The converted path is returned. 
+ */ +async function ensureWavFormat(audioPath: string, tmpDir: string): Promise { + const ext = path.extname(audioPath).toLowerCase(); + if (ext === '.wav') return audioPath; + + const outPath = path.join(tmpDir, 'src_audio_converted.wav'); + console.log(`[Audio] Converting ${ext} → WAV: ${audioPath} → ${outPath}`); + + await new Promise((resolve, reject) => { + const proc = spawn('ffmpeg', [ + '-y', // overwrite output without asking + '-i', audioPath, // input file (any format ffmpeg supports) + '-vn', // drop any video stream + '-acodec', 'pcm_s16le', // 16-bit PCM — universally readable WAV + outPath, + ], { stdio: 'pipe' }); + + let stderr = ''; + proc.stderr?.on('data', (chunk: Buffer) => { stderr += chunk.toString(); }); + proc.on('error', (err) => { + reject(new Error(`ffmpeg not found or failed to start: ${err.message}`)); + }); + proc.on('close', (code) => { + if (code === 0) { + console.log(`[Audio] Converted to WAV successfully: ${outPath}`); + resolve(); + } else { + console.error(`[Audio] Failed to convert ${ext} to WAV: ${audioPath}`); + reject(new Error(`ffmpeg exited with code ${code} converting "${audioPath}":\n${stderr.slice(-1000)}`)); + } + }); + }); + + return outPath; +} + // --------------------------------------------------------------------------- // Spawn mode: run these step.cpp binaries in a two-step pipeline // Step 1: ace-qwen3 — LLM generates lyrics + audio codes from caption @@ -640,9 +683,11 @@ async function runViaSpawn( const batchSize = Math.min(Math.max(params.batchSize ?? 1, 1), 8); if (batchSize > 1) ditArgs.push('--batch', String(batchSize)); - // Cover and repaint modes both require a source audio file + // Cover and repaint modes both require a source audio file. + // dit-vae can only read WAV; convert MP3/FLAC/etc. to PCM WAV first. 
if (params.sourceAudioUrl) { - const srcAudioPath = resolveAudioPath(params.sourceAudioUrl); + const resolvedPath = resolveAudioPath(params.sourceAudioUrl); + const srcAudioPath = await ensureWavFormat(resolvedPath, tmpDir); ditArgs.push('--src-audio', srcAudioPath); } ditArgs.push(...parseExtraArgs(process.env.DIT_VAE_EXTRA_ARGS)); From 80becdadc95d0157df7c9a76afc49e75d1a9f0cf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 8 Mar 2026 13:07:05 +0000 Subject: [PATCH 3/4] Fix AUDIO_DIR path mismatch: single source of truth for audio directory across storage, serving, and spawn Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com> --- .env.example | 6 ++- backend/README.md | 57 ++++++++++++++++++++++++++++ server/src/config/index.ts | 9 +++-- server/src/index.ts | 5 ++- server/src/services/acestep.ts | 48 +---------------------- server/src/services/storage/local.ts | 11 +++--- 6 files changed, 77 insertions(+), 59 deletions(-) diff --git a/.env.example b/.env.example index e76874a..f24a9ec 100644 --- a/.env.example +++ b/.env.example @@ -39,7 +39,11 @@ MODELS_DIR=./models # ACESTEP_CPP_BRANCH=main # ── Storage ─────────────────────────────────────────────────────────────────── -AUDIO_DIR=./public/audio +# Audio directory for generated songs and uploaded reference tracks. +# Relative paths are resolved from the project root (APP_ROOT). +# This is the single source of truth: LocalStorageProvider (writes), +# Express /audio/ endpoint (serves), and the spawn service (reads) all use it. +AUDIO_DIR=./server/public/audio # ── Auth ────────────────────────────────────────────────────────────────────── # Change this to a long random string in any multi-user or network-exposed setup. diff --git a/backend/README.md b/backend/README.md index cfd75ea..cc8cb45 100644 --- a/backend/README.md +++ b/backend/README.md @@ -41,3 +41,60 @@ you can point `ACESTEP_API_URL` at it and leave `ACESTEP_BIN` empty. 
The Node.js service already has an HTTP-client fallback for exactly this case. See `server/src/services/acestep.ts` for the dual-mode implementation. + +--- + +## Upstream C++ dependency: `audiohacking/acestep.cpp` + +This UI depends on the custom fork at +[`audiohacking/acestep.cpp` branch `copilot/add-wav-mp3-conversion`](https://github.com/audiohacking/acestep.cpp/tree/copilot/add-wav-mp3-conversion) +which adds native MP3 decoding to the `dit-vae` binary via `dr_mp3.h`. + +### Known issues in `src/audio.h` (pending upstream fix) + +**Bug: mono audio is not upmixed to stereo before encoding** + +`read_audio()` returns a native-channel-count buffer (`[T x n_channels]` floats), +but `vae_enc_compute()` in `vae-enc.h` always reads two channels: + +```cpp +// vae-enc.h (hardcodes stereo access — UB when n_channels == 1) +for (int c = 0; c < 2; c++) { + for (int t = 0; t < T_audio; t++) { + m->scratch_in[c * T_audio + t] = audio[t * 2 + c]; + } +} +``` + +For stereo inputs (most user uploads) this works correctly. +For mono inputs the second channel index reads out-of-bounds memory. + +**Required fix in `src/audio.h`** — always return interleaved stereo `[T x 2]`. +Add this block after resampling completes (before the final `return out`): + +```c +// Upmix mono -> stereo, or use first two channels of N-ch audio. +if ((int) channels != 2) { + int n_ch_src = (int) channels; + float *stereo = (float *) malloc((size_t) T_raw * 2 * sizeof(float)); + if (!stereo) { + fprintf(stderr, "[Audio] Out of memory converting to stereo\n"); + free(out); + return NULL; + } + for (int t = 0; t < T_raw; t++) { + float L = out[(size_t) t * n_ch_src + 0]; + float R = (n_ch_src > 1) ? 
out[(size_t) t * n_ch_src + 1] : L; + stereo[t * 2 + 0] = L; + stereo[t * 2 + 1] = R; + } + free(out); + out = stereo; + fprintf(stderr, "[Audio] Converted %dch -> stereo\n", n_ch_src); +} +*n_channels = 2; +``` + +**Also: replace `drwav_free(raw, NULL)` with `free(raw)`** in the resampling +branch — both `dr_wav` and `dr_mp3` use the system allocator by default, so +`free()` is always safe regardless of which decoder produced the buffer. diff --git a/server/src/config/index.ts b/server/src/config/index.ts index 14269e8..7abb41c 100644 --- a/server/src/config/index.ts +++ b/server/src/config/index.ts @@ -224,10 +224,11 @@ export const config = { storage: { provider: 'local' as const, - // Audio directory must match where LocalStorageProvider writes files and - // where Express serves /audio/ from (server/src/index.ts: '../public/audio'). - // Both resolve to /public/audio, so we use SERVER_ROOT here. - // AUDIO_DIR env override is still supported (resolved against APP_ROOT). + // Single source of truth for the audio directory. + // LocalStorageProvider, Express (/audio/), and the spawn service all read + // this value so they always point at the same filesystem location. + // Default: <SERVER_ROOT>/public/audio (SERVER_ROOT = server/). + // Override via AUDIO_DIR in .env (relative paths are resolved from APP_ROOT). audioDir: resolveFromRoot(process.env.AUDIO_DIR || path.join(SERVER_ROOT, 'public', 'audio')), }, diff --git a/server/src/index.ts b/server/src/index.ts index f6bdbfb..e336f78 100644 --- a/server/src/index.ts +++ b/server/src/index.ts @@ -79,8 +79,9 @@ app.use(cors({ app.use(express.json()); -// Serve static audio files -app.use('/audio', express.static(path.join(__dirname, '../public/audio'))); +// Serve static audio files from the configured audio directory so that any +// AUDIO_DIR env override is honoured consistently across upload, spawn, and serving. 
+app.use('/audio', express.static(config.storage.audioDir)); // Audio Editor (AudioMass) - needs relaxed CSP for inline scripts and external images app.use('/editor', (req, res, next) => { diff --git a/server/src/services/acestep.ts b/server/src/services/acestep.ts index 2fd33af..1e40d8e 100644 --- a/server/src/services/acestep.ts +++ b/server/src/services/acestep.ts @@ -254,49 +254,6 @@ function resolveAudioPath(audioUrl: string): string { return audioUrl; } -/** - * Ensures the audio file at the given path is in PCM WAV format, which is - * required by the dit-vae binary for the --src-audio argument. - * - * If the file is already a WAV it is returned as-is. Any other format - * (MP3, FLAC, M4A, AAC, …) is converted with ffmpeg and the resulting WAV - * file is placed in tmpDir. The converted path is returned. - */ -async function ensureWavFormat(audioPath: string, tmpDir: string): Promise { - const ext = path.extname(audioPath).toLowerCase(); - if (ext === '.wav') return audioPath; - - const outPath = path.join(tmpDir, 'src_audio_converted.wav'); - console.log(`[Audio] Converting ${ext} → WAV: ${audioPath} → ${outPath}`); - - await new Promise((resolve, reject) => { - const proc = spawn('ffmpeg', [ - '-y', // overwrite output without asking - '-i', audioPath, // input file (any format ffmpeg supports) - '-vn', // drop any video stream - '-acodec', 'pcm_s16le', // 16-bit PCM — universally readable WAV - outPath, - ], { stdio: 'pipe' }); - - let stderr = ''; - proc.stderr?.on('data', (chunk: Buffer) => { stderr += chunk.toString(); }); - proc.on('error', (err) => { - reject(new Error(`ffmpeg not found or failed to start: ${err.message}`)); - }); - proc.on('close', (code) => { - if (code === 0) { - console.log(`[Audio] Converted to WAV successfully: ${outPath}`); - resolve(); - } else { - console.error(`[Audio] Failed to convert ${ext} to WAV: ${audioPath}`); - reject(new Error(`ffmpeg exited with code ${code} converting "${audioPath}":\n${stderr.slice(-1000)}`)); 
- } - }); - }); - - return outPath; -} - // --------------------------------------------------------------------------- // Spawn mode: run these step.cpp binaries in a two-step pipeline // Step 1: ace-qwen3 — LLM generates lyrics + audio codes from caption @@ -684,10 +641,9 @@ async function runViaSpawn( if (batchSize > 1) ditArgs.push('--batch', String(batchSize)); // Cover and repaint modes both require a source audio file. - // dit-vae can only read WAV; convert MP3/FLAC/etc. to PCM WAV first. + // dit-vae reads WAV or MP3 natively (via dr_wav / dr_mp3 in audio.h). if (params.sourceAudioUrl) { - const resolvedPath = resolveAudioPath(params.sourceAudioUrl); - const srcAudioPath = await ensureWavFormat(resolvedPath, tmpDir); + const srcAudioPath = resolveAudioPath(params.sourceAudioUrl); ditArgs.push('--src-audio', srcAudioPath); } ditArgs.push(...parseExtraArgs(process.env.DIT_VAE_EXTRA_ARGS)); diff --git a/server/src/services/storage/local.ts b/server/src/services/storage/local.ts index 903582c..94f5979 100644 --- a/server/src/services/storage/local.ts +++ b/server/src/services/storage/local.ts @@ -1,17 +1,16 @@ import { writeFile, unlink, stat, mkdir, copyFile } from 'fs/promises'; import path from 'path'; -import { fileURLToPath } from 'url'; import type { StorageProvider } from './index.js'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); -const AUDIO_DIR = path.join(__dirname, '../../../public/audio'); +import { config } from '../../config/index.js'; export class LocalStorageProvider implements StorageProvider { private audioDir: string; constructor() { - this.audioDir = AUDIO_DIR; + // Derive the audio directory from the central config so that the storage + // provider always writes to the same location the spawn service resolves + // paths from (config.storage.audioDir, which honours the AUDIO_DIR env var). 
+ this.audioDir = config.storage.audioDir; } async upload(key: string, data: Buffer, _contentType: string): Promise { From c2230f20ee7154104c2841e3585f6de3b10e7327 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sun, 8 Mar 2026 14:12:28 +0100 Subject: [PATCH 4/4] Delete backend/README.md --- backend/README.md | 100 ---------------------------------------------- 1 file changed, 100 deletions(-) delete mode 100644 backend/README.md diff --git a/backend/README.md b/backend/README.md deleted file mode 100644 index cc8cb45..0000000 --- a/backend/README.md +++ /dev/null @@ -1,100 +0,0 @@ -# backend/ — design note - -## Why there is no custom C++ HTTP server here - -An earlier design wrapped `acestep-generate` in a second C++ HTTP server process. -That was removed because it added unnecessary complexity: - -| Problem | Impact | -|---------|--------| -| Two processes to manage (Node.js + C++ server) | harder to deploy, restart, monitor | -| C++ server used `popen()` with shell-built strings | fragile, platform-specific, injection surface | -| LoRA state split across two processes | race conditions, stale cache | -| Extra HTTP hop for every generation request | added latency and error surface | -| Users need to build *two* C++ projects | poor DX | - -## Current architecture - -``` -Browser - │ - │ HTTP - ▼ -Node.js Express (port 3001) - │ handles: auth, songs DB, playlists, audio storage, job queue - │ - │ child_process.spawn(bin, args, { shell: false }) - ▼ -acestep-generate ←── GGUF model on GPU/CPU - │ - └─► writes audio files → ./public/audio/ -``` - -The Node.js server reads `ACESTEP_BIN` from `.env` and spawns `acestep-generate` -directly — the same pattern used by llama.cpp, whisper.cpp, and similar tools. -No shell is involved, so there is no injection risk. 
- -## When a separate HTTP server *would* make sense - -If `acestep.cpp` ever ships a **built-in** HTTP server mode (like `llama-server`), -you can point `ACESTEP_API_URL` at it and leave `ACESTEP_BIN` empty. -The Node.js service already has an HTTP-client fallback for exactly this case. - -See `server/src/services/acestep.ts` for the dual-mode implementation. - ---- - -## Upstream C++ dependency: `audiohacking/acestep.cpp` - -This UI depends on the custom fork at -[`audiohacking/acestep.cpp` branch `copilot/add-wav-mp3-conversion`](https://github.com/audiohacking/acestep.cpp/tree/copilot/add-wav-mp3-conversion) -which adds native MP3 decoding to the `dit-vae` binary via `dr_mp3.h`. - -### Known issues in `src/audio.h` (pending upstream fix) - -**Bug: mono audio is not upmixed to stereo before encoding** - -`read_audio()` returns a native-channel-count buffer (`[T x n_channels]` floats), -but `vae_enc_compute()` in `vae-enc.h` always reads two channels: - -```cpp -// vae-enc.h (hardcodes stereo access — UB when n_channels == 1) -for (int c = 0; c < 2; c++) { - for (int t = 0; t < T_audio; t++) { - m->scratch_in[c * T_audio + t] = audio[t * 2 + c]; - } -} -``` - -For stereo inputs (most user uploads) this works correctly. -For mono inputs the second channel index reads out-of-bounds memory. - -**Required fix in `src/audio.h`** — always return interleaved stereo `[T x 2]`. -Add this block after resampling completes (before the final `return out`): - -```c -// Upmix mono -> stereo, or use first two channels of N-ch audio. -if ((int) channels != 2) { - int n_ch_src = (int) channels; - float *stereo = (float *) malloc((size_t) T_raw * 2 * sizeof(float)); - if (!stereo) { - fprintf(stderr, "[Audio] Out of memory converting to stereo\n"); - free(out); - return NULL; - } - for (int t = 0; t < T_raw; t++) { - float L = out[(size_t) t * n_ch_src + 0]; - float R = (n_ch_src > 1) ? 
out[(size_t) t * n_ch_src + 1] : L; - stereo[t * 2 + 0] = L; - stereo[t * 2 + 1] = R; - } - free(out); - out = stereo; - fprintf(stderr, "[Audio] Converted %dch -> stereo\n", n_ch_src); -} -*n_channels = 2; -``` - -**Also: replace `drwav_free(raw, NULL)` with `free(raw)`** in the resampling -branch — both `dr_wav` and `dr_mp3` use the system allocator by default, so -`free()` is always safe regardless of which decoder produced the buffer.