From aeadb5833475ca2927ab04b484565d1e6c89c25d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 8 Mar 2026 12:49:07 +0000
Subject: [PATCH 1/4] Initial plan


From 3829b26413b671a2861bfc2f30aaabb9940db622 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 8 Mar 2026 12:54:41 +0000
Subject: [PATCH 2/4] Fix MP3 decoding failure in COVER mode by converting
 non-WAV audio to WAV before passing to dit-vae

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 server/src/services/acestep.ts | 49 ++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)
diff --git a/server/src/services/acestep.ts b/server/src/services/acestep.ts
index 53663a9..2fd33af 100644
--- a/server/src/services/acestep.ts
+++ b/server/src/services/acestep.ts
@@ -254,6 +254,49 @@ function resolveAudioPath(audioUrl: string): string {
   return audioUrl;
 }
 
+/**
+ * Ensures the audio file at the given path is in PCM WAV format, which is
+ * required by the dit-vae binary for the --src-audio argument.
+ *
+ * If the file is already a WAV it is returned as-is.  Any other format
+ * (MP3, FLAC, M4A, AAC, …) is converted with ffmpeg and the resulting WAV
+ * file is placed in tmpDir.  The converted path is returned.
+ */
+async function ensureWavFormat(audioPath: string, tmpDir: string): Promise<string> {
+  const ext = path.extname(audioPath).toLowerCase();
+  if (ext === '.wav') return audioPath;
+
+  const outPath = path.join(tmpDir, 'src_audio_converted.wav');
+  console.log(`[Audio] Converting ${ext} → WAV: ${audioPath} → ${outPath}`);
+
+  await new Promise<void>((resolve, reject) => {
+    const proc = spawn('ffmpeg', [
+      '-y',              // overwrite output without asking
+      '-i', audioPath,   // input file (any format ffmpeg supports)
+      '-vn',             // drop any video stream
+      '-acodec', 'pcm_s16le',  // 16-bit PCM — universally readable WAV
+      outPath,
+    ], { stdio: 'pipe' });
+
+    let stderr = '';
+    proc.stderr?.on('data', (chunk: Buffer) => { stderr += chunk.toString(); });
+    proc.on('error', (err) => {
+      reject(new Error(`ffmpeg not found or failed to start: ${err.message}`));
+    });
+    proc.on('close', (code) => {
+      if (code === 0) {
+        console.log(`[Audio] Converted to WAV successfully: ${outPath}`);
+        resolve();
+      } else {
+        console.error(`[Audio] Failed to convert ${ext} to WAV: ${audioPath}`);
+        reject(new Error(`ffmpeg exited with code ${code} converting "${audioPath}":\n${stderr.slice(-1000)}`));
+      }
+    });
+  });
+
+  return outPath;
+}
+
 // ---------------------------------------------------------------------------
 // Spawn mode: run these step.cpp binaries in a two-step pipeline
 //   Step 1: ace-qwen3  — LLM generates lyrics + audio codes from caption
@@ -640,9 +683,11 @@ async function runViaSpawn(
     const batchSize = Math.min(Math.max(params.batchSize ?? 1, 1), 8);
     if (batchSize > 1) ditArgs.push('--batch', String(batchSize));
 
-    // Cover and repaint modes both require a source audio file
+    // Cover and repaint modes both require a source audio file.
+    // dit-vae can only read WAV; convert MP3/FLAC/etc. to PCM WAV first.
     if (params.sourceAudioUrl) {
-      const srcAudioPath = resolveAudioPath(params.sourceAudioUrl);
+      const resolvedPath = resolveAudioPath(params.sourceAudioUrl);
+      const srcAudioPath = await ensureWavFormat(resolvedPath, tmpDir);
       ditArgs.push('--src-audio', srcAudioPath);
     }
     ditArgs.push(...parseExtraArgs(process.env.DIT_VAE_EXTRA_ARGS));

From 80becdadc95d0157df7c9a76afc49e75d1a9f0cf Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 8 Mar 2026 13:07:05 +0000
Subject: [PATCH 3/4] Fix AUDIO_DIR path mismatch: single source of truth for
 audio directory across storage, serving, and spawn

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 .env.example                         |  6 ++-
 backend/README.md                    | 57 ++++++++++++++++++++++++++++
 server/src/config/index.ts           |  9 +++--
 server/src/index.ts                  |  5 ++-
 server/src/services/acestep.ts       | 48 +----------------------
 server/src/services/storage/local.ts | 11 +++---
 6 files changed, 77 insertions(+), 59 deletions(-)

diff --git a/.env.example b/.env.example
index e76874a..f24a9ec 100644
--- a/.env.example
+++ b/.env.example
@@ -39,7 +39,11 @@ MODELS_DIR=./models
 # ACESTEP_CPP_BRANCH=main
 
 # ── Storage ───────────────────────────────────────────────────────────────────
-AUDIO_DIR=./public/audio
+# Audio directory for generated songs and uploaded reference tracks.
+# Relative paths are resolved from the project root (APP_ROOT).
+# This is the single source of truth: LocalStorageProvider (writes),
+# Express /audio/ endpoint (serves), and the spawn service (reads) all use it.
+AUDIO_DIR=./server/public/audio
 
 # ── Auth ──────────────────────────────────────────────────────────────────────
 # Change this to a long random string in any multi-user or network-exposed setup.
diff --git a/backend/README.md b/backend/README.md
index cfd75ea..cc8cb45 100644
--- a/backend/README.md
+++ b/backend/README.md
@@ -41,3 +41,60 @@ you can point `ACESTEP_API_URL` at it and leave `ACESTEP_BIN` empty.
 The Node.js service already has an HTTP-client fallback for exactly this case.
 
 See `server/src/services/acestep.ts` for the dual-mode implementation.
+
+---
+
+## Upstream C++ dependency: `audiohacking/acestep.cpp`
+
+This UI depends on the custom fork at
+[`audiohacking/acestep.cpp` branch `copilot/add-wav-mp3-conversion`](https://github.com/audiohacking/acestep.cpp/tree/copilot/add-wav-mp3-conversion)
+which adds native MP3 decoding to the `dit-vae` binary via `dr_mp3.h`.
+
+### Known issues in `src/audio.h` (pending upstream fix)
+
+**Bug: mono audio is not upmixed to stereo before encoding**
+
+`read_audio()` returns a native-channel-count buffer (`[T x n_channels]` floats),
+but `vae_enc_compute()` in `vae-enc.h` always reads two channels:
+
+```cpp
+// vae-enc.h (hardcodes stereo access — UB when n_channels == 1)
+for (int c = 0; c < 2; c++) {
+    for (int t = 0; t < T_audio; t++) {
+        m->scratch_in[c * T_audio + t] = audio[t * 2 + c];
+    }
+}
+```
+
+For stereo inputs (most user uploads) this works correctly.
+For mono inputs the second channel index reads out-of-bounds memory.
+
+**Required fix in `src/audio.h`** — always return interleaved stereo `[T x 2]`.
+Add this block after resampling completes (before the final `return out`):
+
+```c
+// Upmix mono -> stereo, or use first two channels of N-ch audio.
+if ((int) channels != 2) {
+    int    n_ch_src = (int) channels;
+    float *stereo   = (float *) malloc((size_t) T_raw * 2 * sizeof(float));
+    if (!stereo) {
+        fprintf(stderr, "[Audio] Out of memory converting to stereo\n");
+        free(out);
+        return NULL;
+    }
+    for (int t = 0; t < T_raw; t++) {
+        float L = out[(size_t) t * n_ch_src + 0];
+        float R = (n_ch_src > 1) ? out[(size_t) t * n_ch_src + 1] : L;
+        stereo[t * 2 + 0] = L;
+        stereo[t * 2 + 1] = R;
+    }
+    free(out);
+    out = stereo;
+    fprintf(stderr, "[Audio] Converted %dch -> stereo\n", n_ch_src);
+}
+*n_channels = 2;
+```
+
+**Also: replace `drwav_free(raw, NULL)` with `free(raw)`** in the resampling
+branch — both `dr_wav` and `dr_mp3` use the system allocator by default, so
+`free()` is always safe regardless of which decoder produced the buffer.
diff --git a/server/src/config/index.ts b/server/src/config/index.ts
index 14269e8..7abb41c 100644
--- a/server/src/config/index.ts
+++ b/server/src/config/index.ts
@@ -224,10 +224,11 @@ export const config = {
 
   storage: {
     provider: 'local' as const,
-    // Audio directory must match where LocalStorageProvider writes files and
-    // where Express serves /audio/ from (server/src/index.ts: '../public/audio').
-    // Both resolve to <server_root>/public/audio, so we use SERVER_ROOT here.
-    // AUDIO_DIR env override is still supported (resolved against APP_ROOT).
+    // Single source of truth for the audio directory.
+    // LocalStorageProvider, Express (/audio/), and the spawn service all read
+    // this value so they always point at the same filesystem location.
+    // Default: <server_root>/public/audio (SERVER_ROOT = server/).
+    // Override via AUDIO_DIR in .env (relative paths are resolved from APP_ROOT).
     audioDir: resolveFromRoot(process.env.AUDIO_DIR || path.join(SERVER_ROOT, 'public', 'audio')),
   },
 
diff --git a/server/src/index.ts b/server/src/index.ts
index f6bdbfb..e336f78 100644
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -79,8 +79,9 @@ app.use(cors({
 
 app.use(express.json());
 
-// Serve static audio files
-app.use('/audio', express.static(path.join(__dirname, '../public/audio')));
+// Serve static audio files from the configured audio directory so that any
+// AUDIO_DIR env override is honoured consistently across upload, spawn, and serving.
+app.use('/audio', express.static(config.storage.audioDir));
 
 // Audio Editor (AudioMass) - needs relaxed CSP for inline scripts and external images
 app.use('/editor', (req, res, next) => {
diff --git a/server/src/services/acestep.ts b/server/src/services/acestep.ts
index 2fd33af..1e40d8e 100644
--- a/server/src/services/acestep.ts
+++ b/server/src/services/acestep.ts
@@ -254,49 +254,6 @@ function resolveAudioPath(audioUrl: string): string {
   return audioUrl;
 }
 
-/**
- * Ensures the audio file at the given path is in PCM WAV format, which is
- * required by the dit-vae binary for the --src-audio argument.
- *
- * If the file is already a WAV it is returned as-is.  Any other format
- * (MP3, FLAC, M4A, AAC, …) is converted with ffmpeg and the resulting WAV
- * file is placed in tmpDir.  The converted path is returned.
- */
-async function ensureWavFormat(audioPath: string, tmpDir: string): Promise<string> {
-  const ext = path.extname(audioPath).toLowerCase();
-  if (ext === '.wav') return audioPath;
-
-  const outPath = path.join(tmpDir, 'src_audio_converted.wav');
-  console.log(`[Audio] Converting ${ext} → WAV: ${audioPath} → ${outPath}`);
-
-  await new Promise<void>((resolve, reject) => {
-    const proc = spawn('ffmpeg', [
-      '-y',              // overwrite output without asking
-      '-i', audioPath,   // input file (any format ffmpeg supports)
-      '-vn',             // drop any video stream
-      '-acodec', 'pcm_s16le',  // 16-bit PCM — universally readable WAV
-      outPath,
-    ], { stdio: 'pipe' });
-
-    let stderr = '';
-    proc.stderr?.on('data', (chunk: Buffer) => { stderr += chunk.toString(); });
-    proc.on('error', (err) => {
-      reject(new Error(`ffmpeg not found or failed to start: ${err.message}`));
-    });
-    proc.on('close', (code) => {
-      if (code === 0) {
-        console.log(`[Audio] Converted to WAV successfully: ${outPath}`);
-        resolve();
-      } else {
-        console.error(`[Audio] Failed to convert ${ext} to WAV: ${audioPath}`);
-        reject(new Error(`ffmpeg exited with code ${code} converting "${audioPath}":\n${stderr.slice(-1000)}`));
-      }
-    });
-  });
-
-  return outPath;
-}
-
 // ---------------------------------------------------------------------------
 // Spawn mode: run these step.cpp binaries in a two-step pipeline
 //   Step 1: ace-qwen3  — LLM generates lyrics + audio codes from caption
@@ -684,10 +641,9 @@ async function runViaSpawn(
     if (batchSize > 1) ditArgs.push('--batch', String(batchSize));
 
     // Cover and repaint modes both require a source audio file.
-    // dit-vae can only read WAV; convert MP3/FLAC/etc. to PCM WAV first.
+    // dit-vae decodes WAV/MP3 natively (dr_wav / dr_mp3 in audio.h); other formats are not converted.
     if (params.sourceAudioUrl) {
-      const resolvedPath = resolveAudioPath(params.sourceAudioUrl);
-      const srcAudioPath = await ensureWavFormat(resolvedPath, tmpDir);
+      const srcAudioPath = resolveAudioPath(params.sourceAudioUrl);
       ditArgs.push('--src-audio', srcAudioPath);
     }
     ditArgs.push(...parseExtraArgs(process.env.DIT_VAE_EXTRA_ARGS));
diff --git a/server/src/services/storage/local.ts b/server/src/services/storage/local.ts
index 903582c..94f5979 100644
--- a/server/src/services/storage/local.ts
+++ b/server/src/services/storage/local.ts
@@ -1,17 +1,16 @@
 import { writeFile, unlink, stat, mkdir, copyFile } from 'fs/promises';
 import path from 'path';
-import { fileURLToPath } from 'url';
 import type { StorageProvider } from './index.js';
-
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = path.dirname(__filename);
-const AUDIO_DIR = path.join(__dirname, '../../../public/audio');
+import { config } from '../../config/index.js';
 
 export class LocalStorageProvider implements StorageProvider {
   private audioDir: string;
 
   constructor() {
-    this.audioDir = AUDIO_DIR;
+    // Derive the audio directory from the central config so that the storage
+    // provider always writes to the same location the spawn service resolves
+    // paths from (config.storage.audioDir, which honours the AUDIO_DIR env var).
+    this.audioDir = config.storage.audioDir;
   }
 
   async upload(key: string, data: Buffer, _contentType: string): Promise<string> {

From c2230f20ee7154104c2841e3585f6de3b10e7327 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Sun, 8 Mar 2026 14:12:28 +0100
Subject: [PATCH 4/4] Delete backend/README.md

---
 backend/README.md | 100 ----------------------------------------------
 1 file changed, 100 deletions(-)
 delete mode 100644 backend/README.md

diff --git a/backend/README.md b/backend/README.md
deleted file mode 100644
index cc8cb45..0000000
--- a/backend/README.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# backend/ — design note
-
-## Why there is no custom C++ HTTP server here
-
-An earlier design wrapped `acestep-generate` in a second C++ HTTP server process.
-That was removed because it added unnecessary complexity:
-
-| Problem | Impact |
-|---------|--------|
-| Two processes to manage (Node.js + C++ server) | harder to deploy, restart, monitor |
-| C++ server used `popen()` with shell-built strings | fragile, platform-specific, injection surface |
-| LoRA state split across two processes | race conditions, stale cache |
-| Extra HTTP hop for every generation request | added latency and error surface |
-| Users need to build *two* C++ projects | poor DX |
-
-## Current architecture
-
-```
-Browser
-  │
-  │ HTTP
-  ▼
-Node.js Express (port 3001)
-  │  handles: auth, songs DB, playlists, audio storage, job queue
-  │
-  │ child_process.spawn(bin, args, { shell: false })
-  ▼
-acestep-generate  ←── GGUF model on GPU/CPU
-  │
-  └─► writes audio files → ./public/audio/
-```
-
-The Node.js server reads `ACESTEP_BIN` from `.env` and spawns `acestep-generate`
-directly — the same pattern used by llama.cpp, whisper.cpp, and similar tools.
-No shell is involved, so there is no injection risk.
-
-## When a separate HTTP server *would* make sense
-
-If `acestep.cpp` ever ships a **built-in** HTTP server mode (like `llama-server`),
-you can point `ACESTEP_API_URL` at it and leave `ACESTEP_BIN` empty.
-The Node.js service already has an HTTP-client fallback for exactly this case.
-
-See `server/src/services/acestep.ts` for the dual-mode implementation.
-
----
-
-## Upstream C++ dependency: `audiohacking/acestep.cpp`
-
-This UI depends on the custom fork at
-[`audiohacking/acestep.cpp` branch `copilot/add-wav-mp3-conversion`](https://github.com/audiohacking/acestep.cpp/tree/copilot/add-wav-mp3-conversion)
-which adds native MP3 decoding to the `dit-vae` binary via `dr_mp3.h`.
-
-### Known issues in `src/audio.h` (pending upstream fix)
-
-**Bug: mono audio is not upmixed to stereo before encoding**
-
-`read_audio()` returns a native-channel-count buffer (`[T x n_channels]` floats),
-but `vae_enc_compute()` in `vae-enc.h` always reads two channels:
-
-```cpp
-// vae-enc.h (hardcodes stereo access — UB when n_channels == 1)
-for (int c = 0; c < 2; c++) {
-    for (int t = 0; t < T_audio; t++) {
-        m->scratch_in[c * T_audio + t] = audio[t * 2 + c];
-    }
-}
-```
-
-For stereo inputs (most user uploads) this works correctly.
-For mono inputs the second channel index reads out-of-bounds memory.
-
-**Required fix in `src/audio.h`** — always return interleaved stereo `[T x 2]`.
-Add this block after resampling completes (before the final `return out`):
-
-```c
-// Upmix mono -> stereo, or use first two channels of N-ch audio.
-if ((int) channels != 2) {
-    int    n_ch_src = (int) channels;
-    float *stereo   = (float *) malloc((size_t) T_raw * 2 * sizeof(float));
-    if (!stereo) {
-        fprintf(stderr, "[Audio] Out of memory converting to stereo\n");
-        free(out);
-        return NULL;
-    }
-    for (int t = 0; t < T_raw; t++) {
-        float L = out[(size_t) t * n_ch_src + 0];
-        float R = (n_ch_src > 1) ? out[(size_t) t * n_ch_src + 1] : L;
-        stereo[t * 2 + 0] = L;
-        stereo[t * 2 + 1] = R;
-    }
-    free(out);
-    out = stereo;
-    fprintf(stderr, "[Audio] Converted %dch -> stereo\n", n_ch_src);
-}
-*n_channels = 2;
-```
-
-**Also: replace `drwav_free(raw, NULL)` with `free(raw)`** in the resampling
-branch — both `dr_wav` and `dr_mp3` use the system allocator by default, so
-`free()` is always safe regardless of which decoder produced the buffer.