Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion server/src/config/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ const __dirname = path.dirname(__filename);
// release: <bundle>/server/dist/config/ → ../../.. → <bundle>/
const APP_ROOT = path.resolve(__dirname, '../../..');

// Server root = two levels above this file (always the server/ directory):
// dev: <repo>/server/src/config/ → ../.. → <repo>/server/
// release: <bundle>/server/dist/config/ → ../.. → <bundle>/server/
const SERVER_ROOT = path.resolve(__dirname, '../..');

// ── Path helpers ─────────────────────────────────────────────────────────────

/**
Expand Down Expand Up @@ -219,7 +224,11 @@ export const config = {

storage: {
provider: 'local' as const,
audioDir: resolveFromRoot(process.env.AUDIO_DIR || path.join(APP_ROOT, 'public', 'audio')),
// Audio directory must match where LocalStorageProvider writes files and
// where Express serves /audio/ from (server/src/index.ts: '../public/audio').
// Both resolve to <server_root>/public/audio, so we use SERVER_ROOT here.
// AUDIO_DIR env override is still supported (resolved against APP_ROOT).
audioDir: resolveFromRoot(process.env.AUDIO_DIR || path.join(SERVER_ROOT, 'public', 'audio')),
},

jwt: {
Expand Down
18 changes: 17 additions & 1 deletion server/src/routes/generate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,27 @@ router.post('/', authMiddleware, async (req: AuthenticatedRequest, res: Response
return;
}

if (customMode && !style && !lyrics && !referenceAudioUrl) {
// In custom mode, at least one content field is required — unless the request
// is for cover, audio2audio, or repaint mode and a source audio is provided
// (the source audio itself is the primary input; style/lyrics are optional).
const requiresSourceAudio = taskType === 'cover' || taskType === 'audio2audio' || taskType === 'repaint';
if (customMode && !style && !lyrics && !referenceAudioUrl && !(requiresSourceAudio && sourceAudioUrl)) {
res.status(400).json({ error: 'Style, lyrics, or reference audio required for custom mode' });
return;
}

// Debug log: show what the API client sent
console.log(
`[API] POST /generate:` +
`\n taskType = ${taskType || 'text2music'}` +
`\n customMode = ${customMode}` +
`\n ditModel = ${ditModel || '(default)'}` +
`\n sourceAudio = ${sourceAudioUrl || 'none'}` +
`\n repaint = [${repaintingStart ?? 'start'}, ${repaintingEnd ?? 'end'}]` +
`\n coverStr = ${audioCoverStrength ?? 'n/a'}` +
`\n user = ${req.user!.id}`
);

const params = {
customMode,
songDescription,
Expand Down
191 changes: 140 additions & 51 deletions server/src/services/acestep.ts
Original file line number Diff line number Diff line change
Expand Up @@ -220,18 +220,37 @@ function resolveParamDitModel(name: string | undefined): string {
// Audio path resolution (for reference/source audio inputs)
// ---------------------------------------------------------------------------

/**
* Resolves a UI audio URL (e.g. "/audio/reference-tracks/user/file.mp3") or
* an absolute filesystem path to the local filesystem path that the spawned
* binary can open.
*
* Supported input formats:
* • "/audio/<rest>" — relative public URL; joined with AUDIO_DIR
* (covers reference-tracks/, generated songs, etc.)
* • "http[s]://host/audio/…" — absolute URL whose path starts with /audio/
* • Any other absolute path — returned as-is
*/
function resolveAudioPath(audioUrl: string): string {
// Relative public URL produced by the UI player or upload endpoint
if (audioUrl.startsWith('/audio/')) {
return path.join(AUDIO_DIR, audioUrl.replace('/audio/', ''));
const resolved = path.join(AUDIO_DIR, audioUrl.slice('/audio/'.length));
console.log(`[resolveAudio] ${audioUrl} → ${resolved}`);
return resolved;
}
if (audioUrl.startsWith('http')) {
  // Full HTTP URL — extract the path component and resolve it the same way
if (audioUrl.startsWith('http://') || audioUrl.startsWith('https://')) {
try {
const parsed = new URL(audioUrl);
if (parsed.pathname.startsWith('/audio/')) {
return path.join(AUDIO_DIR, parsed.pathname.replace('/audio/', ''));
const resolved = path.join(AUDIO_DIR, parsed.pathname.slice('/audio/'.length));
console.log(`[resolveAudio] ${audioUrl} → ${resolved}`);
return resolved;
}
} catch { /* fall through */ }
}
  // Anything else — assumed to be an absolute filesystem path; pass through unchanged
console.log(`[resolveAudio] ${audioUrl} → (absolute path, no change)`);
return audioUrl;
}

Expand Down Expand Up @@ -446,61 +465,105 @@ async function runViaSpawn(
const tmpDir = path.join(AUDIO_DIR, `_tmp_${jobId}`);
await mkdir(tmpDir, { recursive: true });

// ── Determine generation mode ────────────────────────────────────────────
// Explicit task type drives mode selection; source audio / audio codes act
// as secondary signals for backward compatibility.
const taskType = params.taskType || 'text2music';
const isCover = taskType === 'cover' || taskType === 'audio2audio';
const isRepaint = taskType === 'repaint';
// Passthrough: taskType explicitly set, or audio codes provided without
// a source audio file (legacy callers that omit the taskType field).
const isPassthru = taskType === 'passthrough' || Boolean(params.audioCodes && !params.sourceAudioUrl);
// LLM (ace-qwen3) is only needed for plain text-to-music generation.
// Cover, repaint, and passthrough all skip it.
const skipLm = isCover || isRepaint || isPassthru;

// ── Debug: log what the UI/API client requested ──────────────────────────
console.log(
`[Job ${jobId}] Request received:` +
`\n mode = ${taskType}` +
`\n customMode = ${params.customMode}` +
`\n ditModel = ${params.ditModel || '(default)'}` +
`\n sourceAudio = ${params.sourceAudioUrl || 'none'}` +
`\n repaintRegion = [${params.repaintingStart ?? 'start'}, ${params.repaintingEnd ?? 'end'}]` +
`\n coverStrength = ${params.audioCoverStrength ?? 'n/a'}` +
`\n steps = ${params.inferenceSteps ?? 8}` +
`\n guidance = ${params.guidanceScale ?? 0.0}` +
`\n shift = ${params.shift ?? 3.0}` +
`\n skipLm = ${skipLm}`
);

try {
// ── Build request.json ─────────────────────────────────────────────────
// ace-qwen3 reads generation parameters from a JSON file. Only `caption`
// is strictly required; all other fields default to sensible values.
// The JSON file is read by ace-qwen3 (text2music) or dit-vae directly
// (cover / repaint / passthrough). Only include the fields each binary
// actually understands so the format stays clean and predictable.
const caption = params.style || 'pop music';
const prompt = params.customMode ? caption : (params.songDescription || caption);
// Instrumental: pass the special "[Instrumental]" lyrics string so the LLM
// Instrumental: pass the special "[Instrumental]" lyrics marker so the LLM
// skips lyrics generation (as documented in the acestep.cpp README).
const lyrics = params.instrumental ? '[Instrumental]' : (params.lyrics || '');

// Fields common to all modes (understood by both ace-qwen3 and dit-vae)
const requestJson: Record<string, unknown> = {
caption: prompt,
caption: prompt,
lyrics,
vocal_language: params.vocalLanguage || 'unknown',
seed: params.randomSeed !== false ? -1 : (params.seed ?? -1),
lm_temperature: params.lmTemperature ?? 0.85,
lm_cfg_scale: params.lmCfgScale ?? 2.0,
lm_top_p: params.lmTopP ?? 0.9,
lm_top_k: params.lmTopK ?? 0,
lm_negative_prompt: params.lmNegativePrompt || '',
inference_steps: params.inferenceSteps ?? 8,
guidance_scale: params.guidanceScale ?? 0.0,
shift: params.shift ?? 3.0,
seed: params.randomSeed !== false ? -1 : (params.seed ?? -1),
inference_steps: params.inferenceSteps ?? 8,
guidance_scale: params.guidanceScale ?? 0.0,
shift: params.shift ?? 3.0,
};
// Optional metadata (0 / empty = let the LLM fill it)
if (params.bpm && params.bpm > 0) requestJson.bpm = params.bpm;
if (params.duration && params.duration > 0) requestJson.duration = params.duration;
if (params.keyScale) requestJson.keyscale = params.keyScale;
if (params.timeSignature) requestJson.timesignature = params.timeSignature;
// Passthrough: skip the LLM when audio codes are already provided
if (params.audioCodes) requestJson.audio_codes = params.audioCodes;
// Cover/audio-to-audio: strength of the source audio influence on the output
// (ignored in repaint mode — the mask handles everything)
if (params.audioCoverStrength !== undefined && params.taskType !== 'repaint') {
requestJson.audio_cover_strength = params.audioCoverStrength;
}
// Repaint mode: regenerate a time region while preserving the rest.
// Activated by setting repainting_start and/or repainting_end in the JSON.
// Both default to -1 (inactive): -1 on start means 0s, -1 on end means source duration.
if (params.taskType === 'repaint' && params.sourceAudioUrl) {
requestJson.repainting_start = params.repaintingStart ?? -1;
requestJson.repainting_end = params.repaintingEnd ?? -1;

// Optional music metadata (0 / empty → binary fills it in)
if (params.bpm && params.bpm > 0) requestJson.bpm = params.bpm;
if (params.duration && params.duration > 0) requestJson.duration = params.duration;
if (params.keyScale) requestJson.keyscale = params.keyScale;
if (params.timeSignature) requestJson.timesignature = params.timeSignature;

if (skipLm) {
// ── Cover / repaint / passthrough: ace-qwen3 is skipped ─────────────
// Add only the mode-specific fields that dit-vae cares about.
if (isPassthru) {
if (!params.audioCodes) {
// Passthrough requires pre-computed codes — fail early with a clear message
throw new Error("task_type='passthrough' requires pre-computed audio_codes");
}
requestJson.audio_codes = params.audioCodes;
} else if (isCover) {
// Cover / audio-to-audio: strength of the source audio influence (0–1)
if (params.audioCoverStrength !== undefined) {
requestJson.audio_cover_strength = params.audioCoverStrength;
}
} else if (isRepaint) {
// Repaint: regenerate only the specified time region; preserve the rest.
// Both default to -1: start=-1 → 0 s, end=-1 → full source duration.
// Note: sourceAudioUrl is guaranteed here — validated in processGeneration.
requestJson.repainting_start = params.repaintingStart ?? -1;
requestJson.repainting_end = params.repaintingEnd ?? -1;
}
} else {
// ── Text-to-music: include LM parameters for ace-qwen3 ──────────────
requestJson.vocal_language = params.vocalLanguage || 'unknown';
requestJson.lm_temperature = params.lmTemperature ?? 0.85;
requestJson.lm_cfg_scale = params.lmCfgScale ?? 2.0;
requestJson.lm_top_p = params.lmTopP ?? 0.9;
requestJson.lm_top_k = params.lmTopK ?? 0;
requestJson.lm_negative_prompt = params.lmNegativePrompt || '';
}

const requestPath = path.join(tmpDir, 'request.json');
await writeFile(requestPath, JSON.stringify(requestJson, null, 2));
console.log(`[Job ${jobId}] Request JSON written to ${requestPath}:`);
console.log(JSON.stringify(requestJson, null, 2));

// ── Step 1: ace-qwen3 — LLM (lyrics + audio codes) ────────────────────
// Skipped when:
// • audio_codes are provided (passthrough) — codes are already known
// • sourceAudioUrl is provided (cover/audio-to-audio) — dit-vae derives
// codes directly from the source audio; running ace-qwen3 is not needed
// • taskType is cover / audio2audio / repaint — dit-vae derives tokens
// directly from the source audio; running ace-qwen3 is not needed
// • taskType is passthrough — audio codes are already provided
let enrichedPaths: string[] = [];

if (!params.audioCodes && !params.sourceAudioUrl) {
if (!skipLm) {
job.stage = 'LLM: generating lyrics and audio codes…';

const lmBin = config.acestep.lmBin!;
Expand All @@ -513,7 +576,7 @@ async function runViaSpawn(
if (batchSize > 1) lmArgs.push('--batch', String(batchSize));
lmArgs.push(...parseExtraArgs(process.env.ACE_QWEN3_EXTRA_ARGS));

console.log(`[Spawn] Job ${jobId}: ace-qwen3 ${lmArgs.slice(0, 6).join(' ')}`);
console.log(`[Job ${jobId}] Running ace-qwen3:\n ${lmBin} ${lmArgs.join(' ')}`);
await runBinary(lmBin, lmArgs, 'ace-qwen3', undefined, makeLmProgressHandler(job));

// Collect enriched JSON files produced by ace-qwen3:
Expand All @@ -528,24 +591,33 @@ async function runViaSpawn(
if (enrichedPaths.length === 0) {
throw new Error('ace-qwen3 produced no enriched request files');
}
console.log(`[Job ${jobId}] ace-qwen3 produced ${enrichedPaths.length} enriched file(s): ${enrichedPaths.join(', ')}`);
} else {
// Passthrough: use the original request.json directly
// (audio codes provided, or source audio supplied for cover/audio-to-audio mode)
// Cover / repaint / passthrough: pass the original request.json directly
// to dit-vae; no LLM enrichment step needed.
enrichedPaths = [requestPath];
console.log(`[Job ${jobId}] LLM step skipped (mode=${taskType}); passing request.json directly to dit-vae`);
}

// ── Step 2: dit-vae — DiT + VAE (audio synthesis) ──────────────────────
job.stage = 'DiT+VAE: synthesising audio…';

const ditVaeBin = config.acestep.ditVaeBin!;
const textEncoderModel = config.acestep.textEncoderModel;
const ditModel = resolveParamDitModel(params.ditModel);
const vaeModel = config.acestep.vaeModel;
const ditVaeBin = config.acestep.ditVaeBin!;
const textEncoderModel = config.acestep.textEncoderModel;
const ditModel = resolveParamDitModel(params.ditModel);
const vaeModel = config.acestep.vaeModel;

if (!textEncoderModel) throw new Error('Text-encoder model not found — run models.sh first');
if (!ditModel) throw new Error('DiT model not found — run models.sh first');
if (!vaeModel) throw new Error('VAE model not found — run models.sh first');

console.log(
`[Job ${jobId}] Resolved model paths:` +
`\n text-encoder = ${textEncoderModel}` +
`\n dit = ${ditModel}` +
`\n vae = ${vaeModel}`
);

const ditArgs: string[] = [
'--request', ...enrichedPaths,
'--text-encoder', textEncoderModel,
Expand All @@ -556,10 +628,14 @@ async function runViaSpawn(
const batchSize = Math.min(Math.max(params.batchSize ?? 1, 1), 8);
if (batchSize > 1) ditArgs.push('--batch', String(batchSize));

if (params.sourceAudioUrl) ditArgs.push('--src-audio', resolveAudioPath(params.sourceAudioUrl));
// Cover and repaint modes both require a source audio file
if (params.sourceAudioUrl) {
const srcAudioPath = resolveAudioPath(params.sourceAudioUrl);
ditArgs.push('--src-audio', srcAudioPath);
}
ditArgs.push(...parseExtraArgs(process.env.DIT_VAE_EXTRA_ARGS));

console.log(`[Spawn] Job ${jobId}: dit-vae ${ditArgs.slice(0, 6).join(' ')}`);
console.log(`[Job ${jobId}] Running dit-vae:\n ${ditVaeBin} ${ditArgs.join(' ')}`);
await runBinary(ditVaeBin, ditArgs, 'dit-vae', undefined, makeDitVaeProgressHandler(job));

// ── Collect generated WAV files ─────────────────────────────────────────
Expand Down Expand Up @@ -608,7 +684,7 @@ async function runViaSpawn(
status: 'succeeded',
};
job.rawResponse = enrichedMeta;
console.log(`[Spawn] Job ${jobId}: completed with ${audioUrls.length} audio file(s)`);
console.log(`[Job ${jobId}] Completed successfully with ${audioUrls.length} audio file(s): ${audioUrls.join(', ')}`);

// Clean up tmp directory
await rm(tmpDir, { recursive: true, force: true }).catch(() => { /* best-effort */ });
Expand Down Expand Up @@ -843,16 +919,28 @@ async function processGeneration(
job.status = 'running';
job.stage = 'Starting generation...';

const mode = useSpawnMode(params) ? 'spawn' : 'http';
console.log(
`[Job ${jobId}] Starting generation (${mode} mode):` +
`\n taskType = ${params.taskType || 'text2music'}` +
`\n customMode = ${params.customMode}` +
`\n ditModel = ${params.ditModel || '(default)'}` +
`\n sourceAudio = ${params.sourceAudioUrl || 'none'}` +
`\n audioCodes = ${params.audioCodes ? '[provided]' : 'none'}`
);

if ((params.taskType === 'cover' || params.taskType === 'audio2audio') &&
!params.sourceAudioUrl && !params.audioCodes) {
job.status = 'failed';
job.error = `task_type='${params.taskType}' requires a source audio or audio codes`;
console.error(`[Job ${jobId}] Validation failed: ${job.error}`);
return;
}

if (params.taskType === 'repaint' && !params.sourceAudioUrl) {
job.status = 'failed';
job.error = "task_type='repaint' requires a source audio (--src-audio)";
console.error(`[Job ${jobId}] Validation failed: ${job.error}`);
return;
}

Expand All @@ -864,9 +952,10 @@ async function processGeneration(
await runViaHttp(jobId, params, job);
}
} catch (err) {
console.error(`Job ${jobId} failed:`, err);
const errMsg = err instanceof Error ? err.message : String(err);
console.error(`[Job ${jobId}] Generation failed: ${errMsg}`);
job.status = 'failed';
job.error = err instanceof Error ? err.message : 'Generation failed';
job.error = errMsg || 'Generation failed';
}
}

Expand Down
Loading