From 62857efea2928fb38349181f45a2713e33c4758d Mon Sep 17 00:00:00 2001 From: Pascal Date: Sat, 7 Mar 2026 18:29:06 +0100 Subject: [PATCH 1/3] cleaning --- src/request.h | 1 - tests/request0.json | 1 - 2 files changed, 2 deletions(-) diff --git a/src/request.h b/src/request.h index 0834c7f..67a3339 100644 --- a/src/request.h +++ b/src/request.h @@ -3,7 +3,6 @@ // request.h - AceStep generation request (JSON serialization) // // Pure data container + JSON read/write. Zero business logic. -// Aligned with Python GenerationParams (inference.py:39) and API /release_task. // #include diff --git a/tests/request0.json b/tests/request0.json index 114f7cc..4199989 100644 --- a/tests/request0.json +++ b/tests/request0.json @@ -6,7 +6,6 @@ "keyscale": "G major", "timesignature": "4", "vocal_language": "fr", - "task_type": "text2music", "seed": 158961132, "thinking": true, "lm_temperature": 0.85, From 409026358f2c749608f291b9e0a4768ea4a92e4b Mon Sep 17 00:00:00 2001 From: Pascal Date: Sat, 7 Mar 2026 19:34:17 +0100 Subject: [PATCH 2/3] repaint: selective region regeneration via --src-audio + repainting_start/end Add repaint mode to dit-vae: regenerate a time region of source audio while preserving the rest. Activated by setting repainting_start and/or repainting_end in the request JSON (both default to -1 = inactive). When active, the DiT receives a binary chunk mask: - Inside region: mask=1.0, src=silence (generate new content) - Outside region: mask=0.0, src=cover_latents (preserve original) Instruction switches to "Repaint the mask area based on the given conditions:" Semantics: - Both -1: inactive (plain cover) - start=-1: means 0s (beginning) - end=-1: means source duration (end) - end <= start after resolve: hard error - repaint params without --src-audio: hard error No task_type field needed. Two floats activate the mode. 
Also add [Component] prefix to all bare ERROR/FATAL messages across tools/ and src/ to match existing convention ([VAE], [Load], [GGUF]) --- README.md | 39 +++++++++++++++- src/bpe.h | 4 +- src/dit-sampler.h | 2 +- src/request.cpp | 11 +++++ src/request.h | 6 +++ tools/ace-qwen3.cpp | 6 +-- tools/dit-vae.cpp | 110 +++++++++++++++++++++++++++++++++----------- 7 files changed, 143 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 0fc55b2..6e27447 100644 --- a/README.md +++ b/README.md @@ -206,6 +206,32 @@ is VAE-encoded to latent space and used as DiT context instead of silence. steers the style while the source provides structure, melody, and rhythm. Duration is determined by the source audio. +**Repaint** (`--src-audio` + `repainting_start`/`repainting_end` in JSON): +regenerates a time region of the source audio while preserving the rest. +The DiT receives a binary mask: 1.0 inside the region (generate), 0.0 outside +(keep original). Source latents outside the region provide context; silence +fills the repaint zone. Both fields default to -1 (inactive). Set one or both +to activate: -1 on start means 0s, -1 on end means source duration. +`audio_cover_strength` is ignored in repaint mode (the mask handles everything). + +```bash +cat > /tmp/repaint.json << 'EOF' +{ + "caption": "Smooth jazz guitar solo with reverb", + "lyrics": "[Instrumental]", + "repainting_start": 10.0, + "repainting_end": 25.0 +} +EOF + +./build/dit-vae \ + --src-audio song.wav \ + --request /tmp/repaint.json \ + --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit models/acestep-v15-turbo-Q8_0.gguf \ + --vae models/vae-BF16.gguf +``` + ## Request JSON reference Only `caption` is required. All other fields default to "unset" which means @@ -230,7 +256,9 @@ the LLM fills them, or a sensible runtime default is applied. 
"inference_steps": 8, "guidance_scale": 0.0, "shift": 3.0, - "audio_cover_strength": 0.5 + "audio_cover_strength": 0.5, + "repainting_start": -1, + "repainting_end": -1 } ``` @@ -289,6 +317,13 @@ half of the steps are guided by the source structure, the second half are free to follow the caption. Lower values give more creative freedom, higher values preserve more of the original. +**`repainting_start`** (float seconds, default `-1` = inactive) +**`repainting_end`** (float seconds, default `-1` = inactive) +Requires `--src-audio` (dit-vae exits with an error if either field is set without it). When one or both are >= 0, repaint mode activates: +the DiT regenerates the `[start, end)` time region while preserving everything +else. `-1` on start means 0s (beginning), `-1` on end means source duration +(end); both values are clamped to the source duration. It is a hard error if end <= start after resolution. `audio_cover_strength` is ignored. + ### LM sampling (ace-qwen3) **`lm_temperature`** (float, default `0.85`) @@ -487,7 +522,7 @@ No cloud, no black box, scriptable and nothing between you and the model. ### dit-vae - [x] Reference audio input: `--src-audio` + `audio_cover_strength` -- [ ] Repaint: selective region regeneration (repainting_start/end) +- [x] Repaint: selective region regeneration (repainting_start/end) ### Audio I/O The binaries read and write 48kHz stereo 16-bit PCM WAV. 
No codec library, diff --git a/src/bpe.h b/src/bpe.h index d7eeb4f..2a4e6d5 100644 --- a/src/bpe.h +++ b/src/bpe.h @@ -540,11 +540,11 @@ static bool load_bpe_tokenizer(BPETokenizer * tok, const char * dir) { std::string merges_path = std::string(dir) + "/merges.txt"; if (!load_vocab_json(vocab_path, tok->vocab)) { - fprintf(stderr, "ERROR: failed to load %s\n", vocab_path.c_str()); + fprintf(stderr, "[BPE] ERROR: failed to load %s\n", vocab_path.c_str()); return false; } if (!load_merges(merges_path, tok->merges)) { - fprintf(stderr, "ERROR: failed to load %s\n", merges_path.c_str()); + fprintf(stderr, "[BPE] ERROR: failed to load %s\n", merges_path.c_str()); return false; } diff --git a/src/dit-sampler.h b/src/dit-sampler.h index a22ad10..fee770d 100644 --- a/src/dit-sampler.h +++ b/src/dit-sampler.h @@ -188,7 +188,7 @@ static void dit_ggml_generate(DiTGGML * model, } } if (!ggml_backend_sched_alloc_graph(model->sched, gf)) { - fprintf(stderr, "FATAL: failed to allocate graph\n"); + fprintf(stderr, "[DiT] FATAL: failed to allocate graph\n"); ggml_free(ctx); return; } diff --git a/src/request.cpp b/src/request.cpp index 725d0ca..65a7bbe 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -31,6 +31,8 @@ void request_init(AceRequest * r) { r->guidance_scale = 0.0f; r->shift = 3.0f; r->audio_cover_strength = 0.5f; + r->repainting_start = -1.0f; + r->repainting_end = -1.0f; } // JSON string escape / unescape @@ -312,6 +314,10 @@ bool request_parse(AceRequest * r, const char * path) { r->shift = (float) atof(v.c_str()); } else if (k == "audio_cover_strength") { r->audio_cover_strength = (float) atof(v.c_str()); + } else if (k == "repainting_start") { + r->repainting_start = (float) atof(v.c_str()); + } else if (k == "repainting_end") { + r->repainting_end = (float) atof(v.c_str()); } } @@ -344,6 +350,8 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"guidance_scale\": %.1f,\n", r->guidance_scale); fprintf(f, " \"shift\": %.1f,\n", 
r->shift); fprintf(f, " \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength); + fprintf(f, " \"repainting_start\": %.1f,\n", r->repainting_start); + fprintf(f, " \"repainting_end\": %.1f,\n", r->repainting_end); // audio_codes last (no trailing comma) fprintf(f, " \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str()); fprintf(f, "}\n"); @@ -365,5 +373,8 @@ void request_dump(const AceRequest * r, FILE * f) { if (r->audio_cover_strength != 0.5f) { fprintf(f, " cover: strength=%.2f\n", r->audio_cover_strength); } + if (r->repainting_start >= 0.0f || r->repainting_end >= 0.0f) { + fprintf(f, " repaint: start=%.1f end=%.1f\n", r->repainting_start, r->repainting_end); + } fprintf(f, " audio_codes: %s\n", r->audio_codes.empty() ? "(none)" : "(present)"); } diff --git a/src/request.h b/src/request.h index 67a3339..fe83945 100644 --- a/src/request.h +++ b/src/request.h @@ -42,6 +42,12 @@ struct AceRequest { // cover mode (active when --src-audio is provided on CLI) float audio_cover_strength; // 0.5 (0-1, fraction of DiT steps using source context) + + // repaint mode (requires --src-audio) + // Both -1 = no repaint (plain cover). One or both >= 0 activates repaint. + // -1 on start means 0s, -1 on end means source duration. 
+ float repainting_start; // -1 + float repainting_end; // -1 }; // Initialize all fields to defaults (matches Python GenerationParams defaults) diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp index dbba8c6..82226f0 100644 --- a/tools/ace-qwen3.cpp +++ b/tools/ace-qwen3.cpp @@ -692,12 +692,12 @@ int main(int argc, char ** argv) { } if (!model_path) { - fprintf(stderr, "ERROR: --model required\n"); + fprintf(stderr, "[CLI] ERROR: --model required\n"); usage(argv[0]); return 1; } if (!request_path) { - fprintf(stderr, "ERROR: --request required\n"); + fprintf(stderr, "[CLI] ERROR: --request required\n"); usage(argv[0]); return 1; } @@ -710,7 +710,7 @@ int main(int argc, char ** argv) { request_dump(&req, stderr); if (req.caption.empty()) { - fprintf(stderr, "ERROR: caption is empty in %s\n", request_path); + fprintf(stderr, "[Request] ERROR: caption is empty in %s\n", request_path); return 1; } diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 3347dc5..f1ff29e 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -118,21 +118,21 @@ int main(int argc, char ** argv) { } if (request_paths.empty()) { - fprintf(stderr, "ERROR: --request required\n"); + fprintf(stderr, "[CLI] ERROR: --request required\n"); print_usage(argv[0]); return 1; } if (batch_n < 1 || batch_n > 9) { - fprintf(stderr, "ERROR: --batch must be 1..9\n"); + fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n"); return 1; } if (!dit_gguf) { - fprintf(stderr, "ERROR: --dit required\n"); + fprintf(stderr, "[CLI] ERROR: --dit required\n"); print_usage(argv[0]); return 1; } if (!text_enc_gguf) { - fprintf(stderr, "ERROR: --text-encoder required\n"); + fprintf(stderr, "[CLI] ERROR: --text-encoder required\n"); print_usage(argv[0]); return 1; } @@ -155,7 +155,7 @@ int main(int argc, char ** argv) { timer.reset(); if (!dit_ggml_load(&model, dit_gguf, cfg)) { - fprintf(stderr, "FATAL: failed to load DiT model\n"); + fprintf(stderr, "[DiT] FATAL: failed to load model\n"); return 1; } 
fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", timer.ms()); @@ -173,14 +173,14 @@ int main(int argc, char ** argv) { memcpy(silence_full.data(), sl_data, 15000 * 64 * sizeof(float)); fprintf(stderr, "[Load] silence_latent: [15000, 64] from GGUF\n"); } else { - fprintf(stderr, "FATAL: silence_latent tensor not found in %s\n", dit_gguf); + fprintf(stderr, "[DiT] FATAL: silence_latent tensor not found in %s\n", dit_gguf); gf_close(&gf); dit_ggml_free(&model); return 1; } gf_close(&gf); } else { - fprintf(stderr, "FATAL: cannot reopen %s for metadata\n", dit_gguf); + fprintf(stderr, "[DiT] FATAL: cannot reopen %s for metadata\n", dit_gguf); dit_ggml_free(&model); return 1; } @@ -206,14 +206,14 @@ int main(int argc, char ** argv) { int T_cover = 0; if (src_audio_path) { if (!vae_gguf) { - fprintf(stderr, "ERROR: --src-audio requires --vae\n"); + fprintf(stderr, "[Cover] ERROR: --src-audio requires --vae\n"); return 1; } timer.reset(); int T_audio = 0, wav_sr = 0; float * wav_data = read_wav(src_audio_path, &T_audio, &wav_sr); if (!wav_data) { - fprintf(stderr, "FATAL: cannot read --src-audio %s\n", src_audio_path); + fprintf(stderr, "[Cover] FATAL: cannot read --src-audio %s\n", src_audio_path); return 1; } if (wav_sr != 48000) { @@ -229,7 +229,7 @@ int main(int argc, char ** argv) { vae_enc_encode_tiled(&vae_enc, wav_data, T_audio, cover_latents.data(), max_T_lat, vae_chunk, vae_overlap); free(wav_data); if (T_cover < 0) { - fprintf(stderr, "FATAL: VAE encode of src_audio failed\n"); + fprintf(stderr, "[VAE-Enc] FATAL: encode failed\n"); vae_enc_free(&vae_enc); return 1; } @@ -257,11 +257,11 @@ int main(int argc, char ** argv) { AceRequest req; request_init(&req); if (!request_parse(&req, rpath)) { - fprintf(stderr, "ERROR: failed to parse %s, skipping\n", rpath); + fprintf(stderr, "[Request] ERROR: failed to parse %s, skipping\n", rpath); continue; } if (req.caption.empty()) { - fprintf(stderr, "ERROR: caption is empty in %s, skipping\n", rpath); + 
fprintf(stderr, "[Request] ERROR: caption is empty in %s, skipping\n", rpath); continue; } @@ -332,7 +332,7 @@ int main(int argc, char ** argv) { guidance_scale, shift, duration); if (T > 15000) { - fprintf(stderr, "ERROR: T=%d exceeds silence_latent max 15000, skipping\n", T); + fprintf(stderr, "[Pipeline] ERROR: T=%d exceeds silence_latent max 15000, skipping\n", T); continue; } @@ -341,7 +341,7 @@ int main(int argc, char ** argv) { timer.reset(); BPETokenizer tok; if (!load_bpe_from_gguf(&tok, text_enc_gguf)) { - fprintf(stderr, "FATAL: failed to load tokenizer from %s\n", text_enc_gguf); + fprintf(stderr, "[BPE] FATAL: failed to load tokenizer from %s\n", text_enc_gguf); dit_ggml_free(&model); if (have_vae) { vae_ggml_free(&vae); @@ -350,14 +350,48 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] BPE tokenizer: %.1f ms\n", timer.ms()); + // Repaint mode: resolve start/end, requires --src-audio + // Both -1 = inactive. One or both >= 0 activates repaint. + bool is_repaint = false; + float rs = req.repainting_start; + float re = req.repainting_end; + if (rs >= 0.0f || re >= 0.0f) { + if (!have_cover) { + fprintf(stderr, "[Repaint] ERROR: repainting_start/end require --src-audio\n"); + return 1; + } + float src_dur = (float) T_cover * 1920.0f / 48000.0f; + if (rs < 0.0f) { + rs = 0.0f; + } + if (re < 0.0f) { + re = src_dur; + } + if (rs > src_dur) { + rs = src_dur; + } + if (re > src_dur) { + re = src_dur; + } + if (re > rs) { + is_repaint = true; + fprintf(stderr, "[Repaint] region: %.1fs - %.1fs (src=%.1fs)\n", rs, re, src_dur); + } else { + fprintf(stderr, "[Repaint] ERROR: repainting_end (%.1f) <= repainting_start (%.1f)\n", re, rs); + return 1; + } + } + // 2. Build formatted prompts // Reference project uses opposite-sounding instructions (constants.py): // text2music = "Fill the audio semantic mask..." // cover = "Generate audio semantic tokens..." + // repaint = "Repaint the mask area..." 
// Auto-switches to cover when audio_codes are present bool is_cover = have_cover || !codes_vec.empty(); - const char * instruction = is_cover ? "Generate audio semantic tokens based on the given conditions:" : - "Fill the audio semantic mask based on the given conditions:"; + const char * instruction = is_repaint ? "Repaint the mask area based on the given conditions:" : + is_cover ? "Generate audio semantic tokens based on the given conditions:" : + "Fill the audio semantic mask based on the given conditions:"; char metas[512]; snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm, timesig, keyscale, (int) duration); @@ -381,7 +415,7 @@ int main(int argc, char ** argv) { text_enc.use_flash_attn = false; } if (!qwen3_load_text_encoder(&text_enc, text_enc_gguf)) { - fprintf(stderr, "FATAL: failed to load text encoder\n"); + fprintf(stderr, "[TextEncoder] FATAL: failed to load\n"); dit_ggml_free(&model); if (have_vae) { vae_ggml_free(&vae); @@ -413,7 +447,7 @@ int main(int argc, char ** argv) { cond.use_flash_attn = false; } if (!cond_ggml_load(&cond, dit_gguf)) { - fprintf(stderr, "FATAL: failed to load condition encoder\n"); + fprintf(stderr, "[CondEncoder] FATAL: failed to load\n"); dit_ggml_free(&model); if (have_vae) { vae_ggml_free(&vae); @@ -445,7 +479,7 @@ int main(int argc, char ** argv) { timer.reset(); DetokGGML detok = {}; if (!detok_ggml_load(&detok, dit_gguf, model.backend, model.cpu_backend)) { - fprintf(stderr, "FATAL: failed to load detokenizer\n"); + fprintf(stderr, "[Detokenizer] FATAL: failed to load\n"); dit_ggml_free(&model); if (have_vae) { vae_ggml_free(&vae); @@ -464,7 +498,7 @@ int main(int argc, char ** argv) { timer.reset(); int ret = detok_ggml_decode(&detok, codes_vec.data(), T_5Hz, decoded_latents.data()); if (ret < 0) { - fprintf(stderr, "FATAL: detokenizer decode failed\n"); + fprintf(stderr, "[Detokenizer] FATAL: decode failed\n"); dit_ggml_free(&model); if (have_vae) { 
vae_ggml_free(&vae); @@ -478,19 +512,40 @@ int main(int argc, char ** argv) { detok_ggml_free(&detok); } - // Build context: [T, ctx_ch] = src_latents[64] + mask_ones[64] - // Cover: VAE latents directly (matching Python: is_covers=False, raw latents as context) - // Passthrough: detokenized FSQ codes + silence padding - // Text2music: silence only + // Build context: [T, ctx_ch] = src_latents[64] + chunk_mask[64] + // Cover: src = cover_latents, mask = 1.0 everywhere + // Repaint: src = silence in region / cover outside, mask = 1.0 in region / 0.0 outside + // Passthrough: detokenized FSQ codes + silence padding, mask = 1.0 + // Text2music: silence only, mask = 1.0 + int repaint_t0 = 0, repaint_t1 = 0; + if (is_repaint) { + repaint_t0 = (int) (rs * 48000.0f / 1920.0f); // sec -> latent frames (25 Hz) + repaint_t1 = (int) (re * 48000.0f / 1920.0f); + if (repaint_t0 < 0) { + repaint_t0 = 0; + } + if (repaint_t1 > T) { + repaint_t1 = T; + } + if (repaint_t0 > T) { + repaint_t0 = T; + } + fprintf(stderr, "[Repaint] latent frames: [%d, %d) / %d\n", repaint_t0, repaint_t1, T); + } std::vector context_single(T * ctx_ch); if (have_cover) { for (int t = 0; t < T; t++) { - const float * src = (t < T_cover) ? cover_latents.data() + t * Oc : silence_full.data() + t * Oc; + bool in_region = is_repaint && t >= repaint_t0 && t < repaint_t1; + // src: silence in repaint region, cover_latents outside + const float * src = in_region ? + silence_full.data() + t * Oc : + ((t < T_cover) ? cover_latents.data() + t * Oc : silence_full.data() + t * Oc); + float mask_val = is_repaint ? (in_region ? 
1.0f : 0.0f) : 1.0f; for (int c = 0; c < Oc; c++) { context_single[t * ctx_ch + c] = src[c]; } for (int c = 0; c < Oc; c++) { - context_single[t * ctx_ch + Oc + c] = 1.0f; + context_single[t * ctx_ch + Oc + c] = mask_val; } } } else { @@ -514,9 +569,10 @@ int main(int argc, char ** argv) { // Cover mode: build silence context for audio_cover_strength switching // When step >= cover_steps, DiT switches from cover context to silence context + // Repaint mode: mask handles region selection, no context switching needed std::vector context_silence; int cover_steps = -1; - if (have_cover) { + if (have_cover && !is_repaint) { float cover_strength = req.audio_cover_strength; if (cover_strength < 1.0f) { // Build silence context: all frames use silence_latent From 05829eccb34a3b17908ad8e69107328f63ff2c23 Mon Sep 17 00:00:00 2001 From: Pascal Date: Sat, 7 Mar 2026 20:43:54 +0100 Subject: [PATCH 3/3] doc: repaint requires SFT model --- README.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 6e27447..8fd25a4 100644 --- a/README.md +++ b/README.md @@ -208,11 +208,13 @@ Duration is determined by the source audio. **Repaint** (`--src-audio` + `repainting_start`/`repainting_end` in JSON): regenerates a time region of the source audio while preserving the rest. +Requires the **SFT model** (the turbo model is less performant for this task). The DiT receives a binary mask: 1.0 inside the region (generate), 0.0 outside (keep original). Source latents outside the region provide context; silence -fills the repaint zone. Both fields default to -1 (inactive). Set one or both -to activate: -1 on start means 0s, -1 on end means source duration. -`audio_cover_strength` is ignored in repaint mode (the mask handles everything). +fills the repaint zone. Both fields default to -1 +(inactive). Set one or both to activate: -1 on start means 0s, -1 on end means +source duration. 
`audio_cover_strength` is ignored in repaint mode (the mask +handles everything). ```bash cat > /tmp/repaint.json << 'EOF' @@ -220,7 +222,10 @@ cat > /tmp/repaint.json << 'EOF' "caption": "Smooth jazz guitar solo with reverb", "lyrics": "[Instrumental]", "repainting_start": 10.0, - "repainting_end": 25.0 + "repainting_end": 25.0, + "inference_steps": 50, + "guidance_scale": 7.0, + "shift": 1.0 } EOF @@ -228,7 +233,7 @@ EOF --src-audio song.wav \ --request /tmp/repaint.json \ --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \ - --dit models/acestep-v15-turbo-Q8_0.gguf \ + --dit models/acestep-v15-sft-Q8_0.gguf \ --vae models/vae-BF16.gguf ```