audiohacking · lmangani · Mar 8, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/README.md b/README.md
@@ -206,6 +206,37 @@ is VAE-encoded to latent space and used as DiT context instead of silence.
 steers the style while the source provides structure, melody, and rhythm.
 Duration is determined by the source audio.
 
+**Repaint** (`--src-audio` + `repainting_start`/`repainting_end` in JSON):
+regenerates a time region of the source audio while preserving the rest.
+Requires the **SFT model** (the turbo model performs worse on this task).
+The DiT receives a binary mask: 1.0 inside the region (generate), 0.0 outside
+(keep original). Source latents outside the region provide context; silence
+fills the repaint zone. Both fields default to -1 (inactive). Setting either
+one to a value >= 0 activates repaint; a field left at -1 then resolves to 0s
+(for start) or to the source duration (for end). `audio_cover_strength` is
+ignored in repaint mode (the mask handles everything).
+
+```bash
+cat > /tmp/repaint.json << 'EOF'
+{
+    "caption": "Smooth jazz guitar solo with reverb",
+    "lyrics": "[Instrumental]",
+    "repainting_start": 10.0,
+    "repainting_end": 25.0,
+    "inference_steps": 50,
+    "guidance_scale": 7.0,
+    "shift": 1.0
+}
+EOF
+
+./build/dit-vae \
+    --src-audio song.wav \
+    --request /tmp/repaint.json \
+    --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+    --dit models/acestep-v15-sft-Q8_0.gguf \
+    --vae models/vae-BF16.gguf
+```
+
 ## Request JSON reference
 
 Only `caption` is required. All other fields default to "unset" which means
@@ -230,7 +261,9 @@ the LLM fills them, or a sensible runtime default is applied.
     "inference_steps":      8,
     "guidance_scale":       0.0,
     "shift":                3.0,
-    "audio_cover_strength": 0.5
+    "audio_cover_strength": 0.5,
+    "repainting_start":    -1,
+    "repainting_end":      -1
 }
 ```
 
@@ -289,6 +322,13 @@ half of the steps are guided by the source structure, the second half are free
 to follow the caption. Lower values give more creative freedom, higher values
 preserve more of the original.
 
+**`repainting_start`** (float seconds, default `-1` = inactive)
+**`repainting_end`** (float seconds, default `-1` = inactive)
+Only used with `--src-audio`. When one or both are >= 0, repaint mode activates:
+the DiT regenerates the `[start, end)` time region while preserving everything
+else. `-1` on start means 0s (beginning), `-1` on end means source duration
+(end). It is an error if end <= start once resolved. `audio_cover_strength` is ignored.
+
 ### LM sampling (ace-qwen3)
 
 **`lm_temperature`** (float, default `0.85`)
@@ -487,7 +527,7 @@ No cloud, no black box, scriptable and nothing between you and the model.
 
 ### dit-vae
 - [x] Reference audio input: `--src-audio` + `audio_cover_strength`
-- [ ] Repaint: selective region regeneration (repainting_start/end)
+- [x] Repaint: selective region regeneration (repainting_start/end)
 
 ### Audio I/O
 The binaries read and write 48kHz stereo 16-bit PCM WAV. No codec library,

diff --git a/src/bpe.h b/src/bpe.h
@@ -540,11 +540,11 @@ static bool load_bpe_tokenizer(BPETokenizer * tok, const char * dir) {
     std::string merges_path = std::string(dir) + "/merges.txt";
 
     if (!load_vocab_json(vocab_path, tok->vocab)) {
-        fprintf(stderr, "ERROR: failed to load %s\n", vocab_path.c_str());
+        fprintf(stderr, "[BPE] ERROR: failed to load %s\n", vocab_path.c_str());
         return false;
     }
     if (!load_merges(merges_path, tok->merges)) {
-        fprintf(stderr, "ERROR: failed to load %s\n", merges_path.c_str());
+        fprintf(stderr, "[BPE] ERROR: failed to load %s\n", merges_path.c_str());
         return false;
     }
 

diff --git a/src/dit-sampler.h b/src/dit-sampler.h
@@ -188,7 +188,7 @@ static void dit_ggml_generate(DiTGGML *           model,
         }
     }
     if (!ggml_backend_sched_alloc_graph(model->sched, gf)) {
-        fprintf(stderr, "FATAL: failed to allocate graph\n");
+        fprintf(stderr, "[DiT] FATAL: failed to allocate graph\n");
         ggml_free(ctx);
         return;
     }

diff --git a/src/request.cpp b/src/request.cpp
@@ -31,6 +31,8 @@ void request_init(AceRequest * r) {
     r->guidance_scale       = 0.0f;
     r->shift                = 3.0f;
     r->audio_cover_strength = 0.5f;
+    r->repainting_start     = -1.0f;
+    r->repainting_end       = -1.0f;
 }
 
 // JSON string escape / unescape
@@ -312,6 +314,10 @@ bool request_parse(AceRequest * r, const char * path) {
             r->shift = (float) atof(v.c_str());
         } else if (k == "audio_cover_strength") {
             r->audio_cover_strength = (float) atof(v.c_str());
+        } else if (k == "repainting_start") {
+            r->repainting_start = (float) atof(v.c_str());
+        } else if (k == "repainting_end") {
+            r->repainting_end = (float) atof(v.c_str());
         }
     }
 
@@ -344,6 +350,8 @@ bool request_write(const AceRequest * r, const char * path) {
     fprintf(f, "  \"guidance_scale\": %.1f,\n", r->guidance_scale);
     fprintf(f, "  \"shift\": %.1f,\n", r->shift);
     fprintf(f, "  \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength);
+    fprintf(f, "  \"repainting_start\": %.1f,\n", r->repainting_start);
+    fprintf(f, "  \"repainting_end\": %.1f,\n", r->repainting_end);
     // audio_codes last (no trailing comma)
     fprintf(f, "  \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str());
     fprintf(f, "}\n");
@@ -365,5 +373,8 @@ void request_dump(const AceRequest * r, FILE * f) {
     if (r->audio_cover_strength != 0.5f) {
         fprintf(f, "  cover: strength=%.2f\n", r->audio_cover_strength);
     }
+    if (r->repainting_start >= 0.0f || r->repainting_end >= 0.0f) {
+        fprintf(f, "  repaint: start=%.1f end=%.1f\n", r->repainting_start, r->repainting_end);
+    }
     fprintf(f, "  audio_codes: %s\n", r->audio_codes.empty() ? "(none)" : "(present)");
 }
diff --git a/src/request.h b/src/request.h
@@ -3,7 +3,6 @@
 // request.h - AceStep generation request (JSON serialization)
 //
 // Pure data container + JSON read/write. Zero business logic.
-// Aligned with Python GenerationParams (inference.py:39) and API /release_task.
 //
 
 #include <cstdint>
@@ -43,6 +42,12 @@ struct AceRequest {
 
     // cover mode (active when --src-audio is provided on CLI)
     float audio_cover_strength;  // 0.5 (0-1, fraction of DiT steps using source context)
+
+    // repaint mode (requires --src-audio)
+    // Both -1 = no repaint (plain cover). One or both >= 0 activates repaint.
+    // -1 on start means 0s, -1 on end means source duration.
+    float repainting_start;  // -1
+    float repainting_end;    // -1
 };
 
 // Initialize all fields to defaults (matches Python GenerationParams defaults)

diff --git a/tests/request0.json b/tests/request0.json
@@ -6,7 +6,6 @@
   "keyscale": "G major",
   "timesignature": "4",
   "vocal_language": "fr",
-  "task_type": "text2music",
   "seed": 158961132,
   "thinking": true,
   "lm_temperature": 0.85,

diff --git a/tools/ace-qwen3.cpp b/tools/ace-qwen3.cpp
@@ -692,12 +692,12 @@ int main(int argc, char ** argv) {
     }
 
     if (!model_path) {
-        fprintf(stderr, "ERROR: --model required\n");
+        fprintf(stderr, "[CLI] ERROR: --model required\n");
         usage(argv[0]);
         return 1;
     }
     if (!request_path) {
-        fprintf(stderr, "ERROR: --request required\n");
+        fprintf(stderr, "[CLI] ERROR: --request required\n");
         usage(argv[0]);
         return 1;
     }
@@ -710,7 +710,7 @@ int main(int argc, char ** argv) {
     request_dump(&req, stderr);
 
     if (req.caption.empty()) {
-        fprintf(stderr, "ERROR: caption is empty in %s\n", request_path);
+        fprintf(stderr, "[Request] ERROR: caption is empty in %s\n", request_path);
         return 1;
     }