Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 42 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,37 @@ is VAE-encoded to latent space and used as DiT context instead of silence.
steers the style while the source provides structure, melody, and rhythm.
Duration is determined by the source audio.

**Repaint** (`--src-audio` + `repainting_start`/`repainting_end` in JSON):
regenerates a time region of the source audio while preserving the rest.
Requires the **SFT model** (the turbo model is less performant for this task).
The DiT receives a binary mask: 1.0 inside the region (generate), 0.0 outside
(keep original). Source latents outside the region provide context; silence
fills the repaint zone. Both fields default to -1
(inactive). Set one or both to a value >= 0 to activate repaint; a field left
at -1 is then resolved automatically: -1 on start means 0s, -1 on end means the
source duration. `audio_cover_strength` is ignored in repaint mode (the mask
handles everything).

```bash
cat > /tmp/repaint.json << 'EOF'
{
"caption": "Smooth jazz guitar solo with reverb",
"lyrics": "[Instrumental]",
"repainting_start": 10.0,
"repainting_end": 25.0,
"inference_steps": 50,
"guidance_scale": 7.0,
"shift": 1.0
}
EOF

./build/dit-vae \
--src-audio song.wav \
--request /tmp/repaint.json \
--text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
--dit models/acestep-v15-sft-Q8_0.gguf \
--vae models/vae-BF16.gguf
```

## Request JSON reference

Only `caption` is required. All other fields default to "unset", which means
Expand All @@ -230,7 +261,9 @@ the LLM fills them, or a sensible runtime default is applied.
"inference_steps": 8,
"guidance_scale": 0.0,
"shift": 3.0,
"audio_cover_strength": 0.5
"audio_cover_strength": 0.5,
"repainting_start": -1,
"repainting_end": -1
}
```

Expand Down Expand Up @@ -289,6 +322,13 @@ half of the steps are guided by the source structure, the second half are free
to follow the caption. Lower values give more creative freedom, higher values
preserve more of the original.

**`repainting_start`** (float seconds, default `-1` = inactive)
**`repainting_end`** (float seconds, default `-1` = inactive)
Only used with `--src-audio`. When one or both are >= 0, repaint mode activates:
the DiT regenerates the `[start, end)` time region while preserving everything
else. `-1` on start means 0s (beginning), `-1` on end means source duration
(end). It is an error if end <= start after these `-1` values are resolved.
`audio_cover_strength` is ignored in repaint mode.

### LM sampling (ace-qwen3)

**`lm_temperature`** (float, default `0.85`)
Expand Down Expand Up @@ -487,7 +527,7 @@ No cloud, no black box, scriptable and nothing between you and the model.

### dit-vae
- [x] Reference audio input: `--src-audio` + `audio_cover_strength`
- [ ] Repaint: selective region regeneration (repainting_start/end)
- [x] Repaint: selective region regeneration (repainting_start/end)

### Audio I/O
The binaries read and write 48kHz stereo 16-bit PCM WAV. No codec library,
Expand Down
4 changes: 2 additions & 2 deletions src/bpe.h
Original file line number Diff line number Diff line change
Expand Up @@ -540,11 +540,11 @@ static bool load_bpe_tokenizer(BPETokenizer * tok, const char * dir) {
std::string merges_path = std::string(dir) + "/merges.txt";

if (!load_vocab_json(vocab_path, tok->vocab)) {
fprintf(stderr, "ERROR: failed to load %s\n", vocab_path.c_str());
fprintf(stderr, "[BPE] ERROR: failed to load %s\n", vocab_path.c_str());
return false;
}
if (!load_merges(merges_path, tok->merges)) {
fprintf(stderr, "ERROR: failed to load %s\n", merges_path.c_str());
fprintf(stderr, "[BPE] ERROR: failed to load %s\n", merges_path.c_str());
return false;
}

Expand Down
2 changes: 1 addition & 1 deletion src/dit-sampler.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ static void dit_ggml_generate(DiTGGML * model,
}
}
if (!ggml_backend_sched_alloc_graph(model->sched, gf)) {
fprintf(stderr, "FATAL: failed to allocate graph\n");
fprintf(stderr, "[DiT] FATAL: failed to allocate graph\n");
ggml_free(ctx);
return;
}
Expand Down
11 changes: 11 additions & 0 deletions src/request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ void request_init(AceRequest * r) {
r->guidance_scale = 0.0f;
r->shift = 3.0f;
r->audio_cover_strength = 0.5f;
r->repainting_start = -1.0f;
r->repainting_end = -1.0f;
}

// JSON string escape / unescape
Expand Down Expand Up @@ -312,6 +314,10 @@ bool request_parse(AceRequest * r, const char * path) {
r->shift = (float) atof(v.c_str());
} else if (k == "audio_cover_strength") {
r->audio_cover_strength = (float) atof(v.c_str());
} else if (k == "repainting_start") {
r->repainting_start = (float) atof(v.c_str());
} else if (k == "repainting_end") {
r->repainting_end = (float) atof(v.c_str());
}
}

Expand Down Expand Up @@ -344,6 +350,8 @@ bool request_write(const AceRequest * r, const char * path) {
fprintf(f, " \"guidance_scale\": %.1f,\n", r->guidance_scale);
fprintf(f, " \"shift\": %.1f,\n", r->shift);
fprintf(f, " \"audio_cover_strength\": %.2f,\n", r->audio_cover_strength);
fprintf(f, " \"repainting_start\": %.1f,\n", r->repainting_start);
fprintf(f, " \"repainting_end\": %.1f,\n", r->repainting_end);
// audio_codes last (no trailing comma)
fprintf(f, " \"audio_codes\": \"%s\"\n", json_escape(r->audio_codes).c_str());
fprintf(f, "}\n");
Expand All @@ -365,5 +373,8 @@ void request_dump(const AceRequest * r, FILE * f) {
if (r->audio_cover_strength != 0.5f) {
fprintf(f, " cover: strength=%.2f\n", r->audio_cover_strength);
}
if (r->repainting_start >= 0.0f || r->repainting_end >= 0.0f) {
fprintf(f, " repaint: start=%.1f end=%.1f\n", r->repainting_start, r->repainting_end);
}
fprintf(f, " audio_codes: %s\n", r->audio_codes.empty() ? "(none)" : "(present)");
}
7 changes: 6 additions & 1 deletion src/request.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
// request.h - AceStep generation request (JSON serialization)
//
// Pure data container + JSON read/write. Zero business logic.
// Aligned with Python GenerationParams (inference.py:39) and API /release_task.
//

#include <cstdint>
Expand Down Expand Up @@ -43,6 +42,12 @@ struct AceRequest {

// cover mode (active when --src-audio is provided on CLI)
float audio_cover_strength; // 0.5 (0-1, fraction of DiT steps using source context)

// repaint mode (requires --src-audio)
// Both -1 = no repaint (plain cover). One or both >= 0 activates repaint.
// -1 on start means 0s, -1 on end means source duration.
float repainting_start; // -1
float repainting_end; // -1
};

// Initialize all fields to defaults (matches Python GenerationParams defaults)
Expand Down
1 change: 0 additions & 1 deletion tests/request0.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"keyscale": "G major",
"timesignature": "4",
"vocal_language": "fr",
"task_type": "text2music",
"seed": 158961132,
"thinking": true,
"lm_temperature": 0.85,
Expand Down
6 changes: 3 additions & 3 deletions tools/ace-qwen3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -692,12 +692,12 @@ int main(int argc, char ** argv) {
}

if (!model_path) {
fprintf(stderr, "ERROR: --model required\n");
fprintf(stderr, "[CLI] ERROR: --model required\n");
usage(argv[0]);
return 1;
}
if (!request_path) {
fprintf(stderr, "ERROR: --request required\n");
fprintf(stderr, "[CLI] ERROR: --request required\n");
usage(argv[0]);
return 1;
}
Expand All @@ -710,7 +710,7 @@ int main(int argc, char ** argv) {
request_dump(&req, stderr);

if (req.caption.empty()) {
fprintf(stderr, "ERROR: caption is empty in %s\n", request_path);
fprintf(stderr, "[Request] ERROR: caption is empty in %s\n", request_path);
return 1;
}

Expand Down
Loading