Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:
- name: Run clang-format (check mode)
run: |
find . \
\( -path './.git' -o -path './ggml' -o -path './build' \) -prune -o \
\( -path './.git' -o -path './ggml' -o -path './build' -o -path './vendor' -o -path './mp3' \) -prune -o \
-type f \( -name '*.c' -o -name '*.h' -o -name '*.cc' -o -name '*.cpp' -o -name '*.hpp' \) \
-print0 | xargs -0 clang-format --dry-run --Werror

Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,16 @@ EOF
--vae models/vae-BF16.gguf
```

**Lego** (`--lego <track>` + `--src-audio`):
generates a new instrument track layered over an existing backing track.
Only the **base model** (`acestep-v15-base`) supports lego mode.
The track name is passed on the CLI; set `audio_cover_strength=1.0` in the
request so the source audio guides all DiT steps.
See `examples/lego.json` and `examples/lego.sh`.

Available track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`,
`keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`.

## Request JSON reference

Only `caption` is required. All other fields default to "unset" which means
Expand Down
6 changes: 6 additions & 0 deletions examples/lego.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"caption": "electric guitar riff, funk guitar, house music, instrumental",
"audio_cover_strength": 1.0,
"inference_steps": 50,
"guidance_scale": 7.0
}
37 changes: 37 additions & 0 deletions examples/lego.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Lego example: self-contained pipeline (steps 0-2).
#
#   step 0: run ../models.sh --base to make sure the base DiT model is present
#           (lego requires acestep-v15-base; turbo/sft do not support it)
#   step 1: generate a source track from the simple prompt (simple.json)
#   step 2: apply lego guitar to that generated track (lego.json)
#
# Can be launched from any working directory: the script first moves into
# its own directory, because every path below is relative to it.

set -eu

# All paths below (.., ../build, ../models, simple.json, lego.json) are
# relative to the directory containing this script. Without this cd the
# script only works when the caller's cwd already is that directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

# Step 0: ensure the base model is available
echo "=== Step 0: ensure base model ==="
(cd .. && ./models.sh --base)

# Step 1: generate a source track with the simple prompt
echo "=== Step 1: generate track ==="
../build/ace-qwen3 \
    --request simple.json \
    --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf

../build/dit-vae \
    --request simple0.json \
    --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
    --dit ../models/acestep-v15-turbo-Q8_0.gguf \
    --vae ../models/vae-BF16.gguf \
    --wav

# Step 2: lego guitar on the generated track (base model required)
echo "=== Step 2: lego guitar ==="
../build/dit-vae \
    --src-audio simple00.wav \
    --lego guitar \
    --request lego.json \
    --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
    --dit ../models/acestep-v15-base-Q8_0.gguf \
    --vae ../models/vae-BF16.gguf \
    --wav
35 changes: 30 additions & 5 deletions tools/dit-vae.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ static void print_usage(const char * prog) {
" --vae <gguf> VAE GGUF file\n\n"
"Reference audio:\n"
" --src-audio <file> Source audio (WAV or MP3, any sample rate)\n\n"
"Lego mode (base model only, requires --src-audio):\n"
" --lego <track> Generate a track over the source audio context\n"
" Track names: vocals, backing_vocals, drums, bass,\n"
" guitar, keyboard, percussion, strings, synth,\n"
" fx, brass, woodwinds\n\n"
"LoRA:\n"
" --lora <path> LoRA safetensors file or directory\n"
" --lora-scale <float> LoRA scaling factor (default: 1.0)\n\n"
Expand Down Expand Up @@ -83,6 +88,7 @@ int main(int argc, char ** argv) {
const char * dit_gguf = NULL;
const char * vae_gguf = NULL;
const char * src_audio_path = NULL;
const char * lego_track = NULL; // --lego <track>
const char * dump_dir = NULL;
const char * lora_path = NULL;
float lora_scale = 1.0f;
Expand All @@ -107,6 +113,8 @@ int main(int argc, char ** argv) {
vae_gguf = argv[++i];
} else if (strcmp(argv[i], "--src-audio") == 0 && i + 1 < argc) {
src_audio_path = argv[++i];
} else if (strcmp(argv[i], "--lego") == 0 && i + 1 < argc) {
lego_track = argv[++i];
} else if (strcmp(argv[i], "--lora") == 0 && i + 1 < argc) {
lora_path = argv[++i];
} else if (strcmp(argv[i], "--lora-scale") == 0 && i + 1 < argc) {
Expand Down Expand Up @@ -144,6 +152,10 @@ int main(int argc, char ** argv) {
fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n");
return 1;
}
if (lego_track && !src_audio_path) {
fprintf(stderr, "[CLI] ERROR: --lego requires --src-audio\n");
return 1;
}
if (!dit_gguf) {
fprintf(stderr, "[CLI] ERROR: --dit required\n");
print_usage(argv[0]);
Expand Down Expand Up @@ -410,12 +422,25 @@ int main(int argc, char ** argv) {
// text2music = "Fill the audio semantic mask..."
// cover = "Generate audio semantic tokens..."
// repaint = "Repaint the mask area..."
// lego = "Generate the {track} track based on the audio context:"
// Auto-switches to cover when audio_codes are present
bool is_cover = have_cover || !codes_vec.empty();
const char * instruction = is_repaint ? "Repaint the mask area based on the given conditions:" :
is_cover ? "Generate audio semantic tokens based on the given conditions:" :
"Fill the audio semantic mask based on the given conditions:";
char metas[512];
bool is_cover = have_cover || !codes_vec.empty();

// Lego: build instruction from the track name supplied via --lego <track>
char lego_instruction[256] = {};
const char * instruction;
if (lego_track) {
snprintf(lego_instruction, sizeof(lego_instruction),
"Generate the %s track based on the audio context:", lego_track);
instruction = lego_instruction;
fprintf(stderr, "[Lego] track=%s\n", lego_track);
} else {
instruction = is_repaint ? "Repaint the mask area based on the given conditions:" :
is_cover ? "Generate audio semantic tokens based on the given conditions:" :
"Fill the audio semantic mask based on the given conditions:";
}

char metas[512];
snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm,
timesig, keyscale, (int) duration);
std::string text_str = std::string("# Instruction\n") + instruction + "\n\n" + "# Caption\n" + caption +
Expand Down
Loading