diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 78da84d..0a57c41 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -64,7 +64,7 @@ jobs: - name: Run clang-format (check mode) run: | find . \ - \( -path './.git' -o -path './ggml' -o -path './build' \) -prune -o \ + \( -path './.git' -o -path './ggml' -o -path './build' -o -path './vendor' -o -path './mp3' \) -prune -o \ -type f \( -name '*.c' -o -name '*.h' -o -name '*.cc' -o -name '*.cpp' -o -name '*.hpp' \) \ -print0 | xargs -0 clang-format --dry-run --Werror diff --git a/README.md b/README.md index d71b0a9..2c0ff48 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,16 @@ EOF --vae models/vae-BF16.gguf ``` +**Lego** (`--lego <track>` + `--src-audio`): +generates a new instrument track layered over an existing backing track. +Only the **base model** (`acestep-v15-base`) supports lego mode. +The track name is passed on the CLI; set `audio_cover_strength=1.0` in the +request so the source audio guides all DiT steps. +See `examples/lego.json` and `examples/lego.sh`. + +Available track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`, +`keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`. + ## Request JSON reference Only `caption` is required. All other fields default to "unset" which means diff --git a/examples/lego.json b/examples/lego.json new file mode 100644 index 0000000..d4138d6 --- /dev/null +++ b/examples/lego.json @@ -0,0 +1,6 @@ +{ + "caption": "electric guitar riff, funk guitar, house music, instrumental", + "audio_cover_strength": 1.0, + "inference_steps": 50, + "guidance_scale": 7.0 +} diff --git a/examples/lego.sh b/examples/lego.sh new file mode 100755 index 0000000..f954223 --- /dev/null +++ b/examples/lego.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Lego test: three-step self-contained pipeline. 
+# +# step zero: download the base DiT model if not already present +# (lego requires acestep-v15-base; turbo/sft do not support it) +# step one: generate a track from the simple prompt +# step two: apply lego guitar to that generated track + +set -eu + +# Step 0: ensure the base model is available +echo "=== Step 0: ensure base model ===" +(cd .. && ./models.sh --base) + +# Step 1: generate a source track with the simple prompt +echo "=== Step 1: generate track ===" +../build/ace-qwen3 \ + --request simple.json \ + --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf + +../build/dit-vae \ + --request simple0.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf \ + --wav + +# Step 2: lego guitar on the generated track (base model required) +echo "=== Step 2: lego guitar ===" +../build/dit-vae \ + --src-audio simple00.wav \ + --lego guitar \ + --request lego.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-base-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf \ + --wav diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index 585a089..6ac71a6 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -32,6 +32,11 @@ static void print_usage(const char * prog) { " --vae VAE GGUF file\n\n" "Reference audio:\n" " --src-audio Source audio (WAV or MP3, any sample rate)\n\n" + "Lego mode (base model only, requires --src-audio):\n" + " --lego Generate a track over the source audio context\n" + " Track names: vocals, backing_vocals, drums, bass,\n" + " guitar, keyboard, percussion, strings, synth,\n" + " fx, brass, woodwinds\n\n" "LoRA:\n" " --lora LoRA safetensors file or directory\n" " --lora-scale LoRA scaling factor (default: 1.0)\n\n" @@ -83,6 +88,7 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * src_audio_path = NULL; + const char * lego_track = NULL; // --lego const char * 
dump_dir = NULL; const char * lora_path = NULL; float lora_scale = 1.0f; @@ -107,6 +113,8 @@ int main(int argc, char ** argv) { vae_gguf = argv[++i]; } else if (strcmp(argv[i], "--src-audio") == 0 && i + 1 < argc) { src_audio_path = argv[++i]; + } else if (strcmp(argv[i], "--lego") == 0 && i + 1 < argc) { + lego_track = argv[++i]; } else if (strcmp(argv[i], "--lora") == 0 && i + 1 < argc) { lora_path = argv[++i]; } else if (strcmp(argv[i], "--lora-scale") == 0 && i + 1 < argc) { @@ -144,6 +152,10 @@ int main(int argc, char ** argv) { fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n"); return 1; } + if (lego_track && !src_audio_path) { + fprintf(stderr, "[CLI] ERROR: --lego requires --src-audio\n"); + return 1; + } if (!dit_gguf) { fprintf(stderr, "[CLI] ERROR: --dit required\n"); print_usage(argv[0]); @@ -410,12 +422,25 @@ int main(int argc, char ** argv) { // text2music = "Fill the audio semantic mask..." // cover = "Generate audio semantic tokens..." // repaint = "Repaint the mask area..." + // lego = "Generate the {track} track based on the audio context:" // Auto-switches to cover when audio_codes are present - bool is_cover = have_cover || !codes_vec.empty(); - const char * instruction = is_repaint ? "Repaint the mask area based on the given conditions:" : - is_cover ? "Generate audio semantic tokens based on the given conditions:" : - "Fill the audio semantic mask based on the given conditions:"; - char metas[512]; + bool is_cover = have_cover || !codes_vec.empty(); + + // Lego: build instruction from the track name supplied via --lego + char lego_instruction[256] = {}; + const char * instruction; + if (lego_track) { + snprintf(lego_instruction, sizeof(lego_instruction), + "Generate the %s track based on the audio context:", lego_track); + instruction = lego_instruction; + fprintf(stderr, "[Lego] track=%s\n", lego_track); + } else { + instruction = is_repaint ? "Repaint the mask area based on the given conditions:" : + is_cover ? 
"Generate audio semantic tokens based on the given conditions:" : + "Fill the audio semantic mask based on the given conditions:"; + } + + char metas[512]; snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm, timesig, keyscale, (int) duration); std::string text_str = std::string("# Instruction\n") + instruction + "\n\n" + "# Caption\n" + caption +