audiohacking · lmangani · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
@@ -64,7 +64,7 @@ jobs:
       - name: Run clang-format (check mode)
         run: |
           find . \
-          \( -path './.git' -o -path './ggml' -o -path './build' \) -prune -o \
+          \( -path './.git' -o -path './ggml' -o -path './build' -o -path './vendor' -o -path './mp3' \) -prune -o \
           -type f \( -name '*.c' -o -name '*.h' -o -name '*.cc' -o -name '*.cpp' -o -name '*.hpp' \) \
           -print0 | xargs -0 clang-format --dry-run --Werror
 

diff --git a/README.md b/README.md
@@ -258,6 +258,16 @@ EOF
     --vae models/vae-BF16.gguf
 ```
 
+**Lego** (`--lego <track>` + `--src-audio`):
+generates a new instrument track layered over an existing backing track.
+Only the **base model** (`acestep-v15-base`) supports lego mode.
+The track name is passed on the CLI; set `audio_cover_strength=1.0` in the
+request so the source audio guides all DiT steps.
+See `examples/lego.json` and `examples/lego.sh`.
+
+Available track names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`,
+`keyboard`, `percussion`, `strings`, `synth`, `fx`, `brass`, `woodwinds`.
+
 ## Request JSON reference
 
 Only `caption` is required. All other fields default to "unset" which means

diff --git a/examples/lego.json b/examples/lego.json
@@ -0,0 +1,6 @@
+{
+    "caption": "electric guitar riff, funk guitar, house music, instrumental",
+    "audio_cover_strength": 1.0,
+    "inference_steps": 50,
+    "guidance_scale": 7.0
+}
diff --git a/examples/lego.sh b/examples/lego.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Lego test: self-contained pipeline in three steps (run from the examples/ directory; all paths are relative to it).
+#
+# step zero: download the base DiT model if not already present
+#            (lego requires acestep-v15-base; turbo/sft do not support it)
+# step one:  generate a track from the simple prompt
+# step two:  apply lego guitar to that generated track
+
+set -eu
+
+# Step 0: ensure the base model is available
+echo "=== Step 0: ensure base model ==="
+(cd .. && ./models.sh --base)
+
+# Step 1: generate a source track with the simple prompt
+echo "=== Step 1: generate track ==="
+../build/ace-qwen3 \
+    --request simple.json \
+    --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf
+
+../build/dit-vae \
+    --request simple0.json \
+    --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+    --dit ../models/acestep-v15-turbo-Q8_0.gguf \
+    --vae ../models/vae-BF16.gguf \
+    --wav
+
+# Step 2: lego guitar on the generated track (base model required)
+echo "=== Step 2: lego guitar ==="
+../build/dit-vae \
+    --src-audio simple00.wav \
+    --lego guitar \
+    --request lego.json \
+    --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+    --dit ../models/acestep-v15-base-Q8_0.gguf \
+    --vae ../models/vae-BF16.gguf \
+    --wav
diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp
@@ -32,6 +32,11 @@ static void print_usage(const char * prog) {
             "  --vae <gguf>            VAE GGUF file\n\n"
             "Reference audio:\n"
             "  --src-audio <file>      Source audio (WAV or MP3, any sample rate)\n\n"
+            "Lego mode (base model only, requires --src-audio):\n"
+            "  --lego <track>          Generate a track over the source audio context\n"
+            "                          Track names: vocals, backing_vocals, drums, bass,\n"
+            "                          guitar, keyboard, percussion, strings, synth,\n"
+            "                          fx, brass, woodwinds\n\n"
             "LoRA:\n"
             "  --lora <path>           LoRA safetensors file or directory\n"
             "  --lora-scale <float>    LoRA scaling factor (default: 1.0)\n\n"
@@ -83,6 +88,7 @@ int main(int argc, char ** argv) {
     const char *              dit_gguf       = NULL;
     const char *              vae_gguf       = NULL;
     const char *              src_audio_path = NULL;
+    const char *              lego_track     = NULL;  // --lego <track>
     const char *              dump_dir       = NULL;
     const char *              lora_path      = NULL;
     float                     lora_scale     = 1.0f;
@@ -107,6 +113,8 @@ int main(int argc, char ** argv) {
             vae_gguf = argv[++i];
         } else if (strcmp(argv[i], "--src-audio") == 0 && i + 1 < argc) {
             src_audio_path = argv[++i];
+        } else if (strcmp(argv[i], "--lego") == 0 && i + 1 < argc) {
+            lego_track = argv[++i];
         } else if (strcmp(argv[i], "--lora") == 0 && i + 1 < argc) {
             lora_path = argv[++i];
         } else if (strcmp(argv[i], "--lora-scale") == 0 && i + 1 < argc) {
@@ -144,6 +152,10 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "[CLI] ERROR: --batch must be 1..9\n");
         return 1;
     }
+    if (lego_track && !src_audio_path) {
+        fprintf(stderr, "[CLI] ERROR: --lego requires --src-audio\n");
+        return 1;
+    }
     if (!dit_gguf) {
         fprintf(stderr, "[CLI] ERROR: --dit required\n");
         print_usage(argv[0]);
@@ -410,12 +422,25 @@ int main(int argc, char ** argv) {
         //   text2music = "Fill the audio semantic mask..."
         //   cover      = "Generate audio semantic tokens..."
         //   repaint    = "Repaint the mask area..."
+        //   lego       = "Generate the {track} track based on the audio context:"
         // Auto-switches to cover when audio_codes are present
-        bool         is_cover    = have_cover || !codes_vec.empty();
-        const char * instruction = is_repaint ? "Repaint the mask area based on the given conditions:" :
-                                   is_cover   ? "Generate audio semantic tokens based on the given conditions:" :
-                                                "Fill the audio semantic mask based on the given conditions:";
-        char         metas[512];
+        bool is_cover = have_cover || !codes_vec.empty();
+
+        // Lego: build the instruction from the --lego <track> name; takes precedence over repaint/cover
+        char         lego_instruction[256] = {};
+        const char * instruction;
+        if (lego_track) {
+            snprintf(lego_instruction, sizeof(lego_instruction),
+                     "Generate the %s track based on the audio context:", lego_track);
+            instruction = lego_instruction;
+            fprintf(stderr, "[Lego] track=%s\n", lego_track);
+        } else {
+            instruction = is_repaint ? "Repaint the mask area based on the given conditions:" :
+                          is_cover   ? "Generate audio semantic tokens based on the given conditions:" :
+                                       "Fill the audio semantic mask based on the given conditions:";
+        }
+
+        char metas[512];
         snprintf(metas, sizeof(metas), "- bpm: %s\n- timesignature: %s\n- keyscale: %s\n- duration: %d seconds\n", bpm,
                  timesig, keyscale, (int) duration);
         std::string text_str = std::string("# Instruction\n") + instruction + "\n\n" + "# Caption\n" + caption +