diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
index 5eb1757..013cf5b 100644
--- a/.github/workflows/ci-build.yml
+++ b/.github/workflows/ci-build.yml
@@ -40,7 +40,6 @@ jobs:
           ./build/ace-qwen3 --help 2>&1 | head -5
           ./build/dit-vae --help 2>&1 | head -5
           ./build/quantize --help 2>&1 | head -3
-
   lint:
     name: Lint & Static Analysis
     runs-on: ubuntu-latest
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..a31e093
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,164 @@
+# Build and publish portable release binaries for every supported platform.
+# Runs automatically when a GitHub Release is published, and can also be
+# triggered manually via workflow_dispatch (provide the existing release tag).
+name: Release Binaries
+
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      release_tag:
+        description: 'Release tag to attach binaries to (e.g. v0.1.0)'
+        required: true
+        type: string
+
+# Allow uploading assets to releases
+permissions:
+  contents: write
+
+jobs:
+  build:
+    name: Build · ${{ matrix.name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # ──────────── Linux ────────────────────────────────────────────
+          - name: linux-x64-cpu-blas
+            os: ubuntu-22.04
+            cmake_flags: -DGGML_BLAS=ON
+            apt_extra: pkg-config libopenblas-dev
+
+          - name: linux-x64-cuda
+            os: ubuntu-22.04
+            cmake_flags: -DGGML_CUDA=ON
+            install_cuda: true
+
+          - name: linux-x64-vulkan
+            os: ubuntu-22.04
+            cmake_flags: -DGGML_VULKAN=ON
+            install_vulkan: true
+
+          # ──────────── macOS ────────────────────────────────────────────
+          # macos-latest = arm64 (M-series); Metal + Accelerate auto-enabled
+          - name: macos-arm64-metal
+            os: macos-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # ── Linux: base build tools & optional apt packages ───────────────
+      - name: Install build tools (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y -qq cmake build-essential ${{ matrix.apt_extra || '' }}
+
+      # ── Linux: CUDA toolkit ────────────────
+      - name: Install CUDA toolkit (Linux)
+        uses: Jimver/cuda-toolkit@v0.2.30
+        if: matrix.install_cuda == true
+        with:
+          log-file-suffix: '${{matrix.os}}.txt'
+
+      # ── Linux: Vulkan SDK  ────────────────────────────
+      - name: Install Vulkan SDK (Linux)
+        if: matrix.install_vulkan == true
+        uses: humbletim/install-vulkan-sdk@v1.2
+        with:
+          version: 1.4.309.0
+          cache: true
+
+      # ── Configure & Build ─────────────────────────────────────────────
+      - name: Configure & Build (Linux / macOS)
+        if: runner.os != 'Windows'
+        run: |
+          # Makefile/Ninja generators ignore `--config`; set the build type here.
+          cmake -S . -B build -DCMAKE_BUILD_TYPE=Release ${{ matrix.cmake_flags || '' }}
+          CORES=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+          cmake --build build --config Release -j"$CORES"
+
+      - name: Configure & Build (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |
+          New-Item -ItemType Directory -Path build | Out-Null
+          Set-Location build
+          cmake .. ${{ matrix.cmake_flags || '' }}
+          cmake --build . --config Release -j $env:NUMBER_OF_PROCESSORS
+
+      # ── Smoke test: verify binaries run (no GPU / model required) ─────
+      - name: Smoke test
+        continue-on-error: true
+        shell: bash
+        run: |
+          BIN="build"
+          EXT=""
+          if [ "$RUNNER_OS" = "Windows" ]; then
+            BIN="build/Release"
+            EXT=".exe"
+          fi
+          "$BIN/ace-qwen3$EXT"    --help 2>&1 | head -5
+          "$BIN/dit-vae$EXT"      --help 2>&1 | head -5
+          "$BIN/ace-understand$EXT" --help 2>&1 | head -5
+          "$BIN/neural-codec$EXT" --help 2>&1 | head -5
+          # quantize and mp3-codec exit 1 after printing usage. `shell: bash` runs
+          # with -e -o pipefail, so the pipe does NOT hide that; `|| true` does.
+          "$BIN/quantize$EXT"     --help 2>&1 | head -3 || true
+          "$BIN/mp3-codec$EXT"    2>&1 | head -3 || true
+
+      # ── Determine which release tag to upload to ──────────────────────
+      - name: Resolve release tag
+        id: tag
+        shell: bash
+        env:
+          # Passed via env (not inlined into the script) so a tag containing
+          # quotes or $(...) cannot inject shell commands.
+          RELEASE_TAG: ${{ github.event.release.tag_name || inputs.release_tag }}
+        run: |
+          echo "value=$RELEASE_TAG" >> "$GITHUB_OUTPUT"
+
+      # ── Package binaries ──────────────────────────────────────────────
+      - name: Package binaries (Linux / macOS)
+        if: runner.os != 'Windows'
+        run: |
+          mkdir -p dist
+          cp build/ace-qwen3 build/dit-vae build/ace-understand \
+             build/quantize build/neural-codec build/mp3-codec dist/
+          tar -C dist -czf "acestep-${{ matrix.name }}.tar.gz" .
+
+      - name: Package binaries (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |
+          New-Item -ItemType Directory -Path dist | Out-Null
+          $bins = @('ace-qwen3','dit-vae','ace-understand','quantize','neural-codec','mp3-codec')
+          foreach ($b in $bins) {
+            Copy-Item "build\Release\$b.exe" dist\
+          }
+          Compress-Archive -Path dist\* -DestinationPath "acestep-${{ matrix.name }}.zip"
+
+      # ── Upload archive to the GitHub release ──────────────────────────
+      - name: Upload to release (Linux / macOS)
+        if: runner.os != 'Windows'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASE_TAG: ${{ steps.tag.outputs.value }}
+        run: |
+          gh release upload "$RELEASE_TAG" \
+            "acestep-${{ matrix.name }}.tar.gz" --clobber
+
+      - name: Upload to release (Windows)
+        if: runner.os == 'Windows'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASE_TAG: ${{ steps.tag.outputs.value }}
+        shell: pwsh
+        run: |
+          gh release upload "$env:RELEASE_TAG" `
+            "acestep-${{ matrix.name }}.zip" --clobber
diff --git a/src/audio-io.h b/src/audio-io.h
index b461b6e..4de62fc 100644
--- a/src/audio-io.h
+++ b/src/audio-io.h
@@ -258,9 +258,14 @@ static bool audio_write_wav(const char * path, const float * audio, int T_audio,
     fwrite("data", 1, 4, f);
     fwrite(&data_size, 4, 1, f);
 
-    // write interleaved PCM16 from planar float
+    // buffer all samples then write once (avoids millions of fwrite syscalls)
     const float * L = audio;
     const float * R = audio + T_audio;
+    short * pcm = (short *) malloc((size_t) T_audio * 2 * sizeof(short));
+    if (!pcm) {
+        fclose(f);
+        return false;
+    }
     for (int t = 0; t < T_audio; t++) {
         float lf = L[t];
         float rf = R[t];
@@ -276,11 +281,11 @@ static bool audio_write_wav(const char * path, const float * audio, int T_audio,
         if (rf < -1.0f) {
             rf = -1.0f;
         }
-        short ls = (short) (lf * 32767.0f);
-        short rs = (short) (rf * 32767.0f);
-        fwrite(&ls, 2, 1, f);
-        fwrite(&rs, 2, 1, f);
+        pcm[t * 2 + 0] = (short) (lf * 32767.0f);
+        pcm[t * 2 + 1] = (short) (rf * 32767.0f);
     }
+    fwrite(pcm, 2, (size_t) T_audio * 2, f);
+    free(pcm);
 
     fclose(f);
     fprintf(stderr, "[WAV] wrote %s: %d samples, %d Hz, stereo\n", path, T_audio, sr);
diff --git a/src/audio-resample.h b/src/audio-resample.h
index ffbb4f3..01e69fb 100644
--- a/src/audio-resample.h
+++ b/src/audio-resample.h
@@ -1,6 +1,6 @@
 #pragma once
-// audio-resample.h: sample rate conversion via windowed sinc interpolation.
-// Kaiser window, configurable quality. No external dependencies.
+// audio-resample.h: polyphase sample rate conversion via pre-computed
+// Kaiser-windowed sinc filter. No external dependencies.
 // Part of acestep.cpp. MIT license.
 
 #include <cmath>
@@ -11,8 +11,20 @@
 #    define M_PI 3.14159265358979323846
 #endif
 
-// Modified Bessel function I0 (first kind, zeroth order).
-// Used by the Kaiser window. Series expansion, converges fast.
+// Polyphase resampler with pre-computed Kaiser-windowed sinc filter.
+//
+// Instead of computing bessel_i0 + sqrt + sin per output sample (O(N_out * N_taps)),
+// we build a polyphase table once: table[phase][tap] = sinc(d) * kaiser(d).
+// The hot loop is then just a table lookup + dot product -- no transcendentals.
+//
+// Table size: 256 phases * 64 taps * 4 bytes = 64 KB (fits in L2; larger than many L1d caches).
+
+#define RESAMPLE_N_TAPS   64
+#define RESAMPLE_N_PHASES 256
+#define RESAMPLE_HALF_LEN (RESAMPLE_N_TAPS / 2)
+
+// Bessel I0 via Taylor series. Only used during table construction
+// (called RESAMPLE_N_PHASES * RESAMPLE_N_TAPS = 16384 times total, not millions).
 static double audio_resample_bessel_i0(double x) {
     double sum  = 1.0;
     double term = 1.0;
@@ -27,6 +39,47 @@ static double audio_resample_bessel_i0(double x) {
     return sum;
 }
 
+// Fill `table` with the polyphase filter bank: table[p][tap] = sinc(d) * kaiser(d).
+//
+// table: caller-allocated, RESAMPLE_N_PHASES rows of RESAMPLE_N_TAPS floats
+// fc:    sinc cutoff as passed by the caller (d == 0 yields 2 * fc)
+// beta:  Kaiser window shape parameter
+//
+// Row p holds the taps for fractional offset frac = p / N_PHASES. The caller
+// pairs tap k with src[floor(center) - HALF_LEN + 1 + k], so the distance from
+// the output position to that input sample is d = frac + HALF_LEN - 1 - tap.
+// d depends only on (phase, tap), which is what makes pre-computation possible.
+static void audio_resample_build_table(float table[][RESAMPLE_N_TAPS], double fc, double beta) {
+    double inv_i0b = 1.0 / audio_resample_bessel_i0(beta);
+
+    for (int p = 0; p < RESAMPLE_N_PHASES; p++) {
+        double frac = (double) p / (double) RESAMPLE_N_PHASES;
+
+        for (int tap = 0; tap < RESAMPLE_N_TAPS; tap++) {
+            double d = frac + (double) (RESAMPLE_HALF_LEN - 1 - tap);
+
+            // low-pass sinc; the d -> 0 limit is 2 * fc (avoids 0/0)
+            double sinc_val;
+            if (fabs(d) < 1e-9) {
+                sinc_val = 2.0 * fc;
+            } else {
+                sinc_val = sin(2.0 * M_PI * fc * d) / (M_PI * d);
+            }
+
+            // Kaiser window over |d| <= HALF_LEN, scaled so it equals 1 at d == 0
+            double t = d / (double) RESAMPLE_HALF_LEN;
+            double win;
+            if (t < -1.0 || t > 1.0) {
+                win = 0.0;
+            } else {
+                win = audio_resample_bessel_i0(beta * sqrt(1.0 - t * t)) * inv_i0b;
+            }
+
+            table[p][tap] = (float) (sinc_val * win);
+        }
+    }
+}
+
 // Resample a planar float audio buffer from sr_in to sr_out.
 //
 // in:     planar float [ch0: n_in samples][ch1: n_in samples][...]
@@ -69,69 +122,71 @@ static float * audio_resample(const float * in, int n_in, int sr_in, int sr_out,
         return NULL;
     }
 
-    // filter half length in input samples.
-    // 32 taps (64 total) for high quality music resampling.
-    int half_len = 32;
-
     // Kaiser window parameter (beta=9.0 gives ~80 dB stopband)
-    double beta    = 9.0;
-    double inv_i0b = 1.0 / audio_resample_bessel_i0(beta);
+    double beta = 9.0;
 
     // cutoff: lowpass at the lower of the two rates to prevent aliasing
     double fc = 0.5 * ((ratio < 1.0) ? ratio : 1.0);
 
+    // build polyphase filter table (rebuilt on each call since fc depends on ratio: ~16K coefficients)
+    float (*table)[RESAMPLE_N_TAPS] = (float (*)[RESAMPLE_N_TAPS]) malloc(
+        RESAMPLE_N_PHASES * RESAMPLE_N_TAPS * sizeof(float));
+    if (!table) {
+        free(out);
+        *n_out = 0;
+        return NULL;
+    }
+    audio_resample_build_table(table, fc, beta);
+
+    // Output positions are computed in double below: a float has only 24
+    // mantissa bits, so the fractional phase would get coarser as i grows.
+
     for (int ch = 0; ch < nch; ch++) {
-        const float * src = in + ch * n_in;
+        const float * src = in  + ch * n_in;
         float *       dst = out + ch * (*n_out);
 
         for (int i = 0; i < *n_out; i++) {
             // position in input sample space
-            double center = (double) i / ratio;
-            int    start  = (int) floor(center) - half_len + 1;
-            int    end    = (int) floor(center) + half_len;
-
-            double sum = 0.0;
-            double wgt = 0.0;
-
-            for (int j = start; j <= end; j++) {
-                double d = center - (double) j;
-
-                // windowed sinc
-                double sinc_val;
-                if (fabs(d) < 1e-9) {
-                    sinc_val = 2.0 * fc;
-                } else {
-                    sinc_val = sin(2.0 * M_PI * fc * d) / (M_PI * d);
-                }
-
-                // Kaiser window
-                double t = d / (double) half_len;
-                double win;
-                if (t < -1.0 || t > 1.0) {
-                    win = 0.0;
-                } else {
-                    win = audio_resample_bessel_i0(beta * sqrt(1.0 - t * t)) * inv_i0b;
-                }
-
-                double h = sinc_val * win;
+            double center   = (double) i / ratio;
+            int    center_i = (int) floor(center);
+            float  frac     = (float) (center - (double) center_i);
+
+            // phase index + interpolation fraction between adjacent phases
+            float phase_f   = frac * (float) RESAMPLE_N_PHASES;
+            int   phase     = (int) phase_f;
+            float phase_mix = phase_f - (float) phase;
+            if (phase >= RESAMPLE_N_PHASES - 1) {
+                phase     = RESAMPLE_N_PHASES - 2;
+                phase_mix = 1.0f;
+            }
+
+            int base = center_i - RESAMPLE_HALF_LEN + 1;
+
+            // dot product with linear interpolation between adjacent phases
+            // (avoids quantization artifacts from 256 discrete phases)
+            const float * h0 = table[phase];
+            const float * h1 = table[phase + 1];
+
+            float sum = 0.0f;
+            float wgt = 0.0f;
+
+            for (int tap = 0; tap < RESAMPLE_N_TAPS; tap++) {
+                float h = h0[tap] + phase_mix * (h1[tap] - h0[tap]);
 
                 // clamp to input bounds (repeat edge samples)
-                int idx = j;
-                if (idx < 0) {
-                    idx = 0;
-                }
-                if (idx >= n_in) {
-                    idx = n_in - 1;
-                }
-
-                sum += (double) src[idx] * h;
+                int idx = base + tap;
+                if (idx < 0)     { idx = 0; }
+                if (idx >= n_in) { idx = n_in - 1; }
+
+                sum += src[idx] * h;
                 wgt += h;
             }
 
             // normalize to compensate for edge effects
-            dst[i] = (wgt > 1e-12) ? (float) (sum / wgt) : 0.0f;
+            dst[i] = (wgt > 1e-12f) ? sum / wgt : 0.0f;
         }
     }
 
+    free(table);
     return out;
 }
diff --git a/src/wav.h b/src/wav.h
index 7bd23e6..8c5e96f 100644
--- a/src/wav.h
+++ b/src/wav.h
@@ -159,12 +159,18 @@ static bool write_wav(const char * path, const float * audio, int T_audio, int s
     }
     float scale = peak > 0.0f ? 32767.0f / peak : 0.0f;
 
+    // buffer all samples then write once (avoids millions of fwrite syscalls)
+    short * pcm = (short *) malloc((size_t) T_audio * 2 * sizeof(short));
+    if (!pcm) {
+        fclose(f);
+        return false;
+    }
     for (int t = 0; t < T_audio; t++) {
-        for (int c = 0; c < 2; c++) {
-            short v = (short) (audio[c * T_audio + t] * scale);
-            fwrite(&v, 2, 1, f);
-        }
+        pcm[t * 2 + 0] = (short) (audio[0 * T_audio + t] * scale);
+        pcm[t * 2 + 1] = (short) (audio[1 * T_audio + t] * scale);
     }
+    fwrite(pcm, 2, (size_t) T_audio * 2, f);
+    free(pcm);
     fclose(f);
     return true;
 }