diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
index 5eb1757..013cf5b 100644
--- a/.github/workflows/ci-build.yml
+++ b/.github/workflows/ci-build.yml
@@ -40,7 +40,6 @@ jobs:
           ./build/ace-qwen3 --help 2>&1 | head -5
           ./build/dit-vae --help 2>&1 | head -5
           ./build/quantize --help 2>&1 | head -3
-
   lint:
     name: Lint & Static Analysis
     runs-on: ubuntu-latest
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..a31e093
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,164 @@
+# Build and publish portable release binaries for every supported platform.
+# Runs automatically when a GitHub Release is published, and can also be
+# triggered manually via workflow_dispatch (provide the existing release tag).
+name: Release Binaries
+
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      release_tag:
+        description: 'Release tag to attach binaries to (e.g. v0.1.0)'
+        required: true
+        type: string
+
+# Allow uploading assets to releases
+permissions:
+  contents: write
+
+jobs:
+  build:
+    name: Build · ${{ matrix.name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          # ──────────── Linux ────────────────────────────────────────────
+          - name: linux-x64-cpu-blas
+            os: ubuntu-22.04
+            cmake_flags: -DGGML_BLAS=ON
+            apt_extra: pkg-config libopenblas-dev
+
+          - name: linux-x64-cuda
+            os: ubuntu-22.04
+            cmake_flags: -DGGML_CUDA=ON
+            install_cuda: true
+
+          - name: linux-x64-vulkan
+            os: ubuntu-22.04
+            cmake_flags: -DGGML_VULKAN=ON
+            install_vulkan: true
+
+          # ──────────── macOS ────────────────────────────────────────────
+          # macos-latest = arm64 (M-series); Metal + Accelerate auto-enabled
+          - name: macos-arm64-metal
+            os: macos-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # ── Linux: base build tools & optional apt packages ───────────────
+      - name: Install build tools (Linux)
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y -qq cmake build-essential ${{ matrix.apt_extra || '' }}
+
+      # ── Linux: CUDA toolkit ────────────────
+      - name: Install CUDA toolkit (Linux)
+        uses: Jimver/cuda-toolkit@v0.2.30
+        if: matrix.install_cuda == true
+        with:
+          log-file-suffix: '${{matrix.os}}.txt'
+
+      # ── Linux: Vulkan SDK ────────────────────────────
+      - name: Install Vulkan SDK (Linux)
+        if: matrix.install_vulkan == true
+        uses: humbletim/install-vulkan-sdk@v1.2
+        with:
+          version: 1.4.309.0
+          cache: true
+
+      # ── Configure & Build ─────────────────────────────────────────────
+      - name: Configure & Build (Linux / macOS)
+        if: runner.os != 'Windows'
+        run: |
+          mkdir build && cd build
+          cmake .. ${{ matrix.cmake_flags || '' }}
+          CORES=$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+          cmake --build . --config Release -j"$CORES"
+
+      - name: Configure & Build (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |
+          New-Item -ItemType Directory -Path build | Out-Null
+          Set-Location build
+          cmake .. ${{ matrix.cmake_flags || '' }}
+          cmake --build . --config Release -j $env:NUMBER_OF_PROCESSORS
+
+      # ── Smoke test: verify binaries run (no GPU / model required) ─────
+      - name: Smoke test
+        continue-on-error: true
+        shell: bash
+        run: |
+          if [ "$RUNNER_OS" = "Windows" ]; then
+            BIN="build/Release"
+            EXT=".exe"
+          else
+            BIN="build"
+            EXT=""
+          fi
+          "$BIN/ace-qwen3$EXT" --help 2>&1 | head -5
+          "$BIN/dit-vae$EXT" --help 2>&1 | head -5
+          "$BIN/ace-understand$EXT" --help 2>&1 | head -5
+          "$BIN/neural-codec$EXT" --help 2>&1 | head -5
+          # quantize / mp3-codec print usage and exit 1; `|| true` keeps pipefail from aborting here
+          "$BIN/quantize$EXT" --help 2>&1 | head -3 || true
+          "$BIN/mp3-codec$EXT" 2>&1 | head -3 || true
+
+      # ── Determine which release tag to upload to ──────────────────────
+      - name: Resolve release tag
+        id: tag
+        shell: bash
+        run: |
+          if [ "${{ github.event_name }}" = "release" ]; then
+            echo "value=${{ github.event.release.tag_name }}" >> $GITHUB_OUTPUT
+          else
+            echo "value=${{ inputs.release_tag }}" >> $GITHUB_OUTPUT
+          fi
+
+      # ── Package binaries ──────────────────────────────────────────────
+      - name: Package binaries (Linux / macOS)
+        if: runner.os != 'Windows'
+        run: |
+          mkdir -p dist
+          cp build/ace-qwen3 build/dit-vae build/ace-understand \
+             build/quantize build/neural-codec build/mp3-codec dist/
+          tar -C dist -czf "acestep-${{ matrix.name }}.tar.gz" .
+
+      - name: Package binaries (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |
+          New-Item -ItemType Directory -Path dist | Out-Null
+          $bins = @('ace-qwen3','dit-vae','ace-understand','quantize','neural-codec','mp3-codec')
+          foreach ($b in $bins) {
+            Copy-Item "build\Release\$b.exe" dist\
+          }
+          Compress-Archive -Path dist\* -DestinationPath "acestep-${{ matrix.name }}.zip"
+
+      # ── Upload archive to the GitHub release ──────────────────────────
+      - name: Upload to release (Linux / macOS)
+        if: runner.os != 'Windows'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh release upload "${{ steps.tag.outputs.value }}" \
+            "acestep-${{ matrix.name }}.tar.gz" \
+            --clobber
+
+      - name: Upload to release (Windows)
+        if: runner.os == 'Windows'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        shell: pwsh
+        run: |
+          gh release upload "${{ steps.tag.outputs.value }}" `
+            "acestep-${{ matrix.name }}.zip" `
+            --clobber
diff --git a/src/audio-io.h b/src/audio-io.h
index b461b6e..4de62fc 100644
--- a/src/audio-io.h
+++ b/src/audio-io.h
@@ -258,9 +258,14 @@ static bool audio_write_wav(const char * path, const float * audio, int T_audio,
     fwrite("data", 1, 4, f);
     fwrite(&data_size, 4, 1, f);
 
-    // write interleaved PCM16 from planar float
+    // interleave planar float into one PCM16 buffer, then write once (avoids millions of tiny fwrite calls)
     const float * L = audio;
     const float * R = audio + T_audio;
+    short * pcm = (short *) malloc((size_t) T_audio * 2 * sizeof(short));
+    if (!pcm) {
+        fclose(f);
+        return false;
+    }
     for (int t = 0; t < T_audio; t++) {
         float lf = L[t];
         float rf = R[t];
@@ -276,11 +281,11 @@ static bool audio_write_wav(const char * path, const float * audio, int T_audio,
         if (rf < -1.0f) {
             rf = -1.0f;
         }
-        short ls = (short) (lf * 32767.0f);
-        short rs = (short) (rf * 32767.0f);
-        fwrite(&ls, 2, 1, f);
-        fwrite(&rs, 2, 1, f);
+        pcm[t * 2 + 0] = (short) (lf * 32767.0f);
+        pcm[t * 2 + 1] = (short) (rf * 32767.0f);
     }
+    fwrite(pcm, 2, (size_t) T_audio * 2, f);
+    free(pcm);
     fclose(f);
     fprintf(stderr, "[WAV] wrote %s: %d samples, %d Hz, stereo\n", path, T_audio, sr);
 
diff --git a/src/audio-resample.h b/src/audio-resample.h
index ffbb4f3..01e69fb 100644
--- a/src/audio-resample.h
+++ b/src/audio-resample.h
@@ -1,6 +1,6 @@
 #pragma once
-// audio-resample.h: sample rate conversion via windowed sinc interpolation.
-// Kaiser window, configurable quality. No external dependencies.
+// audio-resample.h: polyphase sample rate conversion via pre-computed
+// Kaiser-windowed sinc filter. No external dependencies.
 // Part of acestep.cpp. MIT license.
 
 #include <cmath>
@@ -11,8 +11,20 @@
 #    define M_PI 3.14159265358979323846
 #endif
 
-// Modified Bessel function I0 (first kind, zeroth order).
-// Used by the Kaiser window. Series expansion, converges fast.
+// Polyphase resampler with pre-computed Kaiser-windowed sinc filter.
+//
+// Instead of computing bessel_i0 + sqrt + sin per output sample (O(N_out * N_taps)),
+// we build a polyphase table once: table[phase][tap] = sinc(d) * kaiser(d).
+// The hot loop is then just a table lookup + dot product -- no transcendentals.
+//
+// Table size: (256 + 1) phases * 64 taps * 4 bytes ~= 64 KB (small enough to stay cache-resident).
+
+#define RESAMPLE_N_TAPS   64
+#define RESAMPLE_N_PHASES 256
+#define RESAMPLE_HALF_LEN (RESAMPLE_N_TAPS / 2)
+
+// Bessel I0 via Taylor series. Only used during table construction
+// (called roughly RESAMPLE_N_PHASES * RESAMPLE_N_TAPS ~= 16K times total, not millions).
 static double audio_resample_bessel_i0(double x) {
     double sum  = 1.0;
     double term = 1.0;
@@ -27,6 +39,47 @@ static double audio_resample_bessel_i0(double x) {
     return sum;
 }
 
+// Build polyphase filter bank: RESAMPLE_N_PHASES + 1 rows (the last row is frac = 1.0).
+//
+// For output sample i at position center = i / ratio in input space:
+//   center_int = floor(center), frac = center - center_int
+//   phase = frac * N_PHASES
+//   base = center_int - HALF_LEN + 1
+//   for tap 0..N_TAPS-1: h = table[phase][tap], input = src[base + tap]
+//
+// d (distance from center to tap) = frac + HALF_LEN - 1 - tap
+// This depends only on phase and tap, so we can pre-compute everything.
+static void audio_resample_build_table(float table[][RESAMPLE_N_TAPS], double fc, double beta) {
+    double inv_i0b = 1.0 / audio_resample_bessel_i0(beta);
+
+    for (int p = 0; p <= RESAMPLE_N_PHASES; p++) {  // inclusive: extra row lets phase 255 interpolate toward 1.0
+        double frac = (double) p / (double) RESAMPLE_N_PHASES;
+
+        for (int tap = 0; tap < RESAMPLE_N_TAPS; tap++) {
+            double d = frac + (double) (RESAMPLE_HALF_LEN - 1 - tap);
+
+            // windowed sinc
+            double sinc_val;
+            if (fabs(d) < 1e-9) {
+                sinc_val = 2.0 * fc;
+            } else {
+                sinc_val = sin(2.0 * M_PI * fc * d) / (M_PI * d);
+            }
+
+            // Kaiser window
+            double t = d / (double) RESAMPLE_HALF_LEN;
+            double win;
+            if (t < -1.0 || t > 1.0) {
+                win = 0.0;
+            } else {
+                win = audio_resample_bessel_i0(beta * sqrt(1.0 - t * t)) * inv_i0b;
+            }
+
+            table[p][tap] = (float) (sinc_val * win);
+        }
+    }
+}
+
 // Resample a planar float audio buffer from sr_in to sr_out.
 //
 // in: planar float [ch0: n_in samples][ch1: n_in samples][...]
@@ -69,69 +122,71 @@ static float * audio_resample(const float * in, int n_in, int sr_in, int sr_out,
         return NULL;
     }
 
-    // filter half length in input samples.
-    // 32 taps (64 total) for high quality music resampling.
-    int    half_len = 32;
-
     // Kaiser window parameter (beta=9.0 gives ~80 dB stopband)
-    double beta    = 9.0;
-    double inv_i0b = 1.0 / audio_resample_bessel_i0(beta);
+    double beta = 9.0;
 
     // cutoff: lowpass at the lower of the two rates to prevent aliasing
     double fc = 0.5 * ((ratio < 1.0) ? ratio : 1.0);
 
+    // build polyphase filter table (one-time cost: ~16K coeff, microseconds)
+    // one extra row (frac = 1.0) so the last phase interval is interpolated like the others
+    float (*table)[RESAMPLE_N_TAPS] = (float (*)[RESAMPLE_N_TAPS]) malloc(
+        (RESAMPLE_N_PHASES + 1) * RESAMPLE_N_TAPS * sizeof(float));
+    if (!table) {
+        free(out);
+        *n_out = 0;
+        return NULL;
+    }
+    audio_resample_build_table(table, fc, beta);
+
     for (int ch = 0; ch < nch; ch++) {
         const float * src = in + ch * n_in;
         float *       dst = out + ch * (*n_out);
 
         for (int i = 0; i < *n_out; i++) {
             // position in input sample space
-            double center = (double) i / ratio;
-            int    start  = (int) floor(center) - half_len + 1;
-            int    end    = (int) floor(center) + half_len;
-
-            double sum = 0.0;
-            double wgt = 0.0;
-
-            for (int j = start; j <= end; j++) {
-                double d = center - (double) j;
-
-                // windowed sinc
-                double sinc_val;
-                if (fabs(d) < 1e-9) {
-                    sinc_val = 2.0 * fc;
-                } else {
-                    sinc_val = sin(2.0 * M_PI * fc * d) / (M_PI * d);
-                }
-
-                // Kaiser window
-                double t = d / (double) half_len;
-                double win;
-                if (t < -1.0 || t > 1.0) {
-                    win = 0.0;
-                } else {
-                    win = audio_resample_bessel_i0(beta * sqrt(1.0 - t * t)) * inv_i0b;
-                }
-
-                double h = sinc_val * win;
+            // (kept in double: a 24-bit float mantissa loses the fractional phase once i is large)
+            double center   = (double) i / ratio;
+            int    center_i = (int) floor(center);
+            float  frac     = (float) (center - (double) center_i);
+
+            // phase index + interpolation fraction between adjacent phases
+            float phase_f   = frac * (float) RESAMPLE_N_PHASES;
+            int   phase     = (int) phase_f;
+            float phase_mix = phase_f - (float) phase;
+            if (phase >= RESAMPLE_N_PHASES) {
+                // frac rounded up to 1.0f in the float cast: stay inside the table
+                phase     = RESAMPLE_N_PHASES - 1;
+                phase_mix = 1.0f;
+            }
+
+            int base = center_i - RESAMPLE_HALF_LEN + 1;
+
+            // dot product with linear interpolation between adjacent phases
+            // (avoids quantization artifacts from 256 discrete phases)
+            const float * h0 = table[phase];
+            const float * h1 = table[phase + 1];
+
+            float sum = 0.0f;
+            float wgt = 0.0f;
+
+            for (int tap = 0; tap < RESAMPLE_N_TAPS; tap++) {
+                float h = h0[tap] + phase_mix * (h1[tap] - h0[tap]);
 
                 // clamp to input bounds (repeat edge samples)
-                int idx = j;
-                if (idx < 0) {
-                    idx = 0;
-                }
-                if (idx >= n_in) {
-                    idx = n_in - 1;
-                }
-
-                sum += (double) src[idx] * h;
+                int idx = base + tap;
+                if (idx < 0) { idx = 0; }
+                if (idx >= n_in) { idx = n_in - 1; }
+
+                sum += src[idx] * h;
                 wgt += h;
             }
 
             // normalize to compensate for edge effects
-            dst[i] = (wgt > 1e-12) ? (float) (sum / wgt) : 0.0f;
+            dst[i] = (wgt > 1e-12f) ? sum / wgt : 0.0f;
         }
     }
 
+    free(table);
     return out;
 }
diff --git a/src/wav.h b/src/wav.h
index 7bd23e6..8c5e96f 100644
--- a/src/wav.h
+++ b/src/wav.h
@@ -159,12 +159,18 @@ static bool write_wav(const char * path, const float * audio, int T_audio, int s
     }
     float scale = peak > 0.0f ? 32767.0f / peak : 0.0f;
 
+    // interleave into one PCM16 buffer, then write once (avoids millions of tiny fwrite calls)
+    short * pcm = (short *) malloc((size_t) T_audio * 2 * sizeof(short));
+    if (!pcm) {
+        fclose(f);
+        return false;
+    }
     for (int t = 0; t < T_audio; t++) {
-        for (int c = 0; c < 2; c++) {
-            short v = (short) (audio[c * T_audio + t] * scale);
-            fwrite(&v, 2, 1, f);
-        }
+        pcm[t * 2 + 0] = (short) (audio[0 * T_audio + t] * scale);
+        pcm[t * 2 + 1] = (short) (audio[1 * T_audio + t] * scale);
     }
+    fwrite(pcm, 2, (size_t) T_audio * 2, f);
+    free(pcm);
     fclose(f);
     return true;
 }