From 98cccf37f73f74f2dab0d79266194e77861bc645 Mon Sep 17 00:00:00 2001 From: Phil Tomson Date: Thu, 2 Apr 2026 08:08:51 -0700 Subject: [PATCH 1/2] ggml: fix q1_0_g128 CPU dot product and add x86 SIMD kernels Fix float-truncation bug in ggml_vec_dot_q1_0_g128_q8_0: the Q8_0 scale factor d1 was accumulated into an int, silently truncating it to zero for most blocks and producing garbage output on CPU. Add AVX2 kernel using the same shuffle/bit-mask expansion as q1_0, processing one 32-element Q8_0 sub-block per iteration (~4x speedup). Add AVX-512 hybrid kernel using _mm256_mask_blend_epi8 (AVX-512BW+VL) to expand weight bits in a single instruction, with _mm256_dpbusd_epi32 (AVX-512VNNI) for the dot product (~5.7x speedup on 1.7B model). Co-Authored-By: Claude Sonnet 4.6 --- ggml/src/ggml-cpu/arch/x86/quants.c | 118 ++++++++++++++++++++++++---- ggml/src/ggml-cpu/quants.c | 20 ++--- 2 files changed, 111 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 45129f08a16..3e71d656fa9 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -662,37 +662,121 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons const block_q1_0_g128 * GGML_RESTRICT x = vx; const block_q8_0 * GGML_RESTRICT y = vy; + int ib = 0; float sumf = 0; - // Each Q1_0_g128 block has 128 elements - // Each Q8_0 block has 32 elements - // So we need 4 Q8_0 blocks per Q1_0_g128 block - for (int ib = 0; ib < nb; ++ib) { +#if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) + // Hybrid AVX-512 path: stay in 256-bit to avoid lane-crossing overhead, but use + // AVX-512BW+VL and VNNI to reduce the inner loop to ~3 instructions per sub-block. + // + // Bit expansion: _mm256_mask_blend_epi8 uses the 32-bit integer directly as a + // __mmask32, selecting +qy where the weight bit is 1 and -qy where it is 0. 
+ // That replaces the 7-instruction shuffle/mask/cmp/convert sequence with 2 + // instructions (sub + mask_blend). + // + // Dot product: _mm256_dpbusd_epi32(zero, all_ones, signed_qy) sums signed_qy in + // groups of 4, giving 8 int32 partial sums that represent the full dot product. + + __m256 acc = _mm256_setzero_ps(); + const __m256i zero_256 = _mm256_setzero_si256(); + const __m256i all_ones_256 = _mm256_set1_epi8(1); + + for (; ib < nb; ++ib) { const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); - - int sumi = 0; - - // Process 4 Q8_0 blocks (4 * 32 = 128 elements) - for (int k = 0; k < 4; k++) { + + for (int k = 0; k < 4; ++k) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + const uint32_t qbits32 = *(const uint32_t *)(x[ib].qs + k * 4); + const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + + // Negate qy where bit=0 (weight=-1), keep qy where bit=1 (weight=+1) + const __m256i neg_qy = _mm256_sub_epi8(zero_256, qy); + const __m256i signed_qy = _mm256_mask_blend_epi8((__mmask32)qbits32, neg_qy, qy); + + // dpbusd(zero, all_ones, signed_qy) = sum(signed_qy) = dot product + const __m256i int_acc = _mm256_dpbusd_epi32(zero_256, all_ones_256, signed_qy); + + acc = _mm256_fmadd_ps(_mm256_set1_ps(d0 * d1), _mm256_cvtepi32_ps(int_acc), acc); + } + } + + sumf = hsum_float_8(acc); + +#elif defined(__AVX2__) + __m256 acc = _mm256_setzero_ps(); + + // Hoist constants out of the loop + const __m256i shuffle_mask = _mm256_set_epi8( + 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0 + ); + const __m256i bit_mask = _mm256_set_epi8( + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 + ); + const __m256i ones = _mm256_set1_epi8(1); + + for (; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + + // Each 
q1_0_g128 block maps to 4 consecutive q8_0 blocks (4 * 32 = 128 elements). + // Process one q8_0 sub-block (32 elements) per iteration. + for (int k = 0; k < 4; ++k) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + const __m256 d = _mm256_set1_ps(d0 * d1); + + // Load the 4 bytes (32 bits) covering elements [k*32 .. k*32+31] + const uint32_t qbits32 = *(const uint32_t *)(x[ib].qs + k * 4); + + // Load 32 q8_0 int8 activations + const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + + // Expand 32 bits → 32 bytes of ±1 using the same shuffle/mask trick as q1_0 + const __m128i qbits_128 = _mm_set1_epi32(qbits32); + const __m256i qbits_256 = _mm256_broadcastsi128_si256(qbits_128); + const __m256i qbits_shuf = _mm256_shuffle_epi8(qbits_256, shuffle_mask); + const __m256i bit_test = _mm256_and_si256(qbits_shuf, bit_mask); + const __m256i is_set = _mm256_cmpeq_epi8(bit_test, bit_mask); + const __m256i bit_value = _mm256_and_si256(is_set, ones); + const __m256i bit_doubled = _mm256_add_epi8(bit_value, bit_value); + const __m256i qx = _mm256_sub_epi8(bit_doubled, ones); + + acc = _mm256_fmadd_ps(d, mul_sum_i8_pairs_float(qx, qy), acc); + } + } + + sumf = hsum_float_8(acc); +#endif + + // Scalar fallback for any remaining blocks + for (; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + + float sumi = 0.0f; + + for (int k = 0; k < 4; ++k) { const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); - + int sumi_block = 0; - + for (int j = 0; j < QK8_0; j++) { - const int bit_index = k * QK8_0 + j; + const int bit_index = k * QK8_0 + j; const int byte_index = bit_index / 8; const int bit_offset = bit_index % 8; - - // Extract bit: 1 = +1, 0 = -1 + const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 
1 : -1; const int yi = y[ib*4 + k].qs[j]; - + sumi_block += xi * yi; } - + sumi += d1 * sumi_block; } - + sumf += d0 * sumi; } diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 7f8456a5db8..7d25c9dc392 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -177,35 +177,35 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t float sumf = 0.0; - + // Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements // So we need 4 Q8_0 blocks per Q1_0_g128 block for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); - - int sumi = 0; - + + float sumi = 0.0f; + // Process 4 Q8_0 blocks (4 * 32 = 128 elements) for (int k = 0; k < 4; k++) { const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d); - + int sumi_block = 0; - + for (int j = 0; j < QK8_0; j++) { const int bit_index = k * QK8_0 + j; const int byte_index = bit_index / 8; const int bit_offset = bit_index % 8; - + // Extract bit: 1 = +1, 0 = -1 const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1; const int yi = y[i*4 + k].qs[j]; - + sumi_block += xi * yi; } - + sumi += d1 * sumi_block; } - + sumf += d0 * sumi; } From 4eaccd98e9bb67c69c9fbe8a305e3a168a2f4117 Mon Sep 17 00:00:00 2001 From: Phil Tomson Date: Thu, 2 Apr 2026 08:09:08 -0700 Subject: [PATCH 2/2] docs: add AMD ROCm/HIP build instructions to README Document how to build with GGML_HIP=ON targeting gfx1151 (Radeon 8060S / Ryzen AI MAX+), including the Docker-based workflow for systems where the system ROCm installation is partial. 
Co-Authored-By: Claude Sonnet 4.6 --- README.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/README.md b/README.md index 5c11f38048a..808e30ad5f3 100644 --- a/README.md +++ b/README.md @@ -290,6 +290,61 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon | | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR | +## AMD GPU support (ROCm/HIP) + +This branch supports running Bonsai Q1_0_g128 models on AMD GPUs via ROCm/HIP. The Q1_0_g128 CUDA kernels are compiled transparently by the HIP toolchain — no separate HIP-specific kernel code is needed. + +### Requirements + +- ROCm 7.x with hipBLAS and rocBLAS (including device libraries for your GPU target) +- The easiest way to get a fully configured environment is to use the `rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.10.0` Docker image, which includes gfx1151 (Radeon 8060S / Ryzen AI MAX+) support + +### Build + +```bash +HIPCXX=$(hipconfig -l)/clang HIP_PATH=$(hipconfig -R) \ +cmake -B build-hip \ + -DGGML_HIP=ON \ + -DGPU_TARGETS=gfx1151 \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_CUDA_FA=OFF +cmake --build build-hip --config Release -j$(nproc) +``` + +Replace `gfx1151` with your GPU's architecture. Find yours with: +```bash +rocminfo | grep gfx | head -1 | awk '{print $2}' +``` + +### Run + +```bash +build-hip/bin/llama-cli -m /path/to/Bonsai-8B.gguf -ngl 99 -p "your prompt" +``` + +`-ngl 99` offloads all layers to the GPU. On an integrated GPU (APU) that shares system memory, the full model fits in the GPU address space. 
+
+### Docker
+
+If your system ROCm installation is partial, build and run inside the Docker image (replace `/dev/dri/card1` and `/dev/dri/renderD128` with the device nodes for your GPU — check `ls /dev/dri`):
+
+```bash
+docker run --rm \
+  --device /dev/kfd --device /dev/dri/card1 --device /dev/dri/renderD128 \
+  --group-add video --group-add render \
+  -v /path/to/llama.cpp:/llama.cpp \
+  -v /path/to/models:/models \
+  rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.10.0 \
+  bash -c "
+    pip install cmake -q && cd /llama.cpp && \
+    HIPCXX=\$(hipconfig -l)/clang HIP_PATH=\$(hipconfig -R) \
+    cmake -B build-hip -DGGML_HIP=ON -DGPU_TARGETS=gfx1151 \
+      -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA_FA=OFF && \
+    cmake --build build-hip -j\$(nproc) && \
+    build-hip/bin/llama-cli -m /models/Bonsai-8B.gguf -ngl 99 -p 'Hello'
+  "
+
 ## Obtaining and quantizing models
 
 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`: