From 98cccf37f73f74f2dab0d79266194e77861bc645 Mon Sep 17 00:00:00 2001 From: Phil Tomson Date: Thu, 2 Apr 2026 08:08:51 -0700 Subject: [PATCH 1/2] ggml: fix q1_0_g128 CPU dot product and add x86 SIMD kernels Fix float-truncation bug in ggml_vec_dot_q1_0_g128_q8_0: the Q8_0 scale factor d1 was accumulated into an int, silently truncating it to zero for most blocks and producing garbage output on CPU. Add AVX2 kernel using the same shuffle/bit-mask expansion as q1_0, processing one 32-element Q8_0 sub-block per iteration (~4x speedup). Add AVX-512 hybrid kernel using _mm256_mask_blend_epi8 (AVX-512BW+VL) to expand weight bits in a single instruction, with _mm256_dpbusd_epi32 (AVX-512VNNI) for the dot product (~5.7x speedup on 1.7B model). Co-Authored-By: Claude Sonnet 4.6 --- ggml/src/ggml-cpu/arch/x86/quants.c | 118 ++++++++++++++++++++++++---- ggml/src/ggml-cpu/quants.c | 20 ++--- 2 files changed, 111 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 45129f08a16..3e71d656fa9 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -662,37 +662,121 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons const block_q1_0_g128 * GGML_RESTRICT x = vx; const block_q8_0 * GGML_RESTRICT y = vy; + int ib = 0; float sumf = 0; - // Each Q1_0_g128 block has 128 elements - // Each Q8_0 block has 32 elements - // So we need 4 Q8_0 blocks per Q1_0_g128 block - for (int ib = 0; ib < nb; ++ib) { +#if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__) + // Hybrid AVX-512 path: stay in 256-bit to avoid lane-crossing overhead, but use + // AVX-512BW+VL and VNNI to reduce the inner loop to ~3 instructions per sub-block. + // + // Bit expansion: _mm256_mask_blend_epi8 uses the 32-bit integer directly as a + // __mmask32, selecting +qy where the weight bit is 1 and -qy where it is 0. 
+ // That replaces the 7-instruction shuffle/mask/cmp/convert sequence with 2 + // instructions (sub + mask_blend). + // + // Dot product: _mm256_dpbusd_epi32(zero, all_ones, signed_qy) sums signed_qy in + // groups of 4, giving 8 int32 partial sums that represent the full dot product. + + __m256 acc = _mm256_setzero_ps(); + const __m256i zero_256 = _mm256_setzero_si256(); + const __m256i all_ones_256 = _mm256_set1_epi8(1); + + for (; ib < nb; ++ib) { const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); - - int sumi = 0; - - // Process 4 Q8_0 blocks (4 * 32 = 128 elements) - for (int k = 0; k < 4; k++) { + + for (int k = 0; k < 4; ++k) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + const uint32_t qbits32 = *(const uint32_t *)(x[ib].qs + k * 4); + const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + + // Negate qy where bit=0 (weight=-1), keep qy where bit=1 (weight=+1) + const __m256i neg_qy = _mm256_sub_epi8(zero_256, qy); + const __m256i signed_qy = _mm256_mask_blend_epi8((__mmask32)qbits32, neg_qy, qy); + + // dpbusd(zero, all_ones, signed_qy) = sum(signed_qy) = dot product + const __m256i int_acc = _mm256_dpbusd_epi32(zero_256, all_ones_256, signed_qy); + + acc = _mm256_fmadd_ps(_mm256_set1_ps(d0 * d1), _mm256_cvtepi32_ps(int_acc), acc); + } + } + + sumf = hsum_float_8(acc); + +#elif defined(__AVX2__) + __m256 acc = _mm256_setzero_ps(); + + // Hoist constants out of the loop + const __m256i shuffle_mask = _mm256_set_epi8( + 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0 + ); + const __m256i bit_mask = _mm256_set_epi8( + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, + (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 + ); + const __m256i ones = _mm256_set1_epi8(1); + + for (; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + + // Each 
q1_0_g128 block maps to 4 consecutive q8_0 blocks (4 * 32 = 128 elements). + // Process one q8_0 sub-block (32 elements) per iteration. + for (int k = 0; k < 4; ++k) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + const __m256 d = _mm256_set1_ps(d0 * d1); + + // Load the 4 bytes (32 bits) covering elements [k*32 .. k*32+31] + const uint32_t qbits32 = *(const uint32_t *)(x[ib].qs + k * 4); + + // Load 32 q8_0 int8 activations + const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + + // Expand 32 bits → 32 bytes of ±1 using the same shuffle/mask trick as q1_0 + const __m128i qbits_128 = _mm_set1_epi32(qbits32); + const __m256i qbits_256 = _mm256_broadcastsi128_si256(qbits_128); + const __m256i qbits_shuf = _mm256_shuffle_epi8(qbits_256, shuffle_mask); + const __m256i bit_test = _mm256_and_si256(qbits_shuf, bit_mask); + const __m256i is_set = _mm256_cmpeq_epi8(bit_test, bit_mask); + const __m256i bit_value = _mm256_and_si256(is_set, ones); + const __m256i bit_doubled = _mm256_add_epi8(bit_value, bit_value); + const __m256i qx = _mm256_sub_epi8(bit_doubled, ones); + + acc = _mm256_fmadd_ps(d, mul_sum_i8_pairs_float(qx, qy), acc); + } + } + + sumf = hsum_float_8(acc); +#endif + + // Scalar fallback for any remaining blocks + for (; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + + float sumi = 0.0f; + + for (int k = 0; k < 4; ++k) { const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); - + int sumi_block = 0; - + for (int j = 0; j < QK8_0; j++) { - const int bit_index = k * QK8_0 + j; + const int bit_index = k * QK8_0 + j; const int byte_index = bit_index / 8; const int bit_offset = bit_index % 8; - - // Extract bit: 1 = +1, 0 = -1 + const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 
1 : -1; const int yi = y[ib*4 + k].qs[j]; - + sumi_block += xi * yi; } - + sumi += d1 * sumi_block; } - + sumf += d0 * sumi; } diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 7f8456a5db8..7d25c9dc392 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -177,35 +177,35 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t float sumf = 0.0; - + // Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements // So we need 4 Q8_0 blocks per Q1_0_g128 block for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); - - int sumi = 0; - + + float sumi = 0.0f; + // Process 4 Q8_0 blocks (4 * 32 = 128 elements) for (int k = 0; k < 4; k++) { const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d); - + int sumi_block = 0; - + for (int j = 0; j < QK8_0; j++) { const int bit_index = k * QK8_0 + j; const int byte_index = bit_index / 8; const int bit_offset = bit_index % 8; - + // Extract bit: 1 = +1, 0 = -1 const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1; const int yi = y[i*4 + k].qs[j]; - + sumi_block += xi * yi; } - + sumi += d1 * sumi_block; } - + sumf += d0 * sumi; } From 4eaccd98e9bb67c69c9fbe8a305e3a168a2f4117 Mon Sep 17 00:00:00 2001 From: Phil Tomson Date: Thu, 2 Apr 2026 08:09:08 -0700 Subject: [PATCH 2/2] docs: add AMD ROCm/HIP build instructions to README Document how to build with GGML_HIP=ON targeting gfx1151 (Radeon 8060S / Ryzen AI MAX+), including the Docker-based workflow for systems where the system ROCm installation is partial. 
Co-Authored-By: Claude Sonnet 4.6 --- README.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/README.md b/README.md index 5c11f38048a..808e30ad5f3 100644 --- a/README.md +++ b/README.md @@ -290,6 +290,61 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon | | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR | +## AMD GPU support (ROCm/HIP) + +This branch supports running Bonsai Q1_0_g128 models on AMD GPUs via ROCm/HIP. The Q1_0_g128 CUDA kernels are compiled transparently by the HIP toolchain — no separate HIP-specific kernel code is needed. + +### Requirements + +- ROCm 7.x with hipBLAS and rocBLAS (including device libraries for your GPU target) +- The easiest way to get a fully configured environment is to use the `rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.10.0` Docker image, which includes gfx1151 (Radeon 8060S / Ryzen AI MAX+) support + +### Build + +```bash +HIPCXX=$(hipconfig -l)/clang HIP_PATH=$(hipconfig -R) \ +cmake -B build-hip \ + -DGGML_HIP=ON \ + -DGPU_TARGETS=gfx1151 \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_CUDA_FA=OFF +cmake --build build-hip --config Release -j$(nproc) +``` + +Replace `gfx1151` with your GPU's architecture. Find yours with: +```bash +rocminfo | grep gfx | head -1 | awk '{print $2}' +``` + +### Run + +```bash +build-hip/bin/llama-cli -m /path/to/Bonsai-8B.gguf -ngl 99 -p "your prompt" +``` + +`-ngl 99` offloads all layers to the GPU. On an integrated GPU (APU) that shares system memory, the full model fits in the GPU address space. 
+
+### Docker
+
+If your system ROCm installation is partial, build and run inside the Docker image (replace `/dev/dri/card1` and `/dev/dri/renderD128` with the device nodes for your GPU — check `ls /dev/dri`):
+
+```bash
+docker run --rm \
+  --device /dev/kfd --device /dev/dri/card1 --device /dev/dri/renderD128 \
+  --group-add video --group-add render \
+  -v /path/to/llama.cpp:/llama.cpp \
+  -v /path/to/models:/models \
+  rocm/pytorch:rocm7.2_ubuntu24.04_py3.12_pytorch_release_2.10.0 \
+  bash -c "
+    pip install cmake -q && cd /llama.cpp && \
+    HIPCXX=\$(hipconfig -l)/clang HIP_PATH=\$(hipconfig -R) \
+    cmake -B build-hip -DGGML_HIP=ON -DGPU_TARGETS=gfx1151 \
+      -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA_FA=OFF && \
+    cmake --build build-hip -j\$(nproc) && \
+    build-hip/bin/llama-cli -m /models/Bonsai-8B.gguf -ngl 99 -p 'Hello'
+  "
+
 ## Obtaining and quantizing models
 
 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`: