From bfbafa28e05646ade38bdd28568f20037ead45ad Mon Sep 17 00:00:00 2001
From: wildcattrio
Date: Thu, 2 Apr 2026 18:30:55 +0100
Subject: [PATCH] =?UTF-8?q?fix:=20Q1=5F0=5Fg128=20x86=20CPU=20kernel=20?=
 =?UTF-8?q?=E2=80=94=20float=20truncation=20+=20AVX2=20vectorization?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Q1_0_g128 x86 kernel has two bugs causing gibberish output at
0.25 tok/s on Intel CPUs:

1. Float-to-int truncation: the per-block accumulator was `int`,
   truncating `d1 * sumi_block` (float * int → float → int). Each
   Q8_0 block's scaled partial sum was truncated toward zero on every
   accumulation, destroying the output. Fix: `float block_sum`
   accumulator.

2. No SIMD: the x86 path was scalar-only while ARM NEON had full
   vectorization. Added AVX2 using the same broadcast/shuffle/cmpeq
   pattern from the existing Q1_0 kernel + mul_sum_i8_pairs_float.

Results on i5-1135G7 with Bonsai 8B:

  Before (MSVC):  0.25 tok/s, gibberish output
  Bug fix only:   3.7 tok/s, correct output
  Bug fix + AVX2: 6.9 tok/s, correct output

Both the x86-specific kernel (arch/x86/quants.c) and the generic
fallback (quants.c) are fixed.
---
 ggml/src/ggml-cpu/arch/x86/quants.c | 102 +++++++++++++++++++++++-----
 ggml/src/ggml-cpu/quants.c          |  26 +++----
 2 files changed, 99 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index 45129f08a16..426b707a58d 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -662,41 +662,111 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
     const block_q1_0_g128 * GGML_RESTRICT x = vx;
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-    float sumf = 0;
+    float sumf = 0.0f;
+
+#if defined(__AVX2__)
+    // AVX2 vectorized path for Q1_0_g128 dot Q8_0
+    // Uses the same proven bit-expansion pattern as the Q1_0 kernel above.
+    //
+    // Each Q1_0_g128 block has 128 bits = 16 bytes of packed bits.
+    // Each Q8_0 block has 32 int8 values with its own fp16 scale.
+    // We process 4 Q8_0 blocks per Q1_0_g128 block.
+
+    // Constant shuffle mask: replicate each of 4 bytes to 8 positions
+    // Low 128-bit lane:  byte0 x8, byte1 x8
+    // High 128-bit lane: byte2 x8, byte3 x8
+    // (AVX2 shuffle_epi8 works within each 128-bit lane independently)
+    const __m256i shuffle_mask = _mm256_set_epi8(
+        3, 3, 3, 3, 3, 3, 3, 3,  // high lane: byte 3 replicated
+        2, 2, 2, 2, 2, 2, 2, 2,  // high lane: byte 2 replicated
+        1, 1, 1, 1, 1, 1, 1, 1,  // low lane: byte 1 replicated
+        0, 0, 0, 0, 0, 0, 0, 0   // low lane: byte 0 replicated
+    );
+
+    // Bit mask: test each bit position within the replicated byte
+    const __m256i bit_mask = _mm256_set_epi8(
+        (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01,
+        (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01,
+        (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01,
+        (char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
+    );
+
+    const __m256i ones = _mm256_set1_epi8(1);
+    __m256 acc = _mm256_setzero_ps();
 
-    // Each Q1_0_g128 block has 128 elements
-    // Each Q8_0 block has 32 elements
-    // So we need 4 Q8_0 blocks per Q1_0_g128 block
     for (int ib = 0; ib < nb; ++ib) {
         const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-
-        int sumi = 0;
-
+        // Process 4 Q8_0 blocks (4 * 32 = 128 elements)
+        for (int k = 0; k < 4; k++) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+
+            // Combined scale for this sub-block
+            const __m256 d = _mm256_set1_ps(d0 * GGML_CPU_FP16_TO_FP32(yb->d));
+
+            // Load 32 int8 values from y
+            const __m256i qy = _mm256_loadu_si256((const __m256i *)yb->qs);
+
+            // Get 4 bytes of bits for this Q8_0 block
+            const uint32_t bits32 = *(const uint32_t *)&x[ib].qs[k * 4];
+
+            // Expand 32 bits to 32 sign bytes (+1/-1)
+            // Same pattern as Q1_0 kernel: broadcast → shuffle → test → convert
+            const __m128i bits_128 = _mm_set1_epi32((int)bits32);
+            const __m256i bits_256 = _mm256_broadcastsi128_si256(bits_128);
+            const __m256i bits_shuffled = _mm256_shuffle_epi8(bits_256, shuffle_mask);
+
+            const __m256i bit_test = _mm256_and_si256(bits_shuffled, bit_mask);
+            const __m256i is_set = _mm256_cmpeq_epi8(bit_test, bit_mask);
+
+            // Convert 0xFF → +1, 0x00 → -1
+            const __m256i bit_value = _mm256_and_si256(is_set, ones);  // 0x01 or 0x00
+            const __m256i bit_doubled = _mm256_add_epi8(bit_value, bit_value);  // 0x02 or 0x00
+            const __m256i qx = _mm256_sub_epi8(bit_doubled, ones);  // +1 or -1
+
+            // Dot product of sign bytes * y bytes, result as float
+            const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+            // Accumulate with scaling
+            acc = _mm256_fmadd_ps(d, q, acc);
+        }
+    }
+
+    sumf = hsum_float_8(acc);
+    *s = sumf;
+    return;
+
+#else
+    // Scalar fallback with float accumulation (bug-fixed)
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+
+        float block_sum = 0.0f;  // BUG FIX: was int, must be float
+
         for (int k = 0; k < 4; k++) {
             const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
-
+
             int sumi_block = 0;
-
+
             for (int j = 0; j < QK8_0; j++) {
                 const int bit_index = k * QK8_0 + j;
                 const int byte_index = bit_index / 8;
                 const int bit_offset = bit_index % 8;
-
-                // Extract bit: 1 = +1, 0 = -1
+
                 const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
                 const int yi = y[ib*4 + k].qs[j];
-
+
                 sumi_block += xi * yi;
             }
-
-            sumi += d1 * sumi_block;
+
+            block_sum += d1 * (float)sumi_block;  // BUG FIX: float accumulation
         }
-
-        sumf += d0 * sumi;
+
+        sumf += d0 * block_sum;
     }
 
     *s = sumf;
+#endif
 }
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
index 7f8456a5db8..a8fadb064b8 100644
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@@ -176,37 +176,37 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
     const block_q8_0 * GGML_RESTRICT y = vy;
 
-    float sumf = 0.0;
-
+    float sumf = 0.0f;
+
     // Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements
     // So we need 4 Q8_0 blocks per Q1_0_g128 block
     for (int i = 0; i < nb; i++) {
         const float d0 = GGML_FP16_TO_FP32(x[i].d);
-
-        int sumi = 0;
-
+
+        float block_sum = 0.0f;  // BUG FIX: was int, must be float
+
         // Process 4 Q8_0 blocks (4 * 32 = 128 elements)
         for (int k = 0; k < 4; k++) {
             const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
-
+
             int sumi_block = 0;
-
+
             for (int j = 0; j < QK8_0; j++) {
                 const int bit_index = k * QK8_0 + j;
                 const int byte_index = bit_index / 8;
                 const int bit_offset = bit_index % 8;
-
+
                 // Extract bit: 1 = +1, 0 = -1
                 const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
                 const int yi = y[i*4 + k].qs[j];
-
+
                 sumi_block += xi * yi;
             }
-
-            sumi += d1 * sumi_block;
+
+            block_sum += d1 * (float)sumi_block;  // BUG FIX: float accumulation
         }
-
-        sumf += d0 * sumi;
+
+        sumf += d0 * block_sum;
     }
 
     *s = sumf;