From 9343e094a02301679ed228dd04b473bea6ecb342 Mon Sep 17 00:00:00 2001 From: Marxist-Leninist Date: Thu, 2 Apr 2026 16:17:03 +0000 Subject: [PATCH 1/5] fix: Q1_0_g128 CPU dot product int truncation The accumulator `sumi` in ggml_vec_dot_q1_0_g128_q8_0 was declared as `int` but accumulates `float d1 * int sumi_block`, causing the float result to be truncated to integer on each iteration. This produced garbage output for Q1_0_g128 models on CPU. Fix: change `int sumi = 0` to `float sumi = 0` in both the x86 and generic (portable) kernels. --- ggml/src/ggml-cpu/arch/x86/quants.c | 2 +- ggml/src/ggml-cpu/quants.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 45129f08a16..e2481d37114 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -670,7 +670,7 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons for (int ib = 0; ib < nb; ++ib) { const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); - int sumi = 0; + float sumi = 0; // Process 4 Q8_0 blocks (4 * 32 = 128 elements) for (int k = 0; k < 4; k++) { diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 7f8456a5db8..bd83809e855 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -183,7 +183,7 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); - int sumi = 0; + float sumi = 0; // Process 4 Q8_0 blocks (4 * 32 = 128 elements) for (int k = 0; k < 4; k++) { From 3909d587836c997af66bc444004a9b35b5c8158e Mon Sep 17 00:00:00 2001 From: Marxist-Leninist Date: Thu, 2 Apr 2026 16:30:11 +0000 Subject: [PATCH 2/5] feat: add AVX2 SIMD path for Q1_0_g128 CPU dot product The existing scalar fallback runs at ~0.2 t/s on CPUs without AVX-512 (Ryzen, Intel 12th+ gen consumer). 
This adds an AVX2 path that: - Sign-extends int8->int16 in two 16-element passes per Q8_0 block - Expands 1-bit weights to 16-bit masks via broadcast+AND+cmpeq - Uses blendv to negate activations where weight bit=0 - Accumulates via madd_epi16 -> cvtepi32_ps -> fmadd_ps AVX2 is supported on virtually all x86-64 CPUs from 2013+. --- ggml/src/ggml-cpu/arch/x86/quants.c | 57 +++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index e2481d37114..acbb3807d5f 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -664,9 +664,59 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons float sumf = 0; - // Each Q1_0_g128 block has 128 elements - // Each Q8_0 block has 32 elements - // So we need 4 Q8_0 blocks per Q1_0_g128 block +#if defined(__AVX2__) + // AVX2: process 32 Q8_0 values per sub-block in two 16-element passes. + // Sign-extend int8->int16, expand 1-bit weights to masks, blend to negate, + // then madd->fma accumulation. 
+ const __m256i ones_16 = _mm256_set1_epi16(1); + const __m256i bmask = _mm256_setr_epi16( + 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, + 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15)); + __m256 acc = _mm256_setzero_ps(); + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + __m256 acc_block = _mm256_setzero_ps(); + + for (int k = 0; k < 4; k++) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + const __m256i y_bytes = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + + uint32_t bits; + memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits)); + + // Lower 16 elements: sign-extend int8->int16, apply sign from weight bits + const __m256i y_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(y_bytes)); + const __m256i neg_lo = _mm256_sub_epi16(_mm256_setzero_si256(), y_lo); + const __m256i mask_lo = _mm256_cmpeq_epi16( + _mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask); + const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo); + + // Upper 16 elements + const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1)); + const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi); + const __m256i mask_hi = _mm256_cmpeq_epi16( + _mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask); + const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi); + + // Pair-wise sum int16->int32, combine halves, convert to float, FMA + const __m256i sum_32 = _mm256_add_epi32( + _mm256_madd_epi16(signed_lo, ones_16), + _mm256_madd_epi16(signed_hi, ones_16)); + acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1), + _mm256_cvtepi32_ps(sum_32), acc_block); + } + acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc); + } + // Horizontal reduction: 256 -> 128 -> scalar + { + const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0), + _mm256_extractf128_ps(acc, 1)); + const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h)); + *s = 
_mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q))); + } +#else + // Scalar fallback for (int ib = 0; ib < nb; ++ib) { const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); @@ -697,6 +747,7 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons } *s = sumf; +#endif } void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { From 77c635522ddcdf11763033b512578e686f7ae104 Mon Sep 17 00:00:00 2001 From: Marxist-Leninist Date: Thu, 2 Apr 2026 16:35:52 +0000 Subject: [PATCH 3/5] chore: track gguf files with git-lfs --- .gitattributes | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000..ae756a3f816 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.gguf filter=lfs diff=lfs merge=lfs -text From d603bf40924139e4481cd9d4209728fa919a19d9 Mon Sep 17 00:00:00 2001 From: Marxist-Leninist Date: Thu, 2 Apr 2026 17:23:47 +0000 Subject: [PATCH 4/5] =?UTF-8?q?perf:=20optimized=20AVX2=20Q1=5F0=5Fg128=20?= =?UTF-8?q?kernel=20=E2=80=94=20single-pass=20byte-level=20processing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace two-pass int16 blendv approach with: - Single-pass byte-level bit expansion (shuffle+AND+cmpeq) - XOR+SUB negate trick (replaces slow blendv, 2-3 cyc -> 1 cyc each) - maddubs+madd accumulation (stays in int8 longer) - Fully unrolled k-loop (eliminates loop overhead + branch) Benchmark on i7-10510U (AVX2+FMA, 4T): Scalar: 0.2 t/s prompt, 0.2 t/s gen AVX2 v1: 2.4 t/s prompt, 2.1 t/s gen (two-pass blendv) AVX2 v3: 4.7 t/s prompt, 3.1 t/s gen (this commit) ~15x faster than scalar, ~50% faster than v1. 
---
 ggml/src/ggml-cpu/arch/x86/quants.c | 106 +++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 50 deletions(-)

diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index acbb3807d5f..e46eaa2562e 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -665,56 +665,62 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
     float sumf = 0;
 
 #if defined(__AVX2__)
-    // AVX2: process 32 Q8_0 values per sub-block in two 16-element passes.
-    // Sign-extend int8->int16, expand 1-bit weights to masks, blend to negate,
-    // then madd->fma accumulation.
-    const __m256i ones_16 = _mm256_set1_epi16(1);
-    const __m256i bmask = _mm256_setr_epi16(
-        1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7,
-        1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15));
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int ib = 0; ib < nb; ++ib) {
-        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        __m256 acc_block = _mm256_setzero_ps();
-
-        for (int k = 0; k < 4; k++) {
-            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
-            const __m256i y_bytes = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
-
-            uint32_t bits;
-            memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits));
-
-            // Lower 16 elements: sign-extend int8->int16, apply sign from weight bits
-            const __m256i y_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(y_bytes));
-            const __m256i neg_lo = _mm256_sub_epi16(_mm256_setzero_si256(), y_lo);
-            const __m256i mask_lo = _mm256_cmpeq_epi16(
-                _mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask);
-            const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo);
-
-            // Upper 16 elements
-            const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1));
-            const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi);
-            const __m256i mask_hi = _mm256_cmpeq_epi16(
-                _mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask);
-            const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi);
-
-            // Pair-wise sum int16->int32, combine halves, convert to float, FMA
-            const __m256i sum_32 = _mm256_add_epi32(
-                _mm256_madd_epi16(signed_lo, ones_16),
-                _mm256_madd_epi16(signed_hi, ones_16));
-            acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1),
-                _mm256_cvtepi32_ps(sum_32), acc_block);
-        }
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
-    }
-    // Horizontal reduction: 256 -> 128 -> scalar
-    {
-        const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
-                                    _mm256_extractf128_ps(acc, 1));
-        const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
-        *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
-    }
+    // AVX2 v3: single-pass byte-level processing, fully unrolled k-loop.
+    // Pipeline: broadcast+shuffle -> AND+cmpeq -> XOR+SUB -> maddubs+madd -> cvt+fma
+    const __m256i ones_8  = _mm256_set1_epi8(1);
+    const __m256i ones_16 = _mm256_set1_epi16(1);
+    const __m256i byte_shuf = _mm256_setr_epi8(
+        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
+        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
+    const __m256i bit_masks = _mm256_setr_epi8(
+        1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128,
+        1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128);
+    const __m256i zero = _mm256_setzero_si256();
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        const block_q8_0 * y_ptr = &y[ib*4];
+        uint32_t qb[4];
+        memcpy(qb, x[ib].qs, sizeof(qb));  // avoid a possibly misaligned uint32_t* cast
+        __m256 acc_block;
+
+        // Expand the 32 weight bits of sub-block K to a 0x00/0xFF byte mask,
+        // negate activations where the bit is clear (XOR+SUB two's-complement
+        // trick), then reduce 32 int8 values to 8 int32 lanes via
+        // maddubs(1,x) + madd(1,x). Assumes Q8_0 qs lie in [-127,127]:
+        // byte-level negation of -128 would wrap.
+#define Q1_AVX2_BLOCK(K) \
+        { \
+            const __m256i yq = _mm256_loadu_si256((const __m256i *)y_ptr[K].qs); \
+            const __m256i sm = _mm256_cmpeq_epi8(_mm256_and_si256( \
+                _mm256_shuffle_epi8(_mm256_set1_epi32((int)qb[K]), byte_shuf), \
+                bit_masks), zero); \
+            const __m256i sy  = _mm256_sub_epi8(_mm256_xor_si256(yq, sm), sm); \
+            const __m256i s32 = _mm256_madd_epi16( \
+                _mm256_maddubs_epi16(ones_8, sy), ones_16); \
+            acc_block = (K == 0) \
+                ? _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
+                                _mm256_cvtepi32_ps(s32)) \
+                : _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
+                                  _mm256_cvtepi32_ps(s32), acc_block); \
+        }
+
+        Q1_AVX2_BLOCK(0)
+        Q1_AVX2_BLOCK(1)
+        Q1_AVX2_BLOCK(2)
+        Q1_AVX2_BLOCK(3)
+#undef Q1_AVX2_BLOCK
+
+        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
+    }
+    // Horizontal reduction: 256 -> 128 -> scalar
+    {
+        const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
+                                    _mm256_extractf128_ps(acc, 1));
+        const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
+        *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
+    }
 #else
     // Scalar fallback
     for (int ib = 0; ib < nb; ++ib) {

From 12788a267bed32373365d369203e9b37ac747f46 Mon Sep 17 00:00:00 2001
From: Marxist-Leninist
Date: Thu, 2 Apr 2026 17:45:35 +0000
Subject: [PATCH 5/5] perf: COM6-inspired mul_mat dispatch for Q1_0_g128

Apply cache-blocking and prefetch optimizations from the COM6 matrix
multiplication library (github.com/Marxist-Leninist/COM6):

- Increase weight row block size from 16 to 64 for Q1_0_g128 (1-bit rows
  are ~576 bytes at K=4096; 64 rows = 36KB, slightly past a typical 32KB
  client L1d but comfortably L2-resident)
- Add software prefetch of weight rows 4 iterations ahead, mirroring
  COM6 distributed prefetch strategy
- Enlarge tmp accumulator buffer to match larger block size

Benchmark on i7-10510U (4T, Bonsai-8B Q1_0_g128):
  Before: 3.14 t/s generation
  After:  3.43 t/s generation (+9%)
---
 ggml/src/ggml-cpu/ggml-cpu.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 48fbddf74f5..7de4da3338c 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1185,15 +1185,16 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     assert(ne12 % ne02 == 0);
     assert(ne13 % ne03 == 0);
 
-    // block-tiling attempt
-    const int64_t blck_0 = 16;
+    // COM6-inspired block-tiling: larger blocks for Q1_0_g128 (1-bit weights are tiny,
+    // so we can fit more rows in L1). Prefetch next weight block while processing current.
+    const int64_t blck_0 = (type == GGML_TYPE_Q1_0_g128) ? 64 : 16;
     const int64_t blck_1 = 16;
 
     const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ?
row_size : nb11;
 
     // attempt to reduce false-sharing (does not seem to make a difference)
-    // 16 * 2, accounting for mmla kernels
-    float tmp[32];
+    // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once)
+    float tmp[128];
 
     for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
         for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
@@ -1226,12 +1227,21 @@ static void ggml_compute_forward_mul_mat_one_chunk(
             //     vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
             //}
 
-            for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
-                vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+            // COM6-inspired: prefetch upcoming weight rows while computing current ones.
+            const int64_t ir0_max = MIN(iir0 + blck_0, ir0_end);
+            for (int64_t ir0 = iir0; ir0 < ir0_max; ir0 += num_rows_per_vec_dot) {
+#if defined(__GNUC__) || defined(__clang__)
+                if (ir0 + 4 * num_rows_per_vec_dot < ir0_max) {
+                    __builtin_prefetch(src0_row + (ir0 + 4 * num_rows_per_vec_dot) * nb01, 0, 1);
+                }
+#endif
+                // bs must equal the tmp row stride: with blck_0 > 16 a hard-coded
+                // 16 would let 2-row (mmla) kernels overwrite row-0 results in tmp.
+                vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? (size_t)blck_0 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
             }
 
             for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
-                memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + cn * blck_0, (ir0_max - iir0) * sizeof(float));
             }
         }
     }