diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
index e4130ef22f9..8793aff4a71 100644
--- a/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -274,6 +274,25 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
 }
 #endif
 #elif defined(__SSSE3__)
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
+    const __m128i sum64 = _mm_add_epi32(hi64, a);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
+static inline __m128i bytes_from_bits_16(const uint8_t * x) {
+    uint16_t x16;
+    memcpy(&x16, x, sizeof(uint16_t));
+
+    const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+    __m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask);
+    const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe);
+    bytes = _mm_or_si128(bytes, bit_mask);
+
+    return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1));
+}
+
 // horizontally add 4x4 floats
 static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
     __m128 res_0 =_mm_hadd_ps(a, b);
@@ -541,11 +560,375 @@ static inline __m128i get_scale_shuffle(int i) {
 #endif
 
 void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q1_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__AVX512BW__)
+    // AVX-512BW: widen one full Q8_0 block to int16, apply the 32 sign bits
+    // directly to y, reduce pairs with madd, then scale and accumulate in fp32.
+    const __m512i ones_16 = _mm512_set1_epi16(1);
+    const __m512i zero = _mm512_setzero_si512();
+    __m512 acc = _mm512_setzero_ps();
+
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m256i qy_8 = _mm256_loadu_si256((const __m256i *) y[ib].qs);
+        const __m512i qy_16 = _mm512_cvtepi8_epi16(qy_8);
+        uint32_t bits;
+        memcpy(&bits, x[ib].qs, sizeof(bits));
+        const __m512i signed_y = _mm512_mask_sub_epi16(qy_16, (__mmask32)(~bits), zero, qy_16);
+        const __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16);
+
+        acc = _mm512_fmadd_ps(_mm512_set1_ps(d), _mm512_cvtepi32_ps(sum_32), acc);
+    }
+
+    {
+        const __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0),
+                                       _mm512_extractf32x8_ps(acc, 1));
+        __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0),
+                              _mm256_extractf128_ps(h, 1));
+        q = _mm_add_ps(q, _mm_movehl_ps(q, q));
+        q = _mm_add_ss(q, _mm_movehdup_ps(q));
+        sumf = _mm_cvtss_f32(q);
+    }
+#elif defined(__AVX2__)
+    // AVX2: expand the 32 packed sign bits to 32 signed bytes, run one byte-dot
+    // against the full Q8_0 block, and accumulate the scaled fp32 reduction.
+    __m256 acc = _mm256_setzero_ps();
+
+    for (; ib < nb; ++ib) {
+        const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
+        const __m256i bit_mask = bytes_from_bits_32(x[ib].qs);
+        const __m256i bit_value = _mm256_and_si256(bit_mask, _mm256_set1_epi8(1));
+        const __m256i qx = _mm256_sub_epi8(_mm256_add_epi8(bit_value, bit_value), _mm256_set1_epi8(1));
+        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        acc = _mm256_fmadd_ps(d, q, acc);
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // AVX: keep the same 32-bit sign expansion, but process the byte-domain dot
+    // as two 128-bit halves before combining them into one 256-bit reduction.
+    const __m128i ones_8 = _mm_set1_epi8(1);
+    __m256 acc = _mm256_setzero_ps();
+
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m256i bit_mask = bytes_from_bits_32(x[ib].qs);
+        const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
+        const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
+        const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
+        const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
+        const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
+        const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
+        const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]);
+        const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]);
+        const __m256i qx = MM256_SET_M128I(qx_1, qx_0);
+        const __m256i qy = MM256_SET_M128I(qy_1, qy_0);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d), q));
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__SSSE3__)
+    // SSSE3: decode the two 16-bit bit chunks into two 16-byte sign vectors,
+    // perform the pairwise int8 dot in 128-bit lanes, and accumulate to scalar.
+    const __m128i ones_8 = _mm_set1_epi8(1);
+
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[0]);
+        const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[2]);
+        const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
+        const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
+        const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
+        const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
+        const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]);
+        const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]);
+        const __m128i sum_0 = mul_sum_i8_pairs(qx_0, qy_0);
+        const __m128i sum_1 = mul_sum_i8_pairs(qx_1, qy_1);
+
+        sumf += d * hsum_i32_4(_mm_add_epi32(sum_0, sum_1));
+    }
+#endif
+
+    // Scalar fallback: stay byte-oriented so the packed sign bits feed four
+    // straight-line 8-value accumulations without per-element bit math.
+    for (; ib < nb; ++ib) {
+        const uint8_t * GGML_RESTRICT bits = x[ib].qs;
+        const int8_t  * GGML_RESTRICT qy   = y[ib].qs;
+        int sumi = 0;
+
+        for (int b = 0; b < 4; ++b, qy += 8) {
+            const unsigned mask = bits[b];
+            sumi += ((mask & 0x01) ? qy[0] : -qy[0])
+                 +  ((mask & 0x02) ? qy[1] : -qy[1])
+                 +  ((mask & 0x04) ? qy[2] : -qy[2])
+                 +  ((mask & 0x08) ? qy[3] : -qy[3])
+                 +  ((mask & 0x10) ? qy[4] : -qy[4])
+                 +  ((mask & 0x20) ? qy[5] : -qy[5])
+                 +  ((mask & 0x40) ? qy[6] : -qy[6])
+                 +  ((mask & 0x80) ? qy[7] : -qy[7]);
+        }
+
+        sumf += sumi * GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
+    }
+
+    *s = sumf;
 }
 
 void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
-    ggml_vec_dot_q1_0_g128_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+    const int qk = QK1_0_g128;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q1_0_g128 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__AVX512BW__)
+    // AVX-512BW: sign-extend int8→int16, mask-negate, madd→fma pipeline.
+    // The inner loop over 4 Q8_0 sub-blocks accumulates into acc_block,
+    // then a single FMA folds into the outer acc — this structure lets the
+    // CPU start the next outer iteration while the final FMA is still in flight.
+    const __m512i ones_16 = _mm512_set1_epi16(1);
+    const __m512i zero = _mm512_setzero_si512();
+    __m512 acc = _mm512_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        __m512 acc_block = _mm512_setzero_ps();
+
+        for (int k = 0; k < 4; k++) {
+            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
+
+            // 32 int8 → 32 int16 in a zmm register
+            __m256i y_8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
+            __m512i y_16 = _mm512_cvtepi8_epi16(y_8);
+
+            // Load 32 weight bits; negate y where bit=0 (weight = -1)
+            uint32_t bits;
+            memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits));
+            __m512i signed_y = _mm512_mask_sub_epi16(y_16, (__mmask32)(~bits), zero, y_16);
+
+            // Pair-wise sum int16→int32, convert to float, scale and accumulate
+            __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16);
+            __m512 sum_f = _mm512_cvtepi32_ps(sum_32);
+            acc_block = _mm512_fmadd_ps(_mm512_set1_ps(d1), sum_f, acc_block);
+        }
+
+        acc = _mm512_fmadd_ps(_mm512_set1_ps(d0), acc_block, acc);
+    }
+
+    // Horizontal sum: 512 → 256 → 128 → scalar
+    {
+        __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0),
+                                  _mm512_extractf32x8_ps(acc, 1));
+        __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0),
+                               _mm256_extractf128_ps(h, 1));
+        q = _mm_add_ps(q, _mm_movehl_ps(q, q));
+        q = _mm_add_ss(q, _mm_movehdup_ps(q));
+        *s = _mm_cvtss_f32(q);
+    }
+#elif defined(__AVX2__)
+    // AVX2: expand each 32-bit sign stream to 32 signed bytes, reduce two
+    // Q8_0 sub-blocks in parallel, then fold the pair into the outer block sum.
+    // Splitting the fixed 4-way inner loop into two independent accumulators
+    // gives the core more scheduling freedom than one longer dependency chain.
+    const __m256i ones_8 = _mm256_set1_epi8(1);
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        __m256 acc_block_0 = _mm256_setzero_ps();
+        __m256 acc_block_1 = _mm256_setzero_ps();
+
+        for (int k = 0; k < 4; k += 2) {
+            const block_q8_0 * GGML_RESTRICT yb_0 = &y[ib * 4 + k + 0];
+            const block_q8_0 * GGML_RESTRICT yb_1 = &y[ib * 4 + k + 1];
+            const __m256i bit_mask_0 = bytes_from_bits_32(&x[ib].qs[(k + 0) * 4]);
+            const __m256i bit_mask_1 = bytes_from_bits_32(&x[ib].qs[(k + 1) * 4]);
+            const __m256i bit_value_0 = _mm256_and_si256(bit_mask_0, ones_8);
+            const __m256i bit_value_1 = _mm256_and_si256(bit_mask_1, ones_8);
+            const __m256i qx_0 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_0, bit_value_0), ones_8);
+            const __m256i qx_1 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_1, bit_value_1), ones_8);
+            const __m256i qy_0 = _mm256_loadu_si256((const __m256i *) yb_0->qs);
+            const __m256i qy_1 = _mm256_loadu_si256((const __m256i *) yb_1->qs);
+            const __m256 q_0 = mul_sum_i8_pairs_float(qx_0, qy_0);
+            const __m256 q_1 = mul_sum_i8_pairs_float(qx_1, qy_1);
+            const __m256 d1_0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_0->d));
+            const __m256 d1_1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_1->d));
+
+            acc_block_0 = _mm256_fmadd_ps(d1_0, q_0, acc_block_0);
+            acc_block_1 = _mm256_fmadd_ps(d1_1, q_1, acc_block_1);
+        }
+
+        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), _mm256_add_ps(acc_block_0, acc_block_1), acc);
+    }
+
+    *s = hsum_float_8(acc);
+#elif defined(__AVX__)
+    // AVX: reuse the same 32-bit sign expansion, but do the byte-domain work
+    // with two 128-bit halves before combining them into one 256-bit reduction.
+    const __m128i ones_8 = _mm_set1_epi8(1);
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        __m256 acc_block = _mm256_setzero_ps();
+
+        for (int k = 0; k < 4; ++k) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
+            const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[k * 4]);
+            const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
+            const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
+            const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
+            const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
+            const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
+            const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
+            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &yb->qs[0]);
+            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &yb->qs[16]);
+            const __m256i qx = MM256_SET_M128I(qx_1, qx_0);
+            const __m256i qy = MM256_SET_M128I(qy_1, qy_0);
+            const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+            acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(d1), q));
+        }
+
+        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
+    }
+
+    *s = hsum_float_8(acc);
+#elif defined(__SSSE3__)
+    // SSSE3: decode each 32-bit sign stream into two 16-byte sign vectors,
+    // keep one fp32 accumulator per Q8_0 sub-block, then reduce once at end.
+    const __m128i ones_8 = _mm_set1_epi8(1);
+    __m128 acc_0 = _mm_setzero_ps();
+    __m128 acc_1 = _mm_setzero_ps();
+    __m128 acc_2 = _mm_setzero_ps();
+    __m128 acc_3 = _mm_setzero_ps();
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        const block_q8_0 * GGML_RESTRICT yb_0 = &y[ib * 4 + 0];
+        const block_q8_0 * GGML_RESTRICT yb_1 = &y[ib * 4 + 1];
+        const block_q8_0 * GGML_RESTRICT yb_2 = &y[ib * 4 + 2];
+        const block_q8_0 * GGML_RESTRICT yb_3 = &y[ib * 4 + 3];
+
+        const __m128i bit_mask_0_0 = bytes_from_bits_16(&x[ib].qs[0]);
+        const __m128i bit_mask_0_1 = bytes_from_bits_16(&x[ib].qs[2]);
+        const __m128i bit_value_0_0 = _mm_and_si128(bit_mask_0_0, ones_8);
+        const __m128i bit_value_0_1 = _mm_and_si128(bit_mask_0_1, ones_8);
+        const __m128i qx_0_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0_0, bit_value_0_0), ones_8);
+        const __m128i qx_0_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_0_1, bit_value_0_1), ones_8);
+        const __m128i qy_0_0 = _mm_loadu_si128((const __m128i *) &yb_0->qs[0]);
+        const __m128i qy_0_1 = _mm_loadu_si128((const __m128i *) &yb_0->qs[16]);
+        const __m128i sum_0_0 = mul_sum_i8_pairs(qx_0_0, qy_0_0);
+        const __m128i sum_0_1 = mul_sum_i8_pairs(qx_0_1, qy_0_1);
+        const __m128 q_0 = _mm_cvtepi32_ps(_mm_add_epi32(sum_0_0, sum_0_1));
+        acc_0 = _mm_add_ps(acc_0, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_0->d))), q_0));
+
+        const __m128i bit_mask_1_0 = bytes_from_bits_16(&x[ib].qs[4]);
+        const __m128i bit_mask_1_1 = bytes_from_bits_16(&x[ib].qs[6]);
+        const __m128i bit_value_1_0 = _mm_and_si128(bit_mask_1_0, ones_8);
+        const __m128i bit_value_1_1 = _mm_and_si128(bit_mask_1_1, ones_8);
+        const __m128i qx_1_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_1_0, bit_value_1_0), ones_8);
+        const __m128i qx_1_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1_1, bit_value_1_1), ones_8);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *) &yb_1->qs[0]);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *) &yb_1->qs[16]);
+        const __m128i sum_1_0 = mul_sum_i8_pairs(qx_1_0, qy_1_0);
+        const __m128i sum_1_1 = mul_sum_i8_pairs(qx_1_1, qy_1_1);
+        const __m128 q_1 = _mm_cvtepi32_ps(_mm_add_epi32(sum_1_0, sum_1_1));
+        acc_1 = _mm_add_ps(acc_1, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_1->d))), q_1));
+
+        const __m128i bit_mask_2_0 = bytes_from_bits_16(&x[ib].qs[8]);
+        const __m128i bit_mask_2_1 = bytes_from_bits_16(&x[ib].qs[10]);
+        const __m128i bit_value_2_0 = _mm_and_si128(bit_mask_2_0, ones_8);
+        const __m128i bit_value_2_1 = _mm_and_si128(bit_mask_2_1, ones_8);
+        const __m128i qx_2_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_2_0, bit_value_2_0), ones_8);
+        const __m128i qx_2_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_2_1, bit_value_2_1), ones_8);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *) &yb_2->qs[0]);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *) &yb_2->qs[16]);
+        const __m128i sum_2_0 = mul_sum_i8_pairs(qx_2_0, qy_2_0);
+        const __m128i sum_2_1 = mul_sum_i8_pairs(qx_2_1, qy_2_1);
+        const __m128 q_2 = _mm_cvtepi32_ps(_mm_add_epi32(sum_2_0, sum_2_1));
+        acc_2 = _mm_add_ps(acc_2, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_2->d))), q_2));
+
+        const __m128i bit_mask_3_0 = bytes_from_bits_16(&x[ib].qs[12]);
+        const __m128i bit_mask_3_1 = bytes_from_bits_16(&x[ib].qs[14]);
+        const __m128i bit_value_3_0 = _mm_and_si128(bit_mask_3_0, ones_8);
+        const __m128i bit_value_3_1 = _mm_and_si128(bit_mask_3_1, ones_8);
+        const __m128i qx_3_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_3_0, bit_value_3_0), ones_8);
+        const __m128i qx_3_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_3_1, bit_value_3_1), ones_8);
+        const __m128i qy_3_0 = _mm_loadu_si128((const __m128i *) &yb_3->qs[0]);
+        const __m128i qy_3_1 = _mm_loadu_si128((const __m128i *) &yb_3->qs[16]);
+        const __m128i sum_3_0 = mul_sum_i8_pairs(qx_3_0, qy_3_0);
+        const __m128i sum_3_1 = mul_sum_i8_pairs(qx_3_1, qy_3_1);
+        const __m128 q_3 = _mm_cvtepi32_ps(_mm_add_epi32(sum_3_0, sum_3_1));
+        acc_3 = _mm_add_ps(acc_3, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_3->d))), q_3));
+    }
+
+    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+#else
+    // Scalar fallback: keep the bitstream byte-oriented so each Q8_0 sub-block
+    // becomes four straight-line 8-value accumulations with no per-element
+    // divide/modulo/index arithmetic.
+    float sumf = 0.0f;
+
+    for (int ib = 0; ib < nb; ++ib) {
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        float sumi = 0.0f;
+
+        for (int k = 0; k < 4; k++) {
+            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
+            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
+            int sumi_block = 0;
+
+            const uint8_t * GGML_RESTRICT bits = &x[ib].qs[k * 4];
+            const int8_t  * GGML_RESTRICT qy   = yb->qs;
+
+            for (int b = 0; b < 4; ++b, qy += 8) {
+                const unsigned mask = bits[b];
+                sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+                           +  ((mask & 0x02) ? qy[1] : -qy[1])
+                           +  ((mask & 0x04) ? qy[2] : -qy[2])
+                           +  ((mask & 0x08) ? qy[3] : -qy[3])
+                           +  ((mask & 0x10) ? qy[4] : -qy[4])
+                           +  ((mask & 0x20) ? qy[5] : -qy[5])
+                           +  ((mask & 0x40) ? qy[6] : -qy[6])
+                           +  ((mask & 0x80) ? qy[7] : -qy[7]);
+            }
+
+            sumi += d1 * sumi_block;
+        }
+
+        sumf += d0 * sumi;
+    }
+
+    *s = sumf;
+#endif
 }
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c
index 00af7e2ddc6..790eafda05a 100644
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@@ -137,25 +137,29 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
     
     float sumf = 0.0;
     
+    // Generic fallback: keep the sign stream byte-oriented and process the
+    // 32-value Q8_0 block as four explicit 8-value groups. This keeps the
+    // portable structure while avoiding per-element bit-index arithmetic.
     for (int i = 0; i < nb; i++) {
-        const float d0 = GGML_FP16_TO_FP32(x[i].d);
-        const float d1 = GGML_FP16_TO_FP32(y[i].d);
-        
+        const float d = GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d);
         int sumi = 0;
-        
-        for (int j = 0; j < QK1_0; j++) {
-            const int bit_index = j;
-            const int byte_index = bit_index / 8;
-            const int bit_offset = bit_index % 8;
-            
-            // Extract bit: 1 = +1, 0 = -1
-            const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
-            const int yi = y[i].qs[j];
-            
-            sumi += xi * yi;
+
+        const uint8_t * GGML_RESTRICT bits = x[i].qs;
+        const int8_t  * GGML_RESTRICT qy   = y[i].qs;
+
+        for (int b = 0; b < 4; ++b, qy += 8) {
+            const unsigned mask = bits[b];
+            sumi += ((mask & 0x01) ? qy[0] : -qy[0])
+                 +  ((mask & 0x02) ? qy[1] : -qy[1])
+                 +  ((mask & 0x04) ? qy[2] : -qy[2])
+                 +  ((mask & 0x08) ? qy[3] : -qy[3])
+                 +  ((mask & 0x10) ? qy[4] : -qy[4])
+                 +  ((mask & 0x20) ? qy[5] : -qy[5])
+                 +  ((mask & 0x40) ? qy[6] : -qy[6])
+                 +  ((mask & 0x80) ? qy[7] : -qy[7]);
         }
-        
-        sumf += d0 * d1 * sumi;
+
+        sumf += d * sumi;
     }
     
     *s = sumf;
@@ -178,25 +182,33 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
     
     float sumf = 0.0;
     
-    // Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements
-    // So we need 4 Q8_0 blocks per Q1_0_g128 block
+    // Generic fallback: keep the packed sign bits byte-oriented and process
+    // one Q8_0 sub-block as four explicit 8-value groups. This preserves the
+    // portable structure while avoiding per-element bit-index arithmetic.
     for (int i = 0; i < nb; i++) {
         const float d0 = GGML_FP16_TO_FP32(x[i].d);
         
         float sumi = 0.0f;
 
+        // Process 4 Q8_0 blocks (4 * 32 = 128 elements)
         for (int k = 0; k < 4; k++) {
-            const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
-
+            const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
+            const float d1 = GGML_FP16_TO_FP32(yb->d);
             int sumi_block = 0;
 
-            for (int j = 0; j < QK8_0; j++) {
-                const int bit_index = k * QK8_0 + j;
-                const int byte_index = bit_index / 8;
-                const int bit_offset = bit_index % 8;
-
-                const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
-                sumi_block += xi * y[i*4 + k].qs[j];
+            const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4];
+            const int8_t  * GGML_RESTRICT qy   = yb->qs;
+
+            for (int b = 0; b < 4; ++b, qy += 8) {
+                const unsigned mask = bits[b];
+                sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
+                           +  ((mask & 0x02) ? qy[1] : -qy[1])
+                           +  ((mask & 0x04) ? qy[2] : -qy[2])
+                           +  ((mask & 0x08) ? qy[3] : -qy[3])
+                           +  ((mask & 0x10) ? qy[4] : -qy[4])
+                           +  ((mask & 0x20) ? qy[5] : -qy[5])
+                           +  ((mask & 0x40) ? qy[6] : -qy[6])
+                           +  ((mask & 0x80) ? qy[7] : -qy[7]);
             }
 
             sumi += d1 * sumi_block;