Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
346 changes: 344 additions & 2 deletions ggml/src/ggml-cpu/arch/x86/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,25 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
}
#endif
#elif defined(__SSSE3__)
// Horizontally add the four int32 lanes of a and return the scalar sum.
static inline int hsum_i32_4(const __m128i a) {
    // fold the high lane pair onto the low pair, then fold the last two lanes
    __m128i sum = _mm_add_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_cvtsi128_si32(sum);
}

// Expand 16 packed bits into 16 byte lanes: lane i is 0xFF when bit (i % 8)
// of byte (i / 8) is set, 0x00 otherwise.
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
    const char lo = (char) x[0];
    const char hi = (char) x[1];

    // broadcast byte 0 into lanes 0-7 and byte 1 into lanes 8-15
    const __m128i repeated = _mm_set_epi8(hi, hi, hi, hi, hi, hi, hi, hi,
                                          lo, lo, lo, lo, lo, lo, lo, lo);

    // lane i isolates bit (i % 8); the masked lane equals its bit position
    // exactly when that bit was set in the input
    const __m128i bit_pos = _mm_set1_epi64x((long long) 0x8040201008040201ULL);
    return _mm_cmpeq_epi8(_mm_and_si128(repeated, bit_pos), bit_pos);
}

// horizontally add 4x4 floats
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
__m128 res_0 =_mm_hadd_ps(a, b);
Expand Down Expand Up @@ -541,11 +560,334 @@ static inline __m128i get_scale_shuffle(int i) {
#endif

// Dot product of one Q1_0 row against one Q8_0 row over n values, processed
// in blocks of QK8_0 = 32. Each Q1_0 block carries 32 packed sign bits
// (bit set => +1, bit clear => -1) in qs plus one fp16 scale d; each Q8_0
// block carries 32 int8 values plus one fp16 scale. Writes the result to *s.
// bs/bx/by/nrc follow the shared vec_dot signature; only nrc == 1 is handled.
//
// NOTE: the previous version first called ggml_vec_dot_q1_0_q8_0_generic(),
// whose result was then fully recomputed and overwritten below — that
// redundant call (doubling the work) has been removed.
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q1_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__AVX512BW__)
    // AVX-512BW: widen one full Q8_0 block to int16, apply the 32 sign bits
    // directly to y, reduce pairs with madd, then scale and accumulate in fp32.
    const __m512i ones_16 = _mm512_set1_epi16(1);
    const __m512i zero = _mm512_setzero_si512();
    __m512 acc = _mm512_setzero_ps();

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
        const __m256i qy_8 = _mm256_loadu_si256((const __m256i *) y[ib].qs);
        const __m512i qy_16 = _mm512_cvtepi8_epi16(qy_8);
        uint32_t bits;
        memcpy(&bits, x[ib].qs, sizeof(bits));
        // mask bit clear (weight -1) selects 0 - y; mask bit set keeps y
        const __m512i signed_y = _mm512_mask_sub_epi16(qy_16, (__mmask32)(~bits), zero, qy_16);
        const __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16);

        acc = _mm512_fmadd_ps(_mm512_set1_ps(d), _mm512_cvtepi32_ps(sum_32), acc);
    }

    // Horizontal sum: 512 -> 256 -> 128 -> scalar
    {
        const __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0),
                                       _mm512_extractf32x8_ps(acc, 1));
        __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0),
                              _mm256_extractf128_ps(h, 1));
        q = _mm_add_ps(q, _mm_movehl_ps(q, q));
        q = _mm_add_ss(q, _mm_movehdup_ps(q));
        sumf = _mm_cvtss_f32(q);
    }
#elif defined(__AVX2__)
    // AVX2: expand the 32 packed sign bits to 32 signed bytes, run one byte-dot
    // against the full Q8_0 block, and accumulate the scaled fp32 reduction.
    __m256 acc = _mm256_setzero_ps();

    for (; ib < nb; ++ib) {
        const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
        const __m256i bit_mask  = bytes_from_bits_32(x[ib].qs);
        // 0x00/0xFF mask -> 0/1 -> -1/+1 signed bytes
        const __m256i bit_value = _mm256_and_si256(bit_mask, _mm256_set1_epi8(1));
        const __m256i qx = _mm256_sub_epi8(_mm256_add_epi8(bit_value, bit_value), _mm256_set1_epi8(1));
        const __m256i qy = _mm256_loadu_si256((const __m256i *) y[ib].qs);
        const __m256 q = mul_sum_i8_pairs_float(qx, qy);

        acc = _mm256_fmadd_ps(d, q, acc);
    }

    sumf = hsum_float_8(acc);
#elif defined(__AVX__)
    // AVX: keep the same 32-bit sign expansion, but process the byte-domain dot
    // as two 128-bit halves before combining them into one 256-bit reduction.
    const __m128i ones_8 = _mm_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
        const __m256i bit_mask = bytes_from_bits_32(x[ib].qs);
        const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
        const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
        const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
        const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
        const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
        const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
        const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]);
        const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]);
        const __m256i qx = MM256_SET_M128I(qx_1, qx_0);
        const __m256i qy = MM256_SET_M128I(qy_1, qy_0);
        const __m256 q = mul_sum_i8_pairs_float(qx, qy);

        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d), q));
    }

    sumf = hsum_float_8(acc);
#elif defined(__SSSE3__)
    // SSSE3: decode the two 16-bit bit chunks into two 16-byte sign vectors,
    // perform the pairwise int8 dot in 128-bit lanes, and accumulate to scalar.
    const __m128i ones_8 = _mm_set1_epi8(1);

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
        const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[0]);
        const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[2]);
        const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
        const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
        const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
        const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
        const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]);
        const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]);
        const __m128i sum_0 = mul_sum_i8_pairs(qx_0, qy_0);
        const __m128i sum_1 = mul_sum_i8_pairs(qx_1, qy_1);

        sumf += d * hsum_i32_4(_mm_add_epi32(sum_0, sum_1));
    }
#endif

    // Scalar fallback: stay byte-oriented so the packed sign bits feed four
    // straight-line 8-value accumulations without per-element bit math.
    // (When a SIMD path above ran, ib == nb and this loop is skipped.)
    for (; ib < nb; ++ib) {
        const uint8_t * GGML_RESTRICT bits = x[ib].qs;
        const int8_t  * GGML_RESTRICT qy   = y[ib].qs;
        int sumi = 0;

        for (int b = 0; b < 4; ++b, qy += 8) {
            const unsigned mask = bits[b];
            sumi += ((mask & 0x01) ? qy[0] : -qy[0])
                  + ((mask & 0x02) ? qy[1] : -qy[1])
                  + ((mask & 0x04) ? qy[2] : -qy[2])
                  + ((mask & 0x08) ? qy[3] : -qy[3])
                  + ((mask & 0x10) ? qy[4] : -qy[4])
                  + ((mask & 0x20) ? qy[5] : -qy[5])
                  + ((mask & 0x40) ? qy[6] : -qy[6])
                  + ((mask & 0x80) ? qy[7] : -qy[7]);
        }

        sumf += sumi * GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}

// Dot product of one Q1_0 (group size 128) row against one Q8_0 row over n
// values, processed in blocks of QK1_0_g128 = 128. Each Q1_0_g128 block
// carries one fp16 scale d plus 128 packed sign bits (bit set => +1, clear
// => -1); it lines up with four consecutive Q8_0 sub-blocks of 32 int8
// values, each with its own fp16 scale. Writes the result to *s.
// bs/bx/by/nrc follow the shared vec_dot signature; only nrc == 1 is handled.
//
// NOTE: the previous version first called ggml_vec_dot_q1_0_g128_q8_0_generic(),
// whose result was then fully recomputed and overwritten below — that
// redundant call (doubling the work) has been removed.
void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK1_0_g128;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q1_0_g128 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__AVX512BW__)
    // AVX-512BW: sign-extend int8→int16, mask-negate, madd→fma pipeline.
    // The inner loop over 4 Q8_0 sub-blocks accumulates into acc_block,
    // then a single FMA folds into the outer acc — this structure lets the
    // CPU start the next outer iteration while the final FMA is still in flight.
    const __m512i ones_16 = _mm512_set1_epi16(1);
    const __m512i zero = _mm512_setzero_si512();
    __m512 acc = _mm512_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        __m512 acc_block = _mm512_setzero_ps();

        for (int k = 0; k < 4; k++) {
            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);

            // 32 int8 → 32 int16 in a zmm register
            __m256i y_8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
            __m512i y_16 = _mm512_cvtepi8_epi16(y_8);

            // Load 32 weight bits; negate y where bit=0 (weight = -1)
            uint32_t bits;
            memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits));
            __m512i signed_y = _mm512_mask_sub_epi16(y_16, (__mmask32)(~bits), zero, y_16);

            // Pair-wise sum int16→int32, convert to float, scale and accumulate
            __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16);
            __m512 sum_f = _mm512_cvtepi32_ps(sum_32);
            acc_block = _mm512_fmadd_ps(_mm512_set1_ps(d1), sum_f, acc_block);
        }

        acc = _mm512_fmadd_ps(_mm512_set1_ps(d0), acc_block, acc);
    }

    // Horizontal sum: 512 → 256 → 128 → scalar
    {
        __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0),
                                 _mm512_extractf32x8_ps(acc, 1));
        __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0),
                              _mm256_extractf128_ps(h, 1));
        q = _mm_add_ps(q, _mm_movehl_ps(q, q));
        q = _mm_add_ss(q, _mm_movehdup_ps(q));
        *s = _mm_cvtss_f32(q);
    }
#elif defined(__AVX2__)
    // AVX2: expand each 32-bit sign stream to 32 signed bytes, reduce two
    // Q8_0 sub-blocks in parallel, then fold the pair into the outer block sum.
    // Splitting the fixed 4-way inner loop into two independent accumulators
    // gives the core more scheduling freedom than one longer dependency chain.
    const __m256i ones_8 = _mm256_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        __m256 acc_block_0 = _mm256_setzero_ps();
        __m256 acc_block_1 = _mm256_setzero_ps();

        for (int k = 0; k < 4; k += 2) {
            const block_q8_0 * GGML_RESTRICT yb_0 = &y[ib * 4 + k + 0];
            const block_q8_0 * GGML_RESTRICT yb_1 = &y[ib * 4 + k + 1];
            const __m256i bit_mask_0 = bytes_from_bits_32(&x[ib].qs[(k + 0) * 4]);
            const __m256i bit_mask_1 = bytes_from_bits_32(&x[ib].qs[(k + 1) * 4]);
            const __m256i bit_value_0 = _mm256_and_si256(bit_mask_0, ones_8);
            const __m256i bit_value_1 = _mm256_and_si256(bit_mask_1, ones_8);
            const __m256i qx_0 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_0, bit_value_0), ones_8);
            const __m256i qx_1 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_1, bit_value_1), ones_8);
            const __m256i qy_0 = _mm256_loadu_si256((const __m256i *) yb_0->qs);
            const __m256i qy_1 = _mm256_loadu_si256((const __m256i *) yb_1->qs);
            const __m256 q_0 = mul_sum_i8_pairs_float(qx_0, qy_0);
            const __m256 q_1 = mul_sum_i8_pairs_float(qx_1, qy_1);
            const __m256 d1_0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_0->d));
            const __m256 d1_1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_1->d));

            acc_block_0 = _mm256_fmadd_ps(d1_0, q_0, acc_block_0);
            acc_block_1 = _mm256_fmadd_ps(d1_1, q_1, acc_block_1);
        }

        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), _mm256_add_ps(acc_block_0, acc_block_1), acc);
    }

    *s = hsum_float_8(acc);
#elif defined(__AVX__)
    // AVX: reuse the same 32-bit sign expansion, but do the byte-domain work
    // with two 128-bit halves before combining them into one 256-bit reduction.
    const __m128i ones_8 = _mm_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        __m256 acc_block = _mm256_setzero_ps();

        for (int k = 0; k < 4; ++k) {
            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
            const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[k * 4]);
            const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
            const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
            const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
            const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
            const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
            const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &yb->qs[0]);
            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &yb->qs[16]);
            const __m256i qx = MM256_SET_M128I(qx_1, qx_0);
            const __m256i qy = MM256_SET_M128I(qy_1, qy_0);
            const __m256 q = mul_sum_i8_pairs_float(qx, qy);

            acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(d1), q));
        }

        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
    }

    *s = hsum_float_8(acc);
#elif defined(__SSSE3__)
    // SSSE3: decode two 16-bit chunks into 16 signed bytes each, run the
    // pairwise int8 dot in 128-bit lanes, and accumulate directly to scalar.
    const __m128i ones_8 = _mm_set1_epi8(1);
    float sumf = 0.0f;

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);

        for (int k = 0; k < 4; ++k) {
            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);

            const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[k * 4 + 0]);
            const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[k * 4 + 2]);
            const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
            const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
            const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
            const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &yb->qs[0]);
            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &yb->qs[16]);
            const __m128i sum_0 = mul_sum_i8_pairs(qx_0, qy_0);
            const __m128i sum_1 = mul_sum_i8_pairs(qx_1, qy_1);

            sumf += d0 * d1 * hsum_i32_4(_mm_add_epi32(sum_0, sum_1));
        }
    }

    *s = sumf;
#else
    // Scalar fallback: keep the bitstream byte-oriented so each Q8_0 sub-block
    // becomes four straight-line 8-value accumulations with no per-element
    // divide/modulo/index arithmetic.
    float sumf = 0.0f;

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        float sumi = 0.0f;

        for (int k = 0; k < 4; k++) {
            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
            int sumi_block = 0;

            const uint8_t * GGML_RESTRICT bits = &x[ib].qs[k * 4];
            const int8_t * GGML_RESTRICT qy = yb->qs;

            for (int b = 0; b < 4; ++b, qy += 8) {
                const unsigned mask = bits[b];
                sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
                            + ((mask & 0x02) ? qy[1] : -qy[1])
                            + ((mask & 0x04) ? qy[2] : -qy[2])
                            + ((mask & 0x08) ? qy[3] : -qy[3])
                            + ((mask & 0x10) ? qy[4] : -qy[4])
                            + ((mask & 0x20) ? qy[5] : -qy[5])
                            + ((mask & 0x40) ? qy[6] : -qy[6])
                            + ((mask & 0x80) ? qy[7] : -qy[7]);
            }

            sumi += d1 * sumi_block;
        }

        sumf += d0 * sumi;
    }

    *s = sumf;
#endif
}

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
Expand Down
Loading