From 082e830bf364f73b33c0a208898cabe38560b1f9 Mon Sep 17 00:00:00 2001 From: jordankzf Date: Thu, 2 Apr 2026 01:04:24 +0800 Subject: [PATCH] fix: Q1_0_g128 CPU kernel - fix gibberish output and add AVX-512 SIMD The Q1_0_g128 vec_dot kernel had a bug where `sumi` was declared as `int` but accumulated `float` partial products (`d1 * sumi_block`), causing float-to-int truncation that destroyed dot product results and produced gibberish output on CPU. Additionally, the x86 kernel was purely scalar (one bit at a time). This adds an AVX-512BW path that processes 32 elements per iteration using mask_sub + madd + fma, with a single horizontal reduction at the end. Benchmarks (Bonsai-8B, CPU-only, AVX-512): Before: 0.73 t/s prompt, 0.65 t/s generation (gibberish output) After: 23.2 t/s prompt, 13.5 t/s generation (coherent output) --- ggml/src/ggml-cpu/arch/x86/quants.c | 80 ++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index e4130ef22f9..a2ae35c87e1 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -545,7 +545,85 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - ggml_vec_dot_q1_0_g128_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + const int qk = QK1_0_g128; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q1_0_g128 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__AVX512BW__) + const __m512i ones_16 = _mm512_set1_epi16(1); + const __m512i zero = _mm512_setzero_si512(); + __m512 acc = _mm512_setzero_ps(); + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + __m512 acc_block = _mm512_setzero_ps(); + + for (int k = 0; k < 4; k++) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + + __m256i y_8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + __m512i y_16 = _mm512_cvtepi8_epi16(y_8); + + uint32_t bits; + memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits)); + __m512i signed_y = _mm512_mask_sub_epi16(y_16, (__mmask32)(~bits), zero, y_16); + + __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16); + __m512 sum_f = _mm512_cvtepi32_ps(sum_32); + acc_block = _mm512_fmadd_ps(_mm512_set1_ps(d1), sum_f, acc_block); + } + + acc = _mm512_fmadd_ps(_mm512_set1_ps(d0), acc_block, acc); + } + + { + __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0), + _mm512_extractf32x8_ps(acc, 1)); + __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0), + _mm256_extractf128_ps(h, 1)); + q = _mm_add_ps(q, _mm_movehl_ps(q, q)); + q = _mm_add_ss(q, _mm_movehdup_ps(q)); + *s = _mm_cvtss_f32(q); + } +#else + float sumf = 0.0f; + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + float sumi = 0.0f; + + for (int k = 0; k < 4; k++) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + int sumi_block = 0; + + for (int j = 0; j < QK8_0; j++) { + const int bit_index = k * QK8_0 + j; + const int byte_index = bit_index / 8; + const int bit_offset = bit_index % 8; + + const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1; + const int yi = y[ib*4 + k].qs[j]; + + sumi_block += xi * yi; + } + + sumi += d1 * sumi_block; + } + + sumf += d0 * sumi; + } + + *s = sumf; +#endif } void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {