// Dot product of one row of Q1_0_g128 (1-bit weights, 128-element groups) with
// Q8_0 (8-bit, 32-element blocks) activations: *s = sum_i w_i * a_i, where each
// stored weight bit decodes to 2*bit - 1 (i.e. +1 / -1). One Q1_0_g128 block
// (fp16 scale x[ib].d) pairs with four Q8_0 blocks (fp16 scales y[ib*4+k].d,
// 32 int8 values each).
//
//  n          - number of elements; must be a multiple of QK1_0_g128
//  s          - output: the scalar dot product
//  bs, bx, by - stride arguments, unused in this single-row path
//  vx         - Q1_0_g128 quantized weights (block_q1_0_g128 array)
//  vy         - Q8_0 quantized activations (block_q8_0 array)
//  nrc        - number of result columns; only 1 is supported (asserted)
void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK1_0_g128;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q1_0_g128 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    float sumf = 0.0f;

#if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__)
    // AVX-512 VNNI path: mask registers for bit expansion + VNNI dot product.
    // Accumulate into a float vector, single hsum at the end.
    const __m256i ones_u8 = _mm256_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);

        for (int k = 0; k < 4; k++) {
            // Load 32 bits of weights using alias-safe unaligned load
            uint32_t bmask_u32;
            memcpy(&bmask_u32, x[ib].qs + k * 4, sizeof(bmask_u32));
            const __mmask32 bmask = (__mmask32)bmask_u32;

            // Load 32 int8 activations
            const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);

            // Sum ALL q8 values using VNNI (groups of 4 int8 -> int32).
            // dpbusd multiplies unsigned(1) by signed(q8), so this is a plain
            // signed-byte sum per 32-bit lane.
            const __m256i sum_all = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, q8);

            // Zero out q8 where bit=0, keep where bit=1 (single instruction)
            const __m256i masked_q8 = _mm256_maskz_mov_epi8(bmask, q8);

            // Sum MASKED q8 values using VNNI
            const __m256i sum_masked = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, masked_q8);

            // dot = 2 * sum_masked - sum_all
            // (weight = 2*bit - 1, so dot = sum((2*bit-1)*q8) = 2*sum(q8 where bit=1) - sum(q8))
            const __m256i dp = _mm256_sub_epi32(_mm256_slli_epi32(sum_masked, 1), sum_all);

            // Scale by d0*d1 and accumulate into the float accumulator
            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
            acc = _mm256_fmadd_ps(_mm256_set1_ps(d0 * d1), _mm256_cvtepi32_ps(dp), acc);
        }
    }

    sumf = hsum_float_8(acc);

#elif defined(__AVX2__)
    // AVX2 path: shuffle-based bit expansion + mul_sum_i8_pairs_float.
    // Uses llama.cpp's optimized helper (auto-selects AVXVNNI dpbssd when available).
    // shuf replicates each of the 4 weight bytes into 8 consecutive lanes; bmask
    // (bytes 0x01..0x80, little-endian) then isolates bit j in lane j of each
    // 8-lane group — same bit order as the scalar (bits[j>>3] >> (j&7)) & 1.
    const __m256i shuf = _mm256_setr_epi8(
        0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
    const __m256i bmask = _mm256_set1_epi64x(0x8040201008040201LL);
    const __m256i ones8 = _mm256_set1_epi8(1);
    const __m256i neg8 = _mm256_set1_epi8(-1);

    __m256 acc = _mm256_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);

        for (int k = 0; k < 4; k++) {
            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
            const __m256 d_scale = _mm256_set1_ps(d0 * d1);

            // Broadcast 4 bytes of 1-bit weights using alias-safe load
            int32_t bits_i32;
            memcpy(&bits_i32, x[ib].qs + k * 4, sizeof(bits_i32));
            __m256i vb = _mm256_set1_epi32(bits_i32);
            __m256i ex = _mm256_shuffle_epi8(vb, shuf);
            ex = _mm256_cmpeq_epi8(_mm256_and_si256(ex, bmask), bmask);

            // Convert the 0x00/0xFF byte mask to +1/-1 weights
            const __m256i xi = _mm256_blendv_epi8(neg8, ones8, ex);

            // Load 32 int8 activations
            const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);

            // Dot product + float conversion via optimized helper
            // (auto-uses AVXVNNI dpbssd on supported CPUs)
            const __m256 p = mul_sum_i8_pairs_float(xi, q8);

            // Accumulate scaled result
            acc = _mm256_fmadd_ps(d_scale, p, acc);
        }
    }

    sumf = hsum_float_8(acc);

#else
    // Scalar fallback — delegates to generic implementation
    ggml_vec_dot_q1_0_g128_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
    return;
#endif

    *s = sumf;
}

void
ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 00af7e2ddc6..7c51b917d44 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -176,35 +176,26 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t const block_q8_0 * GGML_RESTRICT y = vy; - float sumf = 0.0; - - // Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements - // So we need 4 Q8_0 blocks per Q1_0_g128 block + float sumf = 0.0f; + for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); - - float sumi = 0.0f; for (int k = 0; k < 4; k++) { const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d); + const uint8_t * bits = x[i].qs + k * 4; + const int8_t * q8 = y[i*4 + k].qs; - int sumi_block = 0; - + int sumi = 0; for (int j = 0; j < QK8_0; j++) { - const int bit_index = k * QK8_0 + j; - const int byte_index = bit_index / 8; - const int bit_offset = bit_index % 8; - - const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1; - sumi_block += xi * y[i*4 + k].qs[j]; + const int bit = (bits[j >> 3] >> (j & 7)) & 1; + sumi += (2*bit - 1) * q8[j]; } - sumi += d1 * sumi_block; + sumf += d0 * d1 * (float)sumi; } - - sumf += d0 * sumi; } - + *s = sumf; }