diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index e4130ef22f9..8793aff4a71 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -274,6 +274,25 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const } #endif #elif defined(__SSSE3__) +static inline int hsum_i32_4(const __m128i a) { + const __m128i hi64 = _mm_unpackhi_epi64(a, a); + const __m128i sum64 = _mm_add_epi32(hi64, a); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} + +static inline __m128i bytes_from_bits_16(const uint8_t * x) { + uint16_t x16; + memcpy(&x16, x, sizeof(uint16_t)); + + const __m128i shuf_mask = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + __m128i bytes = _mm_shuffle_epi8(_mm_set1_epi16((short) x16), shuf_mask); + const __m128i bit_mask = _mm_set_epi64x(0x7fbfdfeff7fbfdfe, 0x7fbfdfeff7fbfdfe); + bytes = _mm_or_si128(bytes, bit_mask); + + return _mm_cmpeq_epi8(bytes, _mm_set1_epi64x(-1)); +} + // horizontally add 4x4 floats static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) { __m128 res_0 =_mm_hadd_ps(a, b); @@ -541,11 +560,375 @@ static inline __m128i get_scale_shuffle(int i) { #endif void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q1_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0; + +#if defined(__AVX512BW__) + // AVX-512BW: widen one full Q8_0 block to int16, apply the 32 sign bits + // directly to y, reduce pairs with madd, then scale and accumulate in fp32. + const __m512i ones_16 = _mm512_set1_epi16(1); + const __m512i zero = _mm512_setzero_si512(); + __m512 acc = _mm512_setzero_ps(); + + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d); + const __m256i qy_8 = _mm256_loadu_si256((const __m256i *) y[ib].qs); + const __m512i qy_16 = _mm512_cvtepi8_epi16(qy_8); + uint32_t bits; + memcpy(&bits, x[ib].qs, sizeof(bits)); + const __m512i signed_y = _mm512_mask_sub_epi16(qy_16, (__mmask32)(~bits), zero, qy_16); + const __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16); + + acc = _mm512_fmadd_ps(_mm512_set1_ps(d), _mm512_cvtepi32_ps(sum_32), acc); + } + + { + const __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0), + _mm512_extractf32x8_ps(acc, 1)); + __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0), + _mm256_extractf128_ps(h, 1)); + q = _mm_add_ps(q, _mm_movehl_ps(q, q)); + q = _mm_add_ss(q, _mm_movehdup_ps(q)); + sumf = _mm_cvtss_f32(q); + } +#elif defined(__AVX2__) + // AVX2: expand the 32 packed sign bits to 32 signed bytes, run one byte-dot + // against the full Q8_0 block, and accumulate the scaled fp32 reduction. + __m256 acc = _mm256_setzero_ps(); + + for (; ib < nb; ++ib) { + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); + const __m256i bit_mask = bytes_from_bits_32(x[ib].qs); + const __m256i bit_value = _mm256_and_si256(bit_mask, _mm256_set1_epi8(1)); + const __m256i qx = _mm256_sub_epi8(_mm256_add_epi8(bit_value, bit_value), _mm256_set1_epi8(1)); + const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + acc = _mm256_fmadd_ps(d, q, acc); + } + + sumf = hsum_float_8(acc); +#elif defined(__AVX__) + // AVX: keep the same 32-bit sign expansion, but process the byte-domain dot + // as two 128-bit halves before combining them into one 256-bit reduction. + const __m128i ones_8 = _mm_set1_epi8(1); + __m256 acc = _mm256_setzero_ps(); + + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d); + const __m256i bit_mask = bytes_from_bits_32(x[ib].qs); + const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask); + const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1); + const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8); + const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8); + const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8); + const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8); + const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]); + const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]); + const __m256i qx = MM256_SET_M128I(qx_1, qx_0); + const __m256i qy = MM256_SET_M128I(qy_1, qy_0); + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d), q)); + } + + sumf = hsum_float_8(acc); +#elif defined(__SSSE3__) + // SSSE3: decode the two 16-bit bit chunks into two 16-byte sign vectors, + // perform the pairwise int8 dot in 128-bit lanes, and accumulate to scalar. + const __m128i ones_8 = _mm_set1_epi8(1); + + for (; ib < nb; ++ib) { + const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d); + const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[0]); + const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[2]); + const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8); + const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8); + const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8); + const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8); + const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]); + const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]); + const __m128i sum_0 = mul_sum_i8_pairs(qx_0, qy_0); + const __m128i sum_1 = mul_sum_i8_pairs(qx_1, qy_1); + + sumf += d * hsum_i32_4(_mm_add_epi32(sum_0, sum_1)); + } +#endif + + // Scalar fallback: stay byte-oriented so the packed sign bits feed four + // straight-line 8-value accumulations without per-element bit math. + for (; ib < nb; ++ib) { + const uint8_t * GGML_RESTRICT bits = x[ib].qs; + const int8_t * GGML_RESTRICT qy = y[ib].qs; + int sumi = 0; + + for (int b = 0; b < 4; ++b, qy += 8) { + const unsigned mask = bits[b]; + sumi += ((mask & 0x01) ? qy[0] : -qy[0]) + + ((mask & 0x02) ? qy[1] : -qy[1]) + + ((mask & 0x04) ? qy[2] : -qy[2]) + + ((mask & 0x08) ? qy[3] : -qy[3]) + + ((mask & 0x10) ? qy[4] : -qy[4]) + + ((mask & 0x20) ? qy[5] : -qy[5]) + + ((mask & 0x40) ? qy[6] : -qy[6]) + + ((mask & 0x80) ? qy[7] : -qy[7]); + } + + sumf += sumi * GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d); + } + + *s = sumf; } void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - ggml_vec_dot_q1_0_g128_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + const int qk = QK1_0_g128; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q1_0_g128 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + +#if defined(__AVX512BW__) + // AVX-512BW: sign-extend int8→int16, mask-negate, madd→fma pipeline. + // The inner loop over 4 Q8_0 sub-blocks accumulates into acc_block, + // then a single FMA folds into the outer acc — this structure lets the + // CPU start the next outer iteration while the final FMA is still in flight. + const __m512i ones_16 = _mm512_set1_epi16(1); + const __m512i zero = _mm512_setzero_si512(); + __m512 acc = _mm512_setzero_ps(); + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + __m512 acc_block = _mm512_setzero_ps(); + + for (int k = 0; k < 4; k++) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + + // 32 int8 → 32 int16 in a zmm register + __m256i y_8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + __m512i y_16 = _mm512_cvtepi8_epi16(y_8); + + // Load 32 weight bits; negate y where bit=0 (weight = -1) + uint32_t bits; + memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits)); + __m512i signed_y = _mm512_mask_sub_epi16(y_16, (__mmask32)(~bits), zero, y_16); + + // Pair-wise sum int16→int32, convert to float, scale and accumulate + __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16); + __m512 sum_f = _mm512_cvtepi32_ps(sum_32); + acc_block = _mm512_fmadd_ps(_mm512_set1_ps(d1), sum_f, acc_block); + } + + acc = _mm512_fmadd_ps(_mm512_set1_ps(d0), acc_block, acc); + } + + // Horizontal sum: 512 → 256 → 128 → scalar + { + __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0), + _mm512_extractf32x8_ps(acc, 1)); + __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0), + _mm256_extractf128_ps(h, 1)); + q = _mm_add_ps(q, _mm_movehl_ps(q, q)); + q = _mm_add_ss(q, _mm_movehdup_ps(q)); + *s = _mm_cvtss_f32(q); + } +#elif defined(__AVX2__) + // AVX2: expand each 32-bit sign stream to 32 signed bytes, reduce two + // Q8_0 sub-blocks in parallel, then fold the pair into the outer block sum. + // Splitting the fixed 4-way inner loop into two independent accumulators + // gives the core more scheduling freedom than one longer dependency chain. + const __m256i ones_8 = _mm256_set1_epi8(1); + __m256 acc = _mm256_setzero_ps(); + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + __m256 acc_block_0 = _mm256_setzero_ps(); + __m256 acc_block_1 = _mm256_setzero_ps(); + + for (int k = 0; k < 4; k += 2) { + const block_q8_0 * GGML_RESTRICT yb_0 = &y[ib * 4 + k + 0]; + const block_q8_0 * GGML_RESTRICT yb_1 = &y[ib * 4 + k + 1]; + const __m256i bit_mask_0 = bytes_from_bits_32(&x[ib].qs[(k + 0) * 4]); + const __m256i bit_mask_1 = bytes_from_bits_32(&x[ib].qs[(k + 1) * 4]); + const __m256i bit_value_0 = _mm256_and_si256(bit_mask_0, ones_8); + const __m256i bit_value_1 = _mm256_and_si256(bit_mask_1, ones_8); + const __m256i qx_0 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_0, bit_value_0), ones_8); + const __m256i qx_1 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_1, bit_value_1), ones_8); + const __m256i qy_0 = _mm256_loadu_si256((const __m256i *) yb_0->qs); + const __m256i qy_1 = _mm256_loadu_si256((const __m256i *) yb_1->qs); + const __m256 q_0 = mul_sum_i8_pairs_float(qx_0, qy_0); + const __m256 q_1 = mul_sum_i8_pairs_float(qx_1, qy_1); + const __m256 d1_0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_0->d)); + const __m256 d1_1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_1->d)); + + acc_block_0 = _mm256_fmadd_ps(d1_0, q_0, acc_block_0); + acc_block_1 = _mm256_fmadd_ps(d1_1, q_1, acc_block_1); + } + + acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), _mm256_add_ps(acc_block_0, acc_block_1), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // AVX: reuse the same 32-bit sign expansion, but do the byte-domain work + // with two 128-bit halves before combining them into one 256-bit reduction. + const __m128i ones_8 = _mm_set1_epi8(1); + __m256 acc = _mm256_setzero_ps(); + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + __m256 acc_block = _mm256_setzero_ps(); + + for (int k = 0; k < 4; ++k) { + const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k]; + const float d1 = GGML_CPU_FP16_TO_FP32(yb->d); + const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[k * 4]); + const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask); + const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1); + const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8); + const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8); + const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8); + const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8); + const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &yb->qs[0]); + const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &yb->qs[16]); + const __m256i qx = MM256_SET_M128I(qx_1, qx_0); + const __m256i qy = MM256_SET_M128I(qy_1, qy_0); + const __m256 q = mul_sum_i8_pairs_float(qx, qy); + + acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(d1), q)); + } + + acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block)); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // SSSE3: decode each 32-bit sign stream into two 16-byte sign vectors, + // keep one fp32 accumulator per Q8_0 sub-block, then reduce once at end. + const __m128i ones_8 = _mm_set1_epi8(1); + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + for (int ib = 0; ib < nb; ++ib) { + const __m128 d0 = _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); + const block_q8_0 * GGML_RESTRICT yb_0 = &y[ib * 4 + 0]; + const block_q8_0 * GGML_RESTRICT yb_1 = &y[ib * 4 + 1]; + const block_q8_0 * GGML_RESTRICT yb_2 = &y[ib * 4 + 2]; + const block_q8_0 * GGML_RESTRICT yb_3 = &y[ib * 4 + 3]; + + const __m128i bit_mask_0_0 = bytes_from_bits_16(&x[ib].qs[0]); + const __m128i bit_mask_0_1 = bytes_from_bits_16(&x[ib].qs[2]); + const __m128i bit_value_0_0 = _mm_and_si128(bit_mask_0_0, ones_8); + const __m128i bit_value_0_1 = _mm_and_si128(bit_mask_0_1, ones_8); + const __m128i qx_0_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0_0, bit_value_0_0), ones_8); + const __m128i qx_0_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_0_1, bit_value_0_1), ones_8); + const __m128i qy_0_0 = _mm_loadu_si128((const __m128i *) &yb_0->qs[0]); + const __m128i qy_0_1 = _mm_loadu_si128((const __m128i *) &yb_0->qs[16]); + const __m128i sum_0_0 = mul_sum_i8_pairs(qx_0_0, qy_0_0); + const __m128i sum_0_1 = mul_sum_i8_pairs(qx_0_1, qy_0_1); + const __m128 q_0 = _mm_cvtepi32_ps(_mm_add_epi32(sum_0_0, sum_0_1)); + acc_0 = _mm_add_ps(acc_0, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_0->d))), q_0)); + + const __m128i bit_mask_1_0 = bytes_from_bits_16(&x[ib].qs[4]); + const __m128i bit_mask_1_1 = bytes_from_bits_16(&x[ib].qs[6]); + const __m128i bit_value_1_0 = _mm_and_si128(bit_mask_1_0, ones_8); + const __m128i bit_value_1_1 = _mm_and_si128(bit_mask_1_1, ones_8); + const __m128i qx_1_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_1_0, bit_value_1_0), ones_8); + const __m128i qx_1_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1_1, bit_value_1_1), ones_8); + const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *) &yb_1->qs[0]); + const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *) &yb_1->qs[16]); + const __m128i sum_1_0 = mul_sum_i8_pairs(qx_1_0, qy_1_0); + const __m128i sum_1_1 = mul_sum_i8_pairs(qx_1_1, qy_1_1); + const __m128 q_1 = _mm_cvtepi32_ps(_mm_add_epi32(sum_1_0, sum_1_1)); + acc_1 = _mm_add_ps(acc_1, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_1->d))), q_1)); + + const __m128i bit_mask_2_0 = bytes_from_bits_16(&x[ib].qs[8]); + const __m128i bit_mask_2_1 = bytes_from_bits_16(&x[ib].qs[10]); + const __m128i bit_value_2_0 = _mm_and_si128(bit_mask_2_0, ones_8); + const __m128i bit_value_2_1 = _mm_and_si128(bit_mask_2_1, ones_8); + const __m128i qx_2_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_2_0, bit_value_2_0), ones_8); + const __m128i qx_2_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_2_1, bit_value_2_1), ones_8); + const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *) &yb_2->qs[0]); + const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *) &yb_2->qs[16]); + const __m128i sum_2_0 = mul_sum_i8_pairs(qx_2_0, qy_2_0); + const __m128i sum_2_1 = mul_sum_i8_pairs(qx_2_1, qy_2_1); + const __m128 q_2 = _mm_cvtepi32_ps(_mm_add_epi32(sum_2_0, sum_2_1)); + acc_2 = _mm_add_ps(acc_2, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_2->d))), q_2)); + + const __m128i bit_mask_3_0 = bytes_from_bits_16(&x[ib].qs[12]); + const __m128i bit_mask_3_1 = bytes_from_bits_16(&x[ib].qs[14]); + const __m128i bit_value_3_0 = _mm_and_si128(bit_mask_3_0, ones_8); + const __m128i bit_value_3_1 = _mm_and_si128(bit_mask_3_1, ones_8); + const __m128i qx_3_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_3_0, bit_value_3_0), ones_8); + const __m128i qx_3_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_3_1, bit_value_3_1), ones_8); + const __m128i qy_3_0 = _mm_loadu_si128((const __m128i *) &yb_3->qs[0]); + const __m128i qy_3_1 = _mm_loadu_si128((const __m128i *) &yb_3->qs[16]); + const __m128i sum_3_0 = mul_sum_i8_pairs(qx_3_0, qy_3_0); + const __m128i sum_3_1 = mul_sum_i8_pairs(qx_3_1, qy_3_1); + const __m128 q_3 = _mm_cvtepi32_ps(_mm_add_epi32(sum_3_0, sum_3_1)); + acc_3 = _mm_add_ps(acc_3, _mm_mul_ps(_mm_mul_ps(d0, _mm_set1_ps(GGML_CPU_FP16_TO_FP32(yb_3->d))), q_3)); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#else + // Scalar fallback: keep the bitstream byte-oriented so each Q8_0 sub-block + // becomes four straight-line 8-value accumulations with no per-element + // divide/modulo/index arithmetic. + float sumf = 0.0f; + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + float sumi = 0.0f; + + for (int k = 0; k < 4; k++) { + const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k]; + const float d1 = GGML_CPU_FP16_TO_FP32(yb->d); + int sumi_block = 0; + + const uint8_t * GGML_RESTRICT bits = &x[ib].qs[k * 4]; + const int8_t * GGML_RESTRICT qy = yb->qs; + + for (int b = 0; b < 4; ++b, qy += 8) { + const unsigned mask = bits[b]; + sumi_block += ((mask & 0x01) ? qy[0] : -qy[0]) + + ((mask & 0x02) ? qy[1] : -qy[1]) + + ((mask & 0x04) ? qy[2] : -qy[2]) + + ((mask & 0x08) ? qy[3] : -qy[3]) + + ((mask & 0x10) ? qy[4] : -qy[4]) + + ((mask & 0x20) ? qy[5] : -qy[5]) + + ((mask & 0x40) ? qy[6] : -qy[6]) + + ((mask & 0x80) ? qy[7] : -qy[7]); + } + + sumi += d1 * sumi_block; + } + + sumf += d0 * sumi; + } + + *s = sumf; +#endif } void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 00af7e2ddc6..790eafda05a 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -137,25 +137,29 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c float sumf = 0.0; + // Generic fallback: keep the sign stream byte-oriented and process the + // 32-value Q8_0 block as four explicit 8-value groups. This keeps the + // portable structure while avoiding per-element bit-index arithmetic. for (int i = 0; i < nb; i++) { - const float d0 = GGML_FP16_TO_FP32(x[i].d); - const float d1 = GGML_FP16_TO_FP32(y[i].d); - + const float d = GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d); int sumi = 0; - - for (int j = 0; j < QK1_0; j++) { - const int bit_index = j; - const int byte_index = bit_index / 8; - const int bit_offset = bit_index % 8; - - // Extract bit: 1 = +1, 0 = -1 - const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1; - const int yi = y[i].qs[j]; - - sumi += xi * yi; + + const uint8_t * GGML_RESTRICT bits = x[i].qs; + const int8_t * GGML_RESTRICT qy = y[i].qs; + + for (int b = 0; b < 4; ++b, qy += 8) { + const unsigned mask = bits[b]; + sumi += ((mask & 0x01) ? qy[0] : -qy[0]) + + ((mask & 0x02) ? qy[1] : -qy[1]) + + ((mask & 0x04) ? qy[2] : -qy[2]) + + ((mask & 0x08) ? qy[3] : -qy[3]) + + ((mask & 0x10) ? qy[4] : -qy[4]) + + ((mask & 0x20) ? qy[5] : -qy[5]) + + ((mask & 0x40) ? qy[6] : -qy[6]) + + ((mask & 0x80) ? qy[7] : -qy[7]); } - - sumf += d0 * d1 * sumi; + + sumf += d * sumi; } *s = sumf; @@ -178,25 +182,33 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t float sumf = 0.0; - // Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements - // So we need 4 Q8_0 blocks per Q1_0_g128 block + // Generic fallback: keep the packed sign bits byte-oriented and process + // one Q8_0 sub-block as four explicit 8-value groups. This preserves the + // portable structure while avoiding per-element bit-index arithmetic. for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); float sumi = 0.0f; + // Process 4 Q8_0 blocks (4 * 32 = 128 elements) for (int k = 0; k < 4; k++) { - const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d); - + const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k]; + const float d1 = GGML_FP16_TO_FP32(yb->d); int sumi_block = 0; - for (int j = 0; j < QK8_0; j++) { - const int bit_index = k * QK8_0 + j; - const int byte_index = bit_index / 8; - const int bit_offset = bit_index % 8; - - const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1; - sumi_block += xi * y[i*4 + k].qs[j]; + const uint8_t * GGML_RESTRICT bits = &x[i].qs[k * 4]; + const int8_t * GGML_RESTRICT qy = yb->qs; + + for (int b = 0; b < 4; ++b, qy += 8) { + const unsigned mask = bits[b]; + sumi_block += ((mask & 0x01) ? qy[0] : -qy[0]) + + ((mask & 0x02) ? qy[1] : -qy[1]) + + ((mask & 0x04) ? qy[2] : -qy[2]) + + ((mask & 0x08) ? qy[3] : -qy[3]) + + ((mask & 0x10) ? qy[4] : -qy[4]) + + ((mask & 0x20) ? qy[5] : -qy[5]) + + ((mask & 0x40) ? qy[6] : -qy[6]) + + ((mask & 0x80) ? qy[7] : -qy[7]); } sumi += d1 * sumi_block;