Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
346 changes: 344 additions & 2 deletions ggml/src/ggml-cpu/arch/x86/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,25 @@ static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const
}
#endif
#elif defined(__SSSE3__)
// Horizontally add the four int32 lanes of a and return the scalar sum.
static inline int hsum_i32_4(const __m128i a) {
    // fold the high lane pair onto the low pair, then fold the last two lanes
    __m128i sum = _mm_add_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_cvtsi128_si32(sum);
}

// Expand 16 packed bits into 16 byte lanes: lane i is 0xFF when bit (i % 8)
// of byte (i / 8) is set, 0x00 otherwise.
static inline __m128i bytes_from_bits_16(const uint8_t * x) {
    const char lo = (char) x[0];
    const char hi = (char) x[1];

    // broadcast byte 0 into lanes 0-7 and byte 1 into lanes 8-15
    const __m128i repeated = _mm_set_epi8(hi, hi, hi, hi, hi, hi, hi, hi,
                                          lo, lo, lo, lo, lo, lo, lo, lo);

    // lane i isolates bit (i % 8); the masked lane equals its bit position
    // exactly when that bit was set in the input
    const __m128i bit_pos = _mm_set1_epi64x((long long) 0x8040201008040201ULL);
    return _mm_cmpeq_epi8(_mm_and_si128(repeated, bit_pos), bit_pos);
}

// horizontally add 4x4 floats
static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
__m128 res_0 =_mm_hadd_ps(a, b);
Expand Down Expand Up @@ -541,11 +560,334 @@ static inline __m128i get_scale_shuffle(int i) {
#endif

// Dot product of one Q1_0 row against one Q8_0 row over n values, processed
// in blocks of QK8_0 = 32. Each Q1_0 block carries 32 packed sign bits
// (bit set => +1, bit clear => -1) in qs plus one fp16 scale d; each Q8_0
// block carries 32 int8 values plus one fp16 scale. Writes the result to *s.
// bs/bx/by/nrc follow the shared vec_dot signature; only nrc == 1 is handled.
//
// NOTE: the previous version first called ggml_vec_dot_q1_0_q8_0_generic(),
// whose result was then fully recomputed and overwritten below — that
// redundant call (doubling the work) has been removed.
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q1_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__AVX512BW__)
    // AVX-512BW: widen one full Q8_0 block to int16, apply the 32 sign bits
    // directly to y, reduce pairs with madd, then scale and accumulate in fp32.
    const __m512i ones_16 = _mm512_set1_epi16(1);
    const __m512i zero = _mm512_setzero_si512();
    __m512 acc = _mm512_setzero_ps();

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
        const __m256i qy_8 = _mm256_loadu_si256((const __m256i *) y[ib].qs);
        const __m512i qy_16 = _mm512_cvtepi8_epi16(qy_8);
        uint32_t bits;
        memcpy(&bits, x[ib].qs, sizeof(bits));
        // mask bit clear (weight -1) selects 0 - y; mask bit set keeps y
        const __m512i signed_y = _mm512_mask_sub_epi16(qy_16, (__mmask32)(~bits), zero, qy_16);
        const __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16);

        acc = _mm512_fmadd_ps(_mm512_set1_ps(d), _mm512_cvtepi32_ps(sum_32), acc);
    }

    // Horizontal sum: 512 -> 256 -> 128 -> scalar
    {
        const __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0),
                                       _mm512_extractf32x8_ps(acc, 1));
        __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0),
                              _mm256_extractf128_ps(h, 1));
        q = _mm_add_ps(q, _mm_movehl_ps(q, q));
        q = _mm_add_ss(q, _mm_movehdup_ps(q));
        sumf = _mm_cvtss_f32(q);
    }
#elif defined(__AVX2__)
    // AVX2: expand the 32 packed sign bits to 32 signed bytes, run one byte-dot
    // against the full Q8_0 block, and accumulate the scaled fp32 reduction.
    __m256 acc = _mm256_setzero_ps();

    for (; ib < nb; ++ib) {
        const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
        const __m256i bit_mask  = bytes_from_bits_32(x[ib].qs);
        // 0x00/0xFF mask -> 0/1 -> -1/+1 signed bytes
        const __m256i bit_value = _mm256_and_si256(bit_mask, _mm256_set1_epi8(1));
        const __m256i qx = _mm256_sub_epi8(_mm256_add_epi8(bit_value, bit_value), _mm256_set1_epi8(1));
        const __m256i qy = _mm256_loadu_si256((const __m256i *) y[ib].qs);
        const __m256 q = mul_sum_i8_pairs_float(qx, qy);

        acc = _mm256_fmadd_ps(d, q, acc);
    }

    sumf = hsum_float_8(acc);
#elif defined(__AVX__)
    // AVX: keep the same 32-bit sign expansion, but process the byte-domain dot
    // as two 128-bit halves before combining them into one 256-bit reduction.
    const __m128i ones_8 = _mm_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
        const __m256i bit_mask = bytes_from_bits_32(x[ib].qs);
        const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
        const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
        const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
        const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
        const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
        const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
        const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]);
        const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]);
        const __m256i qx = MM256_SET_M128I(qx_1, qx_0);
        const __m256i qy = MM256_SET_M128I(qy_1, qy_0);
        const __m256 q = mul_sum_i8_pairs_float(qx, qy);

        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d), q));
    }

    sumf = hsum_float_8(acc);
#elif defined(__SSSE3__)
    // SSSE3: decode the two 16-bit bit chunks into two 16-byte sign vectors,
    // perform the pairwise int8 dot in 128-bit lanes, and accumulate to scalar.
    const __m128i ones_8 = _mm_set1_epi8(1);

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
        const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[0]);
        const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[2]);
        const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
        const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
        const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
        const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
        const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &y[ib].qs[0]);
        const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &y[ib].qs[16]);
        const __m128i sum_0 = mul_sum_i8_pairs(qx_0, qy_0);
        const __m128i sum_1 = mul_sum_i8_pairs(qx_1, qy_1);

        sumf += d * hsum_i32_4(_mm_add_epi32(sum_0, sum_1));
    }
#endif

    // Scalar fallback: stay byte-oriented so the packed sign bits feed four
    // straight-line 8-value accumulations without per-element bit math.
    // (When a SIMD path above ran, ib == nb and this loop is skipped.)
    for (; ib < nb; ++ib) {
        const uint8_t * GGML_RESTRICT bits = x[ib].qs;
        const int8_t  * GGML_RESTRICT qy   = y[ib].qs;
        int sumi = 0;

        for (int b = 0; b < 4; ++b, qy += 8) {
            const unsigned mask = bits[b];
            sumi += ((mask & 0x01) ? qy[0] : -qy[0])
                  + ((mask & 0x02) ? qy[1] : -qy[1])
                  + ((mask & 0x04) ? qy[2] : -qy[2])
                  + ((mask & 0x08) ? qy[3] : -qy[3])
                  + ((mask & 0x10) ? qy[4] : -qy[4])
                  + ((mask & 0x20) ? qy[5] : -qy[5])
                  + ((mask & 0x40) ? qy[6] : -qy[6])
                  + ((mask & 0x80) ? qy[7] : -qy[7]);
        }

        sumf += sumi * GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}

// Dot product of one Q1_0 (group size 128) row against one Q8_0 row over n
// values, processed in blocks of QK1_0_g128 = 128. Each Q1_0_g128 block
// carries one fp16 scale d plus 128 packed sign bits (bit set => +1, clear
// => -1); it lines up with four consecutive Q8_0 sub-blocks of 32 int8
// values, each with its own fp16 scale. Writes the result to *s.
// bs/bx/by/nrc follow the shared vec_dot signature; only nrc == 1 is handled.
//
// NOTE: the previous version first called ggml_vec_dot_q1_0_g128_q8_0_generic(),
// whose result was then fully recomputed and overwritten below — that
// redundant call (doubling the work) has been removed.
void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK1_0_g128;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q1_0_g128 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__AVX512BW__)
    // AVX-512BW: sign-extend int8→int16, mask-negate, madd→fma pipeline.
    // The inner loop over 4 Q8_0 sub-blocks accumulates into acc_block,
    // then a single FMA folds into the outer acc — this structure lets the
    // CPU start the next outer iteration while the final FMA is still in flight.
    const __m512i ones_16 = _mm512_set1_epi16(1);
    const __m512i zero = _mm512_setzero_si512();
    __m512 acc = _mm512_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        __m512 acc_block = _mm512_setzero_ps();

        for (int k = 0; k < 4; k++) {
            const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);

            // 32 int8 → 32 int16 in a zmm register
            __m256i y_8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);
            __m512i y_16 = _mm512_cvtepi8_epi16(y_8);

            // Load 32 weight bits; negate y where bit=0 (weight = -1)
            uint32_t bits;
            memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits));
            __m512i signed_y = _mm512_mask_sub_epi16(y_16, (__mmask32)(~bits), zero, y_16);

            // Pair-wise sum int16→int32, convert to float, scale and accumulate
            __m512i sum_32 = _mm512_madd_epi16(signed_y, ones_16);
            __m512 sum_f = _mm512_cvtepi32_ps(sum_32);
            acc_block = _mm512_fmadd_ps(_mm512_set1_ps(d1), sum_f, acc_block);
        }

        acc = _mm512_fmadd_ps(_mm512_set1_ps(d0), acc_block, acc);
    }

    // Horizontal sum: 512 → 256 → 128 → scalar
    {
        __m256 h = _mm256_add_ps(_mm512_extractf32x8_ps(acc, 0),
                                 _mm512_extractf32x8_ps(acc, 1));
        __m128 q = _mm_add_ps(_mm256_extractf128_ps(h, 0),
                              _mm256_extractf128_ps(h, 1));
        q = _mm_add_ps(q, _mm_movehl_ps(q, q));
        q = _mm_add_ss(q, _mm_movehdup_ps(q));
        *s = _mm_cvtss_f32(q);
    }
#elif defined(__AVX2__)
    // AVX2: expand each 32-bit sign stream to 32 signed bytes, reduce two
    // Q8_0 sub-blocks in parallel, then fold the pair into the outer block sum.
    // Splitting the fixed 4-way inner loop into two independent accumulators
    // gives the core more scheduling freedom than one longer dependency chain.
    const __m256i ones_8 = _mm256_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        __m256 acc_block_0 = _mm256_setzero_ps();
        __m256 acc_block_1 = _mm256_setzero_ps();

        for (int k = 0; k < 4; k += 2) {
            const block_q8_0 * GGML_RESTRICT yb_0 = &y[ib * 4 + k + 0];
            const block_q8_0 * GGML_RESTRICT yb_1 = &y[ib * 4 + k + 1];
            const __m256i bit_mask_0 = bytes_from_bits_32(&x[ib].qs[(k + 0) * 4]);
            const __m256i bit_mask_1 = bytes_from_bits_32(&x[ib].qs[(k + 1) * 4]);
            const __m256i bit_value_0 = _mm256_and_si256(bit_mask_0, ones_8);
            const __m256i bit_value_1 = _mm256_and_si256(bit_mask_1, ones_8);
            const __m256i qx_0 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_0, bit_value_0), ones_8);
            const __m256i qx_1 = _mm256_sub_epi8(_mm256_add_epi8(bit_value_1, bit_value_1), ones_8);
            const __m256i qy_0 = _mm256_loadu_si256((const __m256i *) yb_0->qs);
            const __m256i qy_1 = _mm256_loadu_si256((const __m256i *) yb_1->qs);
            const __m256 q_0 = mul_sum_i8_pairs_float(qx_0, qy_0);
            const __m256 q_1 = mul_sum_i8_pairs_float(qx_1, qy_1);
            const __m256 d1_0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_0->d));
            const __m256 d1_1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_1->d));

            acc_block_0 = _mm256_fmadd_ps(d1_0, q_0, acc_block_0);
            acc_block_1 = _mm256_fmadd_ps(d1_1, q_1, acc_block_1);
        }

        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), _mm256_add_ps(acc_block_0, acc_block_1), acc);
    }

    *s = hsum_float_8(acc);
#elif defined(__AVX__)
    // AVX: reuse the same 32-bit sign expansion, but do the byte-domain work
    // with two 128-bit halves before combining them into one 256-bit reduction.
    const __m128i ones_8 = _mm_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        __m256 acc_block = _mm256_setzero_ps();

        for (int k = 0; k < 4; ++k) {
            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
            const __m256i bit_mask = bytes_from_bits_32(&x[ib].qs[k * 4]);
            const __m128i bit_mask_0 = _mm256_castsi256_si128(bit_mask);
            const __m128i bit_mask_1 = _mm256_extractf128_si256(bit_mask, 1);
            const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
            const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
            const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
            const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &yb->qs[0]);
            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &yb->qs[16]);
            const __m256i qx = MM256_SET_M128I(qx_1, qx_0);
            const __m256i qy = MM256_SET_M128I(qy_1, qy_0);
            const __m256 q = mul_sum_i8_pairs_float(qx, qy);

            acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(d1), q));
        }

        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_set1_ps(d0), acc_block));
    }

    *s = hsum_float_8(acc);
#elif defined(__SSSE3__)
    // SSSE3: decode two 16-bit chunks into 16 signed bytes each, run the
    // pairwise int8 dot in 128-bit lanes, and accumulate directly to scalar.
    const __m128i ones_8 = _mm_set1_epi8(1);
    float sumf = 0.0f;

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);

        for (int k = 0; k < 4; ++k) {
            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);

            const __m128i bit_mask_0 = bytes_from_bits_16(&x[ib].qs[k * 4 + 0]);
            const __m128i bit_mask_1 = bytes_from_bits_16(&x[ib].qs[k * 4 + 2]);
            const __m128i bit_value_0 = _mm_and_si128(bit_mask_0, ones_8);
            const __m128i bit_value_1 = _mm_and_si128(bit_mask_1, ones_8);
            const __m128i qx_0 = _mm_sub_epi8(_mm_add_epi8(bit_value_0, bit_value_0), ones_8);
            const __m128i qx_1 = _mm_sub_epi8(_mm_add_epi8(bit_value_1, bit_value_1), ones_8);
            const __m128i qy_0 = _mm_loadu_si128((const __m128i *) &yb->qs[0]);
            const __m128i qy_1 = _mm_loadu_si128((const __m128i *) &yb->qs[16]);
            const __m128i sum_0 = mul_sum_i8_pairs(qx_0, qy_0);
            const __m128i sum_1 = mul_sum_i8_pairs(qx_1, qy_1);

            sumf += d0 * d1 * hsum_i32_4(_mm_add_epi32(sum_0, sum_1));
        }
    }

    *s = sumf;
#else
    // Scalar fallback: keep the bitstream byte-oriented so each Q8_0 sub-block
    // becomes four straight-line 8-value accumulations with no per-element
    // divide/modulo/index arithmetic.
    float sumf = 0.0f;

    for (int ib = 0; ib < nb; ++ib) {
        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
        float sumi = 0.0f;

        for (int k = 0; k < 4; k++) {
            const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
            const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
            int sumi_block = 0;

            const uint8_t * GGML_RESTRICT bits = &x[ib].qs[k * 4];
            const int8_t * GGML_RESTRICT qy = yb->qs;

            for (int b = 0; b < 4; ++b, qy += 8) {
                const unsigned mask = bits[b];
                sumi_block += ((mask & 0x01) ? qy[0] : -qy[0])
                            + ((mask & 0x02) ? qy[1] : -qy[1])
                            + ((mask & 0x04) ? qy[2] : -qy[2])
                            + ((mask & 0x08) ? qy[3] : -qy[3])
                            + ((mask & 0x10) ? qy[4] : -qy[4])
                            + ((mask & 0x20) ? qy[5] : -qy[5])
                            + ((mask & 0x40) ? qy[6] : -qy[6])
                            + ((mask & 0x80) ? qy[7] : -qy[7]);
            }

            sumi += d1 * sumi_block;
        }

        sumf += d0 * sumi;
    }

    *s = sumf;
#endif
}

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
Expand Down
Loading