Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions ggml/src/ggml-cpu/arch/x86/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,110 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}

// Dot product of a Q1_0_g128 row (1-bit weights mapped to {-1,+1}, one fp16
// scale per 128-element group) against a Q8_0 row (int8 values, one fp16 scale
// per 32-element block). Each Q1_0_g128 block therefore pairs with 4 Q8_0 blocks.
//
//   n   - number of elements; must be a multiple of QK1_0_g128 (128)
//   s   - output: the scalar dot product is written to *s
//   bs, bx, by - row strides, unused here (single-row case)
//   vx  - Q1_0_g128 quantized data; vy - Q8_0 quantized data
//   nrc - number of result columns; only 1 is supported by this kernel
//
// NOTE(review): assumes block_q1_0_g128.qs holds 128 bits = 16 bytes, bit j of
// the group stored at qs[j>>3] bit (j&7) — matches the generic implementation.
void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK1_0_g128;
const int nb = n / qk;    // number of Q1_0_g128 blocks in the row

assert(n % qk == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);

const block_q1_0_g128 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;

float sumf = 0.0f;

#if defined(__AVX512BW__) && defined(__AVX512VL__) && defined(__AVX512VNNI__)
// AVX-512 VNNI path: mask registers for bit expansion + VNNI dot product
// Accumulate into float vector, single hsum at the end
const __m256i ones_u8 = _mm256_set1_epi8(1);
__m256 acc = _mm256_setzero_ps();

for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);

// 4 sub-blocks of 32 elements, one per paired Q8_0 block
for (int k = 0; k < 4; k++) {
// Load 32 bits of weights using alias-safe unaligned load
uint32_t bmask_u32;
memcpy(&bmask_u32, x[ib].qs + k * 4, sizeof(bmask_u32));
const __mmask32 bmask = (__mmask32)bmask_u32;

// Load 32 int8 activations
const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);

// Sum ALL q8 values using VNNI (groups of 4 int8 -> int32)
// dpbusd: unsigned ones (first operand) x signed q8 (second operand)
const __m256i sum_all = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, q8);

// Zero out q8 where bit=0, keep where bit=1 (single instruction)
const __m256i masked_q8 = _mm256_maskz_mov_epi8(bmask, q8);

// Sum MASKED q8 values using VNNI
const __m256i sum_masked = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ones_u8, masked_q8);

// dot = 2 * sum_masked - sum_all
// (weight = 2*bit - 1, so dot = sum((2*bit-1)*q8) = 2*sum(q8 where bit=1) - sum(q8))
const __m256i dp = _mm256_sub_epi32(_mm256_slli_epi32(sum_masked, 1), sum_all);

// Scale by d1 and accumulate into float accumulator
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0 * d1), _mm256_cvtepi32_ps(dp), acc);
}
}

sumf = hsum_float_8(acc);

#elif defined(__AVX2__)
// AVX2 path: shuffle-based bit expansion + mul_sum_i8_pairs_float
// Uses llama.cpp's optimized helper (auto-selects AVXVNNI dpbssd when available)
// shuf broadcasts source byte j/8 into byte lane j; bmask then isolates bit
// (j&7) of that byte, so lane j ends up testing bit j of the 32-bit word —
// the same bit order as the scalar/generic implementation.
const __m256i shuf = _mm256_setr_epi8(
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
const __m256i bmask = _mm256_set1_epi64x(0x8040201008040201LL);
const __m256i ones8 = _mm256_set1_epi8(1);
const __m256i neg8 = _mm256_set1_epi8(-1);

__m256 acc = _mm256_setzero_ps();

for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);

// 4 sub-blocks of 32 elements, one per paired Q8_0 block
for (int k = 0; k < 4; k++) {
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
const __m256 d_scale = _mm256_set1_ps(d0 * d1);

// Broadcast 4 bytes of 1-bit weights using alias-safe load
int32_t bits_i32;
memcpy(&bits_i32, x[ib].qs + k * 4, sizeof(bits_i32));
__m256i vb = _mm256_set1_epi32(bits_i32);
__m256i ex = _mm256_shuffle_epi8(vb, shuf);
// ex: 0xFF where the weight bit is set, 0x00 where clear
ex = _mm256_cmpeq_epi8(_mm256_and_si256(ex, bmask), bmask);

// Convert mask to +1/-1
const __m256i xi = _mm256_blendv_epi8(neg8, ones8, ex);

// Load 32 int8 activations
const __m256i q8 = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);

// Dot product + float conversion via optimized helper
// (auto-uses AVXVNNI dpbssd on supported CPUs)
const __m256 p = mul_sum_i8_pairs_float(xi, q8);

// Accumulate scaled result
acc = _mm256_fmadd_ps(d_scale, p, acc);
}
}

sumf = hsum_float_8(acc);

#else
// Scalar fallback — delegates to generic implementation
ggml_vec_dot_q1_0_g128_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
return;
#endif

*s = sumf;
}

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
Expand Down
27 changes: 9 additions & 18 deletions ggml/src/ggml-cpu/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,35 +176,26 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
const block_q8_0 * GGML_RESTRICT y = vy;


float sumf = 0.0;

// Each Q1_0_g128 block has 128 elements, each Q8_0 block has 32 elements
// So we need 4 Q8_0 blocks per Q1_0_g128 block
float sumf = 0.0f;

for (int i = 0; i < nb; i++) {
const float d0 = GGML_FP16_TO_FP32(x[i].d);

float sumi = 0.0f;

for (int k = 0; k < 4; k++) {
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
const uint8_t * bits = x[i].qs + k * 4;
const int8_t * q8 = y[i*4 + k].qs;

int sumi_block = 0;

int sumi = 0;
for (int j = 0; j < QK8_0; j++) {
const int bit_index = k * QK8_0 + j;
const int byte_index = bit_index / 8;
const int bit_offset = bit_index % 8;

const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
sumi_block += xi * y[i*4 + k].qs[j];
const int bit = (bits[j >> 3] >> (j & 7)) & 1;
sumi += (2*bit - 1) * q8[j];
}

sumi += d1 * sumi_block;
sumf += d0 * d1 * (float)sumi;
}

sumf += d0 * sumi;
}

*s = sumf;
}

Expand Down