1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
*.gguf filter=lfs diff=lfs merge=lfs -text
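This is the attribute line that running git lfs track "*.gguf" would typically write; it routes GGUF model files through Git LFS instead of storing them directly in the repository.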
149 changes: 102 additions & 47 deletions ggml/src/ggml-cpu/arch/x86/quants.c
@@ -65,52 +65,56 @@ static inline int hsum_i32_4(const __m128i a) {
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
}

#if defined(__AVX2__) || defined(__AVX512F__)
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
const __m256i ax = _mm256_sign_epi8(x, x);
const __m256i sy = _mm256_sign_epi8(y, x);
return _mm256_maddubs_epi16(ax, sy);
}

// spread 32 bits to 32 bytes { 0x00, 0xFF }
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
uint32_t x32;
memcpy(&x32, x, sizeof(uint32_t));
const __m256i shuf_mask = _mm256_set_epi64x(
0x0303030303030303, 0x0202020202020202,
0x0101010101010101, 0x0000000000000000);
__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
bytes = _mm256_or_si256(bytes, bit_mask);
return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
}
#if defined(__AVX2__)
// AVX2: single-pass byte-level processing, fully unrolled k-loop.
// Pipeline: broadcast+shuffle -> AND+cmpeq -> XOR+SUB -> maddubs+madd -> cvt+fma
const __m256i ones_8 = _mm256_set1_epi8(1);
const __m256i ones_16 = _mm256_set1_epi16(1);
const __m256i byte_shuf = _mm256_setr_epi8(
0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3);
const __m256i bit_masks = _mm256_setr_epi8(
1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128,
1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128);
const __m256i zero = _mm256_setzero_si256();
__m256 acc = _mm256_setzero_ps();

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
{
const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
const __m256i lowMask = _mm256_set1_epi8( 0xF );
return _mm256_and_si256(lowMask, bytes);
}
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
const uint32_t * qs32 = (const uint32_t *)x[ib].qs;

#define Q1_AVX2_BLOCK(K) \
{ \
const __m256i y = _mm256_loadu_si256((const __m256i *)y_ptr[K].qs); \
const __m256i sm = _mm256_cmpeq_epi8(_mm256_and_si256( \
_mm256_shuffle_epi8(_mm256_set1_epi32((int)qs32[K]), byte_shuf), \
bit_masks), zero); \
const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(y, sm), sm); \
const __m256i s32 = _mm256_madd_epi16( \
_mm256_maddubs_epi16(ones_8, sy), ones_16); \
acc_block = (K == 0) \
? _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
_mm256_cvtepi32_ps(s32)) \
: _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \
_mm256_cvtepi32_ps(s32), acc_block); \
}

// add int16_t pairwise and return as float vector
static inline __m256 sum_i16_pairs_float(const __m256i x) {
const __m256i ones = _mm256_set1_epi16(1);
const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
return _mm256_cvtepi32_ps(summed_pairs);
}
const block_q8_0 * y_ptr = &y[ib*4];
__m256 acc_block;
Q1_AVX2_BLOCK(0)
Q1_AVX2_BLOCK(1)
Q1_AVX2_BLOCK(2)
Q1_AVX2_BLOCK(3)
#undef Q1_AVX2_BLOCK

static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs);
#elif defined(__AVXVNNI__)
const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs);
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
}
{
const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
_mm256_extractf128_ps(acc, 1));
const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
*s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
}
#else
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -664,13 +668,63 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons

float sumf = 0;

// Each Q1_0_g128 block has 128 elements
// Each Q8_0 block has 32 elements
// So we need 4 Q8_0 blocks per Q1_0_g128 block
#if defined(__AVX2__)
// AVX2: process 32 Q8_0 values per sub-block in two 16-element passes.
// Sign-extend int8->int16, expand 1-bit weights to masks, blend to negate,
// then madd->fma accumulation.
const __m256i ones_16 = _mm256_set1_epi16(1);
const __m256i bmask = _mm256_setr_epi16(
1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7,
1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15));
__m256 acc = _mm256_setzero_ps();

for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
__m256 acc_block = _mm256_setzero_ps();

for (int k = 0; k < 4; k++) {
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
const __m256i y_bytes = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs);

uint32_t bits;
memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits));

// Lower 16 elements: sign-extend int8->int16, apply sign from weight bits
const __m256i y_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(y_bytes));
const __m256i neg_lo = _mm256_sub_epi16(_mm256_setzero_si256(), y_lo);
const __m256i mask_lo = _mm256_cmpeq_epi16(
_mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask);
const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo);

// Upper 16 elements
const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1));
const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi);
const __m256i mask_hi = _mm256_cmpeq_epi16(
_mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask);
const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi);

// Pair-wise sum int16->int32, combine halves, convert to float, FMA
const __m256i sum_32 = _mm256_add_epi32(
_mm256_madd_epi16(signed_lo, ones_16),
_mm256_madd_epi16(signed_hi, ones_16));
acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1),
_mm256_cvtepi32_ps(sum_32), acc_block);
}
acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
}
// Horizontal reduction: 256 -> 128 -> scalar
{
const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0),
_mm256_extractf128_ps(acc, 1));
const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h));
*s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q)));
}
#else
// Scalar fallback
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);

int sumi = 0;
float sumi = 0;

// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
for (int k = 0; k < 4; k++) {
@@ -697,6 +751,7 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
}

*s = sumf;
#endif
}

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
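For reference, a minimal scalar sketch of the dot product that both AVX2 paths above vectorize, assuming a block_q1_0_g128 layout of one fp16 scale plus 16 bytes (128 bits) of weights and the usual block_q8_0 layout of one fp16 scale plus 32 int8 values. The struct and function names below are illustrative, not the actual ggml definitions, and the scales are shown as plain float to keep the sketch short. A set weight bit selects +y, a clear bit selects -y, which is exactly what the cmpeq/blendv masks and the XOR+SUB trick implement 32 lanes at a time.

#include <stdint.h>
#include <string.h>

// Illustrative layouts only: the real blocks store the scale as fp16 (ggml_half).
typedef struct { float d; uint8_t qs[16]; } q1_block_sketch;  // scale + 128 one-bit weights
typedef struct { float d; int8_t  qs[32]; } q8_block_sketch;  // scale + 32 int8 values

static float dot_q1_q8_sketch(int n, const q1_block_sketch * x, const q8_block_sketch * y) {
    const int nb = n / 128;                  // 128 elements per Q1 block
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        float block_sum = 0.0f;              // float: every Q8_0 sub-block has its own scale
        for (int k = 0; k < 4; ++k) {        // 4 Q8_0 blocks (4 * 32 = 128 elements) per Q1 block
            const q8_block_sketch * yb = &y[ib*4 + k];
            uint32_t bits;
            memcpy(&bits, &x[ib].qs[k*4], sizeof(bits));   // 32 weight bits for this sub-block
            int sub = 0;
            for (int j = 0; j < 32; ++j) {
                // bit set -> +y[j], bit clear -> -y[j]
                sub += ((bits >> j) & 1) ? yb->qs[j] : -yb->qs[j];
            }
            block_sum += yb->d * (float)sub;
        }
        sumf += x[ib].d * block_sum;
    }
    return sumf;
}

The vectorized versions then collapse the 8-lane float accumulator with the extractf128 / movehl / movehdup sequence that appears twice in the diff (256-bit -> 128-bit -> scalar).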
18 changes: 12 additions & 6 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -1185,15 +1185,16 @@ static void ggml_compute_forward_mul_mat_one_chunk(
assert(ne12 % ne02 == 0);
assert(ne13 % ne03 == 0);

// block-tiling attempt
const int64_t blck_0 = 16;
// COM6-inspired block-tiling: larger blocks for Q1_0_g128 (1-bit weights are tiny,
// so we can fit more rows in L1). Prefetch next weight block while processing current.
const int64_t blck_0 = (type == GGML_TYPE_Q1_0_g128) ? 64 : 16;
const int64_t blck_1 = 16;

const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;

// attempt to reduce false-sharing (does not seem to make a difference)
// 16 * 2, accounting for mmla kernels
float tmp[32];
// Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once)
float tmp[128];

for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
@@ -1226,12 +1227,17 @@
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}

for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
// COM6-inspired: prefetch next weight rows while computing current ones.
const int64_t ir0_max = MIN(iir0 + blck_0, ir0_end);
for (int64_t ir0 = iir0; ir0 < ir0_max; ir0 += num_rows_per_vec_dot) {
if (ir0 + 4 * num_rows_per_vec_dot < ir0_max) {
__builtin_prefetch(src0_row + (ir0 + 4 * num_rows_per_vec_dot) * nb01, 0, 1);
}
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
}

for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (ir0_max - iir0) * sizeof(float));
}
}
}
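The prefetch distance of 4 vec-dot iterations and the larger blck_0 for Q1_0_g128 are tuning choices; below is a stripped-down sketch of the same block-tiled loop with software prefetch (the function and parameter names are illustrative, not the actual ggml ones).

#include <stddef.h>
#include <stdint.h>

enum { PREFETCH_AHEAD = 4 };   // same look-ahead distance as above

// Hypothetical row-level dot product: one quantized weight row times one activation column.
typedef void (*row_dot_fn)(int64_t ne00, float * out, const void * w_row, const void * act_col);

static void tiled_rows_sketch(int64_t ir0_start, int64_t ir0_end, int64_t blck_0,
                              const char * w_base, size_t row_bytes,
                              const void * act_col, int64_t ne00,
                              float * out, row_dot_fn dot) {
    for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
        const int64_t ir0_max = (iir0 + blck_0 < ir0_end) ? iir0 + blck_0 : ir0_end;
        for (int64_t ir0 = iir0; ir0 < ir0_max; ++ir0) {
            if (ir0 + PREFETCH_AHEAD < ir0_max) {
                // rw = 0 (read), locality = 1 (low): the row is streamed once, not kept hot
                __builtin_prefetch(w_base + (ir0 + PREFETCH_AHEAD) * row_bytes, 0, 1);
            }
            dot(ne00, &out[ir0], w_base + ir0 * row_bytes, act_col);
        }
    }
}

The tmp buffer grows from 32 to 128 floats to match the new upper bound: blck_0 can now be 64 and the mmla kernels write two rows per call.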
2 changes: 1 addition & 1 deletion ggml/src/ggml-cpu/quants.c
@@ -183,7 +183,7 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
for (int i = 0; i < nb; i++) {
const float d0 = GGML_FP16_TO_FP32(x[i].d);

int sumi = 0;
float sumi = 0;

// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
for (int k = 0; k < 4; k++) {
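The int -> float change to sumi here mirrors the one in the x86 scalar fallback above: each Q1_0_g128 block is dotted against four Q8_0 sub-blocks, and each sub-block carries its own fp16 scale. With those scales folded into the per-block accumulator, the running value d1_0*S_0 + d1_1*S_1 + d1_2*S_2 + d1_3*S_3 is generally fractional, so an int accumulator would silently truncate it; the dot_q1_q8_sketch above keeps the same quantity in a float block_sum for the same reason.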