diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000..ae756a3f816 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index 45129f08a16..e46eaa2562e 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -65,52 +65,56 @@ static inline int hsum_i32_4(const __m128i a) { return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } -#if defined(__AVX2__) || defined(__AVX512F__) -static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { - const __m256i ax = _mm256_sign_epi8(x, x); - const __m256i sy = _mm256_sign_epi8(y, x); - return _mm256_maddubs_epi16(ax, sy); -} - -// spread 32 bits to 32 bytes { 0x00, 0xFF } -static inline __m256i bytes_from_bits_32(const uint8_t * x) { - uint32_t x32; - memcpy(&x32, x, sizeof(uint32_t)); - const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); - __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); - const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); - bytes = _mm256_or_si256(bytes, bit_mask); - return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); -} +#if defined(__AVX2__) + // AVX2: single-pass byte-level processing, fully unrolled k-loop. + // Pipeline: broadcast+shuffle -> AND+cmpeq -> XOR+SUB -> maddubs+madd -> cvt+fma + const __m256i ones_8 = _mm256_set1_epi8(1); + const __m256i ones_16 = _mm256_set1_epi16(1); + const __m256i byte_shuf = _mm256_setr_epi8( + 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3); + const __m256i bit_masks = _mm256_setr_epi8( + 1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128, + 1,2,4,8,16,32,64,-128, 1,2,4,8,16,32,64,-128); + const __m256i zero = _mm256_setzero_si256(); + __m256 acc = _mm256_setzero_ps(); -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + const uint32_t * qs32 = (const uint32_t *)x[ib].qs; + +#define Q1_AVX2_BLOCK(K) \ + { \ + const __m256i y = _mm256_loadu_si256((const __m256i *)y_ptr[K].qs); \ + const __m256i sm = _mm256_cmpeq_epi8(_mm256_and_si256( \ + _mm256_shuffle_epi8(_mm256_set1_epi32((int)qs32[K]), byte_shuf), \ + bit_masks), zero); \ + const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(y, sm), sm); \ + const __m256i s32 = _mm256_madd_epi16( \ + _mm256_maddubs_epi16(ones_8, sy), ones_16); \ + acc_block = (K == 0) \ + ? _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \ + _mm256_cvtepi32_ps(s32)) \ + : _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), \ + _mm256_cvtepi32_ps(s32), acc_block); \ + } -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - const __m256i ones = _mm256_set1_epi16(1); - const __m256i summed_pairs = _mm256_madd_epi16(ones, x); - return _mm256_cvtepi32_ps(summed_pairs); -} + const block_q8_0 * y_ptr = &y[ib*4]; + __m256 acc_block; + Q1_AVX2_BLOCK(0) + Q1_AVX2_BLOCK(1) + Q1_AVX2_BLOCK(2) + Q1_AVX2_BLOCK(3) +#undef Q1_AVX2_BLOCK -static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { -#if defined(__AVX512VNNI__) && defined(__AVX512VL__) - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); -#elif defined(__AVXVNNI__) - const __m256i zero = _mm256_setzero_si256(); - const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); - return _mm256_cvtepi32_ps(summed_pairs); + acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc); + } + { + const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0), + _mm256_extractf128_ps(acc, 1)); + const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h)); + *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q))); + } #else // Perform multiplication and create 16-bit values const __m256i dot = _mm256_maddubs_epi16(ax, sy); @@ -664,13 +668,63 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons float sumf = 0; - // Each Q1_0_g128 block has 128 elements - // Each Q8_0 block has 32 elements - // So we need 4 Q8_0 blocks per Q1_0_g128 block +#if defined(__AVX2__) + // AVX2: process 32 Q8_0 values per sub-block in two 16-element passes. + // Sign-extend int8->int16, expand 1-bit weights to masks, blend to negate, + // then madd->fma accumulation. + const __m256i ones_16 = _mm256_set1_epi16(1); + const __m256i bmask = _mm256_setr_epi16( + 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, + 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, (short)(1<<15)); + __m256 acc = _mm256_setzero_ps(); + + for (int ib = 0; ib < nb; ++ib) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + __m256 acc_block = _mm256_setzero_ps(); + + for (int k = 0; k < 4; k++) { + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d); + const __m256i y_bytes = _mm256_loadu_si256((const __m256i *)y[ib*4 + k].qs); + + uint32_t bits; + memcpy(&bits, &x[ib].qs[k * 4], sizeof(bits)); + + // Lower 16 elements: sign-extend int8->int16, apply sign from weight bits + const __m256i y_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(y_bytes)); + const __m256i neg_lo = _mm256_sub_epi16(_mm256_setzero_si256(), y_lo); + const __m256i mask_lo = _mm256_cmpeq_epi16( + _mm256_and_si256(_mm256_set1_epi16((short)(bits & 0xFFFF)), bmask), bmask); + const __m256i signed_lo = _mm256_blendv_epi8(neg_lo, y_lo, mask_lo); + + // Upper 16 elements + const __m256i y_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(y_bytes, 1)); + const __m256i neg_hi = _mm256_sub_epi16(_mm256_setzero_si256(), y_hi); + const __m256i mask_hi = _mm256_cmpeq_epi16( + _mm256_and_si256(_mm256_set1_epi16((short)(bits >> 16)), bmask), bmask); + const __m256i signed_hi = _mm256_blendv_epi8(neg_hi, y_hi, mask_hi); + + // Pair-wise sum int16->int32, combine halves, convert to float, FMA + const __m256i sum_32 = _mm256_add_epi32( + _mm256_madd_epi16(signed_lo, ones_16), + _mm256_madd_epi16(signed_hi, ones_16)); + acc_block = _mm256_fmadd_ps(_mm256_set1_ps(d1), + _mm256_cvtepi32_ps(sum_32), acc_block); + } + acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc); + } + // Horizontal reduction: 256 -> 128 -> scalar + { + const __m128 h = _mm_add_ps(_mm256_extractf128_ps(acc, 0), + _mm256_extractf128_ps(acc, 1)); + const __m128 q = _mm_add_ps(h, _mm_movehl_ps(h, h)); + *s = _mm_cvtss_f32(_mm_add_ss(q, _mm_movehdup_ps(q))); + } +#else + // Scalar fallback for (int ib = 0; ib < nb; ++ib) { const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); - int sumi = 0; + float sumi = 0; // Process 4 Q8_0 blocks (4 * 32 = 128 elements) for (int k = 0; k < 4; k++) { @@ -697,6 +751,7 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons } *s = sumf; +#endif } void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 48fbddf74f5..7de4da3338c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1185,15 +1185,16 @@ static void ggml_compute_forward_mul_mat_one_chunk( assert(ne12 % ne02 == 0); assert(ne13 % ne03 == 0); - // block-tiling attempt - const int64_t blck_0 = 16; + // COM6-inspired block-tiling: larger blocks for Q1_0_g128 (1-bit weights are tiny, + // so we can fit more rows in L1). Prefetch next weight block while processing current. + const int64_t blck_0 = (type == GGML_TYPE_Q1_0_g128) ? 64 : 16; const int64_t blck_1 = 16; const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; // attempt to reduce false-sharing (does not seem to make a difference) - // 16 * 2, accounting for mmla kernels - float tmp[32]; + // Size: blck_0 * 2 (accounting for mmla kernels that compute 2 rows at once) + float tmp[128]; for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { @@ -1226,12 +1227,17 @@ static void ggml_compute_forward_mul_mat_one_chunk( // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); //} - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + // COM6-inspired: prefetch next weight rows while computing current ones. + const int64_t ir0_max = MIN(iir0 + blck_0, ir0_end); + for (int64_t ir0 = iir0; ir0 < ir0_max; ir0 += num_rows_per_vec_dot) { + if (ir0 + 4 * num_rows_per_vec_dot < ir0_max) { + __builtin_prefetch(src0_row + (ir0 + 4 * num_rows_per_vec_dot) * nb01, 0, 1); + } vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); } for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { - memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (ir0_max - iir0) * sizeof(float)); } } } diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index 7f8456a5db8..bd83809e855 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -183,7 +183,7 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); - int sumi = 0; + float sumi = 0; // Process 4 Q8_0 blocks (4 * 32 = 128 elements) for (int k = 0; k < 4; k++) {