Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,7 @@ const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
GGML_TYPE_TQ3_0,
};

static ggml_type kv_cache_type_from_str(const std::string & s) {
Comment on lines +401 to 404
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TQ3_0 is advertised as an allowed KV cache type in the CLI, but the CPU backend has no vec_dot support for GGML_TYPE_TQ3_0 (see ggml/src/ggml-cpu/ggml-cpu.c), so CPU-only runs selecting this type will fail during graph planning/execution. Either add CPU support, or gate this option based on backend capabilities / emit a clear error earlier.

Suggested change
GGML_TYPE_TQ3_0,
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
};
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "TQ3_0") {
throw std::runtime_error(
"Unsupported cache type: " + s +
" (not supported by the CPU backend for KV cache use)"
);
}

Copilot uses AI. Check for mistakes.
Expand Down
3 changes: 2 additions & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,8 @@ extern "C" {
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
GGML_TYPE_Q1_0 = 40,
GGML_TYPE_Q1_0_g128 = 41,
GGML_TYPE_COUNT = 42,
GGML_TYPE_TQ3_0 = 42, // TurboQuant 3-bit polar + QJL (no per-block scale)
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The enum comment says TQ3_0 has “no per-block scale”, but the block layout includes gamma and both CPU/GPU paths use it as a per-block scale. Please correct the type comment to avoid misleading API/CLI consumers.

Suggested change
GGML_TYPE_TQ3_0 = 42, // TurboQuant 3-bit polar + QJL (no per-block scale)
GGML_TYPE_TQ3_0 = 42, // TurboQuant 3-bit polar + QJL (with per-block scale)

Copilot uses AI. Check for mistakes.
GGML_TYPE_COUNT = 43,
};

// precision
Expand Down
15 changes: 15 additions & 0 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,21 @@ typedef struct {
} block_tq2_0;
static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");

// TurboQuant 3-bit quantization (3.5 bpw)
// Per TurboQuant paper (Algorithm 2: TurboQuant_prod), ICLR 2026
// Each block of 32 values is quantized as:
// - 2-bit MSE codebook indices (after random rotation Π·x)
// - 1-bit QJL residual signs (sign(S·r) where r = x - dequant_mse(quant_mse(x)))
// - FP16 residual norm ||r||₂ for QJL scaling
// Requires per-model rotation matrices Π and S (stored externally)
#define QK_TQ3_0 32
typedef struct {
uint8_t qs[QK_TQ3_0 / 4]; // 2-bit codebook indices, 32 × 2 bits = 8 bytes
uint8_t qr[QK_TQ3_0 / 8]; // QJL residual signs, 32 × 1 bit = 4 bytes
ggml_half gamma; // ||residual||₂ for QJL correction scaling
Comment on lines +280 to +290
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The block_tq3_0 documentation says gamma stores the residual norm and that external rotation matrices are required, but the implementation uses a fixed WHT+sign preconditioner and stores the per-block scale d in gamma (see quantize_row_tq3_0_ref). Update the comment/field description (or update the implementation) so the on-wire/layout semantics match the code.

Suggested change
// Per TurboQuant paper (Algorithm 2: TurboQuant_prod), ICLR 2026
// Each block of 32 values is quantized as:
// - 2-bit MSE codebook indices (after random rotation Π·x)
// - 1-bit QJL residual signs (sign(S·r) where r = x - dequant_mse(quant_mse(x)))
// - FP16 residual norm ||r||₂ for QJL scaling
// Requires per-model rotation matrices Π and S (stored externally)
#define QK_TQ3_0 32
typedef struct {
uint8_t qs[QK_TQ3_0 / 4]; // 2-bit codebook indices, 32 × 2 bits = 8 bytes
uint8_t qr[QK_TQ3_0 / 8]; // QJL residual signs, 32 × 1 bit = 4 bytes
ggml_half gamma; // ||residual||₂ for QJL correction scaling
// Implementation note: the on-wire format used here follows the current
// ggml implementation, which applies a fixed WHT+sign preconditioner rather
// than storing or requiring external rotation matrices.
// Each block of 32 values stores:
// - 2-bit quantized indices
// - 1-bit residual/sign bits
// - FP16 per-block scale d
#define QK_TQ3_0 32
typedef struct {
uint8_t qs[QK_TQ3_0 / 4]; // 2-bit codebook indices, 32 × 2 bits = 8 bytes
uint8_t qr[QK_TQ3_0 / 8]; // residual/sign bits, 32 × 1 bit = 4 bytes
ggml_half gamma; // per-block FP16 scale d

Copilot uses AI. Check for mistakes.
} block_tq3_0;
static_assert(sizeof(block_tq3_0) == QK_TQ3_0/4 + QK_TQ3_0/8 + sizeof(ggml_half), "wrong tq3_0 block size/padding");

//
// Super-block quantization structures
//
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,10 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_TQ3_0] = {
.from_float = quantize_row_tq3_0,
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GGML_TYPE_TQ3_0 is added with from_float/to_float support but no CPU vec_dot/vec_dot_type. This will crash tests like tests/test-quantize-fns.cpp (it unconditionally calls qfns_cpu->vec_dot when from_float/to_float exist) and also makes CPU attention paths unable to use TQ3_0 KV. Add a CPU vec_dot implementation (and vec_dot_type) for TQ3_0, or ensure CPU code/tests skip dot-product checks for types without vec_dot and prevent selecting TQ3_0 on CPU-only backends.

Suggested change
.from_float = quantize_row_tq3_0,

Copilot uses AI. Check for mistakes.
.nrows = 1,
},
[GGML_TYPE_I32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
},
Expand Down
9 changes: 8 additions & 1 deletion ggml/src/ggml-cpu/ggml-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
{
const auto * traits = ggml_get_type_traits_cpu(src0->type);
return traits->vec_dot != NULL &&
(src1->type == GGML_TYPE_F32 || src1->type == traits->vec_dot_type);
}
case GGML_OP_SOFT_MAX_BACK: {
if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
return false;
Expand All @@ -466,6 +470,9 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
case GGML_OP_OUT_PROD:
return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_FLASH_ATTN_EXT:
// K type must have vec_dot for CPU flash attention
return ggml_get_type_traits_cpu(src1->type)->vec_dot != NULL;
default:
return true;
}
Expand Down
7 changes: 7 additions & 0 deletions ggml/src/ggml-cpu/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,7 @@ void ggml_compute_forward_add(
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_TQ3_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
Expand Down Expand Up @@ -1126,6 +1127,7 @@ void ggml_compute_forward_add1(
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_TQ3_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
Expand Down Expand Up @@ -1254,6 +1256,7 @@ void ggml_compute_forward_acc(
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_TQ3_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
Expand Down Expand Up @@ -4341,6 +4344,7 @@ void ggml_compute_forward_out_prod(
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_TQ3_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
Expand Down Expand Up @@ -4616,6 +4620,7 @@ void ggml_compute_forward_set(
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_TQ3_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
Expand Down Expand Up @@ -4840,6 +4845,7 @@ void ggml_compute_forward_get_rows(
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_TQ3_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
Expand Down Expand Up @@ -5566,6 +5572,7 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_Q6_K:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_TQ3_0:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
Expand Down
6 changes: 6 additions & 0 deletions ggml/src/ggml-cpu/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
quantize_row_tq2_0_ref(x, y, k);
}

// Quantize k float values into TQ3_0 blocks.
// Thin dispatch to the reference implementation, mirroring the
// quantize_row_tq1_0 / quantize_row_tq2_0 wrappers above.
void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK_TQ3_0 == 0);
    quantize_row_tq3_0_ref(x, (block_tq3_0 *) vy, k);
}

//===================================== Q8_K ==============================================

void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/quants.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in

void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
Expand Down
7 changes: 7 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qi = QI3_S;
};

// CUDA type traits for TQ3_0: per-block geometry consumed by the templated
// dequantize/convert kernels (see the other specializations above for the
// qk/qr/qi conventions).
template<>
struct ggml_cuda_type_traits<GGML_TYPE_TQ3_0> {
    static constexpr int qk = QK_TQ3_0;     // values per quantization block (32)
    static constexpr int qr = 1;            // quant ratio — presumably 1 value per packed entry; TODO confirm against other traits
    static constexpr int qi = QK_TQ3_0 / 4; // packed quantized payload units per block (8)
};

//////////////////////

struct ggml_cuda_device_info {
Expand Down
54 changes: 54 additions & 0 deletions ggml/src/ggml-cuda/convert.cu
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,50 @@ static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_
}
}

// TurboQuant TQ3_0: 2-bit codebook dequantization + inverse WHT32.
// One CUDA block (launched with blockDim.x == 32) handles one quant block:
// each thread dequantizes one value in the rotated space, then the threads
// cooperatively apply the inverse Walsh-Hadamard transform via shared memory.
// NOTE(review): the stored residual sign bits (qr) are not consumed here —
// only the codebook reconstruction d*centroid is inverted. Confirm whether a
// QJL correction term is supposed to be applied during dequantization.
template<typename dst_t>
static __global__ void dequantize_block_tq3_0(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const float centroids[4] = { -1.510f, -0.4528f, 0.4528f, 1.510f };
    // Fixed per-position sign pattern of the WHT preconditioner; must stay
    // identical to the pattern used at quantization time.
    const int8_t signs[32] = {
        +1, -1, +1, +1, -1, -1, +1, -1, +1, +1, -1, +1, -1, +1, -1, -1,
        +1, -1, -1, +1, +1, -1, +1, -1, -1, +1, +1, +1, -1, -1, +1, -1
    };

    const int64_t i = blockIdx.x;
    const block_tq3_0 * x = (const block_tq3_0 *)vx;
    // Do NOT early-return extra threads here: a thread that returns before a
    // __syncthreads() it is expected to reach is undefined behavior. Clamping
    // the lane keeps every thread at the barriers; with the intended 32-thread
    // launch the clamp is a no-op, and any surplus threads would merely write
    // duplicate (identical) values.
    const int lane = threadIdx.x & 31;

    const float d = __half2float(x[i].gamma);

    // Step 1: each thread dequantizes its value (still in rotated space).
    const int byte_idx  = lane / 4;
    const int bit_shift = 2 * (lane % 4);
    const int idx = (x[i].qs[byte_idx] >> bit_shift) & 3;

    __shared__ float shmem[32];
    shmem[lane] = d * centroids[idx];
    __syncthreads();

    // Step 2: cooperative inverse WHT (5 butterfly stages). The lower lane of
    // each butterfly pair writes both outputs; the barrier between the reads
    // and the writes keeps each stage race-free.
    for (int step = 1; step < 32; step <<= 1) {
        const int partner = lane ^ step;
        const float a = shmem[lane];
        const float b = shmem[partner];
        __syncthreads();
        if (lane < partner) {
            shmem[lane]    = a + b;
            shmem[partner] = a - b;
        }
        __syncthreads();
    }

    // Step 3: normalize by 1/sqrt(32) and undo the per-position sign flips.
    const float inv_sqrt32 = 0.17677669529663688f;
    yy[i * QK_TQ3_0 + lane] = shmem[lane] * inv_sqrt32 * signs[lane];
}
Comment on lines +489 to +531
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The GPU dequantization kernel reconstructs only d*centroid[idx] and ignores the stored QJL residual-sign bits (qr) and any QJL correction scaling. If qr/gamma are part of the format, dequantization should incorporate them; otherwise consider removing qr from the block layout to avoid wasting bandwidth/storage.

Copilot uses AI. Check for mistakes.

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block_cuda(const void * vx, dst_t * y,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
Expand Down Expand Up @@ -617,6 +661,12 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t
dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
}

// Launch one 32-thread CUDA block per TQ3_0 quant block to expand k values
// into the destination type on the given stream.
template<typename dst_t>
static void dequantize_row_tq3_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
    const int nblocks = k / QK_TQ3_0;
    dequantize_block_tq3_0<<<nblocks, 32, 0, stream>>>(vx, y);
}

template <typename src_t, typename dst_t>
static __global__ void convert_unary(
const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
Expand Down Expand Up @@ -719,6 +769,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_row_iq3_s_cuda;
case GGML_TYPE_MXFP4:
return dequantize_row_mxfp4_cuda;
case GGML_TYPE_TQ3_0:
return dequantize_row_tq3_0_cuda;
case GGML_TYPE_F32:
return convert_unary_cont_cuda<float>;
case GGML_TYPE_BF16:
Expand Down Expand Up @@ -774,6 +826,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_iq3_s_cuda;
case GGML_TYPE_MXFP4:
return dequantize_row_mxfp4_cuda;
case GGML_TYPE_TQ3_0:
return dequantize_row_tq3_0_cuda;
case GGML_TYPE_F16:
return convert_unary_cont_cuda<half>;
case GGML_TYPE_BF16:
Expand Down
73 changes: 73 additions & 0 deletions ggml/src/ggml-cuda/cpy-utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,79 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
}

// TQ3_0: forward 32-point Walsh-Hadamard transform with a fixed per-position
// sign flip applied first (the rotation preconditioner). Must remain the
// exact mirror of tq3_wht32_inverse_device and of the CPU-side pattern.
static __device__ __forceinline__ void tq3_wht32_forward_device(float * x) {
    const int8_t signs[32] = {
        +1, -1, +1, +1, -1, -1, +1, -1, +1, +1, -1, +1, -1, +1, -1, -1,
        +1, -1, -1, +1, +1, -1, +1, -1, -1, +1, +1, +1, -1, -1, +1, -1
    };
    const float inv_sqrt32 = 0.17677669529663688f; // 1/sqrt(32)

    // Sign preconditioning.
    for (int j = 0; j < 32; ++j) {
        x[j] *= signs[j];
    }
    // Five in-place butterfly stages.
    for (int half = 1; half < 32; half *= 2) {
        for (int base = 0; base < 32; base += 2 * half) {
            for (int k = 0; k < half; ++k) {
                const float lo = x[base + k];
                const float hi = x[base + k + half];
                x[base + k]        = lo + hi;
                x[base + k + half] = lo - hi;
            }
        }
    }
    // Orthonormal scaling.
    for (int j = 0; j < 32; ++j) {
        x[j] *= inv_sqrt32;
    }
}

// TQ3_0: inverse of tq3_wht32_forward_device — butterflies first, then the
// combined 1/sqrt(32) scaling and sign un-flip in a single pass.
static __device__ __forceinline__ void tq3_wht32_inverse_device(float * x) {
    // Five in-place butterfly stages (WHT is self-similar; same butterflies
    // as the forward transform).
    for (int half = 1; half < 32; half *= 2) {
        for (int base = 0; base < 32; base += 2 * half) {
            for (int k = 0; k < half; ++k) {
                const float lo = x[base + k];
                const float hi = x[base + k + half];
                x[base + k]        = lo + hi;
                x[base + k + half] = lo - hi;
            }
        }
    }
    const int8_t signs[32] = {
        +1, -1, +1, +1, -1, -1, +1, -1, +1, +1, -1, +1, -1, +1, -1, -1,
        +1, -1, -1, +1, +1, -1, +1, -1, -1, +1, +1, +1, -1, -1, +1, -1
    };
    const float inv_sqrt32 = 0.17677669529663688f; // 1/sqrt(32)
    for (int j = 0; j < 32; ++j) {
        x[j] *= inv_sqrt32 * signs[j];
    }
}

// TQ3_0: GPU-side quantization of one 32-value block.
// Applies the fixed WHT+sign preconditioner, scales by the per-block
// amax-derived scale d (stored in gamma), maps each rotated value to the
// nearest 2-bit codebook centroid (qs), and records the sign of each
// per-value residual as one bit in qr.
static __device__ void quantize_f32_tq3_0_block(const float * __restrict__ x, block_tq3_0 * __restrict__ y) {
    const float centroids[4] = { -1.510f, -0.4528f, 0.4528f, 1.510f };

    // Copy and rotate into WHT space (must mirror the CPU reference path).
    float rotated[QK_TQ3_0];
    for (int j = 0; j < QK_TQ3_0; j++) {
        rotated[j] = x[j];
    }
    tq3_wht32_forward_device(rotated);

    // Zero the packed index/sign bytes explicitly rather than via memset():
    // device-side memset is not reliably available or cheap on all toolchains
    // (e.g. HIP device libraries). gamma is unconditionally assigned below,
    // so it needs no zeroing.
    for (int j = 0; j < QK_TQ3_0 / 4; j++) {
        y->qs[j] = 0;
    }
    for (int j = 0; j < QK_TQ3_0 / 8; j++) {
        y->qr[j] = 0;
    }

    // Per-block absolute maximum of the rotated values.
    float amax = 0.0f;
    for (int j = 0; j < QK_TQ3_0; j++) {
        const float av = fabsf(rotated[j]);
        if (av > amax) {
            amax = av;
        }
    }

    // Scale so the largest-magnitude value lands on the outer centroid;
    // id == 0 guards the all-zero block (everything quantizes to idx 2).
    const float d  = amax / 1.510f;
    const float id = d > 0.0f ? 1.0f / d : 0.0f;
    y->gamma = __float2half(d);

    for (int j = 0; j < QK_TQ3_0; j++) {
        const float xn = rotated[j] * id;
        // Nearest centroid: thresholds at 0 and +/-(1.510 + 0.4528)/2 = 0.9814.
        int idx;
        if (xn < 0.0f) {
            idx = (xn < -0.9814f) ? 0 : 1;
        } else {
            idx = (xn < 0.9814f) ? 2 : 3;
        }
        y->qs[j / 4] |= (idx << (2 * (j % 4)));
        // Residual sign bit; stored for downstream QJL-style correction.
        const float residual = rotated[j] - d * centroids[idx];
        if (residual >= 0.0f) {
            y->qr[j / 8] |= (1 << (j % 8));
        }
    }
}

// Copy-op hook: quantize one contiguous block of 32 floats (source) into a
// TQ3_0 block (destination); used by the templated cpy/set-rows kernels.
static __device__ void cpy_blck_f32_tq3_0(const char * cxi, char * cdsti) {
    quantize_f32_tq3_0_block((const float *)cxi, (block_tq3_0 *)cdsti);
}

template<typename src_t, typename dst_t>
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
Expand Down
4 changes: 3 additions & 1 deletion ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4622,6 +4622,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_BF16:
case GGML_TYPE_TQ3_0:
return true;
default:
return false;
Expand Down Expand Up @@ -4656,7 +4657,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL ||
op->type == GGML_TYPE_TQ3_0) &&
op->src[0]->type == GGML_TYPE_F32 &&
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
} break;
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-cuda/mmvq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
case GGML_TYPE_TQ3_0: return vec_dot_tq3_0_q8_1;
default: return nullptr;
}
}
Expand All @@ -57,6 +58,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
case GGML_TYPE_TQ3_0: return VDR_TQ3_0_Q8_1_MMVQ;
default: return 1;
}
}
Expand Down Expand Up @@ -645,6 +647,12 @@ static void mul_mat_vec_q_switch_type(
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_TQ3_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_TQ3_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
default:
GGML_ABORT("fatal error");
break;
Expand Down
10 changes: 10 additions & 0 deletions ggml/src/ggml-cuda/set-rows.cu
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,16 @@ static void set_rows_cuda(ggml_backend_cuda_context & ctx, const ggml_tensor * s
nb1, nb2, nb3,
stream
);
} else if (dst->type == GGML_TYPE_TQ3_0) {
set_rows_cuda_quant<idx_t, block_tq3_0, QK_TQ3_0, quantize_f32_tq3_0_block>(
src0_d, src1_d, (block_tq3_0*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else {
GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
}
Expand Down
Loading