From 3566f10cbe305817a4c8ef4f185ee187af846033 Mon Sep 17 00:00:00 2001 From: Yang Date: Fri, 27 Mar 2026 15:51:21 +0800 Subject: [PATCH 01/10] add some description --- .gitignore | 1 + llmc/__main__.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/.gitignore b/.gitignore index 896b38a12..999eefe10 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ save* .log *.pid *.ipynb* +.venv/ \ No newline at end of file diff --git a/llmc/__main__.py b/llmc/__main__.py index ec60c1492..eff14f6a3 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -26,20 +26,27 @@ def main(config): + # 从注册表拿模型并实例化 + # 动态分配模型 model = MODEL_REGISTRY[config.model.type](config) + # 打印模型和tokenizer logger.info(f'model: {model}') logger.info(f'tokenizer: {model.get_tokenizer()}') + # 获得需要的评测种类 eval_list = get_eval_list(model, config) + # 真正执行评测 eval_model(model, None, eval_list, eval_pos='pretrain') blockwise_opts = [] + # 取出处理模态 modalities, modality_configs = get_modality(config) for modality, modality_config in zip(modalities, modality_configs): model.set_modality(modality) if not config.get('calib', False): + # 不需要校准数据 直接构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -51,14 +58,17 @@ def main(config): blockwise_opts.append(blockwise_opt) dist.barrier() else: + # 需要校准数据 dataset = BaseDataset( model.get_tokenizer(), config.calib, model.batch_process ) calib_data, padding_mask = dataset.get_calib_dataset() + # 收集第一层block输入 为后续blockwise算法需要的输入缓存下来 model.collect_first_block_input(calib_data, padding_mask) del calib_data gc.collect() torch.cuda.empty_cache() + # 构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -66,15 +76,20 @@ def main(config): model.get_padding_mask(), config, ) + # 项目逐层block做优化 blockwise_opt.run_block_loop() blockwise_opts.append(blockwise_opt) dist.barrier() + # 对变化后的浮点模型做评测 eval_model(model, blockwise_opts, eval_list, eval_pos='transformed') + # 只有rank 0继续做保存和导出 if int(os.environ['RANK']) == 0: + # 保存变换后的浮点模型 if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) + # 保存TensorRT-LLM格式并构建engine if 'save' in config and config.save.get('save_trtllm', False): blockwise_opt.save_model(save_trtllm_trans_path) from llmc.utils.export_trtllm import cvt_trtllm_engine @@ -88,12 +103,15 @@ def main(config): eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant') eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant_wo_kv') + # 切换到fake quant部署模式再保存 if 'save' in config and config.save.get('save_fake', False): deploy_all_modality(blockwise_opts, 'fake_quant') blockwise_opt.save_model(save_fake_path) if 'save' in config: + # 导出真实量化模型给推理后端 if ( + # 导出前进行遍历检查 config.save.get('save_vllm', False) or config.save.get('save_sgl', False) or config.save.get('save_lightllm', False) @@ -101,9 +119,12 @@ def main(config): for modality_config in modality_configs: w, a = modality_config.weight, modality_config.get('act') + # 只允许特定bit类型 if isinstance(w.bit, str): + # 必须对称量化 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in ['e4m3', 'e3m4'], 'Supported quant: w8a16.' + # 有激活量化的话,那激活也要满足对称、bit合法的要求 if a: assert ( w.symmetric and a.symmetric @@ -114,6 +135,7 @@ def main(config): and a.bit in ['e4m3', 'e5m2'] ), 'Only WA FP8 quant is supported' else: + # 是整数则必须是4 or 8 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' if a: @@ -130,12 +152,15 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) + # 给特定后端(AutoAWQ导出 elif config.save.get('save_autoawq', False): for modality_config in modality_configs: + # 只能4 bit 仅含有weight 不支持act assert ( modality_config.weight.bit in [4] and 'act' not in modality_config ), 'AutoAWQ supports only 4-bit weight-only quantization.' assert ( + # 不能对称量化 not modality_config.weight.symmetric ), 'Only asymmetric quant is supported.' @@ -161,11 +186,15 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_lightx2v_quant_config(save_quant_path) + # 判断是否有opencompass if 'opencompass' in config: assert config.save.get('save_trans', False) + # 从配置里读取cfg_path, output_path cfg_path = config['opencompass']['cfg_path'] output_path = config['opencompass']['output_path'] + # 取路径 eval_model_path = os.path.abspath(save_trans_path) + # 拼指令 opencompass_cmd = ( f'opencompass {cfg_path} -w {output_path} ' f'--llmc_cfg {args.config} ' @@ -173,6 +202,7 @@ def main(config): f'--llmc_model_path {eval_model_path}' ) logger.info(f'opencompass_cmd : {opencompass_cmd}') + # 执行 os.system(opencompass_cmd) dist.barrier() @@ -181,20 +211,25 @@ def main(config): logger.add(sys.stdout, level='INFO') llmc_start_time = time.time() parser = argparse.ArgumentParser() + # 解析命令行参数 parser.add_argument('--config', type=str, required=True) parser.add_argument('--task_id', type=str, required=True) args = parser.parse_args() with open(args.config, 'r') as file: + # 读取配置文件 config = yaml.safe_load(file) config = EasyDict(config) init_process_group(backend='nccl') + # 初始化分布式环境 设置GPU torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) + # 检查配置 打印依赖版本 if int(os.environ['RANK']) != 0: logger.remove() + # 检查配置是否合法 check_config(config) logger.info(f'args: {args}') @@ -266,3 +301,4 @@ def main(config): llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') logger.info('--- llmc finished ---') + \ No newline at end of file From 626ec9bfbcc9545f071a7117897059363e78f79b Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 14:18:27 +0800 Subject: [PATCH 02/10] fix the bug --- llmc/compression/quantization/base_blockwise_quantization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 5a2232699..69efb9d76 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -203,8 +203,8 @@ def set_quant_config(self): kv_special_cfg = self.quant_config['kvcache'].get('special', {}) act_static_cfg = {} if self.act_static: - act_static_cfg.update(self.config.calib.n_sample) - act_static_cfg.update(self.config.calib.bs) + act_static_cfg['n_sample'] = self.config.calib.n_sample + act_static_cfg['bs'] = self.config.calib.bs kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant') self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']]( kv_quant_type, self.quant_config['kvcache'], From e0117aaf98cb4ce997d5c68d9ed994e68c755fb4 Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 15:39:16 +0800 Subject: [PATCH 03/10] Add the calib export and fix some problems --- .gitignore | 3 +- ...tn_w_a_pertensor_static_naive_quant_kv.yml | 15 ++- llmc/__main__.py | 24 +++- .../base_blockwise_quantization.py | 119 +++++++++++++++++- llmc/compression/quantization/kvquant.py | 9 ++ 5 files changed, 158 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 999eefe10..bf707424f 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ save* .log *.pid *.ipynb* -.venv/ \ No newline at end of file +.venv/ +*.sh \ No newline at end of file diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index f2bbda675..f10ff95e7 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -1,24 +1,25 @@ base: seed: &seed 42 model: - type: model_type - path: model path + type: Qwen3 + path: /home/michael/Project/models/Qwen3-0.6B torch_dtype: auto calib: name: pileval download: False - path: calib data path + path: /home/michael/Project/calib/pileval + n_sample: 128 n_samples: 128 bs: 1 seq_len: 2048 preproc: txt_general_preproc seed: *seed eval: - eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos + eval_pos: [] #long_ppl eval not support pretrain eval pos name: wikitext2 type: decode_ppl download: False - path: eval_data_path + path: /home/michael/Project/llmc_datasets/wikitext2 bs: 1 inference_per_block: False num_samples: 10 @@ -41,5 +42,7 @@ quant: symmetric: True granularity: per_tensor save: + save_calib_json: True + calib_json_name: kv_cache_calib.json save_fake: False - save_path: /path/to/save/ + save_path: /home/michael/Project/llmc_save diff --git a/llmc/__main__.py b/llmc/__main__.py index eff14f6a3..44e0232ef 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -85,6 +85,22 @@ def main(config): eval_model(model, blockwise_opts, eval_list, eval_pos='transformed') # 只有rank 0继续做保存和导出 if int(os.environ['RANK']) == 0: + if 'save' in config and config.save.get('save_calib_json', False): + # 收集各个模态/量化器导出的校准结果。 + calib_json_list = [ + blockwise_opt.collect_calib_json() + for blockwise_opt in blockwise_opts + if hasattr(blockwise_opt, 'collect_calib_json') + ] + # 单模态时保持扁平结构,兼容 LightLLM 的校准文件格式。 + calib_json_payload = ( + calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list + ) + # 将最终的校准 JSON 写入配置指定的输出路径。 + with open(save_calib_json_path, 'w') as file: + json.dump(calib_json_payload, file, ensure_ascii=False, indent=4) + logger.info(f'save calib json done -- {save_calib_json_path}') + # 保存变换后的浮点模型 if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) @@ -244,6 +260,12 @@ def main(config): # Ensure only the main process creates directories if int(os.environ['RANK']) == 0: if 'save' in config: + if config.save.get('save_calib_json', False): + mkdirs(config.save.save_path) + save_calib_json_path = os.path.join( + config.save.save_path, + config.save.get('calib_json_name', 'calib_scales.json'), + ) if config.save.get('save_trans', False): save_trans_path = os.path.join( config.save.save_path, 'transformed_model' @@ -301,4 +323,4 @@ def main(config): llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') logger.info('--- llmc finished ---') - \ No newline at end of file + diff --git a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 69efb9d76..3240fda7e 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -175,13 +175,17 @@ def set_quant_config(self): self.act_quant_module = IntegerQuantizer elif quant_type == 'float-quant': self.act_quant_module = FloatQuantizer - self.quant_config['act']['tp'] = self.tp - self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.act_static = self.quant_config['act'].get('static', False) if self.act_static: assert ( self.quant_config['act']['granularity'] == 'per_tensor' ), 'Only support per_tensor static quant' + # 静态激活量化会走批量校准接口,这里把默认的 minmax + # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax': + self.quant_config['act']['calib_algo'] = 'static_minmax' + self.quant_config['act']['tp'] = self.tp + self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.quant_attn = self.quant_config['act'].get('quant_attn', False) if self.quant_attn: assert self.config['model']['type'] in ['Vit', 'DeepseekV2'] @@ -203,8 +207,10 @@ def set_quant_config(self): kv_special_cfg = self.quant_config['kvcache'].get('special', {}) act_static_cfg = {} if self.act_static: - act_static_cfg['n_sample'] = self.config.calib.n_sample - act_static_cfg['bs'] = self.config.calib.bs + # KV cache 构造函数接收的是 num_samples / bsz, + # 这里把校准配置里的字段名映射成它实际需要的参数名。 + act_static_cfg['num_samples'] = self.config.calib.n_sample + act_static_cfg['bsz'] = self.config.calib.bs kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant') self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']]( kv_quant_type, self.quant_config['kvcache'], @@ -1003,6 +1009,111 @@ def contiguous_params(self): if not param.is_contiguous(): param.data = param.data.contiguous() + # 将张量等对象转换成 JSON 可直接写出的 Python 基础类型。 + def _to_jsonable(self, value): + if isinstance(value, torch.Tensor): + return value.detach().cpu().tolist() + return value + + # 统一把输入规整成 CPU tensor,便于后续做范围计算和序列化。 + def _to_tensor(self, value, dtype=torch.float32): + if isinstance(value, torch.Tensor): + return value.detach().cpu().to(dtype) + return torch.as_tensor(value, dtype=dtype) + + # LightLLM 需要的是离线 FP8 KV 的 descale,这里先根据 qparams 还原实数范围, + # 再换算成与 torch.float8_e4m3fn 对齐的每层 K/V scale。 + def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax): + if isinstance(scales, torch.Tensor) and scales.numel() == 0: + return None + + scales_tensor = self._to_tensor(scales) + zeros_tensor = self._to_tensor(zeros, dtype=scales_tensor.dtype) + qmin_tensor = self._to_tensor(qmin, dtype=scales_tensor.dtype) + qmax_tensor = self._to_tensor(qmax, dtype=scales_tensor.dtype) + min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor + max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor + absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs()) + fp8_qmax = torch.tensor( + torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype + ) + return absmax_tensor / fp8_qmax + + # 按 LightLLM 的 kv_cache_calib.json 结构导出校准结果, + # 目前只支持它已经接入的 per_tensor / per_head 两种 KV 格式。 + def collect_calib_json(self): + if not getattr(self, 'quant_kvcache', False): + raise ValueError('save_calib_json requires kvcache quantization.') + + kv_cfg = self.quant_config['kvcache'] + granularity = kv_cfg.get('granularity') + # LightLLM 当前只识别 per_tensor 和 per_head 两种静态 KV 校准文件。 + if granularity not in ['per_tensor', 'per_head']: + raise ValueError( + f'LightLLM calib export only supports per_tensor/per_head, got {granularity}' + ) + + num_layers = self.model.model_config.num_hidden_layers + # LightLLM 会校验 KV head 数;如果模型配置里没有这个字段,再退回总 head 数。 + num_head = int( + getattr( + self.model.model_config, + 'num_key_value_heads', + self.model.get_num_attention_heads(), + ) + ) + scales = [] + # 每层导出一行,顺序固定为 [k_scale..., v_scale...]。 + for layer_idx in range(num_layers): + key_scale = self._collect_lightllm_kv_scale( + self.kv_module.k_scales_buffer[layer_idx], + self.kv_module.k_zeros_buffer[layer_idx], + self.kv_module.k_qmin_buffer[layer_idx], + self.kv_module.k_qmax_buffer[layer_idx], + ) + value_scale = self._collect_lightllm_kv_scale( + self.kv_module.v_scales_buffer[layer_idx], + self.kv_module.v_zeros_buffer[layer_idx], + self.kv_module.v_qmin_buffer[layer_idx], + self.kv_module.v_qmax_buffer[layer_idx], + ) + if key_scale is None or value_scale is None: + raise ValueError(f'Calibration scale for layer {layer_idx} is empty.') + + scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist() + scales.append(scale_row) + + scale_width = len(scales[0]) if scales else 0 + # per_tensor 每层只能有 [k_scale, v_scale] 两个值; + # per_head 则需要每层 2 * num_head 个值。 + if granularity == 'per_tensor' and scale_width != 2: + raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}') + if granularity == 'per_head' and scale_width != num_head * 2: + raise ValueError( + f'per_head export expects {num_head * 2} scales per layer, got {scale_width}' + ) + + # 优先复用 Hugging Face config 里的 architectures 字段, + # 缺失时退回到 LLMC 配置里的模型类型,便于 LightLLM 做架构一致性校验。 + architectures = getattr(self.model.model_config, 'architectures', None) + if isinstance(architectures, list) and len(architectures) > 0: + architectures = architectures[0] + elif architectures is None: + architectures = self.config.model.type + + # 顶层字段名称和含义对齐 LightLLM PR #1220 中的 kv_cache_calib.json。 + return { + 'version': '1.0', + 'architectures': architectures, + 'quant_type': granularity, + 'qmin': float(torch.finfo(torch.float8_e4m3fn).min), + 'qmax': float(torch.finfo(torch.float8_e4m3fn).max), + 'num_layers': num_layers, + 'num_head': num_head, + 'scales_shape': [num_layers, scale_width], + 'scales': scales, + } + @torch.no_grad() def save_model(self, path): if int(os.environ['RANK']) != 0: diff --git a/llmc/compression/quantization/kvquant.py b/llmc/compression/quantization/kvquant.py index 32c2de5be..6cbe75ffb 100644 --- a/llmc/compression/quantization/kvquant.py +++ b/llmc/compression/quantization/kvquant.py @@ -1,3 +1,4 @@ +import copy import torch from loguru import logger from transformers import DynamicCache @@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache): def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1): super().__init__() + # 复制一份配置,避免在静态 KV 校准场景下修改原始量化配置对象。 + kvquant_cfg = copy.deepcopy(kvquant_cfg) assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group'] self.num_hidden_layers, self.num_samples, self.bsz = ( num_hidden_layers, num_samples, bsz, ) + if kvquant_cfg.get('static', False) and kvquant_cfg.get( + 'calib_algo', 'minmax' + ) == 'minmax': + # 静态 KV 校准会走批量张量统计接口,这里把默认的 minmax + # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + kvquant_cfg['calib_algo'] = 'static_minmax' if quant_type == 'int-quant': self.kvquantizer = IntegerQuantizer(**kvquant_cfg) elif quant_type == 'float-quant': From cac2d91bfef4df262994af2eb2e237e6dfdaff8b Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 16:42:00 +0800 Subject: [PATCH 04/10] remove the n_sample and repair the fixing errors --- .../methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml | 1 - llmc/compression/quantization/base_blockwise_quantization.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index f10ff95e7..097316f5f 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -8,7 +8,6 @@ calib: name: pileval download: False path: /home/michael/Project/calib/pileval - n_sample: 128 n_samples: 128 bs: 1 seq_len: 2048 diff --git a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 3240fda7e..224f40c26 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -209,7 +209,7 @@ def set_quant_config(self): if self.act_static: # KV cache 构造函数接收的是 num_samples / bsz, # 这里把校准配置里的字段名映射成它实际需要的参数名。 - act_static_cfg['num_samples'] = self.config.calib.n_sample + act_static_cfg['num_samples'] = self.config.calib.n_samples act_static_cfg['bsz'] = self.config.calib.bs kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant') self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']]( From cecf46cf39e0452febe0404d764a989105246877 Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 16:51:37 +0800 Subject: [PATCH 05/10] Make the Project more moduled --- .../base_blockwise_quantization.py | 73 +------------------ llmc/utils/__init__.py | 1 + llmc/utils/export_calib.py | 67 +++++++++++++++++ 3 files changed, 70 insertions(+), 71 deletions(-) create mode 100644 llmc/utils/export_calib.py diff --git a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 224f40c26..9a8e7338c 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -12,6 +12,7 @@ import torch.nn as nn from loguru import logger +from llmc.utils.export_calib import collect_lightllm_kv_calib_json from llmc.utils.registry_factory import KV_REGISTRY, TOKEN_REDUCTION_REGISTRY from ..blockwise_optimization import BlockwiseOpt @@ -1042,77 +1043,7 @@ def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax): # 按 LightLLM 的 kv_cache_calib.json 结构导出校准结果, # 目前只支持它已经接入的 per_tensor / per_head 两种 KV 格式。 def collect_calib_json(self): - if not getattr(self, 'quant_kvcache', False): - raise ValueError('save_calib_json requires kvcache quantization.') - - kv_cfg = self.quant_config['kvcache'] - granularity = kv_cfg.get('granularity') - # LightLLM 当前只识别 per_tensor 和 per_head 两种静态 KV 校准文件。 - if granularity not in ['per_tensor', 'per_head']: - raise ValueError( - f'LightLLM calib export only supports per_tensor/per_head, got {granularity}' - ) - - num_layers = self.model.model_config.num_hidden_layers - # LightLLM 会校验 KV head 数;如果模型配置里没有这个字段,再退回总 head 数。 - num_head = int( - getattr( - self.model.model_config, - 'num_key_value_heads', - self.model.get_num_attention_heads(), - ) - ) - scales = [] - # 每层导出一行,顺序固定为 [k_scale..., v_scale...]。 - for layer_idx in range(num_layers): - key_scale = self._collect_lightllm_kv_scale( - self.kv_module.k_scales_buffer[layer_idx], - self.kv_module.k_zeros_buffer[layer_idx], - self.kv_module.k_qmin_buffer[layer_idx], - self.kv_module.k_qmax_buffer[layer_idx], - ) - value_scale = self._collect_lightllm_kv_scale( - self.kv_module.v_scales_buffer[layer_idx], - self.kv_module.v_zeros_buffer[layer_idx], - self.kv_module.v_qmin_buffer[layer_idx], - self.kv_module.v_qmax_buffer[layer_idx], - ) - if key_scale is None or value_scale is None: - raise ValueError(f'Calibration scale for layer {layer_idx} is empty.') - - scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist() - scales.append(scale_row) - - scale_width = len(scales[0]) if scales else 0 - # per_tensor 每层只能有 [k_scale, v_scale] 两个值; - # per_head 则需要每层 2 * num_head 个值。 - if granularity == 'per_tensor' and scale_width != 2: - raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}') - if granularity == 'per_head' and scale_width != num_head * 2: - raise ValueError( - f'per_head export expects {num_head * 2} scales per layer, got {scale_width}' - ) - - # 优先复用 Hugging Face config 里的 architectures 字段, - # 缺失时退回到 LLMC 配置里的模型类型,便于 LightLLM 做架构一致性校验。 - architectures = getattr(self.model.model_config, 'architectures', None) - if isinstance(architectures, list) and len(architectures) > 0: - architectures = architectures[0] - elif architectures is None: - architectures = self.config.model.type - - # 顶层字段名称和含义对齐 LightLLM PR #1220 中的 kv_cache_calib.json。 - return { - 'version': '1.0', - 'architectures': architectures, - 'quant_type': granularity, - 'qmin': float(torch.finfo(torch.float8_e4m3fn).min), - 'qmax': float(torch.finfo(torch.float8_e4m3fn).max), - 'num_layers': num_layers, - 'num_head': num_head, - 'scales_shape': [num_layers, scale_width], - 'scales': scales, - } + return collect_lightllm_kv_calib_json(self) @torch.no_grad() def save_model(self, path): diff --git a/llmc/utils/__init__.py b/llmc/utils/__init__.py index aa5fefa1a..683f1ad8f 100755 --- a/llmc/utils/__init__.py +++ b/llmc/utils/__init__.py @@ -1,4 +1,5 @@ from .export_autoawq import update_autoawq_quant_config +from .export_calib import collect_lightllm_kv_calib_json from .export_lightx2v import update_lightx2v_quant_config from .export_vllm import update_vllm_quant_config from .utils import (check_config, copy_files, deploy_all_modality, diff --git a/llmc/utils/export_calib.py b/llmc/utils/export_calib.py new file mode 100644 index 000000000..5c9c49887 --- /dev/null +++ b/llmc/utils/export_calib.py @@ -0,0 +1,67 @@ +import torch + + +def collect_lightllm_kv_calib_json(blockwise_opt): + if not getattr(blockwise_opt, 'quant_kvcache', False): + raise ValueError('save_calib_json requires kvcache quantization.') + + kv_cfg = blockwise_opt.quant_config['kvcache'] + granularity = kv_cfg.get('granularity') + if granularity not in ['per_tensor', 'per_head']: + raise ValueError( + f'LightLLM calib export only supports per_tensor/per_head, got {granularity}' + ) + + num_layers = blockwise_opt.model.model_config.num_hidden_layers + num_head = int( + getattr( + blockwise_opt.model.model_config, + 'num_key_value_heads', + blockwise_opt.model.get_num_attention_heads(), + ) + ) + scales = [] + for layer_idx in range(num_layers): + key_scale = blockwise_opt._collect_lightllm_kv_scale( + blockwise_opt.kv_module.k_scales_buffer[layer_idx], + blockwise_opt.kv_module.k_zeros_buffer[layer_idx], + blockwise_opt.kv_module.k_qmin_buffer[layer_idx], + blockwise_opt.kv_module.k_qmax_buffer[layer_idx], + ) + value_scale = blockwise_opt._collect_lightllm_kv_scale( + blockwise_opt.kv_module.v_scales_buffer[layer_idx], + blockwise_opt.kv_module.v_zeros_buffer[layer_idx], + blockwise_opt.kv_module.v_qmin_buffer[layer_idx], + blockwise_opt.kv_module.v_qmax_buffer[layer_idx], + ) + if key_scale is None or value_scale is None: + raise ValueError(f'Calibration scale for layer {layer_idx} is empty.') + + scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist() + scales.append(scale_row) + + scale_width = len(scales[0]) if scales else 0 + if granularity == 'per_tensor' and scale_width != 2: + raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}') + if granularity == 'per_head' and scale_width != num_head * 2: + raise ValueError( + f'per_head export expects {num_head * 2} scales per layer, got {scale_width}' + ) + + architectures = getattr(blockwise_opt.model.model_config, 'architectures', None) + if isinstance(architectures, list) and len(architectures) > 0: + architectures = architectures[0] + elif architectures is None: + architectures = blockwise_opt.config.model.type + + return { + 'version': '1.0', + 'architectures': architectures, + 'quant_type': granularity, + 'qmin': float(torch.finfo(torch.float8_e4m3fn).min), + 'qmax': float(torch.finfo(torch.float8_e4m3fn).max), + 'num_layers': num_layers, + 'num_head': num_head, + 'scales_shape': [num_layers, scale_width], + 'scales': scales, + } From 3fa3e9de02e4c73814491b7b397f5aea3e8ac7c2 Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 17:01:31 +0800 Subject: [PATCH 06/10] Rename the config to make the target explicit --- .../rtn_w_a_pertensor_static_naive_quant_kv.yml | 4 ++-- llmc/__main__.py | 16 ++++++++++------ llmc/utils/export_calib.py | 4 +++- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index 097316f5f..c247b492b 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -41,7 +41,7 @@ quant: symmetric: True granularity: per_tensor save: - save_calib_json: True - calib_json_name: kv_cache_calib.json + save_lightllm_kv_cache_calib: True + lightllm_kv_cache_calib_name: kv_cache_calib.json save_fake: False save_path: /home/michael/Project/llmc_save diff --git a/llmc/__main__.py b/llmc/__main__.py index 44e0232ef..bc429c335 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -85,7 +85,7 @@ def main(config): eval_model(model, blockwise_opts, eval_list, eval_pos='transformed') # 只有rank 0继续做保存和导出 if int(os.environ['RANK']) == 0: - if 'save' in config and config.save.get('save_calib_json', False): + if 'save' in config and config.save.get('save_lightllm_kv_cache_calib', False): # 收集各个模态/量化器导出的校准结果。 calib_json_list = [ blockwise_opt.collect_calib_json() @@ -97,9 +97,11 @@ def main(config): calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list ) # 将最终的校准 JSON 写入配置指定的输出路径。 - with open(save_calib_json_path, 'w') as file: + with open(save_lightllm_kv_cache_calib_path, 'w') as file: json.dump(calib_json_payload, file, ensure_ascii=False, indent=4) - logger.info(f'save calib json done -- {save_calib_json_path}') + logger.info( + f'save lightllm kv cache calib done -- {save_lightllm_kv_cache_calib_path}' + ) # 保存变换后的浮点模型 if 'save' in config and config.save.get('save_trans', False): @@ -260,11 +262,13 @@ def main(config): # Ensure only the main process creates directories if int(os.environ['RANK']) == 0: if 'save' in config: - if config.save.get('save_calib_json', False): + if config.save.get('save_lightllm_kv_cache_calib', False): mkdirs(config.save.save_path) - save_calib_json_path = os.path.join( + save_lightllm_kv_cache_calib_path = os.path.join( config.save.save_path, - config.save.get('calib_json_name', 'calib_scales.json'), + config.save.get( + 'lightllm_kv_cache_calib_name', 'kv_cache_calib.json' + ), ) if config.save.get('save_trans', False): save_trans_path = os.path.join( diff --git a/llmc/utils/export_calib.py b/llmc/utils/export_calib.py index 5c9c49887..dcf9698bf 100644 --- a/llmc/utils/export_calib.py +++ b/llmc/utils/export_calib.py @@ -3,7 +3,9 @@ def collect_lightllm_kv_calib_json(blockwise_opt): if not getattr(blockwise_opt, 'quant_kvcache', False): - raise ValueError('save_calib_json requires kvcache quantization.') + raise ValueError( + 'save_lightllm_kv_cache_calib requires kvcache quantization.' + ) kv_cfg = blockwise_opt.quant_config['kvcache'] granularity = kv_cfg.get('granularity') From 3911c49d356893fe69687e852e1fad827b675002 Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 17:11:20 +0800 Subject: [PATCH 07/10] modify the comment and transform the chinese to English --- llmc/__main__.py | 38 ------------------- .../base_blockwise_quantization.py | 25 +++++++----- 2 files changed, 15 insertions(+), 48 deletions(-) diff --git a/llmc/__main__.py b/llmc/__main__.py index bc429c335..c211ec35a 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -26,27 +26,20 @@ def main(config): - # 从注册表拿模型并实例化 - # 动态分配模型 model = MODEL_REGISTRY[config.model.type](config) - # 打印模型和tokenizer logger.info(f'model: {model}') logger.info(f'tokenizer: {model.get_tokenizer()}') - # 获得需要的评测种类 eval_list = get_eval_list(model, config) - # 真正执行评测 eval_model(model, None, eval_list, eval_pos='pretrain') blockwise_opts = [] - # 取出处理模态 modalities, modality_configs = get_modality(config) for modality, modality_config in zip(modalities, modality_configs): model.set_modality(modality) if not config.get('calib', False): - # 不需要校准数据 直接构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -58,17 +51,14 @@ def main(config): blockwise_opts.append(blockwise_opt) dist.barrier() else: - # 需要校准数据 dataset = BaseDataset( model.get_tokenizer(), config.calib, model.batch_process ) calib_data, padding_mask = dataset.get_calib_dataset() - # 收集第一层block输入 为后续blockwise算法需要的输入缓存下来 model.collect_first_block_input(calib_data, padding_mask) del calib_data gc.collect() torch.cuda.empty_cache() - # 构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -76,38 +66,30 @@ def main(config): model.get_padding_mask(), config, ) - # 项目逐层block做优化 blockwise_opt.run_block_loop() blockwise_opts.append(blockwise_opt) dist.barrier() - # 对变化后的浮点模型做评测 eval_model(model, blockwise_opts, eval_list, eval_pos='transformed') - # 只有rank 0继续做保存和导出 if int(os.environ['RANK']) == 0: if 'save' in config and config.save.get('save_lightllm_kv_cache_calib', False): - # 收集各个模态/量化器导出的校准结果。 calib_json_list = [ blockwise_opt.collect_calib_json() for blockwise_opt in blockwise_opts if hasattr(blockwise_opt, 'collect_calib_json') ] - # 单模态时保持扁平结构,兼容 LightLLM 的校准文件格式。 calib_json_payload = ( calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list ) - # 将最终的校准 JSON 写入配置指定的输出路径。 with open(save_lightllm_kv_cache_calib_path, 'w') as file: json.dump(calib_json_payload, file, ensure_ascii=False, indent=4) logger.info( f'save lightllm kv cache calib done -- {save_lightllm_kv_cache_calib_path}' ) - # 保存变换后的浮点模型 if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) - # 保存TensorRT-LLM格式并构建engine if 'save' in config and config.save.get('save_trtllm', False): blockwise_opt.save_model(save_trtllm_trans_path) from llmc.utils.export_trtllm import cvt_trtllm_engine @@ -121,15 +103,12 @@ def main(config): eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant') eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant_wo_kv') - # 切换到fake quant部署模式再保存 if 'save' in config and config.save.get('save_fake', False): deploy_all_modality(blockwise_opts, 'fake_quant') blockwise_opt.save_model(save_fake_path) if 'save' in config: - # 导出真实量化模型给推理后端 if ( - # 导出前进行遍历检查 config.save.get('save_vllm', False) or config.save.get('save_sgl', False) or config.save.get('save_lightllm', False) @@ -137,12 +116,9 @@ def main(config): for modality_config in modality_configs: w, a = modality_config.weight, modality_config.get('act') - # 只允许特定bit类型 if isinstance(w.bit, str): - # 必须对称量化 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in ['e4m3', 'e3m4'], 'Supported quant: w8a16.' - # 有激活量化的话,那激活也要满足对称、bit合法的要求 if a: assert ( w.symmetric and a.symmetric @@ -153,7 +129,6 @@ def main(config): and a.bit in ['e4m3', 'e5m2'] ), 'Only WA FP8 quant is supported' else: - # 是整数则必须是4 or 8 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' if a: @@ -170,15 +145,12 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) - # 给特定后端(AutoAWQ导出 elif config.save.get('save_autoawq', False): for modality_config in modality_configs: - # 只能4 bit 仅含有weight 不支持act assert ( modality_config.weight.bit in [4] and 'act' not in modality_config ), 'AutoAWQ supports only 4-bit weight-only quantization.' assert ( - # 不能对称量化 not modality_config.weight.symmetric ), 'Only asymmetric quant is supported.' @@ -204,15 +176,11 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_lightx2v_quant_config(save_quant_path) - # 判断是否有opencompass if 'opencompass' in config: assert config.save.get('save_trans', False) - # 从配置里读取cfg_path, output_path cfg_path = config['opencompass']['cfg_path'] output_path = config['opencompass']['output_path'] - # 取路径 eval_model_path = os.path.abspath(save_trans_path) - # 拼指令 opencompass_cmd = ( f'opencompass {cfg_path} -w {output_path} ' f'--llmc_cfg {args.config} ' @@ -220,7 +188,6 @@ def main(config): f'--llmc_model_path {eval_model_path}' ) logger.info(f'opencompass_cmd : {opencompass_cmd}') - # 执行 os.system(opencompass_cmd) dist.barrier() @@ -229,25 +196,20 @@ def main(config): logger.add(sys.stdout, level='INFO') llmc_start_time = time.time() parser = argparse.ArgumentParser() - # 解析命令行参数 parser.add_argument('--config', type=str, required=True) parser.add_argument('--task_id', type=str, required=True) args = parser.parse_args() with open(args.config, 'r') as file: - # 读取配置文件 config = yaml.safe_load(file) config = EasyDict(config) init_process_group(backend='nccl') - # 初始化分布式环境 设置GPU torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) - # 检查配置 打印依赖版本 if int(os.environ['RANK']) != 0: logger.remove() - # 检查配置是否合法 check_config(config) logger.info(f'args: {args}') diff --git a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 9a8e7338c..8574ef197 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -181,8 +181,9 @@ def set_quant_config(self): assert ( self.quant_config['act']['granularity'] == 'per_tensor' ), 'Only support per_tensor static quant' - # 静态激活量化会走批量校准接口,这里把默认的 minmax - # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + # Static activation quantization uses the batched calibration + # path, so normalize the default minmax setting to + # static_minmax to match the downstream calibration logic. if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax': self.quant_config['act']['calib_algo'] = 'static_minmax' self.quant_config['act']['tp'] = self.tp @@ -208,8 +209,8 @@ def set_quant_config(self): kv_special_cfg = self.quant_config['kvcache'].get('special', {}) act_static_cfg = {} if self.act_static: - # KV cache 构造函数接收的是 num_samples / bsz, - # 这里把校准配置里的字段名映射成它实际需要的参数名。 + # The KV cache constructor expects num_samples / bsz, so map + # the calibration config fields to the parameter names it uses. act_static_cfg['num_samples'] = self.config.calib.n_samples act_static_cfg['bsz'] = self.config.calib.bs kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant') @@ -1010,20 +1011,23 @@ def contiguous_params(self): if not param.is_contiguous(): param.data = param.data.contiguous() - # 将张量等对象转换成 JSON 可直接写出的 Python 基础类型。 + # Convert tensors and similar objects into Python values that can be + # directly serialized into JSON. def _to_jsonable(self, value): if isinstance(value, torch.Tensor): return value.detach().cpu().tolist() return value - # 统一把输入规整成 CPU tensor,便于后续做范围计算和序列化。 + # Normalize inputs into CPU tensors so the following range computation + # and serialization logic can handle them consistently. def _to_tensor(self, value, dtype=torch.float32): if isinstance(value, torch.Tensor): return value.detach().cpu().to(dtype) return torch.as_tensor(value, dtype=dtype) - # LightLLM 需要的是离线 FP8 KV 的 descale,这里先根据 qparams 还原实数范围, - # 再换算成与 torch.float8_e4m3fn 对齐的每层 K/V scale。 + # LightLLM expects offline FP8 KV descales. Recover the real-value range + # from the qparams first, then convert it into per-layer K/V scales that + # align with torch.float8_e4m3fn. def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax): if isinstance(scales, torch.Tensor) and scales.numel() == 0: return None @@ -1040,8 +1044,9 @@ def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax): ) return absmax_tensor / fp8_qmax - # 按 LightLLM 的 kv_cache_calib.json 结构导出校准结果, - # 目前只支持它已经接入的 per_tensor / per_head 两种 KV 格式。 + # Export calibration results in the LightLLM kv_cache_calib.json format. + # At the moment, only the per_tensor and per_head KV formats supported by + # LightLLM are handled here. def collect_calib_json(self): return collect_lightllm_kv_calib_json(self) From 5589e8296a0b03fe30ff11921afba0e3c5b34cf3 Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 17:18:30 +0800 Subject: [PATCH 08/10] remove the hardcode --- .../rtn_w_a_pertensor_static_naive_quant_kv.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index c247b492b..eb08a4eca 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -1,24 +1,24 @@ base: seed: &seed 42 model: - type: Qwen3 - path: /home/michael/Project/models/Qwen3-0.6B + type: model_type + path: model path torch_dtype: auto calib: name: pileval download: False - path: /home/michael/Project/calib/pileval + path: calib data path n_samples: 128 bs: 1 seq_len: 2048 preproc: txt_general_preproc seed: *seed eval: - eval_pos: [] #long_ppl eval not support pretrain eval pos + eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos name: wikitext2 type: decode_ppl download: False - path: /home/michael/Project/llmc_datasets/wikitext2 + path: eval_data_path bs: 1 inference_per_block: False num_samples: 10 @@ -41,7 +41,7 @@ quant: symmetric: True granularity: per_tensor save: - save_lightllm_kv_cache_calib: True - lightllm_kv_cache_calib_name: kv_cache_calib.json + save_lightllm_kv_calib: True + lightllm_kv_cache_name: kv_cache_calib.json save_fake: False - save_path: /home/michael/Project/llmc_save + save_path: /path/to/save/ \ No newline at end of file From 0eaa3e56545f2b5b5e0b857d1095e7fa7c57572e Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 19:55:22 +0800 Subject: [PATCH 09/10] Add the per head quant --- .../rtn_w_a_perhead_static_naive_quant_kv.yml | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 configs/quantization/methods/KVQuant/rtn_w_a_perhead_static_naive_quant_kv.yml diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_perhead_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_perhead_static_naive_quant_kv.yml new file mode 100644 index 000000000..0886123b9 --- /dev/null +++ b/configs/quantization/methods/KVQuant/rtn_w_a_perhead_static_naive_quant_kv.yml @@ -0,0 +1,48 @@ +base: + seed: &seed 42 +model: + type: model_type + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: txt_general_preproc + seed: *seed +eval: + eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos + name: wikitext2 + type: decode_ppl + download: False + path: eval_data_path + bs: 1 + inference_per_block: False + num_samples: 10 + # num_eval_tokens: 3 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_tensor + static: True + kvcache: + method: Naive + bit: 8 + symmetric: True + granularity: per_head + head_num: kv head num +save: + save_lightllm_kv_calib: True + lightllm_kv_cache_name: kv_cache_calib.json + save_fake: False + save_path: /path/to/save/ From 377ba64c9493b5ece126ee7e046376db11d857ae Mon Sep 17 00:00:00 2001 From: Yang Date: Sat, 28 Mar 2026 20:07:08 +0800 Subject: [PATCH 10/10] replace the chinese with english and add the per_head path --- llmc/compression/quantization/kvquant.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llmc/compression/quantization/kvquant.py b/llmc/compression/quantization/kvquant.py index 6cbe75ffb..29622297c 100644 --- a/llmc/compression/quantization/kvquant.py +++ b/llmc/compression/quantization/kvquant.py @@ -13,9 +13,9 @@ class NaiveQuantKVCache(DynamicCache): def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1): super().__init__() - # 复制一份配置,避免在静态 KV 校准场景下修改原始量化配置对象。 + # Copy the config to avoid mutating the original quantization config in static KV calibration. kvquant_cfg = copy.deepcopy(kvquant_cfg) - assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group'] + assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group', 'per_head'] self.num_hidden_layers, self.num_samples, self.bsz = ( num_hidden_layers, num_samples, @@ -24,8 +24,8 @@ def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, if kvquant_cfg.get('static', False) and kvquant_cfg.get( 'calib_algo', 'minmax' ) == 'minmax': - # 静态 KV 校准会走批量张量统计接口,这里把默认的 minmax - # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + # Static KV calibration uses the batched tensor statistics path, so convert the default + # minmax setting to static_minmax here to avoid a later calibration algo name mismatch. kvquant_cfg['calib_algo'] = 'static_minmax' if quant_type == 'int-quant': self.kvquantizer = IntegerQuantizer(**kvquant_cfg)