diff --git a/.gitignore b/.gitignore index 896b38a12..bf707424f 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,5 @@ save* .log *.pid *.ipynb* +.venv/ +*.sh \ No newline at end of file diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_perhead_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_perhead_static_naive_quant_kv.yml new file mode 100644 index 000000000..0886123b9 --- /dev/null +++ b/configs/quantization/methods/KVQuant/rtn_w_a_perhead_static_naive_quant_kv.yml @@ -0,0 +1,48 @@ +base: + seed: &seed 42 +model: + type: model_type + path: model path + torch_dtype: auto +calib: + name: pileval + download: False + path: calib data path + n_samples: 128 + bs: 1 + seq_len: 2048 + preproc: txt_general_preproc + seed: *seed +eval: + eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos + name: wikitext2 + type: decode_ppl + download: False + path: eval_data_path + bs: 1 + inference_per_block: False + num_samples: 10 + # num_eval_tokens: 3 +quant: + method: RTN + weight: + bit: 8 + symmetric: True + granularity: per_channel + group_size: -1 + act: + bit: 8 + symmetric: True + granularity: per_tensor + static: True + kvcache: + method: Naive + bit: 8 + symmetric: True + granularity: per_head + head_num: kv head num +save: + save_lightllm_kv_calib: True + lightllm_kv_cache_name: kv_cache_calib.json + save_fake: False + save_path: /path/to/save/ diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index f2bbda675..eb08a4eca 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -41,5 +41,7 @@ quant: symmetric: True granularity: per_tensor save: + save_lightllm_kv_calib: True + lightllm_kv_cache_name: kv_cache_calib.json save_fake: False - 
save_path: /path/to/save/ + save_path: /path/to/save/ \ No newline at end of file diff --git a/llmc/__main__.py b/llmc/__main__.py index ec60c1492..c211ec35a 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -72,6 +72,21 @@ def main(config): eval_model(model, blockwise_opts, eval_list, eval_pos='transformed') if int(os.environ['RANK']) == 0: + if 'save' in config and config.save.get('save_lightllm_kv_calib', False): + calib_json_list = [ + blockwise_opt.collect_calib_json() + for blockwise_opt in blockwise_opts + if hasattr(blockwise_opt, 'collect_calib_json') + ] + calib_json_payload = ( + calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list + ) + with open(save_lightllm_kv_cache_calib_path, 'w') as file: + json.dump(calib_json_payload, file, ensure_ascii=False, indent=4) + logger.info( + f'save lightllm kv cache calib done -- {save_lightllm_kv_cache_calib_path}' + ) + if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) @@ -209,6 +224,14 @@ def main(config): # Ensure only the main process creates directories if int(os.environ['RANK']) == 0: if 'save' in config: + if config.save.get('save_lightllm_kv_calib', False): + mkdirs(config.save.save_path) + save_lightllm_kv_cache_calib_path = os.path.join( + config.save.save_path, + config.save.get( + 'lightllm_kv_cache_name', 'kv_cache_calib.json' + ), + ) if config.save.get('save_trans', False): save_trans_path = os.path.join( config.save.save_path, 'transformed_model' ) @@ -266,3 +289,4 @@ def main(config): llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') logger.info('--- llmc finished ---') + diff --git a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 5a2232699..8574ef197 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ 
b/llmc/compression/quantization/base_blockwise_quantization.py @@ -12,6 +12,7 @@ import torch.nn as nn from loguru import logger +from llmc.utils.export_calib import collect_lightllm_kv_calib_json from llmc.utils.registry_factory import KV_REGISTRY, TOKEN_REDUCTION_REGISTRY from ..blockwise_optimization import BlockwiseOpt @@ -175,13 +176,18 @@ def set_quant_config(self): self.act_quant_module = IntegerQuantizer elif quant_type == 'float-quant': self.act_quant_module = FloatQuantizer - self.quant_config['act']['tp'] = self.tp - self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.act_static = self.quant_config['act'].get('static', False) if self.act_static: assert ( self.quant_config['act']['granularity'] == 'per_tensor' ), 'Only support per_tensor static quant' + # Static activation quantization uses the batched calibration + # path, so normalize the default minmax setting to + # static_minmax to match the downstream calibration logic. + if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax': + self.quant_config['act']['calib_algo'] = 'static_minmax' + self.quant_config['act']['tp'] = self.tp + self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.quant_attn = self.quant_config['act'].get('quant_attn', False) if self.quant_attn: assert self.config['model']['type'] in ['Vit', 'DeepseekV2'] @@ -203,8 +209,10 @@ def set_quant_config(self): kv_special_cfg = self.quant_config['kvcache'].get('special', {}) act_static_cfg = {} if self.act_static: - act_static_cfg.update(self.config.calib.n_sample) - act_static_cfg.update(self.config.calib.bs) + # The KV cache constructor expects num_samples / bsz, so map + # the calibration config fields to the parameter names it uses. 
+ act_static_cfg['num_samples'] = self.config.calib.n_samples + act_static_cfg['bsz'] = self.config.calib.bs kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant') self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']]( kv_quant_type, self.quant_config['kvcache'], @@ -1003,6 +1011,45 @@ def contiguous_params(self): if not param.is_contiguous(): param.data = param.data.contiguous() + # Convert tensors and similar objects into Python values that can be + # directly serialized into JSON. + def _to_jsonable(self, value): + if isinstance(value, torch.Tensor): + return value.detach().cpu().tolist() + return value + + # Normalize inputs into CPU tensors so the following range computation + # and serialization logic can handle them consistently. + def _to_tensor(self, value, dtype=torch.float32): + if isinstance(value, torch.Tensor): + return value.detach().cpu().to(dtype) + return torch.as_tensor(value, dtype=dtype) + + # LightLLM expects offline FP8 KV descales. Recover the real-value range + # from the qparams first, then convert it into per-layer K/V scales that + # align with torch.float8_e4m3fn. + def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax): + if isinstance(scales, torch.Tensor) and scales.numel() == 0: + return None + + scales_tensor = self._to_tensor(scales) + zeros_tensor = self._to_tensor(zeros, dtype=scales_tensor.dtype) + qmin_tensor = self._to_tensor(qmin, dtype=scales_tensor.dtype) + qmax_tensor = self._to_tensor(qmax, dtype=scales_tensor.dtype) + min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor + max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor + absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs()) + fp8_qmax = torch.tensor( + torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype + ) + return absmax_tensor / fp8_qmax + + # Export calibration results in the LightLLM kv_cache_calib.json format. 
+ # At the moment, only the per_tensor and per_head KV formats supported by + # LightLLM are handled here. + def collect_calib_json(self): + return collect_lightllm_kv_calib_json(self) + @torch.no_grad() def save_model(self, path): if int(os.environ['RANK']) != 0: diff --git a/llmc/compression/quantization/kvquant.py b/llmc/compression/quantization/kvquant.py index 32c2de5be..29622297c 100644 --- a/llmc/compression/quantization/kvquant.py +++ b/llmc/compression/quantization/kvquant.py @@ -1,3 +1,4 @@ +import copy import torch from loguru import logger from transformers import DynamicCache @@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache): def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1): super().__init__() - assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group'] + # Copy the config to avoid mutating the original quantization config in static KV calibration. + kvquant_cfg = copy.deepcopy(kvquant_cfg) + assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group', 'per_head'] self.num_hidden_layers, self.num_samples, self.bsz = ( num_hidden_layers, num_samples, bsz, ) + if kvquant_cfg.get('static', False) and kvquant_cfg.get( + 'calib_algo', 'minmax' + ) == 'minmax': + # Static KV calibration uses the batched tensor statistics path, so convert the default + # minmax setting to static_minmax here to avoid a later calibration algo name mismatch. 
+ kvquant_cfg['calib_algo'] = 'static_minmax' if quant_type == 'int-quant': self.kvquantizer = IntegerQuantizer(**kvquant_cfg) elif quant_type == 'float-quant': diff --git a/llmc/compression/quantization/quant.py b/llmc/compression/quantization/quant.py index 2c24c03a8..d06fd7479 100755 --- a/llmc/compression/quantization/quant.py +++ b/llmc/compression/quantization/quant.py @@ -224,27 +224,24 @@ def get_minmax_stats(self, act_tensors): for tensor in tensors: tensor = self.reshape_tensor(tensor) tensor_range = self.get_minmax_range(tensor) - min_val, max_val = tensor_range[0], tensor_range[1] + min_val = tensor_range[0].detach().cpu().to(torch.float32) + max_val = tensor_range[1].detach().cpu().to(torch.float32) if input_idx not in stats_min_max: stats_min_max[input_idx] = {} - stats_min_max[input_idx]['min'] = torch.tensor( - [min_val], dtype=torch.float32 - ) - stats_min_max[input_idx]['max'] = torch.tensor( - [max_val], dtype=torch.float32 - ) + stats_min_max[input_idx]['min'] = min_val.unsqueeze(0) + stats_min_max[input_idx]['max'] = max_val.unsqueeze(0) else: stats_min_max[input_idx]['min'] = torch.cat( [ stats_min_max[input_idx]['min'], - torch.tensor([min_val], dtype=torch.float32), + min_val.unsqueeze(0), ] ) stats_min_max[input_idx]['max'] = torch.cat( [ stats_min_max[input_idx]['max'], - torch.tensor([max_val], dtype=torch.float32), + max_val.unsqueeze(0), ] ) @@ -255,8 +252,8 @@ def get_static_minmax_range(self, act_tensors): stats_min_max = self.get_minmax_stats(act_tensors) min_vals, max_vals = [], [] for input_idx, tensor_range in stats_min_max.items(): - min_val = tensor_range['min'].mean() - max_val = tensor_range['max'].mean() + min_val = tensor_range['min'].mean(dim=0) + max_val = tensor_range['max'].mean(dim=0) min_vals.append(min_val) max_vals.append(max_val) diff --git a/llmc/utils/__init__.py b/llmc/utils/__init__.py index aa5fefa1a..683f1ad8f 100755 --- a/llmc/utils/__init__.py +++ b/llmc/utils/__init__.py @@ -1,4 +1,5 @@ from .export_autoawq 
import update_autoawq_quant_config +from .export_calib import collect_lightllm_kv_calib_json from .export_lightx2v import update_lightx2v_quant_config from .export_vllm import update_vllm_quant_config from .utils import (check_config, copy_files, deploy_all_modality, diff --git a/llmc/utils/export_calib.py b/llmc/utils/export_calib.py new file mode 100644 index 000000000..dcf9698bf --- /dev/null +++ b/llmc/utils/export_calib.py @@ -0,0 +1,69 @@ +import torch + + +def collect_lightllm_kv_calib_json(blockwise_opt): + if not getattr(blockwise_opt, 'quant_kvcache', False): + raise ValueError( + 'save_lightllm_kv_cache_calib requires kvcache quantization.' + ) + + kv_cfg = blockwise_opt.quant_config['kvcache'] + granularity = kv_cfg.get('granularity') + if granularity not in ['per_tensor', 'per_head']: + raise ValueError( + f'LightLLM calib export only supports per_tensor/per_head, got {granularity}' + ) + + num_layers = blockwise_opt.model.model_config.num_hidden_layers + num_head = int( + getattr( + blockwise_opt.model.model_config, + 'num_key_value_heads', + blockwise_opt.model.get_num_attention_heads(), + ) + ) + scales = [] + for layer_idx in range(num_layers): + key_scale = blockwise_opt._collect_lightllm_kv_scale( + blockwise_opt.kv_module.k_scales_buffer[layer_idx], + blockwise_opt.kv_module.k_zeros_buffer[layer_idx], + blockwise_opt.kv_module.k_qmin_buffer[layer_idx], + blockwise_opt.kv_module.k_qmax_buffer[layer_idx], + ) + value_scale = blockwise_opt._collect_lightllm_kv_scale( + blockwise_opt.kv_module.v_scales_buffer[layer_idx], + blockwise_opt.kv_module.v_zeros_buffer[layer_idx], + blockwise_opt.kv_module.v_qmin_buffer[layer_idx], + blockwise_opt.kv_module.v_qmax_buffer[layer_idx], + ) + if key_scale is None or value_scale is None: + raise ValueError(f'Calibration scale for layer {layer_idx} is empty.') + + scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist() + scales.append(scale_row) + + scale_width = len(scales[0]) if scales 
else 0 + if granularity == 'per_tensor' and scale_width != 2: + raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}') + if granularity == 'per_head' and scale_width != num_head * 2: + raise ValueError( + f'per_head export expects {num_head * 2} scales per layer, got {scale_width}' + ) + + architectures = getattr(blockwise_opt.model.model_config, 'architectures', None) + if isinstance(architectures, list) and len(architectures) > 0: + architectures = architectures[0] + elif architectures is None: + architectures = blockwise_opt.config.model.type + + return { + 'version': '1.0', + 'architectures': architectures, + 'quant_type': granularity, + 'qmin': float(torch.finfo(torch.float8_e4m3fn).min), + 'qmax': float(torch.finfo(torch.float8_e4m3fn).max), + 'num_layers': num_layers, + 'num_head': num_head, + 'scales_shape': [num_layers, scale_width], + 'scales': scales, + }