Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ save*
.log
*.pid
*.ipynb*
.venv/
*.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,7 @@ quant:
symmetric: True
granularity: per_tensor
save:
save_lightllm_kv_cache_calib: True
lightllm_kv_cache_calib_name: kv_cache_calib.json
save_fake: False
save_path: /path/to/save/
save_path: /path/to/save/
24 changes: 24 additions & 0 deletions llmc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,21 @@ def main(config):

eval_model(model, blockwise_opts, eval_list, eval_pos='transformed')
if int(os.environ['RANK']) == 0:
if 'save' in config and config.save.get('save_lightllm_kv_cache_calib', False):
calib_json_list = [
blockwise_opt.collect_calib_json()
for blockwise_opt in blockwise_opts
if hasattr(blockwise_opt, 'collect_calib_json')
]
calib_json_payload = (
calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list
)
with open(save_lightllm_kv_cache_calib_path, 'w') as file:
json.dump(calib_json_payload, file, ensure_ascii=False, indent=4)
logger.info(
f'save lightllm kv cache calib done -- {save_lightllm_kv_cache_calib_path}'
)

if 'save' in config and config.save.get('save_trans', False):
blockwise_opt.save_model(save_trans_path)

Expand Down Expand Up @@ -209,6 +224,14 @@ def main(config):
# Ensure only the main process creates directories
if int(os.environ['RANK']) == 0:
if 'save' in config:
if config.save.get('save_lightllm_kv_cache_calib', False):
mkdirs(config.save.save_path)
save_lightllm_kv_cache_calib_path = os.path.join(
config.save.save_path,
config.save.get(
'lightllm_kv_cache_calib_name', 'kv_cache_calib.json'
),
)
if config.save.get('save_trans', False):
save_trans_path = os.path.join(
config.save.save_path, 'transformed_model'
Expand Down Expand Up @@ -266,3 +289,4 @@ def main(config):
llmc_duration_time = llmc_end_time - llmc_start_time
logger.info(f'llmc_duration_time: {llmc_duration_time} s')
logger.info('--- llmc finished ---')

55 changes: 51 additions & 4 deletions llmc/compression/quantization/base_blockwise_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import torch.nn as nn
from loguru import logger

from llmc.utils.export_calib import collect_lightllm_kv_calib_json
from llmc.utils.registry_factory import KV_REGISTRY, TOKEN_REDUCTION_REGISTRY

from ..blockwise_optimization import BlockwiseOpt
Expand Down Expand Up @@ -175,13 +176,18 @@ def set_quant_config(self):
self.act_quant_module = IntegerQuantizer
elif quant_type == 'float-quant':
self.act_quant_module = FloatQuantizer
self.quant_config['act']['tp'] = self.tp
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
self.act_static = self.quant_config['act'].get('static', False)
if self.act_static:
assert (
self.quant_config['act']['granularity'] == 'per_tensor'
), 'Only support per_tensor static quant'
# Static activation quantization uses the batched calibration
# path, so normalize the default minmax setting to
# static_minmax to match the downstream calibration logic.
if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax':
self.quant_config['act']['calib_algo'] = 'static_minmax'
self.quant_config['act']['tp'] = self.tp
self.aquantizer = self.act_quant_module(**self.quant_config['act'])
self.quant_attn = self.quant_config['act'].get('quant_attn', False)
if self.quant_attn:
assert self.config['model']['type'] in ['Vit', 'DeepseekV2']
Expand All @@ -203,8 +209,10 @@ def set_quant_config(self):
kv_special_cfg = self.quant_config['kvcache'].get('special', {})
act_static_cfg = {}
if self.act_static:
act_static_cfg.update(self.config.calib.n_sample)
act_static_cfg.update(self.config.calib.bs)
# The KV cache constructor expects num_samples / bsz, so map
# the calibration config fields to the parameter names it uses.
act_static_cfg['num_samples'] = self.config.calib.n_samples
act_static_cfg['bsz'] = self.config.calib.bs
kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant')
self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']](
kv_quant_type, self.quant_config['kvcache'],
Expand Down Expand Up @@ -1003,6 +1011,45 @@ def contiguous_params(self):
if not param.is_contiguous():
param.data = param.data.contiguous()

# Convert tensors and similar objects into Python values that can be
# directly serialized into JSON.
def _to_jsonable(self, value):
if isinstance(value, torch.Tensor):
return value.detach().cpu().tolist()
return value

# Normalize inputs into CPU tensors so the following range computation
# and serialization logic can handle them consistently.
def _to_tensor(self, value, dtype=torch.float32):
if isinstance(value, torch.Tensor):
return value.detach().cpu().to(dtype)
return torch.as_tensor(value, dtype=dtype)

# LightLLM expects offline FP8 KV descales. Recover the real-value range
# from the qparams first, then convert it into per-layer K/V scales that
# align with torch.float8_e4m3fn.
def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax):
if isinstance(scales, torch.Tensor) and scales.numel() == 0:
return None

scales_tensor = self._to_tensor(scales)
zeros_tensor = self._to_tensor(zeros, dtype=scales_tensor.dtype)
qmin_tensor = self._to_tensor(qmin, dtype=scales_tensor.dtype)
qmax_tensor = self._to_tensor(qmax, dtype=scales_tensor.dtype)
min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor
max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor
absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs())
fp8_qmax = torch.tensor(
torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype
)
return absmax_tensor / fp8_qmax

# Export calibration results in the LightLLM kv_cache_calib.json layout.
# Only the per_tensor and per_head KV formats that LightLLM supports are
# handled; the actual export logic lives in llmc.utils.export_calib.
def collect_calib_json(self):
    """Return this optimizer's LightLLM KV-cache calibration payload."""
    return collect_lightllm_kv_calib_json(self)

@torch.no_grad()
def save_model(self, path):
if int(os.environ['RANK']) != 0:
Expand Down
9 changes: 9 additions & 0 deletions llmc/compression/quantization/kvquant.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import torch
from loguru import logger
from transformers import DynamicCache
Expand All @@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache):
def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1):
super().__init__()

# Copy the config so that static KV calibration does not mutate the
# original quantization config object.
kvquant_cfg = copy.deepcopy(kvquant_cfg)
assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group']
self.num_hidden_layers, self.num_samples, self.bsz = (
num_hidden_layers,
num_samples,
bsz,
)
if kvquant_cfg.get('static', False) and kvquant_cfg.get(
'calib_algo', 'minmax'
) == 'minmax':
# Static KV calibration goes through the batched tensor-statistics path,
# so normalize the default minmax to the matching static_minmax to avoid
# an algorithm-name mismatch during later calibration.
kvquant_cfg['calib_algo'] = 'static_minmax'
if quant_type == 'int-quant':
self.kvquantizer = IntegerQuantizer(**kvquant_cfg)
elif quant_type == 'float-quant':
Expand Down
1 change: 1 addition & 0 deletions llmc/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .export_autoawq import update_autoawq_quant_config
from .export_calib import collect_lightllm_kv_calib_json
from .export_lightx2v import update_lightx2v_quant_config
from .export_vllm import update_vllm_quant_config
from .utils import (check_config, copy_files, deploy_all_modality,
Expand Down
69 changes: 69 additions & 0 deletions llmc/utils/export_calib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import torch


def collect_lightllm_kv_calib_json(blockwise_opt):
if not getattr(blockwise_opt, 'quant_kvcache', False):
raise ValueError(
'save_lightllm_kv_cache_calib requires kvcache quantization.'
)

kv_cfg = blockwise_opt.quant_config['kvcache']
granularity = kv_cfg.get('granularity')
if granularity not in ['per_tensor', 'per_head']:
raise ValueError(
f'LightLLM calib export only supports per_tensor/per_head, got {granularity}'
)

num_layers = blockwise_opt.model.model_config.num_hidden_layers
num_head = int(
getattr(
blockwise_opt.model.model_config,
'num_key_value_heads',
blockwise_opt.model.get_num_attention_heads(),
)
)
scales = []
for layer_idx in range(num_layers):
key_scale = blockwise_opt._collect_lightllm_kv_scale(
blockwise_opt.kv_module.k_scales_buffer[layer_idx],
blockwise_opt.kv_module.k_zeros_buffer[layer_idx],
blockwise_opt.kv_module.k_qmin_buffer[layer_idx],
blockwise_opt.kv_module.k_qmax_buffer[layer_idx],
)
value_scale = blockwise_opt._collect_lightllm_kv_scale(
blockwise_opt.kv_module.v_scales_buffer[layer_idx],
blockwise_opt.kv_module.v_zeros_buffer[layer_idx],
blockwise_opt.kv_module.v_qmin_buffer[layer_idx],
blockwise_opt.kv_module.v_qmax_buffer[layer_idx],
)
if key_scale is None or value_scale is None:
raise ValueError(f'Calibration scale for layer {layer_idx} is empty.')

scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist()
scales.append(scale_row)

scale_width = len(scales[0]) if scales else 0
if granularity == 'per_tensor' and scale_width != 2:
raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}')
if granularity == 'per_head' and scale_width != num_head * 2:
raise ValueError(
f'per_head export expects {num_head * 2} scales per layer, got {scale_width}'
)

architectures = getattr(blockwise_opt.model.model_config, 'architectures', None)
if isinstance(architectures, list) and len(architectures) > 0:
architectures = architectures[0]
elif architectures is None:
architectures = blockwise_opt.config.model.type

return {
'version': '1.0',
'architectures': architectures,
'quant_type': granularity,
'qmin': float(torch.finfo(torch.float8_e4m3fn).min),
'qmax': float(torch.finfo(torch.float8_e4m3fn).max),
'num_layers': num_layers,
'num_head': num_head,
'scales_shape': [num_layers, scale_width],
'scales': scales,
}
Loading