From ecd97345dcf8d75fdd0190d0882df812dffc764e Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Wed, 8 Apr 2026 07:13:58 +0000 Subject: [PATCH 01/12] Base Config Add --- README.md | 2 +- jiuge.sh | 46 ++++++++++++++++++ python/infinilm/__init__.py | 2 + python/infinilm/base_config.py | 86 ++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 jiuge.sh create mode 100644 python/infinilm/base_config.py diff --git a/README.md b/README.md index 48448c56..afc242b2 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - 编译并安装 `InfiniLM` ```bash -xmake && xmake install + ``` - 运行模型推理测试 diff --git a/jiuge.sh b/jiuge.sh new file mode 100644 index 00000000..e7ddb2dd --- /dev/null +++ b/jiuge.sh @@ -0,0 +1,46 @@ +featurize@featurize:~/work/InfiniLM$ cat jiuge.sh +#!/bin/bash + +# Jiuge模型运行脚本 +# 使用NVIDIA显卡运行9G4B模型 + +set -e # 遇到错误立即退出 + +echo "==========================================" +echo "🚀 启动 Jiuge 模型 (9G4B) - NVIDIA版本" +echo "==========================================" +export INFINI_ROOT=/home/featurize/.infini +export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH +# 设置参数 +MODEL_DIR="/home/featurize/work/InfiniFamily/9G4B" +DEVICE="--nvidia" +N_DEVICE=1 +SCRIPT_PATH="python scripts/jiuge.py" + +# 检查模型目录是否存在 +if [ ! -d "$MODEL_DIR" ]; then + echo "❌ 错误: 模型目录不存在: $MODEL_DIR" + echo "请检查路径是否正确" + exit 1 +fi + +# 检查Python脚本是否存在 +if [ ! -f "scripts/jiuge.py" ]; then + echo "❌ 错误: 未找到jiuge.py脚本: scripts/jiuge.py" + echo "请确保在当前目录下运行此脚本" + exit 1 +fi + +echo "📁 模型路径: $MODEL_DIR" +echo "🎯 设备类型: NVIDIA GPU" +echo "💻 设备数量: $N_DEVICE" +echo "" + +# 运行模型 +echo "🔄 启动模型..." 
+$SCRIPT_PATH $DEVICE $MODEL_DIR $N_DEVICE + +echo "" +echo "==========================================" +echo "✅ 模型运行完成" +echo "=========================================="ß \ No newline at end of file diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py index e34514a7..f552a2cc 100644 --- a/python/infinilm/__init__.py +++ b/python/infinilm/__init__.py @@ -2,6 +2,7 @@ from . import distributed from . import cache from . import llm +from . import base_config from .llm import ( LLM, @@ -16,6 +17,7 @@ "distributed", "cache", "llm", + "base_config", # LLM classes "LLM", "AsyncLLMEngine", diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py new file mode 100644 index 00000000..d4104b5e --- /dev/null +++ b/python/infinilm/base_config.py @@ -0,0 +1,86 @@ +import argparse +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../scripts")) +from libinfinicore_infer import DeviceType + + +class BaseConfig: + """InfiniLM Unified Config - Command line argument parser""" + + def __init__(self): + + self.parser = argparse.ArgumentParser(description="InfiniLM Unified Config") + self._add_common_args() + self.args, self.extra = self.parser.parse_known_args() + + + self.model = self.args.model + self.device_name = self.args.device + self.device_type = self._get_device_type(self.args.device) + self.tp = self.args.tp + + + self.attn = self.args.attn + self.enable_graph = self.args.enable_graph + self.cache_type = self.args.cache_type + self.enable_paged_attn = self.args.enable_paged_attn + self.paged_kv_block_size = self.args.paged_kv_block_size + + + self.batch_size = self.args.batch_size + self.top_k = self.args.top_k + self.top_p = self.args.top_p + self.temperature = self.args.temperature + + self.warm_up = self.args.warm_up + self.verbose = self.args.verbose + self.log_evel = self.args.log_evel + + def _add_common_args(self): + # --- base configuration --- + self.parser.add_argument("--model", type=str, 
required=True) + self.parser.add_argument("--device", type=str, default="cpu") + self.parser.add_argument("--tp", "--tensor-parallel-size", type=int, default=1) + + + # --- Infer backend optimization --- + self.parser.add_argument("--attn", type=str, default="default", choices=["default", "flash-attn"]) + self.parser.add_argument("--enable-graph", action="store_true") + self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"]) + self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache",) + self.parser.add_argument("--paged-kv-block-size", type=int, default=256) + + + # --- Length and infer parameters --- + self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--top-k", type=int, default=1) + self.parser.add_argument("--top-p", type=float, default=1.0) + self.parser.add_argument("--temperature", type=float, default=1.0) + + # --- debug --- + self.parser.add_argument("--warmup", action="store_true") + self.parser.add_argument("--verbose", action="store_true") + self.parser.add_argument("--log-evel", type=str, default="INFO") + + + def _get_device_type(self, dev_str): + """Convert device string to DeviceType enum""" + DEVICE_TYPE_MAP = { + "cpu": DeviceType.DEVICE_TYPE_CPU, + "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + "qy": DeviceType.DEVICE_TYPE_QY, + "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + "ascend": DeviceType.DEVICE_TYPE_ASCEND, + "metax": DeviceType.DEVICE_TYPE_METAX, + "moore": DeviceType.DEVICE_TYPE_MOORE, + "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, + "hygon": DeviceType.DEVICE_TYPE_HYGON, + "ali": DeviceType.DEVICE_TYPE_ALI + } + return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) + + def __repr__(self): + """String representation of configuration""" + return f"BaseConfig(model='{self.model}', device='{self.device_name}', tp={self.tp})" From 
312f75e6d1d259e38aec535668ca8a0d555a6ce8 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Wed, 8 Apr 2026 08:16:26 +0000 Subject: [PATCH 02/12] args supplement --- python/infinilm/base_config.py | 46 ++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index d4104b5e..5e1a8bd2 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -26,9 +26,14 @@ def __init__(self): self.cache_type = self.args.cache_type self.enable_paged_attn = self.args.enable_paged_attn self.paged_kv_block_size = self.args.paged_kv_block_size + self.kv_cache_dtype = self.args.kv_cache_dtype + self.skip_load = self.args.skip_load + - self.batch_size = self.args.batch_size + self.input_len = self.args.input_len + self.output_len = self.args.output_len + self.max_new_tokens = self.args.max_new_tokens self.top_k = self.args.top_k self.top_p = self.args.top_p self.temperature = self.args.temperature @@ -37,6 +42,22 @@ def __init__(self): self.verbose = self.args.verbose self.log_evel = self.args.log_evel + + # Evaluation parameters + self.bench = self.args.bench + self.backend = self.args.backend + self.ndev = self.args.ndev + self.subject = self.args.subject + self.split = self.args.split + self.num_samples = self.args.num_samples + self.output_csv = self.args.output_csv + self.cache_dir = self.args.cache_dir + + + # Quantization parameters + self.awq = self.args.awq + self.gptq = self.args.gptq + def _add_common_args(self): # --- base configuration --- self.parser.add_argument("--model", type=str, required=True) @@ -45,15 +66,20 @@ def _add_common_args(self): # --- Infer backend optimization --- - self.parser.add_argument("--attn", type=str, default="default", choices=["default", "flash-attn"]) + self.parser.add_argument("--attn", type=str, default="default", choices=["default", "paged-attn", "flash-attn"]) self.parser.add_argument("--enable-graph", 
action="store_true") self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"]) self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache",) self.parser.add_argument("--paged-kv-block-size", type=int, default=256) + self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type") + self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") # --- Length and infer parameters --- self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--input-len", type=int, default=10, help="input sequence length") + self.parser.add_argument("--output-len", type=int, default=20, help="output sequence length") + self.parser.add_argument("--max-new-tokens", type=int, default=500, help="maximum number of new tokens to generate") self.parser.add_argument("--top-k", type=int, default=1) self.parser.add_argument("--top-p", type=float, default=1.0) self.parser.add_argument("--temperature", type=float, default=1.0) @@ -64,6 +90,22 @@ def _add_common_args(self): self.parser.add_argument("--log-evel", type=str, default="INFO") + # --- Evaluation parameters --- + self.parser.add_argument("--bench", type=str, default=None, choices=["ceval", "mmlu"], help="benchmark to evaluate") + self.parser.add_argument("--backend", type=str, default="cpp", choices=["python", "cpp", "torch", "vllm"], help="backend type") + self.parser.add_argument("--ndev", type=int, default=1, help="number of devices for tensor parallelism") + self.parser.add_argument("--subject", type=str, default="all", help="subject(s) to evaluate, comma-separated or 'all'") + self.parser.add_argument("--split", type=str, default="test", choices=["test", "val", "all"], help="dataset split to use") + self.parser.add_argument("--num-samples", type=int, default=None, help="number of samples to evaluate per subject") + 
self.parser.add_argument("--output-csv", type=str, default=None, help="path to output CSV file for results") + self.parser.add_argument("--cache-dir", type=str, default=None, help="directory for dataset cache") + + + # --- Quantization parameters --- + self.parser.add_argument("--awq", action="store_true", help="use AWQ quantization") + self.parser.add_argument("--gptq", action="store_true", help="use GPTQ quantization") + + def _get_device_type(self, dev_str): """Convert device string to DeviceType enum""" DEVICE_TYPE_MAP = { From 46997bf7c63373e8bb0f97faa2cfa9fdb86f5c15 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 05:28:21 +0000 Subject: [PATCH 03/12] args for two example script --- .gitignore | 7 ++ examples/jiuge.py | 208 ++++----------------------------- examples/jiuge_fix.py | 200 +++++++++++++++++++++++++++++++ examples/llama.py | 171 +++++++++++++-------------- python/infinilm/base_config.py | 108 ++++++++++++----- 5 files changed, 394 insertions(+), 300 deletions(-) create mode 100644 examples/jiuge_fix.py diff --git a/.gitignore b/.gitignore index b728e6ea..086071a5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,13 @@ build/ python/infinilm/lib/*.so +#model_weight +model_weight/ + +jiuge_infer.sh +jiuge.sh +README.md + # MacOS Cache .DS_Store diff --git a/examples/jiuge.py b/examples/jiuge.py index fa547435..9f15433c 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -12,153 +12,13 @@ import numpy as np from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig from packaging import version +from infinilm.base_config import BaseConfig sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) _PAGED_KV_BLOCK_SIZE = 256 -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - 
"--qy", - action="store_true", - help="Run qy test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--cambricon", - action="store_true", - help="Run cambricon test", - ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) - parser.add_argument( - "--model-path", - type=str, - required=True, - help="model_path", - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=100, - help="max_new_tokens", - ) - parser.add_argument( - "--backend", - type=str, - default="cpp", - help="python or cpp model", - ) - parser.add_argument( - "--batch-size", - type=int, - default=1, - help="number of prompts in a batch", - ) - parser.add_argument( - "--prompt", - type=str, - default="How are you", - help="input prompt", - ) - parser.add_argument( - "--tp", - type=int, - default=1, - help="total rank for tensor parallel", - ) - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="use paged cache", - ) - - parser.add_argument( - "--paged-kv-block-size", - type=int, - default=256, - help="num tokens each kv block can hold", - ) - - parser.add_argument( - "--enable-graph", - action="store_true", - help="enable graph compiling", - ) - - parser.add_argument( - "--top-k", - type=int, - default=1, - help="top k sampling", - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="top p sampling", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="sampling temperature", - ) - - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="attention backend to use: 
'default' or 'flash-attn'", - ) - - parser.add_argument( - "--kv-cache-dtype", - type=str, - default=None, - choices=["int8"], - ) - - return parser.parse_args() - def test( prompts: str | list[str], @@ -186,7 +46,7 @@ def test( distributed_config=DistConfig(tp), enable_graph_compiling=enable_graph, attention_backend=attn_backend, - kv_cache_dtype=args.kv_cache_dtype, + kv_cache_dtype=cfg.kv_cache_dtype, ) # ---------------------------------------------------------------------------- # # Load Weights @@ -300,44 +160,26 @@ def test( if __name__ == "__main__": - args = get_args() - print(args) + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) - # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.qy: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - elif args.cambricon: - device_str = "mlu" - elif args.ali: - device_str = "cuda" - elif args.hygon: - device_str = "cuda" - else: - print( - "Usage: python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" - "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" - ) - sys.exit(1) - prompts = [args.prompt for _ in range(args.batch_size)] - _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model - model_path = args.model_path - max_new_tokens = args.max_new_tokens - backend = args.backend - tp = args.tp - enable_paged_attn = args.enable_paged_attn - enable_graph = args.enable_graph + max_new_tokens = cfg.max_new_tokens + + backend = cfg.backend + + tp = cfg.tp + + enable_paged_attn = cfg.enable_paged_attn + + enable_graph = cfg.enable_graph + if backend != "cpp": raise 
ValueError(f"Unsupported backend: {backend}.") @@ -351,8 +193,8 @@ def test( tp=tp, enable_paged_attn=enable_paged_attn, enable_graph=enable_graph, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, - attn_backend=args.attn, - ) + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, + attn_backend=cfg.attn, + ) \ No newline at end of file diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py new file mode 100644 index 00000000..9f15433c --- /dev/null +++ b/examples/jiuge_fix.py @@ -0,0 +1,200 @@ +import infinicore +import transformers +from transformers import AutoTokenizer +from tokenizers import decoders as _dec +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.distributed import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +import argparse +import sys +import time +import os +import numpy as np +from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig +from packaging import version +from infinilm.base_config import BaseConfig + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + +_PAGED_KV_BLOCK_SIZE = 256 + + + +def test( + prompts: str | list[str], + model_path, + max_new_tokens=100, + infini_device=infinicore.device("cpu", 0), + tp=1, + enable_paged_attn=False, + enable_graph=False, + top_k=1, + top_p=1.0, + temperature=1.0, + attn_backend="default", +): + model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # + # Create Model + # ---------------------------------------------------------------------------- # + if enable_paged_attn and attn_backend == "default": + attn_backend = "paged-attn" + + model = InferEngine( + model_path, + device=infini_device, + distributed_config=DistConfig(tp), + enable_graph_compiling=enable_graph, + attention_backend=attn_backend, + kv_cache_dtype=cfg.kv_cache_dtype, + ) + # 
---------------------------------------------------------------------------- # + # Load Weights + # ---------------------------------------------------------------------------- # + load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) + + # ---------------------------------------------------------------------------- # + # create tokenizer + # ---------------------------------------------------------------------------- # + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + if "llama" == model.config.model_type: + backend = getattr(tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) + norm = getattr(target, "normalizer", None) + dec = getattr(target, "decoder", None) + sn = repr(norm)[:800] if norm is not None else "" + sd = repr(dec)[:800] if dec is not None else "" + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + # ---------------------------------------------------------------------------- # + # tokenize + # ---------------------------------------------------------------------------- # + # prompt = "山东最高的山是?" + if isinstance(prompts, str): + prompts = [prompts] + input_contents = [ + tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + + # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ + # "input_ids" + # ] # List: [[1, 1128, 526, 366, 29892]] + if version.parse(transformers.__version__) < version.parse("5.0.0"): + # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. 
+ input_ids_list = [ + tokenizer.encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + else: + input_ids_list = [ + tokenizer._encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + + # ---------------------------------------------------------------------------- # + # Create KVCache + # ---------------------------------------------------------------------------- # + if enable_paged_attn: + batch_size = 1 if prompts is str else len(prompts) + max_total_tokens = max_new_tokens + len(input_ids_list[0]) + cache_config = PagedKVCacheConfig( + num_blocks=( + (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE + ) + * batch_size, + block_size=_PAGED_KV_BLOCK_SIZE, + ) + else: + batch_size = 1 if prompts is str else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) + cache_config = StaticKVCacheConfig( + max_batch_size=batch_size, max_cache_len=initial_capacity + ) + + model.reset_cache(cache_config) + + # ---------------------------------------------------------------------------- # + # Generate + # ---------------------------------------------------------------------------- # + print(input_contents[0], end="", flush=True) + input_ids_infini = infinicore.from_list(input_ids_list) + + t1 = time.time() + print("=================== start generate ====================") + output_ids = model.generate( + input_ids_infini, + GenerationConfig( + max_new_tokens=max_new_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ), + _measure_and_log_time=True, + ) + t2 = time.time() + + numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) + print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) + + print( + f"total_time: {round((t2 - t1) * 1000, 2)} ms", + ) + + +if __name__ == "__main__": + cfg = BaseConfig() + + device_str = 
cfg.get_device_str(cfg.device) + + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model + + max_new_tokens = cfg.max_new_tokens + + backend = cfg.backend + + tp = cfg.tp + + enable_paged_attn = cfg.enable_paged_attn + + enable_graph = cfg.enable_graph + + if backend != "cpp": + raise ValueError(f"Unsupported backend: {backend}.") + + infini_device = infinicore.device(device_str, 0) + + test( + prompts, + model_path, + max_new_tokens, + infini_device=infini_device, + tp=tp, + enable_paged_attn=enable_paged_attn, + enable_graph=enable_graph, + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, + attn_backend=cfg.attn, + ) \ No newline at end of file diff --git a/examples/llama.py b/examples/llama.py index aa890ca9..413afa13 100644 --- a/examples/llama.py +++ b/examples/llama.py @@ -7,70 +7,70 @@ import sys import time import os - +from infinilm.base_config import BaseConfig sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--model_path", - type=str, - required=True, - help="model_path", - ) - parser.add_argument( - "--max_new_tokens", - type=int, - default=100, - help="max_new_tokens", - ) - parser.add_argument( - "--backend", - type=str, - default="python", - help="python or cpp model", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="number of prompts in a batch", - ) - 
parser.add_argument( - "--prompt", - type=str, - default="How are you", - help="input prompt", - ) - - return parser.parse_args() +# def get_args(): +# parser = argparse.ArgumentParser(description="run Llama args") + +# parser.add_argument( +# "--cpu", +# action="store_true", +# help="Run cpu test", +# ) +# parser.add_argument( +# "--nvidia", +# action="store_true", +# help="Run nvidia test", +# ) +# parser.add_argument( +# "--metax", +# action="store_true", +# help="Run metax test", +# ) +# parser.add_argument( +# "--moore", +# action="store_true", +# help="Run moore test", +# ) +# parser.add_argument( +# "--iluvatar", +# action="store_true", +# help="Run iluvatar test", +# ) +# parser.add_argument( +# "--model_path", +# type=str, +# required=True, +# help="model_path", +# ) +# parser.add_argument( +# "--max_new_tokens", +# type=int, +# default=100, +# help="max_new_tokens", +# ) +# parser.add_argument( +# "--backend", +# type=str, +# default="python", +# help="python or cpp model", +# ) +# parser.add_argument( +# "--batch_size", +# type=int, +# default=1, +# help="number of prompts in a batch", +# ) +# parser.add_argument( +# "--prompt", +# type=str, +# default="How are you", +# help="input prompt", +# ) + +# return parser.parse_args() def test( @@ -163,32 +163,33 @@ def test( if __name__ == "__main__": - args = get_args() - print(args) + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - else: - print( - "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=\n" - "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" - ) - sys.exit(1) - prompts = [args.prompt for _ in range(args.batch_size)] - - model_path = 
args.model_path - max_new_tokens = args.max_new_tokens - backend = args.backend + # device_str = "cpu" + # if args.cpu: + # device_str = "cpu" + # elif args.nvidia: + # device_str = "cuda" + # elif args.metax: + # device_str = "cuda" + # elif args.moore: + # device_str = "musa" + # elif args.iluvatar: + # device_str = "cuda" + # else: + # print( + # "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=\n" + # "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" + # ) + # sys.exit(1) + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + model_path = cfg.model + max_new_tokens = cfg.max_new_tokens + backend = cfg.backend if backend != "python": raise ValueError(f"Unsupported backend: {backend}.") diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index 5e1a8bd2..d11d9a8c 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -1,8 +1,8 @@ import argparse import sys import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../scripts")) -from libinfinicore_infer import DeviceType +# sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../scripts")) +# from libinfinicore_infemport DeviceType class BaseConfig: @@ -16,8 +16,7 @@ def __init__(self): self.model = self.args.model - self.device_name = self.args.device - self.device_type = self._get_device_type(self.args.device) + self.device = self.args.device self.tp = self.args.tp @@ -26,27 +25,34 @@ def __init__(self): self.cache_type = self.args.cache_type self.enable_paged_attn = self.args.enable_paged_attn self.paged_kv_block_size = self.args.paged_kv_block_size + self.num_blocks = self.args.num_blocks + self.block_size = self.args.block_size + self.max_cache_len = self.args.max_cache_len self.kv_cache_dtype = self.args.kv_cache_dtype self.skip_load = self.args.skip_load self.batch_size = self.args.batch_size + self.max_batch = self.args.max_batch + 
self.max_batch_size = self.args.max_batch_size self.input_len = self.args.input_len self.output_len = self.args.output_len self.max_new_tokens = self.args.max_new_tokens + self.max_tokens = self.args.max_tokens + self.prompt = self.args.prompt self.top_k = self.args.top_k self.top_p = self.args.top_p self.temperature = self.args.temperature - self.warm_up = self.args.warm_up + self.warmup = self.args.warmup self.verbose = self.args.verbose - self.log_evel = self.args.log_evel + self.log_level = self.args.log_level # Evaluation parameters self.bench = self.args.bench self.backend = self.args.backend - self.ndev = self.args.ndev + self.tp = self.args.tp self.subject = self.args.subject self.split = self.args.split self.num_samples = self.args.num_samples @@ -57,6 +63,13 @@ def __init__(self): # Quantization parameters self.awq = self.args.awq self.gptq = self.args.gptq + self.dtype = self.args.dtype + + + # Server parameters + self.host = self.args.host + self.port = self.args.port + self.endpoint = self.args.endpoint def _add_common_args(self): # --- base configuration --- @@ -71,29 +84,36 @@ def _add_common_args(self): self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"]) self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache",) self.parser.add_argument("--paged-kv-block-size", type=int, default=256) + self.parser.add_argument("--num-blocks", type=int, default=512, help="number of KV cache blocks") + self.parser.add_argument("--block-size", type=int, default=256, help="size of each KV cache block") + self.parser.add_argument("--max-cache-len", type=int, default=4096, help="maximum cache length") self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type") - self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") + self.parser.add_argument("--skip-load", action="store_false", help="skip loading 
model weights") # --- Length and infer parameters --- self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--max-batch", type=int, default=3, help="maximum batch size") + self.parser.add_argument("--max-batch-size", type=int, default=8, help="maximum batch size for server") self.parser.add_argument("--input-len", type=int, default=10, help="input sequence length") self.parser.add_argument("--output-len", type=int, default=20, help="output sequence length") self.parser.add_argument("--max-new-tokens", type=int, default=500, help="maximum number of new tokens to generate") + self.parser.add_argument("--max-tokens", type=int, default=512, help="maximum tokens") + self.parser.add_argument("--prompt", type=str, default="How are you", help="default prompt text") self.parser.add_argument("--top-k", type=int, default=1) self.parser.add_argument("--top-p", type=float, default=1.0) self.parser.add_argument("--temperature", type=float, default=1.0) # --- debug --- - self.parser.add_argument("--warmup", action="store_true") - self.parser.add_argument("--verbose", action="store_true") - self.parser.add_argument("--log-evel", type=str, default="INFO") + self.parser.add_argument("--warmup", action="store_false") + self.parser.add_argument("--verbose", action="store_false") + self.parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="logging level") # --- Evaluation parameters --- self.parser.add_argument("--bench", type=str, default=None, choices=["ceval", "mmlu"], help="benchmark to evaluate") self.parser.add_argument("--backend", type=str, default="cpp", choices=["python", "cpp", "torch", "vllm"], help="backend type") - self.parser.add_argument("--ndev", type=int, default=1, help="number of devices for tensor parallelism") + self.parser.add_argument("--subject", type=str, default="all", help="subject(s) to evaluate, comma-separated or 'all'") 
self.parser.add_argument("--split", type=str, default="test", choices=["test", "val", "all"], help="dataset split to use") self.parser.add_argument("--num-samples", type=int, default=None, help="number of samples to evaluate per subject") @@ -102,27 +122,51 @@ def _add_common_args(self): # --- Quantization parameters --- - self.parser.add_argument("--awq", action="store_true", help="use AWQ quantization") - self.parser.add_argument("--gptq", action="store_true", help="use GPTQ quantization") - - - def _get_device_type(self, dev_str): - """Convert device string to DeviceType enum""" - DEVICE_TYPE_MAP = { - "cpu": DeviceType.DEVICE_TYPE_CPU, - "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, - "qy": DeviceType.DEVICE_TYPE_QY, - "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, - "ascend": DeviceType.DEVICE_TYPE_ASCEND, - "metax": DeviceType.DEVICE_TYPE_METAX, - "moore": DeviceType.DEVICE_TYPE_MOORE, - "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, - "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, - "hygon": DeviceType.DEVICE_TYPE_HYGON, - "ali": DeviceType.DEVICE_TYPE_ALI + self.parser.add_argument("--awq", action="store_false", help="use AWQ quantization") + self.parser.add_argument("--gptq", action="store_false", help="use GPTQ quantization") + self.parser.add_argument("--dtype", type=str, default="float16", help="data type for model") + + + # --- Server parameters --- + self.parser.add_argument("--host", type=str, default="0.0.0.0", help="server host") + self.parser.add_argument("--port", type=int, default=8000, help="server port") + self.parser.add_argument("--endpoint", type=str, default="/completions", help="API endpoint") + + + def get_device_str(self, device): + """Convert device name to backend string (cuda/cpu/musa/mlu)""" + DEVICE_STR_MAP = { + "cpu": "cpu", + "nvidia": "cuda", + "qy": "cuda", + "cambricon": "mlu", + "ascend": "ascend", # 假设华为昇腾后端为"ascend" + "metax": "cuda", + "moore": "musa", + "iluvatar": "cuda", + "kunlun": "kunlun", # 假设昆仑芯后端为"kunlun" + "hygon": "cuda", + 
"ali": "cuda" } - return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) + return DEVICE_STR_MAP.get(device.lower(), "cpu") + + # def _get_device_type(self, dev_str): + # """Convert device string to DeviceType enum""" + # DEVICE_TYPE_MAP = { + # "cpu": DeviceType.DEVICE_TYPE_CPU, + # "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + # "qy": DeviceType.DEVICE_TYPE_QY, + # "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + # "ascend": DeviceType.DEVICE_TYPE_ASCEND, + # "metax": DeviceType.DEVICE_TYPE_METAX, + # "moore": DeviceType.DEVICE_TYPE_MOORE, + # "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + # "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, + # "hygon": DeviceType.DEVICE_TYPE_HYGON, + # "ali": DeviceType.DEVICE_TYPE_ALI + # } + # return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) def __repr__(self): """String representation of configuration""" - return f"BaseConfig(model='{self.model}', device='{self.device_name}', tp={self.tp})" + return f"BaseConfig(model='{self.model}', device='{self.device}', tp={self.tp})" From 41810e421f2863ac57369c25cab7e786ae78c705 Mon Sep 17 00:00:00 2001 From: MoringLotus <151500590+MoringLotus@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:08:24 +0800 Subject: [PATCH 04/12] Delete examples/jiuge_fix.py --- examples/jiuge_fix.py | 200 ------------------------------------------ 1 file changed, 200 deletions(-) delete mode 100644 examples/jiuge_fix.py diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py deleted file mode 100644 index 9f15433c..00000000 --- a/examples/jiuge_fix.py +++ /dev/null @@ -1,200 +0,0 @@ -import infinicore -import transformers -from transformers import AutoTokenizer -from tokenizers import decoders as _dec -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine -import argparse -import sys -import time -import os -import numpy as np -from infinilm.cache import 
StaticKVCacheConfig, PagedKVCacheConfig -from packaging import version -from infinilm.base_config import BaseConfig - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - -_PAGED_KV_BLOCK_SIZE = 256 - - - -def test( - prompts: str | list[str], - model_path, - max_new_tokens=100, - infini_device=infinicore.device("cpu", 0), - tp=1, - enable_paged_attn=False, - enable_graph=False, - top_k=1, - top_p=1.0, - temperature=1.0, - attn_backend="default", -): - model_path = os.path.expanduser(model_path) - # ---------------------------------------------------------------------------- # - # Create Model - # ---------------------------------------------------------------------------- # - if enable_paged_attn and attn_backend == "default": - attn_backend = "paged-attn" - - model = InferEngine( - model_path, - device=infini_device, - distributed_config=DistConfig(tp), - enable_graph_compiling=enable_graph, - attention_backend=attn_backend, - kv_cache_dtype=cfg.kv_cache_dtype, - ) - # ---------------------------------------------------------------------------- # - # Load Weights - # ---------------------------------------------------------------------------- # - load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) - - # ---------------------------------------------------------------------------- # - # create tokenizer - # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if "llama" == model.config.model_type: - backend = getattr(tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - 
[ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) - - # ---------------------------------------------------------------------------- # - # tokenize - # ---------------------------------------------------------------------------- # - # prompt = "山东最高的山是?" - if isinstance(prompts, str): - prompts = [prompts] - input_contents = [ - tokenizer.apply_chat_template( - conversation=[{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - for prompt in prompts - ] - - # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - # "input_ids" - # ] # List: [[1, 1128, 526, 366, 29892]] - if version.parse(transformers.__version__) < version.parse("5.0.0"): - # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. - input_ids_list = [ - tokenizer.encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - else: - input_ids_list = [ - tokenizer._encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - - # ---------------------------------------------------------------------------- # - # Create KVCache - # ---------------------------------------------------------------------------- # - if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) - max_total_tokens = max_new_tokens + len(input_ids_list[0]) - cache_config = PagedKVCacheConfig( - num_blocks=( - (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE - ) - * batch_size, - block_size=_PAGED_KV_BLOCK_SIZE, - ) - else: - batch_size = 1 if prompts is str else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) - cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=initial_capacity - ) - - 
model.reset_cache(cache_config) - - # ---------------------------------------------------------------------------- # - # Generate - # ---------------------------------------------------------------------------- # - print(input_contents[0], end="", flush=True) - input_ids_infini = infinicore.from_list(input_ids_list) - - t1 = time.time() - print("=================== start generate ====================") - output_ids = model.generate( - input_ids_infini, - GenerationConfig( - max_new_tokens=max_new_tokens, - temperature=temperature, - top_k=top_k, - top_p=top_p, - ), - _measure_and_log_time=True, - ) - t2 = time.time() - - numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) - print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) - - print( - f"total_time: {round((t2 - t1) * 1000, 2)} ms", - ) - - -if __name__ == "__main__": - cfg = BaseConfig() - - device_str = cfg.get_device_str(cfg.device) - - prompts = [cfg.prompt for _ in range(cfg.batch_size)] - - _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size - - model_path = cfg.model - - max_new_tokens = cfg.max_new_tokens - - backend = cfg.backend - - tp = cfg.tp - - enable_paged_attn = cfg.enable_paged_attn - - enable_graph = cfg.enable_graph - - if backend != "cpp": - raise ValueError(f"Unsupported backend: {backend}.") - - infini_device = infinicore.device(device_str, 0) - - test( - prompts, - model_path, - max_new_tokens, - infini_device=infini_device, - tp=tp, - enable_paged_attn=enable_paged_attn, - enable_graph=enable_graph, - top_k=cfg.top_k, - top_p=cfg.top_p, - temperature=cfg.temperature, - attn_backend=cfg.attn, - ) \ No newline at end of file From f264e25a54b061aca75703f4574fa65390c10c35 Mon Sep 17 00:00:00 2001 From: MoringLotus <151500590+MoringLotus@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:17:35 +0800 Subject: [PATCH 05/12] Delete jiuge.sh --- jiuge.sh | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 
jiuge.sh diff --git a/jiuge.sh b/jiuge.sh deleted file mode 100644 index e7ddb2dd..00000000 --- a/jiuge.sh +++ /dev/null @@ -1,46 +0,0 @@ -featurize@featurize:~/work/InfiniLM$ cat jiuge.sh -#!/bin/bash - -# Jiuge模型运行脚本 -# 使用NVIDIA显卡运行9G4B模型 - -set -e # 遇到错误立即退出 - -echo "==========================================" -echo "🚀 启动 Jiuge 模型 (9G4B) - NVIDIA版本" -echo "==========================================" -export INFINI_ROOT=/home/featurize/.infini -export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH -# 设置参数 -MODEL_DIR="/home/featurize/work/InfiniFamily/9G4B" -DEVICE="--nvidia" -N_DEVICE=1 -SCRIPT_PATH="python scripts/jiuge.py" - -# 检查模型目录是否存在 -if [ ! -d "$MODEL_DIR" ]; then - echo "❌ 错误: 模型目录不存在: $MODEL_DIR" - echo "请检查路径是否正确" - exit 1 -fi - -# 检查Python脚本是否存在 -if [ ! -f "scripts/jiuge.py" ]; then - echo "❌ 错误: 未找到jiuge.py脚本: scripts/jiuge.py" - echo "请确保在当前目录下运行此脚本" - exit 1 -fi - -echo "📁 模型路径: $MODEL_DIR" -echo "🎯 设备类型: NVIDIA GPU" -echo "💻 设备数量: $N_DEVICE" -echo "" - -# 运行模型 -echo "🔄 启动模型..." 
-$SCRIPT_PATH $DEVICE $MODEL_DIR $N_DEVICE - -echo "" -echo "==========================================" -echo "✅ 模型运行完成" -echo "=========================================="ß \ No newline at end of file From 02e3350052174ce3c484300ff2c6cdf5132cc54a Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 07:07:45 +0000 Subject: [PATCH 06/12] examples script fix --- README.md | 182 ---------------------------- examples/bench.py | 213 ++++----------------------------- python/infinilm/base_config.py | 2 +- 3 files changed, 24 insertions(+), 373 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index afc242b2..00000000 --- a/README.md +++ /dev/null @@ -1,182 +0,0 @@ -# InfiniLM - -![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) - -本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 - -## 使用方式 - -- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 - -- 编译并安装 `InfiniLM` - -```bash - -``` - -- 运行模型推理测试 - -```bash -python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device] -``` - -- 部署模型推理服务 - -```bash -python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -- 测试模型推理服务性能 - -```bash -python scripts/test_perf.py -``` - -- 使用推理服务测试模型困惑度(Perplexity) - -```bash -python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -## 使用方式(新版) -#### 一、编译并安装 `InfiniCore` -编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) : - -- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`) -- 根据硬件平台,选择 xmake 构建配置 -- 编译安装InfiniCore -- 安装 C++ 库 -- 安装 Python 包 - - -#### 二、编译并安装 `InfiniLM` - - 克隆项目 - - 由于仓库中含有子模块,所以在克隆时请添加 
`--recursive` 或 `--recurse-submodules`,如: - - ```shell - git clone --recursive https://github.com/InfiniTensor/InfiniLM.git - ``` - - 或者在普通克隆后进行更新: - - ```shell - git submodule update --init --recursive - ``` - - - - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用 - ```bash - xmake f --use-kv-caching= [true | false] -cv - ``` - - - - 安装 InfiniLM Python 包 - ```bash - pip install -e . - ``` - - - 单次推理测试 - - llama示例 - ```bash - python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path= - ``` - - 例如: - ```bash - python examples/jigue.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0 - ``` - - 分布式推理测试 - - 9g示例 - ```bash - python examples/jiuge.py [---nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH - ``` - - - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式 - ```bash - python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16 - ``` - - - - 推理服务测试 - - 启动推理服务 - ```bash - python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT - ``` - - - 单卡示例: - ```bash - CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 多卡分布式示例: - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 测试推理服务性能: - ```bash - python scripts/test_perf.py --verbose - ``` - - - 运行推理基准测试(C-Eval/MMLU) - - ```bash - python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon 
| --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH] - ``` - - - 参数说明: - - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目) - - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果 - - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 `~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 - - - C-Eval示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv - ``` - - 全部科目并输出CSV: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - MMLU示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 
`cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - 试验中功能 - - Warm Up - ```bash - python examples/bench.py --nvidia --model= --warmup - ``` - - Paged Attention - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn - ``` - - CUDA Graph - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph - ``` - - 选择attention后端 (使用flash attention后端需要先在InfiniCore完成相关配置和编译) - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] - ``` diff --git a/examples/bench.py b/examples/bench.py index a52c44ec..6fc27498 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -3,6 +3,7 @@ from infinilm.modeling_utils import load_model_state_dict_by_file from infinilm.distributed import DistConfig from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.base_config import BaseConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig import argparse import sys @@ -125,150 +126,6 @@ def get_test_cases( return case_dict - -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - "--qy", - action="store_true", - help="Run qy test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--cambricon", - action="store_true", - help="Run cambricon test", - ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) - parser.add_argument( - "--model", - type=str, - required=True, - 
help="model path", - ) - parser.add_argument( - "--batch-size", - type=parse_list, - default=1, - help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')", - ) - parser.add_argument( - "--tensor-parallel-size", - "--tp", - type=int, - default=1, - help="total rank for tensor parallel", - ) - parser.add_argument( - "--input-len", - type=parse_list, - default=10, - help="output tokens", - ) - - parser.add_argument( - "--output-len", - type=parse_list, - default=20, - help="output tokens", - ) - parser.add_argument( - "--skip-load", - action="store_true", - help="skip loading model weights", - ) - parser.add_argument( - "--top-k", - type=int, - default=1, - help="top k sampling", - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="top p sampling", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="sampling temperature", - ) - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="use paged cache", - ) - parser.add_argument( - "--paged-kv-block-size", - type=int, - default=256, - help="num tokens each kv block can hold", - ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="enable graph compiling", - ) - parser.add_argument( - "--warmup", - action="store_true", - help="Perform a warmup run before benchmarking/inference.", - ) - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="attention backend to use: 'default' or 'flash-attn'", - ) - parser.add_argument( - "--kv-cache-dtype", - type=str, - default=None, - choices=["int8"], - ) - - return parser.parse_args() - - with open("examples/bench_prompt.md", "r") as f: prompt = f.read() @@ -305,7 +162,7 @@ def __init__( cache_config=cache_config, enable_graph_compiling=enable_graph, attention_backend=attn_backend, - kv_cache_dtype=args.kv_cache_dtype, + kv_cache_dtype=cfg.kv_cache_dtype, ) # 
---------------------------------------------------------------------------- # @@ -396,52 +253,28 @@ def run( if __name__ == "__main__": - args = get_args() - print(args) - - # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.qy: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - elif args.cambricon: - device_str = "mlu" - elif args.ali: - device_str = "cuda" - elif args.hygon: - device_str = "cuda" - else: - print( - "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50" - ) - sys.exit(1) - _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size # -------------------------------------------------------- # # 解析参数 # -------------------------------------------------------- # - model_path = args.model + model_path = cfg.model infini_device = infinicore.device(device_str, 0) - tp = args.tensor_parallel_size + tp = cfg.tp - skip_load = args.skip_load + skip_load = cfg.skip_load - batch_size = args.batch_size - input_len = args.input_len - output_len = args.output_len - enable_paged_attn = args.enable_paged_attn - enable_graph = args.enable_graph - attn_backend = args.attn + batch_size = cfg.batch_size + input_len = cfg.input_len + output_len = cfg.output_len + enable_paged_attn = cfg.enable_paged_attn + enable_graph = cfg.enable_graph + attn_backend = cfg.attn if isinstance(batch_size, int): batch_size = [batch_size] @@ -488,7 +321,7 @@ def run( # ---------------------------------------------------------------------------- # # Warmup # ---------------------------------------------------------------------------- # - if args.warmup: + if cfg.warmup: warmup_steps = 1 # warmup cache capacity @@ -518,9 +351,9 
@@ def run( input_ids_infini, GenerationConfig( max_new_tokens=5, # decode kernel warmup - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, + temperature=cfg.temperature, + top_k=cfg.top_k, + top_p=cfg.top_p, stop_on_eos=False, ), _measure_and_log_time=False, @@ -557,7 +390,7 @@ def run( batch_size=batch_size, input_len=input_len, output_len=output_len, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, ) diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index d11d9a8c..0b19700a 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -88,7 +88,7 @@ def _add_common_args(self): self.parser.add_argument("--block-size", type=int, default=256, help="size of each KV cache block") self.parser.add_argument("--max-cache-len", type=int, default=4096, help="maximum cache length") self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type") - self.parser.add_argument("--skip-load", action="store_false", help="skip loading model weights") + self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") # --- Length and infer parameters --- From 0920ce835100a77a6b0ffed4a43a23fc00329bc6 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 07:27:56 +0000 Subject: [PATCH 07/12] solve merge problem --- examples/jiuge_fix.py | 200 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 examples/jiuge_fix.py diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py new file mode 100644 index 00000000..9f15433c --- /dev/null +++ b/examples/jiuge_fix.py @@ -0,0 +1,200 @@ +import infinicore +import transformers +from transformers import AutoTokenizer +from tokenizers import decoders as _dec +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.distributed 
import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +import argparse +import sys +import time +import os +import numpy as np +from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig +from packaging import version +from infinilm.base_config import BaseConfig + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + +_PAGED_KV_BLOCK_SIZE = 256 + + + +def test( + prompts: str | list[str], + model_path, + max_new_tokens=100, + infini_device=infinicore.device("cpu", 0), + tp=1, + enable_paged_attn=False, + enable_graph=False, + top_k=1, + top_p=1.0, + temperature=1.0, + attn_backend="default", +): + model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # + # Create Model + # ---------------------------------------------------------------------------- # + if enable_paged_attn and attn_backend == "default": + attn_backend = "paged-attn" + + model = InferEngine( + model_path, + device=infini_device, + distributed_config=DistConfig(tp), + enable_graph_compiling=enable_graph, + attention_backend=attn_backend, + kv_cache_dtype=cfg.kv_cache_dtype, + ) + # ---------------------------------------------------------------------------- # + # Load Weights + # ---------------------------------------------------------------------------- # + load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) + + # ---------------------------------------------------------------------------- # + # create tokenizer + # ---------------------------------------------------------------------------- # + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + if "llama" == model.config.model_type: + backend = getattr(tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) + norm = getattr(target, "normalizer", None) + dec = getattr(target, "decoder", None) + sn = repr(norm)[:800] if norm is not None 
else "" + sd = repr(dec)[:800] if dec is not None else "" + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + # ---------------------------------------------------------------------------- # + # tokenize + # ---------------------------------------------------------------------------- # + # prompt = "山东最高的山是?" + if isinstance(prompts, str): + prompts = [prompts] + input_contents = [ + tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + + # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ + # "input_ids" + # ] # List: [[1, 1128, 526, 366, 29892]] + if version.parse(transformers.__version__) < version.parse("5.0.0"): + # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. 
+ input_ids_list = [ + tokenizer.encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + else: + input_ids_list = [ + tokenizer._encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + + # ---------------------------------------------------------------------------- # + # Create KVCache + # ---------------------------------------------------------------------------- # + if enable_paged_attn: + batch_size = 1 if prompts is str else len(prompts) + max_total_tokens = max_new_tokens + len(input_ids_list[0]) + cache_config = PagedKVCacheConfig( + num_blocks=( + (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE + ) + * batch_size, + block_size=_PAGED_KV_BLOCK_SIZE, + ) + else: + batch_size = 1 if prompts is str else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) + cache_config = StaticKVCacheConfig( + max_batch_size=batch_size, max_cache_len=initial_capacity + ) + + model.reset_cache(cache_config) + + # ---------------------------------------------------------------------------- # + # Generate + # ---------------------------------------------------------------------------- # + print(input_contents[0], end="", flush=True) + input_ids_infini = infinicore.from_list(input_ids_list) + + t1 = time.time() + print("=================== start generate ====================") + output_ids = model.generate( + input_ids_infini, + GenerationConfig( + max_new_tokens=max_new_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ), + _measure_and_log_time=True, + ) + t2 = time.time() + + numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) + print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) + + print( + f"total_time: {round((t2 - t1) * 1000, 2)} ms", + ) + + +if __name__ == "__main__": + cfg = BaseConfig() + + device_str = 
cfg.get_device_str(cfg.device) + + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model + + max_new_tokens = cfg.max_new_tokens + + backend = cfg.backend + + tp = cfg.tp + + enable_paged_attn = cfg.enable_paged_attn + + enable_graph = cfg.enable_graph + + if backend != "cpp": + raise ValueError(f"Unsupported backend: {backend}.") + + infini_device = infinicore.device(device_str, 0) + + test( + prompts, + model_path, + max_new_tokens, + infini_device=infini_device, + tp=tp, + enable_paged_attn=enable_paged_attn, + enable_graph=enable_graph, + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, + attn_backend=cfg.attn, + ) \ No newline at end of file From 18615afef4a91e2d367fc1924058e66512d08fcf Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 07:57:22 +0000 Subject: [PATCH 08/12] fix some divergence --- .gitignore | 1 - README.md | 182 ++++++++++++++++++++++++++++++++++++++ examples/jiuge_fix.py | 200 ------------------------------------------ 3 files changed, 182 insertions(+), 201 deletions(-) create mode 100644 README.md delete mode 100644 examples/jiuge_fix.py diff --git a/.gitignore b/.gitignore index 086071a5..7c079758 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,6 @@ model_weight/ jiuge_infer.sh jiuge.sh -README.md # MacOS Cache .DS_Store diff --git a/README.md b/README.md new file mode 100644 index 00000000..3a455a34 --- /dev/null +++ b/README.md @@ -0,0 +1,182 @@ +# InfiniLM + +![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) + +本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 + +## 使用方式 + +- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 + +- 编译并安装 `InfiniLM` + +```bash +xmake && xmake install +``` + +- 运行模型推理测试 + +```bash +python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir 
[n_device] +``` + +- 部署模型推理服务 + +```bash +python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] +``` + +- 测试模型推理服务性能 + +```bash +python scripts/test_perf.py +``` + +- 使用推理服务测试模型困惑度(Perplexity) + +```bash +python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] +``` + +## 使用方式(新版) +#### 一、编译并安装 `InfiniCore` +编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) : + +- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`) +- 根据硬件平台,选择 xmake 构建配置 +- 编译安装InfiniCore +- 安装 C++ 库 +- 安装 Python 包 + + +#### 二、编译并安装 `InfiniLM` + - 克隆项目 + + 由于仓库中含有子模块,所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`,如: + + ```shell + git clone --recursive https://github.com/InfiniTensor/InfiniLM.git + ``` + + 或者在普通克隆后进行更新: + + ```shell + git submodule update --init --recursive + ``` + + + - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用 + ```bash + xmake f --use-kv-caching= [true | false] -cv + ``` + + + - 安装 InfiniLM Python 包 + ```bash + pip install -e . 
+    ```
+
+  - 单次推理测试
+    - llama示例
+    ```bash
+    python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=
+    ```
+    - 例如:
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
+    ```
+  - 分布式推理测试
+    - 9g示例
+    ```bash
+    python examples/jiuge.py [--nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
+    ```
+
+    - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16
+    ```
+
+
+  - 推理服务测试
+    - 启动推理服务
+    ```bash
+    python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT
+    ```
+
+    - 单卡示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1
+    ```
+
+    - 多卡分布式示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1
+    ```
+
+    - 测试推理服务性能:
+    ```bash
+    python scripts/test_perf.py --verbose
+    ```
+
+    - 运行推理基准测试(C-Eval/MMLU)
+
+    ```bash
+    python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
+    ```
+
+    - 参数说明:
+      - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目)
+      - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果
+      - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 
`~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 + + - C-Eval示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv + ``` + - 全部科目并输出CSV: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - MMLU示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - 试验中功能 + - Warm Up + ```bash + python examples/bench.py --nvidia --model= --warmup + ``` + - Paged Attention + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn + ``` + - CUDA Graph + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph + ``` + - 选择attention后端 (使用flash 
attention后端需要先在InfiniCore完成相关配置和编译) + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] + ``` \ No newline at end of file diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py deleted file mode 100644 index 9f15433c..00000000 --- a/examples/jiuge_fix.py +++ /dev/null @@ -1,200 +0,0 @@ -import infinicore -import transformers -from transformers import AutoTokenizer -from tokenizers import decoders as _dec -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine -import argparse -import sys -import time -import os -import numpy as np -from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig -from packaging import version -from infinilm.base_config import BaseConfig - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - -_PAGED_KV_BLOCK_SIZE = 256 - - - -def test( - prompts: str | list[str], - model_path, - max_new_tokens=100, - infini_device=infinicore.device("cpu", 0), - tp=1, - enable_paged_attn=False, - enable_graph=False, - top_k=1, - top_p=1.0, - temperature=1.0, - attn_backend="default", -): - model_path = os.path.expanduser(model_path) - # ---------------------------------------------------------------------------- # - # Create Model - # ---------------------------------------------------------------------------- # - if enable_paged_attn and attn_backend == "default": - attn_backend = "paged-attn" - - model = InferEngine( - model_path, - device=infini_device, - distributed_config=DistConfig(tp), - enable_graph_compiling=enable_graph, - attention_backend=attn_backend, - kv_cache_dtype=cfg.kv_cache_dtype, - ) - # ---------------------------------------------------------------------------- # - # Load Weights - # ---------------------------------------------------------------------------- # - load_model_state_dict_by_file(model, model_path, 
dtype=model.config.dtype) - - # ---------------------------------------------------------------------------- # - # create tokenizer - # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if "llama" == model.config.model_type: - backend = getattr(tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - [ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) - - # ---------------------------------------------------------------------------- # - # tokenize - # ---------------------------------------------------------------------------- # - # prompt = "山东最高的山是?" - if isinstance(prompts, str): - prompts = [prompts] - input_contents = [ - tokenizer.apply_chat_template( - conversation=[{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - for prompt in prompts - ] - - # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - # "input_ids" - # ] # List: [[1, 1128, 526, 366, 29892]] - if version.parse(transformers.__version__) < version.parse("5.0.0"): - # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. 
- input_ids_list = [ - tokenizer.encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - else: - input_ids_list = [ - tokenizer._encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - - # ---------------------------------------------------------------------------- # - # Create KVCache - # ---------------------------------------------------------------------------- # - if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) - max_total_tokens = max_new_tokens + len(input_ids_list[0]) - cache_config = PagedKVCacheConfig( - num_blocks=( - (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE - ) - * batch_size, - block_size=_PAGED_KV_BLOCK_SIZE, - ) - else: - batch_size = 1 if prompts is str else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) - cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=initial_capacity - ) - - model.reset_cache(cache_config) - - # ---------------------------------------------------------------------------- # - # Generate - # ---------------------------------------------------------------------------- # - print(input_contents[0], end="", flush=True) - input_ids_infini = infinicore.from_list(input_ids_list) - - t1 = time.time() - print("=================== start generate ====================") - output_ids = model.generate( - input_ids_infini, - GenerationConfig( - max_new_tokens=max_new_tokens, - temperature=temperature, - top_k=top_k, - top_p=top_p, - ), - _measure_and_log_time=True, - ) - t2 = time.time() - - numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) - print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) - - print( - f"total_time: {round((t2 - t1) * 1000, 2)} ms", - ) - - -if __name__ == "__main__": - cfg = BaseConfig() - - device_str = 
cfg.get_device_str(cfg.device) - - prompts = [cfg.prompt for _ in range(cfg.batch_size)] - - _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size - - model_path = cfg.model - - max_new_tokens = cfg.max_new_tokens - - backend = cfg.backend - - tp = cfg.tp - - enable_paged_attn = cfg.enable_paged_attn - - enable_graph = cfg.enable_graph - - if backend != "cpp": - raise ValueError(f"Unsupported backend: {backend}.") - - infini_device = infinicore.device(device_str, 0) - - test( - prompts, - model_path, - max_new_tokens, - infini_device=infini_device, - tp=tp, - enable_paged_attn=enable_paged_attn, - enable_graph=enable_graph, - top_k=cfg.top_k, - top_p=cfg.top_p, - temperature=cfg.temperature, - attn_backend=cfg.attn, - ) \ No newline at end of file From 070bcd53705c8a7448fe136de42dc31717099f1c Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:10:39 +0000 Subject: [PATCH 09/12] delete redundancy --- .gitignore | 39 ------------ README.md | 182 ----------------------------------------------------- 2 files changed, 221 deletions(-) delete mode 100644 .gitignore delete mode 100644 README.md diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 7c079758..00000000 --- a/.gitignore +++ /dev/null @@ -1,39 +0,0 @@ -# Xmake cache -.xmake/ -build/ -python/infinilm/lib/*.so - -#model_weight -model_weight/ - -jiuge_infer.sh -jiuge.sh - -# MacOS Cache -.DS_Store - -# Vscode -.vscode/ - -# Python -__pycache__/ -*.egg-info/ - -# Log -*.log - -# Cache -.cache/ - -# JSON -*.json - -#GGUF -*.gguf - -# txt -*.txt - -*.http - -*.nsys-rep diff --git a/README.md b/README.md deleted file mode 100644 index 3a455a34..00000000 --- a/README.md +++ /dev/null @@ -1,182 +0,0 @@ -# InfiniLM - -![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) - -本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 - -## 使用方式 - -- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 - -- 编译并安装 `InfiniLM` - -```bash -xmake && xmake 
install -``` - -- 运行模型推理测试 - -```bash -python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device] -``` - -- 部署模型推理服务 - -```bash -python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -- 测试模型推理服务性能 - -```bash -python scripts/test_perf.py -``` - -- 使用推理服务测试模型困惑度(Perplexity) - -```bash -python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -## 使用方式(新版) -#### 一、编译并安装 `InfiniCore` -编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) : - -- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`) -- 根据硬件平台,选择 xmake 构建配置 -- 编译安装InfiniCore -- 安装 C++ 库 -- 安装 Python 包 - - -#### 二、编译并安装 `InfiniLM` - - 克隆项目 - - 由于仓库中含有子模块,所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`,如: - - ```shell - git clone --recursive https://github.com/InfiniTensor/InfiniLM.git - ``` - - 或者在普通克隆后进行更新: - - ```shell - git submodule update --init --recursive - ``` - - - - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用 - ```bash - xmake f --use-kv-caching= [true | false] -cv - ``` - - - - 安装 InfiniLM Python 包 - ```bash - pip install -e . 
- ``` - - - 单次推理测试 - - llama示例 - ```bash - python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path= - ``` - - 例如: - ```bash - python examples/jigue.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0 - ``` - - 分布式推理测试 - - 9g示例 - ```bash - python examples/jiuge.py [---nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH - ``` - - - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式 - ```bash - python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16 - ``` - - - - 推理服务测试 - - 启动推理服务 - ```bash - python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT - ``` - - - 单卡示例: - ```bash - CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 多卡分布式示例: - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 测试推理服务性能: - ```bash - python scripts/test_perf.py --verbose - ``` - - - 运行推理基准测试(C-Eval/MMLU) - - ```bash - python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH] - ``` - - - 参数说明: - - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目) - - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果 - - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 
`~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 - - - C-Eval示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv - ``` - - 全部科目并输出CSV: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - MMLU示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - 试验中功能 - - Warm Up - ```bash - python examples/bench.py --nvidia --model= --warmup - ``` - - Paged Attention - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn - ``` - - CUDA Graph - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph - ``` - - 选择attention后端 (使用flash 
attention后端需要先在InfiniCore完成相关配置和编译) - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] - ``` \ No newline at end of file From 91492a737a31dec751f5b9eae38fa8cd6b3397c3 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:25:13 +0000 Subject: [PATCH 10/12] fix push question --- .gitignore | 36 ++++++++++ README.md | 182 +++++++++++++++++++++++++++++++++++++++++++++++++ jiuge_infer.sh | 1 + 3 files changed, 219 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 jiuge_infer.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..11b10c29 --- /dev/null +++ b/.gitignore @@ -0,0 +1,36 @@ +# Xmake cache +.xmake/ +build/ +python/infinilm/lib/*.so + +# MacOS Cache +.DS_Store + +# Vscode +.vscode/ + + +model_weight/ + +# Python +__pycache__/ +*.egg-info/ + +# Log +*.log + +# Cache +.cache/ + +# JSON +*.json + +#GGUF +*.gguf + +# txt +*.txt + +*.http + +*.nsys-rep diff --git a/README.md b/README.md new file mode 100644 index 00000000..48448c56 --- /dev/null +++ b/README.md @@ -0,0 +1,182 @@ +# InfiniLM + +![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) + +本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 + +## 使用方式 + +- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 + +- 编译并安装 `InfiniLM` + +```bash +xmake && xmake install +``` + +- 运行模型推理测试 + +```bash +python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device] +``` + +- 部署模型推理服务 + +```bash +python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] +``` + +- 测试模型推理服务性能 + +```bash +python scripts/test_perf.py +``` + +- 使用推理服务测试模型困惑度(Perplexity) + +```bash +python scripts/test_ppl.py 
--model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
+```
+
+## 使用方式(新版)
+#### 一、编译并安装 `InfiniCore`
+编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :
+
+- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)
+- 根据硬件平台,选择 xmake 构建配置
+- 编译安装InfiniCore
+- 安装 C++ 库
+- 安装 Python 包
+
+
+#### 二、编译并安装 `InfiniLM`
+  - 克隆项目
+
+    由于仓库中含有子模块,所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`,如:
+
+    ```shell
+    git clone --recursive https://github.com/InfiniTensor/InfiniLM.git
+    ```
+
+    或者在普通克隆后进行更新:
+
+    ```shell
+    git submodule update --init --recursive
+    ```
+
+
+  - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用
+    ```bash
+    xmake f --use-kv-caching= [true | false] -cv
+    ```
+
+
+  - 安装 InfiniLM Python 包
+    ```bash
+    pip install -e .
+    ```
+
+  - 单次推理测试
+    - llama示例
+    ```bash
+    python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=
+    ```
+    - 例如:
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
+    ```
+  - 分布式推理测试
+    - 9g示例
+    ```bash
+    python examples/jiuge.py [--nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
+    ```
+
+    - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16
+    ```
+
+
+  - 推理服务测试
+    - 启动推理服务
+    ```bash
+    python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT
+    ```
+
+    - 单卡示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1
+    ```
+
+    - 多卡分布式示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python 
python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1 + ``` + + - 测试推理服务性能: + ```bash + python scripts/test_perf.py --verbose + ``` + + - 运行推理基准测试(C-Eval/MMLU) + + ```bash + python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH] + ``` + + - 参数说明: + - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目) + - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果 + - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 `~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 + + - C-Eval示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv + ``` + - 全部科目并输出CSV: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - MMLU示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python 
test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - 试验中功能 + - Warm Up + ```bash + python examples/bench.py --nvidia --model= --warmup + ``` + - Paged Attention + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn + ``` + - CUDA Graph + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph + ``` + - 选择attention后端 (使用flash attention后端需要先在InfiniCore完成相关配置和编译) + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] + ``` diff --git a/jiuge_infer.sh b/jiuge_infer.sh new file mode 100644 index 00000000..91e7bc7e --- /dev/null +++ b/jiuge_infer.sh @@ -0,0 +1 @@ +python examples/jiuge.py --model-path /home/featurize/work/qy_interview/InfiniLM/model_weight --nvidia \ No newline at end of file From 56a39cdf0933740864a3b695055f6b7620fd588b Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:38:47 +0000 Subject: [PATCH 11/12] Remove jiuge_infer.sh --- jiuge_infer.sh | 1 - 1 file changed, 1 deletion(-) delete mode 100644 jiuge_infer.sh diff --git a/jiuge_infer.sh b/jiuge_infer.sh deleted file mode 100644 index 91e7bc7e..00000000 --- a/jiuge_infer.sh +++ /dev/null @@ -1 +0,0 @@ -python examples/jiuge.py --model-path /home/featurize/work/qy_interview/InfiniLM/model_weight --nvidia \ No newline at end of file From 102c60326b0d155a65d828e0b166d0cfc9338081 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:40:20 +0000 Subject: [PATCH 12/12] remove --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.gitignore b/.gitignore index 11b10c29..149d4bc3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ python/infinilm/lib/*.so # Vscode .vscode/ - +*.sh model_weight/ # Python