diff --git a/.gitignore b/.gitignore index b728e6ea..149d4bc3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ python/infinilm/lib/*.so # Vscode .vscode/ +*.sh +model_weight/ + # Python __pycache__/ *.egg-info/ diff --git a/examples/bench.py b/examples/bench.py index a52c44ec..6fc27498 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -3,6 +3,7 @@ from infinilm.modeling_utils import load_model_state_dict_by_file from infinilm.distributed import DistConfig from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.base_config import BaseConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig import argparse import sys @@ -125,150 +126,6 @@ def get_test_cases( return case_dict - -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - "--qy", - action="store_true", - help="Run qy test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--cambricon", - action="store_true", - help="Run cambricon test", - ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) - parser.add_argument( - "--model", - type=str, - required=True, - help="model path", - ) - parser.add_argument( - "--batch-size", - type=parse_list, - default=1, - help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')", - ) - parser.add_argument( - "--tensor-parallel-size", - "--tp", - type=int, - default=1, - help="total 
rank for tensor parallel", - ) - parser.add_argument( - "--input-len", - type=parse_list, - default=10, - help="output tokens", - ) - - parser.add_argument( - "--output-len", - type=parse_list, - default=20, - help="output tokens", - ) - parser.add_argument( - "--skip-load", - action="store_true", - help="skip loading model weights", - ) - parser.add_argument( - "--top-k", - type=int, - default=1, - help="top k sampling", - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="top p sampling", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="sampling temperature", - ) - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="use paged cache", - ) - parser.add_argument( - "--paged-kv-block-size", - type=int, - default=256, - help="num tokens each kv block can hold", - ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="enable graph compiling", - ) - parser.add_argument( - "--warmup", - action="store_true", - help="Perform a warmup run before benchmarking/inference.", - ) - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="attention backend to use: 'default' or 'flash-attn'", - ) - parser.add_argument( - "--kv-cache-dtype", - type=str, - default=None, - choices=["int8"], - ) - - return parser.parse_args() - - with open("examples/bench_prompt.md", "r") as f: prompt = f.read() @@ -305,7 +162,7 @@ def __init__( cache_config=cache_config, enable_graph_compiling=enable_graph, attention_backend=attn_backend, - kv_cache_dtype=args.kv_cache_dtype, + kv_cache_dtype=cfg.kv_cache_dtype, ) # ---------------------------------------------------------------------------- # @@ -396,52 +253,28 @@ def run( if __name__ == "__main__": - args = get_args() - print(args) - - # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif 
args.qy: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - elif args.cambricon: - device_str = "mlu" - elif args.ali: - device_str = "cuda" - elif args.hygon: - device_str = "cuda" - else: - print( - "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50" - ) - sys.exit(1) - _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size # -------------------------------------------------------- # # 解析参数 # -------------------------------------------------------- # - model_path = args.model + model_path = cfg.model infini_device = infinicore.device(device_str, 0) - tp = args.tensor_parallel_size + tp = cfg.tp - skip_load = args.skip_load + skip_load = cfg.skip_load - batch_size = args.batch_size - input_len = args.input_len - output_len = args.output_len - enable_paged_attn = args.enable_paged_attn - enable_graph = args.enable_graph - attn_backend = args.attn + batch_size = cfg.batch_size + input_len = cfg.input_len + output_len = cfg.output_len + enable_paged_attn = cfg.enable_paged_attn + enable_graph = cfg.enable_graph + attn_backend = cfg.attn if isinstance(batch_size, int): batch_size = [batch_size] @@ -488,7 +321,7 @@ def run( # ---------------------------------------------------------------------------- # # Warmup # ---------------------------------------------------------------------------- # - if args.warmup: + if cfg.warmup: warmup_steps = 1 # warmup cache capacity @@ -518,9 +351,9 @@ def run( input_ids_infini, GenerationConfig( max_new_tokens=5, # decode kernel warmup - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, + temperature=cfg.temperature, + top_k=cfg.top_k, + top_p=cfg.top_p, stop_on_eos=False, ), _measure_and_log_time=False, @@ -557,7 +390,7 @@ def run( 
batch_size=batch_size, input_len=input_len, output_len=output_len, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, ) diff --git a/examples/jiuge.py b/examples/jiuge.py index fa547435..9f15433c 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -12,153 +12,13 @@ import numpy as np from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig from packaging import version +from infinilm.base_config import BaseConfig sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) _PAGED_KV_BLOCK_SIZE = 256 -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - "--qy", - action="store_true", - help="Run qy test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--cambricon", - action="store_true", - help="Run cambricon test", - ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) - parser.add_argument( - "--model-path", - type=str, - required=True, - help="model_path", - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=100, - help="max_new_tokens", - ) - parser.add_argument( - "--backend", - type=str, - default="cpp", - help="python or cpp model", - ) - parser.add_argument( - "--batch-size", - type=int, - default=1, - help="number of prompts in a batch", - ) - parser.add_argument( - "--prompt", - type=str, - default="How are you", - help="input 
prompt", - ) - parser.add_argument( - "--tp", - type=int, - default=1, - help="total rank for tensor parallel", - ) - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="use paged cache", - ) - - parser.add_argument( - "--paged-kv-block-size", - type=int, - default=256, - help="num tokens each kv block can hold", - ) - - parser.add_argument( - "--enable-graph", - action="store_true", - help="enable graph compiling", - ) - - parser.add_argument( - "--top-k", - type=int, - default=1, - help="top k sampling", - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="top p sampling", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="sampling temperature", - ) - - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="attention backend to use: 'default' or 'flash-attn'", - ) - - parser.add_argument( - "--kv-cache-dtype", - type=str, - default=None, - choices=["int8"], - ) - - return parser.parse_args() - def test( prompts: str | list[str], @@ -186,7 +46,7 @@ def test( distributed_config=DistConfig(tp), enable_graph_compiling=enable_graph, attention_backend=attn_backend, - kv_cache_dtype=args.kv_cache_dtype, + kv_cache_dtype=cfg.kv_cache_dtype, ) # ---------------------------------------------------------------------------- # # Load Weights @@ -300,44 +160,26 @@ def test( if __name__ == "__main__": - args = get_args() - print(args) + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) - # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.qy: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - elif args.cambricon: - device_str = "mlu" - elif args.ali: - device_str = "cuda" - elif args.hygon: - device_str = "cuda" - else: - 
print( - "Usage: python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" - "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" - ) - sys.exit(1) - prompts = [args.prompt for _ in range(args.batch_size)] - _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model - model_path = args.model_path - max_new_tokens = args.max_new_tokens - backend = args.backend - tp = args.tp - enable_paged_attn = args.enable_paged_attn - enable_graph = args.enable_graph + max_new_tokens = cfg.max_new_tokens + + backend = cfg.backend + + tp = cfg.tp + + enable_paged_attn = cfg.enable_paged_attn + + enable_graph = cfg.enable_graph + if backend != "cpp": raise ValueError(f"Unsupported backend: {backend}.") @@ -351,8 +193,8 @@ def test( tp=tp, enable_paged_attn=enable_paged_attn, enable_graph=enable_graph, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, - attn_backend=args.attn, - ) + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, + attn_backend=cfg.attn, + ) \ No newline at end of file diff --git a/examples/llama.py b/examples/llama.py index aa890ca9..413afa13 100644 --- a/examples/llama.py +++ b/examples/llama.py @@ -7,70 +7,70 @@ import sys import time import os - +from infinilm.base_config import BaseConfig sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - 
parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--model_path", - type=str, - required=True, - help="model_path", - ) - parser.add_argument( - "--max_new_tokens", - type=int, - default=100, - help="max_new_tokens", - ) - parser.add_argument( - "--backend", - type=str, - default="python", - help="python or cpp model", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="number of prompts in a batch", - ) - parser.add_argument( - "--prompt", - type=str, - default="How are you", - help="input prompt", - ) - - return parser.parse_args() +# def get_args(): +# parser = argparse.ArgumentParser(description="run Llama args") + +# parser.add_argument( +# "--cpu", +# action="store_true", +# help="Run cpu test", +# ) +# parser.add_argument( +# "--nvidia", +# action="store_true", +# help="Run nvidia test", +# ) +# parser.add_argument( +# "--metax", +# action="store_true", +# help="Run metax test", +# ) +# parser.add_argument( +# "--moore", +# action="store_true", +# help="Run moore test", +# ) +# parser.add_argument( +# "--iluvatar", +# action="store_true", +# help="Run iluvatar test", +# ) +# parser.add_argument( +# "--model_path", +# type=str, +# required=True, +# help="model_path", +# ) +# parser.add_argument( +# "--max_new_tokens", +# type=int, +# default=100, +# help="max_new_tokens", +# ) +# parser.add_argument( +# "--backend", +# type=str, +# default="python", +# help="python or cpp model", +# ) +# parser.add_argument( +# "--batch_size", +# type=int, +# default=1, +# help="number of prompts in a batch", +# ) +# parser.add_argument( +# "--prompt", +# type=str, +# default="How are you", +# help="input prompt", +# ) + +# return parser.parse_args() def test( @@ -163,32 +163,33 @@ def test( if __name__ == "__main__": - args = get_args() - print(args) + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) # Parse command line arguments - device_str = "cpu" - if args.cpu: 
- device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - else: - print( - "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=\n" - "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" - ) - sys.exit(1) - prompts = [args.prompt for _ in range(args.batch_size)] - - model_path = args.model_path - max_new_tokens = args.max_new_tokens - backend = args.backend + # device_str = "cpu" + # if args.cpu: + # device_str = "cpu" + # elif args.nvidia: + # device_str = "cuda" + # elif args.metax: + # device_str = "cuda" + # elif args.moore: + # device_str = "musa" + # elif args.iluvatar: + # device_str = "cuda" + # else: + # print( + # "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=\n" + # "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" + # ) + # sys.exit(1) + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + model_path = cfg.model + max_new_tokens = cfg.max_new_tokens + backend = cfg.backend if backend != "python": raise ValueError(f"Unsupported backend: {backend}.") diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py index e34514a7..f552a2cc 100644 --- a/python/infinilm/__init__.py +++ b/python/infinilm/__init__.py @@ -2,6 +2,7 @@ from . import distributed from . import cache from . import llm +from . 
import base_config from .llm import ( LLM, @@ -16,6 +17,7 @@ "distributed", "cache", "llm", + "base_config", # LLM classes "LLM", "AsyncLLMEngine", diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py new file mode 100644 index 00000000..0b19700a --- /dev/null +++ b/python/infinilm/base_config.py @@ -0,0 +1,172 @@ +import argparse +import sys +import os +# sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../scripts")) +# from libinfinicore_infemport DeviceType + + +class BaseConfig: + """InfiniLM Unified Config - Command line argument parser""" + + def __init__(self): + + self.parser = argparse.ArgumentParser(description="InfiniLM Unified Config") + self._add_common_args() + self.args, self.extra = self.parser.parse_known_args() + + + self.model = self.args.model + self.device = self.args.device + self.tp = self.args.tp + + + self.attn = self.args.attn + self.enable_graph = self.args.enable_graph + self.cache_type = self.args.cache_type + self.enable_paged_attn = self.args.enable_paged_attn + self.paged_kv_block_size = self.args.paged_kv_block_size + self.num_blocks = self.args.num_blocks + self.block_size = self.args.block_size + self.max_cache_len = self.args.max_cache_len + self.kv_cache_dtype = self.args.kv_cache_dtype + self.skip_load = self.args.skip_load + + + self.batch_size = self.args.batch_size + self.max_batch = self.args.max_batch + self.max_batch_size = self.args.max_batch_size + self.input_len = self.args.input_len + self.output_len = self.args.output_len + self.max_new_tokens = self.args.max_new_tokens + self.max_tokens = self.args.max_tokens + self.prompt = self.args.prompt + self.top_k = self.args.top_k + self.top_p = self.args.top_p + self.temperature = self.args.temperature + + self.warmup = self.args.warmup + self.verbose = self.args.verbose + self.log_level = self.args.log_level + + + # Evaluation parameters + self.bench = self.args.bench + self.backend = self.args.backend + self.tp = self.args.tp + 
self.subject = self.args.subject + self.split = self.args.split + self.num_samples = self.args.num_samples + self.output_csv = self.args.output_csv + self.cache_dir = self.args.cache_dir + + + # Quantization parameters + self.awq = self.args.awq + self.gptq = self.args.gptq + self.dtype = self.args.dtype + + + # Server parameters + self.host = self.args.host + self.port = self.args.port + self.endpoint = self.args.endpoint + + def _add_common_args(self): + # --- base configuration --- + self.parser.add_argument("--model", type=str, required=True) + self.parser.add_argument("--device", type=str, default="cpu") + self.parser.add_argument("--tp", "--tensor-parallel-size", type=int, default=1) + + + # --- Infer backend optimization --- + self.parser.add_argument("--attn", type=str, default="default", choices=["default", "paged-attn", "flash-attn"]) + self.parser.add_argument("--enable-graph", action="store_true") + self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"]) + self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache",) + self.parser.add_argument("--paged-kv-block-size", type=int, default=256) + self.parser.add_argument("--num-blocks", type=int, default=512, help="number of KV cache blocks") + self.parser.add_argument("--block-size", type=int, default=256, help="size of each KV cache block") + self.parser.add_argument("--max-cache-len", type=int, default=4096, help="maximum cache length") + self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type") + self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") + + + # --- Length and infer parameters --- + self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--max-batch", type=int, default=3, help="maximum batch size") + self.parser.add_argument("--max-batch-size", type=int, default=8, help="maximum batch size 
for server") + self.parser.add_argument("--input-len", type=int, default=10, help="input sequence length") + self.parser.add_argument("--output-len", type=int, default=20, help="output sequence length") + self.parser.add_argument("--max-new-tokens", type=int, default=500, help="maximum number of new tokens to generate") + self.parser.add_argument("--max-tokens", type=int, default=512, help="maximum tokens") + self.parser.add_argument("--prompt", type=str, default="How are you", help="default prompt text") + self.parser.add_argument("--top-k", type=int, default=1) + self.parser.add_argument("--top-p", type=float, default=1.0) + self.parser.add_argument("--temperature", type=float, default=1.0) + + # --- debug --- + self.parser.add_argument("--warmup", action="store_true") + self.parser.add_argument("--verbose", action="store_true") + self.parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="logging level") + + + # --- Evaluation parameters --- + self.parser.add_argument("--bench", type=str, default=None, choices=["ceval", "mmlu"], help="benchmark to evaluate") + self.parser.add_argument("--backend", type=str, default="cpp", choices=["python", "cpp", "torch", "vllm"], help="backend type") + + self.parser.add_argument("--subject", type=str, default="all", help="subject(s) to evaluate, comma-separated or 'all'") + self.parser.add_argument("--split", type=str, default="test", choices=["test", "val", "all"], help="dataset split to use") + self.parser.add_argument("--num-samples", type=int, default=None, help="number of samples to evaluate per subject") + self.parser.add_argument("--output-csv", type=str, default=None, help="path to output CSV file for results") + self.parser.add_argument("--cache-dir", type=str, default=None, help="directory for dataset cache") + + + # --- Quantization parameters --- + self.parser.add_argument("--awq", action="store_true", help="use AWQ quantization") + 
self.parser.add_argument("--gptq", action="store_true", help="use GPTQ quantization") + self.parser.add_argument("--dtype", type=str, default="float16", help="data type for model") + + + # --- Server parameters --- + self.parser.add_argument("--host", type=str, default="0.0.0.0", help="server host") + self.parser.add_argument("--port", type=int, default=8000, help="server port") + self.parser.add_argument("--endpoint", type=str, default="/completions", help="API endpoint") + + + def get_device_str(self, device): + """Convert device name to backend string (cuda/cpu/musa/mlu)""" + DEVICE_STR_MAP = { + "cpu": "cpu", + "nvidia": "cuda", + "qy": "cuda", + "cambricon": "mlu", + "ascend": "ascend", # assuming the Huawei Ascend backend string is "ascend" + "metax": "cuda", + "moore": "musa", + "iluvatar": "cuda", + "kunlun": "kunlun", # assuming the Kunlunxin backend string is "kunlun" + "hygon": "cuda", + "ali": "cuda" + } + return DEVICE_STR_MAP.get(device.lower(), "cpu") + + # def _get_device_type(self, dev_str): + # """Convert device string to DeviceType enum""" + # DEVICE_TYPE_MAP = { + # "cpu": DeviceType.DEVICE_TYPE_CPU, + # "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + # "qy": DeviceType.DEVICE_TYPE_QY, + # "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + # "ascend": DeviceType.DEVICE_TYPE_ASCEND, + # "metax": DeviceType.DEVICE_TYPE_METAX, + # "moore": DeviceType.DEVICE_TYPE_MOORE, + # "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + # "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, + # "hygon": DeviceType.DEVICE_TYPE_HYGON, + # "ali": DeviceType.DEVICE_TYPE_ALI + # } + # return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) + + def __repr__(self): + """String representation of configuration""" + return f"BaseConfig(model='{self.model}', device='{self.device}', tp={self.tp})"