From ecd97345dcf8d75fdd0190d0882df812dffc764e Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Wed, 8 Apr 2026 07:13:58 +0000 Subject: [PATCH 01/12] Base Config Add --- README.md | 2 +- jiuge.sh | 46 ++++++++++++++++++ python/infinilm/__init__.py | 2 + python/infinilm/base_config.py | 86 ++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 jiuge.sh create mode 100644 python/infinilm/base_config.py diff --git a/README.md b/README.md index 48448c56..afc242b2 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - 编译并安装 `InfiniLM` ```bash -xmake && xmake install + ``` - 运行模型推理测试 diff --git a/jiuge.sh b/jiuge.sh new file mode 100644 index 00000000..e7ddb2dd --- /dev/null +++ b/jiuge.sh @@ -0,0 +1,46 @@ +featurize@featurize:~/work/InfiniLM$ cat jiuge.sh +#!/bin/bash + +# Jiuge模型运行脚本 +# 使用NVIDIA显卡运行9G4B模型 + +set -e # 遇到错误立即退出 + +echo "==========================================" +echo "🚀 启动 Jiuge 模型 (9G4B) - NVIDIA版本" +echo "==========================================" +export INFINI_ROOT=/home/featurize/.infini +export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH +# 设置参数 +MODEL_DIR="/home/featurize/work/InfiniFamily/9G4B" +DEVICE="--nvidia" +N_DEVICE=1 +SCRIPT_PATH="python scripts/jiuge.py" + +# 检查模型目录是否存在 +if [ ! -d "$MODEL_DIR" ]; then + echo "❌ 错误: 模型目录不存在: $MODEL_DIR" + echo "请检查路径是否正确" + exit 1 +fi + +# 检查Python脚本是否存在 +if [ ! -f "scripts/jiuge.py" ]; then + echo "❌ 错误: 未找到jiuge.py脚本: scripts/jiuge.py" + echo "请确保在当前目录下运行此脚本" + exit 1 +fi + +echo "📁 模型路径: $MODEL_DIR" +echo "🎯 设备类型: NVIDIA GPU" +echo "💻 设备数量: $N_DEVICE" +echo "" + +# 运行模型 +echo "🔄 启动模型..." 
+$SCRIPT_PATH $DEVICE $MODEL_DIR $N_DEVICE + +echo "" +echo "==========================================" +echo "✅ 模型运行完成" +echo "=========================================="ß \ No newline at end of file diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py index e34514a7..f552a2cc 100644 --- a/python/infinilm/__init__.py +++ b/python/infinilm/__init__.py @@ -2,6 +2,7 @@ from . import distributed from . import cache from . import llm +from . import base_config from .llm import ( LLM, @@ -16,6 +17,7 @@ "distributed", "cache", "llm", + "base_config", # LLM classes "LLM", "AsyncLLMEngine", diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py new file mode 100644 index 00000000..d4104b5e --- /dev/null +++ b/python/infinilm/base_config.py @@ -0,0 +1,86 @@ +import argparse +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../scripts")) +from libinfinicore_infer import DeviceType + + +class BaseConfig: + """InfiniLM Unified Config - Command line argument parser""" + + def __init__(self): + + self.parser = argparse.ArgumentParser(description="InfiniLM Unified Config") + self._add_common_args() + self.args, self.extra = self.parser.parse_known_args() + + + self.model = self.args.model + self.device_name = self.args.device + self.device_type = self._get_device_type(self.args.device) + self.tp = self.args.tp + + + self.attn = self.args.attn + self.enable_graph = self.args.enable_graph + self.cache_type = self.args.cache_type + self.enable_paged_attn = self.args.enable_paged_attn + self.paged_kv_block_size = self.args.paged_kv_block_size + + + self.batch_size = self.args.batch_size + self.top_k = self.args.top_k + self.top_p = self.args.top_p + self.temperature = self.args.temperature + + self.warm_up = self.args.warm_up + self.verbose = self.args.verbose + self.log_evel = self.args.log_evel + + def _add_common_args(self): + # --- base configuration --- + self.parser.add_argument("--model", type=str, 
required=True) + self.parser.add_argument("--device", type=str, default="cpu") + self.parser.add_argument("--tp", "--tensor-parallel-size", type=int, default=1) + + + # --- Infer backend optimization --- + self.parser.add_argument("--attn", type=str, default="default", choices=["default", "flash-attn"]) + self.parser.add_argument("--enable-graph", action="store_true") + self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"]) + self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache",) + self.parser.add_argument("--paged-kv-block-size", type=int, default=256) + + + # --- Length and infer parameters --- + self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--top-k", type=int, default=1) + self.parser.add_argument("--top-p", type=float, default=1.0) + self.parser.add_argument("--temperature", type=float, default=1.0) + + # --- debug --- + self.parser.add_argument("--warmup", action="store_true") + self.parser.add_argument("--verbose", action="store_true") + self.parser.add_argument("--log-evel", type=str, default="INFO") + + + def _get_device_type(self, dev_str): + """Convert device string to DeviceType enum""" + DEVICE_TYPE_MAP = { + "cpu": DeviceType.DEVICE_TYPE_CPU, + "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + "qy": DeviceType.DEVICE_TYPE_QY, + "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + "ascend": DeviceType.DEVICE_TYPE_ASCEND, + "metax": DeviceType.DEVICE_TYPE_METAX, + "moore": DeviceType.DEVICE_TYPE_MOORE, + "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, + "hygon": DeviceType.DEVICE_TYPE_HYGON, + "ali": DeviceType.DEVICE_TYPE_ALI + } + return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) + + def __repr__(self): + """String representation of configuration""" + return f"BaseConfig(model='{self.model}', device='{self.device_name}', tp={self.tp})" From 
312f75e6d1d259e38aec535668ca8a0d555a6ce8 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Wed, 8 Apr 2026 08:16:26 +0000 Subject: [PATCH 02/12] args supplement --- python/infinilm/base_config.py | 46 ++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index d4104b5e..5e1a8bd2 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -26,9 +26,14 @@ def __init__(self): self.cache_type = self.args.cache_type self.enable_paged_attn = self.args.enable_paged_attn self.paged_kv_block_size = self.args.paged_kv_block_size + self.kv_cache_dtype = self.args.kv_cache_dtype + self.skip_load = self.args.skip_load + - self.batch_size = self.args.batch_size + self.input_len = self.args.input_len + self.output_len = self.args.output_len + self.max_new_tokens = self.args.max_new_tokens self.top_k = self.args.top_k self.top_p = self.args.top_p self.temperature = self.args.temperature @@ -37,6 +42,22 @@ def __init__(self): self.verbose = self.args.verbose self.log_evel = self.args.log_evel + + # Evaluation parameters + self.bench = self.args.bench + self.backend = self.args.backend + self.ndev = self.args.ndev + self.subject = self.args.subject + self.split = self.args.split + self.num_samples = self.args.num_samples + self.output_csv = self.args.output_csv + self.cache_dir = self.args.cache_dir + + + # Quantization parameters + self.awq = self.args.awq + self.gptq = self.args.gptq + def _add_common_args(self): # --- base configuration --- self.parser.add_argument("--model", type=str, required=True) @@ -45,15 +66,20 @@ def _add_common_args(self): # --- Infer backend optimization --- - self.parser.add_argument("--attn", type=str, default="default", choices=["default", "flash-attn"]) + self.parser.add_argument("--attn", type=str, default="default", choices=["default", "paged-attn", "flash-attn"]) self.parser.add_argument("--enable-graph", 
action="store_true") self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"]) self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache",) self.parser.add_argument("--paged-kv-block-size", type=int, default=256) + self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type") + self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") # --- Length and infer parameters --- self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--input-len", type=int, default=10, help="input sequence length") + self.parser.add_argument("--output-len", type=int, default=20, help="output sequence length") + self.parser.add_argument("--max-new-tokens", type=int, default=500, help="maximum number of new tokens to generate") self.parser.add_argument("--top-k", type=int, default=1) self.parser.add_argument("--top-p", type=float, default=1.0) self.parser.add_argument("--temperature", type=float, default=1.0) @@ -64,6 +90,22 @@ def _add_common_args(self): self.parser.add_argument("--log-evel", type=str, default="INFO") + # --- Evaluation parameters --- + self.parser.add_argument("--bench", type=str, default=None, choices=["ceval", "mmlu"], help="benchmark to evaluate") + self.parser.add_argument("--backend", type=str, default="cpp", choices=["python", "cpp", "torch", "vllm"], help="backend type") + self.parser.add_argument("--ndev", type=int, default=1, help="number of devices for tensor parallelism") + self.parser.add_argument("--subject", type=str, default="all", help="subject(s) to evaluate, comma-separated or 'all'") + self.parser.add_argument("--split", type=str, default="test", choices=["test", "val", "all"], help="dataset split to use") + self.parser.add_argument("--num-samples", type=int, default=None, help="number of samples to evaluate per subject") + 
self.parser.add_argument("--output-csv", type=str, default=None, help="path to output CSV file for results") + self.parser.add_argument("--cache-dir", type=str, default=None, help="directory for dataset cache") + + + # --- Quantization parameters --- + self.parser.add_argument("--awq", action="store_true", help="use AWQ quantization") + self.parser.add_argument("--gptq", action="store_true", help="use GPTQ quantization") + + def _get_device_type(self, dev_str): """Convert device string to DeviceType enum""" DEVICE_TYPE_MAP = { From 46997bf7c63373e8bb0f97faa2cfa9fdb86f5c15 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 05:28:21 +0000 Subject: [PATCH 03/12] args for two example script --- .gitignore | 7 ++ examples/jiuge.py | 208 ++++----------------------------- examples/jiuge_fix.py | 200 +++++++++++++++++++++++++++++++ examples/llama.py | 171 +++++++++++++-------------- python/infinilm/base_config.py | 108 ++++++++++++----- 5 files changed, 394 insertions(+), 300 deletions(-) create mode 100644 examples/jiuge_fix.py diff --git a/.gitignore b/.gitignore index b728e6ea..086071a5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,13 @@ build/ python/infinilm/lib/*.so +#model_weight +model_weight/ + +jiuge_infer.sh +jiuge.sh +README.md + # MacOS Cache .DS_Store diff --git a/examples/jiuge.py b/examples/jiuge.py index fa547435..9f15433c 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -12,153 +12,13 @@ import numpy as np from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig from packaging import version +from infinilm.base_config import BaseConfig sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) _PAGED_KV_BLOCK_SIZE = 256 -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - 
"--qy", - action="store_true", - help="Run qy test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--cambricon", - action="store_true", - help="Run cambricon test", - ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) - parser.add_argument( - "--model-path", - type=str, - required=True, - help="model_path", - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=100, - help="max_new_tokens", - ) - parser.add_argument( - "--backend", - type=str, - default="cpp", - help="python or cpp model", - ) - parser.add_argument( - "--batch-size", - type=int, - default=1, - help="number of prompts in a batch", - ) - parser.add_argument( - "--prompt", - type=str, - default="How are you", - help="input prompt", - ) - parser.add_argument( - "--tp", - type=int, - default=1, - help="total rank for tensor parallel", - ) - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="use paged cache", - ) - - parser.add_argument( - "--paged-kv-block-size", - type=int, - default=256, - help="num tokens each kv block can hold", - ) - - parser.add_argument( - "--enable-graph", - action="store_true", - help="enable graph compiling", - ) - - parser.add_argument( - "--top-k", - type=int, - default=1, - help="top k sampling", - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="top p sampling", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="sampling temperature", - ) - - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="attention backend to use: 
'default' or 'flash-attn'", - ) - - parser.add_argument( - "--kv-cache-dtype", - type=str, - default=None, - choices=["int8"], - ) - - return parser.parse_args() - def test( prompts: str | list[str], @@ -186,7 +46,7 @@ def test( distributed_config=DistConfig(tp), enable_graph_compiling=enable_graph, attention_backend=attn_backend, - kv_cache_dtype=args.kv_cache_dtype, + kv_cache_dtype=cfg.kv_cache_dtype, ) # ---------------------------------------------------------------------------- # # Load Weights @@ -300,44 +160,26 @@ def test( if __name__ == "__main__": - args = get_args() - print(args) + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) - # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.qy: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - elif args.cambricon: - device_str = "mlu" - elif args.ali: - device_str = "cuda" - elif args.hygon: - device_str = "cuda" - else: - print( - "Usage: python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" - "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" - ) - sys.exit(1) - prompts = [args.prompt for _ in range(args.batch_size)] - _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model - model_path = args.model_path - max_new_tokens = args.max_new_tokens - backend = args.backend - tp = args.tp - enable_paged_attn = args.enable_paged_attn - enable_graph = args.enable_graph + max_new_tokens = cfg.max_new_tokens + + backend = cfg.backend + + tp = cfg.tp + + enable_paged_attn = cfg.enable_paged_attn + + enable_graph = cfg.enable_graph + if backend != "cpp": raise 
ValueError(f"Unsupported backend: {backend}.") @@ -351,8 +193,8 @@ def test( tp=tp, enable_paged_attn=enable_paged_attn, enable_graph=enable_graph, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, - attn_backend=args.attn, - ) + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, + attn_backend=cfg.attn, + ) \ No newline at end of file diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py new file mode 100644 index 00000000..9f15433c --- /dev/null +++ b/examples/jiuge_fix.py @@ -0,0 +1,200 @@ +import infinicore +import transformers +from transformers import AutoTokenizer +from tokenizers import decoders as _dec +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.distributed import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +import argparse +import sys +import time +import os +import numpy as np +from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig +from packaging import version +from infinilm.base_config import BaseConfig + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + +_PAGED_KV_BLOCK_SIZE = 256 + + + +def test( + prompts: str | list[str], + model_path, + max_new_tokens=100, + infini_device=infinicore.device("cpu", 0), + tp=1, + enable_paged_attn=False, + enable_graph=False, + top_k=1, + top_p=1.0, + temperature=1.0, + attn_backend="default", +): + model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # + # Create Model + # ---------------------------------------------------------------------------- # + if enable_paged_attn and attn_backend == "default": + attn_backend = "paged-attn" + + model = InferEngine( + model_path, + device=infini_device, + distributed_config=DistConfig(tp), + enable_graph_compiling=enable_graph, + attention_backend=attn_backend, + kv_cache_dtype=cfg.kv_cache_dtype, + ) + # 
---------------------------------------------------------------------------- # + # Load Weights + # ---------------------------------------------------------------------------- # + load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) + + # ---------------------------------------------------------------------------- # + # create tokenizer + # ---------------------------------------------------------------------------- # + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + if "llama" == model.config.model_type: + backend = getattr(tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) + norm = getattr(target, "normalizer", None) + dec = getattr(target, "decoder", None) + sn = repr(norm)[:800] if norm is not None else "" + sd = repr(dec)[:800] if dec is not None else "" + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + # ---------------------------------------------------------------------------- # + # tokenize + # ---------------------------------------------------------------------------- # + # prompt = "山东最高的山是?" + if isinstance(prompts, str): + prompts = [prompts] + input_contents = [ + tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + + # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ + # "input_ids" + # ] # List: [[1, 1128, 526, 366, 29892]] + if version.parse(transformers.__version__) < version.parse("5.0.0"): + # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. 
+ input_ids_list = [ + tokenizer.encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + else: + input_ids_list = [ + tokenizer._encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + + # ---------------------------------------------------------------------------- # + # Create KVCache + # ---------------------------------------------------------------------------- # + if enable_paged_attn: + batch_size = 1 if prompts is str else len(prompts) + max_total_tokens = max_new_tokens + len(input_ids_list[0]) + cache_config = PagedKVCacheConfig( + num_blocks=( + (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE + ) + * batch_size, + block_size=_PAGED_KV_BLOCK_SIZE, + ) + else: + batch_size = 1 if prompts is str else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) + cache_config = StaticKVCacheConfig( + max_batch_size=batch_size, max_cache_len=initial_capacity + ) + + model.reset_cache(cache_config) + + # ---------------------------------------------------------------------------- # + # Generate + # ---------------------------------------------------------------------------- # + print(input_contents[0], end="", flush=True) + input_ids_infini = infinicore.from_list(input_ids_list) + + t1 = time.time() + print("=================== start generate ====================") + output_ids = model.generate( + input_ids_infini, + GenerationConfig( + max_new_tokens=max_new_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ), + _measure_and_log_time=True, + ) + t2 = time.time() + + numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) + print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) + + print( + f"total_time: {round((t2 - t1) * 1000, 2)} ms", + ) + + +if __name__ == "__main__": + cfg = BaseConfig() + + device_str = 
cfg.get_device_str(cfg.device) + + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model + + max_new_tokens = cfg.max_new_tokens + + backend = cfg.backend + + tp = cfg.tp + + enable_paged_attn = cfg.enable_paged_attn + + enable_graph = cfg.enable_graph + + if backend != "cpp": + raise ValueError(f"Unsupported backend: {backend}.") + + infini_device = infinicore.device(device_str, 0) + + test( + prompts, + model_path, + max_new_tokens, + infini_device=infini_device, + tp=tp, + enable_paged_attn=enable_paged_attn, + enable_graph=enable_graph, + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, + attn_backend=cfg.attn, + ) \ No newline at end of file diff --git a/examples/llama.py b/examples/llama.py index aa890ca9..413afa13 100644 --- a/examples/llama.py +++ b/examples/llama.py @@ -7,70 +7,70 @@ import sys import time import os - +from infinilm.base_config import BaseConfig sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--model_path", - type=str, - required=True, - help="model_path", - ) - parser.add_argument( - "--max_new_tokens", - type=int, - default=100, - help="max_new_tokens", - ) - parser.add_argument( - "--backend", - type=str, - default="python", - help="python or cpp model", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="number of prompts in a batch", - ) - 
parser.add_argument( - "--prompt", - type=str, - default="How are you", - help="input prompt", - ) - - return parser.parse_args() +# def get_args(): +# parser = argparse.ArgumentParser(description="run Llama args") + +# parser.add_argument( +# "--cpu", +# action="store_true", +# help="Run cpu test", +# ) +# parser.add_argument( +# "--nvidia", +# action="store_true", +# help="Run nvidia test", +# ) +# parser.add_argument( +# "--metax", +# action="store_true", +# help="Run metax test", +# ) +# parser.add_argument( +# "--moore", +# action="store_true", +# help="Run moore test", +# ) +# parser.add_argument( +# "--iluvatar", +# action="store_true", +# help="Run iluvatar test", +# ) +# parser.add_argument( +# "--model_path", +# type=str, +# required=True, +# help="model_path", +# ) +# parser.add_argument( +# "--max_new_tokens", +# type=int, +# default=100, +# help="max_new_tokens", +# ) +# parser.add_argument( +# "--backend", +# type=str, +# default="python", +# help="python or cpp model", +# ) +# parser.add_argument( +# "--batch_size", +# type=int, +# default=1, +# help="number of prompts in a batch", +# ) +# parser.add_argument( +# "--prompt", +# type=str, +# default="How are you", +# help="input prompt", +# ) + +# return parser.parse_args() def test( @@ -163,32 +163,33 @@ def test( if __name__ == "__main__": - args = get_args() - print(args) + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - else: - print( - "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=\n" - "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" - ) - sys.exit(1) - prompts = [args.prompt for _ in range(args.batch_size)] - - model_path = 
args.model_path - max_new_tokens = args.max_new_tokens - backend = args.backend + # device_str = "cpu" + # if args.cpu: + # device_str = "cpu" + # elif args.nvidia: + # device_str = "cuda" + # elif args.metax: + # device_str = "cuda" + # elif args.moore: + # device_str = "musa" + # elif args.iluvatar: + # device_str = "cuda" + # else: + # print( + # "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=\n" + # "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" + # ) + # sys.exit(1) + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + model_path = cfg.model + max_new_tokens = cfg.max_new_tokens + backend = cfg.backend if backend != "python": raise ValueError(f"Unsupported backend: {backend}.") diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index 5e1a8bd2..d11d9a8c 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -1,8 +1,8 @@ import argparse import sys import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../scripts")) -from libinfinicore_infer import DeviceType +# sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../scripts")) +# from libinfinicore_infemport DeviceType class BaseConfig: @@ -16,8 +16,7 @@ def __init__(self): self.model = self.args.model - self.device_name = self.args.device - self.device_type = self._get_device_type(self.args.device) + self.device = self.args.device self.tp = self.args.tp @@ -26,27 +25,34 @@ def __init__(self): self.cache_type = self.args.cache_type self.enable_paged_attn = self.args.enable_paged_attn self.paged_kv_block_size = self.args.paged_kv_block_size + self.num_blocks = self.args.num_blocks + self.block_size = self.args.block_size + self.max_cache_len = self.args.max_cache_len self.kv_cache_dtype = self.args.kv_cache_dtype self.skip_load = self.args.skip_load self.batch_size = self.args.batch_size + self.max_batch = self.args.max_batch + 
self.max_batch_size = self.args.max_batch_size self.input_len = self.args.input_len self.output_len = self.args.output_len self.max_new_tokens = self.args.max_new_tokens + self.max_tokens = self.args.max_tokens + self.prompt = self.args.prompt self.top_k = self.args.top_k self.top_p = self.args.top_p self.temperature = self.args.temperature - self.warm_up = self.args.warm_up + self.warmup = self.args.warmup self.verbose = self.args.verbose - self.log_evel = self.args.log_evel + self.log_level = self.args.log_level # Evaluation parameters self.bench = self.args.bench self.backend = self.args.backend - self.ndev = self.args.ndev + self.tp = self.args.tp self.subject = self.args.subject self.split = self.args.split self.num_samples = self.args.num_samples @@ -57,6 +63,13 @@ def __init__(self): # Quantization parameters self.awq = self.args.awq self.gptq = self.args.gptq + self.dtype = self.args.dtype + + + # Server parameters + self.host = self.args.host + self.port = self.args.port + self.endpoint = self.args.endpoint def _add_common_args(self): # --- base configuration --- @@ -71,29 +84,36 @@ def _add_common_args(self): self.parser.add_argument("--cache-type", type=str, default="paged", choices=["paged", "static"]) self.parser.add_argument("--enable-paged-attn", action="store_true", help="use paged cache",) self.parser.add_argument("--paged-kv-block-size", type=int, default=256) + self.parser.add_argument("--num-blocks", type=int, default=512, help="number of KV cache blocks") + self.parser.add_argument("--block-size", type=int, default=256, help="size of each KV cache block") + self.parser.add_argument("--max-cache-len", type=int, default=4096, help="maximum cache length") self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type") - self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") + self.parser.add_argument("--skip-load", action="store_false", help="skip loading 
model weights") # --- Length and infer parameters --- self.parser.add_argument("--batch-size", type=int, default=1) + self.parser.add_argument("--max-batch", type=int, default=3, help="maximum batch size") + self.parser.add_argument("--max-batch-size", type=int, default=8, help="maximum batch size for server") self.parser.add_argument("--input-len", type=int, default=10, help="input sequence length") self.parser.add_argument("--output-len", type=int, default=20, help="output sequence length") self.parser.add_argument("--max-new-tokens", type=int, default=500, help="maximum number of new tokens to generate") + self.parser.add_argument("--max-tokens", type=int, default=512, help="maximum tokens") + self.parser.add_argument("--prompt", type=str, default="How are you", help="default prompt text") self.parser.add_argument("--top-k", type=int, default=1) self.parser.add_argument("--top-p", type=float, default=1.0) self.parser.add_argument("--temperature", type=float, default=1.0) # --- debug --- - self.parser.add_argument("--warmup", action="store_true") - self.parser.add_argument("--verbose", action="store_true") - self.parser.add_argument("--log-evel", type=str, default="INFO") + self.parser.add_argument("--warmup", action="store_false") + self.parser.add_argument("--verbose", action="store_false") + self.parser.add_argument("--log-level", type=str, default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="logging level") # --- Evaluation parameters --- self.parser.add_argument("--bench", type=str, default=None, choices=["ceval", "mmlu"], help="benchmark to evaluate") self.parser.add_argument("--backend", type=str, default="cpp", choices=["python", "cpp", "torch", "vllm"], help="backend type") - self.parser.add_argument("--ndev", type=int, default=1, help="number of devices for tensor parallelism") + self.parser.add_argument("--subject", type=str, default="all", help="subject(s) to evaluate, comma-separated or 'all'") 
self.parser.add_argument("--split", type=str, default="test", choices=["test", "val", "all"], help="dataset split to use") self.parser.add_argument("--num-samples", type=int, default=None, help="number of samples to evaluate per subject") @@ -102,27 +122,51 @@ def _add_common_args(self): # --- Quantization parameters --- - self.parser.add_argument("--awq", action="store_true", help="use AWQ quantization") - self.parser.add_argument("--gptq", action="store_true", help="use GPTQ quantization") - - - def _get_device_type(self, dev_str): - """Convert device string to DeviceType enum""" - DEVICE_TYPE_MAP = { - "cpu": DeviceType.DEVICE_TYPE_CPU, - "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, - "qy": DeviceType.DEVICE_TYPE_QY, - "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, - "ascend": DeviceType.DEVICE_TYPE_ASCEND, - "metax": DeviceType.DEVICE_TYPE_METAX, - "moore": DeviceType.DEVICE_TYPE_MOORE, - "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, - "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, - "hygon": DeviceType.DEVICE_TYPE_HYGON, - "ali": DeviceType.DEVICE_TYPE_ALI + self.parser.add_argument("--awq", action="store_false", help="use AWQ quantization") + self.parser.add_argument("--gptq", action="store_false", help="use GPTQ quantization") + self.parser.add_argument("--dtype", type=str, default="float16", help="data type for model") + + + # --- Server parameters --- + self.parser.add_argument("--host", type=str, default="0.0.0.0", help="server host") + self.parser.add_argument("--port", type=int, default=8000, help="server port") + self.parser.add_argument("--endpoint", type=str, default="/completions", help="API endpoint") + + + def get_device_str(self, device): + """Convert device name to backend string (cuda/cpu/musa/mlu)""" + DEVICE_STR_MAP = { + "cpu": "cpu", + "nvidia": "cuda", + "qy": "cuda", + "cambricon": "mlu", + "ascend": "ascend", # 假设华为昇腾后端为"ascend" + "metax": "cuda", + "moore": "musa", + "iluvatar": "cuda", + "kunlun": "kunlun", # 假设昆仑芯后端为"kunlun" + "hygon": "cuda", + 
"ali": "cuda" } - return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) + return DEVICE_STR_MAP.get(device.lower(), "cpu") + + # def _get_device_type(self, dev_str): + # """Convert device string to DeviceType enum""" + # DEVICE_TYPE_MAP = { + # "cpu": DeviceType.DEVICE_TYPE_CPU, + # "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + # "qy": DeviceType.DEVICE_TYPE_QY, + # "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + # "ascend": DeviceType.DEVICE_TYPE_ASCEND, + # "metax": DeviceType.DEVICE_TYPE_METAX, + # "moore": DeviceType.DEVICE_TYPE_MOORE, + # "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + # "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, + # "hygon": DeviceType.DEVICE_TYPE_HYGON, + # "ali": DeviceType.DEVICE_TYPE_ALI + # } + # return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) def __repr__(self): """String representation of configuration""" - return f"BaseConfig(model='{self.model}', device='{self.device_name}', tp={self.tp})" + return f"BaseConfig(model='{self.model}', device='{self.device}', tp={self.tp})" From 41810e421f2863ac57369c25cab7e786ae78c705 Mon Sep 17 00:00:00 2001 From: MoringLotus <151500590+MoringLotus@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:08:24 +0800 Subject: [PATCH 04/12] Delete examples/jiuge_fix.py --- examples/jiuge_fix.py | 200 ------------------------------------------ 1 file changed, 200 deletions(-) delete mode 100644 examples/jiuge_fix.py diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py deleted file mode 100644 index 9f15433c..00000000 --- a/examples/jiuge_fix.py +++ /dev/null @@ -1,200 +0,0 @@ -import infinicore -import transformers -from transformers import AutoTokenizer -from tokenizers import decoders as _dec -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine -import argparse -import sys -import time -import os -import numpy as np -from infinilm.cache import 
StaticKVCacheConfig, PagedKVCacheConfig -from packaging import version -from infinilm.base_config import BaseConfig - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - -_PAGED_KV_BLOCK_SIZE = 256 - - - -def test( - prompts: str | list[str], - model_path, - max_new_tokens=100, - infini_device=infinicore.device("cpu", 0), - tp=1, - enable_paged_attn=False, - enable_graph=False, - top_k=1, - top_p=1.0, - temperature=1.0, - attn_backend="default", -): - model_path = os.path.expanduser(model_path) - # ---------------------------------------------------------------------------- # - # Create Model - # ---------------------------------------------------------------------------- # - if enable_paged_attn and attn_backend == "default": - attn_backend = "paged-attn" - - model = InferEngine( - model_path, - device=infini_device, - distributed_config=DistConfig(tp), - enable_graph_compiling=enable_graph, - attention_backend=attn_backend, - kv_cache_dtype=cfg.kv_cache_dtype, - ) - # ---------------------------------------------------------------------------- # - # Load Weights - # ---------------------------------------------------------------------------- # - load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) - - # ---------------------------------------------------------------------------- # - # create tokenizer - # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if "llama" == model.config.model_type: - backend = getattr(tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - 
[ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) - - # ---------------------------------------------------------------------------- # - # tokenize - # ---------------------------------------------------------------------------- # - # prompt = "山东最高的山是?" - if isinstance(prompts, str): - prompts = [prompts] - input_contents = [ - tokenizer.apply_chat_template( - conversation=[{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - for prompt in prompts - ] - - # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - # "input_ids" - # ] # List: [[1, 1128, 526, 366, 29892]] - if version.parse(transformers.__version__) < version.parse("5.0.0"): - # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. - input_ids_list = [ - tokenizer.encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - else: - input_ids_list = [ - tokenizer._encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - - # ---------------------------------------------------------------------------- # - # Create KVCache - # ---------------------------------------------------------------------------- # - if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) - max_total_tokens = max_new_tokens + len(input_ids_list[0]) - cache_config = PagedKVCacheConfig( - num_blocks=( - (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE - ) - * batch_size, - block_size=_PAGED_KV_BLOCK_SIZE, - ) - else: - batch_size = 1 if prompts is str else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) - cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=initial_capacity - ) - - 
model.reset_cache(cache_config) - - # ---------------------------------------------------------------------------- # - # Generate - # ---------------------------------------------------------------------------- # - print(input_contents[0], end="", flush=True) - input_ids_infini = infinicore.from_list(input_ids_list) - - t1 = time.time() - print("=================== start generate ====================") - output_ids = model.generate( - input_ids_infini, - GenerationConfig( - max_new_tokens=max_new_tokens, - temperature=temperature, - top_k=top_k, - top_p=top_p, - ), - _measure_and_log_time=True, - ) - t2 = time.time() - - numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) - print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) - - print( - f"total_time: {round((t2 - t1) * 1000, 2)} ms", - ) - - -if __name__ == "__main__": - cfg = BaseConfig() - - device_str = cfg.get_device_str(cfg.device) - - prompts = [cfg.prompt for _ in range(cfg.batch_size)] - - _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size - - model_path = cfg.model - - max_new_tokens = cfg.max_new_tokens - - backend = cfg.backend - - tp = cfg.tp - - enable_paged_attn = cfg.enable_paged_attn - - enable_graph = cfg.enable_graph - - if backend != "cpp": - raise ValueError(f"Unsupported backend: {backend}.") - - infini_device = infinicore.device(device_str, 0) - - test( - prompts, - model_path, - max_new_tokens, - infini_device=infini_device, - tp=tp, - enable_paged_attn=enable_paged_attn, - enable_graph=enable_graph, - top_k=cfg.top_k, - top_p=cfg.top_p, - temperature=cfg.temperature, - attn_backend=cfg.attn, - ) \ No newline at end of file From f264e25a54b061aca75703f4574fa65390c10c35 Mon Sep 17 00:00:00 2001 From: MoringLotus <151500590+MoringLotus@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:17:35 +0800 Subject: [PATCH 05/12] Delete jiuge.sh --- jiuge.sh | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) delete mode 100644 
jiuge.sh diff --git a/jiuge.sh b/jiuge.sh deleted file mode 100644 index e7ddb2dd..00000000 --- a/jiuge.sh +++ /dev/null @@ -1,46 +0,0 @@ -featurize@featurize:~/work/InfiniLM$ cat jiuge.sh -#!/bin/bash - -# Jiuge模型运行脚本 -# 使用NVIDIA显卡运行9G4B模型 - -set -e # 遇到错误立即退出 - -echo "==========================================" -echo "🚀 启动 Jiuge 模型 (9G4B) - NVIDIA版本" -echo "==========================================" -export INFINI_ROOT=/home/featurize/.infini -export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH -# 设置参数 -MODEL_DIR="/home/featurize/work/InfiniFamily/9G4B" -DEVICE="--nvidia" -N_DEVICE=1 -SCRIPT_PATH="python scripts/jiuge.py" - -# 检查模型目录是否存在 -if [ ! -d "$MODEL_DIR" ]; then - echo "❌ 错误: 模型目录不存在: $MODEL_DIR" - echo "请检查路径是否正确" - exit 1 -fi - -# 检查Python脚本是否存在 -if [ ! -f "scripts/jiuge.py" ]; then - echo "❌ 错误: 未找到jiuge.py脚本: scripts/jiuge.py" - echo "请确保在当前目录下运行此脚本" - exit 1 -fi - -echo "📁 模型路径: $MODEL_DIR" -echo "🎯 设备类型: NVIDIA GPU" -echo "💻 设备数量: $N_DEVICE" -echo "" - -# 运行模型 -echo "🔄 启动模型..." 
-$SCRIPT_PATH $DEVICE $MODEL_DIR $N_DEVICE - -echo "" -echo "==========================================" -echo "✅ 模型运行完成" -echo "=========================================="ß \ No newline at end of file From 02e3350052174ce3c484300ff2c6cdf5132cc54a Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 07:07:45 +0000 Subject: [PATCH 06/12] examples script fix --- README.md | 182 ---------------------------- examples/bench.py | 213 ++++----------------------------- python/infinilm/base_config.py | 2 +- 3 files changed, 24 insertions(+), 373 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index afc242b2..00000000 --- a/README.md +++ /dev/null @@ -1,182 +0,0 @@ -# InfiniLM - -![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) - -本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 - -## 使用方式 - -- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 - -- 编译并安装 `InfiniLM` - -```bash - -``` - -- 运行模型推理测试 - -```bash -python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device] -``` - -- 部署模型推理服务 - -```bash -python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -- 测试模型推理服务性能 - -```bash -python scripts/test_perf.py -``` - -- 使用推理服务测试模型困惑度(Perplexity) - -```bash -python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -## 使用方式(新版) -#### 一、编译并安装 `InfiniCore` -编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) : - -- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`) -- 根据硬件平台,选择 xmake 构建配置 -- 编译安装InfiniCore -- 安装 C++ 库 -- 安装 Python 包 - - -#### 二、编译并安装 `InfiniLM` - - 克隆项目 - - 由于仓库中含有子模块,所以在克隆时请添加 
`--recursive` 或 `--recurse-submodules`,如: - - ```shell - git clone --recursive https://github.com/InfiniTensor/InfiniLM.git - ``` - - 或者在普通克隆后进行更新: - - ```shell - git submodule update --init --recursive - ``` - - - - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用 - ```bash - xmake f --use-kv-caching= [true | false] -cv - ``` - - - - 安装 InfiniLM Python 包 - ```bash - pip install -e . - ``` - - - 单次推理测试 - - llama示例 - ```bash - python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path= - ``` - - 例如: - ```bash - python examples/jigue.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0 - ``` - - 分布式推理测试 - - 9g示例 - ```bash - python examples/jiuge.py [---nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH - ``` - - - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式 - ```bash - python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16 - ``` - - - - 推理服务测试 - - 启动推理服务 - ```bash - python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT - ``` - - - 单卡示例: - ```bash - CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 多卡分布式示例: - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 测试推理服务性能: - ```bash - python scripts/test_perf.py --verbose - ``` - - - 运行推理基准测试(C-Eval/MMLU) - - ```bash - python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon 
| --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH] - ``` - - - 参数说明: - - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目) - - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果 - - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 `~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 - - - C-Eval示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv - ``` - - 全部科目并输出CSV: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - MMLU示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 
`cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - 试验中功能 - - Warm Up - ```bash - python examples/bench.py --nvidia --model= --warmup - ``` - - Paged Attention - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn - ``` - - CUDA Graph - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph - ``` - - 选择attention后端 (使用flash attention后端需要先在InfiniCore完成相关配置和编译) - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] - ``` diff --git a/examples/bench.py b/examples/bench.py index a52c44ec..6fc27498 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -3,6 +3,7 @@ from infinilm.modeling_utils import load_model_state_dict_by_file from infinilm.distributed import DistConfig from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.base_config import BaseConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig import argparse import sys @@ -125,150 +126,6 @@ def get_test_cases( return case_dict - -def get_args(): - parser = argparse.ArgumentParser(description="run Llama args") - - parser.add_argument( - "--cpu", - action="store_true", - help="Run cpu test", - ) - parser.add_argument( - "--nvidia", - action="store_true", - help="Run nvidia test", - ) - parser.add_argument( - "--qy", - action="store_true", - help="Run qy test", - ) - parser.add_argument( - "--metax", - action="store_true", - help="Run metax test", - ) - parser.add_argument( - "--moore", - action="store_true", - help="Run moore test", - ) - parser.add_argument( - "--iluvatar", - action="store_true", - help="Run iluvatar test", - ) - parser.add_argument( - "--cambricon", - action="store_true", - help="Run cambricon test", - ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) - parser.add_argument( - "--model", - type=str, - required=True, - 
help="model path", - ) - parser.add_argument( - "--batch-size", - type=parse_list, - default=1, - help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')", - ) - parser.add_argument( - "--tensor-parallel-size", - "--tp", - type=int, - default=1, - help="total rank for tensor parallel", - ) - parser.add_argument( - "--input-len", - type=parse_list, - default=10, - help="output tokens", - ) - - parser.add_argument( - "--output-len", - type=parse_list, - default=20, - help="output tokens", - ) - parser.add_argument( - "--skip-load", - action="store_true", - help="skip loading model weights", - ) - parser.add_argument( - "--top-k", - type=int, - default=1, - help="top k sampling", - ) - - parser.add_argument( - "--top-p", - type=float, - default=1.0, - help="top p sampling", - ) - - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="sampling temperature", - ) - parser.add_argument( - "--enable-paged-attn", - action="store_true", - help="use paged cache", - ) - parser.add_argument( - "--paged-kv-block-size", - type=int, - default=256, - help="num tokens each kv block can hold", - ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="enable graph compiling", - ) - parser.add_argument( - "--warmup", - action="store_true", - help="Perform a warmup run before benchmarking/inference.", - ) - parser.add_argument( - "--attn", - type=str, - default="default", - choices=["default", "paged-attn", "flash-attn"], - help="attention backend to use: 'default' or 'flash-attn'", - ) - parser.add_argument( - "--kv-cache-dtype", - type=str, - default=None, - choices=["int8"], - ) - - return parser.parse_args() - - with open("examples/bench_prompt.md", "r") as f: prompt = f.read() @@ -305,7 +162,7 @@ def __init__( cache_config=cache_config, enable_graph_compiling=enable_graph, attention_backend=attn_backend, - kv_cache_dtype=args.kv_cache_dtype, + kv_cache_dtype=cfg.kv_cache_dtype, ) # 
---------------------------------------------------------------------------- # @@ -396,52 +253,28 @@ def run( if __name__ == "__main__": - args = get_args() - print(args) - - # Parse command line arguments - device_str = "cpu" - if args.cpu: - device_str = "cpu" - elif args.nvidia: - device_str = "cuda" - elif args.qy: - device_str = "cuda" - elif args.metax: - device_str = "cuda" - elif args.moore: - device_str = "musa" - elif args.iluvatar: - device_str = "cuda" - elif args.cambricon: - device_str = "mlu" - elif args.ali: - device_str = "cuda" - elif args.hygon: - device_str = "cuda" - else: - print( - "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50" - ) - sys.exit(1) - _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size + cfg = BaseConfig() + + device_str = cfg.get_device_str(cfg.device) + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size # -------------------------------------------------------- # # 解析参数 # -------------------------------------------------------- # - model_path = args.model + model_path = cfg.model infini_device = infinicore.device(device_str, 0) - tp = args.tensor_parallel_size + tp = cfg.tp - skip_load = args.skip_load + skip_load = cfg.skip_load - batch_size = args.batch_size - input_len = args.input_len - output_len = args.output_len - enable_paged_attn = args.enable_paged_attn - enable_graph = args.enable_graph - attn_backend = args.attn + batch_size = cfg.batch_size + input_len = cfg.input_len + output_len = cfg.output_len + enable_paged_attn = cfg.enable_paged_attn + enable_graph = cfg.enable_graph + attn_backend = cfg.attn if isinstance(batch_size, int): batch_size = [batch_size] @@ -488,7 +321,7 @@ def run( # ---------------------------------------------------------------------------- # # Warmup # ---------------------------------------------------------------------------- # - if args.warmup: + if cfg.warmup: warmup_steps = 1 # warmup cache capacity @@ -518,9 +351,9 
@@ def run( input_ids_infini, GenerationConfig( max_new_tokens=5, # decode kernel warmup - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, + temperature=cfg.temperature, + top_k=cfg.top_k, + top_p=cfg.top_p, stop_on_eos=False, ), _measure_and_log_time=False, @@ -557,7 +390,7 @@ def run( batch_size=batch_size, input_len=input_len, output_len=output_len, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, ) diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index d11d9a8c..0b19700a 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -88,7 +88,7 @@ def _add_common_args(self): self.parser.add_argument("--block-size", type=int, default=256, help="size of each KV cache block") self.parser.add_argument("--max-cache-len", type=int, default=4096, help="maximum cache length") self.parser.add_argument("--kv-cache-dtype", type=str, default=None, choices=["int8"], help="KV cache data type") - self.parser.add_argument("--skip-load", action="store_false", help="skip loading model weights") + self.parser.add_argument("--skip-load", action="store_true", help="skip loading model weights") # --- Length and infer parameters --- From 0920ce835100a77a6b0ffed4a43a23fc00329bc6 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 07:27:56 +0000 Subject: [PATCH 07/12] solve merge problem --- examples/jiuge_fix.py | 200 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 examples/jiuge_fix.py diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py new file mode 100644 index 00000000..9f15433c --- /dev/null +++ b/examples/jiuge_fix.py @@ -0,0 +1,200 @@ +import infinicore +import transformers +from transformers import AutoTokenizer +from tokenizers import decoders as _dec +from infinilm.modeling_utils import load_model_state_dict_by_file +from infinilm.distributed 
import DistConfig +from infinilm.infer_engine import GenerationConfig, InferEngine +import argparse +import sys +import time +import os +import numpy as np +from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig +from packaging import version +from infinilm.base_config import BaseConfig + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + +_PAGED_KV_BLOCK_SIZE = 256 + + + +def test( + prompts: str | list[str], + model_path, + max_new_tokens=100, + infini_device=infinicore.device("cpu", 0), + tp=1, + enable_paged_attn=False, + enable_graph=False, + top_k=1, + top_p=1.0, + temperature=1.0, + attn_backend="default", +): + model_path = os.path.expanduser(model_path) + # ---------------------------------------------------------------------------- # + # Create Model + # ---------------------------------------------------------------------------- # + if enable_paged_attn and attn_backend == "default": + attn_backend = "paged-attn" + + model = InferEngine( + model_path, + device=infini_device, + distributed_config=DistConfig(tp), + enable_graph_compiling=enable_graph, + attention_backend=attn_backend, + kv_cache_dtype=cfg.kv_cache_dtype, + ) + # ---------------------------------------------------------------------------- # + # Load Weights + # ---------------------------------------------------------------------------- # + load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) + + # ---------------------------------------------------------------------------- # + # create tokenizer + # ---------------------------------------------------------------------------- # + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + if "llama" == model.config.model_type: + backend = getattr(tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) + norm = getattr(target, "normalizer", None) + dec = getattr(target, "decoder", None) + sn = repr(norm)[:800] if norm is not None 
else "" + sd = repr(dec)[:800] if dec is not None else "" + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + # ---------------------------------------------------------------------------- # + # tokenize + # ---------------------------------------------------------------------------- # + # prompt = "山东最高的山是?" + if isinstance(prompts, str): + prompts = [prompts] + input_contents = [ + tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + for prompt in prompts + ] + + # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ + # "input_ids" + # ] # List: [[1, 1128, 526, 366, 29892]] + if version.parse(transformers.__version__) < version.parse("5.0.0"): + # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. 
+ input_ids_list = [ + tokenizer.encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + else: + input_ids_list = [ + tokenizer._encode_plus( + text, truncation=True, max_length=2048, add_special_tokens=True + )["input_ids"] + for text in input_contents + ] + + # ---------------------------------------------------------------------------- # + # Create KVCache + # ---------------------------------------------------------------------------- # + if enable_paged_attn: + batch_size = 1 if prompts is str else len(prompts) + max_total_tokens = max_new_tokens + len(input_ids_list[0]) + cache_config = PagedKVCacheConfig( + num_blocks=( + (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE + ) + * batch_size, + block_size=_PAGED_KV_BLOCK_SIZE, + ) + else: + batch_size = 1 if prompts is str else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) + cache_config = StaticKVCacheConfig( + max_batch_size=batch_size, max_cache_len=initial_capacity + ) + + model.reset_cache(cache_config) + + # ---------------------------------------------------------------------------- # + # Generate + # ---------------------------------------------------------------------------- # + print(input_contents[0], end="", flush=True) + input_ids_infini = infinicore.from_list(input_ids_list) + + t1 = time.time() + print("=================== start generate ====================") + output_ids = model.generate( + input_ids_infini, + GenerationConfig( + max_new_tokens=max_new_tokens, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ), + _measure_and_log_time=True, + ) + t2 = time.time() + + numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) + print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) + + print( + f"total_time: {round((t2 - t1) * 1000, 2)} ms", + ) + + +if __name__ == "__main__": + cfg = BaseConfig() + + device_str = 
cfg.get_device_str(cfg.device) + + prompts = [cfg.prompt for _ in range(cfg.batch_size)] + + _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size + + model_path = cfg.model + + max_new_tokens = cfg.max_new_tokens + + backend = cfg.backend + + tp = cfg.tp + + enable_paged_attn = cfg.enable_paged_attn + + enable_graph = cfg.enable_graph + + if backend != "cpp": + raise ValueError(f"Unsupported backend: {backend}.") + + infini_device = infinicore.device(device_str, 0) + + test( + prompts, + model_path, + max_new_tokens, + infini_device=infini_device, + tp=tp, + enable_paged_attn=enable_paged_attn, + enable_graph=enable_graph, + top_k=cfg.top_k, + top_p=cfg.top_p, + temperature=cfg.temperature, + attn_backend=cfg.attn, + ) \ No newline at end of file From 18615afef4a91e2d367fc1924058e66512d08fcf Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 07:57:22 +0000 Subject: [PATCH 08/12] fix some divergence --- .gitignore | 1 - README.md | 182 ++++++++++++++++++++++++++++++++++++++ examples/jiuge_fix.py | 200 ------------------------------------------ 3 files changed, 182 insertions(+), 201 deletions(-) create mode 100644 README.md delete mode 100644 examples/jiuge_fix.py diff --git a/.gitignore b/.gitignore index 086071a5..7c079758 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,6 @@ model_weight/ jiuge_infer.sh jiuge.sh -README.md # MacOS Cache .DS_Store diff --git a/README.md b/README.md new file mode 100644 index 00000000..3a455a34 --- /dev/null +++ b/README.md @@ -0,0 +1,182 @@ +# InfiniLM + +![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) + +本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 + +## 使用方式 + +- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 + +- 编译并安装 `InfiniLM` + +```bash +xmake && xmake install +``` + +- 运行模型推理测试 + +```bash +python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir 
[n_device] +``` + +- 部署模型推理服务 + +```bash +python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] +``` + +- 测试模型推理服务性能 + +```bash +python scripts/test_perf.py +``` + +- 使用推理服务测试模型困惑度(Perplexity) + +```bash +python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] +``` + +## 使用方式(新版) +#### 一、编译并安装 `InfiniCore` +编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) : + +- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`) +- 根据硬件平台,选择 xmake 构建配置 +- 编译安装InfiniCore +- 安装 C++ 库 +- 安装 Python 包 + + +#### 二、编译并安装 `InfiniLM` + - 克隆项目 + + 由于仓库中含有子模块,所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`,如: + + ```shell + git clone --recursive https://github.com/InfiniTensor/InfiniLM.git + ``` + + 或者在普通克隆后进行更新: + + ```shell + git submodule update --init --recursive + ``` + + + - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用 + ```bash + xmake f --use-kv-caching= [true | false] -cv + ``` + + + - 安装 InfiniLM Python 包 + ```bash + pip install -e . 
+    ```
+
+  - 单次推理测试
+    - llama示例
+    ```bash
+    python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=
+    ```
+    - 例如:
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
+    ```
+  - 分布式推理测试
+    - 9g示例
+    ```bash
+    python examples/jiuge.py [--nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
+    ```
+
+    - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16
+    ```
+
+
+  - 推理服务测试
+    - 启动推理服务
+    ```bash
+    python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT
+    ```
+
+    - 单卡示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1
+    ```
+
+    - 多卡分布式示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1
+    ```
+
+    - 测试推理服务性能:
+    ```bash
+    python scripts/test_perf.py --verbose
+    ```
+
+    - 运行推理基准测试(C-Eval/MMLU)
+
+    ```bash
+    python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
+    ```
+
+    - 参数说明:
+      - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目)
+      - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果
+      - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 
`~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 + + - C-Eval示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv + ``` + - 全部科目并输出CSV: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - MMLU示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - 试验中功能 + - Warm Up + ```bash + python examples/bench.py --nvidia --model= --warmup + ``` + - Paged Attention + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn + ``` + - CUDA Graph + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph + ``` + - 选择attention后端 (使用flash 
attention后端需要先在InfiniCore完成相关配置和编译) + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] + ``` \ No newline at end of file diff --git a/examples/jiuge_fix.py b/examples/jiuge_fix.py deleted file mode 100644 index 9f15433c..00000000 --- a/examples/jiuge_fix.py +++ /dev/null @@ -1,200 +0,0 @@ -import infinicore -import transformers -from transformers import AutoTokenizer -from tokenizers import decoders as _dec -from infinilm.modeling_utils import load_model_state_dict_by_file -from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine -import argparse -import sys -import time -import os -import numpy as np -from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig -from packaging import version -from infinilm.base_config import BaseConfig - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - -_PAGED_KV_BLOCK_SIZE = 256 - - - -def test( - prompts: str | list[str], - model_path, - max_new_tokens=100, - infini_device=infinicore.device("cpu", 0), - tp=1, - enable_paged_attn=False, - enable_graph=False, - top_k=1, - top_p=1.0, - temperature=1.0, - attn_backend="default", -): - model_path = os.path.expanduser(model_path) - # ---------------------------------------------------------------------------- # - # Create Model - # ---------------------------------------------------------------------------- # - if enable_paged_attn and attn_backend == "default": - attn_backend = "paged-attn" - - model = InferEngine( - model_path, - device=infini_device, - distributed_config=DistConfig(tp), - enable_graph_compiling=enable_graph, - attention_backend=attn_backend, - kv_cache_dtype=cfg.kv_cache_dtype, - ) - # ---------------------------------------------------------------------------- # - # Load Weights - # ---------------------------------------------------------------------------- # - load_model_state_dict_by_file(model, model_path, 
dtype=model.config.dtype) - - # ---------------------------------------------------------------------------- # - # create tokenizer - # ---------------------------------------------------------------------------- # - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if "llama" == model.config.model_type: - backend = getattr(tokenizer, "backend_tokenizer", None) - target = getattr(backend, "_tokenizer", backend) - norm = getattr(target, "normalizer", None) - dec = getattr(target, "decoder", None) - sn = repr(norm)[:800] if norm is not None else "" - sd = repr(dec)[:800] if dec is not None else "" - has_prepend = "Prepend" in sn - has_strip = "Strip" in sd - if has_prepend and has_strip: - target.decoder = _dec.Sequence( - [ - _dec.Replace("▁", " "), - _dec.ByteFallback(), - _dec.Fuse(), - ] - ) - - # ---------------------------------------------------------------------------- # - # tokenize - # ---------------------------------------------------------------------------- # - # prompt = "山东最高的山是?" - if isinstance(prompts, str): - prompts = [prompts] - input_contents = [ - tokenizer.apply_chat_template( - conversation=[{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - for prompt in prompts - ] - - # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - # "input_ids" - # ] # List: [[1, 1128, 526, 366, 29892]] - if version.parse(transformers.__version__) < version.parse("5.0.0"): - # Ideally this is solved by upgrading transformers. However, doing so causes version mismatch between transformers and mlu pytorch on devices with Phytium CPU. So a branch is temporarily used. 
- input_ids_list = [ - tokenizer.encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - else: - input_ids_list = [ - tokenizer._encode_plus( - text, truncation=True, max_length=2048, add_special_tokens=True - )["input_ids"] - for text in input_contents - ] - - # ---------------------------------------------------------------------------- # - # Create KVCache - # ---------------------------------------------------------------------------- # - if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) - max_total_tokens = max_new_tokens + len(input_ids_list[0]) - cache_config = PagedKVCacheConfig( - num_blocks=( - (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE - ) - * batch_size, - block_size=_PAGED_KV_BLOCK_SIZE, - ) - else: - batch_size = 1 if prompts is str else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) - cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=initial_capacity - ) - - model.reset_cache(cache_config) - - # ---------------------------------------------------------------------------- # - # Generate - # ---------------------------------------------------------------------------- # - print(input_contents[0], end="", flush=True) - input_ids_infini = infinicore.from_list(input_ids_list) - - t1 = time.time() - print("=================== start generate ====================") - output_ids = model.generate( - input_ids_infini, - GenerationConfig( - max_new_tokens=max_new_tokens, - temperature=temperature, - top_k=top_k, - top_p=top_p, - ), - _measure_and_log_time=True, - ) - t2 = time.time() - - numpy_output_ids = np.array([output_id.to_numpy()[0] for output_id in output_ids]) - print(tokenizer.decode(numpy_output_ids, skip_special_tokens=True)) - - print( - f"total_time: {round((t2 - t1) * 1000, 2)} ms", - ) - - -if __name__ == "__main__": - cfg = BaseConfig() - - device_str = 
cfg.get_device_str(cfg.device) - - prompts = [cfg.prompt for _ in range(cfg.batch_size)] - - _PAGED_KV_BLOCK_SIZE = cfg.paged_kv_block_size - - model_path = cfg.model - - max_new_tokens = cfg.max_new_tokens - - backend = cfg.backend - - tp = cfg.tp - - enable_paged_attn = cfg.enable_paged_attn - - enable_graph = cfg.enable_graph - - if backend != "cpp": - raise ValueError(f"Unsupported backend: {backend}.") - - infini_device = infinicore.device(device_str, 0) - - test( - prompts, - model_path, - max_new_tokens, - infini_device=infini_device, - tp=tp, - enable_paged_attn=enable_paged_attn, - enable_graph=enable_graph, - top_k=cfg.top_k, - top_p=cfg.top_p, - temperature=cfg.temperature, - attn_backend=cfg.attn, - ) \ No newline at end of file From 070bcd53705c8a7448fe136de42dc31717099f1c Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:10:39 +0000 Subject: [PATCH 09/12] delete redundancy --- .gitignore | 39 ------------ README.md | 182 ----------------------------------------------------- 2 files changed, 221 deletions(-) delete mode 100644 .gitignore delete mode 100644 README.md diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 7c079758..00000000 --- a/.gitignore +++ /dev/null @@ -1,39 +0,0 @@ -# Xmake cache -.xmake/ -build/ -python/infinilm/lib/*.so - -#model_weight -model_weight/ - -jiuge_infer.sh -jiuge.sh - -# MacOS Cache -.DS_Store - -# Vscode -.vscode/ - -# Python -__pycache__/ -*.egg-info/ - -# Log -*.log - -# Cache -.cache/ - -# JSON -*.json - -#GGUF -*.gguf - -# txt -*.txt - -*.http - -*.nsys-rep diff --git a/README.md b/README.md deleted file mode 100644 index 3a455a34..00000000 --- a/README.md +++ /dev/null @@ -1,182 +0,0 @@ -# InfiniLM - -![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) - -本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 - -## 使用方式 - -- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 - -- 编译并安装 `InfiniLM` - -```bash -xmake && xmake 
install -``` - -- 运行模型推理测试 - -```bash -python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device] -``` - -- 部署模型推理服务 - -```bash -python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -- 测试模型推理服务性能 - -```bash -python scripts/test_perf.py -``` - -- 使用推理服务测试模型困惑度(Perplexity) - -```bash -python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] -``` - -## 使用方式(新版) -#### 一、编译并安装 `InfiniCore` -编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) : - -- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`) -- 根据硬件平台,选择 xmake 构建配置 -- 编译安装InfiniCore -- 安装 C++ 库 -- 安装 Python 包 - - -#### 二、编译并安装 `InfiniLM` - - 克隆项目 - - 由于仓库中含有子模块,所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`,如: - - ```shell - git clone --recursive https://github.com/InfiniTensor/InfiniLM.git - ``` - - 或者在普通克隆后进行更新: - - ```shell - git submodule update --init --recursive - ``` - - - - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用 - ```bash - xmake f --use-kv-caching= [true | false] -cv - ``` - - - - 安装 InfiniLM Python 包 - ```bash - pip install -e . 
- ``` - - - 单次推理测试 - - llama示例 - ```bash - python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path= - ``` - - 例如: - ```bash - python examples/jigue.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0 - ``` - - 分布式推理测试 - - 9g示例 - ```bash - python examples/jiuge.py [---nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH - ``` - - - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式 - ```bash - python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16 - ``` - - - - 推理服务测试 - - 启动推理服务 - ```bash - python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT - ``` - - - 单卡示例: - ```bash - CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 多卡分布式示例: - ```bash - CUDA_VISIBLE_DEVICES=0,1,2,3 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1 - ``` - - - 测试推理服务性能: - ```bash - python scripts/test_perf.py --verbose - ``` - - - 运行推理基准测试(C-Eval/MMLU) - - ```bash - python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH] - ``` - - - 参数说明: - - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目) - - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果 - - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 
`~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 - - - C-Eval示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv - ``` - - 全部科目并输出CSV: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - MMLU示例: - - 单个科目: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 - ``` - - 多个科目(逗号分隔): - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv - ``` - - 使用缓存目录加速加载: - ```bash - python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ - ``` - > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 - - - 试验中功能 - - Warm Up - ```bash - python examples/bench.py --nvidia --model= --warmup - ``` - - Paged Attention - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn - ``` - - CUDA Graph - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph - ``` - - 选择attention后端 (使用flash 
attention后端需要先在InfiniCore完成相关配置和编译) - ```bash - python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] - ``` \ No newline at end of file From 91492a737a31dec751f5b9eae38fa8cd6b3397c3 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:25:13 +0000 Subject: [PATCH 10/12] fix push question --- .gitignore | 36 ++++++++++ README.md | 182 +++++++++++++++++++++++++++++++++++++++++++++++++ jiuge_infer.sh | 1 + 3 files changed, 219 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 jiuge_infer.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..11b10c29 --- /dev/null +++ b/.gitignore @@ -0,0 +1,36 @@ +# Xmake cache +.xmake/ +build/ +python/infinilm/lib/*.so + +# MacOS Cache +.DS_Store + +# Vscode +.vscode/ + + +model_weight/ + +# Python +__pycache__/ +*.egg-info/ + +# Log +*.log + +# Cache +.cache/ + +# JSON +*.json + +#GGUF +*.gguf + +# txt +*.txt + +*.http + +*.nsys-rep diff --git a/README.md b/README.md new file mode 100644 index 00000000..48448c56 --- /dev/null +++ b/README.md @@ -0,0 +1,182 @@ +# InfiniLM + +![star](https://atomgit.com/InfiniTensor/InfiniLM/star/badge.svg) + +本项目是基于 [`InfiniCore`](https://github.com/InfiniTensor/InfiniCore) 的推理引擎。 + +## 使用方式 + +- 编译并安装 `InfiniCore` 。注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)。 + +- 编译并安装 `InfiniLM` + +```bash +xmake && xmake install +``` + +- 运行模型推理测试 + +```bash +python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device] +``` + +- 部署模型推理服务 + +```bash +python scripts/launch_server.py --model-path MODEL_PATH [-h] [--dev {cpu,nvidia,qy, cambricon,ascend,metax,moore,iluvatar,kunlun,hygon}] [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS] +``` + +- 测试模型推理服务性能 + +```bash +python scripts/test_perf.py +``` + +- 使用推理服务测试模型困惑度(Perplexity) + +```bash +python scripts/test_ppl.py 
--model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
+```
+
+## 使用方式(新版)
+#### 一、编译并安装 `InfiniCore`
+编译并安装 `InfiniCore`, 详情见 InfiniCore的 [`README`](https://github.com/InfiniTensor/InfiniCore) :
+
+- 注意根据提示设置好 `INFINI_ROOT` 环境变量(默认为 `$HOME/.infini`)
+- 根据硬件平台,选择 xmake 构建配置
+- 编译安装InfiniCore
+- 安装 C++ 库
+- 安装 Python 包
+
+
+#### 二、编译并安装 `InfiniLM`
+  - 克隆项目
+
+    由于仓库中含有子模块,所以在克隆时请添加 `--recursive` 或 `--recurse-submodules`,如:
+
+    ```shell
+    git clone --recursive https://github.com/InfiniTensor/InfiniLM.git
+    ```
+
+    或者在普通克隆后进行更新:
+
+    ```shell
+    git submodule update --init --recursive
+    ```
+
+
+  - 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用
+    ```bash
+    xmake f --use-kv-caching= [true | false] -cv
+    ```
+
+
+  - 安装 InfiniLM Python 包
+    ```bash
+    pip install -e .
+    ```
+
+  - 单次推理测试
+    - llama示例
+    ```bash
+    python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=
+    ```
+    - 例如:
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
+    ```
+  - 分布式推理测试
+    - 9g示例
+    ```bash
+    python examples/jiuge.py [--nvidia] --model_path= --backend=cpp --tp=NDEV --batch_size=MAX_BATCH
+    ```
+
+    - 例如: 9G7B模型,cpp后端,batch_size为16,4卡分布式
+    ```bash
+    python examples/jiuge.py --nvidia --model_path=/models/9G7B_MHA/ --backend=cpp --tp=4 --batch_size=16
+    ```
+
+
+  - 推理服务测试
+    - 启动推理服务
+    ```bash
+    python python/infinilm/server/inference_server.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon] --model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH --tp=NDEV --temperature=TEMP --top_p=TOP_P --top_k=TOP_K --host=HOST --port=PORT
+    ```
+
+    - 单卡示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0 python python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=1 --temperature=1.0 --top_p=0.8 --top_k=1
+    ```
+
+    - 多卡分布式示例:
+    ```bash
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python 
python/infinilm/server/inference_server.py --nvidia --model_path=/models/9G7B_MHA/ --max_tokens=100 --max_batch_size=32 --tp=4 --temperature=1.0 --top_p=0.8 --top_k=1 + ``` + + - 测试推理服务性能: + ```bash + python scripts/test_perf.py --verbose + ``` + + - 运行推理基准测试(C-Eval/MMLU) + + ```bash + python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH] + ``` + + - 参数说明: + - `--subject`: 指定科目,支持单个科目、多个科目(逗号分隔)或 `all`(默认值,加载全部科目) + - `--output_csv`: 可选,指定CSV输出文件路径。如未指定则不生成CSV文件。CSV包含每个科目的结果和总体结果 + - `--cache_dir`: 可选,指定数据集缓存目录的父目录。应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录(例如 `~/.cache/huggingface/datasets/`)。设置后脚本优先使用本地 CSV(`pandas.read_csv`)离线加载数据,避免 `load_dataset` 的网络请求 + + - C-Eval示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --num_samples 100 --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics,high_school_physics --backend cpp --ndev 1 --output_csv results.csv + ``` + - 全部科目并输出CSV: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject all --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench ceval --subject middle_school_mathematics --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - MMLU示例: + - 单个科目: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 + ``` + - 多个科目(逗号分隔): + ```bash + python 
test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra,anatomy,astronomy --backend cpp --ndev 1 --output_csv results.csv + ``` + - 使用缓存目录加速加载: + ```bash + python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/ + ``` + > 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录 + + - 试验中功能 + - Warm Up + ```bash + python examples/bench.py --nvidia --model= --warmup + ``` + - Paged Attention + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn + ``` + - CUDA Graph + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn --enable-graph + ``` + - 选择attention后端 (使用flash attention后端需要先在InfiniCore完成相关配置和编译) + ```bash + python examples/bench.py --nvidia --model= --enable-paged-attn [--attn=default | --attn=flash-attn] + ``` diff --git a/jiuge_infer.sh b/jiuge_infer.sh new file mode 100644 index 00000000..91e7bc7e --- /dev/null +++ b/jiuge_infer.sh @@ -0,0 +1 @@ +python examples/jiuge.py --model-path /home/featurize/work/qy_interview/InfiniLM/model_weight --nvidia \ No newline at end of file From 56a39cdf0933740864a3b695055f6b7620fd588b Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:38:47 +0000 Subject: [PATCH 11/12] Remove jiuge_infer.sh --- jiuge_infer.sh | 1 - 1 file changed, 1 deletion(-) delete mode 100644 jiuge_infer.sh diff --git a/jiuge_infer.sh b/jiuge_infer.sh deleted file mode 100644 index 91e7bc7e..00000000 --- a/jiuge_infer.sh +++ /dev/null @@ -1 +0,0 @@ -python examples/jiuge.py --model-path /home/featurize/work/qy_interview/InfiniLM/model_weight --nvidia \ No newline at end of file From 102c60326b0d155a65d828e0b166d0cfc9338081 Mon Sep 17 00:00:00 2001 From: MoringLotus Date: Fri, 10 Apr 2026 08:40:20 +0000 Subject: [PATCH 12/12] remove --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.gitignore b/.gitignore index 11b10c29..149d4bc3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ python/infinilm/lib/*.so # Vscode .vscode/ - +*.sh model_weight/ # Python