5 changes: 3 additions & 2 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -117,7 +117,9 @@ src/inference_endpoint/
│ ├── info.py # execute_info()
│ ├── validate.py # execute_validate()
│ └── init.py # execute_init()
├── core/types.py # APIType, Query, QueryResult, StreamChunk, QueryStatus (msgspec Structs)
├── core/
│ ├── types.py # APIType, Query, QueryResult, StreamChunk, QueryStatus (msgspec Structs)
│ └── record.py # EventRecord — transport record used by event logger and ZMQ transport
├── load_generator/
│ ├── session.py # BenchmarkSession - top-level orchestrator
│ ├── load_generator.py # LoadGenerator, SchedulerBasedLoadGenerator
@@ -144,7 +146,6 @@ src/inference_endpoint/
│ │ └── metrics_aggregator/ # MetricsAggregatorService: real-time metrics (TTFT, TPOT, ISL, OSL)
│ └── transport/ # ZMQ-based IPC transport layer
│ ├── protocol.py # Transport protocols + TransportConfig base
│ ├── record.py # Transport records
│ └── zmq/ # ZMQ implementation (context, pubsub, transport, ZMQTransportConfig)
├── dataset_manager/
│ ├── dataset.py # Dataset base class, DatasetFormat enum
16 changes: 7 additions & 9 deletions docs/CLIENT_PERFORMANCE_TUNING.md
@@ -18,20 +18,18 @@ The CPU affinity system partitions physical cores between LoadGen (main process)

## Configuration

| Setting | Location | Default | Purpose |
| -------------- | --------- | ------- | --------------------------------------------- |
| `cpu_affinity` | Top-level | `-1` | Pin loadgen and worker processes to CPU cores |
| Setting | Location | Default | Purpose |
| --------------------- | --------- | ------- | --------------------------------------------- |
| `enable_cpu_affinity` | Top-level | `true` | Pin loadgen and worker processes to CPU cores |

**Values:**

- `-1` (auto): Physical core isolation with SMT siblings, fastest cores to loadgen
- `list[int]`: Use specific cores (shared by loadgen and workers)
- `null`: Disabled
- `true` (default): Auto-compute NUMA-aware plan — physical core isolation with SMT siblings, fastest cores assigned to loadgen
- `false`: Disabled — no CPU pinning (use `--no-cpu-affinity` on the CLI)

```yaml
cpu_affinity: -1 # Auto: physical core isolation with SMT siblings
# cpu_affinity: [4, 5, 6, 7, 8, 9, 10, 11] # Explicit cores
# cpu_affinity: null # Disabled
enable_cpu_affinity: true # Auto-compute NUMA-aware plan (default)
# enable_cpu_affinity: false # Disabled
```
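The SMT-sibling partitioning that auto mode performs can be sketched in plain Python. This is purely illustrative and not the project's actual allocator; it assumes an SMT-2 layout where logical cores `i` and `i + n_physical` are siblings, and that the fastest physical cores are numbered first:

```python
def plan_affinity(n_physical: int, loadgen_cores: int = 6) -> dict:
    """Partition cores: the first `loadgen_cores` physical cores (plus their
    SMT siblings) go to loadgen, the remainder to workers."""
    def with_siblings(cores):
        # Logical core i and i + n_physical share a physical core (assumption).
        return cores + [c + n_physical for c in cores]

    loadgen_phys = list(range(loadgen_cores))
    worker_phys = list(range(loadgen_cores, n_physical))
    return {
        "loadgen": with_siblings(loadgen_phys),
        "workers": with_siblings(worker_phys),
    }

# On a hypothetical 16-physical-core machine, loadgen gets cores 0-5
# plus SMT siblings 16-21; workers get the rest.
plan = plan_affinity(n_physical=16)
```

The real plan is NUMA-aware and selects the fastest cores dynamically; this sketch only shows the physical-core/SMT-sibling split described above.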

**Auto mode allocation** (default 6 physical cores for loadgen):
3 changes: 1 addition & 2 deletions docs/CLI_QUICK_REFERENCE.md
@@ -73,8 +73,7 @@ inference-endpoint validate-yaml -c test.yaml

```bash
# Generate config templates
inference-endpoint init offline # or: online, eval, submission
inference-endpoint init submission
inference-endpoint init offline # or: online, concurrency, eval, submission

# Show system info
inference-endpoint info
4 changes: 2 additions & 2 deletions examples/02_ServerBenchmarking/README.md
@@ -46,10 +46,10 @@ enroot start -e HF_TOKEN=$HF_TOKEN -m $HF_HOME:/root/.cache/huggingface vllm+vll

## Launching the client

Once the server is up and running, we can send requests to the endpoint by passing in the endpoint address via `-e` as well as the model name
Once the server is up and running, we can send requests to the endpoint by passing in the endpoint address and model name:

```
inference-endpoint benchmark offline -e http://localhost:8000 -d tests/datasets/dummy_1k.jsonl --model ${MODEL_NAME}
inference-endpoint benchmark offline --endpoints http://localhost:8000 --dataset tests/datasets/dummy_1k.jsonl --model ${MODEL_NAME}
```

# Using a config file
181 changes: 158 additions & 23 deletions examples/04_GPTOSS120B_Example/Readme.md
@@ -1,24 +1,32 @@
# GPT-OSS-120B Benchmark

## Getting dataset
End-to-end example for benchmarking `openai/gpt-oss-120b` with vLLM or SGLang, including YAML configs
and Python scripts for AIME25, GPQA, and LiveCodeBench accuracy evaluation.

The dataset can be obtained from the LLM task-force which is in the process of finalizing the contents of the dataset for both performance and accuracy. The dataset is in a parquet format. Place it at:
## Getting the Dataset

The performance dataset must be obtained from the LLM task-force (parquet format, currently being finalized).
Place it at:

```
examples/04_GPTOSS120B_Example/data/perf_eval_ref.parquet
```

## Environment setup
The accuracy datasets (AIME25, GPQA, LiveCodeBench) are downloaded automatically from HuggingFace.

## Environment Setup

```bash
export HF_HOME=<path to your HuggingFace cache, e.g. ~/.cache/huggingface>
export HF_TOKEN=<your HuggingFace token>
export MODEL_NAME=openai/gpt-oss-120b
```

---

## vLLM

### Launch server
### Launch Server

GPT-OSS-120B requires multiple GPUs. Adjust `--tensor-parallel-size` to match your hardware.

@@ -35,36 +43,32 @@ docker run --runtime nvidia --gpus all \
--max-model-len 65536
```

### Run benchmark
### Run Benchmark

The config [`vllm_gptoss_120b_example.yaml`](vllm_gptoss_120b_example.yaml) runs performance + AIME25 + GPQA accuracy at concurrency 512:
[`vllm_gptoss_120b_example.yaml`](vllm_gptoss_120b_example.yaml) runs performance + AIME25 + GPQA accuracy at concurrency 512:

```bash
inference-endpoint benchmark from-config \
-c examples/04_GPTOSS120B_Example/vllm_gptoss_120b_example.yaml \
--timeout 60
```

> **Note:** In the YAML config, the dataset's `prompt` column is mapped into the benchmark's expected `prompt` field, which is then sent through the chat completions API. vLLM does not support pre-tokenized input via this endpoint, unlike SGLang's `input_tokens` path.
> **Note:** The dataset's `prompt` column is mapped to the benchmark's `prompt` field and sent through the
> chat completions API. vLLM does not support pre-tokenized input via this endpoint, unlike SGLang's
> `input_tokens` path.

### vllm bench serve (reference comparison)
### vllm bench serve (Reference Comparison)

`vllm bench serve` supports custom datasets only in `jsonl` format. To convert the parquet file:

```python
import pandas as pd

parquet_file = 'examples/04_GPTOSS120B_Example/data/perf_eval_ref.parquet'
json_file = 'examples/04_GPTOSS120B_Example/data/perf_eval_ref.jsonl'

df = pd.read_parquet(parquet_file)
df = df.rename(columns={'prompt': 'raw_prompt'})
df = df.rename(columns={'text_input': 'prompt'})
df.to_json(json_file, orient='records', lines=True)
df = pd.read_parquet('examples/04_GPTOSS120B_Example/data/perf_eval_ref.parquet')
df = df.rename(columns={'prompt': 'raw_prompt', 'text_input': 'prompt'})
df.to_json('examples/04_GPTOSS120B_Example/data/perf_eval_ref.jsonl', orient='records', lines=True)
```

This renames `text_input` to `prompt` as the custom dataloader requires the pre-processed prompt under that name. The benchmarking command must point to the `completions` endpoint (not `chat-completions`) since the prompt is pre-processed. Numbers are not directly comparable to inference-endpoint results, but provide a reference for relative performance given the output token distribution.

```bash
vllm bench serve \
--backend vllm \
@@ -79,9 +83,34 @@ vllm bench serve \
--save-detailed
```

Numbers are not directly comparable to `inference-endpoint` results but provide a reference for relative
performance given the output token distribution.

---

## SGLang

### Launch server
### Launch Server

**Option A: MLCommons MLPerf Inference Reference Implementation**

The official reference provides detailed instructions for model setup, data preparation, and deployment:

```bash
git clone https://github.com/mlcommons/inference.git
cd inference/language/gpt-oss-120b
# Follow the README at:
# https://github.com/mlcommons/inference/tree/master/language/gpt-oss-120b
./sglang/run_server.sh \
--model_path /path/to/gpt-oss-120b/model/ \
--dp <Number of GPUs> \
--stream_interval 100
```

**Option B: Direct SGLang Installation**

If you already have the model weights, follow [SGLang's GPT-OSS instructions](https://docs.sglang.io/basic_usage/gpt_oss.html).
The server must run on port 30000.

```bash
docker run --runtime nvidia --gpus all --net host \
@@ -102,24 +131,130 @@ docker run --runtime nvidia --gpus all --net host \
--stream-interval 500
```

### Run benchmark
### Run Benchmark

The config [`sglang_gptoss_120b_example.yaml`](sglang_gptoss_120b_example.yaml) runs performance + AIME25 + GPQA + LiveCodeBench accuracy at concurrency 512:
[`sglang_gptoss_120b_example.yaml`](sglang_gptoss_120b_example.yaml) runs performance + AIME25 + GPQA +
LiveCodeBench accuracy at concurrency 512:

```bash
inference-endpoint benchmark from-config \
-c examples/04_GPTOSS120B_Example/sglang_gptoss_120b_example.yaml \
--timeout 60
```

For a performance-only run with a simpler config, see [`gptoss_120b_example.yaml`](gptoss_120b_example.yaml). Update `endpoint_config.endpoints` in that file to match your server port (e.g. `http://localhost:8000` for vLLM, `http://localhost:30000` for SGLang).
For a performance-only run, use [`gptoss_120b_example.yaml`](gptoss_120b_example.yaml). Update
`endpoint_config.endpoints` to match your server port (e.g. `http://localhost:8000` for vLLM,
`http://localhost:30000` for SGLang).

### LiveCodeBench Setup

LiveCodeBench has dependency conflicts with the main package and should be run via the containerized
workflow. Follow the instructions in the
[LiveCodeBench README](../../src/inference_endpoint/evaluation/livecodebench/README.md#running-the-container).

**Non-containerized (not recommended):**

```bash
source /path/to/inference-endpoint/venv/bin/activate
pip install datasets==3.6.0
pip install fastapi==0.128.0 uvicorn[standard]==0.40.0
export ALLOW_LCB_LOCAL_EVAL=true
```

With `ALLOW_LCB_LOCAL_EVAL=true`, the `LiveCodeBenchScorer` falls back to running `lcb_serve` as a
subprocess on the host.

---

## Accuracy Suite Script

`run.py` runs all three accuracy benchmarks (GPQA, AIME25, LiveCodeBench) in sequence via SGLang:

```bash
cd examples/04_GPTOSS120B_Example
python run.py \
--report-dir ./results \
--num-repeats 1 \
--min-duration 10 \
--max-duration 600
```

| Argument | Default | Description |
| -------------------- | ------------------------ | ------------------------------------ |
| `--report-dir` | `sglang_accuracy_report` | Directory to save results |
| `--num-repeats` | `1` | Repeats per dataset |
| `--min-duration` | `10` | Minimum benchmark duration (seconds) |
| `--max-duration` | `600` | Maximum benchmark duration (seconds) |
| `--force-regenerate` | off | Force dataset regeneration |
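If you drive the suite from your own wrapper, the flags in the table above can be reproduced with a small argparse parser. This is a sketch consistent with the table, not necessarily `run.py`'s actual parser:

```python
import argparse

def build_parser() -> argparse.ArgumentParser:
    # Flags and defaults mirror the argument table above.
    p = argparse.ArgumentParser(description="GPT-OSS-120B accuracy suite")
    p.add_argument("--report-dir", default="sglang_accuracy_report",
                   help="Directory to save results")
    p.add_argument("--num-repeats", type=int, default=1,
                   help="Repeats per dataset")
    p.add_argument("--min-duration", type=int, default=10,
                   help="Minimum benchmark duration (seconds)")
    p.add_argument("--max-duration", type=int, default=600,
                   help="Maximum benchmark duration (seconds)")
    p.add_argument("--force-regenerate", action="store_true",
                   help="Force dataset regeneration")
    return p

args = build_parser().parse_args([])  # empty argv yields the table's defaults
```

`parse_args([])` returns the defaults from the table; passing `--force-regenerate` flips the boolean on.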

---

## Individual Evaluation Scripts

Run after `run.py` to re-score from an existing report directory.

### GPQA

```bash
python eval_gpqa.py \
--dataset-path datasets/gpqa/diamond/gpqa_diamond.parquet \
--report-dir ./results
```

### AIME25

```bash
python eval_aime.py \
--dataset-path datasets/aime25/aime25.parquet \
--report-dir ./results
```

### LiveCodeBench

```bash
python eval_livecodebench.py \
--dataset-path datasets/livecodebench/release_v6/livecodebench_release_v6.parquet \
--report-dir ./results \
--lcb-version release_v6 \
--timeout 60
```

---

## Debugging

[mitmproxy](https://www.mitmproxy.org/) can inspect HTTP traffic between the benchmarking client and the server in reverse-proxy mode:
[mitmproxy](https://www.mitmproxy.org/) can inspect HTTP traffic in reverse-proxy mode:

```bash
mitmproxy -p 8001 --mode reverse:http://localhost:8000/
```

This forwards port `8001` to `8000`. Run the server on port `8000` and point the client at port `8001`. All requests and responses are logged transparently.
Run the server on port 8000 and point the client at 8001. All requests and responses are logged
transparently.

---

## Troubleshooting

**Cannot connect to SGLang server**

- Verify it is running: `curl http://localhost:30000/health`
- Check firewall settings for remote servers
- Ensure the port matches in both server and client configs

**CUDA out of memory**

- Increase `--tensor-parallel-size` / `--data-parallel-size`
- Use `--mem-fraction-static` to reduce static memory allocation
- Check GPU utilization with `nvidia-smi`

**LiveCodeBench dependency conflicts**

- Use the containerized workflow (recommended)
- If running standalone, ensure `datasets==3.6.0` is installed

**Slow inference / benchmark taking too long**

- Check GPU utilization with `nvidia-smi`
- Increase `num_workers` in `run.py` or the YAML `settings.client.num_workers`
- Consider enabling FlashInfer or other SGLang optimizations
2 changes: 1 addition & 1 deletion examples/04_GPTOSS120B_Example/gptoss_120b_example.yaml
@@ -39,7 +39,7 @@ metrics:

endpoint_config:
endpoints:
- "http://localhost:3000"
- "http://localhost:30000"
api_key: null

report_dir: "results/gptoss_120b_benchmark_mlperf/"
@@ -200,13 +200,17 @@ def run_main(args):
# Always generate GPQA diamond dataset
logging.info("Generating GPQA diamond dataset...")
gpqa_dataset = GPQA.get_dataloader(
num_repeats=num_repeats, transforms=GPQA.PRESETS.gptoss()
num_repeats=num_repeats,
transforms=GPQA.PRESETS.gptoss(),
force_regenerate=args.force_regenerate,
)
gpqa_dataset.load(api_type=APIType.SGLANG, model_params=model_params)
# Always generate AIME25 dataset
logging.info("Generating AIME25 dataset...")
aime25_dataset = AIME25.get_dataloader(
num_repeats=num_repeats, transforms=AIME25.PRESETS.gptoss()
num_repeats=num_repeats,
transforms=AIME25.PRESETS.gptoss(),
force_regenerate=args.force_regenerate,
)
aime25_dataset.load(api_type=APIType.SGLANG, model_params=model_params)
logging.info(f"Dataset loaded with {aime25_dataset.num_samples()} samples")
@@ -215,6 +219,7 @@ def run_main(args):
lcb_dataset = LiveCodeBench.get_dataloader(
num_repeats=num_repeats,
transforms=LiveCodeBench.PRESETS.gptoss(),
force_regenerate=args.force_regenerate,
)
lcb_dataset.load(api_type=APIType.SGLANG, model_params=model_params)
logging.info(f"Dataset loaded with {lcb_dataset.num_samples()} samples")