diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 000000000..476a21b1c
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+datasets/isb1/exports/preview/long_context_1m/*.json filter=lfs diff=lfs merge=lfs -text
+datasets/isb1/exports/**/*.json filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/configs/isb1-kv-stress-pr993.yaml b/.github/configs/isb1-kv-stress-pr993.yaml
new file mode 100644
index 000000000..544ecd9dd
--- /dev/null
+++ b/.github/configs/isb1-kv-stress-pr993.yaml
@@ -0,0 +1,3589 @@
+dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id001
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id001
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id002
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id002
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-b200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id003
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id003
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id004
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id004
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:sglang
+dsr1-fp4-b200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id005
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id005
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp4-b200-trt-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id006
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id006
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp4-b300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b300_sxm_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id007
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id007
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b300
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: lmsysorg/sglang:v0.5.8-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id008
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id008
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id009
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id009
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id010
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id010
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb300
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id011
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id011
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb300
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id012
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id012
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4-Preview
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id013
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id013
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id014
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id014
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id015
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id015
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp4-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id016
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id016
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4-Preview
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id017
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id017
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id018
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id018
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-b200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id019
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id019
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id020
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id020
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-b200-sglang-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id021
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id021
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-b200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id022
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id022
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-b200-trt-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id023
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id023
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-b300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b300_sxm_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id024
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id024
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b300
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id025
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id025
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb200
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id026
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id026
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb200
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id027
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id027
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb300
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id028
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id028
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb300
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: lmsysorg/sglang:v0.5.8-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id029
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id029
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h100-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-h100-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id030
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id030
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h100-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id031
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id031
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-h200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id032
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id032
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-h200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id033
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id033
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-h200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id034
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id034
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-h200-trt-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id035
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id035
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-mi300x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi300x_192gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id036
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id036
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi300x
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-mi325x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi325x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id037
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id037
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi325x
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id038
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id038
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id039
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id039
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id040
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id040
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id041
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id041
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp8-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id042
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id042
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+glm5-fp4-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id043
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id043
+    workload-type: code
+  model: nvidia/GLM-5-NVFP4
+  model-prefix: glm5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:sglang
+glm5-fp8-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id044
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id044
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+glm5-fp8-h200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: lmsysorg/sglang:glm5-hopper
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id045
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id045
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:sglang
+glm5-fp8-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id046
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id046
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+glm5-fp8-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id047
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id047
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+gptoss-fp4-b200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc2.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id048
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id048
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:trt
+gptoss-fp4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.15.1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id049
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id049
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id050
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id050
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:trt
+gptoss-fp4-h100-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id051
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id051
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: h100
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-h200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc11
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id052
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id052
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: h200
+  runtime-stack-id: standalone:trt
+gptoss-fp4-h200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id053
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id053
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: h200
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-mi300x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: amd:mi300x_192gb
+  image: vllm/vllm-openai-rocm:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id054
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id054
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi300x
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-mi325x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: amd:mi325x_288gb
+  image: vllm/vllm-openai-rocm:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id055
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id055
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi325x
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id056
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id056
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+gptoss-fp4-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id057
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id057
+    workload-type: code
+  model: amd/gpt-oss-120b-w-mxfp4-a-fp8
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+kimik2.5-fp4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id058
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id058
+    workload-type: code
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: dynamo-vllm
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: vllm/vllm-openai:v0.18.0-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id059
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id059
+    workload-type: code
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:vllm
+kimik2.5-fp4-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id060
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id060
+    workload-type: code
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+kimik2.5-fp4-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id061
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id061
+    workload-type: code
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.15.1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id062
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id062
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-h200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: vllm/vllm-openai:v0.16.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id063
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id063
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: h200
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-mi300x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi300x_192gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id064
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id064
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: mi300x
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-mi325x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi325x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id065
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id065
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: mi325x
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id066
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id066
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.19.0-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id067
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id067
+    workload-type: code
+  model: nvidia/MiniMax-M2.5-NVFP4
+  model-prefix: minimaxm2.5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.19.0-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id068
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id068
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-h100-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id069
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id069
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: h100
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-h200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id070
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id070
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: amd:mi300x_192gb
+  image: vllm/vllm-openai-rocm:v0.16.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id071
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id071
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi300x
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: amd:mi325x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id072
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id072
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi325x
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id073
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id073
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.19.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id074
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id074
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+qwen3.5-bf16-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id075
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id075
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: b200
+  runtime-stack-id: standalone:sglang
+qwen3.5-bf16-mi300x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi300x_192gb
+  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id076
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id076
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: mi300x
+  runtime-stack-id: standalone:sglang
+qwen3.5-bf16-mi325x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi325x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id077
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id077
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: mi325x
+  runtime-stack-id: standalone:sglang
+qwen3.5-bf16-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id078
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id078
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp4-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id079
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id079
+    workload-type: code
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp4-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi35x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id080
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id080
+    workload-type: code
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp8-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130-amd64
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id081
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id081
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress:
benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id082 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id082 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu129-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id083 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id083 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.10.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id084 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id084 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id085 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id085 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - 
export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id086 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id086 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id087 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id087 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang diff --git a/.github/configs/isb1-kv-stress.yaml b/.github/configs/isb1-kv-stress.yaml new file mode 100644 index 000000000..9ee07ef5d --- /dev/null +++ b/.github/configs/isb1-kv-stress.yaml @@ -0,0 +1,96 @@ +# Dedicated ISB1 KV cache stress sweeps (CTO-approved schema). +# +# This file is intentionally separate from isb1-master.yaml and uses +# benchmark-type: isb1_kv_stress with kv-stress-configs. 
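+#
+# Shape note (illustration only; the authoritative expansion lives in the ISB1
+# workflow, not in this file): assuming the runner sweeps the cross-product of
+# users x offload-modes, a search-space entry such as
+#
+#   - users: [2, 4, 8, 16, 32, 64, 128, 256]
+#     offload-modes: ["on", "off", "noprefix"]
+#     duration-s: 1800
+#
+# would expand to 8 x 3 = 24 sweep points of 1800 s each per kv-stress config.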
+ +gptoss-fp4-h200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +gptoss-fp4-b200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +qwen3.5-fp8-h200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +qwen3.5-fp8-b200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 diff --git a/.github/configs/isb1-master.yaml b/.github/configs/isb1-master.yaml new file mode 100644 index 000000000..99c111967 --- /dev/null +++ b/.github/configs/isb1-master.yaml @@ -0,0 +1,1723 @@ +# PR2 packaged the core 8k1k replay bundles. +# PR4 adds truthful long-context extension replay lanes using only the materialized +# extension_32k / extension_64k / extension_131k code bundles. +# These extension lanes are served-shape replay artifacts derived from larger source +# workloads; they are not native 500k+/1M+ InferenceX served-lane claims. +# +# Core entries keep an explicit 8k1k max-model-len. Extension entries intentionally +# omit max-model-len so the ISB1 workflow derives the served-shape value from the +# export stem (32k1k / 64k1k / 131k1k) at execution time. 
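+# For example (the derivation itself belongs to the workflow; this is only the
+# stem reading): datasets/isb1/exports/extension_64k/sglang/code_64k1k.json has
+# stem 64k1k, i.e. a roughly 64k-input / 1k-output served shape, so the lane is
+# sized for that shape at run time, the same way the core entries pin the 8k1k
+# shape explicitly via max-model-len: 10240.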
+# +# Official replay-configs pin support-status: supported so the workflow only replays +# the supported subset of mixed-status export bundles. +# All currently runnable rows also resolve to +# benchmark_certification_status=dataset_replay_verified. +# Phase 2 adds truthful chat-extension widening plus bounded preview/offload +# lanes. Preview rows stay explicit via support-status: reviewed_preview and the +# dedicated preview export paths. The current replay closure covers dsr1, +# gptoss, and qwen3.5 across core 8k1k plus extension bands, with bounded +# 500k code preview for gptoss and qwen3.5 on standalone sglang/vllm across +# b200/h100/h200. + +dsr1-fp8-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-b200-isb1-vllm: + image: vllm/vllm-openai:v0.19.0-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + 
num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h100-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-b200-isb1-vllm: + # Keep the existing B200 GPT-OSS vLLM pin from the official throughput lane. 
+ image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h100-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + 
support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-b200-isb1-vllm: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 
+ model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.19.0-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + 
support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: 
multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h100-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: 
datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + 
+gptoss-fp4-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h100-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: 
datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 
4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: 
qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h100-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-b200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h100-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + 
runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: 
nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: 
datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + 
max-turns-per-session: 6 + num-warmup-sessions: 0 diff --git a/.github/configs/isb1-qwen-1m-preview.yaml b/.github/configs/isb1-qwen-1m-preview.yaml new file mode 100644 index 000000000..1de9c7339 --- /dev/null +++ b/.github/configs/isb1-qwen-1m-preview.yaml @@ -0,0 +1,53 @@ +# Manual-only gated Qwen 1M preview surface. +# The selected export cells remain support-status=reviewed_preview and +# benchmark_certification_status=dataset_replay_verified, but this file is +# intentionally separate from isb1-master.yaml so the lane stays out of the +# ordinary runnable support statement. +# +# Use only for explicit validation dispatches while KV-offload observability and +# correctness remain under review. Running this file does not imply native 1M +# served-lane support or KV-offload certification. + +qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 1048576 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 1 + max-turns-per-session: 3 + num-warmup-sessions: 0 + +qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 1048576 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 1 + max-turns-per-session: 3 + num-warmup-sessions: 0 diff --git a/.github/configs/isb1-triattn-preview.yaml b/.github/configs/isb1-triattn-preview.yaml new file mode 100644 index 000000000..629cb8fe9 --- /dev/null +++ b/.github/configs/isb1-triattn-preview.yaml @@ -0,0 +1,291 @@ +# TriAttention KV-compression preview lanes for ISB1 replay benchmarks. +# +# These entries deploy vLLM with the TriAttention plugin enabled for runtime +# KV-cache compression on H100/H200 Hopper-class GPUs. The plugin uses env +# vars TRIATTN_RUNTIME_KV_BUDGET and TRIATTN_RUNTIME_SPARSE_STATS_PATH, +# configured in the benchmark scripts. +# +# Key differences from baseline vLLM ISB1 entries: +# - model-prefix includes "triattn" suffix to route to dedicated scripts +# - Prefix caching disabled (incompatible with KV compression) +# - max-num-batched-tokens lowered to 1024 (prevents OOM from large prefills) +# - KV budget auto-detected: 2048 for code workloads, 12000 for chat workloads +# +# This file is intentionally separate from isb1-master.yaml — TriAttention +# preview lanes stay out of the ordinary runnable support statement. +# Use only for explicit validation dispatches. 
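+# +# For orientation only: a hypothetical launch wrapper (names and paths illustrative, not the committed benchmark scripts) would wire the plugin roughly as +# if [[ "$WORKLOAD_TYPE" == "code" ]]; then kv_budget=2048; else kv_budget=12000; fi +# export TRIATTN_RUNTIME_KV_BUDGET="$kv_budget" +# export TRIATTN_RUNTIME_SPARSE_STATS_PATH=/workspace/triattn_stats/stats.pt +# vllm serve "$MODEL" --no-enable-prefix-caching --max-num-batched-tokens 1024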
+# +# Prerequisites: +# - triattention pip package installed in the container (or installed at runtime) +# - Optional: pre-calibrated stats at /workspace/triattn_stats/_stats.pt + +# --------------------------------------------------------------------------- +# DeepSeek-R1 FP8 — H100/H200 with TriAttention — core 8k1k +# --------------------------------------------------------------------------- + +dsr1triattn-fp8-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1triattn-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +# --------------------------------------------------------------------------- +# DeepSeek-R1 FP8 — H100/H200 with TriAttention — long-context extensions +# --------------------------------------------------------------------------- + +dsr1triattn-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +dsr1triattn-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - 
export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +# --------------------------------------------------------------------------- +# Qwen 3.5 FP8 — H100/H200 with TriAttention — extension only +# (Qwen 3.5 is not present in core 8k1k exports; only extension 131k) +# --------------------------------------------------------------------------- + +qwen3.5triattn-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + +qwen3.5triattn-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + +# --------------------------------------------------------------------------- +# GPT-OSS-120B FP4 — H100/H200 with TriAttention — core 8k1k +# --------------------------------------------------------------------------- + +gptosstriattn-fp4-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptosstriattn-fp4-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +# --------------------------------------------------------------------------- +# GPT-OSS-120B FP4 — H100/H200 with 
TriAttention — long-context extensions +# --------------------------------------------------------------------------- + +gptosstriattn-fp4-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +gptosstriattn-fp4-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 diff --git a/.github/workflows/benchmark-isb1-tmpl.yml b/.github/workflows/benchmark-isb1-tmpl.yml new file mode 100644 index 000000000..d152d2062 --- /dev/null +++ b/.github/workflows/benchmark-isb1-tmpl.yml @@ -0,0 +1,451 @@ +name: Template - Benchmark ISB1 +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + model-prefix: + required: true + type: string + precision: + required: true + type: string + framework: + required: true + type: string + exp-name: + required: true + type: string + benchmark-type: + required: true + type: string + export-file: + required: true + type: string + runtime-stack-id: + required: true + type: string + hardware-profile-id: + required: true + type: string + canonical-model-id: + required: true + type: string + support-status: + required: false + type: string + default: '' + request-mode: + required: true + type: string + max-concurrency: + required: true + type: string + max-sessions: + required: false + type: string + default: '' + max-turns-per-session: + required: false + type: string + default: '' + max-output-len: + required: false + type: string + default: '' + num-warmup-sessions: + required: false + type: string + default: '0' + ignore-waits: + required: false + type: boolean + default: false + ignore-eos: + required: false + type: boolean + default: false + max-model-len: + required: false + type: string + default: '' + tp-override: + required: false + type: string + default: '' + ep-override: + required: false + type: string + default: '' + trace-source: + required: false + type: string + default: '' + offload-mode: + required: false + type: string + default: '' + kv-cache-dtype: + required: false + type: string + default: '' + disable-prefix-caching: + required: false + type: boolean + default: false + benchmark-duration-s: + required: false + type: string + 
default: '' + workload-type: + required: false + type: string + default: '' + vllm-cpu-offload-gb: + required: false + type: string + default: '' + vllm-swap-space-gb: + required: false + type: string + default: '' + sglang-mem-fraction-override: + required: false + type: string + default: '' + sglang-chunked-prefill-override: + required: false + type: string + default: '' + ref: + description: Git ref (branch/sha) to checkout + required: false + type: string + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + MODEL_PREFIX: ${{ inputs.model-prefix }} + IMAGE: ${{ inputs.image }} + FRAMEWORK: ${{ inputs.framework }} + PRECISION: ${{ inputs.precision }} + BENCHMARK_TYPE: ${{ inputs.benchmark-type }} + EXPORT_FILE: ${{ inputs.export-file }} + RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }} + HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }} + CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }} + SUPPORT_STATUS: ${{ inputs.support-status }} + REQUEST_MODE: ${{ inputs.request-mode }} + MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + MAX_SESSIONS: ${{ inputs.max-sessions }} + MAX_TURNS_PER_SESSION: ${{ inputs.max-turns-per-session }} + MAX_OUTPUT_LEN: ${{ inputs.max-output-len }} + NUM_WARMUP_SESSIONS: ${{ inputs.num-warmup-sessions }} + IGNORE_WAITS: ${{ inputs.ignore-waits }} + IGNORE_EOS: ${{ inputs.ignore-eos }} + OFFLOAD_MODE: ${{ inputs.offload-mode }} + KV_CACHE_DTYPE: ${{ inputs.kv-cache-dtype }} + DISABLE_PREFIX_CACHING: ${{ inputs.disable-prefix-caching }} + BENCHMARK_DURATION_S: ${{ inputs.benchmark-duration-s }} + WORKLOAD_TYPE: ${{ inputs.workload-type }} + VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} + VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} + SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} + SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} + TP_OVERRIDE: ${{ inputs.tp-override }} + EP_OVERRIDE: ${{ inputs.ep-override }} + TRACE_SOURCE: ${{ inputs.trace-source }} + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache + +permissions: + contents: read + +jobs: + benchmark: + runs-on: ${{ inputs.runner }} + timeout-minutes: 300 + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | ${{ inputs.benchmark-type }} conc-${{ inputs.max-concurrency }}" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + if command -v squeue >/dev/null 2>&1; then + if [[ "${{ runner.name }}" == h100-* || "${{ runner.name }}" == h200-* || "${{ runner.name }}" == b200-* ]]; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + else + echo "[Slurm] Cleaning up jobs for user: $USER ..." 
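+ # Non-dedicated runners fall back to per-user cleanup: cancel everything queued under the CI user (assumes the Slurm account on these hosts is used only by CI).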
+ scancel -u "$USER" || true + while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do + squeue -u "$USER" + sleep 5 + done + fi + fi + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + clean: false + + - name: Certify ISB1 export contract + env: + INPUT_EXPORT_FILE: ${{ inputs.export-file }} + INPUT_RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }} + INPUT_HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }} + INPUT_CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }} + INPUT_SUPPORT_STATUS: ${{ inputs.support-status }} + INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }} + run: | + python3 - <<'PY' + import json + import os + import re + from pathlib import Path + + export_path = Path(os.environ["INPUT_EXPORT_FILE"]) + if not export_path.exists(): + raise SystemExit(f"Missing ISB1 export file: {export_path}") + + payload = json.loads(export_path.read_text()) + exports = payload.get("exports") + if not isinstance(exports, list) or not exports: + raise SystemExit( + f"ISB1 export file must contain a non-empty 'exports' list: {export_path}" + ) + + support_status = os.environ.get("INPUT_SUPPORT_STATUS", "").strip() or None + explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip() + if not re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) and not explicit_max_model_len: + raise SystemExit( + "Mixed-shape ISB1 exports require explicit max-model-len in the workflow input. " + f"Missing for '{export_path}'." + ) + + identity_cells = [ + cell + for cell in exports + if cell.get("runtime_stack_id") == os.environ["INPUT_RUNTIME_STACK_ID"] + and cell.get("hardware_profile_id") == os.environ["INPUT_HARDWARE_PROFILE_ID"] + and cell.get("canonical_model_id") == os.environ["INPUT_CANONICAL_MODEL_ID"] + ] + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise SystemExit( + f"Ambiguous ISB1 support tier for {export_path}; identity spans {identity_statuses}. " + "Pin support-status explicitly." + ) + if not matching_cells: + raise SystemExit( + "No ISB1 export cell matches the requested workflow identity/tier for " + f"{export_path}. Available tiers for that identity: {identity_statuses or ['']}" + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise SystemExit( + "Selected ISB1 export cells must declare benchmark_certification_status. " + f"Missing for '{export_path}'." + ) + if certification_statuses != ["dataset_replay_verified"]: + raise SystemExit( + "Current InferenceX ISB1 consumer lanes only accept " + "benchmark_certification_status=dataset_replay_verified. " + f"Selected cells for '{export_path}' resolved to {certification_statuses}."
+ ) + + print( + "Certified ISB1 export contract for " + f"{export_path} with support-status={support_status or ''} " + f"and benchmark_certification_status={certification_statuses[0]}" + ) + PY + + - name: Derive ISB1 runner env + env: + INPUT_RUNNER: ${{ inputs.runner }} + INPUT_EXPORT_FILE: ${{ inputs.export-file }} + INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }} + INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + INPUT_TP_OVERRIDE: ${{ inputs.tp-override }} + run: | + python3 - <<'PY' >> "$GITHUB_ENV" + import json + import os + import re + from pathlib import Path + + runner = os.environ["INPUT_RUNNER"].lower() + export_file = os.environ["INPUT_EXPORT_FILE"] + explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip() + max_concurrency = os.environ["INPUT_MAX_CONCURRENCY"] + + if runner.startswith(("h100", "h200", "b200")): + tp = 8 + else: + raise SystemExit( + f"ISB1 replay lane is NVIDIA-first in PR1b; unsupported runner '{runner}'." + ) + + tp_override = os.environ.get("INPUT_TP_OVERRIDE", "").strip() + if tp_override: + tp = int(tp_override) + + if tp < 8: + raise SystemExit( + f"ISB1 replay requires TP=8 on NVIDIA runners; derived TP={tp} for runner '{runner}'." + ) + + export_path = Path(export_file) + match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) + + if match: + isl = int(match.group("isl")) * 1024 + osl = int(match.group("osl")) * 1024 + else: + try: + payload = json.loads(export_path.read_text()) + except Exception as exc: + raise SystemExit( + f"Could not inspect preview export metadata from '{export_file}': {exc}" + ) + served_shape = payload.get("served_shape") or {} + isl = int(served_shape.get("isl", 0) or 0) + osl = int(served_shape.get("osl", 0) or 0) + if not explicit_max_model_len: + raise SystemExit( + "Mixed-shape preview exports require explicit max-model-len in the ISB1 config. " + f"Missing for '{export_file}'." + ) + + if explicit_max_model_len: + max_model_len = int(explicit_max_model_len) + else: + max_model_len = isl + osl + (200 if max(isl, osl) >= 8192 else 20) + + print(f"TP={tp}") + print("EP_SIZE=1") + print("DP_ATTENTION=false") + print("SPEC_DECODING=none") + print("DISAGG=false") + print(f"CONC={max_concurrency}") + print(f"ISL={isl}") + print(f"OSL={osl}") + print(f"MAX_MODEL_LEN={max_model_len}") + print("RANDOM_RANGE_RATIO=1.0") + print(f"EXPORT_STEM={Path(export_file).stem}") + PY + + - id: launch + name: Launch job script + env: + RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} + run: | + RESULT_FILENAME="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_${BENCHMARK_TYPE}_${EXPORT_STEM}_conc${MAX_CONCURRENCY}_${RUNNER_NAME}" + echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV" + echo "result_filename=${RESULT_FILENAME}" >> "$GITHUB_OUTPUT" + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + FOUND_RESULT_FILE= + for i in {1..10}; do + if [ -f "$RESULT_FILENAME.json" ]; then + FOUND_RESULT_FILE=true + break + fi + echo "Waiting for result file... (attempt $i)" + sleep 1 + done + + if [ -z "$FOUND_RESULT_FILE" ]; then + echo "Run failed: Replay result $RESULT_FILENAME.json not found."
>&2 + exit 1 + fi + + - name: Process result + run: | + python3 utils/process_result_isb1.py + + - name: Upload result + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_${{ steps.launch.outputs.result_filename }} + path: agg_${{ steps.launch.outputs.result_filename }}.json + + - name: Upload raw replay result + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: replay_${{ steps.launch.outputs.result_filename }} + path: ${{ steps.launch.outputs.result_filename }}.json + if-no-files-found: ignore + + - name: Upload server logs + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: server_logs_${{ steps.launch.outputs.result_filename }} + path: server.log + if-no-files-found: ignore + + - name: Upload GPU metrics + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: gpu_metrics_${{ steps.launch.outputs.result_filename }} + path: gpu_metrics.csv + if-no-files-found: ignore + + - name: Upload KV metrics + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: kv_metrics_${{ steps.launch.outputs.result_filename }} + path: kv_metrics.csv + if-no-files-found: ignore + + - name: Resource cleanup (post-run) + if: always() + run: *resource-cleanup diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 353918609..6582914ca 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -29,6 +29,7 @@ jobs: pattern: ${{ inputs.result-prefix && format('{0}_*', inputs.result-prefix) || '*' }} - name: Print summary + if: inputs.result-prefix != 'isb1' run: | pip install tabulate python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY @@ -38,8 +39,29 @@ jobs: pip install tabulate python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} + - name: ISB1 operator summary + if: inputs.result-prefix == 'isb1' + run: | + pip install tabulate + python3 utils/summarize_isb1.py results/ >> $GITHUB_STEP_SUMMARY + + - name: ISB1 gate report + if: inputs.result-prefix == 'isb1' + run: | + AGGREGATE_PATH="agg_${{ inputs.result-prefix }}.json" + python3 utils/gate_isb1.py "$AGGREGATE_PATH" | tee isb1_gate_report.json + python3 utils/gate_isb1.py "$AGGREGATE_PATH" --format markdown >> $GITHUB_STEP_SUMMARY + - name: Upload aggregated results uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: results_${{ inputs.result-prefix || 'all' }} path: agg_${{ inputs.result-prefix || 'all' }}.json + + - name: Upload ISB1 gate report + if: inputs.result-prefix == 'isb1' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_gate_report + path: isb1_gate_report.json + if-no-files-found: ignore diff --git a/.github/workflows/run-isb1-kv-stress-sweep.yml b/.github/workflows/run-isb1-kv-stress-sweep.yml new file mode 100644 index 000000000..f72ef3307 --- /dev/null +++ b/.github/workflows/run-isb1-kv-stress-sweep.yml @@ -0,0 +1,110 @@ +name: Run ISB1 KV Stress Sweep +run-name: ISB1 KV Stress - ${{ github.event.inputs.config-file || '.github/configs/isb1-kv-stress.yaml' }} + +on: + workflow_dispatch: + inputs: + config-file: + description: ISB1 KV stress config file path + required: true + default: .github/configs/isb1-kv-stress.yaml + runner-type: + description: Optional space-separated 
runner filters (e.g. h200 b200) + required: false + default: '' + runner-config: + description: Runner config YAML + required: false + default: .github/configs/runners.yaml + ref: + description: Git ref to checkout + required: false + default: '' + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + kv-stress-matrix: ${{ steps.generate.outputs.kv-stress-matrix }} + has-matrix: ${{ steps.generate.outputs.has-matrix }} + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: pip install pydantic pyyaml + + - id: generate + env: + CONFIG_FILE: ${{ inputs.config-file }} + RUNNER_CONFIG: ${{ inputs.runner-config }} + RUNNER_TYPE: ${{ inputs.runner-type }} + run: | + if [ ! -f "$CONFIG_FILE" ]; then + echo "Missing ISB1 KV stress config file: $CONFIG_FILE" >&2 + exit 1 + fi + + cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-kv-stress-sweep --config-files "$CONFIG_FILE" --runner-config "$RUNNER_CONFIG") + + if [ -n "$RUNNER_TYPE" ]; then + read -r -a runner_types <<< "$RUNNER_TYPE" + cmd+=(--runner-type "${runner_types[@]}") + fi + + matrix_json="$("${cmd[@]}")" + compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" + has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" + + { + echo "kv-stress-matrix=$compact_matrix" + echo "has-matrix=$has_matrix" + } >> "$GITHUB_OUTPUT" + + sweep: + needs: setup + if: ${{ needs.setup.outputs.has-matrix == 'true' }} + uses: ./.github/workflows/benchmark-isb1-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.kv-stress-matrix) }} + secrets: inherit + with: + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + precision: ${{ matrix.config.precision }} + framework: ${{ matrix.config.framework }} + exp-name: ${{ matrix.config.exp-name }} + benchmark-type: ${{ matrix.config.benchmark-type }} + export-file: ${{ matrix.config.export-file }} + runtime-stack-id: ${{ matrix.config.runtime-stack-id }} + hardware-profile-id: ${{ matrix.config.hardware-profile-id }} + canonical-model-id: ${{ matrix.config.canonical-model-id }} + support-status: ${{ matrix.config.support-status || '' }} + request-mode: ${{ matrix.config.request-mode }} + max-concurrency: ${{ matrix.config.max-concurrency }} + max-model-len: ${{ matrix.config.max-model-len || '' }} + tp-override: ${{ matrix.config.tp || '' }} + ep-override: ${{ matrix.config.ep || '' }} + trace-source: ${{ matrix.config.trace-source || '' }} + offload-mode: ${{ matrix.config.offload-mode }} + kv-cache-dtype: ${{ matrix.config.kv-cache-dtype }} + disable-prefix-caching: ${{ matrix.config.disable-prefix-caching }} + benchmark-duration-s: ${{ matrix.config.benchmark-duration-s }} + workload-type: ${{ matrix.config.workload-type }} + ref: ${{ inputs.ref || github.ref }} + + collect-results: + needs: [setup, sweep] + if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + result-prefix: isb1 diff --git a/.github/workflows/run-isb1-sweep.yml b/.github/workflows/run-isb1-sweep.yml new file mode 100644 index 000000000..a8f3177de 
--- /dev/null +++ b/.github/workflows/run-isb1-sweep.yml @@ -0,0 +1,256 @@ +name: Run ISB1 Sweep +run-name: ISB1 Sweep - ${{ github.event.inputs.config-files || '.github/configs/isb1-master.yaml' }} + +on: + workflow_dispatch: + inputs: + config-files: + description: Space-separated ISB1 config file paths + required: true + default: .github/configs/isb1-master.yaml + runner-config: + description: Runner config YAML + required: false + default: .github/configs/runners.yaml + model-prefix: + description: Optional space-separated model-prefix filters + required: false + default: '' + precision: + description: Optional space-separated precision filters + required: false + default: '' + framework: + description: Optional space-separated framework filters + required: false + default: '' + runner-type: + description: Optional space-separated runner filters + required: false + default: '' + runner-node-filter: + description: Optional runner-node substring filter + required: false + default: '' + max-concurrency: + description: Optional cap applied to replay max-concurrency + required: false + default: '' + vllm-cpu-offload-gb: + description: Optional vLLM CPU offload budget in GB for long-context runs + required: false + default: '' + vllm-swap-space-gb: + description: Optional vLLM swap-space budget in GB for long-context runs + required: false + default: '' + sglang-mem-fraction-override: + description: Optional SGLang mem-fraction-static override for long-context runs + required: false + default: '' + sglang-chunked-prefill-override: + description: Optional SGLang chunked-prefill-size override for long-context runs + required: false + default: '' + ref: + description: Git ref to checkout + required: false + default: '' + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + replay-matrix: ${{ steps.generate.outputs.replay-matrix }} + has-matrix: ${{ steps.generate.outputs.has-matrix }} + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: pip install pydantic pyyaml + + - id: generate + env: + CONFIG_FILES: ${{ inputs.config-files }} + RUNNER_CONFIG: ${{ inputs.runner-config }} + MODEL_PREFIX: ${{ inputs.model-prefix }} + PRECISION: ${{ inputs.precision }} + FRAMEWORK: ${{ inputs.framework }} + RUNNER_TYPE: ${{ inputs.runner-type }} + RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} + MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + run: | + read -r -a config_files <<< "$CONFIG_FILES" + + for config_file in "${config_files[@]}"; do + if [ ! -f "$config_file" ]; then + echo "Missing ISB1 config file: $config_file" >&2 + echo "PR1b adds the workflow lane only; the committed config arrives in PR2." 
>&2 + exit 1 + fi + done + + cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-sweep --config-files "${config_files[@]}" --runner-config "$RUNNER_CONFIG") + + if [ -n "$MODEL_PREFIX" ]; then + read -r -a model_prefixes <<< "$MODEL_PREFIX" + cmd+=(--model-prefix "${model_prefixes[@]}") + fi + if [ -n "$PRECISION" ]; then + read -r -a precisions <<< "$PRECISION" + cmd+=(--precision "${precisions[@]}") + fi + if [ -n "$FRAMEWORK" ]; then + read -r -a frameworks <<< "$FRAMEWORK" + cmd+=(--framework "${frameworks[@]}") + fi + if [ -n "$RUNNER_TYPE" ]; then + read -r -a runner_types <<< "$RUNNER_TYPE" + cmd+=(--runner-type "${runner_types[@]}") + fi + if [ -n "$RUNNER_NODE_FILTER" ]; then + cmd+=(--runner-node-filter "$RUNNER_NODE_FILTER") + fi + if [ -n "$MAX_CONCURRENCY" ]; then + cmd+=(--max-concurrency "$MAX_CONCURRENCY") + fi + + matrix_json="$("${cmd[@]}")" + compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" + has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" + + { + echo "replay-matrix=$compact_matrix" + echo "has-matrix=$has_matrix" + } >> "$GITHUB_OUTPUT" + + - name: Write ISB1 preflight run manifest + env: + REPLAY_MATRIX: ${{ steps.generate.outputs.replay-matrix }} + HAS_MATRIX: ${{ steps.generate.outputs.has-matrix }} + INPUT_CONFIG_FILES: ${{ inputs.config-files }} + INPUT_RUNNER_CONFIG: ${{ inputs.runner-config }} + INPUT_MODEL_PREFIX: ${{ inputs.model-prefix }} + INPUT_PRECISION: ${{ inputs.precision }} + INPUT_FRAMEWORK: ${{ inputs.framework }} + INPUT_RUNNER_TYPE: ${{ inputs.runner-type }} + INPUT_RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} + INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + INPUT_VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} + INPUT_VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} + INPUT_SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} + INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} + INPUT_REF: ${{ inputs.ref || github.ref }} + WORKFLOW_RUN_ID: ${{ github.run_id }} + WORKFLOW_RUN_ATTEMPT: ${{ github.run_attempt }} + WORKFLOW_SHA: ${{ github.sha }} + run: | + python3 - <<'PY' + import json + import os + from collections import Counter + + matrix_rows = json.loads(os.environ.get("REPLAY_MATRIX") or "[]") + + def count_by(field: str) -> dict[str, int]: + values = [row.get(field) for row in matrix_rows] + normalized = ["" if value is None else str(value) for value in values] + return dict(sorted(Counter(normalized).items())) + + manifest = { + "dispatch_inputs": { + "config-files": os.environ.get("INPUT_CONFIG_FILES", ""), + "runner-config": os.environ.get("INPUT_RUNNER_CONFIG", ""), + "model-prefix": os.environ.get("INPUT_MODEL_PREFIX", ""), + "precision": os.environ.get("INPUT_PRECISION", ""), + "framework": os.environ.get("INPUT_FRAMEWORK", ""), + "runner-type": os.environ.get("INPUT_RUNNER_TYPE", ""), + "runner-node-filter": os.environ.get("INPUT_RUNNER_NODE_FILTER", ""), + "max-concurrency": os.environ.get("INPUT_MAX_CONCURRENCY", ""), + "vllm-cpu-offload-gb": os.environ.get("INPUT_VLLM_CPU_OFFLOAD_GB", ""), + "vllm-swap-space-gb": os.environ.get("INPUT_VLLM_SWAP_SPACE_GB", ""), + "sglang-mem-fraction-override": os.environ.get("INPUT_SGLANG_MEM_FRACTION_OVERRIDE", ""), + "sglang-chunked-prefill-override": os.environ.get("INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE", ""), + "ref": os.environ.get("INPUT_REF", ""), + }, 
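+ # Aggregated counts below summarize the generated matrix so operators can sanity-check the sweep shape without reading the full matrix_rows dump.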
+ "matrix_summary": { + "has_matrix": os.environ.get("HAS_MATRIX", "false"), + "total_cells": len(matrix_rows), + "by_model_prefix": count_by("model-prefix"), + "by_framework": count_by("framework"), + "by_runner": count_by("runner"), + "by_support_status": count_by("support-status"), + }, + "workflow_context": { + "run_id": os.environ.get("WORKFLOW_RUN_ID", ""), + "run_attempt": os.environ.get("WORKFLOW_RUN_ATTEMPT", ""), + "sha": os.environ.get("WORKFLOW_SHA", ""), + }, + "matrix_rows": matrix_rows, + } + + with open("isb1_run_manifest.json", "w", encoding="utf-8") as fh: + json.dump(manifest, fh, indent=2, sort_keys=True) + PY + + - name: Upload ISB1 run manifest + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_run_manifest + path: isb1_run_manifest.json + if-no-files-found: error + + sweep: + needs: setup + if: ${{ needs.setup.outputs.has-matrix == 'true' }} + uses: ./.github/workflows/benchmark-isb1-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.replay-matrix) }} + secrets: inherit + with: + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + precision: ${{ matrix.config.precision }} + framework: ${{ matrix.config.framework }} + exp-name: ${{ matrix.config.exp-name }} + benchmark-type: ${{ matrix.config.benchmark-type }} + export-file: ${{ matrix.config.export-file }} + runtime-stack-id: ${{ matrix.config.runtime-stack-id }} + hardware-profile-id: ${{ matrix.config.hardware-profile-id }} + canonical-model-id: ${{ matrix.config.canonical-model-id }} + support-status: ${{ matrix.config.support-status || '' }} + request-mode: ${{ matrix.config.request-mode }} + max-concurrency: ${{ matrix.config.max-concurrency }} + max-sessions: ${{ matrix.config.max-sessions || '' }} + max-turns-per-session: ${{ matrix.config.max-turns-per-session || '' }} + max-output-len: ${{ matrix.config.max-output-len || '' }} + num-warmup-sessions: ${{ matrix.config.num-warmup-sessions || '0' }} + ignore-waits: ${{ matrix.config.ignore-waits || false }} + ignore-eos: ${{ matrix.config.ignore-eos || false }} + max-model-len: ${{ matrix.config.max-model-len || '' }} + offload-mode: ${{ matrix.config.offload-mode || '' }} + kv-cache-dtype: ${{ matrix.config.kv-cache-dtype || '' }} + disable-prefix-caching: ${{ matrix.config.disable-prefix-caching || false }} + benchmark-duration-s: ${{ matrix.config.benchmark-duration-s || '' }} + vllm-cpu-offload-gb: ${{ inputs.vllm-cpu-offload-gb || '' }} + vllm-swap-space-gb: ${{ inputs.vllm-swap-space-gb || '' }} + sglang-mem-fraction-override: ${{ inputs.sglang-mem-fraction-override || '' }} + sglang-chunked-prefill-override: ${{ inputs.sglang-chunked-prefill-override || '' }} + ref: ${{ inputs.ref || github.ref }} + + collect-results: + needs: [setup, sweep] + if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + result-prefix: isb1 diff --git a/.gitignore b/.gitignore index 03d36472a..1b87019c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ **/__pycache__/** -**/.coverage \ No newline at end of file +**/.coverage +**/.DS_Store +prompt-exports/ +.claude \ No newline at end of file diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 535313252..ea35df323 100644 --- a/benchmarks/benchmark_lib.sh +++ 
b/benchmarks/benchmark_lib.sh @@ -66,6 +66,304 @@ stop_gpu_monitor() { GPU_MONITOR_PID="" } +KV_METRICS_PID="" +KV_METRICS_CSV="/workspace/kv_metrics.csv" +VLLM_OFFLOAD_EXTRA_ARGS="" +VLLM_EXTRA_ARGS="" +SGLANG_EXTRA_ARGS="" + +build_yarn_override_json() { + local max_model_len="${1:?}" + local factor="2.0" + if (( max_model_len > 600000 )); then + factor="4.0" + fi + echo "{\"text_config\":{\"rope_parameters\":{\"mrope_interleaved\":true,\"mrope_section\":[11,11,10],\"rope_type\":\"yarn\",\"rope_theta\":10000000,\"partial_rotary_factor\":0.25,\"factor\":${factor},\"original_max_position_embeddings\":262144}}}" +} + +apply_yarn_config_if_needed() { + local model="${1:?}" + local max_model_len="${2:?}" + if [[ "$model" == *"Qwen3.5"* || "$model" == *"qwen3.5"* || "$model" == *"Qwen3_5"* ]] && (( max_model_len > 262144 )); then + YARN_OVERRIDE_JSON=$(build_yarn_override_json "$max_model_len") + export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 + echo "YaRN enabled: factor=$(echo "$YARN_OVERRIDE_JSON" | grep -o '"factor":[0-9.]*' | cut -d: -f2) for max-model-len=$max_model_len" + fi +} + +_append_config_kv_once() { + local key="$1" + local value="$2" + + if [[ ! -f config.yaml ]]; then + return 0 + fi + + if ! grep -Eq "^${key}:" config.yaml; then + echo "${key}: ${value}" >> config.yaml + fi +} + +_remove_config_kv() { + local key="$1" + + if [[ ! -f config.yaml ]]; then + return 0 + fi + + local tmp_file + tmp_file=$(mktemp) + grep -Ev "^${key}:" config.yaml > "$tmp_file" + mv "$tmp_file" config.yaml +} + +_detect_total_cpu_dram_gb() { + if [[ -n "${TOTAL_CPU_DRAM_GB:-}" ]]; then + echo "${TOTAL_CPU_DRAM_GB}" + return 0 + fi + + if [[ -f /proc/meminfo ]]; then + awk '/MemTotal/{printf "%.0f", $2/1048576}' /proc/meminfo + return 0 + fi + + if command -v sysctl >/dev/null 2>&1; then + local mem_bytes + mem_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo "") + if [[ -n "$mem_bytes" ]]; then + awk -v bytes="$mem_bytes" 'BEGIN {printf "%.0f", bytes/1073741824}' + return 0 + fi + fi + + echo "64" +} + +apply_vllm_offload_config() { + local mode="${OFFLOAD_MODE:-legacy}" + local detected_dram_gb="" + + VLLM_OFFLOAD_EXTRA_ARGS="" + VLLM_EXTRA_ARGS="" + + case "$mode" in + on) + PREFIX_CACHING_CONFIG="" + _remove_config_kv "no-enable-prefix-caching" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + detected_dram_gb="$(_detect_total_cpu_dram_gb)" + VLLM_OFFLOAD_EXTRA_ARGS="--kv_offloading_backend native --kv_offloading_size ${detected_dram_gb} --disable-hybrid-kv-cache-manager" + ;; + off) + PREFIX_CACHING_CONFIG="" + _remove_config_kv "no-enable-prefix-caching" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + ;; + noprefix) + PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + _append_config_kv_once "no-enable-prefix-caching" "true" + ;; + legacy|"") + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" + fi + ;; + *) + echo "WARN: Unknown OFFLOAD_MODE='${mode}', falling back to legacy behavior" >&2 + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" + fi + ;; + esac + + if [[ 
"${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" + _append_config_kv_once "no-enable-prefix-caching" "true" + fi + + if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then + _append_config_kv_once "kv-cache-dtype" "fp8" + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + VLLM_EXTRA_ARGS="${VLLM_EXTRA_ARGS:-} --hf-overrides '${YARN_OVERRIDE_JSON}'" + fi +} + +apply_sglang_offload_config() { + local mode="${OFFLOAD_MODE:-legacy}" + + SGLANG_EXTRA_ARGS="" + + case "$mode" in + on) + echo "WARN: OFFLOAD_MODE=on requested for SGLang, but native KV offload is not supported. Leaving cache mode unchanged." >&2 + ;; + off) + RADIX_CACHE_ARGS="" + ;; + noprefix) + RADIX_CACHE_ARGS="--disable-radix-cache" + ;; + legacy|"") + ;; + *) + echo "WARN: Unknown OFFLOAD_MODE='${mode}' for SGLang; leaving radix cache args unchanged." >&2 + ;; + esac + + if [[ "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + RADIX_CACHE_ARGS="--disable-radix-cache" + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + SGLANG_EXTRA_ARGS="${SGLANG_EXTRA_ARGS:-} --json-model-override-args '${YARN_OVERRIDE_JSON}'" + fi +} + +# launch_vllm_server [extra args...] +# Sets: SERVER_PID, SERVER_LOG +launch_vllm_server() { + local model="$1" + local port="$2" + local config_yaml_path="$3" + shift 3 || true + local extra_args=("$@") + + if [[ -z "$model" || -z "$port" || -z "$config_yaml_path" ]]; then + echo "launch_vllm_server requires: model port config_yaml_path" >&2 + return 1 + fi + + hf download "$model" + apply_vllm_offload_config + + SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" + + local vllm_max_num_seqs="${VLLM_MAX_NUM_SEQS:-}" + if [[ -z "$vllm_max_num_seqs" ]]; then + local conc_value="${CONC:-256}" + if [[ "$conc_value" =~ ^[0-9]+$ ]] && (( conc_value > 256 )); then + vllm_max_num_seqs="$conc_value" + else + vllm_max_num_seqs="256" + fi + fi + + local vllm_tp="${TP:-1}" + local vllm_gpu_mem_util="${VLLM_GPU_MEMORY_UTILIZATION:-0.9}" + + local offload_args=() + if [[ -n "$VLLM_OFFLOAD_EXTRA_ARGS" ]]; then + # shellcheck disable=SC2206 + offload_args=($VLLM_OFFLOAD_EXTRA_ARGS) + fi + + PYTHONNOUSERSITE=1 vllm serve "$model" --host 0.0.0.0 --port "$port" \ + --config "$config_yaml_path" \ + --gpu-memory-utilization "$vllm_gpu_mem_util" \ + --tensor-parallel-size "$vllm_tp" \ + --max-num-seqs "$vllm_max_num_seqs" \ + "${extra_args[@]}" \ + "${offload_args[@]}" \ + > "$SERVER_LOG" 2>&1 & + + SERVER_PID=$! + export SERVER_PID + export SERVER_LOG +} + +# launch_sglang_server [extra args...] +# Sets: SERVER_PID, SERVER_LOG +launch_sglang_server() { + local model="$1" + local port="$2" + shift 2 || true + local extra_args=("$@") + + if [[ -z "$model" || -z "$port" ]]; then + echo "launch_sglang_server requires: model port" >&2 + return 1 + fi + + hf download "$model" + if [[ -n "${OFFLOAD_MODE:-}" || "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + apply_sglang_offload_config + fi + + SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" + + local sglang_tp="${TP:-1}" + local sglang_dp="${DP_SIZE:-1}" + + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ + --model-path "$model" \ + --host 0.0.0.0 \ + --port "$port" \ + --tensor-parallel-size "$sglang_tp" \ + --data-parallel-size "$sglang_dp" \ + "${extra_args[@]}" \ + > "$SERVER_LOG" 2>&1 & + + SERVER_PID=$! 
+ export SERVER_PID + export SERVER_LOG +} + +start_kv_metrics_collector() { + local port="${1:-8888}" + local output="${2:-$KV_METRICS_CSV}" + local interval="${3:-2.0}" + local collector_script + + collector_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../datasets/isb1/scripts" && pwd)/metrics_collector.py" + + if [[ ! -f "$collector_script" ]]; then + echo "[KV Metrics] Collector script not found at $collector_script, skipping" + return 0 + fi + + if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then + echo "[KV Metrics] Collector already running (PID=$KV_METRICS_PID)" + return 0 + fi + + KV_METRICS_CSV="$output" + python3 "$collector_script" \ + --metrics-url "http://0.0.0.0:${port}/metrics" \ + --output "$output" \ + --interval "$interval" >/tmp/kv_metrics_collector.log 2>&1 & + KV_METRICS_PID=$! + + echo "[KV Metrics] Started (PID=$KV_METRICS_PID, interval=${interval}s, output=$output)" +} + +stop_kv_metrics_collector() { + if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then + kill "$KV_METRICS_PID" 2>/dev/null || true + wait "$KV_METRICS_PID" 2>/dev/null || true + echo "[KV Metrics] Stopped (PID=$KV_METRICS_PID)" + if [[ -f "$KV_METRICS_CSV" ]]; then + local lines + lines=$(wc -l < "$KV_METRICS_CSV") + echo "[KV Metrics] Collected $lines rows -> $KV_METRICS_CSV" + fi + fi + KV_METRICS_PID="" +} + # Check if required environment variables are set # Usage: check_env_vars VAR1 VAR2 VAR3 ... # Exits with code 1 if any variable is not set @@ -395,6 +693,194 @@ run_benchmark_serving() { return $benchmark_exit_code } +is_isb1_replay_benchmark() { + [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] +} + +is_isb1_kv_stress_benchmark() { + [[ "${BENCHMARK_TYPE:-}" == "isb1_kv_stress" ]] +} + +resolve_replay_request_mode_for_harness() { + local requested_mode="${1:-auto}" + + case "$requested_mode" in + ""|auto|chat|completions) + printf '%s' "${requested_mode:-auto}" + ;; + multi-turn|multi_turn|multiturn) + printf 'auto' + ;; + *) + echo "WARN: Unsupported replay request mode '$requested_mode'; using 'auto' for the harness boundary" >&2 + printf 'auto' + ;; + esac +} + +run_isb1_kv_stress_campaign_cell() { + check_env_vars \ + BENCHMARK_TYPE \ + EXPORT_FILE \ + MAX_CONCURRENCY \ + OFFLOAD_MODE \ + BENCHMARK_DURATION_S \ + KV_CACHE_DTYPE \ + WORKLOAD_TYPE + + if ! is_isb1_kv_stress_benchmark; then + echo "Error: run_isb1_kv_stress_campaign_cell called with BENCHMARK_TYPE='${BENCHMARK_TYPE:-}'" >&2 + return 1 + fi + + local port="${PORT:-8888}" + local kv_metrics_output="/workspace/kv_metrics.csv" + local metadata_path="/workspace/kv_stress_campaign_metadata.json" + local replay_exit_code=0 + + start_gpu_monitor + start_kv_metrics_collector "$port" "$kv_metrics_output" 2.0 + + run_benchmark_export_replay "$@" || replay_exit_code=$? 
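+  # Tear the collectors down even when the replay fails; the captured exit
+  # code is propagated after the campaign metadata snapshot below.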
+ + stop_kv_metrics_collector + stop_gpu_monitor + + python3 - <<'PY' +import json +import os +import time + +metadata = { + "benchmark_type": os.getenv("BENCHMARK_TYPE", ""), + "export_file": os.getenv("EXPORT_FILE", ""), + "runtime_stack_id": os.getenv("RUNTIME_STACK_ID", ""), + "hardware_profile_id": os.getenv("HARDWARE_PROFILE_ID", ""), + "canonical_model_id": os.getenv("CANONICAL_MODEL_ID", ""), + "request_mode": os.getenv("REQUEST_MODE", ""), + "max_concurrency": os.getenv("MAX_CONCURRENCY", ""), + "offload_mode": os.getenv("OFFLOAD_MODE", ""), + "disable_prefix_caching": os.getenv("DISABLE_PREFIX_CACHING", ""), + "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE", ""), + "benchmark_duration_s": os.getenv("BENCHMARK_DURATION_S", ""), + "workload_type": os.getenv("WORKLOAD_TYPE", ""), + "metrics_files": { + "gpu": "/workspace/gpu_metrics.csv", + "kv": "/workspace/kv_metrics.csv", + }, + "captured_at_epoch_s": int(time.time()), +} +with open("/workspace/kv_stress_campaign_metadata.json", "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2, sort_keys=True) +PY + + echo "[KV Stress] Campaign metadata written to $metadata_path" + return "$replay_exit_code" +} + +run_single_node_benchmark() { + if ! is_isb1_replay_benchmark && ! is_isb1_kv_stress_benchmark; then + run_benchmark_serving "$@" + return $? + fi + + set +x + local model="" + local port="" + local result_filename="" + local result_dir="" + local workspace_dir="" + local trust_remote_code=false + local server_pid="" + + while [[ $# -gt 0 ]]; do + case $1 in + --model) model="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --result-filename) result_filename="$2"; shift 2 ;; + --result-dir) result_dir="$2"; shift 2 ;; + --bench-serving-dir) workspace_dir="$2"; shift 2 ;; + --trust-remote-code) trust_remote_code=true; shift ;; + --server-pid) server_pid="$2"; shift 2 ;; + --backend|--input-len|--output-len|--random-range-ratio|--num-prompts|--max-concurrency) + shift 2 + ;; + --use-chat-template) + shift + ;; + *) + echo "Unknown parameter: $1" + return 1 + ;; + esac + done + + if [[ -z "$model" ]]; then + echo "Error: --model is required" + return 1 + fi + if [[ -z "$port" ]]; then + echo "Error: --port is required" + return 1 + fi + if [[ -z "$result_filename" ]]; then + echo "Error: --result-filename is required" + return 1 + fi + if [[ -z "$result_dir" ]]; then + echo "Error: --result-dir is required" + return 1 + fi + + local replay_args=( + --model "$model" + --port "$port" + --export-file "${EXPORT_FILE}" + --runtime-stack-id "${RUNTIME_STACK_ID}" + --hardware-profile-id "${HARDWARE_PROFILE_ID}" + --canonical-model-id "${CANONICAL_MODEL_ID}" + --request-mode "${REQUEST_MODE:-auto}" + --max-concurrency "${MAX_CONCURRENCY}" + --num-warmup-sessions "${NUM_WARMUP_SESSIONS:-0}" + --result-filename "$result_filename" + --result-dir "$result_dir" + ) + + if [[ -n "$workspace_dir" ]]; then + replay_args+=(--bench-serving-dir "$workspace_dir") + fi + if [[ -n "${MAX_SESSIONS:-}" ]]; then + replay_args+=(--max-sessions "${MAX_SESSIONS}") + fi + if [[ -n "${SUPPORT_STATUS:-}" ]]; then + replay_args+=(--support-status "${SUPPORT_STATUS}") + fi + if [[ -n "${MAX_TURNS_PER_SESSION:-}" ]]; then + replay_args+=(--max-turns-per-session "${MAX_TURNS_PER_SESSION}") + fi + if [[ -n "${MAX_OUTPUT_LEN:-}" ]]; then + replay_args+=(--max-output-len "${MAX_OUTPUT_LEN}") + fi + if [[ "${IGNORE_WAITS:-false}" == "true" ]]; then + replay_args+=(--ignore-waits) + fi + if [[ "${IGNORE_EOS:-false}" == "true" ]]; then + replay_args+=(--ignore-eos) + 
fi + if [[ "$trust_remote_code" == true ]]; then + replay_args+=(--trust-remote-code) + fi + if [[ -n "$server_pid" ]]; then + replay_args+=(--server-pid "$server_pid") + fi + + if is_isb1_kv_stress_benchmark; then + run_isb1_kv_stress_campaign_cell "${replay_args[@]}" + else + run_benchmark_export_replay "${replay_args[@]}" + fi +} + # -------------------------------- # Profiling trace helpers @@ -805,3 +1291,215 @@ run_eval() { fi return $eval_rc } + + +# --------------------------------------------------------------------------- +# Multi-turn benchmark wrapper +# --------------------------------------------------------------------------- + +# Run multi-turn chat benchmark with standardized parameters. +# Exercises growing KV cache across conversation turns via /v1/chat/completions. +# +# IMPORTANT: The server MUST be started with prefix/radix caching ENABLED +# for meaningful multi-turn results. Do NOT use --disable-radix-cache or +# --no-enable-prefix-caching with multi-turn benchmarks. +# Replay ISB1 export sessions/events against a running server. +# +# Supports: +# - inferencex_multiturn exports via /v1/chat/completions (standalone vLLM/SGLang) +# - inferencex_trace_replay exports via either chat or projected completions +# mode (useful for TRT / Dynamo-style cells) +# +# Parameters: +# --model: Model name sent to the target server +# --port: Server port +# --export-file: Path to export JSON +# --runtime-stack-id: Filter selected export cells to one runtime stack +# --hardware-profile-id: Filter selected export cells to one hardware row +# --canonical-model-id: Filter selected export cells to one canonical model row +# --request-mode: auto|chat|completions (default: auto) +# --max-concurrency: Max concurrent replay sessions +# --num-warmup-sessions: Warmup sessions before measurement +# --result-filename: Result filename without extension +# --result-dir: Result directory +# --max-sessions: Optional session limit for smoke runs +# --max-turns-per-session: Optional turn cap for smoke runs +# --max-output-len: Optional per-turn output cap +# --ignore-waits: Ignore inter-turn wait gaps from export metadata +# --trust-remote-code: Optional flag +# --server-pid: Optional server process ID to monitor +run_benchmark_export_replay() { + set +x + local model="" + local port="" + local export_file="" + local runtime_stack_id="" + local hardware_profile_id="" + local canonical_model_id="" + local trace_id="" + local support_status="" + local request_mode="auto" + local max_concurrency="8" + local num_warmup_sessions="1" + local result_filename="" + local result_dir="" + local workspace_dir="" + local max_sessions="" + local max_turns_per_session="" + local max_output_len="" + local ignore_waits=false + local trust_remote_code=false + local ignore_eos=false + local server_pid="" + + while [[ $# -gt 0 ]]; do + case $1 in + --model) model="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --export-file) export_file="$2"; shift 2 ;; + --runtime-stack-id) runtime_stack_id="$2"; shift 2 ;; + --hardware-profile-id) hardware_profile_id="$2"; shift 2 ;; + --canonical-model-id) canonical_model_id="$2"; shift 2 ;; + --trace-id) trace_id="$2"; shift 2 ;; + --support-status) support_status="$2"; shift 2 ;; + --request-mode) request_mode="$2"; shift 2 ;; + --max-concurrency) max_concurrency="$2"; shift 2 ;; + --num-warmup-sessions) num_warmup_sessions="$2"; shift 2 ;; + --result-filename) result_filename="$2"; shift 2 ;; + --result-dir) result_dir="$2"; shift 2 ;; + --bench-serving-dir) workspace_dir="$2"; 
shift 2 ;; + --max-sessions) max_sessions="$2"; shift 2 ;; + --max-turns-per-session) max_turns_per_session="$2"; shift 2 ;; + --max-output-len) max_output_len="$2"; shift 2 ;; + --ignore-waits) ignore_waits=true; shift ;; + --trust-remote-code) trust_remote_code=true; shift ;; + --ignore-eos) ignore_eos=true; shift ;; + --server-pid) server_pid="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; + esac + done + + if [[ -z "$model" ]]; then echo "Error: --model is required"; return 1; fi + if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi + if [[ -z "$export_file" ]]; then echo "Error: --export-file is required"; return 1; fi + if [[ -z "$result_filename" ]]; then echo "Error: --result-filename is required"; return 1; fi + if [[ -z "$result_dir" ]]; then echo "Error: --result-dir is required"; return 1; fi + + if [[ -z "$workspace_dir" ]]; then + workspace_dir=$(pwd) + fi + + local requested_request_mode="$request_mode" + local harness_request_mode + harness_request_mode=$(resolve_replay_request_mode_for_harness "$request_mode") + + local benchmark_cmd=( + python3 "$workspace_dir/utils/bench_serving/benchmark_export_replay.py" + --model "$model" + --base-url "http://0.0.0.0:$port" + --export-file "$export_file" + --request-mode "$harness_request_mode" + --max-concurrency "$max_concurrency" + --num-warmup-sessions "$num_warmup_sessions" + --save-result + --result-dir "$result_dir" + --result-filename "$result_filename.json" + --metadata + "benchmark_type=${BENCHMARK_TYPE:-isb1_replay}" + "export_file=$export_file" + "runtime_stack_id=$runtime_stack_id" + "hardware_profile_id=$hardware_profile_id" + "canonical_model_id=$canonical_model_id" + "request_mode=$requested_request_mode" + "harness_request_mode=$harness_request_mode" + ) + + if [[ -n "${WORKLOAD_TYPE:-}" ]]; then + benchmark_cmd+=(--metadata "workload_type=${WORKLOAD_TYPE}") + fi + if [[ -n "${BENCHMARK_DURATION_S:-}" ]]; then + benchmark_cmd+=(--metadata "benchmark_duration_s=${BENCHMARK_DURATION_S}") + fi + if [[ -n "${OFFLOAD_MODE:-}" ]]; then + benchmark_cmd+=(--metadata "offload_mode=${OFFLOAD_MODE}") + fi + if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then + benchmark_cmd+=(--metadata "kv_cache_dtype=${KV_CACHE_DTYPE}") + fi + if [[ -n "${DISABLE_PREFIX_CACHING:-}" ]]; then + benchmark_cmd+=(--metadata "disable_prefix_caching=${DISABLE_PREFIX_CACHING}") + fi + + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=${VLLM_CPU_OFFLOAD_GB}") + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + benchmark_cmd+=(--metadata "vllm_swap_space_gb=${VLLM_SWAP_SPACE_GB}") + fi + if [[ -n "${SGLANG_MEM_FRACTION_OVERRIDE:-}" ]]; then + benchmark_cmd+=(--metadata "sglang_mem_fraction_override=${SGLANG_MEM_FRACTION_OVERRIDE}") + fi + if [[ -n "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-}" ]]; then + benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=${SGLANG_CHUNKED_PREFILL_OVERRIDE}") + fi + + if [[ -n "$runtime_stack_id" ]]; then + benchmark_cmd+=(--runtime-stack-id "$runtime_stack_id") + fi + if [[ -n "$hardware_profile_id" ]]; then + benchmark_cmd+=(--hardware-profile-id "$hardware_profile_id") + fi + if [[ -n "$canonical_model_id" ]]; then + benchmark_cmd+=(--canonical-model-id "$canonical_model_id") + fi + if [[ -n "$trace_id" ]]; then + benchmark_cmd+=(--trace-id "$trace_id") + fi + if [[ -n "$support_status" ]]; then + benchmark_cmd+=(--support-status "$support_status") + fi + if [[ -n "$max_sessions" ]]; then + benchmark_cmd+=(--max-sessions "$max_sessions") + fi + 
if [[ -n "$max_turns_per_session" ]]; then + benchmark_cmd+=(--max-turns-per-session "$max_turns_per_session") + fi + if [[ -n "$max_output_len" ]]; then + benchmark_cmd+=(--max-output-len "$max_output_len") + fi + if [[ "$ignore_waits" == true ]]; then + benchmark_cmd+=(--ignore-waits) + fi + if [[ "$trust_remote_code" == true ]]; then + benchmark_cmd+=(--trust-remote-code) + fi + if [[ "$ignore_eos" == true ]]; then + benchmark_cmd+=(--ignore-eos) + fi + + set -x + if [[ -n "$server_pid" ]]; then + "${benchmark_cmd[@]}" & + local benchmark_pid=$! + + while kill -0 "$benchmark_pid" 2>/dev/null; do + if ! kill -0 "$server_pid" 2>/dev/null; then + echo "ERROR: Server process $server_pid died during export replay benchmark" + kill "$benchmark_pid" 2>/dev/null + wait "$benchmark_pid" 2>/dev/null + set +x + return 1 + fi + sleep 2 + done + + wait "$benchmark_pid" + local benchmark_exit_code=$? + else + "${benchmark_cmd[@]}" + local benchmark_exit_code=$? + fi + set +x + + return $benchmark_exit_code +} diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index d88941628..e11290b95 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,13 +31,26 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -45,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-symm-mem $RADIX_CACHE_ARGS --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -54,7 +67,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -64,7 +77,8 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -73,5 +87,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/dsr1_fp8_b200.sh index e6d8a0e9c..0fbe9bd6c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -38,9 +38,9 @@ if [[ $TP -eq 8 ]]; then MAX_RUNNING_REQUESTS=128 CUDA_GRAPH_MAX_BATCH_SIZE=128 - MEM_FRAC_STATIC=0.82 - CHUNKED_PREFILL_SIZE=32768 - MAX_PREFILL_TOKENS=32768 + MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" + CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" elif [[ $TP -eq 4 ]]; then if [[ $ISL -ne 8192 ]] || [[ $OSL -ne 1024 ]]; then echo "TP=4 not yet supported for ISL=$ISL OSL=$OSL!" @@ -52,9 +52,9 @@ elif [[ $TP -eq 4 ]]; then MAX_RUNNING_REQUESTS=32 CUDA_GRAPH_MAX_BATCH_SIZE=32 - MEM_FRAC_STATIC=0.95 - CHUNKED_PREFILL_SIZE=8192 - MAX_PREFILL_TOKENS=8192 + MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.95}" + CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-8192}" + MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" SCHEDULER_RECV_INTERVAL=10 else @@ -63,21 +63,34 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ ---enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL $RADIX_CACHE_ARGS \ +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend 
flashinfer_trtllm --quantization fp8 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -86,7 +99,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -96,7 +109,8 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -105,5 +119,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor -set +x \ No newline at end of file +set +x diff --git a/benchmarks/single_node/dsr1_fp8_b200_vllm.sh b/benchmarks/single_node/dsr1_fp8_b200_vllm.sh new file mode 100644 index 000000000..5c3639fa9 --- /dev/null +++ b/benchmarks/single_node/dsr1_fp8_b200_vllm.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
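+# With OFFLOAD_MODE=on, apply_vllm_offload_config above fills VLLM_OFFLOAD_EXTRA_ARGS,
+# so the serve line effectively gains (size = detected host DRAM in GB):
+#   --kv_offloading_backend native --kv_offloading_size <dram_gb> --disable-hybrid-kv-cache-manager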
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index c820d180b..a9730917a 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -23,34 +23,50 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi export TORCH_CUDA_ARCH_LIST="9.0" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ - --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ + $RADIX_CACHE_ARGS --max-running-requests 512 --cuda-graph-max-bs 512 \ + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ - --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ + $RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! @@ -58,7 +74,7 @@ SERVER_PID=$! 
# Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -68,7 +84,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -77,5 +94,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1_fp8_h200_vllm.sh new file mode 100644 index 000000000..65348e831 --- /dev/null +++ b/benchmarks/single_node/dsr1_fp8_h200_vllm.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh new file mode 100755 index 000000000..60f06b13e --- /dev/null +++ b/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H100. 
+# +# Differences from baseline dsr1_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
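+# TriAttention is applied inside the serving process: the plugin is assumed to read
+# TRIATTN_RUNTIME_KV_BUDGET and TRIATTN_RUNTIME_SPARSE_STATS_PATH from the environment
+# at startup, so the vllm invocation itself is unchanged from the baseline script.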
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh new file mode 100755 index 000000000..1c4722964 --- /dev/null +++ b/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H200. +# +# Differences from baseline dsr1_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index f6a6f72e9..95240230e 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -34,15 +34,33 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + cat > config.yaml << EOF kv-cache-dtype: fp8 compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 @@ -52,6 +70,9 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -59,7 +80,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests $VLLM_OFFLOAD_EXTRA_ARGS \ +> 
$SERVER_LOG 2>&1 & SERVER_PID=$! @@ -68,7 +90,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -78,7 +100,8 @@ run_benchmark_serving \ --num-prompts $(( CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -87,5 +110,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200_sglang.sh b/benchmarks/single_node/gptoss_fp4_b200_sglang.sh new file mode 100644 index 000000000..f3d9ad82c --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_b200_sglang.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export NCCL_NVLS_ENABLE=1 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true +export PYTHONUNBUFFERED=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ +--trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +--cuda-graph-max-bs 128 --max-running-requests 128 \ +--mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 \ +--context-length "$CONTEXT_LENGTH" --kv-cache-dtype fp8_e4m3 \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ +--reasoning-parser gpt-oss --tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
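+# When OFFLOAD_MODE is set, the collector started above samples the server's /metrics
+# endpoint every 2s into /workspace/kv_metrics.csv; a quick post-run sanity check:
+#   tail -n 5 /workspace/kv_metrics.csv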
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 8d0e773a2..dc5baf287 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -17,20 +17,42 @@ fi hf download "$MODEL" -MAX_MODEL_LEN=10240 +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN}" +else + MAX_MODEL_LEN=10240 +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + cat > config.yaml << EOF -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $MAX_MODEL_LEN EOF +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=/workspace/server.log @@ -38,13 +60,17 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ ---max-num-seqs=$CONC > $SERVER_LOG 2>&1 & +--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & SERVER_PID=$! 
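# Note: $VLLM_OFFLOAD_EXTRA_ARGS is expanded unquoted on purpose, so an empty value
# contributes no argument and a populated value word-splits into separate flags.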
@@ -53,7 +79,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -63,7 +89,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -72,5 +99,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100_sglang.sh b/benchmarks/single_node/gptoss_fp4_h100_sglang.sh new file mode 100644 index 000000000..a045cd99c --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_h100_sglang.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 2a9359b96..9be9959bf 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -18,7 +18,9 @@ fi hf download "$MODEL" # Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor +if ! 
is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi set -x pip install datasets pandas @@ -37,14 +39,21 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + # Create config.yaml cat > config.yaml << EOF -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF +apply_vllm_offload_config + SERVER_LOG=/workspace/server.log export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} @@ -55,14 +64,15 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ - --max-num-seqs $CONC > $SERVER_LOG 2>&1 & + --max-num-seqs $CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -72,7 +82,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -81,5 +92,7 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -stop_gpu_monitor +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200_sglang.sh b/benchmarks/single_node/gptoss_fp4_h200_sglang.sh new file mode 100644 index 000000000..069b1a452 --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_h200_sglang.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
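+# Replay and KV-stress cells keep the radix cache enabled so multi-turn sessions can
+# reuse prefixes; OFFLOAD_MODE=noprefix restores --disable-radix-cache via
+# apply_sglang_offload_config above.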
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh new file mode 100755 index 000000000..cfff2a12d --- /dev/null +++ b/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H100. +# +# Differences from baseline gptoss_fp4_h100.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN}" +else + MAX_MODEL_LEN=10240 +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export PYTHONNOUSERSITE=1 +export VLLM_MXFP4_USE_MARLIN=1 +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--config config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh new file mode 100755 index 000000000..fc6f465bc --- /dev/null +++ b/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H200. +# +# Differences from baseline gptoss_fp4_h100.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. 
+TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN}" +else + MAX_MODEL_LEN=10240 +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export PYTHONNOUSERSITE=1 +export VLLM_MXFP4_USE_MARLIN=1 +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--config config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
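+# The workload-derived default can be pinned explicitly (illustrative value):
+#   TRIATTN_RUNTIME_KV_BUDGET=4096 bash benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh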
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh new file mode 100755 index 000000000..97fb5127c --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export NCCL_NVLS_ENABLE=1 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true +export PYTHONUNBUFFERED=1 +export TORCH_CUDA_ARCH_LIST="10.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ +--trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--quantization fp8 --kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ +--cuda-graph-max-bs "$CONC" --max-running-requests 128 \ +--mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" \ +--context-length "$CONTEXT_LENGTH" \ +--attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh new file mode 100755 index 000000000..e48c56700 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! 
is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh new file mode 100755 index 000000000..61df75cff --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--quantization fp8 --kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--max-running-requests 128 --cuda-graph-max-bs 128 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--attention-backend flashinfer \ +--stream-interval 30 --tokenizer-worker-num 6 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_single_node_benchmark \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --server-pid "$SERVER_PID"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+    stop_kv_metrics_collector
+fi
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh
new file mode 100755
index 000000000..6f576ea0f
--- /dev/null
+++ b/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
+PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true"
+if is_isb1_replay_benchmark; then
+    PREFIX_CACHING_CONFIG=""
+fi
+
+cat > config.yaml << EOF
+kv-cache-dtype: fp8
+$PREFIX_CACHING_CONFIG
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
+    echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml
+fi
+if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
+    echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml
+fi
+# Apply offload config only after config.yaml exists: the heredoc above
+# truncates the file, so anything written before it would be lost.
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+    apply_vllm_offload_config
+fi
+
+export TORCH_CUDA_ARCH_LIST="9.0"
+export PYTHONNOUSERSITE=1
+
+start_gpu_monitor
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+    start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0
+fi
+
+set -x
+vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
+--config config.yaml \
+--gpu-memory-utilization 0.9 \
+--tensor-parallel-size "$TP" \
+--max-num-seqs 256 \
+--disable-log-requests \
+--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \
+> "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh new file mode 100755 index 000000000..b3d5ea50b --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CONTEXT_LENGTH" + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--enable-flashinfer-allreduce-fusion \ +--max-running-requests 128 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ +--mem-fraction-static "$MEM_FRACTION_STATIC" \ +--cuda-graph-max-bs 128 \ +--context-length "$CONTEXT_LENGTH" \ +--kv-cache-dtype fp8_e4m3 \ +--quantization fp8 \ +--attention-backend flashinfer \ +--stream-interval 30 \ +--tokenizer-worker-num 6 \ +--mamba-ssm-dtype bfloat16 \ +$RADIX_CACHE_ARGS \ +$SGLANG_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! 
is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh new file mode 100755 index 000000000..de5c66c44 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh new file mode 100755 index 000000000..87e81ab22 --- /dev/null +++ b/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H100. 
+# +# Differences from baseline qwen3.5_fp8_h100_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh new file mode 100755 index 000000000..83fb3b8c6 --- /dev/null +++ b/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H200. +# +# Differences from baseline qwen3.5_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/datasets/isb1/.gitattributes b/datasets/isb1/.gitattributes new file mode 100644 index 000000000..d7fa37c52 --- /dev/null +++ b/datasets/isb1/.gitattributes @@ -0,0 +1,2 @@ +exports/**/*.json linguist-generated=true +exports/**/*.json text eol=lf diff --git a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md new file mode 100644 index 000000000..175765ab1 --- /dev/null +++ b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md @@ -0,0 +1,122 @@ +--- +version: 1.0.0 +date: 2026-04-14 +author: William Chen +status: proposed +--- + +# ISB1 ↔ kv-cache-tester Coexistence Plan + +## The Two Systems + +| | kv-cache-tester (Cameron's) | ISB1 (ours) | +|---|---|---| +| **Location** | `experimental/multiturn/vllm_benchmark/kv-cache-tester/` | `datasets/isb1/exports/` | +| **Traces** | 522 real Claude Code sessions | 35 synthetic multi-turn traces | +| **Source** | Real production agentic workloads | Synthetic with controlled stress patterns | +| **Replay** | `trace_replay_tester.py` | `benchmark_export_replay.py` | +| **Config** | `multiturn-agentic-trace.yaml` | `isb1-kv-stress-pr993.yaml` | +| **Metrics** | Prometheus sidecar (`metrics_collector.py`) | `process_result_isb1.py` | + +## Why Both Are Needed + +**kv-cache-tester** shows how chips perform under **real workloads** — actual Claude Code +sessions with natural token distributions. 
This is the ground truth for "how does inference +actually work in production?" + +**ISB1** shows how chips perform under **controlled stress conditions** — specific KV cache +behaviors that real workloads rarely trigger but production systems must handle: + +| Stress Pattern | kv-cache-tester | ISB1 | +|---|---|---| +| Natural agentic workload distribution | ✅ (522 real traces) | ❌ | +| Targeted prefix reuse testing | ❌ | ✅ (high_prefix stress class) | +| Forced KV offload cliff | ❌ (depends on trace) | ✅ (offload_cliff stress, 128K-1M context) | +| Session reactivation after idle | ❌ | ✅ (reactivation stress, idle windows) | +| KV compaction under long sessions | ❌ | ✅ (compaction_heavy stress, 25+ turns) | +| Shared prefix fanout | ❌ | ✅ (fanout stress, branching requests) | +| 500K-1M context depth | ❌ (real traces are shorter) | ✅ (xlc2/ulc1/ulc2 bands) | + +Together they give the Pareto frontier Cameron wants: kv-cache-tester at realistic operating +points, ISB1 at stress-test extremes. + +## How They Coexist in PR #993 + +### Configs (no conflict) +```yaml +# Cameron's existing config — uses kv-cache-tester traces +# .github/configs/multiturn-agentic-trace.yaml +h200-fp8-llama70b: + trace-file: experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/... + +# Our config — uses ISB1 export traces +# .github/configs/isb1-kv-stress-pr993.yaml +dsr1-fp8-h200-isb1-kv-stress-vllm-pr993: + export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json +``` + +### Workflows (no conflict) +```yaml +# Cameron's workflow +# .github/workflows/multiturn-sweep.yml → benchmark-multiturn-tmpl.yml +# Uses: trace_replay_tester.py + +# Our workflow +# .github/workflows/run-isb1-sweep.yml → benchmark-isb1-tmpl.yml +# Uses: benchmark_export_replay.py +``` + +### Data directories (no conflict) +``` +experimental/multiturn/vllm_benchmark/ ← Cameron's (untouched) + kv-cache-tester/ 522 real traces + replayer + aiperf/ AIPerf submodule + bench/metrics_collector.py Prometheus sidecar + analysis/plot_pareto.py Pareto charts + +datasets/isb1/ ← Ours (separate directory) + exports/ ISB1 replay bundles + extension_131k/ 131K context (DSR1, GPT-OSS, Qwen) + preview/long_context_500k/ 500K Qwen preview + preview/long_context_1m/ 1M Qwen preview +``` + +### Shared infrastructure we USE from PR #993 +- vLLM offload API flags (`--kv_offloading_backend native`, etc.) +- Prometheus metrics collector (could share `metrics_collector.py`) +- Offload mode sweep pattern (on/off/noprefix) +- Runner launch scripts (`runners/launch_*.sh`) +- Concurrency sweep structure + +### What we DO NOT touch +- `experimental/multiturn/vllm_benchmark/` — entirely Cameron's +- `kv-cache-tester/` submodule — real traces, don't modify +- `aiperf/` submodule — alternative benchmark, don't modify +- `benchmark-multiturn-tmpl.yml` — Cameron's workflow template + +## Recommended PR Structure + +### Option A: Single PR with two benchmark lanes (cleanest) +PR #993 ships with BOTH: +- Lane 1: kv-cache-tester (real traces) — Cameron's existing work +- Lane 2: ISB1 (synthetic stress traces) — our addition + +Both use the same vLLM server configs, offload modes, and concurrency sweeps. +Results are compared side by side — real vs stress. + +### Option B: ISB1 as follow-up PR (safest) +PR #993 ships with kv-cache-tester only (Cameron's work). +We submit a follow-up PR that adds ISB1 as a second benchmark lane. +Uses the same runner infrastructure and offload configs. 
+
+### Recommendation: Option A
+Cameron explicitly asked for "realistic multi-turn benchmarks" at GTC. Having both
+real traces AND synthetic stress traces in the same PR makes a stronger story:
+"Here's how chips perform under real workloads AND here's where they break under
+targeted KV stress." That's the complete Pareto frontier.
+
+## What We Need From Cameron's Team
+1. Confirm ISB1 configs don't conflict with multiturn-agentic-trace.yaml
+2. Confirm datasets/isb1/exports/ is the right location for our files
+3. Decide: do we share metrics_collector.py or use process_result_isb1.py?
+4. Agree on result format for combined Pareto visualization
diff --git a/datasets/isb1/GMI_EXECUTION_PLAN.md b/datasets/isb1/GMI_EXECUTION_PLAN.md
new file mode 100644
index 000000000..1ae696acd
--- /dev/null
+++ b/datasets/isb1/GMI_EXECUTION_PLAN.md
@@ -0,0 +1,175 @@
+# ISB1 KV Cache Benchmark — GMI Cloud Execution Plan
+
+## Available Hardware
+
+| GPU | HBM | Available | Max Context Before Offload |
+|-----|-----|-----------|---------------------------|
+| **GB200** | 192GB HBM3e | ✅ | ~384K tokens (FP8 KV) |
+| **H100** | 80GB HBM3 | ✅ | ~128K tokens (FP8 KV) |
+
+## Execution Order
+
+Run benchmarks in this order — cheapest/fastest first to validate the setup works.
+
+### Phase 1: Validation Run (1 hour)
+
+Prove the pipeline works end-to-end before burning GPU hours.
+
+```bash
+# On H100 — single model, single concurrency, 5 min duration
+export MODEL=deepseek-ai/DeepSeek-R1-0528
+export TP=8
+export EXPORT_FILE=datasets/isb1/exports/extension_131k/vllm/code_131k1k.json
+
+# Launch server
+bash benchmarks/single_node/dsr1_fp8_h100_vllm.sh
+
+# Run ONE cell: 2 users, offload=off, 300s
+python utils/bench_serving/benchmark_export_replay.py \
+  --export-file $EXPORT_FILE \
+  --max-concurrency 2 \
+  --duration 300 \
+  --request-mode multi-turn
+
+# Verify result has actual_context_len > 0
+python utils/process_result_isb1.py --result-file results/*.json
+```
+
+**Pass criteria:** TTFT and throughput numbers appear. `actual_context_len` > 100K.
+
+### Phase 2: H100 KV Stress Sweep (9 hours)
+
+H100 80GB is the interesting GPU — KV cache fills up first.
+
+```bash
+# Models to test:
+# 1. DeepSeek-R1 FP8 (TP8)
+# 2. GPT-OSS FP4 (TP8)
+
+# Sweep per model:
+# users: [2, 4, 8, 16, 32, 64]  # H100 can't do 128+ at 131K
+# offload-modes: [on, off, noprefix]
+# duration: 1800s (30 min)
+# export: extension_131k/vllm/code_131k1k.json
+
+# Total cells: 2 models × 6 concurrency × 3 offload = 36 cells
+# Time: 36 × 30min = 18 hours of runtime → ~9 hours wall-clock with the
+# two models run in parallel on separate H100 nodes
+```
+
+**What to look for:**
+- Offload cliff: at what concurrency does offload=on start helping?
+- Prefix cache hit rate: does it stay >50% under load?
+- Preemption count: how many requests get evicted?
+- TTFT degradation: when does p99 TTFT exceed 10s?
+
+### Phase 3: GB200 KV Stress Sweep (18 hours)
+
+GB200 192GB has 2.4x more HBM — the cliff comes later.
+
+```bash
+# Same sweep but higher concurrency (more HBM room):
+# users: [2, 4, 8, 16, 32, 64, 128, 256]
+# offload-modes: [on, off, noprefix]
+# duration: 1800s
+
+# Add Qwen 3.5 (needs more memory for MoE):
+# 3 models × 8 concurrency × 3 offload = 72 cells
+# Time: 72 × 30min = 36 hours → cut duration to 900s to land at ~18 hours
+```
+
+**What to look for:**
+- At what concurrency does GB200 hit its offload cliff?
+- Is the cliff ~3x H100's, as the max-context table implies once the fixed
+  weight footprint is subtracted from HBM? (See the sketch below.)
+- Does 192GB allow prefix caching to stay effective longer?
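+
+A back-of-envelope KV sizing model explains why the capacity ratio should exceed
+the raw 2.4x HBM ratio. The sketch below is illustrative only: the attention
+dimensions and the 40GB per-GPU weight footprint are placeholder assumptions,
+not measured values for any model in this plan.
+
+```python
+# Hypothetical KV-capacity estimate; every model dimension here is an assumption.
+def max_kv_tokens(hbm_gb, weights_gb=40, layers=72, kv_heads=8, head_dim=256,
+                  dtype_bytes=1, overhead_frac=0.10):
+    """Tokens of KV cache that fit after weights and runtime overhead."""
+    free_bytes = (hbm_gb - weights_gb) * (1 - overhead_frac) * 1024**3
+    bytes_per_token = 2 * layers * kv_heads * head_dim * dtype_bytes  # K and V
+    return int(free_bytes / bytes_per_token)
+
+# Weights are a fixed cost, so KV capacity grows faster than HBM:
+print(max_kv_tokens(80))    # H100:  131072 tokens (~128K)
+print(max_kv_tokens(192))   # GB200: ~498K tokens, ~3.8x under these assumptions
+```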
+ +### Phase 4: Long Context Preview (4 hours, GB200 only) + +500K and 1M token traces — only GB200 has enough memory. + +```bash +# 500K preview (Qwen 3.5 only): +export EXPORT_FILE=datasets/isb1/exports/preview/long_context_500k/\ +inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + +# 1M preview (Qwen 3.5 only): +export EXPORT_FILE=datasets/isb1/exports/preview/long_context_1m/\ +inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json + +# Low concurrency (these are HUGE contexts): +# users: [1, 2, 4] +# offload-modes: [on, off] +# duration: 900s +``` + +**What to look for:** +- Can GB200 serve 1M context at all? +- What's the TTFT for a 1M token prefill? +- Does KV offload work at this scale? + +## Estimated GPU Time + +| Phase | GPU | Duration | Cost (est) | +|-------|-----|----------|------------| +| 1. Validation | H100 | 1 hour | ~$3 | +| 2. H100 sweep | H100 | 9 hours | ~$27 | +| 3. GB200 sweep | GB200 | 18 hours | ~$90 | +| 4. Long context | GB200 | 4 hours | ~$20 | +| **Total** | | **32 hours** | **~$140** | + +## Portable Run Script + +Use `gmi_portable_benchmark.sh` for manual runs without GitHub Actions: + +```bash +# Set GMI-specific env vars +export GMI_API_KEY="..." +export HF_TOKEN="..." +export MODEL=deepseek-ai/DeepSeek-R1-0528 +export GPU_TYPE=h100 # or gb200 + +# Run the portable benchmark +bash datasets/isb1/scripts/gmi_portable_benchmark.sh \ + --model $MODEL \ + --gpu $GPU_TYPE \ + --export-file datasets/isb1/exports/extension_131k/vllm/code_131k1k.json \ + --users 2,4,8,16,32,64 \ + --offload-modes on,off,noprefix \ + --duration 1800 +``` + +## Result Collection + +After each phase, results go to: +``` +results/ +├── h100_dsr1_fp8_kv_stress/ +│ ├── users_2_offload_on.json +│ ├── users_2_offload_off.json +│ └── ... +└── gb200_dsr1_fp8_kv_stress/ + └── ... +``` + +Process and visualize: +```bash +# Aggregate results +python datasets/isb1/scripts/collect_sweep_results.py \ + --results-dir results/ \ + --output results/sweep_summary.json + +# Generate Pareto frontier chart +python datasets/isb1/scripts/plot_pareto.py \ + --summary results/sweep_summary.json \ + --output results/pareto_frontier.png +``` + +## What Success Looks Like + +After all phases, we have: +1. **Pareto frontier chart:** throughput vs p99 TTFT for H100 and GB200 +2. **Offload cliff identification:** exact concurrency where offload starts helping +3. **Prefix cache benefit:** measured hit rate under realistic multi-turn load +4. **HBM scaling evidence:** does 2.4x more HBM give 2.4x more capacity? +5. **Long context feasibility:** can GB200 serve 500K/1M context at all? + +These results go into the InferenceX PR as evidence that the benchmark works. diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md new file mode 100644 index 000000000..e3746eb58 --- /dev/null +++ b/datasets/isb1/README.md @@ -0,0 +1,125 @@ +# ISB1 replay artifacts for InferenceX + +This directory is the InferenceX-side consumer package for ISB1 replay. + +InferenceX consumes committed file artifacts only: +- replay export JSON bundles under `datasets/isb1/exports/` +- consumer configs in `.github/configs/isb1-*.yaml` +- replay processing through `utils/bench_serving/benchmark_export_replay.py` +- result normalization through `utils/process_result_isb1.py` + + +## Why not random data? + +Random data benchmarks show worst-case performance. Real inference workloads +have multi-turn conversations where each turn shares context with previous +turns. 
This enables: + +- **Prefix caching** — 60-95% of each request's tokens are shared with the + previous turn. Prefix cache hit rates directly affect throughput. +- **KV cache reuse** — the server reuses computed KV cache entries instead of + recomputing them. This is the biggest performance optimization in production. +- **Realistic offload behavior** — KV cache grows across turns, eventually + exceeding GPU memory and requiring CPU offload. Random data never reaches + this point because each request is independent. + +These traces stress-test the exact KV cache behaviors that determine real +production performance. + +InferenceX does **not** import external runtime code and does **not** make live-serving claims from export-file existence alone. + +--- + +## Current ground truth (verified 2026-04-12) + +The definitive strict audit found: + +- **26 PASSED** +- **0 FAILED** +- **10 N/A** + +Strict audit rule: count only model-architecture-valid cells. + +### Strict verified coverage + +| Model | Chat | Code | +|---|---|---| +| `dsr1` | `8k`, `32k`, `64k`, `131k` | `8k`, `32k`, `64k`, `131k` | +| `gptoss` | `8k`, `32k`, `64k`, `131k` | `8k`, `32k`, `64k`, `131k` | +| `qwen3.5` | `8k`, `32k`, `64k`, `131k`, `500k` | `8k`, `32k`, `64k`, `131k`, `500k` | + +### Existing but excluded from the strict pass count + +- `gptoss` `500k` chat/code preview files exist, but strict coverage stops at `131k` +- `qwen3.5` `1M` chat/code preview files exist, but were excluded from the strict audit +- `dsr1` has no strict `500k` or `1M` lane because the model tops out at `163840` + +--- + +## Inventory + +### Export-file counts + +- **50 export files** +- **3 JSON manifests** +- **53 total JSON files** under `datasets/isb1/exports/` +- **888 total cells** +- **5,094 total turns** +- **13 MB actual message content** +- **All export files are valid JSON** + +### Export-file breakdown + +| Class | Count | +|---|---:| +| Core `8k1k` | 8 | +| Extension `32k1k` | 8 | +| Extension `64k1k` | 8 | +| Extension `131k1k` | 10 | +| Preview `offload_core` | 4 | +| Preview `500k` | 8 | +| Preview `1M` | 4 | +| JSON manifests | 3 | + +--- + +## Claim boundary + +Safe claims: +- InferenceX carries the full audited ISB1 replay corpus described above. +- Strict replay-file coverage is **26 passed / 0 failed / 10 N/A**. +- DSR1 strict coverage stops at `131k`. +- GPT-OSS strict coverage stops at `131k`. +- Qwen strict coverage reaches `500k`. +- GPT-OSS `500k` and Qwen `1M` files exist, but are excluded from the strict pass count. 
+ +Unsafe claims: +- `26/26` valid cells verified (10 N/A due to model `max_position_embeddings` limits: DSR1=163,840, GPT-OSS=131,072, Qwen3.5=1,010,000) +- strict GPT-OSS `500k` coverage +- strict Qwen `1M` coverage +- turning preview-file existence into live benchmark certification + +--- + +## Key docs + +- [`COVERAGE_AUDIT_2026-04-11.md`](COVERAGE_AUDIT_2026-04-11.md) — definitive strict audit, file-path mapping, and N/A rationale +- [`LONG_CONTEXT_TRUTH_MATRIX.md`](LONG_CONTEXT_TRUTH_MATRIX.md) — canonical claim boundary +- [`SUPPORT_MATRIX.md`](SUPPORT_MATRIX.md) — lane-by-lane audited support table +- [`PRODUCER_GAPS.md`](PRODUCER_GAPS.md) — what remains truly open vs no longer applicable +- [`RUNBOOK_EXTERNAL_GMI.md`](RUNBOOK_EXTERNAL_GMI.md) — external operator path +- [`RUNBOOK_INTERNAL_SEMIANALYSIS.md`](RUNBOOK_INTERNAL_SEMIANALYSIS.md) — internal workflow-backed path +- [`INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md`](INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md) — what the long-context preview paths actually measure + +--- + +## Export roots + +- `datasets/isb1/exports/core/` +- `datasets/isb1/exports/extension_32k/` +- `datasets/isb1/exports/extension_64k/` +- `datasets/isb1/exports/extension_131k/` +- `datasets/isb1/exports/preview/offload_core/` +- `datasets/isb1/exports/preview/long_context_500k/` +- `datasets/isb1/exports/preview/long_context_1m/` + diff --git a/datasets/isb1/exports/core/chat_8k1k.json b/datasets/isb1/exports/core/chat_8k1k.json new file mode 100644 index 000000000..c3c2e1124 --- /dev/null +++ b/datasets/isb1/exports/core/chat_8k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08070a63d22aa247e38475fdd7e206ea41bab731f2499f0d32210b317933b075 +size 3615534 diff --git a/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json b/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json new file mode 100644 index 000000000..243cea119 --- /dev/null +++ b/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d60ff42c01d6bf117a6bddac7eae99cef2d052235101fa540fd3a7eb6466de +size 136407 diff --git a/datasets/isb1/exports/core/code_8k1k.json b/datasets/isb1/exports/core/code_8k1k.json new file mode 100644 index 000000000..1c1dd2461 --- /dev/null +++ b/datasets/isb1/exports/core/code_8k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c746a650eb624d9d40ee19aad4a9d126b4e60602f13793c09a6a8cfde81d6ee +size 2605444 diff --git a/datasets/isb1/exports/core/code_8k1k_qwen3.5.json b/datasets/isb1/exports/core/code_8k1k_qwen3.5.json new file mode 100644 index 000000000..52957e59e --- /dev/null +++ b/datasets/isb1/exports/core/code_8k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e4fc73e3ff51469ad736fda8e15df09a14bd2d430d8a9a1600ae2ca1cd13075 +size 138620 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k.json b/datasets/isb1/exports/extension_131k/chat_131k1k.json new file mode 100644 index 000000000..daefd2dad --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab224b3f15a3118204a912a3e53f3081c96ac2be1f4861b4dda5593580b2da1 +size 1231308 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json b/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json new file mode 100644 index 000000000..e1ce42508 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:ea824f39557d4bc7cc5a3e09c61815ebd32b2a7c3e78046c62c4d9da340f92d2 +size 312933 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json b/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json new file mode 100644 index 000000000..c25a74094 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20550fdc8fbb1aeaa9cf2b4fdb7807f4e8abcac5b2f871de573ea061f88e8dc5 +size 312996 diff --git a/datasets/isb1/exports/extension_131k/code_131k1k.json b/datasets/isb1/exports/extension_131k/code_131k1k.json new file mode 100644 index 000000000..99915e4cd --- /dev/null +++ b/datasets/isb1/exports/extension_131k/code_131k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66df69260749a22f4af2d2d25a6dce23b3b466533f75338da599db87ace6e833 +size 5461532 diff --git a/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json new file mode 100644 index 000000000..0b041fb66 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcd048663de0e325e601cdc44b0683a2dfbeecd53fe277937131250e1a86b3e4 +size 5027435 diff --git a/datasets/isb1/exports/extension_32k/chat_32k1k.json b/datasets/isb1/exports/extension_32k/chat_32k1k.json new file mode 100644 index 000000000..7378882af --- /dev/null +++ b/datasets/isb1/exports/extension_32k/chat_32k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606a6174834ddac7704bd199995d1b3f7c1d34b39ad4a904b80b09a22b1b04dc +size 1390574 diff --git a/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json b/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json new file mode 100644 index 000000000..8fd721f45 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44061cd4fac9b02347afcd4cbbfc4e5152020f23d6eccfccf548e198b4b7c70 +size 351049 diff --git a/datasets/isb1/exports/extension_32k/code_32k1k.json b/datasets/isb1/exports/extension_32k/code_32k1k.json new file mode 100644 index 000000000..5a09c88f5 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/code_32k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49442fa6a1ec7114c26da5aa61ec7b7dfc6662f5e636edd95e5a019ae47ca2be +size 1337748 diff --git a/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json b/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json new file mode 100644 index 000000000..a110e6c14 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f74b077263ea89567e9a09cfcecc5ea90040891170d4d65636156f9349733aa +size 337547 diff --git a/datasets/isb1/exports/extension_64k/chat_64k1k.json b/datasets/isb1/exports/extension_64k/chat_64k1k.json new file mode 100644 index 000000000..709a833b2 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/chat_64k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e7fa8895d4774cf36d9d78d9f02a35282f420598e7b373c5378330ea663b05 +size 2473612 diff --git a/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json b/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json new file mode 100644 index 000000000..79ad2cb87 --- 
/dev/null +++ b/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0533834348310306dc9e56ad4d54671a7615c9d7852fa677320bad51ee2ceaa6 +size 621810 diff --git a/datasets/isb1/exports/extension_64k/code_64k1k.json b/datasets/isb1/exports/extension_64k/code_64k1k.json new file mode 100644 index 000000000..bb1ca8974 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/code_64k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1804919d069fb037802c0d97605fb8bc6b12050f242f9ca00fc7aa7f372db81b +size 788105 diff --git a/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json b/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json new file mode 100644 index 000000000..73beb4b57 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9513a2d11519546a701d6b2889cbf18b01f5ba36abc3b6f8fb34669566e6c311 +size 200074 diff --git a/datasets/isb1/exports/preview/long_context_1m/README.md b/datasets/isb1/exports/preview/long_context_1m/README.md new file mode 100644 index 000000000..3e5ea5af9 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/README.md @@ -0,0 +1,33 @@ +# Gated 1M-class Qwen3.5 preview lane + +This directory carries the committed InferenceX-side Qwen3.5 artifacts for a +bounded `1M`-class ISB1 coding replay preview. + +## What these files are + +- dedicated replay bundles restricted to `qwen3_5_397b_a17b` +- producer cells for standalone `vllm` and standalone `sglang` +- committed bundle coverage for `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` +- restricted to `ulc2_1m_plus` +- restricted to `support_status=reviewed_preview` at the selected export-cell level +- restricted to `benchmark_certification_status=dataset_replay_verified` +- exposed downstream only through the separate manual config + `.github/configs/isb1-qwen-1m-preview.yaml` +- explicit `max-model-len: 1048576` when the manual config is used + +## Current claim boundary + +These files are committed preview artifacts plus a gated/manual validation path. +They do **not** imply ordinary runnable ISB1 support in `isb1-master.yaml`. + +Safe wording: +- InferenceX carries bounded 1M-class Qwen3.5 replay preview artifacts. +- InferenceX carries a separate gated/manual Qwen3.5 1M validation path. + +Unsafe wording: +- native 1M served-lane support +- ordinary/general runnable consumer support +- KV-offload certification + +See `manifest.json` for the exact preview boundary and +`.github/configs/isb1-qwen-1m-preview.yaml` for the manual validation surface. 
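+
+As a pre-flight guard for the manual path, it can help to assert that a bundle
+actually sits inside this boundary before launching a run. This is a minimal
+sketch only: the cell field names (`context_band`, `support_status`,
+`benchmark_certification_status`) are assumptions inferred from the manifest
+wording, not a documented export schema.
+
+```python
+# Hypothetical boundary check for a 1M preview bundle (field names assumed).
+import json
+import sys
+
+with open(sys.argv[1], encoding="utf-8") as f:  # bundle must be pulled from LFS first
+    payload = json.load(f)
+
+cells = payload.get("exports", [])
+for cell in cells:
+    assert cell.get("context_band") == "ulc2_1m_plus", cell
+    assert cell.get("support_status") == "reviewed_preview", cell
+    assert cell.get("benchmark_certification_status") == "dataset_replay_verified", cell
+print(f"OK: {len(cells)} cells inside the 1M preview boundary")
+```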
diff --git a/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json new file mode 100644 index 000000000..a37edd86a --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd16cc4de821cf4803d662e4c5091359b7a5b2b730d03c976eb331be0cd6b1cb +size 286074 diff --git a/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json new file mode 100644 index 000000000..5fd23f78c --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35572a38f071d240519f7fdbd60aa203eb4832d835df97a8a5ef874d5d402456 +size 122465512 diff --git a/datasets/isb1/exports/preview/long_context_1m/manifest.json b/datasets/isb1/exports/preview/long_context_1m/manifest.json new file mode 100644 index 000000000..3c1cfb8db --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/manifest.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e05e30fc8eddf2dd35b21b0575af6943428b2ab7e6ebe5a3df257d0344ad8b +size 2445 diff --git a/datasets/isb1/exports/preview/long_context_500k/README.md b/datasets/isb1/exports/preview/long_context_500k/README.md new file mode 100644 index 000000000..8efb153d5 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/README.md @@ -0,0 +1,45 @@ +# Bounded 500k-class preview lanes + +This directory carries the smallest honest InferenceX consumer handoff for bounded +`500k`-class ISB1 coding replay paths. + +## What these files are + +- dedicated replay bundles derived from committed `131k1k` extension exports +- restricted to `gpt_oss_120b` or `qwen3_5_397b_a17b` +- restricted to `xlc2_384k_512k` +- restricted to standalone `vllm` and standalone `sglang` +- restricted to `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` +- restricted to `support_status=reviewed_preview` +- restricted to `benchmark_certification_status=dataset_replay_verified` +- wired in the consumer with explicit `max-model-len: 524288` + +## What these files are not + +- not a native InferenceX `500k+` served lane +- not a native InferenceX `1M+` served lane +- not a supported-tier long-context expansion +- not a chat preview lane +- not an offload-depth lane +- not a KV-offload certification claim + +## Why the files exist + +The existing `extension_131k/*/code_131k1k.json` and model-scoped +`code_131k1k_qwen3.5.json` bundles already contain honest `xlc2_384k_512k` +replay cells, but they are mixed with lower-band cells. The InferenceX workflow +selects rows by runtime, hardware, model, and support tier — not by +`context_band`. + +These dedicated files isolate only the `xlc2_384k_512k` rows so InferenceX can +run bounded `500k`-class previews without over-selecting lower-band cells. 
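+
+For illustration, the derivation described above amounts to a band filter over a
+mixed `131k1k` bundle. A minimal sketch, with two loud caveats: the
+`context_band` cell field is an assumption inferred from the manifest wording,
+and the output filename is hypothetical.
+
+```python
+# Hypothetical re-derivation of a 500k-class preview bundle from a mixed export.
+import json
+
+src = "datasets/isb1/exports/extension_131k/code_131k1k.json"  # pull from LFS first
+with open(src, encoding="utf-8") as f:
+    payload = json.load(f)
+
+filtered = dict(payload)
+filtered["exports"] = [
+    cell for cell in payload.get("exports", [])
+    if cell.get("context_band") == "xlc2_384k_512k"
+]
+
+with open("code_xlc2_500k_preview.json", "w", encoding="utf-8") as f:
+    json.dump(filtered, f, indent=2)
+```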
+ +## Consumer contract + +- `isb1-master.yaml` pins these rows as `reviewed_preview` +- `isb1-master.yaml` pins `max-model-len: 524288` +- current search space is intentionally bounded to single-concurrency preview execution +- result processing preserves `context_bands`, `profile_id`, and the producer handoff claim boundary + +See `manifest.json` for the GPT-OSS derivation record and `manifest_qwen3.5.json` +for the Qwen derivation record. diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..ed88496d8 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e200fb08b06dffc83189c393c0711e090cf8f579c719e69512e2fcfb3933e33 +size 153848 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..37f8e26a2 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa883fbca2ea93ec4d3cb748265a1c66e98554c658d8a0e51ed877a95e7faf1 +size 150709 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..f996cc838 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5102d06da0cf4adfc640f1206cb26812369150d888165813012fe85183fec35 +size 157679 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..00046987f --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18faa3c3271f2f1acf3892379d3e1d13f1e0e6e1bbefdf00e5e7c5cb54bb3c72 +size 32685533 diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest.json b/datasets/isb1/exports/preview/long_context_500k/manifest.json new file mode 100644 index 000000000..deae83d6d --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/manifest.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fb9e807a7f1c9df7cc0244309f594561913d05aeff434eb3d3e1ee322e0ffd5 +size 2344 diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json new file mode 100644 index 000000000..aed23b2db --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:99682e56f2fff3506c27ce5b1e3c61273b7a0bdf9abf70e9a254b4af1cf2b936 +size 2303 diff --git a/datasets/isb1/scripts/adapt_trace_replay_result.py b/datasets/isb1/scripts/adapt_trace_replay_result.py new file mode 100644 index 000000000..445ab7d9c --- /dev/null +++ b/datasets/isb1/scripts/adapt_trace_replay_result.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +from pathlib import Path +from statistics import mean +from typing import Any + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + ordered = sorted(values) + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def _read_csv_rows(path: Path) -> list[dict[str, str]]: + with path.open("r", encoding="utf-8", newline="") as handle: + return list(csv.DictReader(handle)) + + +def _pick(row: dict[str, str], *keys: str) -> float | None: + for key in keys: + if key in row: + value = _to_float(row.get(key)) + if value is not None: + return value + return None + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Adapt kv-cache trace replay CSV output into ISB1 replay JSON schema" + ) + parser.add_argument("--input-dir", default="/workspace", help="Directory containing trace replay outputs") + parser.add_argument( + "--detailed-csv", + default="detailed_results.csv", + help="Detailed replay CSV filename (inside --input-dir)", + ) + parser.add_argument( + "--summary-json", + default=None, + help="Optional summary JSON path (used as supplemental source if present)", + ) + parser.add_argument("--output-json", required=True, help="Output adapted replay JSON path") + parser.add_argument("--model-id", default="", help="Model ID for output metadata") + parser.add_argument("--max-concurrency", type=int, default=1, help="Max concurrency used") + parser.add_argument("--request-mode", default="multi-turn", help="Request mode metadata") + parser.add_argument( + "--benchmark-certification-status", + default="dataset_replay_verified", + help="Benchmark certification status to stamp in selection", + ) + parser.add_argument( + "--support-status", + default="reviewed_preview", + help="Support status to stamp in selection", + ) + parser.add_argument( + "--result-stem", + default="", + help="Optional result stem to infer total wall time from /workspace/.json", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + input_dir = Path(args.input_dir) + detailed_csv_path = input_dir / args.detailed_csv + output_path = Path(args.output_json) + + if not detailed_csv_path.exists(): + raise SystemExit(f"Missing detailed CSV: {detailed_csv_path}") + + rows = _read_csv_rows(detailed_csv_path) + ttft_ms: list[float] = [] + tpot_ms: list[float] = [] + output_tokens: list[float] = [] + prompt_tokens: list[float] = [] + session_ids: set[str] = set() + + for row in rows: + ttft = _pick(row, "ttft_ms", "ttft", "time_to_first_token_ms") + if ttft is not None: + ttft_ms.append(ttft) + + tpot = _pick(row, "tpot_ms", "tpot", "time_per_output_token_ms") + if tpot is not None: + tpot_ms.append(tpot) + + out_tok = _pick(row, "output_tokens", "generated_tokens", "completion_tokens") + if out_tok 
is not None: + output_tokens.append(out_tok) + + in_tok = _pick(row, "input_tokens", "prompt_tokens", "content_token_count") + if in_tok is not None: + prompt_tokens.append(in_tok) + + for key in ("session_id", "session", "conversation_id"): + sid = row.get(key) + if sid: + session_ids.add(str(sid)) + break + + completed_sessions = len(session_ids) if session_ids else len(rows) + total_sessions = completed_sessions + + total_output_tokens = sum(output_tokens) + total_prompt_tokens = sum(prompt_tokens) + total_token_count = total_output_tokens + total_prompt_tokens + + total_wall_time_s = 0.0 + if args.result_stem: + maybe_summary = input_dir / f"{args.result_stem}.json" + if maybe_summary.exists(): + try: + summary = json.loads(maybe_summary.read_text(encoding="utf-8")) + total_wall_time_s = float( + _to_float(summary.get("test_duration_seconds")) + or _to_float(summary.get("duration_s")) + or _to_float(summary.get("total_duration_s")) + or 0.0 + ) + except Exception: + total_wall_time_s = 0.0 + + if total_wall_time_s <= 0 and args.summary_json: + summary_path = Path(args.summary_json) + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + total_wall_time_s = float( + _to_float(summary.get("test_duration_seconds")) + or _to_float(summary.get("duration_s")) + or _to_float(summary.get("total_duration_s")) + or 0.0 + ) + except Exception: + total_wall_time_s = 0.0 + + if total_wall_time_s <= 0: + total_wall_time_s = 1.0 + + aggregate_metrics = { + "total_token_throughput_tps": total_token_count / total_wall_time_s, + "output_throughput_tps": total_output_tokens / total_wall_time_s, + "mean_ttft_ms": mean(ttft_ms) if ttft_ms else 0.0, + "median_ttft_ms": _percentile(ttft_ms, 0.50), + "p99_ttft_ms": _percentile(ttft_ms, 0.99), + "mean_tpot_ms": mean(tpot_ms) if tpot_ms else 0.0, + "median_tpot_ms": _percentile(tpot_ms, 0.50), + "p99_tpot_ms": _percentile(tpot_ms, 0.99), + "completed_sessions": completed_sessions, + "total_sessions": total_sessions, + "session_throughput_sps": completed_sessions / total_wall_time_s, + "total_wall_time_s": total_wall_time_s, + } + + adapted = { + "model_id": args.model_id, + "max_concurrency": args.max_concurrency, + "request_mode": args.request_mode, + "harness_request_mode": "auto", + "aggregate_metrics": aggregate_metrics, + "selection": { + "support_statuses": [args.support_status], + "benchmark_certification_statuses": [args.benchmark_certification_status], + }, + "server_metrics_summary": { + "observability_status": "unavailable", + "gpu_cache_metric_name": None, + "cpu_cache_metric_name": None, + "gpu_cache_usage_peak": 0.0, + "cpu_cache_usage_peak": 0.0, + "preemption_count": 0, + "kv_offload_observed": False, + "cpu_cache_metric_available": False, + }, + "depth_telemetry": { + "total_actual_input_tokens": int(total_prompt_tokens), + "max_actual_context_len_per_turn": int(max(prompt_tokens) if prompt_tokens else 0), + }, + "num_sessions": total_sessions, + "max_turns": None, + "per_turn_metrics": {}, + } + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(adapted, indent=2, sort_keys=True), encoding="utf-8") + print(f"Wrote adapted replay JSON: {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/analyze_benchmark_distributions.py b/datasets/isb1/scripts/analyze_benchmark_distributions.py new file mode 100644 index 000000000..06c5a65f1 --- /dev/null +++ 
b/datasets/isb1/scripts/analyze_benchmark_distributions.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze ISL/OSL/turn distributions for ISB1 exports or kv-cache traces") + parser.add_argument("--export-file", default=None, help="ISB1 export JSON file") + parser.add_argument("--trace-dir", default=None, help="kv-cache-tester trace directory") + parser.add_argument("--output-dir", required=True, help="Output directory") + return parser.parse_args() + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + ordered = sorted(values) + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def _histogram(values: list[int], bins: list[int]) -> dict[str, int]: + counts: dict[str, int] = {} + for value in values: + placed = False + prev = 0 + for bound in bins: + if value <= bound: + key = f"{prev + 1}-{bound}" + counts[key] = counts.get(key, 0) + 1 + placed = True + break + prev = bound + if not placed: + key = f">{bins[-1]}" + counts[key] = counts.get(key, 0) + 1 + return counts + + +def _extract_isb1(export_payload: dict[str, Any]) -> tuple[list[int], list[int], list[int]]: + isl: list[int] = [] + osl: list[int] = [] + turns_per_session: list[int] = [] + + for cell in export_payload.get("exports", []): + session = cell.get("session") or {} + turns = session.get("turns") or [] + turns_per_session.append(len(turns)) + for turn in turns: + input_tokens = ( + turn.get("actual_input_tokens") + or turn.get("content_token_count") + or turn.get("prompt_tokens") + or turn.get("input_tokens") + or 0 + ) + output_tokens = ( + turn.get("expected_output_tokens") + or turn.get("target_output_tokens") + or turn.get("output_tokens") + or 0 + ) + try: + isl.append(int(input_tokens)) + except Exception: + isl.append(0) + try: + osl.append(int(output_tokens)) + except Exception: + osl.append(0) + + return isl, osl, turns_per_session + + +def _extract_trace_dir(trace_dir: Path) -> tuple[list[int], list[int], list[int]]: + isl: list[int] = [] + osl: list[int] = [] + turns_per_session: list[int] = [] + + files = list(sorted(trace_dir.glob("*.json"))) + if not files: + raise SystemExit(f"No JSON traces found in {trace_dir}") + + for path in files: + payload = json.loads(path.read_text(encoding="utf-8")) + sessions = payload.get("sessions") or [] + for session in sessions: + turns = session.get("turns") or [] + turns_per_session.append(len(turns)) + for turn in turns: + isl.append(int(turn.get("content_token_count", 0) or 0)) + osl.append(int(turn.get("target_output_tokens", 0) or 0)) + + return isl, osl, turns_per_session + + +def build_report(isl: list[int], osl: list[int], turns_per_session: list[int], source: str) -> dict[str, Any]: + return { + "source": source, + "num_sessions": len(turns_per_session), + "num_turns": len(isl), + "isl": { + "p50": _percentile([float(x) for x in isl], 0.50), + "p95": _percentile([float(x) for x in isl], 0.95), + "max": max(isl) if isl else 0, + "histogram": _histogram(isl, [1024, 4096, 8192, 16384, 32768, 65536]), + }, + "osl": { + "p50": _percentile([float(x) for x in osl], 0.50), + "p95": _percentile([float(x) for x in osl], 0.95), + "max": max(osl) if osl else 0, + "histogram": 
_histogram(osl, [64, 128, 256, 512, 1024, 2048, 4096]), + }, + "turns_per_session": { + "p50": _percentile([float(x) for x in turns_per_session], 0.50), + "p95": _percentile([float(x) for x in turns_per_session], 0.95), + "max": max(turns_per_session) if turns_per_session else 0, + "histogram": _histogram(turns_per_session, [2, 4, 8, 16, 32]), + }, + } + + +def main() -> int: + args = parse_args() + if bool(args.export_file) == bool(args.trace_dir): + raise SystemExit("Provide exactly one of --export-file or --trace-dir") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if args.export_file: + export_path = Path(args.export_file) + payload = json.loads(export_path.read_text(encoding="utf-8")) + isl, osl, turns_per_session = _extract_isb1(payload) + report = build_report(isl, osl, turns_per_session, source=str(export_path)) + else: + trace_dir = Path(args.trace_dir) + isl, osl, turns_per_session = _extract_trace_dir(trace_dir) + report = build_report(isl, osl, turns_per_session, source=str(trace_dir)) + + output_path = output_dir / "distribution_report.json" + output_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + print(f"Wrote: {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/collect_sweep_results.py b/datasets/isb1/scripts/collect_sweep_results.py new file mode 100644 index 000000000..0d7155428 --- /dev/null +++ b/datasets/isb1/scripts/collect_sweep_results.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Aggregate sweep results from DB or agg_*.json directory") + parser.add_argument("--db-path", default=None, help="SQLite DB path") + parser.add_argument("--json-dir", default=None, help="Directory containing agg_*.json files") + parser.add_argument("--output-dir", required=True, help="Output directory") + parser.add_argument("--cliff-ttft-ms", type=float, default=5000.0, help="TTFT p99 threshold for capacity cliff") + return parser.parse_args() + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def collect_from_db(db_path: Path) -> list[dict[str, Any]]: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + """ + SELECT offload_mode, throughput_tok_s, ttft_p99_ms, max_concurrency, raw_result_json + FROM benchmark_runs + WHERE offload_mode IS NOT NULL + ORDER BY id ASC + """ + ).fetchall() + conn.close() + + out: list[dict[str, Any]] = [] + for row in rows: + concurrency = row["max_concurrency"] + if concurrency in (None, "") and row["raw_result_json"]: + try: + payload = json.loads(row["raw_result_json"]) + concurrency = payload.get("conc") or payload.get("max_concurrency") + except Exception: + pass + out.append( + { + "offload_mode": row["offload_mode"], + "concurrency": _to_int(concurrency), + "throughput_tok_s": _to_float(row["throughput_tok_s"]), + "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), + "source": "db", + } + ) + return out + + +def collect_from_json_dir(json_dir: Path) -> 
list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for path in sorted(json_dir.glob("agg_*.json")): + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + rows.append( + { + "offload_mode": payload.get("offload_mode"), + "concurrency": _to_int(payload.get("conc") or payload.get("max_concurrency")), + "throughput_tok_s": _to_float(payload.get("throughput_tok_s") or payload.get("tput_per_gpu")), + "ttft_p99_ms": _to_float(payload.get("ttft_p99_ms") or payload.get("p99_ttft_ms")), + "source": str(path.name), + } + ) + return rows + + +def compute_capacity_cliff(rows: list[dict[str, Any]], threshold_ms: float) -> dict[str, Any]: + cliff: dict[str, Any] = {} + for mode in sorted({row.get("offload_mode") for row in rows if row.get("offload_mode")}): + mode_rows = sorted( + [r for r in rows if r.get("offload_mode") == mode and r.get("concurrency") is not None], + key=lambda r: r["concurrency"], + ) + cliff_row = None + for row in mode_rows: + if (row.get("ttft_p99_ms") or 0.0) > threshold_ms: + cliff_row = row + break + cliff[str(mode)] = cliff_row + return cliff + + +def compute_offload_benefit(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + by_conc: dict[int, dict[str, dict[str, Any]]] = {} + for row in rows: + conc = row.get("concurrency") + mode = row.get("offload_mode") + if conc is None or mode is None: + continue + by_conc.setdefault(int(conc), {})[str(mode)] = row + + deltas: list[dict[str, Any]] = [] + for conc in sorted(by_conc): + modes = by_conc[conc] + on = modes.get("on") + off = modes.get("off") + if not on or not off: + continue + on_tput = on.get("throughput_tok_s") or 0.0 + off_tput = off.get("throughput_tok_s") or 0.0 + deltas.append( + { + "concurrency": conc, + "throughput_on": on_tput, + "throughput_off": off_tput, + "offload_benefit_delta_tps": on_tput - off_tput, + } + ) + return deltas + + +def write_csv(path: Path, rows: list[dict[str, Any]]) -> None: + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms", "source"]) + for row in rows: + writer.writerow([ + row.get("offload_mode"), + row.get("concurrency"), + row.get("throughput_tok_s"), + row.get("ttft_p99_ms"), + row.get("source"), + ]) + + +def main() -> int: + args = parse_args() + if not args.db_path and not args.json_dir: + raise SystemExit("Provide --db-path or --json-dir") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + rows: list[dict[str, Any]] = [] + if args.db_path: + rows.extend(collect_from_db(Path(args.db_path))) + if args.json_dir: + rows.extend(collect_from_json_dir(Path(args.json_dir))) + + summary = { + "num_rows": len(rows), + "capacity_cliff": compute_capacity_cliff(rows, args.cliff_ttft_ms), + "offload_benefit": compute_offload_benefit(rows), + "rows": rows, + } + + json_path = output_dir / "sweep_aggregate.json" + csv_path = output_dir / "sweep_aggregate.csv" + json_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") + write_csv(csv_path, rows) + + print(f"Wrote: {json_path}") + print(f"Wrote: {csv_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/generate_qwen35_low_band_exports.py b/datasets/isb1/scripts/generate_qwen35_low_band_exports.py new file mode 100755 index 000000000..51be8b531 --- /dev/null +++ b/datasets/isb1/scripts/generate_qwen35_low_band_exports.py @@ -0,0 +1,98 @@ 
+#!/usr/bin/env python3 +"""Generate dedicated Qwen 3.5 ISB1 export bundles for 8k/32k/64k lanes. + +These files are derived from the committed generic export bundles by selecting only +GPT-OSS cells that are actually runnable (`supported` or `reviewed_preview`), then +rewriting model identity fields to the Qwen 3.5 replay identity while keeping trace +payloads unchanged. +""" + +from __future__ import annotations + +import json +from copy import deepcopy +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[3] +EXPORT_ROOT = ROOT / "datasets" / "isb1" / "exports" + +QWEN_MODEL_ID = "qwen3_5_397b_a17b" +GPTOSS_MODEL_ID = "gpt_oss_120b" +ALLOWED_SUPPORT_STATUSES = {"supported", "reviewed_preview"} + +TARGETS = [ + ("core", "8k1k", "chat", "vllm"), + ("core", "8k1k", "chat", "sglang"), + ("core", "8k1k", "code", "vllm"), + ("core", "8k1k", "code", "sglang"), + ("extension_32k", "32k1k", "chat", "vllm"), + ("extension_32k", "32k1k", "chat", "sglang"), + ("extension_32k", "32k1k", "code", "vllm"), + ("extension_32k", "32k1k", "code", "sglang"), + ("extension_64k", "64k1k", "chat", "vllm"), + ("extension_64k", "64k1k", "chat", "sglang"), + ("extension_64k", "64k1k", "code", "vllm"), + ("extension_64k", "64k1k", "code", "sglang"), +] + + +def _source_path(lane: str, shape: str, surface: str, engine: str) -> Path: + return EXPORT_ROOT / lane / engine / f"{surface}_{shape}.json" + + +def _target_path(lane: str, shape: str, surface: str, engine: str) -> Path: + return EXPORT_ROOT / lane / engine / f"{surface}_{shape}_qwen3.5.json" + + +def _rewrite_bundle_id(bundle_id: str, lane: str, engine: str, surface: str, shape: str) -> str: + expected_prefix = f"isb1_{lane}_{engine}_{surface}_{shape}" + if bundle_id != expected_prefix: + raise ValueError( + f"Unexpected bundle_id {bundle_id!r}; expected {expected_prefix!r} for {lane}/{engine}/{surface}_{shape}" + ) + return f"{bundle_id}_qwen3_5" + + +def _rewrite_cell(cell: dict) -> dict: + rewritten = deepcopy(cell) + rewritten["canonical_model_id"] = QWEN_MODEL_ID + rewritten["thinking_history_policy"] = "strip_reasoning" + rewritten["history_projection_mode"] = "strip_reasoning_history" + rewritten["support_status"] = "reviewed_preview" + return rewritten + + +def build_export(lane: str, shape: str, surface: str, engine: str) -> tuple[Path, int]: + source_path = _source_path(lane, shape, surface, engine) + target_path = _target_path(lane, shape, surface, engine) + + payload = json.loads(source_path.read_text()) + exports = payload.get("exports") + if not isinstance(exports, list): + raise ValueError(f"Missing exports list in {source_path}") + + filtered = [ + _rewrite_cell(cell) + for cell in exports + if cell.get("canonical_model_id") == GPTOSS_MODEL_ID + and cell.get("support_status") in ALLOWED_SUPPORT_STATUSES + ] + if not filtered: + raise ValueError(f"No runnable GPT-OSS cells found in {source_path}") + + payload["bundle_id"] = _rewrite_bundle_id(payload.get("bundle_id"), lane, engine, surface, shape) + payload["exports"] = filtered + + target_path.write_text(json.dumps(payload, indent=2) + "\n") + return target_path, len(filtered) + + +def main() -> int: + for lane, shape, surface, engine in TARGETS: + target_path, count = build_export(lane, shape, surface, engine) + print(f"wrote {target_path.relative_to(ROOT)} ({count} cells)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_analyze_sweep.py b/datasets/isb1/scripts/gmi_analyze_sweep.py new file mode 100644 index 
000000000..d0c3465b2 --- /dev/null +++ b/datasets/isb1/scripts/gmi_analyze_sweep.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +import subprocess +import sys +from pathlib import Path +from statistics import median +from typing import Any + +from isb1_results_db import render_table + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze KV sweep runs from ISB1 SQLite results.") + parser.add_argument("--db-path", required=True, help="Path to SQLite DB (isb1_results.db)") + parser.add_argument("--output-dir", default=".", help="Directory to write summary outputs") + parser.add_argument("--pareto", action="store_true", help="Also run plot_pareto.py") + parser.add_argument( + "--distributions", + action="store_true", + help="Also run analyze_benchmark_distributions.py", + ) + parser.add_argument("--export-file", default=None, help="Export JSON for --distributions") + parser.add_argument("--trace-dir", default=None, help="Trace directory for --distributions") + return parser.parse_args() + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def _extract_concurrency(raw_result_json: str | None) -> int | None: + if not raw_result_json: + return None + try: + payload = json.loads(raw_result_json) + except json.JSONDecodeError: + return None + return _to_int(payload.get("conc") or payload.get("max_concurrency")) + + +def percentile(values: list[float], p: float) -> float | None: + if not values: + return None + ordered = sorted(values) + if len(ordered) == 1: + return ordered[0] + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def load_rows(db_path: Path) -> list[dict[str, Any]]: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + """ + SELECT + id, + offload_mode, + ttft_p50_ms, + ttft_p99_ms, + throughput_tok_s, + preemption_count, + status, + raw_result_json + FROM benchmark_runs + WHERE offload_mode IS NOT NULL + ORDER BY id ASC + """ + ).fetchall() + conn.close() + + normalized: list[dict[str, Any]] = [] + for row in rows: + concurrency = _extract_concurrency(row["raw_result_json"]) + normalized.append( + { + "offload_mode": row["offload_mode"], + "concurrency": concurrency, + "ttft_p50_ms": _to_float(row["ttft_p50_ms"]), + "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), + "throughput_tok_s": _to_float(row["throughput_tok_s"]), + "preemption_count": _to_int(row["preemption_count"]) or 0, + "status": row["status"], + } + ) + return normalized + + +def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: + grouped: dict[tuple[str, int], list[dict[str, Any]]] = {} + for row in rows: + if row["concurrency"] is None: + continue + key = (row["offload_mode"], row["concurrency"]) + grouped.setdefault(key, []).append(row) + + summary_rows: list[dict[str, Any]] = [] + for (offload_mode, concurrency), items in sorted(grouped.items(), key=lambda x: (x[0][0], x[0][1])): + ttft_p50_values = [x["ttft_p50_ms"] for x in items if x["ttft_p50_ms"] is not None] + ttft_p99_values = [x["ttft_p99_ms"] for x in items if x["ttft_p99_ms"] is not None] + 
throughput_values = [x["throughput_tok_s"] for x in items if x["throughput_tok_s"] is not None] + preemptions = [x["preemption_count"] for x in items] + success_count = sum(1 for x in items if x["status"] == "success") + + summary_rows.append( + { + "offload_mode": offload_mode, + "concurrency": concurrency, + "runs": len(items), + "success_runs": success_count, + "ttft_p50_ms": median(ttft_p50_values) if ttft_p50_values else None, + "ttft_p99_ms": percentile(ttft_p99_values, 0.99), + "throughput_tok_s": median(throughput_values) if throughput_values else None, + "preemptions": int(median(preemptions)) if preemptions else 0, + } + ) + + return { + "total_rows": len(rows), + "grouped_rows": len(summary_rows), + "summary": summary_rows, + } + + +def write_summary_json(output_dir: Path, summary: dict[str, Any]) -> Path: + output_path = output_dir / "sweep_summary.json" + output_path.write_text(json.dumps(summary, indent=2)) + return output_path + + +def write_pareto_csv(output_dir: Path, summary: dict[str, Any]) -> Path: + output_path = output_dir / "pareto_data.csv" + with output_path.open("w", newline="") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms"]) + for row in summary["summary"]: + writer.writerow( + [ + row["offload_mode"], + row["concurrency"], + row["throughput_tok_s"], + row["ttft_p99_ms"], + ] + ) + return output_path + + +def print_console_summary(summary: dict[str, Any]) -> None: + headers = [ + "offload_mode", + "concurrency", + "runs", + "success_runs", + "ttft_p50_ms", + "ttft_p99_ms", + "throughput_tok_s", + "preemptions", + ] + rows = [ + [ + row["offload_mode"], + row["concurrency"], + row["runs"], + row["success_runs"], + row["ttft_p50_ms"], + row["ttft_p99_ms"], + row["throughput_tok_s"], + row["preemptions"], + ] + for row in summary["summary"] + ] + + print(f"Total rows: {summary['total_rows']}") + print(f"Grouped rows: {summary['grouped_rows']}") + if rows: + print(render_table(headers, rows)) + else: + print("No sweep rows with offload_mode + concurrency found.") + + +def main() -> int: + args = parse_args() + db_path = Path(args.db_path) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + rows = load_rows(db_path) + summary = summarize(rows) + summary_path = write_summary_json(output_dir, summary) + pareto_path = write_pareto_csv(output_dir, summary) + + print_console_summary(summary) + print(f"Wrote: {summary_path}") + print(f"Wrote: {pareto_path}") + + script_dir = Path(__file__).resolve().parent + + if args.pareto: + pareto_cmd = [ + sys.executable, + str(script_dir / "plot_pareto.py"), + "--db-path", + str(db_path), + "--output-dir", + str(output_dir), + ] + subprocess.run(pareto_cmd, check=True) + + if args.distributions: + dist_cmd = [ + sys.executable, + str(script_dir / "analyze_benchmark_distributions.py"), + "--output-dir", + str(output_dir), + ] + if args.export_file: + dist_cmd.extend(["--export-file", args.export_file]) + elif args.trace_dir: + dist_cmd.extend(["--trace-dir", args.trace_dir]) + else: + raise SystemExit("--distributions requires --export-file or --trace-dir") + subprocess.run(dist_cmd, check=True) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_full_suite.sh b/datasets/isb1/scripts/gmi_full_suite.sh new file mode 100755 index 000000000..fad23efc1 --- /dev/null +++ b/datasets/isb1/scripts/gmi_full_suite.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +set -Eeuo 
pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh" + +usage() { + echo "Usage: gmi_full_suite.sh --gpu-type <h100|h200|b200> [--db-path <path>]" +} + +GPU_TYPE="" +DB_PATH="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) + GPU_TYPE="$2" + shift 2 + ;; + --db-path) + DB_PATH="$2" + shift 2 + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown: $1" >&2 + exit 1 + ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || { + usage >&2 + exit 1 +} + +case "$GPU_TYPE" in + h100|h200|b200) ;; + *) + echo "Unsupported --gpu-type: $GPU_TYPE" >&2 + exit 1 + ;; +esac + +[[ -x "$PORTABLE_SCRIPT" ]] || { + echo "Expected executable helper at $PORTABLE_SCRIPT" >&2 + exit 1 +} + +if [[ -n "$DB_PATH" ]]; then + export ISB1_RESULTS_DB_PATH="$DB_PATH" +fi + +PASSED=0 +FAILED=0 +SKIPPED=0 + +run_combo() { + local model="$1" + local engine="$2" + local band="$3" + local workload="${4:-code}" + + echo "=========================================" + echo ">>> $model × $engine × $band × $workload on $GPU_TYPE" + echo "=========================================" + + if "$PORTABLE_SCRIPT" \ + --gpu-type "$GPU_TYPE" \ + --model "$model" \ + --engine "$engine" \ + --context-band "$band" \ + --workload "$workload"; then + ((PASSED++)) || true + else + echo "FAILED: $model × $engine × $band × $workload" >&2 + ((FAILED++)) || true + fi +} + +# Core 8k — all models × all engines × chat + code +for model in qwen3.5 gptoss dsr1; do + for engine in vllm sglang; do + for workload in chat code; do + run_combo "$model" "$engine" 8k "$workload" + done + done +done + +# 131k — all models × all engines × chat + code +for model in qwen3.5 gptoss dsr1; do + for engine in vllm sglang; do + for workload in chat code; do + run_combo "$model" "$engine" 131k "$workload" + done + done +done + +# 500k — qwen3.5 + gptoss only (DSR1 max context=164k, exceeds model capability) +for model in qwen3.5 gptoss; do + for engine in vllm sglang; do + for workload in chat code; do + run_combo "$model" "$engine" 500k "$workload" + done + done +done + +# 1m — qwen3.5 only (only model supporting 1M context), b200 only +if [[ "$GPU_TYPE" == "b200" ]]; then + for engine in vllm sglang; do + for workload in chat code; do + run_combo qwen3.5 "$engine" 1m "$workload" + done + done +else + SKIPPED=4 +fi + +echo +echo "=========================================" +echo "SUITE COMPLETE: passed=$PASSED failed=$FAILED skipped=$SKIPPED" +echo "=========================================" + +if command -v python3 >/dev/null 2>&1; then + summary_cmd=(python3 "$SCRIPT_DIR/isb1_results_db.py" summary) + if [[ -n "$DB_PATH" ]]; then + summary_cmd+=(--db-path "$DB_PATH") + fi + "${summary_cmd[@]}" 2>/dev/null || true +fi + +[[ "$FAILED" -eq 0 ]] diff --git a/datasets/isb1/scripts/gmi_kv_sweep.sh b/datasets/isb1/scripts/gmi_kv_sweep.sh new file mode 100644 index 000000000..e953aba1a --- /dev/null +++ b/datasets/isb1/scripts/gmi_kv_sweep.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh" + +usage() { + cat <<'EOF' +Usage: + gmi_kv_sweep.sh \ + --gpu-type <h100|h200|b200> \ + --model <qwen3.5|gptoss|dsr1> \ + --engine <vllm|sglang> \ + --context-band <8k|32k|64k|131k|500k|1m> \ + --workload <chat|code> \ + [--users "2,4,8,16,32,64"] \ + [--offload-modes "on,off,noprefix"] \ + [--kv-cache-dtype <auto|fp8>] \ + [--benchmark-duration-s <seconds>] \ + [--disable-prefix-caching] \ + [--total-cpu-dram-gb <gb>] \ + [--trace-source <isb1|kv_cache_tester|aiperf>] \ + [--db-path <path>] +EOF +} + +die() { + echo "ERROR: $*" >&2 
+ exit 1 +} + +trim() { + local x="$1" + x="${x#${x%%[![:space:]]*}}" + x="${x%${x##*[![:space:]]}}" + printf '%s' "$x" +} + +GPU_TYPE="" +MODEL="" +ENGINE="" +CONTEXT_BAND="" +WORKLOAD="" +USERS="2,4,8,16,32,64" +OFFLOAD_MODES="on,off,noprefix" +KV_CACHE_DTYPE="" +BENCHMARK_DURATION_S="1800" +DISABLE_PREFIX_CACHING=false +TOTAL_CPU_DRAM_GB="" +TRACE_SOURCE="isb1" +DB_PATH="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) GPU_TYPE="$2"; shift 2 ;; + --model) MODEL="$2"; shift 2 ;; + --engine) ENGINE="$2"; shift 2 ;; + --context-band) CONTEXT_BAND="$2"; shift 2 ;; + --workload) WORKLOAD="$2"; shift 2 ;; + --users) USERS="$2"; shift 2 ;; + --offload-modes) OFFLOAD_MODES="$2"; shift 2 ;; + --kv-cache-dtype) KV_CACHE_DTYPE="$2"; shift 2 ;; + --benchmark-duration-s) BENCHMARK_DURATION_S="$2"; shift 2 ;; + --disable-prefix-caching) DISABLE_PREFIX_CACHING=true; shift ;; + --total-cpu-dram-gb) TOTAL_CPU_DRAM_GB="$2"; shift 2 ;; + --trace-source) TRACE_SOURCE="$2"; shift 2 ;; + --db-path) DB_PATH="$2"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) die "Unknown argument: $1" ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required" +[[ -n "$MODEL" ]] || die "--model is required" +[[ -n "$ENGINE" ]] || die "--engine is required" +[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required" +[[ -n "$WORKLOAD" ]] || die "--workload is required" +[[ -x "$PORTABLE_SCRIPT" ]] || die "Expected executable script: $PORTABLE_SCRIPT" + +case "$ENGINE" in + vllm|sglang) ;; + *) die "Unsupported --engine: $ENGINE" ;; +esac + +case "$TRACE_SOURCE" in + isb1|kv_cache_tester|aiperf) ;; + *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; +esac + +IFS=',' read -r -a user_list <<< "$USERS" +IFS=',' read -r -a mode_list <<< "$OFFLOAD_MODES" + +[[ "${#user_list[@]}" -gt 0 ]] || die "--users cannot be empty" +[[ "${#mode_list[@]}" -gt 0 ]] || die "--offload-modes cannot be empty" + +TOTAL=0 +PASSED=0 +FAILED=0 + +for raw_mode in "${mode_list[@]}"; do + mode=$(trim "$raw_mode") + [[ -n "$mode" ]] || continue + + case "$mode" in + on|off|noprefix|legacy) ;; + *) die "Unsupported offload mode in --offload-modes: $mode" ;; + esac + + if [[ "$ENGINE" == "sglang" && "$mode" == "on" ]]; then + echo "Skipping mode=on for SGLang (no native offload support)" + continue + fi + + for raw_users in "${user_list[@]}"; do + users=$(trim "$raw_users") + [[ "$users" =~ ^[0-9]+$ ]] || die "Invalid user concurrency: $users" + + TOTAL=$((TOTAL + 1)) + echo "========================================================" + echo "Run $TOTAL: model=$MODEL engine=$ENGINE users=$users mode=$mode" + echo "========================================================" + + cmd=( + "$PORTABLE_SCRIPT" + --gpu-type "$GPU_TYPE" + --model "$MODEL" + --engine "$ENGINE" + --context-band "$CONTEXT_BAND" + --workload "$WORKLOAD" + --benchmark-type isb1_kv_stress + --benchmark-duration-s "$BENCHMARK_DURATION_S" + --max-concurrency "$users" + --trace-source "$TRACE_SOURCE" + --offload-mode "$mode" + ) + + if [[ -n "$KV_CACHE_DTYPE" ]]; then + cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE") + fi + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--disable-prefix-caching) + fi + if [[ -n "$TOTAL_CPU_DRAM_GB" ]]; then + cmd+=(--total-cpu-dram-gb "$TOTAL_CPU_DRAM_GB") + fi + if [[ -n "$DB_PATH" ]]; then + if ISB1_RESULTS_DB_PATH="$DB_PATH" "${cmd[@]}"; then + PASSED=$((PASSED + 1)) + echo "PASS users=$users mode=$mode" + else + FAILED=$((FAILED + 1)) + echo "FAIL users=$users mode=$mode" >&2 + fi + else + if "${cmd[@]}"; then + 
PASSED=$((PASSED + 1)) + echo "PASS users=$users mode=$mode" + else + FAILED=$((FAILED + 1)) + echo "FAIL users=$users mode=$mode" >&2 + fi + fi + done +done + +echo +echo "KV sweep complete" +echo " total: $TOTAL" +echo " passed: $PASSED" +echo " failed: $FAILED" + +if [[ -n "$DB_PATH" && -f "$DB_PATH" ]]; then + echo " db: $DB_PATH" +fi + +[[ "$FAILED" -eq 0 ]] diff --git a/datasets/isb1/scripts/gmi_portable_benchmark.sh b/datasets/isb1/scripts/gmi_portable_benchmark.sh new file mode 100755 index 000000000..f41722e36 --- /dev/null +++ b/datasets/isb1/scripts/gmi_portable_benchmark.sh @@ -0,0 +1,1019 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +usage() { + cat <<'EOF' +Usage: + gmi_portable_benchmark.sh \ + --gpu-type <h100|h200|b200> \ + --model <qwen3.5|gptoss|dsr1> \ + --engine <vllm|sglang> \ + --context-band <8k|32k|64k|131k|500k|1m> \ + --workload <chat|code> \ + [--benchmark-type <isb1_replay|isb1_kv_stress>] \ + [--offload-mode <on|off|noprefix|legacy>] \ + [--kv-cache-dtype <auto|fp8>] \ + [--disable-prefix-caching] \ + [--total-cpu-dram-gb <gb>] \ + [--benchmark-duration-s <seconds>] \ + [--max-concurrency <n>] \ + [--trace-source <isb1|kv_cache_tester|aiperf>] + +Required environment: + HF_TOKEN or HUGGING_FACE_HUB_TOKEN Hugging Face token for model access + +Optional environment: + PORT API port (default: 8000) + TP Tensor parallelism (default: 8) + HEALTH_TIMEOUT_S Readiness timeout in seconds (default: 1800) + HEALTH_POLL_INTERVAL_S Readiness poll interval (default: 10) + BENCHMARK_OUTPUT_ROOT Output root (default: <repo-root>/datasets/isb1/results/gmi) + GMI_RUN_LABEL Optional suffix added to result names +EOF +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || die "Missing required command: $1" +} + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +REPO_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd) +source "$REPO_ROOT/benchmarks/benchmark_lib.sh" +PORT=${PORT:-8000} +TP=${TP:-8} +HEALTH_TIMEOUT_S=${HEALTH_TIMEOUT_S:-1800} +HEALTH_POLL_INTERVAL_S=${HEALTH_POLL_INTERVAL_S:-10} +BENCHMARK_OUTPUT_ROOT=${BENCHMARK_OUTPUT_ROOT:-"$REPO_ROOT/datasets/isb1/results/gmi"} +REQUEST_MODE=multi-turn +HARNESS_REQUEST_MODE=auto +IGNORE_WAITS=true + +GPU_TYPE="" +MODEL_KEY="" +ENGINE="" +CONTEXT_BAND="" +WORKLOAD="" +BENCHMARK_TYPE="isb1_replay" +OFFLOAD_MODE="" +KV_CACHE_DTYPE="" +DISABLE_PREFIX_CACHING=false +TOTAL_CPU_DRAM_GB="" +BENCHMARK_DURATION_S="" +MAX_CONCURRENCY_OVERRIDE="" +TRACE_SOURCE="isb1" + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) + GPU_TYPE="$2" + shift 2 + ;; + --model) + MODEL_KEY="$2" + shift 2 + ;; + --engine) + ENGINE="$2" + shift 2 + ;; + --context-band) + CONTEXT_BAND="$2" + shift 2 + ;; + --workload) + WORKLOAD="$2" + shift 2 + ;; + --benchmark-type) + BENCHMARK_TYPE="$2" + shift 2 + ;; + --offload-mode) + OFFLOAD_MODE="$2" + shift 2 + ;; + --kv-cache-dtype) + KV_CACHE_DTYPE="$2" + shift 2 + ;; + --disable-prefix-caching) + DISABLE_PREFIX_CACHING=true + shift + ;; + --total-cpu-dram-gb) + TOTAL_CPU_DRAM_GB="$2" + shift 2 + ;; + --benchmark-duration-s) + BENCHMARK_DURATION_S="$2" + shift 2 + ;; + --max-concurrency) + MAX_CONCURRENCY_OVERRIDE="$2" + shift 2 + ;; + --trace-source) + TRACE_SOURCE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + die "Unknown argument: $1" + ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required" +[[ -n "$MODEL_KEY" ]] || die "--model is required" +[[ -n "$ENGINE" ]] || die "--engine is required" +[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required" +[[ -n "$WORKLOAD" ]] || die "--workload is required" + +case "$GPU_TYPE" in + h100|h200|b200) ;; + *) die "Unsupported --gpu-type: $GPU_TYPE" ;; +esac + +case 
"$ENGINE" in + vllm|sglang) ;; + *) die "Unsupported --engine: $ENGINE" ;; +esac + +case "$CONTEXT_BAND" in + 8k|32k|64k|131k|500k|1m) ;; + *) die "Unsupported --context-band: $CONTEXT_BAND" ;; +esac + +case "$WORKLOAD" in + chat|code) ;; + *) die "Unsupported --workload: $WORKLOAD (must be chat or code)" ;; +esac + +case "$BENCHMARK_TYPE" in + isb1_replay|isb1_kv_stress) ;; + *) die "Unsupported --benchmark-type: $BENCHMARK_TYPE" ;; +esac + +case "$TRACE_SOURCE" in + isb1|kv_cache_tester|aiperf) ;; + *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; +esac + +case "${OFFLOAD_MODE:-}" in + ""|on|off|noprefix|legacy) ;; + *) die "Unsupported --offload-mode: $OFFLOAD_MODE" ;; +esac + +case "${KV_CACHE_DTYPE:-}" in + ""|auto|fp8) ;; + *) die "Unsupported --kv-cache-dtype: $KV_CACHE_DTYPE" ;; +esac + +if [[ -n "$TOTAL_CPU_DRAM_GB" ]] && ! [[ "$TOTAL_CPU_DRAM_GB" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + die "--total-cpu-dram-gb must be numeric" +fi +if [[ -n "$MAX_CONCURRENCY_OVERRIDE" ]] && ! [[ "$MAX_CONCURRENCY_OVERRIDE" =~ ^[0-9]+$ ]]; then + die "--max-concurrency must be a positive integer" +fi +if [[ -n "$BENCHMARK_DURATION_S" ]] && ! [[ "$BENCHMARK_DURATION_S" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + die "--benchmark-duration-s must be numeric" +fi + +require_cmd docker +require_cmd curl +require_cmd python3 +require_cmd nvidia-smi + +HF_TOKEN_VALUE=${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}} +[[ -n "$HF_TOKEN_VALUE" ]] || die "Set HF_TOKEN or HUGGING_FACE_HUB_TOKEN before running" + +if [[ -z "$TOTAL_CPU_DRAM_GB" ]]; then + if [[ -r /proc/meminfo ]]; then + TOTAL_CPU_DRAM_GB=$(awk '/MemTotal:/ {printf "%.0f", $2/1048576}' /proc/meminfo) + else + TOTAL_CPU_DRAM_GB=0 + fi +fi + +case "$MODEL_KEY" in + qwen3.5) + MODEL_HF_ID="Qwen/Qwen3.5-397B-A17B-FP8" + MODEL_PREFIX="qwen3.5" + CANONICAL_MODEL_ID="qwen3_5_397b_a17b" + PRECISION="fp8" + ;; + gptoss) + MODEL_HF_ID="openai/gpt-oss-120b" + MODEL_PREFIX="gptoss" + CANONICAL_MODEL_ID="gpt_oss_120b" + PRECISION="fp4" + ;; + dsr1) + MODEL_HF_ID="deepseek-ai/DeepSeek-R1-0528" + MODEL_PREFIX="dsr1" + CANONICAL_MODEL_ID="deepseek_r1_0528" + PRECISION="fp8" + ;; + *) + die "Unsupported --model: $MODEL_KEY" + ;; +esac + +case "$GPU_TYPE" in + b200) + HARDWARE_PROFILE_ID="nvidia:b200_sxm_180gb" + RUNNER_TYPE="b200-gmi-baremetal" + ;; + h100) + HARDWARE_PROFILE_ID="nvidia:h100_sxm_80gb" + RUNNER_TYPE="h100-gmi-baremetal" + ;; + h200) + HARDWARE_PROFILE_ID="nvidia:h200_sxm_141gb" + RUNNER_TYPE="h200-gmi-baremetal" + ;; +esac + +case "$ENGINE" in + vllm) + RUNTIME_STACK_ID="standalone:vllm" + if [[ "$GPU_TYPE" == "b200" ]]; then + IMAGE="vllm/vllm-openai:v0.19.0-cu130" + else + IMAGE="vllm/vllm-openai:v0.18.0" + fi + ;; + sglang) + RUNTIME_STACK_ID="standalone:sglang" + IMAGE="lmsysorg/sglang:v0.5.9-cu130" + ;; +esac + +case "$CONTEXT_BAND" in + 8k) + MAX_MODEL_LEN=10240 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=8192 + MAX_ACTIVE_REQUESTS=128 + ;; + 32k) + MAX_MODEL_LEN=33792 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=8192 + MAX_ACTIVE_REQUESTS=64 + ;; + 64k) + MAX_MODEL_LEN=66560 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=4096 + MAX_ACTIVE_REQUESTS=64 + ;; + 131k) + MAX_MODEL_LEN=132296 + MAX_CONCURRENCY=2 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=2048 + MAX_ACTIVE_REQUESTS=32 + ;; + 500k) + 
MAX_MODEL_LEN=524288 + MAX_CONCURRENCY=1 + NUM_WARMUP_SESSIONS=0 + MAX_SESSIONS=2 + MAX_TURNS_PER_SESSION=4 + MAX_NUM_BATCHED_TOKENS=1024 + MAX_ACTIVE_REQUESTS=8 + ;; + 1m) + MAX_MODEL_LEN=1048576 + MAX_CONCURRENCY=1 + NUM_WARMUP_SESSIONS=0 + MAX_SESSIONS=1 + MAX_TURNS_PER_SESSION=3 + MAX_NUM_BATCHED_TOKENS=1024 + MAX_ACTIVE_REQUESTS=4 + ;; +esac + +if [[ -n "$MAX_CONCURRENCY_OVERRIDE" ]]; then + MAX_CONCURRENCY="$MAX_CONCURRENCY_OVERRIDE" +fi + +select_export_file() { + case "$MODEL_KEY:$CONTEXT_BAND:$ENGINE:$WORKLOAD" in + # ── Chat exports (committed at 8k–131k) ────────────────────── + qwen3.5:8k:*:chat) + printf 'datasets/isb1/exports/core/%s/chat_8k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:32k:*:chat) + printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:64k:*:chat) + printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k_qwen3.5.json\n' "$ENGINE" + ;; + *:8k:*:chat) + printf 'datasets/isb1/exports/core/%s/chat_8k1k.json\n' "$ENGINE" + ;; + *:32k:*:chat) + printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k.json\n' "$ENGINE" + ;; + *:64k:*:chat) + printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k.json\n' "$ENGINE" + ;; + gptoss:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k.json\n' "$ENGINE" + ;; + qwen3.5:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_qwen3.5.json\n' "$ENGINE" + ;; + dsr1:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_dsr1.json\n' "$ENGINE" + ;; + gptoss:500k:*:chat) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + qwen3.5:500k:*:chat) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + # dsr1:500k:chat — model max 164k, exceeds capability + qwen3.5:1m:*:chat) + printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" + ;; + # dsr1:1m:chat, gptoss:1m:chat — models don't support 1M context + + # ── Code exports ────────────────────────────────────────────── + qwen3.5:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:500k:*:code) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + qwen3.5:1m:*:code) + printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" + ;; + gptoss:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" + ;; + gptoss:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" + ;; + gptoss:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" + ;; + gptoss:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" + ;; + gptoss:500k:*:code) + printf 
'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + # gptoss:1m — GPT-OSS max_position_embeddings=131072; 1M exceeds model capability + dsr1:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" + ;; + dsr1:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" + ;; + dsr1:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" + ;; + dsr1:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" + ;; + # dsr1:500k/1m — DeepSeek R1 max_position_embeddings=163840; 500k/1M exceed model capability + *) + return 1 + ;; + esac +} + +TRACE_DIR="" +TRACE_REPLAY_SUMMARY_JSON="" +if [[ "$TRACE_SOURCE" == "isb1" ]]; then + EXPORT_FILE=$(select_export_file) || die "No committed ISB1 export for model=$MODEL_KEY engine=$ENGINE context=$CONTEXT_BAND workload=$WORKLOAD" + EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" + [[ -f "$EXPORT_PATH" ]] || die "Export file not found: $EXPORT_FILE" + + readarray -t EXPORT_METADATA < <( + python3 - "$EXPORT_PATH" "$RUNTIME_STACK_ID" "$HARDWARE_PROFILE_ID" "$CANONICAL_MODEL_ID" <<'PY' +import json +import sys +from pathlib import Path + +export_path = Path(sys.argv[1]) +runtime_stack_id = sys.argv[2] +hardware_profile_id = sys.argv[3] +canonical_model_id = sys.argv[4] +payload = json.loads(export_path.read_text()) +matches = [ + cell + for cell in payload.get("exports", []) + if cell.get("runtime_stack_id") == runtime_stack_id + and cell.get("hardware_profile_id") == hardware_profile_id + and cell.get("canonical_model_id") == canonical_model_id +] +if not matches: + raise SystemExit( + f"No matching export cells for runtime={runtime_stack_id} hardware={hardware_profile_id} model={canonical_model_id}" + ) +support_statuses = sorted({cell.get("support_status") for cell in matches if cell.get("support_status")}) +cert_statuses = sorted( + {cell.get("benchmark_certification_status") for cell in matches if cell.get("benchmark_certification_status")} +) +trace_ids = sorted({cell.get("trace_id") for cell in matches if cell.get("trace_id")}) +if len(support_statuses) > 1: + raise SystemExit(f"Ambiguous support statuses: {support_statuses}") +if len(cert_statuses) > 1: + raise SystemExit(f"Ambiguous certification statuses: {cert_statuses}") +print(support_statuses[0] if support_statuses else "") +print(cert_statuses[0] if cert_statuses else "") +print(",".join(trace_ids)) +print(len(matches)) +PY + ) + + SUPPORT_STATUS=${EXPORT_METADATA[0]} + BENCHMARK_CERTIFICATION_STATUS=${EXPORT_METADATA[1]} + TRACE_IDS=${EXPORT_METADATA[2]} + MATCHED_CELL_COUNT=${EXPORT_METADATA[3]} +else + SUPPORT_STATUS=${SUPPORT_STATUS:-reviewed_preview} + BENCHMARK_CERTIFICATION_STATUS=${BENCHMARK_CERTIFICATION_STATUS:-dataset_replay_verified} + TRACE_IDS="$TRACE_SOURCE" + MATCHED_CELL_COUNT="n/a" + if [[ "$TRACE_SOURCE" == "kv_cache_tester" ]]; then + TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces"} + EXPORT_FILE="experimental/multiturn/vllm_benchmark/trace_source_kv_cache_tester.json" + else + TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/aiperf_traces"} + EXPORT_FILE="experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json" + fi + EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" +fi + +case "$ENGINE" in + vllm) + VLLM_CPU_OFFLOAD_GB="" + VLLM_SWAP_SPACE_GB="" + if [[ "$CONTEXT_BAND" == "500k" ]]; then + 
VLLM_CPU_OFFLOAD_GB=40 + VLLM_SWAP_SPACE_GB=32 + elif [[ "$CONTEXT_BAND" == "1m" ]]; then + VLLM_CPU_OFFLOAD_GB=80 + VLLM_SWAP_SPACE_GB=64 + fi + case "$CONTEXT_BAND" in + 8k|32k) VLLM_MAX_NUM_SEQS=128 ;; + 64k) VLLM_MAX_NUM_SEQS=64 ;; + 131k) VLLM_MAX_NUM_SEQS=32 ;; + 500k) VLLM_MAX_NUM_SEQS=8 ;; + 1m) VLLM_MAX_NUM_SEQS=4 ;; + esac + ;; + sglang) + case "$GPU_TYPE" in + h100) + SGLANG_MEM_FRACTION_STATIC=0.80 + SGLANG_CHUNKED_PREFILL_SIZE=8192 + ;; + h200) + SGLANG_MEM_FRACTION_STATIC=0.82 + SGLANG_CHUNKED_PREFILL_SIZE=16384 + ;; + b200) + SGLANG_MEM_FRACTION_STATIC=0.85 + SGLANG_CHUNKED_PREFILL_SIZE=32768 + ;; + esac + if [[ "$CONTEXT_BAND" == "500k" || "$CONTEXT_BAND" == "1m" ]]; then + SGLANG_MEM_FRACTION_STATIC=0.85 + SGLANG_CHUNKED_PREFILL_SIZE=8192 + fi + ;; +esac + +DATE_STAMP=$(date +%Y%m%d-%H%M%S) +SAFE_CONTEXT=${CONTEXT_BAND//[^[:alnum:]]/_} +SAFE_MODEL=${MODEL_KEY//[^[:alnum:]._-]/_} +SAFE_ENGINE=${ENGINE//[^[:alnum:]._-]/_} +SAFE_GPU=${GPU_TYPE//[^[:alnum:]._-]/_} +SAFE_WORKLOAD=${WORKLOAD//[^[:alnum:]._-]/_} +RUN_LABEL=${GMI_RUN_LABEL:-} +if [[ -n "$RUN_LABEL" ]]; then + RUN_LABEL="-${RUN_LABEL//[^[:alnum:]._-]/_}" +fi +RESULT_STEM="gmi-${SAFE_GPU}-${SAFE_MODEL}-${SAFE_ENGINE}-${SAFE_WORKLOAD}-${SAFE_CONTEXT}-${DATE_STAMP}${RUN_LABEL}" +RUN_DIR="$BENCHMARK_OUTPUT_ROOT/$RESULT_STEM" +SERVER_LOG="$RUN_DIR/server.log" +SUMMARY_JSON="$RUN_DIR/agg_${RESULT_STEM}.json" +TRACE_REPLAY_SUMMARY_JSON="$RUN_DIR/trace_replay_summary.json" +GPU_PROFILE_CSV="$RUN_DIR/${RESULT_STEM}_gpu_profile.csv" +GPU_PROFILER_PID="" +GPU_MEM_PEAK=0 +GPU_MEM_AVG=0 +GPU_UTIL_AVG=0 +mkdir -p "$RUN_DIR" +mkdir -p "$HOME/.cache/huggingface" + +CONTAINER_NAME="isb1-${RESULT_STEM}" +LOG_TAIL_PID="" +CONTAINER_ID="" +ISB1_RESULTS_DB_PATH=${ISB1_RESULTS_DB_PATH:-} + +stop_gpu_profiler() { + if [[ -n "$GPU_PROFILER_PID" ]]; then + kill "$GPU_PROFILER_PID" >/dev/null 2>&1 || true + wait "$GPU_PROFILER_PID" >/dev/null 2>&1 || true + GPU_PROFILER_PID="" + fi +} + +cleanup() { + local exit_code=$? 
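+ # Best-effort teardown: stop the GPU profiler and log tail, force-remove the + # server container, then exit with the benchmark's original status.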
+ set +e + stop_gpu_profiler + if [[ -n "$LOG_TAIL_PID" ]]; then + kill "$LOG_TAIL_PID" >/dev/null 2>&1 || true + fi + if [[ -n "$CONTAINER_NAME" ]]; then + docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true + fi + exit $exit_code +} +trap cleanup EXIT + +launch_server() { + # Apply YaRN for Qwen long-context + apply_yarn_config_if_needed "$MODEL_HF_ID" "$MAX_MODEL_LEN" 2>/dev/null || true + + local docker_cmd=() + docker_cmd=( + docker run -d --rm + --name "$CONTAINER_NAME" + --gpus all + --ipc host + --network host + --shm-size 16g + -e HF_TOKEN="$HF_TOKEN_VALUE" + -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN_VALUE" + -e NVIDIA_VISIBLE_DEVICES=all + -e PYTHONUNBUFFERED=1 + -v "$HOME/.cache/huggingface:/root/.cache/huggingface" + -v "$REPO_ROOT:/workspace" + -w /workspace + ) + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + docker_cmd+=(-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1) + docker_cmd+=(-e SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1) + fi + + if [[ "$ENGINE" == "vllm" ]]; then + local cmd=( + vllm serve "$MODEL_HF_ID" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size "$TP" + --gpu-memory-utilization 0.90 + --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" + --max-model-len "$MAX_MODEL_LEN" + --max-num-seqs "$VLLM_MAX_NUM_SEQS" + --disable-log-requests + --trust-remote-code + ) + + case "${OFFLOAD_MODE:-}" in + on) + cmd+=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + off) + ;; + noprefix) + cmd+=(--no-enable-prefix-caching) + ;; + legacy|"") + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + cmd+=(--cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB") + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + cmd+=(--swap-space "$VLLM_SWAP_SPACE_GB") + fi + ;; + esac + + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--no-enable-prefix-caching) + fi + + if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then + cmd+=(--kv-cache-dtype fp8) + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + cmd+=(--hf-overrides "$YARN_OVERRIDE_JSON") + fi + + CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") + else + local cmd=( + python3 -m sglang.launch_server + --model-path "$MODEL_HF_ID" + --host 0.0.0.0 + --port "$PORT" + --trust-remote-code + --tensor-parallel-size "$TP" + --data-parallel-size 1 + --context-length "$MAX_MODEL_LEN" + --max-running-requests "$MAX_ACTIVE_REQUESTS" + --cuda-graph-max-bs "$MAX_ACTIVE_REQUESTS" + --chunked-prefill-size "$SGLANG_CHUNKED_PREFILL_SIZE" + --max-prefill-tokens "$SGLANG_CHUNKED_PREFILL_SIZE" + --mem-fraction-static "$SGLANG_MEM_FRACTION_STATIC" + --attention-backend flashinfer + --stream-interval 10 + --decode-log-interval 1 + ) + + case "${OFFLOAD_MODE:-}" in + on) + echo "WARNING: OFFLOAD_MODE=on is not supported for SGLang; continuing without native offload" >&2 + ;; + noprefix) + cmd+=(--disable-radix-cache) + ;; + off|legacy|"") + ;; + esac + + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--disable-radix-cache) + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + cmd+=(--json-model-override-args "$YARN_OVERRIDE_JSON") + fi + + CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") + fi + + [[ -n "$CONTAINER_ID" ]] || die "Failed to start Docker container" + docker logs -f "$CONTAINER_NAME" > "$SERVER_LOG" 2>&1 & + LOG_TAIL_PID=$! +} + +wait_for_server_ready() { + local deadline=$((SECONDS + HEALTH_TIMEOUT_S)) + until curl --output /dev/null --silent --fail "http://127.0.0.1:${PORT}/health"; do + if ! 
docker ps --format '{{.Names}}' | grep -Fxq "$CONTAINER_NAME"; then + echo "Container exited before becoming healthy. Recent logs:" >&2 + docker logs "$CONTAINER_NAME" >&2 || true + return 1 + fi + if (( SECONDS >= deadline )); then + echo "Timed out waiting for http://127.0.0.1:${PORT}/health" >&2 + docker logs "$CONTAINER_NAME" | tail -n 200 >&2 || true + return 1 + fi + sleep "$HEALTH_POLL_INTERVAL_S" + done +} + +echo "==> GMI portable benchmark" +echo "repo: $REPO_ROOT" +echo "gpu-type: $GPU_TYPE" +echo "model: $MODEL_KEY ($MODEL_HF_ID)" +echo "engine: $ENGINE" +echo "context-band: $CONTEXT_BAND" +echo "workload: $WORKLOAD" +echo "benchmark-type: $BENCHMARK_TYPE" +echo "trace-source: $TRACE_SOURCE" +echo "max-concurrency: $MAX_CONCURRENCY" +echo "max-model-len: $MAX_MODEL_LEN" +echo "docker image: $IMAGE" +echo "export-file: $EXPORT_FILE" +if [[ "$TRACE_SOURCE" != "isb1" ]]; then + echo "trace-dir: $TRACE_DIR" +fi +echo "runtime-stack-id: $RUNTIME_STACK_ID" +echo "hardware-profile-id: $HARDWARE_PROFILE_ID" +echo "canonical-model-id: $CANONICAL_MODEL_ID" +echo "support-status: ${SUPPORT_STATUS:-}" +echo "certification: ${BENCHMARK_CERTIFICATION_STATUS:-}" +echo "matched export cells: $MATCHED_CELL_COUNT" +echo "trace-ids: ${TRACE_IDS:-}" +echo "output dir: $RUN_DIR" +echo "offload-mode: ${OFFLOAD_MODE:-legacy}" +echo "kv-cache-dtype: ${KV_CACHE_DTYPE:-auto}" +echo "disable-prefix-cache: $DISABLE_PREFIX_CACHING" +echo "total-cpu-dram-gb: $TOTAL_CPU_DRAM_GB" +if [[ "$ENGINE" == "vllm" ]]; then + echo "vllm cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB:-0}" + echo "vllm swap-space-gb: ${VLLM_SWAP_SPACE_GB:-0}" +else + echo "sglang mem fraction: $SGLANG_MEM_FRACTION_STATIC" + echo "sglang chunked pf: $SGLANG_CHUNKED_PREFILL_SIZE" +fi + +"$SCRIPT_DIR/gpu_profile_collector.sh" --output "$GPU_PROFILE_CSV" --interval 2 & +GPU_PROFILER_PID=$! 
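+# The collector helper is invoked as a black box here: the summary step later +# in this script only assumes it appends CSV rows with mem_used_mb and +# gpu_util_pct columns at the requested interval. A minimal sketch of such a +# collector loop, assuming nvidia-smi's CSV query interface ($OUT and +# $INTERVAL stand in for the --output and --interval arguments): +# echo "ts,mem_used_mb,gpu_util_pct" > "$OUT" +# while sleep "$INTERVAL"; do +# nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv,noheader,nounits \ +# | awk -v ts="$(date -u +%FT%TZ)" -F', *' '{ print ts "," $1 "," $2 }' >> "$OUT" +# done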
+ +launch_server +wait_for_server_ready + +if [[ "$TRACE_SOURCE" == "isb1" ]]; then + echo "==> Server is healthy; starting export replay" + + benchmark_cmd=( + python3 "$REPO_ROOT/utils/bench_serving/benchmark_export_replay.py" + --model "$MODEL_HF_ID" + --base-url "http://127.0.0.1:${PORT}" + --export-file "$EXPORT_PATH" + --request-mode "$HARNESS_REQUEST_MODE" + --max-concurrency "$MAX_CONCURRENCY" + --num-warmup-sessions "$NUM_WARMUP_SESSIONS" + --save-result + --result-dir "$RUN_DIR" + --result-filename "$RESULT_STEM.json" + --runtime-stack-id "$RUNTIME_STACK_ID" + --hardware-profile-id "$HARDWARE_PROFILE_ID" + --canonical-model-id "$CANONICAL_MODEL_ID" + --metadata "benchmark_type=$BENCHMARK_TYPE" + --metadata "export_file=$EXPORT_FILE" + --metadata "runtime_stack_id=$RUNTIME_STACK_ID" + --metadata "hardware_profile_id=$HARDWARE_PROFILE_ID" + --metadata "canonical_model_id=$CANONICAL_MODEL_ID" + --metadata "request_mode=$REQUEST_MODE" + --metadata "gmi_gpu_type=$GPU_TYPE" + --metadata "gmi_engine=$ENGINE" + --metadata "gmi_context_band=$CONTEXT_BAND" + --metadata "gmi_workload=$WORKLOAD" + --trust-remote-code + ) + if [[ -n "$BENCHMARK_DURATION_S" ]]; then + benchmark_cmd+=(--metadata "benchmark_duration_s=$BENCHMARK_DURATION_S") + fi + if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then + benchmark_cmd+=(--metadata "campaign_class=kv_stress") + fi + if [[ -n "$SUPPORT_STATUS" ]]; then + benchmark_cmd+=(--support-status "$SUPPORT_STATUS") + fi + if [[ -n "$MAX_SESSIONS" ]]; then + benchmark_cmd+=(--max-sessions "$MAX_SESSIONS") + fi + if [[ -n "$MAX_TURNS_PER_SESSION" ]]; then + benchmark_cmd+=(--max-turns-per-session "$MAX_TURNS_PER_SESSION") + fi + if [[ "$IGNORE_WAITS" == "true" ]]; then + benchmark_cmd+=(--ignore-waits) + fi + if [[ "$ENGINE" == "vllm" ]]; then + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=$VLLM_CPU_OFFLOAD_GB") + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + benchmark_cmd+=(--metadata "vllm_swap_space_gb=$VLLM_SWAP_SPACE_GB") + fi + else + benchmark_cmd+=(--metadata "sglang_mem_fraction_override=$SGLANG_MEM_FRACTION_STATIC") + benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=$SGLANG_CHUNKED_PREFILL_SIZE") + fi + + "${benchmark_cmd[@]}" +else + echo "==> Server is healthy; starting trace replay ($TRACE_SOURCE)" + + trace_cmd=( + python3 "$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py" + --api-endpoint "http://localhost:$PORT" + --trace-directory "$TRACE_DIR" + --output-dir "$RUN_DIR" + --start-users "$MAX_CONCURRENCY" + --max-users "$MAX_CONCURRENCY" + --test-duration "${BENCHMARK_DURATION_S:-1800}" + --seed 42 + --no-color + ) + + "${trace_cmd[@]}" + + python3 "$SCRIPT_DIR/adapt_trace_replay_result.py" \ + --input-dir "$RUN_DIR" \ + --detailed-csv detailed_results.csv \ + --summary-json "$TRACE_REPLAY_SUMMARY_JSON" \ + --output-json "$RUN_DIR/${RESULT_STEM}.json" \ + --model-id "$MODEL_HF_ID" \ + --max-concurrency "$MAX_CONCURRENCY" \ + --request-mode "$REQUEST_MODE" \ + --support-status "$SUPPORT_STATUS" \ + --benchmark-certification-status "$BENCHMARK_CERTIFICATION_STATUS" \ + --result-stem "$RESULT_STEM" +fi + +echo "==> Processing ISB1 result" +( + cd "$RUN_DIR" + export RUNNER_TYPE="$RUNNER_TYPE" + export FRAMEWORK="$ENGINE" + export PRECISION="$PRECISION" + export RESULT_FILENAME="$RESULT_STEM" + export MODEL_PREFIX="$MODEL_PREFIX" + export IMAGE="$IMAGE" + export TP="$TP" + export EP_SIZE=1 + export DP_ATTENTION=false + export 
BENCHMARK_TYPE="$BENCHMARK_TYPE" + export EXPORT_FILE="$EXPORT_FILE" + export RUNTIME_STACK_ID="$RUNTIME_STACK_ID" + export HARDWARE_PROFILE_ID="$HARDWARE_PROFILE_ID" + export CANONICAL_MODEL_ID="$CANONICAL_MODEL_ID" + export REQUEST_MODE="$REQUEST_MODE" + export TRACE_SOURCE="$TRACE_SOURCE" + export WORKLOAD_TYPE="$WORKLOAD" + export MAX_CONCURRENCY="$MAX_CONCURRENCY" + export IGNORE_WAITS="$IGNORE_WAITS" + export DISPATCH_REF="manual:gmi-portable" + export MAX_MODEL_LEN="$MAX_MODEL_LEN" + export OFFLOAD_MODE="${OFFLOAD_MODE:-}" + export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-}" + export DISABLE_PREFIX_CACHING="$DISABLE_PREFIX_CACHING" + if [[ -n "$BENCHMARK_DURATION_S" ]]; then + export BENCHMARK_DURATION_S="$BENCHMARK_DURATION_S" + fi + if [[ -n "$SUPPORT_STATUS" ]]; then + export SUPPORT_STATUS="$SUPPORT_STATUS" + fi + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + export VLLM_CPU_OFFLOAD_GB="$VLLM_CPU_OFFLOAD_GB" + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + export VLLM_SWAP_SPACE_GB="$VLLM_SWAP_SPACE_GB" + fi + if [[ -n "${SGLANG_MEM_FRACTION_STATIC:-}" ]]; then + export SGLANG_MEM_FRACTION_OVERRIDE="$SGLANG_MEM_FRACTION_STATIC" + fi + if [[ -n "${SGLANG_CHUNKED_PREFILL_SIZE:-}" ]]; then + export SGLANG_CHUNKED_PREFILL_OVERRIDE="$SGLANG_CHUNKED_PREFILL_SIZE" + fi + python3 "$REPO_ROOT/utils/process_result_isb1.py" | tee "$SUMMARY_JSON" +) + +stop_gpu_profiler + +if [[ -f "$GPU_PROFILE_CSV" ]]; then + GPU_STATS=$(python3 - "$GPU_PROFILE_CSV" <<'PY' +import csv +import sys + +with open(sys.argv[1], newline="") as handle: + rows = list(csv.DictReader(handle)) + +if rows: + mems = [float(row.get("mem_used_mb", "0") or 0) for row in rows] + utils = [float(row.get("gpu_util_pct", "0") or 0) for row in rows] + print(f"{max(mems) / 1024:.2f} {sum(mems) / len(mems) / 1024:.2f} {sum(utils) / len(utils):.1f}") +else: + print("0 0 0") +PY + 2>/dev/null) || GPU_STATS="0 0 0" + read -r GPU_MEM_PEAK GPU_MEM_AVG GPU_UTIL_AVG <<< "$GPU_STATS" +fi + +if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then + CAMPAIGN_METADATA_JSON="$RUN_DIR/kv_stress_campaign_metadata.json" + python3 - \ + "$CAMPAIGN_METADATA_JSON" \ + "$BENCHMARK_TYPE" \ + "$WORKLOAD" \ + "$MAX_CONCURRENCY" \ + "${OFFLOAD_MODE:-}" \ + "${KV_CACHE_DTYPE:-}" \ + "$DISABLE_PREFIX_CACHING" \ + "${BENCHMARK_DURATION_S:-}" <<'PY' +import json +import sys + +payload = { + "benchmark_type": sys.argv[2], + "campaign_class": "kv_stress", + "workload_type": sys.argv[3], + "max_concurrency": sys.argv[4], + "offload_mode": sys.argv[5] or None, + "kv_cache_dtype": sys.argv[6] or None, + "disable_prefix_caching": sys.argv[7], + "benchmark_duration_s": sys.argv[8] or None, +} +with open(sys.argv[1], "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, sort_keys=True) +PY +fi + +if [[ -f "$SUMMARY_JSON" ]] && command -v python3 >/dev/null 2>&1; then + db_ingest_cmd=( + python3 "$SCRIPT_DIR/isb1_results_db.py" ingest "$SUMMARY_JSON" + --gpu-type "$GPU_TYPE" + --model "$MODEL_KEY" + --engine "$ENGINE" + --context-band "$CONTEXT_BAND" + --workload-type "$WORKLOAD" + --trace-source "$TRACE_SOURCE" + --max-model-len "$MAX_MODEL_LEN" + --tp "$TP" + --gpu-mem-peak-gb "${GPU_MEM_PEAK:-0}" + --gpu-mem-avg-gb "${GPU_MEM_AVG:-0}" + --gpu-util-avg-pct "${GPU_UTIL_AVG:-0}" + --gpu-profile-csv "$GPU_PROFILE_CSV" + ) + if [[ -n "$ISB1_RESULTS_DB_PATH" ]]; then + db_ingest_cmd+=(--db-path "$ISB1_RESULTS_DB_PATH") + fi + if [[ -n "${OFFLOAD_MODE:-}" ]]; then + db_ingest_cmd+=(--offload-mode "$OFFLOAD_MODE") + fi + if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then + 
+    db_ingest_cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE")
+  fi
+  if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then
+    db_ingest_cmd+=(--disable-prefix-caching 1)
+  fi
+  if [[ -n "$BENCHMARK_DURATION_S" ]]; then
+    db_ingest_cmd+=(--benchmark-duration-s "$BENCHMARK_DURATION_S")
+  fi
+  if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then
+    db_ingest_cmd+=(--campaign-class kv_stress)
+  fi
+  if [[ "$ENGINE" == "vllm" ]]; then
+    if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
+      db_ingest_cmd+=(--vllm-cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB")
+    fi
+    if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
+      db_ingest_cmd+=(--vllm-swap-space-gb "$VLLM_SWAP_SPACE_GB")
+    fi
+  else
+    db_ingest_cmd+=(--sglang-mem-fraction "$SGLANG_MEM_FRACTION_STATIC")
+    db_ingest_cmd+=(--sglang-chunked-prefill "$SGLANG_CHUNKED_PREFILL_SIZE")
+  fi
+  "${db_ingest_cmd[@]}" 2>/dev/null || echo "WARNING: DB ingest failed" >&2
+fi
+
+python3 - "$SUMMARY_JSON" <<'PY'
+import json
+import sys
+from pathlib import Path
+
+summary = json.loads(Path(sys.argv[1]).read_text())
+print("==> Summary")
+for key, value in [
+    ("result_filename", summary.get("result_filename")),
+    ("support_status", summary.get("support_status")),
+    ("benchmark_certification_status", summary.get("benchmark_certification_status")),
+    ("completed_sessions", f"{summary.get('completed_sessions')}/{summary.get('total_sessions')}"),
+    ("effective_max_context_depth", summary.get("effective_max_context_depth")),
+    ("context_pressure_class", summary.get("context_pressure_class")),
+    ("context_pressure_signal", summary.get("context_pressure_signal", {}).get("status")),
+    ("depth_coverage_ratio", summary.get("depth_coverage_ratio")),
+    ("depth_coverage_class", summary.get("depth_coverage_class")),
+    ("max_actual_context_len", summary.get("max_actual_context_len_per_turn")),
+    ("preemption_count", summary.get("preemption_count")),
+    ("session_throughput_sps", summary.get("session_throughput_sps")),
+    ("tput_per_gpu", summary.get("tput_per_gpu")),
+    ("output_tput_per_gpu", summary.get("output_tput_per_gpu")),
+    ("mean_ttft_s", summary.get("mean_ttft")),
+    ("p99_ttft_s", summary.get("p99_ttft")),
+    ("server_logs", Path(sys.argv[1]).with_name("server.log")),
+    ("raw_replay_result", Path(sys.argv[1]).with_name(summary.get("result_filename", "run") + ".json")),
+    ("processed_result", Path(sys.argv[1])),
+]:
+    print(f"  {key}: {value}")
+PY
diff --git a/datasets/isb1/scripts/gmi_test_matrix.sh b/datasets/isb1/scripts/gmi_test_matrix.sh
new file mode 100755
index 000000000..5deadb072
--- /dev/null
+++ b/datasets/isb1/scripts/gmi_test_matrix.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+usage() {
+  cat <<'EOF'
+Usage:
+  gmi_test_matrix.sh --gpu-type <h100|h200|b200>
+
+Runs a curated GMI Cloud matrix:
+  - Qwen3.5 × vllm × 8k (chat)
+  - Qwen3.5 × vllm × 131k/500k/1m (code)
+  - Qwen3.5 × sglang × 500k (chat)
+  - GPT-OSS × vllm × 131k (code, chat), 500k (chat)
+  - DSR1 × sglang × 131k (code, chat)
+EOF
+}
+
+GPU_TYPE=""
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpu-type)
+      GPU_TYPE="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+[[ -n "$GPU_TYPE" ]] || {
+  usage >&2
+  exit 1
+}
+
+case "$GPU_TYPE" in
+  h100|h200|b200) ;;
+  *)
+    echo "Unsupported --gpu-type: $GPU_TYPE" >&2
+    exit 1
+    ;;
+esac
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh"
+[[ -x "$PORTABLE_SCRIPT" ]] || {
+  echo "Expected executable helper at $PORTABLE_SCRIPT" >&2
+  exit 1
+}
+
+run_case() {
+  local model="$1"
+  local engine="$2"
+  local 
context_band="$3" + local workload="${4:-code}" + + echo + echo "============================================================" + echo "Running: gpu=${GPU_TYPE} model=${model} engine=${engine} context=${context_band} workload=${workload}" + echo "============================================================" + + "$PORTABLE_SCRIPT" \ + --gpu-type "$GPU_TYPE" \ + --model "$model" \ + --engine "$engine" \ + --context-band "$context_band" \ + --workload "$workload" +} + +run_case qwen3.5 vllm 8k chat +run_case qwen3.5 vllm 131k code +run_case qwen3.5 vllm 500k code +run_case qwen3.5 sglang 500k chat +run_case gptoss vllm 131k code +run_case gptoss vllm 131k chat +run_case gptoss vllm 500k chat +run_case dsr1 sglang 131k code +run_case dsr1 sglang 131k chat +run_case qwen3.5 vllm 1m code + +echo +echo "Curated GMI test matrix completed successfully." diff --git a/datasets/isb1/scripts/gpu_profile_collector.sh b/datasets/isb1/scripts/gpu_profile_collector.sh new file mode 100755 index 000000000..4ba03f223 --- /dev/null +++ b/datasets/isb1/scripts/gpu_profile_collector.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +# Usage: gpu_profile_collector.sh --output /tmp/gpu.csv [--interval 2] +# Runs nvidia-smi polling until killed (SIGTERM/SIGINT) + +OUTPUT="" +INTERVAL=2 + +while [[ $# -gt 0 ]]; do + case "$1" in + --output) + OUTPUT="$2" + shift 2 + ;; + --interval) + INTERVAL="$2" + shift 2 + ;; + *) + echo "Unknown arg: $1" >&2 + exit 1 + ;; + esac +done + +[[ -n "$OUTPUT" ]] || { + echo "ERROR: --output required" >&2 + exit 1 +} + +mkdir -p "$(dirname "$OUTPUT")" +echo "timestamp,gpu_bus_id,gpu_util_pct,mem_util_pct,mem_used_mb,mem_total_mb,temp_c,power_w" > "$OUTPUT" + +trap 'exit 0' SIGTERM SIGINT + +while true; do + nvidia-smi \ + --query-gpu=timestamp,gpu_bus_id,utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu,power.draw \ + --format=csv,noheader,nounits >> "$OUTPUT" 2>/dev/null || true + sleep "$INTERVAL" +done diff --git a/datasets/isb1/scripts/isb1_results_db.py b/datasets/isb1/scripts/isb1_results_db.py new file mode 100644 index 000000000..e052fa766 --- /dev/null +++ b/datasets/isb1/scripts/isb1_results_db.py @@ -0,0 +1,816 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +import sys +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Iterable, Sequence + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent.parent.parent +DEFAULT_DB_PATH = REPO_ROOT / "datasets/isb1/results/isb1_results.db" +TABLE_NAME = "benchmark_runs" + +SCHEMA_SQL = f""" +CREATE TABLE IF NOT EXISTS {TABLE_NAME} ( + id INTEGER PRIMARY KEY, + run_id TEXT, + timestamp TEXT, + gpu_type TEXT, + model TEXT, + engine TEXT, + context_band TEXT, + workload_type TEXT, + max_model_len INTEGER, + tp INTEGER, + vllm_cpu_offload_gb REAL, + vllm_swap_space_gb REAL, + sglang_mem_fraction REAL, + sglang_chunked_prefill INTEGER, + ttft_p50_ms REAL, + ttft_p99_ms REAL, + tpot_p50_ms REAL, + tpot_p99_ms REAL, + throughput_tok_s REAL, + total_sessions INTEGER, + completed_sessions INTEGER, + total_turns INTEGER, + completed_turns INTEGER, + preemption_count INTEGER, + gpu_mem_peak_gb REAL, + gpu_mem_avg_gb REAL, + gpu_util_avg_pct REAL, + kv_cache_usage_pct REAL, + server_startup_s REAL, + benchmark_duration_s REAL, + campaign_class TEXT, + trace_source TEXT, + total_actual_input_tokens INTEGER, + max_actual_context_len INTEGER, + 
depth_coverage_ratio REAL, + depth_coverage_class TEXT, + producer_estimated_kv_bytes_peak INTEGER, + producer_expected_offload_mode TEXT, + offload_mode_match INTEGER, + offload_mode TEXT, + kv_cache_dtype TEXT, + disable_prefix_caching INTEGER, + cpu_cache_usage_peak_pct REAL, + raw_result_json TEXT, + status TEXT, + error_message TEXT +) +""" + +INSERT_COLUMNS = [ + "run_id", + "timestamp", + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "max_model_len", + "tp", + "vllm_cpu_offload_gb", + "vllm_swap_space_gb", + "sglang_mem_fraction", + "sglang_chunked_prefill", + "ttft_p50_ms", + "ttft_p99_ms", + "tpot_p50_ms", + "tpot_p99_ms", + "throughput_tok_s", + "total_sessions", + "completed_sessions", + "total_turns", + "completed_turns", + "preemption_count", + "gpu_mem_peak_gb", + "gpu_mem_avg_gb", + "gpu_util_avg_pct", + "kv_cache_usage_pct", + "server_startup_s", + "benchmark_duration_s", + "campaign_class", + "trace_source", + "total_actual_input_tokens", + "max_actual_context_len", + "depth_coverage_ratio", + "depth_coverage_class", + "producer_estimated_kv_bytes_peak", + "producer_expected_offload_mode", + "offload_mode_match", + "offload_mode", + "kv_cache_dtype", + "disable_prefix_caching", + "cpu_cache_usage_peak_pct", + "raw_result_json", + "status", + "error_message", +] + +GROUPABLE_COLUMNS = { + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "status", + "tp", + "max_model_len", + "depth_coverage_class", + "offload_mode", + "campaign_class", + "trace_source", +} + +DEFAULT_QUERY_COLUMNS = [ + "timestamp", + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "status", + "ttft_p50_ms", + "ttft_p99_ms", + "throughput_tok_s", + "gpu_mem_peak_gb", + "gpu_util_avg_pct", + "preemption_count", + "depth_coverage_ratio", + "max_actual_context_len", + "depth_coverage_class", + "run_id", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Store and analyze ISB1 benchmark runs in SQLite.") + subparsers = parser.add_subparsers(dest="command", required=True) + + ingest = subparsers.add_parser("ingest", help="Read a processed ISB1 JSON file and insert a benchmark run.") + ingest.add_argument("json_file", help="Path to utils/process_result_isb1.py output JSON.") + ingest.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + ingest.add_argument("--gpu-type", required=True, choices=["h100", "h200", "b200"]) + ingest.add_argument("--model", required=True, choices=["qwen3.5", "gptoss", "dsr1"]) + ingest.add_argument("--engine", required=True, choices=["vllm", "sglang"]) + ingest.add_argument("--context-band", required=True, choices=["8k", "32k", "64k", "131k", "500k", "1m"]) + ingest.add_argument("--workload-type", choices=["chat", "code"], help="Workload type (chat or code)") + ingest.add_argument("--run-id", help="Optional run UUID. Generated if omitted.") + ingest.add_argument("--timestamp", help="Optional ISO-8601 timestamp. 
Uses current UTC time if omitted.") + ingest.add_argument("--max-model-len", type=int) + ingest.add_argument("--tp", type=int) + ingest.add_argument("--vllm-cpu-offload-gb", type=float) + ingest.add_argument("--vllm-swap-space-gb", type=float) + ingest.add_argument("--sglang-mem-fraction", type=float) + ingest.add_argument("--sglang-chunked-prefill", type=int) + ingest.add_argument("--ttft-p50-ms", type=float) + ingest.add_argument("--ttft-p99-ms", type=float) + ingest.add_argument("--tpot-p50-ms", type=float) + ingest.add_argument("--tpot-p99-ms", type=float) + ingest.add_argument("--throughput-tok-s", type=float) + ingest.add_argument("--total-sessions", type=int) + ingest.add_argument("--completed-sessions", type=int) + ingest.add_argument("--total-turns", type=int) + ingest.add_argument("--completed-turns", type=int) + ingest.add_argument("--preemption-count", type=int) + ingest.add_argument("--gpu-mem-peak-gb", type=float) + ingest.add_argument("--gpu-mem-avg-gb", type=float) + ingest.add_argument("--gpu-util-avg-pct", type=float) + ingest.add_argument("--kv-cache-usage-pct", type=float) + ingest.add_argument("--server-startup-s", type=float) + ingest.add_argument("--benchmark-duration-s", type=float) + ingest.add_argument("--campaign-class") + ingest.add_argument("--trace-source", choices=["isb1", "kv_cache_tester", "aiperf"]) + ingest.add_argument("--offload-mode", choices=["on", "off", "noprefix", "legacy"]) + ingest.add_argument("--kv-cache-dtype", choices=["auto", "fp8"]) + ingest.add_argument("--disable-prefix-caching", type=int, choices=[0, 1]) + ingest.add_argument("--gpu-profile-csv", help="Optional GPU profile CSV path to stash in raw_result_json metadata.") + ingest.add_argument("--status", default="success", choices=["success", "failed", "timeout"]) + ingest.add_argument("--error-message") + + query = subparsers.add_parser("query", help="Print runs or an aggregated grouped view.") + query.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + query.add_argument("--group-by", help="Comma-separated columns to group by, for example gpu_type,context_band.") + + export_csv = subparsers.add_parser("export-csv", help="Export all benchmark rows to CSV.") + export_csv.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + export_csv.add_argument("--output", help="Destination CSV path. 
Defaults to stdout.") + + summary = subparsers.add_parser("summary", help="Print a concise findings summary.") + summary.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + + return parser.parse_args() + + +_MIGRATIONS = [ + f"ALTER TABLE {TABLE_NAME} ADD COLUMN total_actual_input_tokens INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN max_actual_context_len INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_ratio REAL", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_class TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_estimated_kv_bytes_peak INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_expected_offload_mode TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode_match INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN kv_cache_dtype TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN disable_prefix_caching INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN cpu_cache_usage_peak_pct REAL", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN workload_type TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN campaign_class TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN trace_source TEXT", +] + + +def ensure_db(conn: sqlite3.Connection) -> None: + conn.execute(SCHEMA_SQL) + conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_run_id ON {TABLE_NAME}(run_id)") + conn.execute( + f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_grouping " + f"ON {TABLE_NAME}(gpu_type, model, engine, context_band, status)" + ) + # Idempotent migrations for existing databases + for migration_sql in _MIGRATIONS: + try: + conn.execute(migration_sql) + except sqlite3.OperationalError: + pass # Column already exists + conn.commit() + + +def connect_db(db_path: str | Path) -> sqlite3.Connection: + db_path = Path(db_path) + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + ensure_db(conn) + return conn + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def seconds_to_ms(value: Any) -> float | None: + parsed = to_float(value) + return None if parsed is None else parsed * 1000.0 + + +def choose(*values: Any) -> Any: + for value in values: + if value not in (None, ""): + return value + return None + + +def load_payload(path: str | Path) -> dict[str, Any]: + payload = json.loads(Path(path).read_text()) + if not isinstance(payload, dict): + raise SystemExit(f"Expected a JSON object in {path}") + return payload + + +def derive_total_turns(payload: dict[str, Any], total_sessions: int | None) -> int | None: + max_turns = to_int(payload.get("max_turns")) + if max_turns is not None and total_sessions is not None: + return max_turns * total_sessions + per_turn_metrics = payload.get("per_turn_metrics") or {} + if isinstance(per_turn_metrics, dict) and total_sessions is not None: + return len(per_turn_metrics) * total_sessions + return None + + +def derive_completed_turns(payload: dict[str, Any]) -> int | None: + per_turn_metrics = payload.get("per_turn_metrics") or {} + if not isinstance(per_turn_metrics, dict): + return None + completed = 0 + saw_value = False + for 
turn_metrics in per_turn_metrics.values(): + if not isinstance(turn_metrics, dict): + continue + value = to_int(turn_metrics.get("completed")) + if value is None: + continue + completed += value + saw_value = True + return completed if saw_value else None + + +def build_raw_payload(payload: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]: + enriched = dict(payload) + metadata = { + "source_json": str(Path(args.json_file).resolve()), + "db_path": str(Path(args.db_path).resolve()), + } + if args.gpu_profile_csv: + metadata["gpu_profile_csv"] = str(Path(args.gpu_profile_csv).resolve()) + if args.status != "success": + metadata["status_override"] = args.status + if args.error_message: + metadata["error_message"] = args.error_message + enriched["_isb1_results_db"] = metadata + return enriched + + +def insert_run(args: argparse.Namespace) -> None: + payload = load_payload(args.json_file) + aggregate = payload.get("aggregate_metrics") or {} + runtime_overrides = payload.get("runtime_overrides") or {} + server_metrics_summary = payload.get("server_metrics_summary") or {} + + total_sessions = to_int(choose(args.total_sessions, payload.get("total_sessions"), aggregate.get("total_sessions"))) + completed_sessions = to_int( + choose(args.completed_sessions, payload.get("completed_sessions"), aggregate.get("completed_sessions")) + ) + + gpu_cache_peak = to_float(server_metrics_summary.get("gpu_cache_usage_peak")) + if gpu_cache_peak is None: + gpu_cache_peak = to_float(payload.get("peak_gpu_cache_usage")) + + row = { + "run_id": args.run_id or str(uuid.uuid4()), + "timestamp": args.timestamp or utc_now_iso(), + "gpu_type": args.gpu_type, + "model": args.model, + "engine": args.engine, + "context_band": args.context_band, + "workload_type": choose( + getattr(args, 'workload_type', None), + payload.get("benchmark_surface"), + ), + "max_model_len": to_int(choose(args.max_model_len, payload.get("max_model_len"))), + "tp": to_int(choose(args.tp, payload.get("tp"))), + "vllm_cpu_offload_gb": to_float( + choose( + args.vllm_cpu_offload_gb, + runtime_overrides.get("vllm_cpu_offload_gb"), + payload.get("vllm_cpu_offload_gb"), + ) + ), + "vllm_swap_space_gb": to_float( + choose( + args.vllm_swap_space_gb, + runtime_overrides.get("vllm_swap_space_gb"), + payload.get("vllm_swap_space_gb"), + ) + ), + "sglang_mem_fraction": to_float( + choose( + args.sglang_mem_fraction, + runtime_overrides.get("sglang_mem_fraction_override"), + payload.get("sglang_mem_fraction_override"), + ) + ), + "sglang_chunked_prefill": to_int( + choose( + args.sglang_chunked_prefill, + runtime_overrides.get("sglang_chunked_prefill_override"), + payload.get("sglang_chunked_prefill_override"), + ) + ), + "ttft_p50_ms": to_float( + choose(args.ttft_p50_ms, aggregate.get("median_ttft_ms"), seconds_to_ms(payload.get("median_ttft"))) + ), + "ttft_p99_ms": to_float( + choose(args.ttft_p99_ms, aggregate.get("p99_ttft_ms"), seconds_to_ms(payload.get("p99_ttft"))) + ), + "tpot_p50_ms": to_float( + choose(args.tpot_p50_ms, aggregate.get("median_tpot_ms"), seconds_to_ms(payload.get("median_tpot"))) + ), + "tpot_p99_ms": to_float( + choose(args.tpot_p99_ms, aggregate.get("p99_tpot_ms"), seconds_to_ms(payload.get("p99_tpot"))) + ), + "throughput_tok_s": to_float( + choose(args.throughput_tok_s, aggregate.get("total_token_throughput_tps"), payload.get("throughput_tok_s")) + ), + "total_sessions": total_sessions, + "completed_sessions": completed_sessions, + "total_turns": to_int(choose(args.total_turns, derive_total_turns(payload, 
total_sessions))), + "completed_turns": to_int(choose(args.completed_turns, derive_completed_turns(payload))), + "preemption_count": to_int(choose(args.preemption_count, payload.get("preemption_count"))), + "gpu_mem_peak_gb": to_float(choose(args.gpu_mem_peak_gb, payload.get("gpu_mem_peak_gb"))), + "gpu_mem_avg_gb": to_float(choose(args.gpu_mem_avg_gb, payload.get("gpu_mem_avg_gb"))), + "gpu_util_avg_pct": to_float(choose(args.gpu_util_avg_pct, payload.get("gpu_util_avg_pct"))), + "kv_cache_usage_pct": to_float( + choose(args.kv_cache_usage_pct, payload.get("kv_cache_usage_pct"), gpu_cache_peak * 100.0 if gpu_cache_peak is not None else None) + ), + "server_startup_s": to_float(choose(args.server_startup_s, payload.get("server_startup_s"))), + "benchmark_duration_s": to_float( + choose(args.benchmark_duration_s, payload.get("benchmark_duration_s"), aggregate.get("total_wall_time_s")) + ), + "campaign_class": choose( + getattr(args, 'campaign_class', None), + payload.get("campaign_class"), + ), + "trace_source": choose( + getattr(args, 'trace_source', None), + payload.get("trace_source"), + ), + "total_actual_input_tokens": to_int( + (payload.get("depth_telemetry") or {}).get("total_actual_input_tokens") + or payload.get("total_actual_input_tokens") + ), + "max_actual_context_len": to_int( + (payload.get("depth_telemetry") or {}).get("max_actual_context_len_per_turn") + or payload.get("max_actual_context_len_per_turn") + ), + "depth_coverage_ratio": to_float(payload.get("depth_coverage_ratio")), + "depth_coverage_class": payload.get("depth_coverage_class"), + "producer_estimated_kv_bytes_peak": to_int(payload.get("producer_estimated_kv_bytes_peak")), + "producer_expected_offload_mode": payload.get("producer_expected_offload_mode"), + "offload_mode_match": ( + 1 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is True + else 0 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is False + else None + ), + "offload_mode": choose(getattr(args, 'offload_mode', None), payload.get("offload_mode")), + "kv_cache_dtype": choose(getattr(args, 'kv_cache_dtype', None), payload.get("kv_cache_dtype")), + "disable_prefix_caching": to_int( + choose( + getattr(args, 'disable_prefix_caching', None), + payload.get("disable_prefix_caching"), + ) + ), + "cpu_cache_usage_peak_pct": to_float( + payload.get("peak_cpu_cache_usage", 0.0) * 100.0 + if payload.get("peak_cpu_cache_usage") is not None else None + ), + "raw_result_json": json.dumps(build_raw_payload(payload, args), sort_keys=True), + "status": args.status, + "error_message": choose(args.error_message, payload.get("error_message")), + } + + conn = connect_db(args.db_path) + placeholders = ", ".join("?" for _ in INSERT_COLUMNS) + sql = f"INSERT INTO {TABLE_NAME} ({', '.join(INSERT_COLUMNS)}) VALUES ({placeholders})" + conn.execute(sql, [row[column] for column in INSERT_COLUMNS]) + conn.commit() + conn.close() + + print( + f"Inserted run {row['run_id']} into {Path(args.db_path)} " + f"({row['gpu_type']} {row['model']} {row['engine']} {row['context_band']}, status={row['status']})." 
+ ) + + +def fetch_rows(conn: sqlite3.Connection, sql: str, params: Sequence[Any] = ()) -> list[sqlite3.Row]: + return list(conn.execute(sql, params)) + + +def stringify(value: Any) -> str: + if value is None: + return "" + if isinstance(value, float): + return f"{value:.2f}" + return str(value) + + +def render_table(headers: Sequence[str], rows: Iterable[Sequence[Any]]) -> str: + normalized_rows = [[stringify(value) for value in row] for row in rows] + widths = [len(header) for header in headers] + for row in normalized_rows: + for idx, value in enumerate(row): + widths[idx] = max(widths[idx], len(value)) + + def fmt_row(row: Sequence[str]) -> str: + return " | ".join(value.ljust(widths[idx]) for idx, value in enumerate(row)) + + divider = "-+-".join("-" * width for width in widths) + lines = [fmt_row(headers), divider] + for row in normalized_rows: + lines.append(fmt_row(row)) + return "\n".join(lines) + + +def print_query(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + + if args.group_by: + group_columns = [column.strip() for column in args.group_by.split(",") if column.strip()] + if not group_columns: + raise SystemExit("--group-by requires at least one column") + invalid = [column for column in group_columns if column not in GROUPABLE_COLUMNS] + if invalid: + raise SystemExit( + f"Unsupported --group-by columns: {', '.join(invalid)}. " + f"Allowed: {', '.join(sorted(GROUPABLE_COLUMNS))}" + ) + + select_prefix = ", ".join(group_columns) + sql = f""" + SELECT + {select_prefix}, + COUNT(*) AS runs, + SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, + SUM(CASE WHEN status != 'success' THEN 1 ELSE 0 END) AS non_success_runs, + ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, + ROUND(AVG(throughput_tok_s), 2) AS avg_throughput_tok_s, + ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, + SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs + FROM {TABLE_NAME} + GROUP BY {select_prefix} + ORDER BY {select_prefix} + """ + rows = fetch_rows(conn, sql) + headers = group_columns + [ + "runs", + "success_runs", + "non_success_runs", + "avg_ttft_p50_ms", + "avg_throughput_tok_s", + "max_gpu_mem_peak_gb", + "preemption_runs", + ] + print(render_table(headers, ([row[header] for header in headers] for row in rows))) + else: + sql = f"SELECT {', '.join(DEFAULT_QUERY_COLUMNS)} FROM {TABLE_NAME} ORDER BY id DESC" + rows = fetch_rows(conn, sql) + print(render_table(DEFAULT_QUERY_COLUMNS, ([row[column] for column in DEFAULT_QUERY_COLUMNS] for row in rows))) + + conn.close() + + +def export_csv_rows(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + rows = fetch_rows(conn, f"SELECT * FROM {TABLE_NAME} ORDER BY id ASC") + headers = [description[0] for description in conn.execute(f"SELECT * FROM {TABLE_NAME} LIMIT 0").description] + + if args.output: + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + handle = output_path.open("w", newline="") + else: + handle = sys.stdout + + try: + writer = csv.writer(handle) + writer.writerow(headers) + for row in rows: + writer.writerow([row[header] for header in headers]) + finally: + if args.output: + handle.close() + print(f"Exported {len(rows)} rows to {args.output}") + + conn.close() + + +def print_summary(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + total_runs = conn.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}").fetchone()[0] + if total_runs == 0: + print(f"No runs found in {args.db_path}") + 
conn.close() + return + + status_rows = fetch_rows(conn, f"SELECT status, COUNT(*) AS count FROM {TABLE_NAME} GROUP BY status ORDER BY status") + preemption_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, preemption_count, status + FROM {TABLE_NAME} + WHERE COALESCE(preemption_count, 0) > 0 + ORDER BY preemption_count DESC, id DESC + LIMIT 10 + """, + ) + highest_memory_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, gpu_mem_peak_gb, kv_cache_usage_pct, status + FROM {TABLE_NAME} + WHERE gpu_mem_peak_gb IS NOT NULL + ORDER BY gpu_mem_peak_gb DESC, id DESC + LIMIT 5 + """, + ) + slowest_ttft_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, ttft_p50_ms, ttft_p99_ms, status + FROM {TABLE_NAME} + WHERE ttft_p50_ms IS NOT NULL + ORDER BY ttft_p50_ms DESC, id DESC + LIMIT 5 + """, + ) + highest_kv_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, kv_cache_usage_pct, gpu_mem_peak_gb, status + FROM {TABLE_NAME} + WHERE kv_cache_usage_pct IS NOT NULL + ORDER BY kv_cache_usage_pct DESC, id DESC + LIMIT 5 + """, + ) + long_context_rollup = fetch_rows( + conn, + f""" + SELECT + context_band, + COUNT(*) AS runs, + SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, + ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, + ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, + SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs + FROM {TABLE_NAME} + WHERE context_band IN ('131k', '500k', '1m') + GROUP BY context_band + ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END + """, + ) + + print(f"ISB1 results summary ({args.db_path})") + print(f"Total runs: {total_runs}") + print(render_table(["status", "count"], ([row["status"], row["count"]] for row in status_rows))) + print() + + if long_context_rollup: + print("Long-context rollup") + print( + render_table( + ["context_band", "runs", "success_runs", "avg_ttft_p50_ms", "max_gpu_mem_peak_gb", "preemption_runs"], + ( + [ + row["context_band"], + row["runs"], + row["success_runs"], + row["avg_ttft_p50_ms"], + row["max_gpu_mem_peak_gb"], + row["preemption_runs"], + ] + for row in long_context_rollup + ), + ) + ) + print() + + # Depth coverage rollup + depth_coverage_rows = fetch_rows( + conn, + f""" + SELECT + context_band, + COUNT(*) AS runs, + ROUND(AVG(depth_coverage_ratio), 4) AS avg_depth_coverage, + MAX(max_actual_context_len) AS max_actual_ctx, + SUM(CASE WHEN depth_coverage_class = 'configuration_only' THEN 1 ELSE 0 END) AS config_only_runs, + SUM(CASE WHEN depth_coverage_class = 'full' THEN 1 ELSE 0 END) AS full_depth_runs + FROM {TABLE_NAME} + WHERE context_band IN ('131k', '500k', '1m') + AND depth_coverage_ratio IS NOT NULL + GROUP BY context_band + ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END + """, + ) + if depth_coverage_rows: + print("Depth coverage (actual vs configured)") + print( + render_table( + ["context_band", "runs", "avg_depth_coverage", "max_actual_ctx", "config_only_runs", "full_depth_runs"], + ( + [ + row["context_band"], + row["runs"], + row["avg_depth_coverage"], + row["max_actual_ctx"], + row["config_only_runs"], + row["full_depth_runs"], + ] + for row in depth_coverage_rows + ), + ) + ) + print() + + if preemption_rows: + print("Runs with preemptions") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "preemption_count", "status"], + ( + 
[ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["preemption_count"], + row["status"], + ] + for row in preemption_rows + ), + ) + ) + print() + else: + print("Runs with preemptions: none") + print() + + if highest_memory_rows: + print("Highest peak GPU memory") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "gpu_mem_peak_gb", "kv_cache_usage_pct", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["gpu_mem_peak_gb"], + row["kv_cache_usage_pct"], + row["status"], + ] + for row in highest_memory_rows + ), + ) + ) + print() + + if slowest_ttft_rows: + print("Slowest TTFT p50 runs") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "ttft_p50_ms", "ttft_p99_ms", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["ttft_p50_ms"], + row["ttft_p99_ms"], + row["status"], + ] + for row in slowest_ttft_rows + ), + ) + ) + print() + + if highest_kv_rows: + print("Highest KV-cache usage") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "kv_cache_usage_pct", "gpu_mem_peak_gb", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["kv_cache_usage_pct"], + row["gpu_mem_peak_gb"], + row["status"], + ] + for row in highest_kv_rows + ), + ) + ) + + conn.close() + + +def main() -> int: + args = parse_args() + if args.command == "ingest": + insert_run(args) + elif args.command == "query": + print_query(args) + elif args.command == "export-csv": + export_csv_rows(args) + elif args.command == "summary": + print_summary(args) + else: + raise SystemExit(f"Unknown command: {args.command}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/metrics_collector.py b/datasets/isb1/scripts/metrics_collector.py new file mode 100644 index 000000000..3de1f7615 --- /dev/null +++ b/datasets/isb1/scripts/metrics_collector.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +"""Prometheus metrics scraper for ISB1 KV stress benchmarks.""" + +from __future__ import annotations + +import argparse +import asyncio +import csv +import json +import re +import signal +import statistics +import time +from pathlib import Path +from typing import Dict +from urllib.request import Request, urlopen + +PROM_LINE_RE = re.compile( + r"^\s*([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{[^}]*\})?\s+([-+]?(?:\d+\.\d*|\d*\.\d+|\d+)(?:[eE][-+]?\d+)?)\s*$" +) + +CANONICAL_METRICS: dict[str, tuple[str, ...]] = { + # Required vLLM metrics + "vllm:gpu_cache_usage_perc": ( + "vllm:gpu_cache_usage_perc", + "vllm_gpu_cache_usage_perc", + ), + "vllm:cpu_cache_usage_perc": ( + "vllm:cpu_cache_usage_perc", + "vllm_cpu_cache_usage_perc", + ), + "vllm:num_preemptions_total": ( + "vllm:num_preemptions_total", + "vllm_num_preemptions_total", + ), + "vllm:num_requests_running": ( + "vllm:num_requests_running", + "vllm_num_requests_running", + ), + "vllm:num_requests_waiting": ( + "vllm:num_requests_waiting", + "vllm_num_requests_waiting", + ), + "vllm:kv_offload_bytes_gpu_to_cpu": ( + "vllm:kv_offload_bytes_gpu_to_cpu", + "vllm_kv_offload_bytes_gpu_to_cpu", + ), + "vllm:kv_offload_bytes_cpu_to_gpu": ( + "vllm:kv_offload_bytes_cpu_to_gpu", + "vllm_kv_offload_bytes_cpu_to_gpu", + ), + "vllm:prompt_tokens_total": ( + "vllm:prompt_tokens_total", + "vllm_prompt_tokens_total", + ), + "vllm:generation_tokens_total": ( + "vllm:generation_tokens_total", + "vllm_generation_tokens_total", + 
), + # Optional but useful in vLLM + "vllm:num_requests_swapped": ( + "vllm:num_requests_swapped", + "vllm_num_requests_swapped", + ), + # PR #993 parity metrics (vLLM) + "vllm:prefix_cache_hit_rate": ( + "vllm:prefix_cache_hit_rate", + "vllm_prefix_cache_hit_rate", + ), + "vllm:cpu_prefix_cache_hit_rate": ( + "vllm:cpu_prefix_cache_hit_rate", + "vllm_cpu_prefix_cache_hit_rate", + ), + "vllm:kv_offload_time_gpu_to_cpu_seconds": ( + "vllm:kv_offload_time_gpu_to_cpu_seconds", + "vllm_kv_offload_time_gpu_to_cpu_seconds", + ), + "vllm:kv_offload_time_cpu_to_gpu_seconds": ( + "vllm:kv_offload_time_cpu_to_gpu_seconds", + "vllm_kv_offload_time_cpu_to_gpu_seconds", + ), + "vllm:prompt_tokens_local_compute": ( + "vllm:prompt_tokens_local_compute", + "vllm_prompt_tokens_local_compute", + ), + "vllm:prompt_tokens_local_cache_hit": ( + "vllm:prompt_tokens_local_cache_hit", + "vllm_prompt_tokens_local_cache_hit", + ), + "vllm:prompt_tokens_external_kv_transfer": ( + "vllm:prompt_tokens_external_kv_transfer", + "vllm_prompt_tokens_external_kv_transfer", + ), + # SGLang equivalents (best-effort) + "sglang:kv_cache_usage": ( + "sglang:kv_cache_usage", + "sglang_kv_cache_usage", + "sglang_kv_cache_utilization", + ), + "sglang:cache_hit_rate": ( + "sglang:cache_hit_rate", + "sglang_cache_hit_rate", + "sglang_radix_cache_hit_rate", + ), + "sglang:num_requests_running": ( + "sglang:num_requests_running", + "sglang_num_requests_running", + "sglang_scheduler_num_running_requests", + ), + "sglang:num_requests_waiting": ( + "sglang:num_requests_waiting", + "sglang_num_requests_waiting", + "sglang_scheduler_num_waiting_requests", + ), + "sglang:prompt_tokens_total": ( + "sglang:prompt_tokens_total", + "sglang_prompt_tokens_total", + "sglang_num_prompt_tokens_total", + ), + "sglang:generation_tokens_total": ( + "sglang:generation_tokens_total", + "sglang_generation_tokens_total", + "sglang_num_generation_tokens_total", + ), + # PR #993 parity metrics (SGLang) + "sglang:num_preemptions_total": ( + "sglang:num_preemptions_total", + "sglang_num_preemptions_total", + ), + "sglang:prefix_cache_queries_total": ( + "sglang:prefix_cache_queries_total", + "sglang_prefix_cache_queries_total", + ), +} + + +def _normalize_name(name: str) -> str: + return name.replace(":", "_") + + +def parse_prometheus_rows(payload: str) -> list[tuple[str, float]]: + rows: list[tuple[str, float]] = [] + for line in payload.splitlines(): + if not line or line.startswith("#"): + continue + match = PROM_LINE_RE.match(line) + if not match: + continue + name, raw_value = match.groups() + try: + rows.append((name, float(raw_value))) + except ValueError: + continue + return rows + + +def parse_prometheus_text(payload: str) -> Dict[str, float]: + samples: Dict[str, float] = {} + for name, value in parse_prometheus_rows(payload): + samples[name] = value + return samples + + +def map_canonical_metrics(samples: Dict[str, float]) -> Dict[str, float]: + mapped: Dict[str, float] = {} + + normalized_index: Dict[str, float] = {} + for key, value in samples.items(): + normalized_index[_normalize_name(key)] = value + + for canonical_name, aliases in CANONICAL_METRICS.items(): + value = None + for alias in aliases: + if alias in samples: + value = samples[alias] + break + alias_norm = _normalize_name(alias) + if alias_norm in normalized_index: + value = normalized_index[alias_norm] + break + if value is not None: + mapped[canonical_name] = value + + return mapped + + +def fetch_metrics(metrics_url: str, timeout_s: float = 5.0) -> str: + request = 
Request(metrics_url, headers={"Accept": "text/plain"}) + with urlopen(request, timeout=timeout_s) as response: # nosec B310 + return response.read().decode("utf-8", errors="replace") + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + sorted_values = sorted(values) + rank = (len(sorted_values) - 1) * p + lo = int(rank) + hi = min(lo + 1, len(sorted_values) - 1) + frac = rank - lo + return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac + + +def _build_summary(metric_values: dict[str, list[float]]) -> dict[str, dict[str, float]]: + summary: dict[str, dict[str, float]] = {} + for metric_name, values in metric_values.items(): + if not values: + continue + summary[metric_name] = { + "count": float(len(values)), + "min": min(values), + "max": max(values), + "mean": statistics.fmean(values), + "p50": _percentile(values, 0.50), + "p99": _percentile(values, 0.99), + } + return summary + + +async def scrape_loop( + metrics_url: str, + output_path: Path, + interval_s: float, + duration_s: float, + wide: bool, + summary_json_path: Path | None, +) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + + stop_event = asyncio.Event() + + def _request_stop(*_: object) -> None: + stop_event.set() + + try: + loop = asyncio.get_running_loop() + loop.add_signal_handler(signal.SIGINT, _request_stop) + loop.add_signal_handler(signal.SIGTERM, _request_stop) + except NotImplementedError: + pass + + started_at = time.time() + metric_values: dict[str, list[float]] = {} + + wide_path = output_path.with_name("kv_metrics_wide.csv") + + with output_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["timestamp", "metric_name", "metric_value"]) + + wide_file = None + wide_writer = None + if wide: + wide_file = wide_path.open("w", newline="", encoding="utf-8") + wide_writer = csv.writer(wide_file) + wide_writer.writerow(["timestamp", "metric_name", "metric_value"]) + + try: + while not stop_event.is_set(): + now = time.time() + if duration_s > 0 and (now - started_at) >= duration_s: + break + + try: + raw_text = await asyncio.to_thread(fetch_metrics, metrics_url) + raw_rows = parse_prometheus_rows(raw_text) + samples = parse_prometheus_text(raw_text) + mapped = map_canonical_metrics(samples) + + if wide_writer is not None: + for raw_metric_name, raw_metric_value in raw_rows: + wide_writer.writerow( + [f"{now:.3f}", raw_metric_name, f"{raw_metric_value:.8f}"] + ) + wide_file.flush() + + for metric_name, metric_value in mapped.items(): + writer.writerow([f"{now:.3f}", metric_name, f"{metric_value:.8f}"]) + metric_values.setdefault(metric_name, []).append(metric_value) + f.flush() + except Exception as exc: + writer.writerow([f"{now:.3f}", "collector:error", repr(exc)]) + f.flush() + + await asyncio.sleep(interval_s) + finally: + if wide_file is not None: + wide_file.close() + + if summary_json_path is not None: + summary_json_path.parent.mkdir(parents=True, exist_ok=True) + summary_json_path.write_text( + json.dumps(_build_summary(metric_values), indent=2, sort_keys=True), + encoding="utf-8", + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Scrape Prometheus metrics into CSV") + parser.add_argument( + "--metrics-url", + default="http://0.0.0.0:8888/metrics", + help="Prometheus endpoint URL", + ) + parser.add_argument( + "--output", + default="kv_metrics.csv", + help="CSV output path", + ) + parser.add_argument( + "--interval", 
+        type=float,
+        default=2.0,
+        help="Scrape interval in seconds",
+    )
+    parser.add_argument(
+        "--duration",
+        type=float,
+        default=0.0,
+        help="Optional max duration in seconds (0 means run until interrupted)",
+    )
+    parser.add_argument(
+        "--wide",
+        action="store_true",
+        help="Also scrape all non-comment Prometheus metric lines into kv_metrics_wide.csv",
+    )
+    parser.add_argument(
+        "--summary-json",
+        nargs="?",
+        const="kv_metrics_summary.json",
+        default=None,
+        help="Write per-metric min/max/mean/p50/p99 summary JSON (default: kv_metrics_summary.json)",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    summary_json_path = Path(args.summary_json) if args.summary_json else None
+    asyncio.run(
+        scrape_loop(
+            metrics_url=args.metrics_url,
+            output_path=Path(args.output),
+            interval_s=max(args.interval, 0.1),
+            duration_s=max(args.duration, 0.0),
+            wide=args.wide,
+            summary_json_path=summary_json_path,
+        )
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/datasets/isb1/scripts/plot_pareto.py b/datasets/isb1/scripts/plot_pareto.py
new file mode 100644
index 000000000..964696ad1
--- /dev/null
+++ b/datasets/isb1/scripts/plot_pareto.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Compute Pareto frontier for KV sweep throughput vs p99 TTFT")
+    parser.add_argument("--db-path", default=None, help="SQLite DB path (benchmark_runs)")
+    parser.add_argument("--json-dir", default=None, help="Directory containing sweep summary JSON files")
+    parser.add_argument("--output-dir", required=True, help="Directory for pareto outputs")
+    return parser.parse_args()
+
+
+def _to_float(value: Any) -> float | None:
+    if value in (None, ""):
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def load_rows_from_db(db_path: Path) -> list[dict[str, Any]]:
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    # benchmark_runs (created by isb1_results_db.py) has no max_concurrency
+    # column, so recover concurrency from the stashed raw_result_json instead
+    # of selecting a nonexistent column.
+    rows = conn.execute(
+        """
+        SELECT offload_mode, ttft_p99_ms, throughput_tok_s, raw_result_json
+        FROM benchmark_runs
+        WHERE offload_mode IS NOT NULL
+          AND ttft_p99_ms IS NOT NULL
+          AND throughput_tok_s IS NOT NULL
+        ORDER BY id ASC
+        """
+    ).fetchall()
+    conn.close()
+
+    normalized: list[dict[str, Any]] = []
+    for row in rows:
+        concurrency = None
+        if row["raw_result_json"]:
+            try:
+                payload = json.loads(row["raw_result_json"])
+                concurrency = payload.get("conc") or payload.get("max_concurrency")
+            except Exception:
+                pass
+        normalized.append(
+            {
+                "offload_mode": row["offload_mode"],
+                "concurrency": int(concurrency) if concurrency not in (None, "") else None,
+                "throughput_tok_s": _to_float(row["throughput_tok_s"]),
+                "ttft_p99_ms": _to_float(row["ttft_p99_ms"]),
+                "source": "db",
+            }
+        )
+    return normalized
+
+
+def load_rows_from_json_dir(json_dir: Path) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    for path in sorted(json_dir.glob("*.json")):
+        try:
+            payload = json.loads(path.read_text(encoding="utf-8"))
+        except Exception:
+            continue
+
+        if isinstance(payload, dict) and isinstance(payload.get("summary"), list):
+            for row in payload["summary"]:
+                rows.append(
+                    {
+                        "offload_mode": row.get("offload_mode"),
+                        "concurrency": row.get("concurrency"),
"throughput_tok_s": _to_float(row.get("throughput_tok_s")), + "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), + "source": str(path.name), + } + ) + elif isinstance(payload, list): + for row in payload: + if isinstance(row, dict): + rows.append( + { + "offload_mode": row.get("offload_mode"), + "concurrency": row.get("concurrency"), + "throughput_tok_s": _to_float(row.get("throughput_tok_s")), + "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), + "source": str(path.name), + } + ) + return rows + + +def compute_pareto_frontier(points: list[dict[str, Any]]) -> list[dict[str, Any]]: + valid = [p for p in points if p["throughput_tok_s"] is not None and p["ttft_p99_ms"] is not None] + if not valid: + return [] + + # maximize throughput, minimize ttft_p99_ms + sorted_points = sorted(valid, key=lambda p: (p["throughput_tok_s"], -p["ttft_p99_ms"]), reverse=True) + frontier: list[dict[str, Any]] = [] + best_latency = float("inf") + for point in sorted_points: + latency = point["ttft_p99_ms"] + if latency <= best_latency: + frontier.append(point) + best_latency = latency + return sorted(frontier, key=lambda p: (p["throughput_tok_s"], p["ttft_p99_ms"])) + + +def write_csv(path: Path, rows: list[dict[str, Any]], frontier_keys: set[tuple[str, int | None, float, float]]) -> None: + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms", "is_frontier", "source"]) + for row in rows: + key = (row.get("offload_mode") or "", row.get("concurrency"), row.get("throughput_tok_s") or 0.0, row.get("ttft_p99_ms") or 0.0) + writer.writerow([ + row.get("offload_mode"), + row.get("concurrency"), + row.get("throughput_tok_s"), + row.get("ttft_p99_ms"), + key in frontier_keys, + row.get("source"), + ]) + + +def maybe_write_plot(output_path: Path, grouped_frontiers: dict[str, list[dict[str, Any]]]) -> bool: + try: + import matplotlib.pyplot as plt # type: ignore + except Exception: + return False + + plt.figure(figsize=(10, 6)) + for mode, frontier in sorted(grouped_frontiers.items()): + x = [p["throughput_tok_s"] for p in frontier] + y = [p["ttft_p99_ms"] for p in frontier] + if not x: + continue + plt.plot(x, y, marker="o", label=mode) + plt.xlabel("Throughput (tokens/sec)") + plt.ylabel("p99 TTFT (ms)") + plt.title("Pareto Frontier by Offload Mode") + plt.legend() + plt.grid(True, alpha=0.3) + output_path.parent.mkdir(parents=True, exist_ok=True) + plt.tight_layout() + plt.savefig(output_path) + plt.close() + return True + + +def main() -> int: + args = parse_args() + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if not args.db_path and not args.json_dir: + raise SystemExit("Provide --db-path or --json-dir") + + rows: list[dict[str, Any]] = [] + if args.db_path: + rows.extend(load_rows_from_db(Path(args.db_path))) + if args.json_dir: + rows.extend(load_rows_from_json_dir(Path(args.json_dir))) + + grouped: dict[str, list[dict[str, Any]]] = {} + for row in rows: + mode = row.get("offload_mode") + if not mode: + continue + grouped.setdefault(mode, []).append(row) + + grouped_frontiers: dict[str, list[dict[str, Any]]] = {} + for mode, points in grouped.items(): + grouped_frontiers[mode] = compute_pareto_frontier(points) + + frontier_keys: set[tuple[str, int | None, float, float]] = set() + for mode, frontier in grouped_frontiers.items(): + for point in frontier: + frontier_keys.add((mode, point.get("concurrency"), point.get("throughput_tok_s") or 0.0, 
point.get("ttft_p99_ms") or 0.0)) + + csv_path = output_dir / "pareto_data.csv" + write_csv(csv_path, rows, frontier_keys) + + summary = { + "total_points": len(rows), + "offload_modes": sorted(grouped.keys()), + "frontier": {mode: frontier for mode, frontier in grouped_frontiers.items()}, + } + summary_path = output_dir / "pareto_summary.json" + summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") + + plot_written = maybe_write_plot(output_dir / "pareto_frontier.png", grouped_frontiers) + + print(f"Wrote: {csv_path}") + print(f"Wrote: {summary_path}") + if plot_written: + print(f"Wrote: {output_dir / 'pareto_frontier.png'}") + else: + print("Skipped pareto_frontier.png (matplotlib unavailable)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/README.md b/experimental/README.md index f39dfc4af..8ba1ba9b5 100644 --- a/experimental/README.md +++ b/experimental/README.md @@ -1,5 +1,11 @@ # Experimental -This folder contains experimental WIP code that is mostly Claude Code generated. +This folder contains experimental WIP code and planning material. -**Warning:** Code in this directory is very basic and likely contains errors or incomplete implementations. It is not intended for production use or as part of the official InferenceMAX results. +Relevant roadmap docs: + +For the current official ISB1 support statement, use: +- `datasets/isb1/SUPPORT_MATRIX.md` +- `datasets/isb1/README.md` + +**Warning:** code and notes in this directory may be incomplete, experimental, or future-looking. They are not by themselves the official statement of supported InferenceX ISB1 capability. diff --git a/experimental/multiturn/README.md b/experimental/multiturn/README.md index 05b22f67e..fd9114b37 100644 --- a/experimental/multiturn/README.md +++ b/experimental/multiturn/README.md @@ -1,16 +1,27 @@ -## Experimental WIP: Multi turn with/without CPU KVCache Offloading - -lit review -- https://lmsys.org/blog/2025-09-10-sglang-hicache/ -- sglang calls GPU HBM as (L1) and CPU DRAM as (L2) -- https://lmsys.org/images/blog/hicache/mooncake_benchmark.png -- single turn long context Q&A https://arxiv.org/abs/2311.04939 (seems more like an shared prefix style similar to cascade attention (pre cursor to sglang radix attention )) https://flashinfer.ai/2024/02/02/cascade-inference.html -- synethic & sharegpt vllm multi turn datasets https://github.com/vllm-project/vllm/tree/main/benchmarks/multi_turn -- Production Alibiba Multi turn dataset https://arxiv.org/abs/2506.02634 (seem to not provide the acutal prompts and outputs tho, more just prompt lengths and output lengths, etc.) -- sglang synthetic multi turn benchmark script here https://github.com/sgl-project/sglang/tree/main/benchmark/hicache -- interestingly sglang blog simulates PD disagg via just setting OSL as 1 -- MT-bench https://arxiv.org/abs/2402.14762 -```bash -python3 benchmark/hicache/bench_multiturn.py --model-path $MODEL_PATH --disable-random-sample \ ---output-length 1 --request-length 2048 \ # simulate P-D disaggregation -``` +# Experimental multiturn notes + +This directory contains working notes, investigations, and planning material for multiturn and long-context benchmarking. + +## Official ISB1 replay status lives elsewhere + +Do **not** treat this directory as the source of truth for the currently supported InferenceX ISB1 surface. 
+ +For the official, reviewable statement of what is landed now, use: +- `datasets/isb1/SUPPORT_MATRIX.md` +- `datasets/isb1/README.md` +- `.github/configs/isb1-master.yaml` + +## Relevant roadmap docs + +- `ISB1_MULTITURN_LONG_CONTEXT_CANONICAL_SYNTHESIS_2026-04-09.md` — canonical synthesis for next implementation phases; use this first for planning context. +- `ISB1_INFERENCEX_PHASED_PR_ROADMAP_2026-04-09.md` — phased landing plan used to split schema/workflow/data/extension/polish work into mergeable stages. + +## Scope warning + +Files in this directory may discuss future or experimental directions such as: +- KV offload investigations +- synthetic multiturn ideas +- broader long-context expansion +- experiments outside the currently merged official replay lane + +Those notes are useful for planning, but they are **not** themselves an official support claim. diff --git a/experimental/multiturn/vllm_benchmark/.gitignore b/experimental/multiturn/vllm_benchmark/.gitignore new file mode 100644 index 000000000..5c371b81e --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/.gitignore @@ -0,0 +1,7 @@ +# Python +__pycache__/ +*.pyc + +# Generated artifacts +*.log +*.tmp diff --git a/experimental/multiturn/vllm_benchmark/README.md b/experimental/multiturn/vllm_benchmark/README.md new file mode 100644 index 000000000..b2ea6f175 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/README.md @@ -0,0 +1,33 @@ +# vLLM Benchmark (Experimental) + +This directory tracks the PR #993 parity surface for multi-turn trace replay and KV stress experiments. + +## Trace sources + +- **ISB-1 exports**: existing committed replay exports. +- **kv-cache-tester**: `kv-cache-tester/` is a placeholder for the external trace replay repo. +- **AIPerf synthetic traces**: `aiperf_traces/` provides fallback synthetic traces. + +## Analysis tools + +The parity analysis scripts live under `datasets/isb1/scripts/`: + +- `plot_pareto.py` +- `analyze_benchmark_distributions.py` +- `collect_sweep_results.py` +- `adapt_trace_replay_result.py` + +## LMCache variants + +LMCache launch helpers are under `launch/`: + +- `lmcache_vllm_h200.sh` +- `lmcache_vllm_b200.sh` + +## Per-hardware replay scripts + +Trace replay scripts are under `scripts/` for per-model/per-engine/per-hardware combinations. + +--- + +**Experimental infrastructure. 
Not part of official ISB-1 support matrix.** diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json b/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json new file mode 100644 index 000000000..683556038 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json @@ -0,0 +1,5559 @@ +{ + "sessions": [ + { + "turns": [ + { + "role": "user", + "content_token_count": 4355, + "target_output_tokens": 229 + }, + { + "role": "user", + "content_token_count": 13955, + "target_output_tokens": 384 + }, + { + "role": "user", + "content_token_count": 1941, + "target_output_tokens": 89 + }, + { + "role": "user", + "content_token_count": 11403, + "target_output_tokens": 2247 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13567, + "target_output_tokens": 663 + }, + { + "role": "user", + "content_token_count": 49742, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 13186, + "target_output_tokens": 686 + }, + { + "role": "user", + "content_token_count": 7600, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 5978, + "target_output_tokens": 385 + }, + { + "role": "user", + "content_token_count": 1998, + "target_output_tokens": 706 + }, + { + "role": "user", + "content_token_count": 1582, + "target_output_tokens": 667 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14644, + "target_output_tokens": 467 + }, + { + "role": "user", + "content_token_count": 20321, + "target_output_tokens": 971 + }, + { + "role": "user", + "content_token_count": 2950, + "target_output_tokens": 274 + }, + { + "role": "user", + "content_token_count": 4932, + "target_output_tokens": 680 + }, + { + "role": "user", + "content_token_count": 9971, + "target_output_tokens": 706 + }, + { + "role": "user", + "content_token_count": 3348, + "target_output_tokens": 440 + }, + { + "role": "user", + "content_token_count": 13343, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 6230, + "target_output_tokens": 2231 + }, + { + "role": "user", + "content_token_count": 8168, + "target_output_tokens": 421 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1487, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 2684, + "target_output_tokens": 549 + }, + { + "role": "user", + "content_token_count": 3065, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 12135, + "target_output_tokens": 1145 + }, + { + "role": "user", + "content_token_count": 14716, + "target_output_tokens": 1074 + }, + { + "role": "user", + "content_token_count": 16644, + "target_output_tokens": 1062 + }, + { + "role": "user", + "content_token_count": 12355, + "target_output_tokens": 285 + }, + { + "role": "user", + "content_token_count": 3108, + "target_output_tokens": 291 + }, + { + "role": "user", + "content_token_count": 7234, + "target_output_tokens": 1235 + }, + { + "role": "user", + "content_token_count": 25179, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 6480, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 13902, + "target_output_tokens": 652 + }, + { + "role": "user", + "content_token_count": 6014, + "target_output_tokens": 1037 + }, + { + "role": "user", + "content_token_count": 41352, + "target_output_tokens": 649 + }, + { + "role": "user", + 
"content_token_count": 8852, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 8795, + "target_output_tokens": 736 + }, + { + "role": "user", + "content_token_count": 27778, + "target_output_tokens": 373 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6962, + "target_output_tokens": 1351 + }, + { + "role": "user", + "content_token_count": 2614, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 11529, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 5165, + "target_output_tokens": 653 + }, + { + "role": "user", + "content_token_count": 2132, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 5290, + "target_output_tokens": 614 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23469, + "target_output_tokens": 546 + }, + { + "role": "user", + "content_token_count": 7665, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 27018, + "target_output_tokens": 1332 + }, + { + "role": "user", + "content_token_count": 1887, + "target_output_tokens": 326 + }, + { + "role": "user", + "content_token_count": 5249, + "target_output_tokens": 346 + }, + { + "role": "user", + "content_token_count": 7443, + "target_output_tokens": 828 + }, + { + "role": "user", + "content_token_count": 6496, + "target_output_tokens": 100 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9221, + "target_output_tokens": 430 + }, + { + "role": "user", + "content_token_count": 7697, + "target_output_tokens": 1197 + }, + { + "role": "user", + "content_token_count": 5421, + "target_output_tokens": 277 + }, + { + "role": "user", + "content_token_count": 8799, + "target_output_tokens": 540 + }, + { + "role": "user", + "content_token_count": 14993, + "target_output_tokens": 768 + }, + { + "role": "user", + "content_token_count": 28612, + "target_output_tokens": 581 + }, + { + "role": "user", + "content_token_count": 42160, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 9846, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 15085, + "target_output_tokens": 302 + }, + { + "role": "user", + "content_token_count": 8267, + "target_output_tokens": 596 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23256, + "target_output_tokens": 821 + }, + { + "role": "user", + "content_token_count": 36819, + "target_output_tokens": 183 + }, + { + "role": "user", + "content_token_count": 1590, + "target_output_tokens": 2201 + }, + { + "role": "user", + "content_token_count": 12229, + "target_output_tokens": 1265 + }, + { + "role": "user", + "content_token_count": 7483, + "target_output_tokens": 1819 + }, + { + "role": "user", + "content_token_count": 2288, + "target_output_tokens": 970 + }, + { + "role": "user", + "content_token_count": 33871, + "target_output_tokens": 703 + }, + { + "role": "user", + "content_token_count": 8650, + "target_output_tokens": 147 + }, + { + "role": "user", + "content_token_count": 10018, + "target_output_tokens": 487 + }, + { + "role": "user", + "content_token_count": 21103, + "target_output_tokens": 805 + }, + { + "role": "user", + "content_token_count": 17500, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 1678, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 29345, + "target_output_tokens": 303 + }, + { + "role": "user", + 
"content_token_count": 4555, + "target_output_tokens": 483 + }, + { + "role": "user", + "content_token_count": 39008, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 3284, + "target_output_tokens": 142 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7400, + "target_output_tokens": 948 + }, + { + "role": "user", + "content_token_count": 3992, + "target_output_tokens": 387 + }, + { + "role": "user", + "content_token_count": 8450, + "target_output_tokens": 313 + }, + { + "role": "user", + "content_token_count": 8606, + "target_output_tokens": 89 + }, + { + "role": "user", + "content_token_count": 4775, + "target_output_tokens": 3004 + }, + { + "role": "user", + "content_token_count": 44546, + "target_output_tokens": 758 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10548, + "target_output_tokens": 522 + }, + { + "role": "user", + "content_token_count": 23492, + "target_output_tokens": 463 + }, + { + "role": "user", + "content_token_count": 2803, + "target_output_tokens": 3146 + }, + { + "role": "user", + "content_token_count": 2080, + "target_output_tokens": 257 + }, + { + "role": "user", + "content_token_count": 8416, + "target_output_tokens": 1401 + }, + { + "role": "user", + "content_token_count": 3410, + "target_output_tokens": 4096 + }, + { + "role": "user", + "content_token_count": 20886, + "target_output_tokens": 246 + }, + { + "role": "user", + "content_token_count": 16891, + "target_output_tokens": 111 + }, + { + "role": "user", + "content_token_count": 4933, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 5560, + "target_output_tokens": 634 + }, + { + "role": "user", + "content_token_count": 8380, + "target_output_tokens": 158 + }, + { + "role": "user", + "content_token_count": 17894, + "target_output_tokens": 278 + }, + { + "role": "user", + "content_token_count": 4907, + "target_output_tokens": 312 + }, + { + "role": "user", + "content_token_count": 5810, + "target_output_tokens": 1418 + }, + { + "role": "user", + "content_token_count": 6056, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 6750, + "target_output_tokens": 279 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6845, + "target_output_tokens": 83 + }, + { + "role": "user", + "content_token_count": 3847, + "target_output_tokens": 2093 + }, + { + "role": "user", + "content_token_count": 2327, + "target_output_tokens": 926 + }, + { + "role": "user", + "content_token_count": 11838, + "target_output_tokens": 453 + }, + { + "role": "user", + "content_token_count": 5787, + "target_output_tokens": 1590 + }, + { + "role": "user", + "content_token_count": 16091, + "target_output_tokens": 84 + }, + { + "role": "user", + "content_token_count": 15625, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 24568, + "target_output_tokens": 789 + }, + { + "role": "user", + "content_token_count": 25763, + "target_output_tokens": 605 + }, + { + "role": "user", + "content_token_count": 20307, + "target_output_tokens": 570 + }, + { + "role": "user", + "content_token_count": 6868, + "target_output_tokens": 294 + }, + { + "role": "user", + "content_token_count": 18094, + "target_output_tokens": 170 + }, + { + "role": "user", + "content_token_count": 4778, + "target_output_tokens": 511 + }, + { + "role": "user", + "content_token_count": 3934, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 12163, 
+ "target_output_tokens": 795 + }, + { + "role": "user", + "content_token_count": 12752, + "target_output_tokens": 3072 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17618, + "target_output_tokens": 1691 + }, + { + "role": "user", + "content_token_count": 12217, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 31341, + "target_output_tokens": 777 + }, + { + "role": "user", + "content_token_count": 2248, + "target_output_tokens": 1106 + }, + { + "role": "user", + "content_token_count": 11819, + "target_output_tokens": 812 + }, + { + "role": "user", + "content_token_count": 5636, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 5477, + "target_output_tokens": 403 + }, + { + "role": "user", + "content_token_count": 19604, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 8663, + "target_output_tokens": 865 + }, + { + "role": "user", + "content_token_count": 16969, + "target_output_tokens": 407 + }, + { + "role": "user", + "content_token_count": 22672, + "target_output_tokens": 371 + }, + { + "role": "user", + "content_token_count": 4500, + "target_output_tokens": 257 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6952, + "target_output_tokens": 1454 + }, + { + "role": "user", + "content_token_count": 21170, + "target_output_tokens": 1383 + }, + { + "role": "user", + "content_token_count": 9252, + "target_output_tokens": 209 + }, + { + "role": "user", + "content_token_count": 6023, + "target_output_tokens": 155 + }, + { + "role": "user", + "content_token_count": 30200, + "target_output_tokens": 2025 + }, + { + "role": "user", + "content_token_count": 8146, + "target_output_tokens": 132 + }, + { + "role": "user", + "content_token_count": 15151, + "target_output_tokens": 300 + }, + { + "role": "user", + "content_token_count": 6381, + "target_output_tokens": 739 + }, + { + "role": "user", + "content_token_count": 3225, + "target_output_tokens": 454 + }, + { + "role": "user", + "content_token_count": 5177, + "target_output_tokens": 2094 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17308, + "target_output_tokens": 484 + }, + { + "role": "user", + "content_token_count": 27306, + "target_output_tokens": 413 + }, + { + "role": "user", + "content_token_count": 24589, + "target_output_tokens": 1070 + }, + { + "role": "user", + "content_token_count": 7202, + "target_output_tokens": 256 + }, + { + "role": "user", + "content_token_count": 6018, + "target_output_tokens": 200 + }, + { + "role": "user", + "content_token_count": 3867, + "target_output_tokens": 593 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 16341, + "target_output_tokens": 1754 + }, + { + "role": "user", + "content_token_count": 4374, + "target_output_tokens": 1779 + }, + { + "role": "user", + "content_token_count": 5850, + "target_output_tokens": 290 + }, + { + "role": "user", + "content_token_count": 5391, + "target_output_tokens": 2242 + }, + { + "role": "user", + "content_token_count": 18534, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 1541, + "target_output_tokens": 1352 + }, + { + "role": "user", + "content_token_count": 512, + "target_output_tokens": 917 + }, + { + "role": "user", + "content_token_count": 6840, + "target_output_tokens": 397 + }, + { + "role": "user", + "content_token_count": 4664, + "target_output_tokens": 585 + }, + { + "role": "user", + "content_token_count": 
7184, + "target_output_tokens": 846 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7488, + "target_output_tokens": 545 + }, + { + "role": "user", + "content_token_count": 6149, + "target_output_tokens": 180 + }, + { + "role": "user", + "content_token_count": 18544, + "target_output_tokens": 1062 + }, + { + "role": "user", + "content_token_count": 23779, + "target_output_tokens": 962 + }, + { + "role": "user", + "content_token_count": 7158, + "target_output_tokens": 624 + }, + { + "role": "user", + "content_token_count": 5401, + "target_output_tokens": 264 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6126, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 10891, + "target_output_tokens": 787 + }, + { + "role": "user", + "content_token_count": 7206, + "target_output_tokens": 446 + }, + { + "role": "user", + "content_token_count": 14885, + "target_output_tokens": 534 + }, + { + "role": "user", + "content_token_count": 16761, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 8153, + "target_output_tokens": 322 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6173, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 7491, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 11004, + "target_output_tokens": 522 + }, + { + "role": "user", + "content_token_count": 30822, + "target_output_tokens": 733 + }, + { + "role": "user", + "content_token_count": 16828, + "target_output_tokens": 660 + }, + { + "role": "user", + "content_token_count": 10930, + "target_output_tokens": 2180 + }, + { + "role": "user", + "content_token_count": 9511, + "target_output_tokens": 182 + }, + { + "role": "user", + "content_token_count": 9162, + "target_output_tokens": 683 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 28818, + "target_output_tokens": 245 + }, + { + "role": "user", + "content_token_count": 6134, + "target_output_tokens": 472 + }, + { + "role": "user", + "content_token_count": 6634, + "target_output_tokens": 813 + }, + { + "role": "user", + "content_token_count": 10762, + "target_output_tokens": 182 + }, + { + "role": "user", + "content_token_count": 5519, + "target_output_tokens": 1891 + }, + { + "role": "user", + "content_token_count": 9813, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 27459, + "target_output_tokens": 1087 + }, + { + "role": "user", + "content_token_count": 11085, + "target_output_tokens": 192 + }, + { + "role": "user", + "content_token_count": 13108, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 24568, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 12813, + "target_output_tokens": 800 + }, + { + "role": "user", + "content_token_count": 6876, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 9155, + "target_output_tokens": 4096 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5653, + "target_output_tokens": 908 + }, + { + "role": "user", + "content_token_count": 2275, + "target_output_tokens": 410 + }, + { + "role": "user", + "content_token_count": 3348, + "target_output_tokens": 708 + }, + { + "role": "user", + "content_token_count": 7689, + "target_output_tokens": 448 + }, + { + "role": "user", + "content_token_count": 8998, + "target_output_tokens": 1126 + }, + { + "role": "user", 
+ "content_token_count": 1847, + "target_output_tokens": 1767 + }, + { + "role": "user", + "content_token_count": 5015, + "target_output_tokens": 484 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 37087, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 9919, + "target_output_tokens": 3052 + }, + { + "role": "user", + "content_token_count": 3728, + "target_output_tokens": 265 + }, + { + "role": "user", + "content_token_count": 13398, + "target_output_tokens": 274 + }, + { + "role": "user", + "content_token_count": 5429, + "target_output_tokens": 994 + }, + { + "role": "user", + "content_token_count": 998, + "target_output_tokens": 116 + }, + { + "role": "user", + "content_token_count": 1326, + "target_output_tokens": 718 + }, + { + "role": "user", + "content_token_count": 9401, + "target_output_tokens": 712 + }, + { + "role": "user", + "content_token_count": 9097, + "target_output_tokens": 84 + }, + { + "role": "user", + "content_token_count": 5568, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 29693, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 4150, + "target_output_tokens": 804 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13188, + "target_output_tokens": 1389 + }, + { + "role": "user", + "content_token_count": 20963, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 15129, + "target_output_tokens": 325 + }, + { + "role": "user", + "content_token_count": 7575, + "target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 20166, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 7192, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 10367, + "target_output_tokens": 610 + }, + { + "role": "user", + "content_token_count": 5248, + "target_output_tokens": 157 + }, + { + "role": "user", + "content_token_count": 9240, + "target_output_tokens": 216 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2873, + "target_output_tokens": 154 + }, + { + "role": "user", + "content_token_count": 10140, + "target_output_tokens": 2818 + }, + { + "role": "user", + "content_token_count": 4864, + "target_output_tokens": 1018 + }, + { + "role": "user", + "content_token_count": 10400, + "target_output_tokens": 210 + }, + { + "role": "user", + "content_token_count": 9931, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 19920, + "target_output_tokens": 1335 + }, + { + "role": "user", + "content_token_count": 12765, + "target_output_tokens": 479 + }, + { + "role": "user", + "content_token_count": 16121, + "target_output_tokens": 634 + }, + { + "role": "user", + "content_token_count": 16426, + "target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 8657, + "target_output_tokens": 606 + }, + { + "role": "user", + "content_token_count": 3219, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 3934, + "target_output_tokens": 90 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 29139, + "target_output_tokens": 283 + }, + { + "role": "user", + "content_token_count": 11018, + "target_output_tokens": 2117 + }, + { + "role": "user", + "content_token_count": 12413, + "target_output_tokens": 123 + }, + { + "role": "user", + "content_token_count": 4620, + "target_output_tokens": 1279 + }, + { + "role": "user", 
+ "content_token_count": 14998, + "target_output_tokens": 857 + }, + { + "role": "user", + "content_token_count": 6874, + "target_output_tokens": 377 + }, + { + "role": "user", + "content_token_count": 9962, + "target_output_tokens": 369 + }, + { + "role": "user", + "content_token_count": 35116, + "target_output_tokens": 178 + }, + { + "role": "user", + "content_token_count": 9970, + "target_output_tokens": 516 + }, + { + "role": "user", + "content_token_count": 11643, + "target_output_tokens": 543 + }, + { + "role": "user", + "content_token_count": 14700, + "target_output_tokens": 547 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1351, + "target_output_tokens": 2192 + }, + { + "role": "user", + "content_token_count": 23550, + "target_output_tokens": 200 + }, + { + "role": "user", + "content_token_count": 2511, + "target_output_tokens": 347 + }, + { + "role": "user", + "content_token_count": 20677, + "target_output_tokens": 589 + }, + { + "role": "user", + "content_token_count": 3425, + "target_output_tokens": 1138 + }, + { + "role": "user", + "content_token_count": 22755, + "target_output_tokens": 1462 + }, + { + "role": "user", + "content_token_count": 6087, + "target_output_tokens": 840 + }, + { + "role": "user", + "content_token_count": 9876, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 5481, + "target_output_tokens": 787 + }, + { + "role": "user", + "content_token_count": 4935, + "target_output_tokens": 471 + }, + { + "role": "user", + "content_token_count": 4601, + "target_output_tokens": 373 + }, + { + "role": "user", + "content_token_count": 7449, + "target_output_tokens": 1129 + }, + { + "role": "user", + "content_token_count": 7437, + "target_output_tokens": 664 + }, + { + "role": "user", + "content_token_count": 18022, + "target_output_tokens": 609 + }, + { + "role": "user", + "content_token_count": 6651, + "target_output_tokens": 593 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3803, + "target_output_tokens": 185 + }, + { + "role": "user", + "content_token_count": 4171, + "target_output_tokens": 471 + }, + { + "role": "user", + "content_token_count": 2991, + "target_output_tokens": 2486 + }, + { + "role": "user", + "content_token_count": 11107, + "target_output_tokens": 846 + }, + { + "role": "user", + "content_token_count": 12672, + "target_output_tokens": 1246 + }, + { + "role": "user", + "content_token_count": 9802, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 7244, + "target_output_tokens": 665 + }, + { + "role": "user", + "content_token_count": 11618, + "target_output_tokens": 1037 + }, + { + "role": "user", + "content_token_count": 4494, + "target_output_tokens": 365 + }, + { + "role": "user", + "content_token_count": 3666, + "target_output_tokens": 262 + }, + { + "role": "user", + "content_token_count": 10055, + "target_output_tokens": 395 + }, + { + "role": "user", + "content_token_count": 5900, + "target_output_tokens": 778 + }, + { + "role": "user", + "content_token_count": 2260, + "target_output_tokens": 112 + }, + { + "role": "user", + "content_token_count": 3803, + "target_output_tokens": 1263 + }, + { + "role": "user", + "content_token_count": 38195, + "target_output_tokens": 1187 + }, + { + "role": "user", + "content_token_count": 15430, + "target_output_tokens": 304 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 15126, + "target_output_tokens": 363 + }, + { + "role": "user", + 
"content_token_count": 11997, + "target_output_tokens": 65 + }, + { + "role": "user", + "content_token_count": 12124, + "target_output_tokens": 304 + }, + { + "role": "user", + "content_token_count": 2942, + "target_output_tokens": 722 + }, + { + "role": "user", + "content_token_count": 10438, + "target_output_tokens": 1058 + }, + { + "role": "user", + "content_token_count": 11401, + "target_output_tokens": 517 + }, + { + "role": "user", + "content_token_count": 22839, + "target_output_tokens": 1334 + }, + { + "role": "user", + "content_token_count": 4480, + "target_output_tokens": 409 + }, + { + "role": "user", + "content_token_count": 8627, + "target_output_tokens": 625 + }, + { + "role": "user", + "content_token_count": 2553, + "target_output_tokens": 1775 + }, + { + "role": "user", + "content_token_count": 5008, + "target_output_tokens": 1304 + }, + { + "role": "user", + "content_token_count": 14883, + "target_output_tokens": 920 + }, + { + "role": "user", + "content_token_count": 14845, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 7446, + "target_output_tokens": 116 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1555, + "target_output_tokens": 87 + }, + { + "role": "user", + "content_token_count": 4544, + "target_output_tokens": 466 + }, + { + "role": "user", + "content_token_count": 3256, + "target_output_tokens": 560 + }, + { + "role": "user", + "content_token_count": 3753, + "target_output_tokens": 201 + }, + { + "role": "user", + "content_token_count": 12476, + "target_output_tokens": 1849 + }, + { + "role": "user", + "content_token_count": 8975, + "target_output_tokens": 1635 + }, + { + "role": "user", + "content_token_count": 2877, + "target_output_tokens": 355 + }, + { + "role": "user", + "content_token_count": 4514, + "target_output_tokens": 181 + }, + { + "role": "user", + "content_token_count": 5382, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 3729, + "target_output_tokens": 292 + }, + { + "role": "user", + "content_token_count": 23202, + "target_output_tokens": 850 + }, + { + "role": "user", + "content_token_count": 6266, + "target_output_tokens": 373 + }, + { + "role": "user", + "content_token_count": 2491, + "target_output_tokens": 651 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5699, + "target_output_tokens": 448 + }, + { + "role": "user", + "content_token_count": 8399, + "target_output_tokens": 96 + }, + { + "role": "user", + "content_token_count": 24606, + "target_output_tokens": 892 + }, + { + "role": "user", + "content_token_count": 1881, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 14270, + "target_output_tokens": 302 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2662, + "target_output_tokens": 159 + }, + { + "role": "user", + "content_token_count": 27451, + "target_output_tokens": 742 + }, + { + "role": "user", + "content_token_count": 6138, + "target_output_tokens": 752 + }, + { + "role": "user", + "content_token_count": 3040, + "target_output_tokens": 95 + }, + { + "role": "user", + "content_token_count": 3937, + "target_output_tokens": 394 + }, + { + "role": "user", + "content_token_count": 10143, + "target_output_tokens": 205 + }, + { + "role": "user", + "content_token_count": 4055, + "target_output_tokens": 665 + }, + { + "role": "user", + "content_token_count": 4486, + "target_output_tokens": 491 + } + ] + }, + { + "turns": [ + { + "role": "user", + 
"content_token_count": 11225, + "target_output_tokens": 3158 + }, + { + "role": "user", + "content_token_count": 5709, + "target_output_tokens": 206 + }, + { + "role": "user", + "content_token_count": 8289, + "target_output_tokens": 2061 + }, + { + "role": "user", + "content_token_count": 11501, + "target_output_tokens": 625 + }, + { + "role": "user", + "content_token_count": 3024, + "target_output_tokens": 131 + }, + { + "role": "user", + "content_token_count": 6949, + "target_output_tokens": 743 + }, + { + "role": "user", + "content_token_count": 3555, + "target_output_tokens": 205 + }, + { + "role": "user", + "content_token_count": 4155, + "target_output_tokens": 478 + }, + { + "role": "user", + "content_token_count": 11184, + "target_output_tokens": 279 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 15198, + "target_output_tokens": 865 + }, + { + "role": "user", + "content_token_count": 27300, + "target_output_tokens": 352 + }, + { + "role": "user", + "content_token_count": 4084, + "target_output_tokens": 694 + }, + { + "role": "user", + "content_token_count": 2879, + "target_output_tokens": 643 + }, + { + "role": "user", + "content_token_count": 8411, + "target_output_tokens": 1094 + }, + { + "role": "user", + "content_token_count": 3496, + "target_output_tokens": 845 + }, + { + "role": "user", + "content_token_count": 14540, + "target_output_tokens": 288 + }, + { + "role": "user", + "content_token_count": 4651, + "target_output_tokens": 385 + }, + { + "role": "user", + "content_token_count": 14792, + "target_output_tokens": 842 + }, + { + "role": "user", + "content_token_count": 6271, + "target_output_tokens": 317 + }, + { + "role": "user", + "content_token_count": 7613, + "target_output_tokens": 763 + }, + { + "role": "user", + "content_token_count": 5852, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 11166, + "target_output_tokens": 2196 + }, + { + "role": "user", + "content_token_count": 19005, + "target_output_tokens": 1055 + }, + { + "role": "user", + "content_token_count": 5886, + "target_output_tokens": 492 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4062, + "target_output_tokens": 1211 + }, + { + "role": "user", + "content_token_count": 2190, + "target_output_tokens": 717 + }, + { + "role": "user", + "content_token_count": 7556, + "target_output_tokens": 257 + }, + { + "role": "user", + "content_token_count": 5768, + "target_output_tokens": 1324 + }, + { + "role": "user", + "content_token_count": 5463, + "target_output_tokens": 1404 + }, + { + "role": "user", + "content_token_count": 19173, + "target_output_tokens": 808 + }, + { + "role": "user", + "content_token_count": 7797, + "target_output_tokens": 808 + }, + { + "role": "user", + "content_token_count": 4039, + "target_output_tokens": 414 + }, + { + "role": "user", + "content_token_count": 2391, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 1957, + "target_output_tokens": 1098 + }, + { + "role": "user", + "content_token_count": 16198, + "target_output_tokens": 852 + }, + { + "role": "user", + "content_token_count": 3101, + "target_output_tokens": 532 + }, + { + "role": "user", + "content_token_count": 4035, + "target_output_tokens": 833 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1220, + "target_output_tokens": 138 + }, + { + "role": "user", + "content_token_count": 14648, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 
8228, + "target_output_tokens": 537 + }, + { + "role": "user", + "content_token_count": 2352, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 7794, + "target_output_tokens": 259 + }, + { + "role": "user", + "content_token_count": 2734, + "target_output_tokens": 819 + }, + { + "role": "user", + "content_token_count": 17235, + "target_output_tokens": 1471 + }, + { + "role": "user", + "content_token_count": 1357, + "target_output_tokens": 762 + }, + { + "role": "user", + "content_token_count": 10804, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 16389, + "target_output_tokens": 983 + }, + { + "role": "user", + "content_token_count": 5074, + "target_output_tokens": 431 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10280, + "target_output_tokens": 119 + }, + { + "role": "user", + "content_token_count": 4370, + "target_output_tokens": 817 + }, + { + "role": "user", + "content_token_count": 6854, + "target_output_tokens": 1795 + }, + { + "role": "user", + "content_token_count": 15223, + "target_output_tokens": 543 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6116, + "target_output_tokens": 309 + }, + { + "role": "user", + "content_token_count": 6257, + "target_output_tokens": 1301 + }, + { + "role": "user", + "content_token_count": 16623, + "target_output_tokens": 1520 + }, + { + "role": "user", + "content_token_count": 9563, + "target_output_tokens": 1403 + }, + { + "role": "user", + "content_token_count": 9134, + "target_output_tokens": 840 + }, + { + "role": "user", + "content_token_count": 6453, + "target_output_tokens": 388 + }, + { + "role": "user", + "content_token_count": 2951, + "target_output_tokens": 376 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3444, + "target_output_tokens": 414 + }, + { + "role": "user", + "content_token_count": 2321, + "target_output_tokens": 901 + }, + { + "role": "user", + "content_token_count": 3638, + "target_output_tokens": 1425 + }, + { + "role": "user", + "content_token_count": 7123, + "target_output_tokens": 1696 + }, + { + "role": "user", + "content_token_count": 2057, + "target_output_tokens": 351 + }, + { + "role": "user", + "content_token_count": 18346, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 9716, + "target_output_tokens": 640 + }, + { + "role": "user", + "content_token_count": 6768, + "target_output_tokens": 388 + }, + { + "role": "user", + "content_token_count": 3788, + "target_output_tokens": 250 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2734, + "target_output_tokens": 1979 + }, + { + "role": "user", + "content_token_count": 4136, + "target_output_tokens": 2452 + }, + { + "role": "user", + "content_token_count": 7721, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 1881, + "target_output_tokens": 648 + }, + { + "role": "user", + "content_token_count": 6673, + "target_output_tokens": 406 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6955, + "target_output_tokens": 1459 + }, + { + "role": "user", + "content_token_count": 1014, + "target_output_tokens": 1007 + }, + { + "role": "user", + "content_token_count": 13098, + "target_output_tokens": 1459 + }, + { + "role": "user", + "content_token_count": 4876, + "target_output_tokens": 947 + }, + { + "role": "user", + "content_token_count": 9889, + "target_output_tokens": 1563 + }, + { + "role": "user", + 
"content_token_count": 2544, + "target_output_tokens": 3149 + }, + { + "role": "user", + "content_token_count": 9006, + "target_output_tokens": 245 + }, + { + "role": "user", + "content_token_count": 18694, + "target_output_tokens": 1384 + }, + { + "role": "user", + "content_token_count": 1467, + "target_output_tokens": 1471 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17406, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 3679, + "target_output_tokens": 636 + }, + { + "role": "user", + "content_token_count": 2184, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 7967, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 6174, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 7180, + "target_output_tokens": 270 + }, + { + "role": "user", + "content_token_count": 10946, + "target_output_tokens": 95 + }, + { + "role": "user", + "content_token_count": 2518, + "target_output_tokens": 430 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6603, + "target_output_tokens": 646 + }, + { + "role": "user", + "content_token_count": 10518, + "target_output_tokens": 1096 + }, + { + "role": "user", + "content_token_count": 14848, + "target_output_tokens": 408 + }, + { + "role": "user", + "content_token_count": 2262, + "target_output_tokens": 499 + }, + { + "role": "user", + "content_token_count": 6591, + "target_output_tokens": 662 + }, + { + "role": "user", + "content_token_count": 5042, + "target_output_tokens": 540 + }, + { + "role": "user", + "content_token_count": 14974, + "target_output_tokens": 3408 + }, + { + "role": "user", + "content_token_count": 5658, + "target_output_tokens": 1060 + }, + { + "role": "user", + "content_token_count": 5558, + "target_output_tokens": 1785 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3100, + "target_output_tokens": 849 + }, + { + "role": "user", + "content_token_count": 12776, + "target_output_tokens": 945 + }, + { + "role": "user", + "content_token_count": 2376, + "target_output_tokens": 1003 + }, + { + "role": "user", + "content_token_count": 6865, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 3111, + "target_output_tokens": 509 + }, + { + "role": "user", + "content_token_count": 16078, + "target_output_tokens": 342 + }, + { + "role": "user", + "content_token_count": 16493, + "target_output_tokens": 733 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8957, + "target_output_tokens": 307 + }, + { + "role": "user", + "content_token_count": 19094, + "target_output_tokens": 427 + }, + { + "role": "user", + "content_token_count": 2869, + "target_output_tokens": 405 + }, + { + "role": "user", + "content_token_count": 18384, + "target_output_tokens": 185 + }, + { + "role": "user", + "content_token_count": 6443, + "target_output_tokens": 1522 + }, + { + "role": "user", + "content_token_count": 5348, + "target_output_tokens": 662 + }, + { + "role": "user", + "content_token_count": 3869, + "target_output_tokens": 175 + }, + { + "role": "user", + "content_token_count": 5106, + "target_output_tokens": 761 + }, + { + "role": "user", + "content_token_count": 16260, + "target_output_tokens": 2221 + }, + { + "role": "user", + "content_token_count": 3983, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 2900, + "target_output_tokens": 809 + } + ] + }, + { + "turns": [ + 
{ + "role": "user", + "content_token_count": 4829, + "target_output_tokens": 226 + }, + { + "role": "user", + "content_token_count": 2384, + "target_output_tokens": 491 + }, + { + "role": "user", + "content_token_count": 26292, + "target_output_tokens": 659 + }, + { + "role": "user", + "content_token_count": 12843, + "target_output_tokens": 692 + }, + { + "role": "user", + "content_token_count": 3004, + "target_output_tokens": 300 + }, + { + "role": "user", + "content_token_count": 21070, + "target_output_tokens": 1321 + }, + { + "role": "user", + "content_token_count": 12368, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 6159, + "target_output_tokens": 1480 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5460, + "target_output_tokens": 249 + }, + { + "role": "user", + "content_token_count": 9185, + "target_output_tokens": 229 + }, + { + "role": "user", + "content_token_count": 29343, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 7542, + "target_output_tokens": 1027 + }, + { + "role": "user", + "content_token_count": 3182, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 9888, + "target_output_tokens": 1865 + }, + { + "role": "user", + "content_token_count": 7401, + "target_output_tokens": 854 + }, + { + "role": "user", + "content_token_count": 6561, + "target_output_tokens": 654 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6488, + "target_output_tokens": 77 + }, + { + "role": "user", + "content_token_count": 6158, + "target_output_tokens": 374 + }, + { + "role": "user", + "content_token_count": 12575, + "target_output_tokens": 1325 + }, + { + "role": "user", + "content_token_count": 18730, + "target_output_tokens": 325 + }, + { + "role": "user", + "content_token_count": 2581, + "target_output_tokens": 1027 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 1888 + }, + { + "role": "user", + "content_token_count": 1787, + "target_output_tokens": 970 + }, + { + "role": "user", + "content_token_count": 7304, + "target_output_tokens": 181 + }, + { + "role": "user", + "content_token_count": 4038, + "target_output_tokens": 2854 + }, + { + "role": "user", + "content_token_count": 9441, + "target_output_tokens": 985 + }, + { + "role": "user", + "content_token_count": 5386, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 895, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 3238, + "target_output_tokens": 467 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9749, + "target_output_tokens": 594 + }, + { + "role": "user", + "content_token_count": 6586, + "target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 13734, + "target_output_tokens": 1592 + }, + { + "role": "user", + "content_token_count": 4723, + "target_output_tokens": 2155 + }, + { + "role": "user", + "content_token_count": 19342, + "target_output_tokens": 161 + }, + { + "role": "user", + "content_token_count": 7921, + "target_output_tokens": 130 + }, + { + "role": "user", + "content_token_count": 26045, + "target_output_tokens": 613 + }, + { + "role": "user", + "content_token_count": 9327, + "target_output_tokens": 158 + }, + { + "role": "user", + "content_token_count": 5054, + "target_output_tokens": 652 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 753 + }, + { + "role": "user", + 
"content_token_count": 13763, + "target_output_tokens": 501 + }, + { + "role": "user", + "content_token_count": 7809, + "target_output_tokens": 618 + }, + { + "role": "user", + "content_token_count": 1780, + "target_output_tokens": 1609 + }, + { + "role": "user", + "content_token_count": 13566, + "target_output_tokens": 219 + }, + { + "role": "user", + "content_token_count": 8244, + "target_output_tokens": 707 + }, + { + "role": "user", + "content_token_count": 3690, + "target_output_tokens": 2575 + }, + { + "role": "user", + "content_token_count": 8579, + "target_output_tokens": 289 + }, + { + "role": "user", + "content_token_count": 13461, + "target_output_tokens": 835 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7460, + "target_output_tokens": 564 + }, + { + "role": "user", + "content_token_count": 12306, + "target_output_tokens": 643 + }, + { + "role": "user", + "content_token_count": 4237, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 2239, + "target_output_tokens": 1437 + }, + { + "role": "user", + "content_token_count": 4323, + "target_output_tokens": 1610 + }, + { + "role": "user", + "content_token_count": 8322, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 8307, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 8038, + "target_output_tokens": 221 + }, + { + "role": "user", + "content_token_count": 9312, + "target_output_tokens": 119 + }, + { + "role": "user", + "content_token_count": 8570, + "target_output_tokens": 1070 + }, + { + "role": "user", + "content_token_count": 43634, + "target_output_tokens": 801 + }, + { + "role": "user", + "content_token_count": 9896, + "target_output_tokens": 559 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11595, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 8292, + "target_output_tokens": 942 + }, + { + "role": "user", + "content_token_count": 3946, + "target_output_tokens": 490 + }, + { + "role": "user", + "content_token_count": 2955, + "target_output_tokens": 712 + }, + { + "role": "user", + "content_token_count": 4839, + "target_output_tokens": 272 + }, + { + "role": "user", + "content_token_count": 4011, + "target_output_tokens": 335 + }, + { + "role": "user", + "content_token_count": 5086, + "target_output_tokens": 315 + }, + { + "role": "user", + "content_token_count": 5209, + "target_output_tokens": 764 + }, + { + "role": "user", + "content_token_count": 6710, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 2382, + "target_output_tokens": 277 + }, + { + "role": "user", + "content_token_count": 18762, + "target_output_tokens": 312 + }, + { + "role": "user", + "content_token_count": 3554, + "target_output_tokens": 393 + }, + { + "role": "user", + "content_token_count": 10240, + "target_output_tokens": 130 + }, + { + "role": "user", + "content_token_count": 10301, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 4008, + "target_output_tokens": 461 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 21422, + "target_output_tokens": 346 + }, + { + "role": "user", + "content_token_count": 5246, + "target_output_tokens": 217 + }, + { + "role": "user", + "content_token_count": 13646, + "target_output_tokens": 499 + }, + { + "role": "user", + "content_token_count": 5532, + "target_output_tokens": 249 + }, + { + "role": "user", + "content_token_count": 5178, + 
"target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 1034, + "target_output_tokens": 316 + }, + { + "role": "user", + "content_token_count": 3570, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 9334, + "target_output_tokens": 1761 + }, + { + "role": "user", + "content_token_count": 4071, + "target_output_tokens": 227 + }, + { + "role": "user", + "content_token_count": 11734, + "target_output_tokens": 340 + }, + { + "role": "user", + "content_token_count": 5927, + "target_output_tokens": 302 + }, + { + "role": "user", + "content_token_count": 7918, + "target_output_tokens": 337 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2647, + "target_output_tokens": 301 + }, + { + "role": "user", + "content_token_count": 14271, + "target_output_tokens": 1313 + }, + { + "role": "user", + "content_token_count": 5670, + "target_output_tokens": 954 + }, + { + "role": "user", + "content_token_count": 5014, + "target_output_tokens": 2103 + }, + { + "role": "user", + "content_token_count": 14137, + "target_output_tokens": 997 + }, + { + "role": "user", + "content_token_count": 8872, + "target_output_tokens": 1332 + }, + { + "role": "user", + "content_token_count": 2096, + "target_output_tokens": 4096 + }, + { + "role": "user", + "content_token_count": 16766, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 5742, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 21664, + "target_output_tokens": 696 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3432, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 4013, + "target_output_tokens": 79 + }, + { + "role": "user", + "content_token_count": 23484, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 1546, + "target_output_tokens": 289 + }, + { + "role": "user", + "content_token_count": 4542, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 5260, + "target_output_tokens": 378 + }, + { + "role": "user", + "content_token_count": 5487, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 7881, + "target_output_tokens": 380 + }, + { + "role": "user", + "content_token_count": 3358, + "target_output_tokens": 687 + }, + { + "role": "user", + "content_token_count": 11898, + "target_output_tokens": 180 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 38833, + "target_output_tokens": 534 + }, + { + "role": "user", + "content_token_count": 5781, + "target_output_tokens": 725 + }, + { + "role": "user", + "content_token_count": 7261, + "target_output_tokens": 165 + }, + { + "role": "user", + "content_token_count": 1280, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 5792, + "target_output_tokens": 466 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10544, + "target_output_tokens": 692 + }, + { + "role": "user", + "content_token_count": 15136, + "target_output_tokens": 836 + }, + { + "role": "user", + "content_token_count": 5686, + "target_output_tokens": 1758 + }, + { + "role": "user", + "content_token_count": 12712, + "target_output_tokens": 2240 + }, + { + "role": "user", + "content_token_count": 4875, + "target_output_tokens": 482 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 60523, + "target_output_tokens": 271 + }, + { + "role": "user", + 
"content_token_count": 10297, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 16059, + "target_output_tokens": 648 + }, + { + "role": "user", + "content_token_count": 20684, + "target_output_tokens": 487 + }, + { + "role": "user", + "content_token_count": 6343, + "target_output_tokens": 637 + }, + { + "role": "user", + "content_token_count": 29821, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 2615, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 4564, + "target_output_tokens": 980 + }, + { + "role": "user", + "content_token_count": 7889, + "target_output_tokens": 907 + }, + { + "role": "user", + "content_token_count": 14777, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 5646, + "target_output_tokens": 1521 + }, + { + "role": "user", + "content_token_count": 13268, + "target_output_tokens": 554 + }, + { + "role": "user", + "content_token_count": 10637, + "target_output_tokens": 1013 + }, + { + "role": "user", + "content_token_count": 5757, + "target_output_tokens": 1339 + }, + { + "role": "user", + "content_token_count": 5184, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 12479, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 18012, + "target_output_tokens": 167 + }, + { + "role": "user", + "content_token_count": 14643, + "target_output_tokens": 532 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1938, + "target_output_tokens": 1098 + }, + { + "role": "user", + "content_token_count": 685, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 3023, + "target_output_tokens": 292 + }, + { + "role": "user", + "content_token_count": 26370, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 7935, + "target_output_tokens": 179 + }, + { + "role": "user", + "content_token_count": 2052, + "target_output_tokens": 99 + }, + { + "role": "user", + "content_token_count": 5165, + "target_output_tokens": 747 + }, + { + "role": "user", + "content_token_count": 13734, + "target_output_tokens": 435 + }, + { + "role": "user", + "content_token_count": 979, + "target_output_tokens": 760 + }, + { + "role": "user", + "content_token_count": 4084, + "target_output_tokens": 604 + }, + { + "role": "user", + "content_token_count": 19546, + "target_output_tokens": 183 + }, + { + "role": "user", + "content_token_count": 1609, + "target_output_tokens": 191 + }, + { + "role": "user", + "content_token_count": 3857, + "target_output_tokens": 1024 + }, + { + "role": "user", + "content_token_count": 21131, + "target_output_tokens": 1830 + }, + { + "role": "user", + "content_token_count": 4129, + "target_output_tokens": 343 + }, + { + "role": "user", + "content_token_count": 30740, + "target_output_tokens": 635 + }, + { + "role": "user", + "content_token_count": 10871, + "target_output_tokens": 995 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8416, + "target_output_tokens": 664 + }, + { + "role": "user", + "content_token_count": 6856, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 12991, + "target_output_tokens": 1554 + }, + { + "role": "user", + "content_token_count": 2681, + "target_output_tokens": 1392 + }, + { + "role": "user", + "content_token_count": 2083, + "target_output_tokens": 1322 + }, + { + "role": "user", + "content_token_count": 2529, + 
"target_output_tokens": 862 + }, + { + "role": "user", + "content_token_count": 4854, + "target_output_tokens": 412 + }, + { + "role": "user", + "content_token_count": 5826, + "target_output_tokens": 904 + }, + { + "role": "user", + "content_token_count": 1412, + "target_output_tokens": 197 + }, + { + "role": "user", + "content_token_count": 16884, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 2209, + "target_output_tokens": 370 + }, + { + "role": "user", + "content_token_count": 6010, + "target_output_tokens": 1294 + }, + { + "role": "user", + "content_token_count": 19805, + "target_output_tokens": 2855 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7510, + "target_output_tokens": 354 + }, + { + "role": "user", + "content_token_count": 20508, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 14364, + "target_output_tokens": 234 + }, + { + "role": "user", + "content_token_count": 5578, + "target_output_tokens": 672 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7461, + "target_output_tokens": 2138 + }, + { + "role": "user", + "content_token_count": 8915, + "target_output_tokens": 721 + }, + { + "role": "user", + "content_token_count": 827, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 5858, + "target_output_tokens": 252 + }, + { + "role": "user", + "content_token_count": 3199, + "target_output_tokens": 864 + }, + { + "role": "user", + "content_token_count": 17479, + "target_output_tokens": 387 + }, + { + "role": "user", + "content_token_count": 6488, + "target_output_tokens": 768 + }, + { + "role": "user", + "content_token_count": 11265, + "target_output_tokens": 797 + }, + { + "role": "user", + "content_token_count": 6991, + "target_output_tokens": 802 + }, + { + "role": "user", + "content_token_count": 12962, + "target_output_tokens": 559 + }, + { + "role": "user", + "content_token_count": 6638, + "target_output_tokens": 2509 + }, + { + "role": "user", + "content_token_count": 2297, + "target_output_tokens": 803 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11614, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 3234, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 18001, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 17797, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 15525, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 11380, + "target_output_tokens": 308 + }, + { + "role": "user", + "content_token_count": 20150, + "target_output_tokens": 336 + }, + { + "role": "user", + "content_token_count": 10705, + "target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 5871, + "target_output_tokens": 432 + }, + { + "role": "user", + "content_token_count": 5526, + "target_output_tokens": 406 + }, + { + "role": "user", + "content_token_count": 7675, + "target_output_tokens": 1587 + }, + { + "role": "user", + "content_token_count": 2277, + "target_output_tokens": 1478 + }, + { + "role": "user", + "content_token_count": 9244, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 9135, + "target_output_tokens": 141 + }, + { + "role": "user", + "content_token_count": 6477, + "target_output_tokens": 847 + }, + { + "role": "user", + "content_token_count": 5213, + "target_output_tokens": 381 + 
} + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11902, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 4133, + "target_output_tokens": 763 + }, + { + "role": "user", + "content_token_count": 34974, + "target_output_tokens": 595 + }, + { + "role": "user", + "content_token_count": 3005, + "target_output_tokens": 748 + }, + { + "role": "user", + "content_token_count": 13140, + "target_output_tokens": 1585 + }, + { + "role": "user", + "content_token_count": 10800, + "target_output_tokens": 451 + }, + { + "role": "user", + "content_token_count": 7703, + "target_output_tokens": 308 + }, + { + "role": "user", + "content_token_count": 6180, + "target_output_tokens": 421 + }, + { + "role": "user", + "content_token_count": 7095, + "target_output_tokens": 2469 + }, + { + "role": "user", + "content_token_count": 27521, + "target_output_tokens": 645 + }, + { + "role": "user", + "content_token_count": 14207, + "target_output_tokens": 615 + }, + { + "role": "user", + "content_token_count": 7467, + "target_output_tokens": 736 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 20561, + "target_output_tokens": 111 + }, + { + "role": "user", + "content_token_count": 1000, + "target_output_tokens": 934 + }, + { + "role": "user", + "content_token_count": 32461, + "target_output_tokens": 115 + }, + { + "role": "user", + "content_token_count": 7010, + "target_output_tokens": 128 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 567 + }, + { + "role": "user", + "content_token_count": 9176, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 11138, + "target_output_tokens": 2089 + }, + { + "role": "user", + "content_token_count": 24757, + "target_output_tokens": 204 + }, + { + "role": "user", + "content_token_count": 6580, + "target_output_tokens": 1229 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4856, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 4192, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 7377, + "target_output_tokens": 358 + }, + { + "role": "user", + "content_token_count": 4030, + "target_output_tokens": 437 + }, + { + "role": "user", + "content_token_count": 8482, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 10934, + "target_output_tokens": 397 + }, + { + "role": "user", + "content_token_count": 5271, + "target_output_tokens": 105 + }, + { + "role": "user", + "content_token_count": 1504, + "target_output_tokens": 207 + }, + { + "role": "user", + "content_token_count": 12542, + "target_output_tokens": 497 + }, + { + "role": "user", + "content_token_count": 3169, + "target_output_tokens": 418 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 34022, + "target_output_tokens": 920 + }, + { + "role": "user", + "content_token_count": 4306, + "target_output_tokens": 383 + }, + { + "role": "user", + "content_token_count": 3490, + "target_output_tokens": 1086 + }, + { + "role": "user", + "content_token_count": 3939, + "target_output_tokens": 1038 + }, + { + "role": "user", + "content_token_count": 26508, + "target_output_tokens": 1136 + }, + { + "role": "user", + "content_token_count": 7044, + "target_output_tokens": 3317 + }, + { + "role": "user", + "content_token_count": 2441, + "target_output_tokens": 962 + }, + { + "role": "user", + "content_token_count": 2360, + "target_output_tokens": 442 + 
} + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13707, + "target_output_tokens": 159 + }, + { + "role": "user", + "content_token_count": 3362, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 3014, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 9534, + "target_output_tokens": 430 + }, + { + "role": "user", + "content_token_count": 8037, + "target_output_tokens": 724 + }, + { + "role": "user", + "content_token_count": 12462, + "target_output_tokens": 814 + }, + { + "role": "user", + "content_token_count": 18227, + "target_output_tokens": 371 + }, + { + "role": "user", + "content_token_count": 2077, + "target_output_tokens": 867 + }, + { + "role": "user", + "content_token_count": 10950, + "target_output_tokens": 412 + }, + { + "role": "user", + "content_token_count": 12169, + "target_output_tokens": 331 + }, + { + "role": "user", + "content_token_count": 4436, + "target_output_tokens": 260 + }, + { + "role": "user", + "content_token_count": 2961, + "target_output_tokens": 952 + }, + { + "role": "user", + "content_token_count": 21323, + "target_output_tokens": 1066 + }, + { + "role": "user", + "content_token_count": 14035, + "target_output_tokens": 1134 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14500, + "target_output_tokens": 1813 + }, + { + "role": "user", + "content_token_count": 4751, + "target_output_tokens": 1726 + }, + { + "role": "user", + "content_token_count": 14083, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 2668, + "target_output_tokens": 199 + }, + { + "role": "user", + "content_token_count": 6391, + "target_output_tokens": 3392 + }, + { + "role": "user", + "content_token_count": 33050, + "target_output_tokens": 2319 + }, + { + "role": "user", + "content_token_count": 19617, + "target_output_tokens": 401 + }, + { + "role": "user", + "content_token_count": 9052, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 21741, + "target_output_tokens": 1047 + }, + { + "role": "user", + "content_token_count": 19064, + "target_output_tokens": 340 + }, + { + "role": "user", + "content_token_count": 1184, + "target_output_tokens": 804 + }, + { + "role": "user", + "content_token_count": 50708, + "target_output_tokens": 1268 + }, + { + "role": "user", + "content_token_count": 1043, + "target_output_tokens": 528 + }, + { + "role": "user", + "content_token_count": 7976, + "target_output_tokens": 600 + }, + { + "role": "user", + "content_token_count": 2967, + "target_output_tokens": 193 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4241, + "target_output_tokens": 1292 + }, + { + "role": "user", + "content_token_count": 8073, + "target_output_tokens": 1244 + }, + { + "role": "user", + "content_token_count": 21650, + "target_output_tokens": 603 + }, + { + "role": "user", + "content_token_count": 30704, + "target_output_tokens": 109 + }, + { + "role": "user", + "content_token_count": 3793, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 455 + }, + { + "role": "user", + "content_token_count": 12867, + "target_output_tokens": 244 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5205, + "target_output_tokens": 190 + }, + { + "role": "user", + "content_token_count": 9530, + "target_output_tokens": 323 + }, + { + "role": "user", + "content_token_count": 5813, + "target_output_tokens": 
662 + }, + { + "role": "user", + "content_token_count": 6079, + "target_output_tokens": 710 + }, + { + "role": "user", + "content_token_count": 3766, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 10983, + "target_output_tokens": 419 + }, + { + "role": "user", + "content_token_count": 38098, + "target_output_tokens": 897 + }, + { + "role": "user", + "content_token_count": 7410, + "target_output_tokens": 1273 + }, + { + "role": "user", + "content_token_count": 6534, + "target_output_tokens": 439 + }, + { + "role": "user", + "content_token_count": 2603, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 4395, + "target_output_tokens": 72 + }, + { + "role": "user", + "content_token_count": 6739, + "target_output_tokens": 424 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23588, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 17832, + "target_output_tokens": 506 + }, + { + "role": "user", + "content_token_count": 22461, + "target_output_tokens": 198 + }, + { + "role": "user", + "content_token_count": 10329, + "target_output_tokens": 1380 + }, + { + "role": "user", + "content_token_count": 16613, + "target_output_tokens": 523 + }, + { + "role": "user", + "content_token_count": 18924, + "target_output_tokens": 1091 + }, + { + "role": "user", + "content_token_count": 6640, + "target_output_tokens": 936 + }, + { + "role": "user", + "content_token_count": 5752, + "target_output_tokens": 1079 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 16422, + "target_output_tokens": 611 + }, + { + "role": "user", + "content_token_count": 8736, + "target_output_tokens": 1393 + }, + { + "role": "user", + "content_token_count": 30989, + "target_output_tokens": 357 + }, + { + "role": "user", + "content_token_count": 32378, + "target_output_tokens": 365 + }, + { + "role": "user", + "content_token_count": 4826, + "target_output_tokens": 1142 + }, + { + "role": "user", + "content_token_count": 7705, + "target_output_tokens": 2254 + }, + { + "role": "user", + "content_token_count": 1630, + "target_output_tokens": 1219 + }, + { + "role": "user", + "content_token_count": 5323, + "target_output_tokens": 838 + }, + { + "role": "user", + "content_token_count": 21581, + "target_output_tokens": 654 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8355, + "target_output_tokens": 529 + }, + { + "role": "user", + "content_token_count": 33639, + "target_output_tokens": 650 + }, + { + "role": "user", + "content_token_count": 9794, + "target_output_tokens": 355 + }, + { + "role": "user", + "content_token_count": 5952, + "target_output_tokens": 608 + }, + { + "role": "user", + "content_token_count": 7696, + "target_output_tokens": 163 + }, + { + "role": "user", + "content_token_count": 8151, + "target_output_tokens": 108 + }, + { + "role": "user", + "content_token_count": 11377, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 2795, + "target_output_tokens": 765 + }, + { + "role": "user", + "content_token_count": 8478, + "target_output_tokens": 361 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3254, + "target_output_tokens": 524 + }, + { + "role": "user", + "content_token_count": 13573, + "target_output_tokens": 1371 + }, + { + "role": "user", + "content_token_count": 4347, + "target_output_tokens": 538 + }, + { + "role": "user", + "content_token_count": 52807, + 
"target_output_tokens": 1303 + }, + { + "role": "user", + "content_token_count": 6319, + "target_output_tokens": 278 + }, + { + "role": "user", + "content_token_count": 4295, + "target_output_tokens": 640 + }, + { + "role": "user", + "content_token_count": 2030, + "target_output_tokens": 358 + }, + { + "role": "user", + "content_token_count": 13300, + "target_output_tokens": 504 + }, + { + "role": "user", + "content_token_count": 4151, + "target_output_tokens": 1040 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10729, + "target_output_tokens": 621 + }, + { + "role": "user", + "content_token_count": 6674, + "target_output_tokens": 433 + }, + { + "role": "user", + "content_token_count": 11618, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 13713, + "target_output_tokens": 934 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9731, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 507 + }, + { + "role": "user", + "content_token_count": 3019, + "target_output_tokens": 450 + }, + { + "role": "user", + "content_token_count": 10288, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 22301, + "target_output_tokens": 815 + }, + { + "role": "user", + "content_token_count": 5283, + "target_output_tokens": 275 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3544, + "target_output_tokens": 843 + }, + { + "role": "user", + "content_token_count": 7783, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 2684, + "target_output_tokens": 845 + }, + { + "role": "user", + "content_token_count": 10549, + "target_output_tokens": 275 + }, + { + "role": "user", + "content_token_count": 9460, + "target_output_tokens": 608 + }, + { + "role": "user", + "content_token_count": 3164, + "target_output_tokens": 542 + }, + { + "role": "user", + "content_token_count": 3760, + "target_output_tokens": 494 + }, + { + "role": "user", + "content_token_count": 5991, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 3873, + "target_output_tokens": 800 + }, + { + "role": "user", + "content_token_count": 4054, + "target_output_tokens": 400 + }, + { + "role": "user", + "content_token_count": 3102, + "target_output_tokens": 2786 + }, + { + "role": "user", + "content_token_count": 5452, + "target_output_tokens": 3343 + }, + { + "role": "user", + "content_token_count": 2904, + "target_output_tokens": 483 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2269, + "target_output_tokens": 738 + }, + { + "role": "user", + "content_token_count": 18252, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 16077, + "target_output_tokens": 369 + }, + { + "role": "user", + "content_token_count": 2591, + "target_output_tokens": 1498 + }, + { + "role": "user", + "content_token_count": 955, + "target_output_tokens": 964 + }, + { + "role": "user", + "content_token_count": 15421, + "target_output_tokens": 1148 + }, + { + "role": "user", + "content_token_count": 26417, + "target_output_tokens": 282 + }, + { + "role": "user", + "content_token_count": 2450, + "target_output_tokens": 641 + }, + { + "role": "user", + "content_token_count": 3723, + "target_output_tokens": 1544 + }, + { + "role": "user", + "content_token_count": 24848, + "target_output_tokens": 1652 + }, + { + "role": "user", + "content_token_count": 1198, + 
"target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 3660, + "target_output_tokens": 378 + }, + { + "role": "user", + "content_token_count": 8385, + "target_output_tokens": 971 + }, + { + "role": "user", + "content_token_count": 17089, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 13626, + "target_output_tokens": 1436 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6980, + "target_output_tokens": 779 + }, + { + "role": "user", + "content_token_count": 14266, + "target_output_tokens": 998 + }, + { + "role": "user", + "content_token_count": 19395, + "target_output_tokens": 931 + }, + { + "role": "user", + "content_token_count": 27605, + "target_output_tokens": 864 + }, + { + "role": "user", + "content_token_count": 7245, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 3242, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 2781, + "target_output_tokens": 1296 + }, + { + "role": "user", + "content_token_count": 1676, + "target_output_tokens": 1609 + }, + { + "role": "user", + "content_token_count": 9287, + "target_output_tokens": 1339 + }, + { + "role": "user", + "content_token_count": 7842, + "target_output_tokens": 686 + }, + { + "role": "user", + "content_token_count": 7397, + "target_output_tokens": 133 + }, + { + "role": "user", + "content_token_count": 12946, + "target_output_tokens": 579 + }, + { + "role": "user", + "content_token_count": 6842, + "target_output_tokens": 1282 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14195, + "target_output_tokens": 466 + }, + { + "role": "user", + "content_token_count": 4463, + "target_output_tokens": 558 + }, + { + "role": "user", + "content_token_count": 1089, + "target_output_tokens": 2126 + }, + { + "role": "user", + "content_token_count": 9114, + "target_output_tokens": 483 + }, + { + "role": "user", + "content_token_count": 4745, + "target_output_tokens": 810 + }, + { + "role": "user", + "content_token_count": 11648, + "target_output_tokens": 395 + }, + { + "role": "user", + "content_token_count": 2438, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 15094, + "target_output_tokens": 357 + }, + { + "role": "user", + "content_token_count": 5004, + "target_output_tokens": 1692 + }, + { + "role": "user", + "content_token_count": 17422, + "target_output_tokens": 161 + }, + { + "role": "user", + "content_token_count": 18830, + "target_output_tokens": 350 + }, + { + "role": "user", + "content_token_count": 3203, + "target_output_tokens": 1336 + }, + { + "role": "user", + "content_token_count": 4912, + "target_output_tokens": 1071 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10200, + "target_output_tokens": 315 + }, + { + "role": "user", + "content_token_count": 43481, + "target_output_tokens": 953 + }, + { + "role": "user", + "content_token_count": 6381, + "target_output_tokens": 473 + }, + { + "role": "user", + "content_token_count": 2352, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 11246, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 38916, + "target_output_tokens": 252 + }, + { + "role": "user", + "content_token_count": 29292, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 7163, + "target_output_tokens": 737 + }, + { + "role": "user", + "content_token_count": 4145, + "target_output_tokens": 
316 + }, + { + "role": "user", + "content_token_count": 4769, + "target_output_tokens": 298 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5594, + "target_output_tokens": 1686 + }, + { + "role": "user", + "content_token_count": 4311, + "target_output_tokens": 398 + }, + { + "role": "user", + "content_token_count": 13684, + "target_output_tokens": 419 + }, + { + "role": "user", + "content_token_count": 33855, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 2118, + "target_output_tokens": 1128 + }, + { + "role": "user", + "content_token_count": 2030, + "target_output_tokens": 184 + }, + { + "role": "user", + "content_token_count": 10739, + "target_output_tokens": 561 + }, + { + "role": "user", + "content_token_count": 5555, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 16640, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 23253, + "target_output_tokens": 884 + }, + { + "role": "user", + "content_token_count": 3965, + "target_output_tokens": 740 + }, + { + "role": "user", + "content_token_count": 8551, + "target_output_tokens": 1807 + }, + { + "role": "user", + "content_token_count": 3578, + "target_output_tokens": 766 + }, + { + "role": "user", + "content_token_count": 4639, + "target_output_tokens": 1157 + }, + { + "role": "user", + "content_token_count": 6212, + "target_output_tokens": 437 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5004, + "target_output_tokens": 178 + }, + { + "role": "user", + "content_token_count": 5596, + "target_output_tokens": 867 + }, + { + "role": "user", + "content_token_count": 12366, + "target_output_tokens": 1221 + }, + { + "role": "user", + "content_token_count": 5092, + "target_output_tokens": 167 + }, + { + "role": "user", + "content_token_count": 11259, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 18357, + "target_output_tokens": 1419 + }, + { + "role": "user", + "content_token_count": 12445, + "target_output_tokens": 425 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1753, + "target_output_tokens": 457 + }, + { + "role": "user", + "content_token_count": 4410, + "target_output_tokens": 138 + }, + { + "role": "user", + "content_token_count": 3759, + "target_output_tokens": 295 + }, + { + "role": "user", + "content_token_count": 11816, + "target_output_tokens": 830 + }, + { + "role": "user", + "content_token_count": 16209, + "target_output_tokens": 141 + }, + { + "role": "user", + "content_token_count": 46023, + "target_output_tokens": 2056 + }, + { + "role": "user", + "content_token_count": 5420, + "target_output_tokens": 422 + }, + { + "role": "user", + "content_token_count": 2445, + "target_output_tokens": 2119 + }, + { + "role": "user", + "content_token_count": 3724, + "target_output_tokens": 1277 + }, + { + "role": "user", + "content_token_count": 3168, + "target_output_tokens": 391 + }, + { + "role": "user", + "content_token_count": 9061, + "target_output_tokens": 1199 + }, + { + "role": "user", + "content_token_count": 4255, + "target_output_tokens": 1880 + }, + { + "role": "user", + "content_token_count": 20542, + "target_output_tokens": 449 + }, + { + "role": "user", + "content_token_count": 18541, + "target_output_tokens": 211 + }, + { + "role": "user", + "content_token_count": 17405, + "target_output_tokens": 878 + }, + { + "role": "user", + "content_token_count": 7086, + "target_output_tokens": 396 + }, + { + 
"role": "user", + "content_token_count": 4469, + "target_output_tokens": 189 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4594, + "target_output_tokens": 567 + }, + { + "role": "user", + "content_token_count": 15961, + "target_output_tokens": 276 + }, + { + "role": "user", + "content_token_count": 18817, + "target_output_tokens": 296 + }, + { + "role": "user", + "content_token_count": 8980, + "target_output_tokens": 446 + }, + { + "role": "user", + "content_token_count": 13739, + "target_output_tokens": 476 + }, + { + "role": "user", + "content_token_count": 4954, + "target_output_tokens": 1124 + }, + { + "role": "user", + "content_token_count": 7155, + "target_output_tokens": 2553 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8108, + "target_output_tokens": 337 + }, + { + "role": "user", + "content_token_count": 7213, + "target_output_tokens": 198 + }, + { + "role": "user", + "content_token_count": 6441, + "target_output_tokens": 932 + }, + { + "role": "user", + "content_token_count": 25889, + "target_output_tokens": 494 + }, + { + "role": "user", + "content_token_count": 5672, + "target_output_tokens": 322 + }, + { + "role": "user", + "content_token_count": 6174, + "target_output_tokens": 984 + }, + { + "role": "user", + "content_token_count": 13080, + "target_output_tokens": 594 + }, + { + "role": "user", + "content_token_count": 23119, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 10812, + "target_output_tokens": 939 + }, + { + "role": "user", + "content_token_count": 27801, + "target_output_tokens": 925 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3640, + "target_output_tokens": 108 + }, + { + "role": "user", + "content_token_count": 2053, + "target_output_tokens": 655 + }, + { + "role": "user", + "content_token_count": 16255, + "target_output_tokens": 1911 + }, + { + "role": "user", + "content_token_count": 13439, + "target_output_tokens": 629 + }, + { + "role": "user", + "content_token_count": 25472, + "target_output_tokens": 1323 + }, + { + "role": "user", + "content_token_count": 10114, + "target_output_tokens": 674 + }, + { + "role": "user", + "content_token_count": 1708, + "target_output_tokens": 1493 + }, + { + "role": "user", + "content_token_count": 5384, + "target_output_tokens": 1587 + }, + { + "role": "user", + "content_token_count": 6730, + "target_output_tokens": 408 + }, + { + "role": "user", + "content_token_count": 1746, + "target_output_tokens": 413 + }, + { + "role": "user", + "content_token_count": 1684, + "target_output_tokens": 1349 + }, + { + "role": "user", + "content_token_count": 22551, + "target_output_tokens": 426 + }, + { + "role": "user", + "content_token_count": 10297, + "target_output_tokens": 772 + }, + { + "role": "user", + "content_token_count": 13002, + "target_output_tokens": 1444 + }, + { + "role": "user", + "content_token_count": 16737, + "target_output_tokens": 1199 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7675, + "target_output_tokens": 354 + }, + { + "role": "user", + "content_token_count": 5654, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 946, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 6573, + "target_output_tokens": 1712 + }, + { + "role": "user", + "content_token_count": 47344, + "target_output_tokens": 554 + }, + { + "role": "user", + "content_token_count": 10099, + "target_output_tokens": 1064 + } + 
] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4184, + "target_output_tokens": 213 + }, + { + "role": "user", + "content_token_count": 20020, + "target_output_tokens": 727 + }, + { + "role": "user", + "content_token_count": 5788, + "target_output_tokens": 464 + }, + { + "role": "user", + "content_token_count": 16426, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 6170, + "target_output_tokens": 1080 + }, + { + "role": "user", + "content_token_count": 12316, + "target_output_tokens": 659 + }, + { + "role": "user", + "content_token_count": 2817, + "target_output_tokens": 148 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14649, + "target_output_tokens": 769 + }, + { + "role": "user", + "content_token_count": 13707, + "target_output_tokens": 314 + }, + { + "role": "user", + "content_token_count": 1901, + "target_output_tokens": 480 + }, + { + "role": "user", + "content_token_count": 4892, + "target_output_tokens": 562 + }, + { + "role": "user", + "content_token_count": 18481, + "target_output_tokens": 195 + }, + { + "role": "user", + "content_token_count": 3762, + "target_output_tokens": 564 + }, + { + "role": "user", + "content_token_count": 8463, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 11078, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 1106, + "target_output_tokens": 2149 + }, + { + "role": "user", + "content_token_count": 3393, + "target_output_tokens": 1477 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 285 + }, + { + "role": "user", + "content_token_count": 11370, + "target_output_tokens": 417 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 19821, + "target_output_tokens": 217 + }, + { + "role": "user", + "content_token_count": 20454, + "target_output_tokens": 689 + }, + { + "role": "user", + "content_token_count": 6158, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 10407, + "target_output_tokens": 172 + }, + { + "role": "user", + "content_token_count": 6777, + "target_output_tokens": 244 + }, + { + "role": "user", + "content_token_count": 52928, + "target_output_tokens": 476 + }, + { + "role": "user", + "content_token_count": 42478, + "target_output_tokens": 223 + }, + { + "role": "user", + "content_token_count": 4347, + "target_output_tokens": 593 + }, + { + "role": "user", + "content_token_count": 12237, + "target_output_tokens": 123 + }, + { + "role": "user", + "content_token_count": 17586, + "target_output_tokens": 598 + }, + { + "role": "user", + "content_token_count": 2461, + "target_output_tokens": 501 + }, + { + "role": "user", + "content_token_count": 4825, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 2679, + "target_output_tokens": 2852 + }, + { + "role": "user", + "content_token_count": 7837, + "target_output_tokens": 492 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 277 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5214, + "target_output_tokens": 2004 + }, + { + "role": "user", + "content_token_count": 11163, + "target_output_tokens": 2005 + }, + { + "role": "user", + "content_token_count": 25193, + "target_output_tokens": 211 + }, + { + "role": "user", + "content_token_count": 2010, + "target_output_tokens": 256 + }, + { + "role": "user", + "content_token_count": 9992, + "target_output_tokens": 1115 + 
}, + { + "role": "user", + "content_token_count": 12896, + "target_output_tokens": 623 + }, + { + "role": "user", + "content_token_count": 3791, + "target_output_tokens": 998 + }, + { + "role": "user", + "content_token_count": 8003, + "target_output_tokens": 338 + }, + { + "role": "user", + "content_token_count": 4495, + "target_output_tokens": 552 + }, + { + "role": "user", + "content_token_count": 1634, + "target_output_tokens": 2271 + }, + { + "role": "user", + "content_token_count": 5760, + "target_output_tokens": 97 + }, + { + "role": "user", + "content_token_count": 10434, + "target_output_tokens": 609 + }, + { + "role": "user", + "content_token_count": 23376, + "target_output_tokens": 112 + }, + { + "role": "user", + "content_token_count": 8046, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 1341, + "target_output_tokens": 1666 + }, + { + "role": "user", + "content_token_count": 12979, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 8061, + "target_output_tokens": 463 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14288, + "target_output_tokens": 1379 + }, + { + "role": "user", + "content_token_count": 7502, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 2894, + "target_output_tokens": 68 + }, + { + "role": "user", + "content_token_count": 28437, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 9110, + "target_output_tokens": 780 + }, + { + "role": "user", + "content_token_count": 7833, + "target_output_tokens": 1300 + }, + { + "role": "user", + "content_token_count": 35537, + "target_output_tokens": 227 + }, + { + "role": "user", + "content_token_count": 6575, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 5057, + "target_output_tokens": 747 + }, + { + "role": "user", + "content_token_count": 1020, + "target_output_tokens": 566 + }, + { + "role": "user", + "content_token_count": 29797, + "target_output_tokens": 461 + }, + { + "role": "user", + "content_token_count": 6275, + "target_output_tokens": 244 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5975, + "target_output_tokens": 713 + }, + { + "role": "user", + "content_token_count": 4182, + "target_output_tokens": 813 + }, + { + "role": "user", + "content_token_count": 31157, + "target_output_tokens": 394 + }, + { + "role": "user", + "content_token_count": 5352, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 5323, + "target_output_tokens": 468 + }, + { + "role": "user", + "content_token_count": 8404, + "target_output_tokens": 603 + }, + { + "role": "user", + "content_token_count": 10457, + "target_output_tokens": 528 + }, + { + "role": "user", + "content_token_count": 21616, + "target_output_tokens": 1002 + }, + { + "role": "user", + "content_token_count": 11231, + "target_output_tokens": 266 + }, + { + "role": "user", + "content_token_count": 3555, + "target_output_tokens": 981 + }, + { + "role": "user", + "content_token_count": 2347, + "target_output_tokens": 311 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 512, + "target_output_tokens": 1289 + }, + { + "role": "user", + "content_token_count": 14824, + "target_output_tokens": 595 + }, + { + "role": "user", + "content_token_count": 2459, + "target_output_tokens": 491 + }, + { + "role": "user", + "content_token_count": 5155, + "target_output_tokens": 854 + }, + { + "role": "user", + 
"content_token_count": 1706, + "target_output_tokens": 335 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4693, + "target_output_tokens": 552 + }, + { + "role": "user", + "content_token_count": 3717, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 11640, + "target_output_tokens": 525 + }, + { + "role": "user", + "content_token_count": 7120, + "target_output_tokens": 1424 + }, + { + "role": "user", + "content_token_count": 6218, + "target_output_tokens": 1656 + }, + { + "role": "user", + "content_token_count": 11256, + "target_output_tokens": 3945 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6313, + "target_output_tokens": 1528 + }, + { + "role": "user", + "content_token_count": 5148, + "target_output_tokens": 196 + }, + { + "role": "user", + "content_token_count": 15406, + "target_output_tokens": 461 + }, + { + "role": "user", + "content_token_count": 2451, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 9688, + "target_output_tokens": 847 + }, + { + "role": "user", + "content_token_count": 14736, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 8049, + "target_output_tokens": 1021 + }, + { + "role": "user", + "content_token_count": 5751, + "target_output_tokens": 3843 + }, + { + "role": "user", + "content_token_count": 11137, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 34636, + "target_output_tokens": 895 + }, + { + "role": "user", + "content_token_count": 11915, + "target_output_tokens": 599 + }, + { + "role": "user", + "content_token_count": 8409, + "target_output_tokens": 86 + }, + { + "role": "user", + "content_token_count": 3406, + "target_output_tokens": 2233 + }, + { + "role": "user", + "content_token_count": 15118, + "target_output_tokens": 677 + }, + { + "role": "user", + "content_token_count": 11251, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 7848, + "target_output_tokens": 198 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 19708, + "target_output_tokens": 526 + }, + { + "role": "user", + "content_token_count": 6199, + "target_output_tokens": 262 + }, + { + "role": "user", + "content_token_count": 5688, + "target_output_tokens": 957 + }, + { + "role": "user", + "content_token_count": 8993, + "target_output_tokens": 1558 + }, + { + "role": "user", + "content_token_count": 14718, + "target_output_tokens": 207 + }, + { + "role": "user", + "content_token_count": 10274, + "target_output_tokens": 744 + }, + { + "role": "user", + "content_token_count": 10756, + "target_output_tokens": 330 + }, + { + "role": "user", + "content_token_count": 55245, + "target_output_tokens": 171 + }, + { + "role": "user", + "content_token_count": 14177, + "target_output_tokens": 343 + }, + { + "role": "user", + "content_token_count": 11266, + "target_output_tokens": 370 + }, + { + "role": "user", + "content_token_count": 5359, + "target_output_tokens": 1273 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1649, + "target_output_tokens": 218 + }, + { + "role": "user", + "content_token_count": 8871, + "target_output_tokens": 629 + }, + { + "role": "user", + "content_token_count": 11623, + "target_output_tokens": 247 + }, + { + "role": "user", + "content_token_count": 17643, + "target_output_tokens": 536 + }, + { + "role": "user", + "content_token_count": 1355, + "target_output_tokens": 127 + }, + { + "role": 
"user", + "content_token_count": 10824, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 3760, + "target_output_tokens": 810 + }, + { + "role": "user", + "content_token_count": 13120, + "target_output_tokens": 179 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2614, + "target_output_tokens": 270 + }, + { + "role": "user", + "content_token_count": 4555, + "target_output_tokens": 271 + }, + { + "role": "user", + "content_token_count": 5387, + "target_output_tokens": 216 + }, + { + "role": "user", + "content_token_count": 3338, + "target_output_tokens": 694 + }, + { + "role": "user", + "content_token_count": 9274, + "target_output_tokens": 488 + }, + { + "role": "user", + "content_token_count": 41006, + "target_output_tokens": 1179 + }, + { + "role": "user", + "content_token_count": 11764, + "target_output_tokens": 336 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4551, + "target_output_tokens": 391 + }, + { + "role": "user", + "content_token_count": 7744, + "target_output_tokens": 590 + }, + { + "role": "user", + "content_token_count": 6922, + "target_output_tokens": 1285 + }, + { + "role": "user", + "content_token_count": 15085, + "target_output_tokens": 881 + }, + { + "role": "user", + "content_token_count": 23696, + "target_output_tokens": 380 + }, + { + "role": "user", + "content_token_count": 13825, + "target_output_tokens": 1441 + }, + { + "role": "user", + "content_token_count": 7353, + "target_output_tokens": 686 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4844, + "target_output_tokens": 520 + }, + { + "role": "user", + "content_token_count": 11126, + "target_output_tokens": 170 + }, + { + "role": "user", + "content_token_count": 2742, + "target_output_tokens": 549 + }, + { + "role": "user", + "content_token_count": 4533, + "target_output_tokens": 309 + } + ] + } + ] +} \ No newline at end of file diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py new file mode 100644 index 000000000..ccc51ca7a --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +"""Generate synthetic AIPerf-style trace sessions for kv-cache-tester-compatible replay.""" + +from __future__ import annotations + +import argparse +import json +import math +import random +from pathlib import Path + + +def lognormal_sigma(p50: float, p95: float) -> float: + return math.log(p95 / p50) / 1.645 + + +def sample_tokens(rng: random.Random, p50: float, p95: float, min_v: int, max_v: int) -> int: + sigma = lognormal_sigma(p50, p95) + mu = math.log(p50) + sampled = int(round(rng.lognormvariate(mu, sigma))) + return max(min_v, min(max_v, sampled)) + + +def generate_sessions(count: int, seed: int) -> dict: + rng = random.Random(seed) + sessions = [] + + # Target coding-workload distributions: + # ISL p50~8k, p95~32k + # OSL p50~512, p95~2k + for _ in range(count): + num_turns = rng.randint(4, 18) + turns = [] + for _ in range(num_turns): + turns.append( + { + "role": "user", + "content_token_count": sample_tokens( + rng, + p50=8000, + p95=32000, + min_v=512, + max_v=65536, + ), + "target_output_tokens": sample_tokens( + rng, + p50=512, + p95=2000, + min_v=64, + max_v=4096, + ), + } + ) + sessions.append({"turns": turns}) + + return {"sessions": sessions} + + +def parse_args() -> argparse.Namespace: + 
parser = argparse.ArgumentParser(description="Generate synthetic AIPerf traces") + parser.add_argument("--sessions", type=int, default=100, help="Number of sessions") + parser.add_argument("--seed", type=int, default=993, help="Random seed") + parser.add_argument( + "--output", + type=Path, + default=Path(__file__).with_name("aiperf_synthetic_traces.json"), + help="Output JSON path", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + payload = generate_sessions(args.sessions, args.seed) + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md b/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md new file mode 100644 index 000000000..94731fd42 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md @@ -0,0 +1,11 @@ +# kv-cache-tester placeholder + +This directory should be populated with the external `kv-cache-tester` repository. + +The expected structure includes the trace-replay tooling and the real trace assets used by the experimental multiturn benchmarks. + +## Initialization + +Once access is available, initialize this directory by checking out the kv-cache-tester repository contents here (for example, via an approved submodule setup or a direct-clone workflow owned by the maintainers). + +Do not replace this placeholder with unapproved external URLs in this branch. diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep b/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/experimental/multiturn/vllm_benchmark/launch/README.md b/experimental/multiturn/vllm_benchmark/launch/README.md new file mode 100644 index 000000000..00d33ecba --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/launch/README.md @@ -0,0 +1,19 @@ +# LMCache launch scripts (experimental) + +These scripts launch vLLM with LMCache KV transfer enabled: + +- `lmcache_vllm_h200.sh` +- `lmcache_vllm_b200.sh` + +They are experimental parity utilities and are not wired into the standard InferenceX benchmark dispatch lanes.
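+
+## Example invocation (sketch)
+
+The snippet below is a minimal, hypothetical sketch, not a wired-in dispatch entry: `MODEL` and `TP` are required by `check_env_vars`; `PORT`, `KV_CACHE_DTYPE`, and `MAX_MODEL_LEN` are optional overrides whose defaults are shown in the scripts; the model ID is illustrative only. The scripts also assume the container workspace layout used by the runners (e.g. `/workspace/server.log`).
+
+```bash
+# Hypothetical values; run from the repository root inside the benchmark container.
+MODEL=deepseek-ai/DeepSeek-R1-0528 TP=8 PORT=8888 \
+KV_CACHE_DTYPE=fp8 MAX_MODEL_LEN=131272 \
+bash experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh
+```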
diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh new file mode 100755 index 000000000..f83b4b7f2 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP + +PORT=${PORT:-8888} +SERVER_LOG=/workspace/server.log +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} + +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +python3 -m pip install -q lmcache + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)" +wait "$SERVER_PID" diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh new file mode 100755 index 000000000..f83b4b7f2 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP + +PORT=${PORT:-8888} +SERVER_LOG=/workspace/server.log +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} + +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +python3 -m pip install -q lmcache + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)" +wait "$SERVER_PID" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir 
/workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" 
--max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} 
+TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" 
--support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x 
+python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} 
+BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 847b7ee80..644b2c3a4 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for B200 DGXC Slurm cluster SLURM_PARTITION="gpu" SLURM_ACCOUNT="benchmark" @@ -215,8 +217,7 @@ else HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') - SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 LOCK_FILE="${SQUASH_FILE}.lock" salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" @@ -243,5 +244,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash "$SCRIPT_PATH" + + scancel $JOB_ID fi diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f8c614936..caa1e8364 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -1,8 +1,9 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PORT=8888 # Create unique cache directory based on model parameters @@ -30,13 +31,17 @@ docker run --rm --init --network host --name $server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e 
DP_ATTENTION \ +-e SPEC_DECODING -e DISAGG \ +-e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ +-e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ +-e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" +"$SCRIPT_PATH" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..cbcc7469b 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -1,9 +1,10 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" PARTITION="main" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 UCX_NET_DEVICES=eth0 @@ -17,4 +18,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file +bash "$SCRIPT_PATH" \ No newline at end of file diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 5100419b9..44c46600d 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -1,7 +1,10 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 server_name="bmk-server" @@ -10,9 +13,13 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e EP_SIZE -e DP_ATTENTION -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e SPEC_DECODING -e DISAGG \ +-e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ +-e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ +-e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e 
MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" +"$SCRIPT_PATH" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 49a42e981..bb10dcb6d 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -1,9 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 set -x @@ -31,7 +34,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh +bash "$SCRIPT_PATH" rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..11570289a 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" @@ -230,6 +232,7 @@ else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -247,7 +250,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh + bash "$SCRIPT_PATH" scancel $JOB_ID diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 657f84792..5a49efcc6 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -1,11 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PARTITION="h200" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -44,7 +45,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash "$SCRIPT_PATH" rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff 
--git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..a6f4d2986 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for H200 DGXC Slurm cluster SLURM_PARTITION="main" SLURM_ACCOUNT="sa-shared" @@ -233,6 +235,7 @@ else # Convert pyxis image format (nvcr.io#path) to docker format (nvcr.io/path) for enroot import DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') LOCK_FILE="${SQUASH_FILE}.lock" + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -258,7 +261,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash "$SCRIPT_PATH" scancel $JOB_ID diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..3b697fb51 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,11 +1,12 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PARTITION="main" @@ -19,4 +20,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash "$SCRIPT_PATH" diff --git a/runners/lib_single_node_script.sh b/runners/lib_single_node_script.sh new file mode 100644 index 000000000..194668856 --- /dev/null +++ b/runners/lib_single_node_script.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +resolve_single_node_benchmark_script() { + local model_code="$1" + local precision="$2" + local runner_code="$3" + local framework="${4:-}" + local spec_decoding="${5:-}" + local script_base="benchmarks/single_node/${model_code}_${precision}_${runner_code}" + + if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then + local runtime_candidate="${script_base}_${framework}.sh" + if [[ -f "$runtime_candidate" ]]; then + printf '%s\n' "$runtime_candidate" + return 0 + fi + fi + + local framework_suffix="" + local spec_suffix="" + if [[ "$framework" == "trt" ]]; then + framework_suffix="_trt" + fi + if [[ "$spec_decoding" == "mtp" ]]; then + spec_suffix="_mtp" + fi + + local legacy_candidate="${script_base}${framework_suffix}${spec_suffix}.sh" + if [[ -f "$legacy_candidate" ]]; then + printf '%s\n' "$legacy_candidate" + return 0 + fi + + echo "ERROR: Could not resolve single-node benchmark script." 
>&2 + echo " model=$model_code precision=$precision runner=$runner_code framework=${framework:-} spec_decoding=${spec_decoding:-} benchmark_type=${BENCHMARK_TYPE:-}" >&2 + if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then + echo " checked runtime-aware candidate: ${script_base}_${framework}.sh" >&2 + fi + echo " checked legacy candidate: $legacy_candidate" >&2 + return 1 +} diff --git a/utils/bench_serving/benchmark_export_replay.py b/utils/bench_serving/benchmark_export_replay.py new file mode 100644 index 000000000..c67a5fd41 --- /dev/null +++ b/utils/bench_serving/benchmark_export_replay.py @@ -0,0 +1,1536 @@ +# SPDX-License-Identifier: Apache-2.0 +r"""Replay ISB1 export sessions against OpenAI-compatible inference servers. + +Supported export formats: + - ``inferencex_multiturn`` (direct-ingest session turns) + - ``inferencex_trace_replay`` (event-based trace replay) + +Supported request modes: + - ``chat``: send full message history to ``/v1/chat/completions`` + - ``completions``: project the message history into a single tagged prompt + and send it to ``/v1/completions`` + - ``auto``: prefer chat for standalone vLLM/SGLang cells and completions + for TRT / Dynamo projection cells +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import math +import os +import random +import sys +import time +import warnings +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Optional + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60, sock_read=5 * 60) +DEFAULT_IMAGE_TOKEN_ESTIMATE = 2048 +DEFAULT_FALLBACK_OUTPUT_LEN = 256 +CHAT_NATIVE_RUNTIMES = {"standalone:vllm", "standalone:sglang"} +COMPLETIONS_PREFERRED_RUNTIMES = { + "standalone:trt_llm", + "dynamo:vllm", + "dynamo:sglang", + "dynamo:trt_llm", +} +ROLE_LABELS = { + "system": "SYSTEM", + "user": "USER", + "assistant": "ASSISTANT", + "tool": "TOOL", + "retrieval": "RETRIEVAL", + "execution": "EXECUTION", +} +MODULE_DIR = Path(__file__).resolve().parent +if str(MODULE_DIR) not in sys.path: + sys.path.insert(0, str(MODULE_DIR)) + + +@dataclass +class TurnResult: + turn_idx: int + context_len: int + output_len: int + ttft: float = 0.0 + tpot: float = 0.0 + e2el: float = 0.0 + itl: list[float] = field(default_factory=list) + success: bool = True + error: str = "" + request_mode: str = "chat" + actual_context_len: int = 0 + + +@dataclass +class SessionResult: + session_id: str + turns: list[TurnResult] = field(default_factory=list) + total_input_tokens: int = 0 + total_actual_input_tokens: int = 0 + total_output_tokens: int = 0 + total_duration: float = 0.0 + + +@dataclass +class ReplayTurn: + turn_idx: int + turn_id: Any + output_len: int + wait_before_s: float + context_len: int + actual_context_len: int + chat_messages: list[dict[str, Any]] + completion_prompt: str + + +@dataclass +class ReplaySession: + session_id: str + trace_id: str + runtime_stack_id: str + hardware_profile_id: str + canonical_model_id: str + support_status: str + benchmark_certification_status: str + request_mode: str + adapter_id: str + turns: list[ReplayTurn] + + +def _csv_values(raw: Optional[str]) -> set[str] | None: + if raw is None: + return None + values = 
{item.strip() for item in raw.split(",") if item.strip()} + return values or None + + +def _matches_filter(value: str, allowed: set[str] | None) -> bool: + return allowed is None or value in allowed + + +def _fallback_text_token_count(text: str) -> int: + stripped = (text or "").strip() + if not stripped: + return 0 + return max(1, math.ceil(len(stripped) / 4)) + + +def build_text_token_counter( + tokenizer_id: Optional[str], + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, +) -> Callable[[str], int]: + if not tokenizer_id: + return _fallback_text_token_count + + try: + from backend_request_func import get_tokenizer + + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + ) + except Exception as exc: + warnings.warn( + "Falling back to approximate token counting because tokenizer load " + f"failed for {tokenizer_id!r}: {exc}", + stacklevel=2, + ) + return _fallback_text_token_count + + def _count(text: str) -> int: + return len(tokenizer.encode(text or "", add_special_tokens=False)) + + return _count + + +def _render_block_as_text(block: dict[str, Any]) -> str: + block_type = str(block.get("type", "text")) + text = (block.get("text") or "").strip() + if block_type == "text": + return text + if block_type == "code": + return f"[CODE]\n{text}" if text else "[CODE]" + if block_type == "log": + return f"[LOG]\n{text}" if text else "[LOG]" + if block_type == "document": + label = block.get("asset_path") or block.get("uri") or "" + if text and label: + return f"[DOCUMENT: {label}]\n{text}" + if text: + return f"[DOCUMENT]\n{text}" + return f"[DOCUMENT: {label}]" if label else "[DOCUMENT]" + if block_type == "table": + return f"[TABLE]\n{text}" if text else "[TABLE]" + if block_type == "image": + label = block.get("uri") or block.get("asset_path") or text or "image" + return f"[IMAGE: {label}]" + return text or f"[{block_type.upper()}]" + + +def _extract_message_text(message: dict[str, Any]) -> str: + if isinstance(message.get("content"), str): + body = message["content"] + elif isinstance(message.get("content"), list): + parts: list[str] = [] + for part in message["content"]: + part_type = str(part.get("type", "text")) + if part_type == "text": + parts.append((part.get("text") or "").strip()) + elif part_type == "image_url": + url = "" + if isinstance(part.get("image_url"), dict): + url = part["image_url"].get("url") or "" + parts.append(f"[IMAGE: {url or 'image'}]") + body = "\n\n".join(item for item in parts if item) + else: + content_blocks = message.get("content_blocks") or [] + body = "\n\n".join( + filter(None, (_render_block_as_text(block) for block in content_blocks)) + ) + + role = str(message.get("role", "user")) + if role in {"tool", "retrieval", "execution"}: + prefix = f"[{ROLE_LABELS.get(role, role.upper())} RESULT]" + return f"{prefix}\n{body}" if body else prefix + return body + + +def _message_to_chat_payload(message: dict[str, Any]) -> dict[str, Any]: + role = str(message.get("role", "user")) + projected_role = role if role in {"system", "user", "assistant"} else "user" + content_blocks = message.get("content_blocks") or [] + + if not content_blocks: + return {"role": projected_role, "content": _extract_message_text(message)} + + parts: list[dict[str, Any]] = [] + if role not in {"system", "user", "assistant"}: + parts.append( + { + "type": "text", + "text": f"[{ROLE_LABELS.get(role, role.upper())} RESULT]", + } + ) + + for block in content_blocks: + block_type = str(block.get("type", "text")) + 
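+ # Image blocks that carry a URI pass through as OpenAI-style image_url parts below; every other block type is rendered to tagged plain text.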
if block_type == "image" and block.get("uri"): + parts.append( + { + "type": "image_url", + "image_url": {"url": block["uri"]}, + } + ) + continue + + text = _render_block_as_text(block) + if text: + parts.append({"type": "text", "text": text}) + + if not parts: + return {"role": projected_role, "content": ""} + if len(parts) == 1 and parts[0]["type"] == "text": + return {"role": projected_role, "content": parts[0]["text"]} + return {"role": projected_role, "content": parts} + + +def _message_token_estimate( + message: dict[str, Any], + count_text_tokens: Callable[[str], int], + image_token_estimate: int, +) -> int: + content_blocks = message.get("content_blocks") or [] + if not content_blocks: + return count_text_tokens(_extract_message_text(message)) + + total = 0 + role = str(message.get("role", "user")) + if role in {"tool", "retrieval", "execution"}: + total += count_text_tokens(f"[{ROLE_LABELS.get(role, role.upper())} RESULT]") + + for block in content_blocks: + block_type = str(block.get("type", "text")) + if block_type == "image": + total += int( + block.get("asset_token_count") + or block.get("metadata", {}).get("token_count") + or image_token_estimate + ) + continue + if block.get("asset_token_count") and block.get("asset_path"): + total += int(block["asset_token_count"]) + continue + total += count_text_tokens(_render_block_as_text(block)) + return total + + +def _chat_payload_token_count( + chat_messages: list[dict[str, Any]], + count_text_tokens: Callable[[str], int], +) -> int: + """Count tokens in the rendered chat payload that will actually be sent over HTTP.""" + total = 0 + for msg in chat_messages: + content = msg.get("content", "") + if isinstance(content, str): + total += count_text_tokens(content) + elif isinstance(content, list): + for part in content: + if part.get("type") == "text": + total += count_text_tokens(part.get("text", "")) + elif part.get("type") == "image_url": + total += DEFAULT_IMAGE_TOKEN_ESTIMATE + return total + + +def _messages_to_completion_prompt(messages: list[dict[str, Any]]) -> str: + prompt_parts: list[str] = [] + for message in messages: + role = ROLE_LABELS.get(str(message.get("role", "user")), "USER") + body = _extract_message_text(message).strip() + prompt_parts.append(f"{role}:\n{body}" if body else f"{role}:") + prompt_parts.append("ASSISTANT:\n") + return "\n\n".join(prompt_parts) + + +def resolve_request_mode(runtime_stack_id: str, requested_mode: str) -> str: + if requested_mode != "auto": + return requested_mode + if runtime_stack_id in CHAT_NATIVE_RUNTIMES: + return "chat" + if runtime_stack_id in COMPLETIONS_PREFERRED_RUNTIMES: + return "completions" + return "chat" + + +def _parse_prometheus_sample(line: str) -> tuple[str, float] | None: + """Parse a Prometheus sample line into ``(metric_name, value)``.""" + raw_line = line.strip() + if not raw_line or raw_line.startswith("#"): + return None + + try: + metric_with_labels, raw_value = raw_line.rsplit(maxsplit=1) + metric_name = metric_with_labels.split("{", 1)[0] + return metric_name, float(raw_value) + except (TypeError, ValueError): + return None + + +def _resolve_output_len( + raw_output_len: Any, + fallback_output_len: int, + output_len_cap: Optional[int], +) -> int: + try: + output_len = int(raw_output_len) + except (TypeError, ValueError): + output_len = fallback_output_len + if output_len <= 0: + output_len = fallback_output_len + if output_len_cap is not None: + output_len = min(output_len, output_len_cap) + return output_len + + +def _build_turn_from_messages( + turn_idx: 
int, + turn_id: Any, + messages: list[dict[str, Any]], + output_len: int, + wait_before_s: float, + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, +) -> ReplayTurn: + chat_messages = [_message_to_chat_payload(message) for message in messages] + completion_prompt = _messages_to_completion_prompt(messages) + if request_mode == "chat": + context_len = sum( + _message_token_estimate(message, count_text_tokens, image_token_estimate) + for message in messages + ) + actual_context_len = _chat_payload_token_count(chat_messages, count_text_tokens) + else: + context_len = count_text_tokens(completion_prompt) + actual_context_len = context_len # completions mode already uses rendered text + return ReplayTurn( + turn_idx=turn_idx, + turn_id=turn_id, + output_len=output_len, + wait_before_s=wait_before_s, + context_len=context_len, + actual_context_len=actual_context_len, + chat_messages=chat_messages, + completion_prompt=completion_prompt, + ) + + +def _build_session_from_multiturn_cell( + cell: dict[str, Any], + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, + ignore_waits: bool, + fallback_output_len: int, + output_len_cap: Optional[int], + max_turns_per_session: Optional[int], +) -> ReplaySession: + session = cell["session"] + turns: list[ReplayTurn] = [] + for raw_turn in session.get("turns", []): + turns.append( + _build_turn_from_messages( + turn_idx=int(raw_turn.get("turn_idx", len(turns))), + turn_id=raw_turn.get("turn_id"), + messages=list(raw_turn.get("messages", [])), + output_len=_resolve_output_len( + raw_turn.get("expected_output_tokens"), + fallback_output_len, + output_len_cap, + ), + wait_before_s=0.0 + if ignore_waits + else float(raw_turn.get("wait_before_ms", 0)) / 1000.0, + request_mode=request_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ) + ) + if max_turns_per_session is not None and len(turns) >= max_turns_per_session: + break + + return ReplaySession( + session_id=str(session.get("session_id", cell["trace_id"])), + trace_id=str(cell["trace_id"]), + runtime_stack_id=str(cell["runtime_stack_id"]), + hardware_profile_id=str(cell["hardware_profile_id"]), + canonical_model_id=str(cell["canonical_model_id"]), + support_status=str(cell.get("support_status", "unknown")), + benchmark_certification_status=str( + cell.get("benchmark_certification_status", "unknown") + ), + request_mode=request_mode, + adapter_id="inferencex_multiturn", + turns=turns, + ) + + +def _build_session_from_trace_replay_cell( + cell: dict[str, Any], + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, + ignore_waits: bool, + fallback_output_len: int, + output_len_cap: Optional[int], + max_turns_per_session: Optional[int], +) -> ReplaySession: + turns: list[ReplayTurn] = [] + prior_offset_ms = 0 + for index, event in enumerate(cell.get("events", [])): + offset_ms = int(event.get("arrival_time_offset_ms", 0) or 0) + wait_before_ms = 0 if index == 0 else max(0, offset_ms - prior_offset_ms) + prior_offset_ms = offset_ms + turns.append( + _build_turn_from_messages( + turn_idx=index, + turn_id=event.get("turn_id"), + messages=list(event.get("input_messages", [])), + output_len=_resolve_output_len( + event.get("target_output_tokens"), + fallback_output_len, + output_len_cap, + ), + wait_before_s=0.0 if ignore_waits else wait_before_ms / 1000.0, + request_mode=request_mode, + count_text_tokens=count_text_tokens, + 
image_token_estimate=image_token_estimate, + ) + ) + if max_turns_per_session is not None and len(turns) >= max_turns_per_session: + break + + return ReplaySession( + session_id=str(cell.get("trace_metadata", {}).get("session_id", cell["trace_id"])), + trace_id=str(cell["trace_id"]), + runtime_stack_id=str(cell["runtime_stack_id"]), + hardware_profile_id=str(cell["hardware_profile_id"]), + canonical_model_id=str(cell["canonical_model_id"]), + support_status=str(cell.get("support_status", "unknown")), + benchmark_certification_status=str( + cell.get("benchmark_certification_status", "unknown") + ), + request_mode=request_mode, + adapter_id="inferencex_trace_replay", + turns=turns, + ) + + +def load_replay_sessions( + export_file: str, + count_text_tokens: Callable[[str], int], + runtime_stack_ids: set[str] | None = None, + hardware_profile_ids: set[str] | None = None, + canonical_model_ids: set[str] | None = None, + trace_ids: set[str] | None = None, + support_statuses: set[str] | None = None, + request_mode: str = "auto", + image_token_estimate: int = DEFAULT_IMAGE_TOKEN_ESTIMATE, + ignore_waits: bool = False, + fallback_output_len: int = DEFAULT_FALLBACK_OUTPUT_LEN, + output_len_cap: Optional[int] = None, + session_offset: int = 0, + max_sessions: Optional[int] = None, + max_turns_per_session: Optional[int] = None, + shuffle_sessions: bool = False, + seed: int = 0, + allow_mixed_selection: bool = False, +) -> tuple[list[ReplaySession], dict[str, Any]]: + payload = json.loads(Path(export_file).read_text()) + adapter_id = str(payload.get("adapter_id", "unknown")) + export_cells = list(payload.get("exports", [])) + if adapter_id not in {"inferencex_multiturn", "inferencex_trace_replay"}: + raise ValueError( + f"Unsupported export adapter {adapter_id!r}. Expected " + "'inferencex_multiturn' or 'inferencex_trace_replay'." + ) + + selected_cells = [ + cell + for cell in export_cells + if _matches_filter(str(cell.get("runtime_stack_id", "")), runtime_stack_ids) + and _matches_filter(str(cell.get("hardware_profile_id", "")), hardware_profile_ids) + and _matches_filter(str(cell.get("canonical_model_id", "")), canonical_model_ids) + and _matches_filter(str(cell.get("trace_id", "")), trace_ids) + and _matches_filter(str(cell.get("support_status", "")), support_statuses) + ] + if not selected_cells: + raise ValueError( + "No export cells matched the requested filters. " + "Check runtime_stack_id / hardware_profile_id / canonical_model_id / " + "trace_id / support_status." + ) + + if shuffle_sessions: + random.Random(seed).shuffle(selected_cells) + + if session_offset: + selected_cells = selected_cells[session_offset:] + if max_sessions is not None: + selected_cells = selected_cells[:max_sessions] + if not selected_cells: + raise ValueError("Selection became empty after applying session_offset/max_sessions.") + + uniqueness = { + "runtime_stack_id": sorted({str(cell["runtime_stack_id"]) for cell in selected_cells}), + "hardware_profile_id": sorted({str(cell["hardware_profile_id"]) for cell in selected_cells}), + "canonical_model_id": sorted({str(cell["canonical_model_id"]) for cell in selected_cells}), + } + if not allow_mixed_selection: + mixed_fields = [field for field, values in uniqueness.items() if len(values) > 1] + if mixed_fields: + details = ", ".join(f"{field}={uniqueness[field]}" for field in mixed_fields) + raise ValueError( + "Selected export cells span multiple target server identities; " + f"filter more narrowly or pass --allow-mixed-selection. 
Mixed fields: {details}" + ) + + sessions: list[ReplaySession] = [] + for cell in selected_cells: + resolved_mode = resolve_request_mode(str(cell["runtime_stack_id"]), request_mode) + if adapter_id == "inferencex_multiturn": + sessions.append( + _build_session_from_multiturn_cell( + cell=cell, + request_mode=resolved_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ignore_waits=ignore_waits, + fallback_output_len=fallback_output_len, + output_len_cap=output_len_cap, + max_turns_per_session=max_turns_per_session, + ) + ) + else: + sessions.append( + _build_session_from_trace_replay_cell( + cell=cell, + request_mode=resolved_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ignore_waits=ignore_waits, + fallback_output_len=fallback_output_len, + output_len_cap=output_len_cap, + max_turns_per_session=max_turns_per_session, + ) + ) + + selection_metadata = { + "adapter_id": adapter_id, + "export_file": str(export_file), + "selected_sessions": len(sessions), + "trace_ids": [session.trace_id for session in sessions], + "runtime_stack_ids": sorted({session.runtime_stack_id for session in sessions}), + "hardware_profile_ids": sorted({session.hardware_profile_id for session in sessions}), + "canonical_model_ids": sorted({session.canonical_model_id for session in sessions}), + "support_statuses": sorted({session.support_status for session in sessions}), + "support_status_counts": { + status: sum(1 for session in sessions if session.support_status == status) + for status in sorted({session.support_status for session in sessions}) + }, + "benchmark_certification_statuses": sorted( + {session.benchmark_certification_status for session in sessions} + ), + "benchmark_certification_status_counts": { + status: sum( + 1 + for session in sessions + if session.benchmark_certification_status == status + ) + for status in sorted( + {session.benchmark_certification_status for session in sessions} + ) + }, + "request_mode_mix": { + mode: sum(1 for session in sessions if session.request_mode == mode) + for mode in sorted({session.request_mode for session in sessions}) + }, + } + return sessions, selection_metadata + + +async def _iter_sse_lines( + response: aiohttp.ClientResponse, +): + """Yield individual SSE data payloads from a streaming response. + + Buffers partial lines across TCP chunks and splits multi-line chunks. + Handles the common case where multiple ``data: {...}`` frames arrive + in a single TCP read, or a single frame is split across reads. 
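+ Yields the payload strings with the leading ``data:`` prefix removed, stopping at the ``[DONE]`` sentinel.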
+ """ + buffer = b"" + async for chunk in response.content: + buffer += chunk + while b"\n" in buffer: + line, buffer = buffer.split(b"\n", 1) + line = line.strip() + if not line: + continue + decoded = line.decode("utf-8") + if decoded.startswith(":"): + continue # SSE comment / keep-alive + if decoded.startswith("data: "): + payload_str = decoded[6:].strip() + elif decoded.startswith("data:"): + payload_str = decoded[5:].strip() + else: + continue + if payload_str == "[DONE]": + return + yield payload_str + # Flush remaining buffer + remaining = buffer.strip() + if remaining: + decoded = remaining.decode("utf-8") + for prefix in ("data: ", "data:"): + if decoded.startswith(prefix): + payload_str = decoded[len(prefix):].strip() + if payload_str and payload_str != "[DONE]": + yield payload_str + break + + +async def _stream_chat_request( + api_url: str, + payload: dict[str, Any], + headers: dict[str, str], + context_len: int, + count_text_tokens: Callable[[str], int], + request_mode: str, +) -> tuple[TurnResult, int]: + turn = TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + request_mode=request_mode, + ) + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_text = (await response.text()).strip() + turn.error = f"HTTP {response.status}: {error_text or response.reason}" + return turn, response.status + + async for sse_payload in _iter_sse_lines(response): + data = json.loads(sse_payload) + if choices := data.get("choices"): + delta = choices[0].get("delta", {}) + content = delta.get("content") + if isinstance(content, list): + content = "".join( + part.get("text", "") + for part in content + if isinstance(part, dict) and part.get("type") == "text" + ) + if content: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = timestamp - st + turn.ttft = ttft + else: + turn.itl.append(timestamp - most_recent_timestamp) + most_recent_timestamp = timestamp + generated_text += content + elif usage := data.get("usage"): + turn.output_len = int(usage.get("completion_tokens") or 0) + + turn.e2el = max(0.0, most_recent_timestamp - st) + turn.success = True + if turn.output_len == 0 and generated_text: + turn.output_len = count_text_tokens(generated_text) + if turn.output_len > 1: + turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) + return turn, 200 + + +async def _send_chat_turn( + chat_messages: list[dict[str, Any]], + model_id: str, + model_name: Optional[str], + api_url: str, + output_len: int, + context_len: int, + count_text_tokens: Callable[[str], int], + ignore_eos: bool = False, +) -> TurnResult: + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", + } + payload_base = { + "model": model_name or model_id, + "messages": chat_messages, + "temperature": 0.0, + "stream": True, + "stream_options": {"include_usage": True}, + } + if ignore_eos: + payload_base["ignore_eos"] = True + + errors: list[str] = [] + for max_tokens_key in ("max_completion_tokens", "max_tokens"): + payload = {**payload_base, max_tokens_key: output_len} + turn, status = await _stream_chat_request( + api_url=api_url, + payload=payload, + headers=headers, + context_len=context_len, + count_text_tokens=count_text_tokens, + request_mode="chat", + ) + if 
turn.success: + return turn + errors.append(turn.error) + if status not in {400, 404, 422}: + break + + return TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + error=" | ".join(error for error in errors if error), + request_mode="chat", + ) + + +async def _send_completion_turn( + prompt: str, + model_id: str, + model_name: Optional[str], + api_url: str, + output_len: int, + context_len: int, + count_text_tokens: Callable[[str], int], + ignore_eos: bool = False, +) -> TurnResult: + payload = { + "model": model_name or model_id, + "prompt": prompt, + "temperature": 0.0, + "max_tokens": output_len, + "stream": True, + "stream_options": {"include_usage": True}, + } + if ignore_eos: + payload["ignore_eos"] = True + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", + } + + turn = TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + request_mode="completions", + ) + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + + try: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_text = (await response.text()).strip() + turn.error = f"HTTP {response.status}: {error_text or response.reason}" + return turn + + async for sse_payload in _iter_sse_lines(response): + data = json.loads(sse_payload) + if choices := data.get("choices"): + choice = choices[0] + content = choice.get("text") + if content is None: + delta = choice.get("delta", {}) + content = delta.get("content") + if isinstance(content, list): + content = "".join( + part.get("text", "") + for part in content + if isinstance(part, dict) and part.get("type") == "text" + ) + if content: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = timestamp - st + turn.ttft = ttft + else: + turn.itl.append(timestamp - most_recent_timestamp) + most_recent_timestamp = timestamp + generated_text += content + elif usage := data.get("usage"): + turn.output_len = int(usage.get("completion_tokens") or 0) + except Exception as exc: + turn.error = str(exc) + return turn + + turn.e2el = max(0.0, most_recent_timestamp - st) + turn.success = True + if turn.output_len == 0 and generated_text: + turn.output_len = count_text_tokens(generated_text) + if turn.output_len > 1: + turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) + return turn + + +async def poll_server_metrics(api_url: str, interval: float = 2.0) -> list[dict[str, float]]: + """Poll ``/metrics`` periodically to capture KV / cache status.""" + import urllib.parse + + parsed = urllib.parse.urlparse(api_url) + metrics_url = f"{parsed.scheme}://{parsed.netloc}/metrics" + metrics_history: list[dict[str, float]] = [] + + try: + async with aiohttp.ClientSession(trust_env=True) as session: + while True: + try: + async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5.0)) as response: + if response.status == 200: + text = await response.text() + snapshot: dict[str, float] = {} + for line in text.split("\n"): + parsed_line = _parse_prometheus_sample(line) + if parsed_line is None: + continue + metric_name, metric_value = parsed_line + if metric_name == "vllm:gpu_cache_usage_perc": + snapshot["vllm_gpu_cache_usage"] = metric_value + elif metric_name == "vllm:cpu_cache_usage_perc": + snapshot["vllm_cpu_cache_usage"] = metric_value + elif metric_name 
== "sglang:cache_hit_rate": + snapshot["sglang_cache_hit_rate"] = metric_value + elif metric_name == "sglang:kv_cache_usage": + snapshot["sglang_kv_cache_usage"] = metric_value + elif metric_name == "sglang:token_usage": + snapshot["sglang_token_usage"] = metric_value + elif metric_name == "vllm:num_preemptions_total": + snapshot["vllm_preemptions_total"] = metric_value + elif metric_name == "vllm:num_requests_running": + snapshot["vllm_requests_running"] = metric_value + elif metric_name == "vllm:num_requests_waiting": + snapshot["vllm_requests_waiting"] = metric_value + if snapshot: + metrics_history.append(snapshot) + except Exception: + pass + await asyncio.sleep(interval) + except asyncio.CancelledError: + pass + + return metrics_history + + +def _percentile(values: list[float], percentile: float) -> float: + if not values: + return 0.0 + return float(np.percentile(values, percentile)) + + +def calculate_multiturn_metrics( + session_results: list[SessionResult], + max_turns: int, + selected_percentiles: list[float], +) -> dict[str, Any]: + ms = 1000.0 + per_turn: dict[str, dict[str, Any]] = {} + + for turn_index in range(max_turns): + ttfts: list[float] = [] + tpots: list[float] = [] + e2els: list[float] = [] + context_lens: list[int] = [] + actual_context_lens: list[int] = [] + output_lens: list[int] = [] + successes = 0 + for session in session_results: + if turn_index < len(session.turns): + turn = session.turns[turn_index] + if turn.success: + ttfts.append(turn.ttft) + tpots.append(turn.tpot) + e2els.append(turn.e2el) + context_lens.append(turn.context_len) + actual_context_lens.append(turn.actual_context_len) + output_lens.append(turn.output_len) + successes += 1 + + key = f"turn_{turn_index + 1}" + metrics: dict[str, Any] = { + "completed": successes, + "mean_context_len": float(np.mean(context_lens)) if context_lens else 0.0, + "mean_actual_context_len": float(np.mean(actual_context_lens)) if actual_context_lens else 0.0, + "mean_output_len": float(np.mean(output_lens)) if output_lens else 0.0, + } + for label, values in (("ttft", ttfts), ("tpot", tpots), ("e2el", e2els)): + metrics[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 + metrics[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 + metrics[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 + for percentile in selected_percentiles: + percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) + metrics[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms + per_turn[key] = metrics + + all_ttfts: list[float] = [] + all_tpots: list[float] = [] + all_e2els: list[float] = [] + total_input = 0 + total_actual_input = 0 + total_output = 0 + completed_sessions = 0 + total_wall = 0.0 + max_actual_context_per_turn = 0 + + for session in session_results: + if session.turns and all(turn.success for turn in session.turns): + completed_sessions += 1 + total_input += session.total_input_tokens + total_actual_input += session.total_actual_input_tokens + total_output += session.total_output_tokens + total_wall = max(total_wall, session.total_duration) + for turn in session.turns: + if turn.success: + all_ttfts.append(turn.ttft) + all_tpots.append(turn.tpot) + all_e2els.append(turn.e2el) + if turn.actual_context_len > max_actual_context_per_turn: + max_actual_context_per_turn = turn.actual_context_len + + aggregate: dict[str, Any] = { + "completed_sessions": completed_sessions, + "total_sessions": len(session_results), + 
"total_input_tokens": total_input, + "total_actual_input_tokens": total_actual_input, + "max_actual_context_len_per_turn": max_actual_context_per_turn, + "total_output_tokens": total_output, + "total_wall_time_s": total_wall, + "session_throughput_sps": completed_sessions / total_wall if total_wall > 0 else 0.0, + "output_throughput_tps": total_output / total_wall if total_wall > 0 else 0.0, + "total_token_throughput_tps": (total_input + total_output) / total_wall if total_wall > 0 else 0.0, + } + for label, values in (("ttft", all_ttfts), ("tpot", all_tpots), ("e2el", all_e2els)): + aggregate[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 + aggregate[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 + aggregate[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 + for percentile in selected_percentiles: + percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) + aggregate[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms + + return {"per_turn_metrics": per_turn, "aggregate_metrics": aggregate} + + +async def _run_replay_session( + session: ReplaySession, + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + pbar: Optional[tqdm], + ignore_eos: bool, +) -> SessionResult: + result = SessionResult(session_id=session.session_id) + start = time.perf_counter() + + for replay_turn in session.turns: + if replay_turn.wait_before_s > 0: + await asyncio.sleep(replay_turn.wait_before_s) + + if session.request_mode == "chat": + turn_result = await _send_chat_turn( + chat_messages=replay_turn.chat_messages, + model_id=model_id, + model_name=model_name, + api_url=chat_api_url, + output_len=replay_turn.output_len, + context_len=replay_turn.context_len, + count_text_tokens=count_text_tokens, + ignore_eos=ignore_eos, + ) + else: + turn_result = await _send_completion_turn( + prompt=replay_turn.completion_prompt, + model_id=model_id, + model_name=model_name, + api_url=completion_api_url, + output_len=replay_turn.output_len, + context_len=replay_turn.context_len, + count_text_tokens=count_text_tokens, + ignore_eos=ignore_eos, + ) + + turn_result.turn_idx = replay_turn.turn_idx + turn_result.actual_context_len = replay_turn.actual_context_len + result.turns.append(turn_result) + if turn_result.success: + result.total_input_tokens += turn_result.context_len + result.total_actual_input_tokens += turn_result.actual_context_len + result.total_output_tokens += turn_result.output_len + if pbar is not None: + pbar.update(1) + + result.total_duration = time.perf_counter() - start + return result + + +async def _run_warmup_sessions( + sessions: list[ReplaySession], + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + num_warmup_sessions: int, + ignore_eos: bool, +) -> None: + if num_warmup_sessions <= 0 or not sessions: + return + + print(f"Running {num_warmup_sessions} warmup session(s) (results discarded) ...") + warmup_jobs: list[asyncio.Task[SessionResult]] = [] + for index in range(num_warmup_sessions): + source = sessions[index % len(sessions)] + warmup_turns = [ + ReplayTurn( + turn_idx=turn.turn_idx, + turn_id=turn.turn_id, + output_len=turn.output_len, + wait_before_s=0.0, + context_len=turn.context_len, + actual_context_len=turn.actual_context_len, + chat_messages=turn.chat_messages, + 
completion_prompt=turn.completion_prompt, + ) + for turn in source.turns[: min(2, len(source.turns))] + ] + warmup_jobs.append( + asyncio.create_task( + _run_replay_session( + session=ReplaySession( + session_id=f"warmup-{index}", + trace_id=source.trace_id, + runtime_stack_id=source.runtime_stack_id, + hardware_profile_id=source.hardware_profile_id, + canonical_model_id=source.canonical_model_id, + support_status=source.support_status, + benchmark_certification_status=source.benchmark_certification_status, + request_mode=source.request_mode, + adapter_id=source.adapter_id, + turns=warmup_turns, + ), + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + pbar=None, + ignore_eos=ignore_eos, + ) + ) + ) + + results = await asyncio.gather(*warmup_jobs, return_exceptions=True) + succeeded = sum( + 1 + for result in results + if isinstance(result, SessionResult) and any(turn.success for turn in result.turns) + ) + failed = num_warmup_sessions - succeeded + if failed: + print( + f" ⚠️ {failed}/{num_warmup_sessions} warmup session(s) failed. " + "Check the server endpoint and selected export cell." + ) + else: + print(f" ✅ {succeeded} warmup session(s) completed successfully.") + print() + + +async def run_export_replay_benchmark( + sessions: list[ReplaySession], + selection_metadata: dict[str, Any], + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + max_concurrency: int, + selected_percentiles: list[float], + disable_tqdm: bool, + num_warmup_sessions: int = 1, + ignore_eos: bool = False, +) -> dict[str, Any]: + if not sessions: + raise ValueError("No replay sessions were selected.") + + max_turns = max(len(session.turns) for session in sessions) + total_turns = sum(len(session.turns) for session in sessions) + + print("============================================================") + print(" Export Replay Selection") + print("============================================================") + print(f" Adapter: {selection_metadata['adapter_id']}") + print(f" Sessions selected: {selection_metadata['selected_sessions']}") + print(f" Runtime stack(s): {', '.join(selection_metadata['runtime_stack_ids'])}") + print(f" Hardware profile(s): {', '.join(selection_metadata['hardware_profile_ids'])}") + print(f" Canonical model(s): {', '.join(selection_metadata['canonical_model_ids'])}") + print( + " Support status(es): " + f"{', '.join(selection_metadata['support_statuses'])}" + ) + print( + " Certification status: " + f"{', '.join(selection_metadata['benchmark_certification_statuses'])}" + ) + print(f" Request mode mix: {selection_metadata['request_mode_mix']}") + print(f" Total turns: {total_turns}") + print("============================================================") + print() + + await _run_warmup_sessions( + sessions=sessions, + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + num_warmup_sessions=num_warmup_sessions, + ignore_eos=ignore_eos, + ) + + pbar = None if disable_tqdm else tqdm(total=total_turns, desc="turns") + semaphore = asyncio.Semaphore(max_concurrency) + + async def _limited_run(session: ReplaySession) -> SessionResult: + async with semaphore: + return await _run_replay_session( + session=session, + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + 
completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + pbar=pbar, + ignore_eos=ignore_eos, + ) + + print( + f"Starting export replay benchmark: {len(sessions)} sessions, " + f"max_turns={max_turns}, max_concurrency={max_concurrency}" + ) + benchmark_start = time.perf_counter() + metrics_task = asyncio.create_task(poll_server_metrics(chat_api_url, interval=2.0)) + jobs = [asyncio.create_task(_limited_run(session)) for session in sessions] + session_results = await asyncio.gather(*jobs) + benchmark_duration = time.perf_counter() - benchmark_start + + metrics_task.cancel() + try: + server_metrics = await metrics_task + except asyncio.CancelledError: + server_metrics = [] + + if pbar is not None: + pbar.close() + + metrics = calculate_multiturn_metrics( + session_results=session_results, + max_turns=max_turns, + selected_percentiles=selected_percentiles, + ) + aggregate = metrics["aggregate_metrics"] + per_turn = metrics["per_turn_metrics"] + + cache_usage_avg = 0.0 + cache_hit_rate_avg = 0.0 + gpu_cache_usage_avg = 0.0 + gpu_cache_usage_peak = 0.0 + cpu_cache_usage_avg = 0.0 + cpu_cache_usage_peak = 0.0 + gpu_cache_metric_name: str | None = None + cpu_cache_metric_name: str | None = None + observability_status = "no_cache_metrics" + cpu_samples: list[float] = [] + kv_offload_observed = False + if server_metrics: + vllm_gpu_samples = [ + item["vllm_gpu_cache_usage"] + for item in server_metrics + if "vllm_gpu_cache_usage" in item + ] + sglang_gpu_samples: list[float] = [] + saw_sglang_kv_metric = False + saw_sglang_token_metric = False + for item in server_metrics: + if "sglang_kv_cache_usage" in item: + sglang_gpu_samples.append(item["sglang_kv_cache_usage"]) + saw_sglang_kv_metric = True + elif "sglang_token_usage" in item: + sglang_gpu_samples.append(item["sglang_token_usage"]) + saw_sglang_token_metric = True + + if saw_sglang_kv_metric: + gpu_cache_metric_name = "sglang:kv_cache_usage" + elif saw_sglang_token_metric: + gpu_cache_metric_name = "sglang:token_usage" + + if vllm_gpu_samples: + gpu_samples = vllm_gpu_samples + gpu_cache_metric_name = "vllm:gpu_cache_usage_perc" + else: + gpu_samples = sglang_gpu_samples + + cpu_samples = [ + item["vllm_cpu_cache_usage"] + for item in server_metrics + if "vllm_cpu_cache_usage" in item + ] + if cpu_samples: + cpu_cache_metric_name = "vllm:cpu_cache_usage_perc" + cache_hit_samples = [ + item["sglang_cache_hit_rate"] + for item in server_metrics + if "sglang_cache_hit_rate" in item + ] + + if gpu_samples: + gpu_cache_usage_avg = float(np.mean(gpu_samples)) + gpu_cache_usage_peak = float(np.max(gpu_samples)) + cache_usage_avg = gpu_cache_usage_avg + if cpu_samples: + cpu_cache_usage_avg = float(np.mean(cpu_samples)) + cpu_cache_usage_peak = float(np.max(cpu_samples)) + kv_offload_observed = any(sample > 0.0 for sample in cpu_samples) + if cache_hit_samples: + cache_hit_rate_avg = float(np.mean(cache_hit_samples)) + if cpu_samples: + observability_status = "direct_cpu_cache_metric" + elif gpu_samples or cache_hit_samples: + observability_status = "indirect_without_cpu_cache_metric" + + print() + print("{s:{c}^{n}}".format(s=" Export Replay Benchmark Result ", n=60, c="=")) + print(f" {'Completed sessions:':<35} {aggregate['completed_sessions']}/{aggregate['total_sessions']}") + print(f" {'Benchmark duration (s):':<35} {benchmark_duration:.2f}") + print(f" {'Total input tokens (estimated):':<35} {aggregate['total_input_tokens']}") + print(f" {'Total input tokens (actual sent):':<35} {aggregate['total_actual_input_tokens']}") 
+ print(f" {'Max actual context/turn:':<35} {aggregate['max_actual_context_len_per_turn']}") + print(f" {'Total output tokens:':<35} {aggregate['total_output_tokens']}") + print(f" {'Session throughput (sessions/s):':<35} {aggregate['session_throughput_sps']:.2f}") + print(f" {'Output throughput (tok/s):':<35} {aggregate['output_throughput_tps']:.2f}") + print(f" {'Total throughput (tok/s):':<35} {aggregate['total_token_throughput_tps']:.2f}") + if server_metrics: + print() + print(f" {'Server KV Cache Usage (avg):':<35} {cache_usage_avg:.1%}") + if cpu_cache_metric_name: + print(f" {'Server CPU Cache Usage (avg):':<35} {cpu_cache_usage_avg:.1%}") + if cache_hit_rate_avg > 0: + print(f" {'Prefix Cache Hit Rate (avg):':<35} {cache_hit_rate_avg:.1%}") + if observability_status == "indirect_without_cpu_cache_metric": + print( + f" {'Offload observability:':<35} " + "indirect only (no direct CPU cache metric)" + ) + print() + print("{s:{c}^{n}}".format(s=" Per-Turn TTFT Progression ", n=60, c="-")) + print(f" {'Turn':<8} {'Est Ctx':<10} {'Act Ctx':<10} {'Mean TTFT':<14} {'P99 TTFT':<14} {'Mean E2EL':<14}") + print(f" {'─'*8} {'─'*10} {'─'*10} {'─'*14} {'─'*14} {'─'*14}") + for turn_index in range(max_turns): + key = f"turn_{turn_index + 1}" + if key not in per_turn: + continue + turn_metrics = per_turn[key] + print( + f" {turn_index + 1:<8} " + f"{turn_metrics['mean_context_len']:<10.0f} " + f"{turn_metrics.get('mean_actual_context_len', 0.0):<10.0f} " + f"{turn_metrics['mean_ttft_ms']:<14.1f} " + f"{turn_metrics.get('p99_ttft_ms', 0.0):<14.1f} " + f"{turn_metrics['mean_e2el_ms']:<14.1f}" + ) + print("=" * 60) + + return { + "mode": "export_replay", + "adapter_id": selection_metadata["adapter_id"], + "selection": selection_metadata, + "duration": benchmark_duration, + "num_sessions": len(sessions), + "max_turns": max_turns, + "max_concurrency": max_concurrency, + "num_warmup_sessions": num_warmup_sessions, + "server_metrics_summary": { + "cache_usage_avg": cache_usage_avg, + "cache_hit_rate_avg": cache_hit_rate_avg, + "gpu_cache_usage_avg": gpu_cache_usage_avg, + "gpu_cache_usage_peak": gpu_cache_usage_peak, + "gpu_cache_metric_name": gpu_cache_metric_name, + "cpu_cache_usage_avg": cpu_cache_usage_avg, + "cpu_cache_usage_peak": cpu_cache_usage_peak, + "cpu_cache_metric_name": cpu_cache_metric_name, + "cpu_cache_metric_available": bool(cpu_samples), + "observability_status": observability_status, + # Observability-only signal; not a certification or quality claim. 
+ "kv_offload_observed": kv_offload_observed, + "samples": len(server_metrics), + "preemption_count": int( + max( + (item.get("vllm_preemptions_total", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0, + "peak_requests_running": float( + max( + (item.get("vllm_requests_running", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0.0, + "peak_requests_waiting": float( + max( + (item.get("vllm_requests_waiting", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0.0, + }, + "depth_telemetry": { + "total_estimated_input_tokens": aggregate["total_input_tokens"], + "total_actual_input_tokens": aggregate["total_actual_input_tokens"], + "max_actual_context_len_per_turn": aggregate["max_actual_context_len_per_turn"], + }, + **metrics, + } + + +def main(args: argparse.Namespace) -> None: + random.seed(args.seed) + np.random.seed(args.seed) + + base_url = args.base_url or f"http://{args.host}:{args.port}" + base_url = base_url.rstrip("/") + chat_api_url = args.chat_api_url or f"{base_url}{args.chat_endpoint}" + completion_api_url = args.completion_api_url or f"{base_url}{args.completion_endpoint}" + + tokenizer_id = None if args.skip_tokenizer_load else (args.tokenizer or args.model) + count_text_tokens = build_text_token_counter( + tokenizer_id=tokenizer_id, + tokenizer_mode=args.tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) + sessions, selection_metadata = load_replay_sessions( + export_file=args.export_file, + count_text_tokens=count_text_tokens, + runtime_stack_ids=_csv_values(args.runtime_stack_id), + hardware_profile_ids=_csv_values(args.hardware_profile_id), + canonical_model_ids=_csv_values(args.canonical_model_id), + trace_ids=_csv_values(args.trace_id), + support_statuses=_csv_values(args.support_status), + request_mode=args.request_mode, + image_token_estimate=args.image_token_estimate, + ignore_waits=args.ignore_waits, + fallback_output_len=args.fallback_output_len, + output_len_cap=args.max_output_len, + session_offset=args.session_offset, + max_sessions=args.max_sessions, + max_turns_per_session=args.max_turns_per_session, + shuffle_sessions=args.shuffle_sessions, + seed=args.seed, + allow_mixed_selection=args.allow_mixed_selection, + ) + + result = asyncio.run( + run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection_metadata, + model_id=args.model, + model_name=args.served_model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + max_concurrency=args.max_concurrency, + selected_percentiles=[float(item) for item in args.metric_percentiles.split(",")], + disable_tqdm=args.disable_tqdm, + num_warmup_sessions=args.num_warmup_sessions, + ignore_eos=args.ignore_eos, + ) + ) + + if args.save_result: + result_json: dict[str, Any] = { + "date": datetime.now().strftime("%Y%m%d-%H%M%S"), + "model_id": args.model, + } + if tokenizer_id is not None: + result_json["tokenizer_id"] = tokenizer_id + if args.metadata: + for item in args.metadata: + if "=" in item: + key, value = item.split("=", 1) + result_json[key.strip()] = value.strip() + result_json = {**result_json, **result} + + file_name = args.result_filename or f"export-replay-{Path(args.export_file).stem}.json" + if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) + file_name = os.path.join(args.result_dir, file_name) + + with open(file_name, "w", encoding="utf-8") as handle: + json.dump(result_json, handle, indent=2) + 
print(f"\nResults saved to {file_name}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description=( + "Replay ISB1 export sessions against an OpenAI-compatible server. " + "Supports chat-completions replay for standalone vLLM/SGLang and " + "prompt-projected completions replay for TRT / Dynamo-style cells." + ) + ) + + parser.add_argument("--export-file", type=str, required=True, + help="Path to an inferencex_multiturn or inferencex_trace_replay export JSON") + parser.add_argument("--base-url", type=str, default=None, + help="Server base URL, e.g. http://0.0.0.0:8000") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--chat-endpoint", type=str, default="/v1/chat/completions") + parser.add_argument("--completion-endpoint", type=str, default="/v1/completions") + parser.add_argument("--chat-api-url", type=str, default=None, + help="Override the full chat endpoint URL") + parser.add_argument("--completion-api-url", type=str, default=None, + help="Override the full completions endpoint URL") + + parser.add_argument("--model", type=str, required=True, + help="Model identifier sent to the target server") + parser.add_argument("--served-model-name", type=str, default=None, + help="Served model name if different from --model") + parser.add_argument("--tokenizer", type=str, default=None, + help="Tokenizer name/path if different from --model") + parser.add_argument("--tokenizer-mode", type=str, default="auto", + choices=["auto", "slow", "mistral", "custom"]) + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--skip-tokenizer-load", action="store_true", + help="Use approximate token counting instead of loading a tokenizer") + + parser.add_argument("--runtime-stack-id", type=str, default=None, + help="Comma-separated runtime_stack_id filter(s)") + parser.add_argument("--hardware-profile-id", type=str, default=None, + help="Comma-separated hardware_profile_id filter(s)") + parser.add_argument("--canonical-model-id", type=str, default=None, + help="Comma-separated canonical_model_id filter(s)") + parser.add_argument("--trace-id", type=str, default=None, + help="Comma-separated trace_id filter(s)") + parser.add_argument("--support-status", type=str, default=None, + help="Comma-separated support_status filter(s)") + parser.add_argument("--request-mode", type=str, default="auto", + choices=["auto", "chat", "completions"]) + parser.add_argument("--allow-mixed-selection", action="store_true", + help="Allow multiple runtime/model/hardware identities in one run") + parser.add_argument("--shuffle-sessions", action="store_true") + parser.add_argument("--session-offset", type=int, default=0) + parser.add_argument("--max-sessions", type=int, default=None) + parser.add_argument("--max-turns-per-session", type=int, default=None) + parser.add_argument("--ignore-waits", action="store_true", + help="Ignore export wait_before/arrival-time gaps") + parser.add_argument("--fallback-output-len", type=int, default=DEFAULT_FALLBACK_OUTPUT_LEN, + help="Fallback output length when export metadata is missing") + parser.add_argument("--max-output-len", type=int, default=None, + help="Optional cap applied to each exported target output length") + parser.add_argument("--image-token-estimate", type=int, default=DEFAULT_IMAGE_TOKEN_ESTIMATE, + help="Approximate token cost for image blocks when no explicit token count exists") + + parser.add_argument("--max-concurrency", type=int, 
default=8, + help="Maximum concurrently active replay sessions") + parser.add_argument("--num-warmup-sessions", type=int, default=1, + help="Warmup sessions to prime KV/prefix cache before measurement") + parser.add_argument("--ignore-eos", action="store_true") + + parser.add_argument("--save-result", action="store_true") + parser.add_argument("--result-dir", type=str, default=None) + parser.add_argument("--result-filename", type=str, default=None) + parser.add_argument("--metadata", metavar="KEY=VALUE", nargs="*") + parser.add_argument("--metric-percentiles", type=str, default="90,99,99.9") + + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--disable-tqdm", action="store_true") + + main(parser.parse_args()) diff --git a/utils/gate_isb1.py b/utils/gate_isb1.py new file mode 100644 index 000000000..e223e8c29 --- /dev/null +++ b/utils/gate_isb1.py @@ -0,0 +1,298 @@ +import argparse +import json +from pathlib import Path +from typing import Any, Callable + + +Row = dict[str, Any] +Criterion = tuple[str, Callable[[Row], bool]] + +EXPECTED_131K_COVERAGE = { + ("b200", "vllm"), + ("b200", "sglang"), + ("h100", "vllm"), + ("h100", "sglang"), + ("h200", "vllm"), + ("h200", "sglang"), +} +EXPECTED_1M_COVERAGE = { + ("b200", "vllm"), + ("b200", "sglang"), +} + + +def normalize_hw_label(hw: str | None) -> str: + """Normalize runner labels like h200-cw-1 to coverage labels like h200.""" + if not hw: + return "" + return hw.split("-", 1)[0] + + +def load_rows(report_path: Path) -> list[Row]: + """Load aggregated ISB1 rows from JSON.""" + payload = json.loads(report_path.read_text()) + if isinstance(payload, list): + return [row for row in payload if isinstance(row, dict)] + if isinstance(payload, dict): + return [payload] + raise ValueError(f"Unsupported ISB1 payload type: {type(payload)!r}") + + +def build_row_reference(row: Row, failed_criteria: list[str] | None = None) -> Row: + """Build a concise row reference for gate reports.""" + reference: Row = { + "result_filename": row.get("result_filename"), + "artifact_stems": row.get("artifact_stems") or {}, + "hw": row.get("hw"), + "framework": row.get("framework"), + "infmax_model_prefix": row.get("infmax_model_prefix"), + "support_status": row.get("support_status"), + "context_pressure_status": (row.get("context_pressure_signal") or {}).get("status"), + } + if failed_criteria: + reference["failed_criteria"] = failed_criteria + return reference + + +def completed_sessions_match(row: Row) -> bool: + return row.get("completed_sessions") == row.get("total_sessions") + + +def throughput_positive(row: Row) -> bool: + return float(row.get("session_throughput_sps") or 0.0) > 0.0 + + +def certification_verified(row: Row) -> bool: + return row.get("benchmark_certification_status") == "dataset_replay_verified" + + +def context_not_suspicious(row: Row) -> bool: + return not bool(row.get("context_pressure_suspicious")) + + +def vllm_context_ok(row: Row) -> bool: + if row.get("framework") != "vllm": + return True + signal = row.get("context_pressure_signal") or {} + return signal.get("status") == "ok" and not bool(row.get("context_pressure_suspicious")) + + +def get_present_coverage(rows: list[Row]) -> set[tuple[str, str]]: + return { + (normalize_hw_label(row.get("hw")), row.get("framework", "")) + for row in rows + } + + +def evaluate_gate( + gate_id: str, + label: str, + rows: list[Row], + criteria: list[Criterion], + *, + expected_coverage: set[tuple[str, str]] | None = None, + exact_coverage: bool = False, +) -> Row: + """Evaluate a 
gate definition over matching rows.""" + if not rows: + return { + "id": gate_id, + "label": label, + "status": "no_rows", + "matched_rows": 0, + "failing_rows": [], + "review_required_rows": [], + "missing_coverage": [], + "unexpected_coverage": [], + } + + failing_rows = [] + review_required_rows = [] + for row in rows: + failed_criteria = [description for description, checker in criteria if not checker(row)] + if failed_criteria: + failing_rows.append(build_row_reference(row, failed_criteria)) + signal = row.get("context_pressure_signal") or {} + if signal.get("requires_log_review"): + review_required_rows.append(build_row_reference(row)) + + missing_coverage: list[list[str]] = [] + unexpected_coverage: list[list[str]] = [] + if expected_coverage is not None: + present_coverage = get_present_coverage(rows) + missing_coverage = [list(item) for item in sorted(expected_coverage - present_coverage)] + if exact_coverage: + unexpected_coverage = [list(item) for item in sorted(present_coverage - expected_coverage)] + + status = "pass" + if failing_rows or missing_coverage or unexpected_coverage: + status = "fail" + + return { + "id": gate_id, + "label": label, + "status": status, + "matched_rows": len(rows), + "failing_rows": failing_rows, + "review_required_rows": review_required_rows, + "missing_coverage": missing_coverage, + "unexpected_coverage": unexpected_coverage, + } + + +def build_gate_report(rows: list[Row], advisory: bool = True) -> Row: + """Build the full advisory gate report for an aggregated ISB1 result set.""" + gates = [ + evaluate_gate( + "control_lanes", + "DSR1/GPT-OSS control lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") in {"dsr1", "gptoss"} + and row.get("support_status") == "supported" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("session_throughput_sps > 0", throughput_positive), + ], + ), + evaluate_gate( + "qwen_131k", + "Qwen 131k preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("support_status") == "reviewed_preview" + and (row.get("effective_max_context_depth") or 0) < 200000 + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("session_throughput_sps > 0", throughput_positive), + ], + expected_coverage=EXPECTED_131K_COVERAGE, + ), + evaluate_gate( + "qwen_500k", + "Qwen 500k preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("effective_max_context_depth") == 524288 + and row.get("context_pressure_class") == "extended_500k" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ( + "benchmark_certification_status == dataset_replay_verified", + certification_verified, + ), + ("context_pressure_suspicious == false", context_not_suspicious), + ("vllm context_pressure_signal.status == ok", vllm_context_ok), + ], + ), + evaluate_gate( + "qwen_1m", + "Qwen 1M preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("effective_max_context_depth") == 1048576 + and row.get("context_pressure_class") == "extended_1m" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("context_pressure_suspicious == false", context_not_suspicious), + ("vllm context_pressure_signal.status == ok", vllm_context_ok), + ], + expected_coverage=EXPECTED_1M_COVERAGE, + exact_coverage=True, + ), + ] + + statuses = {gate["status"] for gate in gates} + if "fail" in statuses: + overall = "fail" + 
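+    # Hedged reading of the resolution below: any single failing gate forces
+    # the whole report to "fail"; only a uniform {"pass"} set yields "pass";
+    # a mix that includes "no_rows" gates therefore lands on "partial".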
elif statuses == {"pass"}: + overall = "pass" + else: + overall = "partial" + + return { + "gates": gates, + "overall": overall, + "advisory": advisory, + } + + +def render_markdown(report: Row) -> str: + """Render a concise markdown advisory summary for workflow step summaries.""" + lines = [ + "## ISB1 Advisory Gates", + "", + f"Overall: **{report['overall'].upper()}** ({'advisory' if report['advisory'] else 'strict'})", + "", + ] + + for gate in report["gates"]: + lines.append(f"### {gate['label']} — {gate['status'].upper()}") + lines.append("") + lines.append(f"- Matched rows: {gate['matched_rows']}") + if gate["missing_coverage"]: + formatted = ", ".join(f"{hw}/{framework}" for hw, framework in gate["missing_coverage"]) + lines.append(f"- Missing coverage: {formatted}") + if gate["unexpected_coverage"]: + formatted = ", ".join( + f"{hw}/{framework}" for hw, framework in gate["unexpected_coverage"] + ) + lines.append(f"- Unexpected coverage: {formatted}") + if gate["failing_rows"]: + lines.append("- Failing rows:") + for row in gate["failing_rows"]: + failed_criteria = ", ".join(row.get("failed_criteria", [])) or "unknown" + lines.append( + f" - `{row.get('result_filename', 'unknown')}` ({row.get('hw', '-')}/" + f"{row.get('framework', '-')}) failed: {failed_criteria}" + ) + elif gate["matched_rows"]: + lines.append("- No failing rows.") + if gate["review_required_rows"]: + review_rows = ", ".join( + f"`{row.get('result_filename', 'unknown')}`" for row in gate["review_required_rows"] + ) + lines.append( + "- Manual log review still required for: " + f"{review_rows}" + ) + lines.append("") + + return "\n".join(lines).rstrip() + "\n" + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Evaluate advisory ISB1 gates.") + parser.add_argument("report_path", type=Path) + parser.add_argument("--strict", action="store_true") + parser.add_argument("--format", choices=["json", "markdown"], default="json") + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + report = build_gate_report(load_rows(args.report_path), advisory=not args.strict) + + if args.format == "markdown": + print(render_markdown(report)) + else: + print(json.dumps(report, indent=2)) + + if args.strict and report["overall"] == "fail": + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index bc4562415..14c69d3e9 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -10,7 +10,11 @@ from validation import ( validate_matrix_entry, + validate_isb1_matrix_entry, + validate_isb1_kv_stress_matrix_entry, load_config_files, + load_isb1_config_files, + load_isb1_kv_stress_config_files, load_runner_file, Fields ) @@ -374,6 +378,243 @@ def generate_full_sweep(args, all_config_data, runner_data): return matrix_values +def generate_isb1_sweep(args, all_config_data, runner_data): + """Generate ISB1 replay sweep configurations with optional filtering.""" + if args.runner_type: + valid_runner_types = set(runner_data.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. 
" + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" + ) + + matrix_values = [] + + for _, val in all_config_data.items(): + if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: + continue + + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + benchmark_type = val[Fields.BENCHMARK_TYPE.value] + runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] + hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] + canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] + max_model_len = val.get(Fields.MAX_MODEL_LEN.value) + + runner_nodes_to_use = None + if args.runner_node_filter: + runner_nodes = runner_data.get(runner, []) + runner_nodes_to_use = [ + node for node in runner_nodes if args.runner_node_filter in node + ] + if not runner_nodes_to_use: + continue + + replay_configs = val[Fields.REPLAY_CONFIGS.value] + for replay_config in replay_configs: + export_file = replay_config[Fields.EXPORT_FILE.value] + request_mode = replay_config[Fields.REQUEST_MODE.value] + support_status = replay_config.get(Fields.SUPPORT_STATUS.value) + + for replay_space in replay_config[Fields.SEARCH_SPACE.value]: + max_concurrency = replay_space[Fields.MAX_CONCURRENCY.value] + + if args.max_concurrency is not None: + if args.max_concurrency <= 0: + continue + max_concurrency = min(max_concurrency, args.max_concurrency) + + runners_for_entry = ( + runner_nodes_to_use if runner_nodes_to_use else [runner] + ) + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.BENCHMARK_TYPE.value: benchmark_type, + Fields.EXPORT_FILE.value: export_file, + Fields.RUNTIME_STACK_ID.value: runtime_stack_id, + Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, + Fields.CANONICAL_MODEL_ID.value: canonical_model_id, + Fields.SUPPORT_STATUS.value: support_status, + Fields.REQUEST_MODE.value: request_mode, + Fields.MAX_CONCURRENCY.value: max_concurrency, + Fields.MAX_SESSIONS.value: replay_space.get(Fields.MAX_SESSIONS.value), + Fields.MAX_TURNS_PER_SESSION.value: replay_space.get(Fields.MAX_TURNS_PER_SESSION.value), + Fields.MAX_OUTPUT_LEN.value: replay_space.get(Fields.MAX_OUTPUT_LEN.value), + Fields.NUM_WARMUP_SESSIONS.value: replay_space.get( + Fields.NUM_WARMUP_SESSIONS.value, 0 + ), + Fields.IGNORE_WAITS.value: replay_space.get( + Fields.IGNORE_WAITS.value, False + ), + Fields.IGNORE_EOS.value: replay_space.get( + Fields.IGNORE_EOS.value, False + ), + Fields.MAX_MODEL_LEN.value: max_model_len, + Fields.OFFLOAD_MODE.value: val.get(Fields.OFFLOAD_MODE.value), + Fields.KV_CACHE_DTYPE.value: val.get(Fields.KV_CACHE_DTYPE.value), + Fields.DISABLE_PREFIX_CACHING.value: val.get( + Fields.DISABLE_PREFIX_CACHING.value + ), + 'benchmark-duration-s': replay_space.get('benchmark-duration-s'), + Fields.EXP_NAME.value: f"{model_code}_isb1", + } + validate_isb1_matrix_entry(entry) + matrix_values.append(entry) + + return matrix_values + + 
+def generate_isb1_kv_stress_sweep(args, all_config_data, runner_data): + """Generate ISB1 KV stress sweep configurations with optional filtering.""" + if args.runner_type: + valid_runner_types = set(runner_data.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. " + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" + ) + + matrix_values = [] + + for _, val in all_config_data.items(): + if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: + continue + + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + benchmark_type = val[Fields.BENCHMARK_TYPE.value] + runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] + hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] + canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] + max_model_len = val.get(Fields.MAX_MODEL_LEN.value) + kv_cache_dtype = val[Fields.KV_CACHE_DTYPE.value] + + runner_nodes_to_use = None + if args.runner_node_filter: + runner_nodes = runner_data.get(runner, []) + runner_nodes_to_use = [ + node for node in runner_nodes if args.runner_node_filter in node + ] + if not runner_nodes_to_use: + continue + + kv_stress_configs = val[Fields.KV_STRESS_CONFIGS.value] + for kv_stress_config in kv_stress_configs: + export_file = kv_stress_config[Fields.EXPORT_FILE.value] + request_mode = kv_stress_config[Fields.REQUEST_MODE.value] + support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) + workload_type = kv_stress_config[Fields.WORKLOAD_TYPE.value] + + runners_for_entry = ( + runner_nodes_to_use if runner_nodes_to_use else [runner] + ) + + def _append_kv_stress_entry( + max_concurrency: int, + offload_mode: str, + duration_s: int, + *, + tp: int | None = None, + ep: int | None = None, + ) -> None: + disable_prefix_caching = offload_mode == "noprefix" + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.BENCHMARK_TYPE.value: benchmark_type, + Fields.EXPORT_FILE.value: export_file, + Fields.RUNTIME_STACK_ID.value: runtime_stack_id, + Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, + Fields.CANONICAL_MODEL_ID.value: canonical_model_id, + Fields.SUPPORT_STATUS.value: support_status, + Fields.REQUEST_MODE.value: request_mode, + Fields.MAX_CONCURRENCY.value: max_concurrency, + Fields.OFFLOAD_MODE.value: offload_mode, + Fields.KV_CACHE_DTYPE.value: kv_cache_dtype, + Fields.DISABLE_PREFIX_CACHING.value: disable_prefix_caching, + 'benchmark-duration-s': duration_s, + Fields.WORKLOAD_TYPE.value: workload_type, + Fields.MAX_MODEL_LEN.value: max_model_len, + Fields.EXP_NAME.value: f"{model_code}_isb1_kv_stress", + } + if tp is not None: + entry[Fields.TP.value] = tp + if ep is not None: + entry[Fields.EP.value] = ep + validate_isb1_kv_stress_matrix_entry(entry) + matrix_values.append(entry) + + tp_configs = 
kv_stress_config.get('tp-configs') + if tp_configs: + for tp_config in tp_configs: + tp_value = tp_config[Fields.TP.value] + ep_value = tp_config.get(Fields.EP.value, 1) + users = tp_config[Fields.USERS.value] + offload_modes = tp_config[Fields.OFFLOAD_MODES.value] + duration_s = tp_config[Fields.DURATION_S.value] + + for max_concurrency in users: + for offload_mode in offload_modes: + _append_kv_stress_entry( + max_concurrency, + offload_mode, + duration_s, + tp=tp_value, + ep=ep_value, + ) + else: + for stress_space in kv_stress_config[Fields.SEARCH_SPACE.value]: + users = stress_space[Fields.USERS.value] + offload_modes = stress_space[Fields.OFFLOAD_MODES.value] + duration_s = stress_space[Fields.DURATION_S.value] + + for max_concurrency in users: + for offload_mode in offload_modes: + _append_kv_stress_entry(max_concurrency, offload_mode, duration_s) + + return matrix_values + + def generate_runner_model_sweep_config(args, all_config_data, runner_data): """Generate runner-model sweep configurations. @@ -885,6 +1126,86 @@ def main(): help='Show this help message and exit' ) + # Subcommand: isb1-sweep + isb1_sweep_parser = subparsers.add_parser( + 'isb1-sweep', + parents=[parent_parser], + add_help=False, + help='Generate ISB1 replay sweep configurations' + ) + isb1_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--max-concurrency', + type=int, + required=False, + help='Maximum replay concurrency value to include (caps higher values)' + ) + isb1_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + # Subcommand: isb1-kv-stress-sweep + isb1_kv_stress_sweep_parser = subparsers.add_parser( + 'isb1-kv-stress-sweep', + parents=[parent_parser], + add_help=False, + help='Generate ISB1 KV stress sweep configurations' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + # Subcommand: test-config test_config_keys_parser = subparsers.add_parser( 'test-config', @@ -915,7 +1236,12 @@ def main(): apply_node_type_defaults(args) # Load and validate configuration files (validation happens by default in load functions) - all_config_data = 
load_config_files(args.config_files) + if args.command == 'isb1-sweep': + all_config_data = load_isb1_config_files(args.config_files) + elif args.command == 'isb1-kv-stress-sweep': + all_config_data = load_isb1_kv_stress_config_files(args.config_files) + else: + all_config_data = load_config_files(args.config_files) runner_data = load_runner_file(args.runner_config) # Route to appropriate function based on subcommand @@ -924,13 +1250,17 @@ def main(): elif args.command == 'runner-model-sweep': matrix_values = generate_runner_model_sweep_config( args, all_config_data, runner_data) + elif args.command == 'isb1-sweep': + matrix_values = generate_isb1_sweep(args, all_config_data, runner_data) + elif args.command == 'isb1-kv-stress-sweep': + matrix_values = generate_isb1_kv_stress_sweep(args, all_config_data, runner_data) elif args.command == 'test-config': matrix_values = generate_test_config_sweep(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") # Handle eval options (mutually exclusive: --no-evals or --evals-only) - if not args.no_evals: + if args.command not in ('isb1-sweep', 'isb1-kv-stress-sweep') and not args.no_evals: matrix_values = mark_eval_entries(matrix_values) if args.evals_only: matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index d05299472..cbee3f0a6 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -1,22 +1,73 @@ """Comprehensive tests for generate_sweep_configs.py""" import pytest import argparse +import json +from pathlib import Path from generate_sweep_configs import ( seq_len_stoi, seq_len_itos, seq_len_to_str, generate_full_sweep, + generate_isb1_sweep, + generate_isb1_kv_stress_sweep, generate_runner_model_sweep_config, apply_node_type_defaults, expand_config_keys, mark_eval_entries, ) +from validation import ( + load_config_files, + load_isb1_config_files, + load_isb1_kv_stress_config_files, +) # ============================================================================= # Test Fixtures # ============================================================================= + +def _write_isb1_export_fixture( + root: Path, + relative_path: str, + *, + runtime_stack_id: str, + hardware_profile_id: str, + canonical_model_id: str, + support_status: str, + benchmark_certification_status: str = "dataset_replay_verified", +) -> None: + export_path = root / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text( + json.dumps( + { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": f"{export_path.stem}-trace", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": hardware_profile_id, + "canonical_model_id": canonical_model_id, + "support_status": support_status, + "benchmark_certification_status": benchmark_certification_status, + "session": { + "session_id": "fixture-session", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [{"role": "user", "content": "hi"}], + "expected_output_tokens": 8, + } + ], + }, + } + ], + } + ) + ) + @pytest.fixture def sample_single_node_config(): """Single node config based on dsr1-fp8-mi300x-sglang.""" @@ -149,6 +200,161 @@ def full_sweep_args_multi_node(): return args +@pytest.fixture +def sample_isb1_config(): + """ISB1 replay config based on NVIDIA H200 replay lane.""" + return { + "dsr1-isb1-h200-vllm": { + "image": 
"vllm/vllm-openai:v0.8.5", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_replay", + "runtime-stack-id": "vllm-0.8.5-h200", + "hardware-profile-id": "h200-8gpu", + "canonical-model-id": "deepseek-r1-0528", + "max-model-len": 16384, + "replay-configs": [ + { + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + { + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + }, + {"max-concurrency": 8}, + {"max-concurrency": 16}, + ], + }, + { + "export-file": "datasets/isb1/exports/core/code_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + {"max-concurrency": 4}, + {"max-concurrency": 8}, + ], + }, + ], + } + } + + +@pytest.fixture +def isb1_sweep_args(): + """Args for isb1-sweep command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.max_concurrency = None + args.runner_node_filter = None + return args + + +@pytest.fixture +def sample_isb1_kv_stress_config(): + """ISB1 KV stress config with users/offload-mode search space.""" + return { + "gptoss-fp4-h200-isb1-kv-stress-vllm-code": { + "image": "vllm/vllm-openai:v0.18.0", + "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", + "precision": "fp4", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_kv_stress", + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h200_sxm_141gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 131272, + "kv-cache-dtype": "fp8", + "kv-stress-configs": [ + { + "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", + "request-mode": "multi-turn", + "support-status": "reviewed_preview", + "workload-type": "code", + "search-space": [ + { + "users": [2, 4, 8], + "offload-modes": ["on", "off", "noprefix"], + "duration-s": 1800, + } + ], + } + ], + } + } + + +@pytest.fixture +def sample_isb1_kv_stress_tp_config(): + """ISB1 KV stress config using per-TP expansion.""" + return { + "gptoss-fp4-h200-isb1-kv-stress-vllm-code-tp": { + "image": "vllm/vllm-openai:v0.18.0", + "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", + "precision": "fp4", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_kv_stress", + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h200_sxm_141gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 131272, + "kv-cache-dtype": "fp8", + "kv-stress-configs": [ + { + "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", + "request-mode": "multi-turn", + "support-status": "reviewed_preview", + "workload-type": "code", + "search-space": [ + { + "users": [1], + "offload-modes": ["off"], + "duration-s": 10, + } + ], + "tp-configs": [ + { + "tp": 8, + "ep": 1, + "users": [2, 4, 8], + "offload-modes": ["on", "off", "noprefix"], + "duration-s": 1800, + } + ], + } + ], + } + } + + +@pytest.fixture +def isb1_kv_stress_sweep_args(): + """Args for isb1-kv-stress-sweep command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.runner_node_filter = None + return args + + # 
============================================================================= # Test seq_len mappings # ============================================================================= @@ -181,6 +387,573 @@ def test_unknown_sequence_lengths(self): assert seq_len_to_str(4096, 1024) == "4096_1024" +# ============================================================================= +# Test generate_isb1_sweep +# ============================================================================= + +class TestGenerateISB1Sweep: + """Tests for generate_isb1_sweep.""" + + def test_basic_sweep_generation(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + def test_matrix_entry_structure(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + entry = result[0] + assert entry["benchmark-type"] == "isb1_replay" + assert entry["export-file"].endswith("chat_8k1k.json") + assert entry["runtime-stack-id"] == "vllm-0.8.5-h200" + assert entry["hardware-profile-id"] == "h200-8gpu" + assert entry["canonical-model-id"] == "deepseek-r1-0528" + assert entry["support-status"] == "supported" + assert entry["request-mode"] == "multi-turn" + assert entry["max-concurrency"] == 4 + assert entry["max-sessions"] == 2 + assert entry["max-turns-per-session"] == 6 + assert entry["max-output-len"] == 512 + assert entry["num-warmup-sessions"] == 1 + assert entry["ignore-waits"] is True + assert entry["ignore-eos"] is False + assert entry["max-model-len"] == 16384 + assert entry["exp-name"] == "dsr1_isb1" + assert "run-eval" not in entry + + def test_filter_by_model_prefix(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.model_prefix = ["dsr1"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.model_prefix = ["gptoss"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_precision(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.precision = ["fp8"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.precision = ["fp4"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_framework(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.framework = ["vllm"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.framework = ["sglang"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_runner_type(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_type = ["h200"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.runner_type = ["h100"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def 
test_invalid_runner_type_raises_error(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_type = ["not-a-runner"] + with pytest.raises(ValueError, match="Invalid runner type"): + generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + + def test_max_concurrency_cap(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.max_concurrency = 6 + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + assert sorted(entry["max-concurrency"] for entry in result) == [4, 4, 6, 6, 6] + + def test_non_positive_max_concurrency_skips_all(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.max_concurrency = 0 + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_max_model_len_passthrough_optional(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert all(entry["max-model-len"] == 16384 for entry in result) + + sample_isb1_config["dsr1-isb1-h200-vllm"].pop("max-model-len") + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert all(entry["max-model-len"] is None for entry in result) + + def test_runner_node_filter_expands_runner_nodes(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_node_filter = "cw" + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 10 + assert all(entry["runner"].startswith("h200-cw") for entry in result) + + def test_runner_node_filter_no_match_returns_empty(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_node_filter = "does-not-exist" + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_main_routes_isb1_sweep(self, tmp_path, sample_isb1_config, sample_runner_config, monkeypatch): + import yaml + import sys + from generate_sweep_configs import main + + sample_entry = sample_isb1_config["dsr1-isb1-h200-vllm"] + for replay_config in sample_entry["replay-configs"]: + _write_isb1_export_fixture( + tmp_path, + replay_config["export-file"], + runtime_stack_id=sample_entry["runtime-stack-id"], + hardware_profile_id=sample_entry["hardware-profile-id"], + canonical_model_id=sample_entry["canonical-model-id"], + support_status=replay_config["support-status"], + ) + + config_file = tmp_path / "isb1.yaml" + runner_file = tmp_path / "runners.yaml" + config_file.write_text(yaml.dump(sample_isb1_config)) + runner_file.write_text(yaml.dump(sample_runner_config)) + + monkeypatch.setattr( + sys, + "argv", + [ + "generate_sweep_configs.py", + "isb1-sweep", + "--config-files", + str(config_file), + "--runner-config", + str(runner_file), + ], + ) + + result = main() + assert len(result) == 5 + assert all(entry["benchmark-type"] == "isb1_replay" for entry in result) + + +class TestKVStressSweep: + """Tests for generate_isb1_kv_stress_sweep.""" + + def test_basic_kv_stress_sweep_generation( + self, + sample_isb1_kv_stress_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_config, + sample_runner_config, + ) 
+ # users(3) * offload-modes(3) = 9 flattened rows + assert len(result) == 9 + + def test_flatten_users_x_offload_modes( + self, + sample_isb1_kv_stress_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_config, + sample_runner_config, + ) + + assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in result) + assert all(isinstance(entry["max-concurrency"], int) for entry in result) + assert all(isinstance(entry["offload-mode"], str) for entry in result) + assert all(entry["benchmark-duration-s"] == 1800 for entry in result) + assert all(entry["kv-cache-dtype"] == "fp8" for entry in result) + assert all(entry["workload-type"] == "code" for entry in result) + + pairs = {(entry["max-concurrency"], entry["offload-mode"]) for entry in result} + assert pairs == { + (2, "on"), + (2, "off"), + (2, "noprefix"), + (4, "on"), + (4, "off"), + (4, "noprefix"), + (8, "on"), + (8, "off"), + (8, "noprefix"), + } + + def test_tp_config_expansion_produces_expected_rows( + self, + sample_isb1_kv_stress_tp_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_tp_config, + sample_runner_config, + ) + + # users(3) * offload-modes(3) = 9 rows from tp-configs expansion + assert len(result) == 9 + assert {entry["tp"] for entry in result} == {8} + assert {entry["ep"] for entry in result} == {1} + + def test_repo_kv_stress_config_loads_and_expands(self, isb1_kv_stress_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_kv_stress_config_files( + [str(repo_root / ".github/configs/isb1-kv-stress.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + config_data, + runner_data, + ) + + # 4 configs (gptoss/qwen * b200/h200) * 8 users * 3 offload modes + assert len(matrix) == 96 + assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in matrix) + assert all("tp" not in entry for entry in matrix) + assert all("ep" not in entry for entry in matrix) + + +class TestISB1SweepIsolation: + """Tests for ISB1 sweep isolation from throughput config lane.""" + + def test_repo_isb1_master_includes_runtime_expansion_cells(self, isb1_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_config_files( + [str(repo_root / ".github/configs/isb1-master.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h100": ["h100-cw_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) + config_keys = set(config_data) + matrix_key_triples = { + (entry["model-prefix"], entry["framework"], entry["runner"]) + for entry in matrix + } + + assert "dsr1-fp8-b200-isb1-vllm" in config_keys + assert "dsr1-fp8-h200-isb1-vllm" in config_keys + assert "gptoss-fp4-b200-isb1-sglang" in config_keys + assert "gptoss-fp4-h100-isb1-sglang" in config_keys + assert "gptoss-fp4-h200-isb1-sglang" in config_keys + assert "gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat" in config_keys + assert "gptoss-fp4-h100-isb1-vllm-offload-core-preview-code" in config_keys + assert "gptoss-fp4-h100-isb1-sglang-500k-preview-code" in config_keys + assert "gptoss-fp4-h100-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-sglang-500k-preview-code" in config_keys + assert 
"qwen3.5-fp8-h100-isb1-sglang-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h200-isb1-sglang-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-h100-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-h200-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm-extension" in config_keys + + assert ("dsr1", "vllm", "b200") in matrix_key_triples + assert ("dsr1", "vllm", "h200") in matrix_key_triples + assert ("gptoss", "sglang", "b200") in matrix_key_triples + assert ("gptoss", "sglang", "h100") in matrix_key_triples + assert ("gptoss", "sglang", "h200") in matrix_key_triples + assert ("qwen3.5", "sglang", "b200") in matrix_key_triples + assert ("qwen3.5", "sglang", "h100") in matrix_key_triples + assert ("qwen3.5", "sglang", "h200") in matrix_key_triples + assert ("qwen3.5", "vllm", "b200") in matrix_key_triples + assert ("qwen3.5", "vllm", "h100") in matrix_key_triples + assert ("qwen3.5", "vllm", "h200") in matrix_key_triples + + assert "dsr1-fp8-h100-isb1-sglang" not in config_keys + assert "dsr1-fp8-h100-isb1-vllm" not in config_keys + + assert any( + entry["export-file"].endswith("extension_32k/vllm/chat_32k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("core/vllm/code_8k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert not any( + entry["export-file"].endswith("core/vllm/code_8k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_32k/vllm/code_32k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_64k/vllm/code_64k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_64k/sglang/chat_64k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + "preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" + in entry["export-file"] + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/sglang/chat_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/sglang/code_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/vllm/chat_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/vllm/code_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + qwen_sglang_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "extension_131k/sglang/code_131k1k_qwen3.5.json" + ) + ] + assert len(qwen_sglang_entries) == 6 + assert all(entry["model-prefix"] == "qwen3.5" 
for entry in qwen_sglang_entries) + assert all(entry["framework"] == "sglang" for entry in qwen_sglang_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_entries) + assert {entry["max-concurrency"] for entry in qwen_sglang_entries} == {2, 4} + + qwen_vllm_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "extension_131k/vllm/code_131k1k_qwen3.5.json" + ) + ] + assert len(qwen_vllm_entries) == 6 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_entries) + assert all(entry["framework"] == "vllm" for entry in qwen_vllm_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_entries) + assert {entry["max-concurrency"] for entry in qwen_vllm_entries} == {2, 4} + + sglang_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json" + ) + ] + assert len(sglang_500k_entries) == 3 + assert all(entry["support-status"] == "reviewed_preview" for entry in sglang_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in sglang_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in sglang_500k_entries) + + vllm_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" + ) + ] + assert len(vllm_500k_entries) == 3 + assert all(entry["support-status"] == "reviewed_preview" for entry in vllm_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in vllm_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in vllm_500k_entries) + + qwen_sglang_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json" + ) + ] + assert len(qwen_sglang_500k_entries) == 3 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_sglang_500k_entries) + assert all(entry["framework"] == "sglang" for entry in qwen_sglang_500k_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in qwen_sglang_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in qwen_sglang_500k_entries) + + qwen_vllm_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" + ) + ] + assert len(qwen_vllm_500k_entries) == 3 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_500k_entries) + assert all(entry["framework"] == "vllm" for entry in qwen_vllm_500k_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in qwen_vllm_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in qwen_vllm_500k_entries) + + assert not any( + entry["export-file"].endswith( + "preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json" + ) + or entry["export-file"].endswith( + "preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json" + ) + for entry in matrix + ) + + def test_repo_qwen_1m_preview_config_is_manual_and_separate(self, 
isb1_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_config_files( + [str(repo_root / ".github/configs/isb1-qwen-1m-preview.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h100": ["h100-cw_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) + config_keys = set(config_data) + + assert config_keys == { + "qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code", + "qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code", + } + assert len(matrix) == 2 + assert {entry["runner"] for entry in matrix} == {"b200"} + assert {entry["framework"] for entry in matrix} == {"sglang", "vllm"} + assert {entry["model-prefix"] for entry in matrix} == {"qwen3.5"} + assert {entry["support-status"] for entry in matrix} == {"reviewed_preview"} + assert {entry["max-model-len"] for entry in matrix} == {1048576} + assert {entry["max-concurrency"] for entry in matrix} == {1} + assert {entry["max-sessions"] for entry in matrix} == {1} + assert {entry["max-turns-per-session"] for entry in matrix} == {3} + assert { + entry["canonical-model-id"] for entry in matrix + } == {"qwen3_5_397b_a17b"} + assert { + entry["export-file"] for entry in matrix + } == { + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json", + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", + } + assert all((repo_root / entry["export-file"]).exists() for entry in matrix) + + + def test_isb1_config_does_not_validate_as_throughput(self, tmp_path, sample_isb1_config): + import yaml + + config_file = tmp_path / "isb1.yaml" + config_file.write_text(yaml.dump(sample_isb1_config)) + + with pytest.raises(ValueError): + load_config_files([str(config_file)]) + + def test_throughput_config_does_not_validate_as_isb1(self, tmp_path, sample_single_node_config): + import yaml + + config_file = tmp_path / "throughput.yaml" + config_file.write_text(yaml.dump(sample_single_node_config)) + + with pytest.raises(ValueError): + load_isb1_config_files([str(config_file)]) + + # ============================================================================= # Test generate_full_sweep for single-node # ============================================================================= diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 0f1f44c27..06267da22 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -1,20 +1,31 @@ """Comprehensive tests for validation.py""" +import json +from pathlib import Path + import pytest +import yaml from validation import ( Fields, SingleNodeMatrixEntry, MultiNodeMatrixEntry, + ISB1ReplayMatrixEntry, WorkerConfig, SingleNodeSearchSpaceEntry, MultiNodeSearchSpaceEntry, + ISB1ReplaySearchSpaceEntry, + ISB1ReplayConfigEntry, SingleNodeSeqLenConfig, MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, + ISB1MasterConfigEntry, validate_matrix_entry, + validate_isb1_matrix_entry, validate_master_config, + validate_isb1_master_config, validate_runner_config, load_config_files, + load_isb1_config_files, load_runner_file, ) @@ -23,6 +34,68 @@ # Test Fixtures # ============================================================================= + +def _write_isb1_export_fixture( + root: Path, + relative_path: str, + *, + runtime_stack_id: str, + hardware_profile_id: str, + canonical_model_id: str, + 
support_status: str, + benchmark_certification_status: str = "dataset_replay_verified", +) -> None: + export_path = root / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text( + json.dumps( + { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": f"{export_path.stem}-trace", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": hardware_profile_id, + "canonical_model_id": canonical_model_id, + "support_status": support_status, + "benchmark_certification_status": benchmark_certification_status, + "session": { + "session_id": "fixture-session", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [{"role": "user", "content": "hello"}], + "expected_output_tokens": 8, + } + ], + }, + } + ], + } + ) + ) + + +def _write_manifest_fixture( + root: Path, + relative_path: str, + *, + export_file: str, + max_model_len: int, +) -> None: + manifest_path = root / relative_path + manifest_path.parent.mkdir(parents=True, exist_ok=True) + manifest_path.write_text( + json.dumps( + { + "manifest_version": "0.1.0", + "max_model_len": max_model_len, + "exports": [{"export_file": export_file}], + } + ) + ) + @pytest.fixture def valid_single_node_matrix_entry(): """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" @@ -159,6 +232,74 @@ def valid_multinode_master_config(): } +@pytest.fixture +def valid_isb1_master_config(): + """Valid ISB1 replay master config for NVIDIA PR1a.""" + return { + "image": "vllm/vllm-openai:v0.8.5", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_replay", + "runtime-stack-id": "vllm-0.8.5-h200", + "hardware-profile-id": "h200-8gpu", + "canonical-model-id": "deepseek-r1-0528", + "max-model-len": 16384, + "replay-configs": [ + { + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + { + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + }, + { + "max-concurrency": 8, + }, + ], + } + ], + } + + +@pytest.fixture +def valid_isb1_matrix_entry(valid_isb1_master_config): + """Valid ISB1 replay matrix entry.""" + return { + "image": valid_isb1_master_config["image"], + "model": valid_isb1_master_config["model"], + "model-prefix": valid_isb1_master_config["model-prefix"], + "precision": valid_isb1_master_config["precision"], + "framework": valid_isb1_master_config["framework"], + "runner": valid_isb1_master_config["runner"], + "benchmark-type": valid_isb1_master_config["benchmark-type"], + "export-file": valid_isb1_master_config["replay-configs"][0]["export-file"], + "runtime-stack-id": valid_isb1_master_config["runtime-stack-id"], + "hardware-profile-id": valid_isb1_master_config["hardware-profile-id"], + "canonical-model-id": valid_isb1_master_config["canonical-model-id"], + "support-status": valid_isb1_master_config["replay-configs"][0]["support-status"], + "request-mode": valid_isb1_master_config["replay-configs"][0]["request-mode"], + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + "max-model-len": valid_isb1_master_config["max-model-len"], + "exp-name": "dsr1_isb1", + } + + @pytest.fixture def valid_runner_config(): """Valid runner 
config based on .github/configs/runners.yaml.""" @@ -193,6 +334,10 @@ def test_key_fields_exist(self): assert Fields.SPEC_DECODING.value == "spec-decoding" assert Fields.PREFILL.value == "prefill" assert Fields.DECODE.value == "decode" + assert Fields.BENCHMARK_TYPE.value == "benchmark-type" + assert Fields.SUPPORT_STATUS.value == "support-status" + assert Fields.MAX_CONCURRENCY.value == "max-concurrency" + assert Fields.REPLAY_CONFIGS.value == "replay-configs" # ============================================================================= @@ -658,6 +803,153 @@ def test_disagg_default_false(self, valid_single_node_master_config): assert config.disagg is False +# ============================================================================= +# Test ISB1 replay models +# ============================================================================= + +class TestISB1ReplaySearchSpaceEntry: + """Tests for ISB1ReplaySearchSpaceEntry model.""" + + def test_valid_with_required_only(self): + config = ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 4, + }) + assert config.max_concurrency == 4 + assert config.num_warmup_sessions == 0 + assert config.ignore_waits is False + assert config.ignore_eos is False + + def test_valid_with_all_fields(self): + config = ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 8, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": True, + }) + assert config.max_sessions == 2 + assert config.max_turns_per_session == 6 + assert config.max_output_len == 512 + assert config.num_warmup_sessions == 1 + assert config.ignore_waits is True + assert config.ignore_eos is True + + def test_missing_required_field(self): + with pytest.raises(Exception): + ISB1ReplaySearchSpaceEntry(**{ + "max-sessions": 2, + }) + + def test_extra_field_forbidden(self): + with pytest.raises(Exception): + ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 4, + "unknown-field": "value", + }) + + +class TestISB1ReplayConfigEntry: + """Tests for ISB1ReplayConfigEntry model.""" + + def test_valid_entry(self): + config = ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [{"max-concurrency": 4}], + }) + assert config.export_file.endswith("chat_8k1k.json") + assert config.request_mode == "multi-turn" + assert config.support_status == "supported" + assert len(config.search_space) == 1 + + def test_invalid_support_status(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "definitely_supported", + "search-space": [{"max-concurrency": 4}], + }) + + def test_missing_export_file(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "request-mode": "multi-turn", + "search-space": [{"max-concurrency": 4}], + }) + + def test_missing_request_mode(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "search-space": [{"max-concurrency": 4}], + }) + + def test_empty_search_space(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "search-space": [], + }) + + +class TestISB1MasterConfigEntry: + """Tests for ISB1MasterConfigEntry model.""" + + def 
test_valid_isb1_master_config(self, valid_isb1_master_config): + config = ISB1MasterConfigEntry(**valid_isb1_master_config) + assert config.benchmark_type == "isb1_replay" + assert config.model_prefix == "dsr1" + assert config.runner == "h200" + assert config.max_model_len == 16384 + assert len(config.replay_configs) == 1 + + def test_max_model_len_optional(self, valid_isb1_master_config): + del valid_isb1_master_config["max-model-len"] + config = ISB1MasterConfigEntry(**valid_isb1_master_config) + assert config.max_model_len is None + + def test_benchmark_type_must_match(self, valid_isb1_master_config): + valid_isb1_master_config["benchmark-type"] = "throughput" + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + def test_throughput_only_field_rejected(self, valid_isb1_master_config): + valid_isb1_master_config["multinode"] = False + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + def test_missing_required_field(self, valid_isb1_master_config): + del valid_isb1_master_config["runtime-stack-id"] + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + +class TestISB1ReplayMatrixEntry: + """Tests for ISB1ReplayMatrixEntry model.""" + + def test_valid_entry(self, valid_isb1_matrix_entry): + entry = ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + assert entry.benchmark_type == "isb1_replay" + assert entry.support_status == "supported" + assert entry.max_concurrency == 4 + assert entry.exp_name == "dsr1_isb1" + + def test_missing_required_field(self, valid_isb1_matrix_entry): + del valid_isb1_matrix_entry["export-file"] + with pytest.raises(Exception): + ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + + def test_extra_throughput_field_forbidden(self, valid_isb1_matrix_entry): + valid_isb1_matrix_entry["tp"] = 8 + with pytest.raises(Exception): + ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + + # ============================================================================= # Test validate_master_config function # ============================================================================= @@ -696,6 +988,37 @@ def test_invalid_config_raises_valueerror(self, valid_single_node_master_config) assert "failed validation" in str(exc_info.value) +class TestValidateISB1MasterConfig: + """Tests for validate_isb1_master_config function.""" + + def test_valid_isb1_config(self, valid_isb1_master_config): + configs = {"dsr1-isb1-h200-vllm": valid_isb1_master_config} + result = validate_isb1_master_config(configs) + assert result == configs + + def test_invalid_isb1_config_raises_valueerror(self, valid_isb1_master_config): + del valid_isb1_master_config["model"] + configs = {"broken-isb1-config": valid_isb1_master_config} + with pytest.raises(ValueError) as exc_info: + validate_isb1_master_config(configs) + assert "broken-isb1-config" in str(exc_info.value) + assert "failed validation" in str(exc_info.value) + + +class TestValidateISB1MatrixEntry: + """Tests for validate_isb1_matrix_entry function.""" + + def test_valid_entry(self, valid_isb1_matrix_entry): + result = validate_isb1_matrix_entry(valid_isb1_matrix_entry) + assert result == valid_isb1_matrix_entry + + def test_invalid_entry_raises_valueerror(self, valid_isb1_matrix_entry): + del valid_isb1_matrix_entry["benchmark-type"] + with pytest.raises(ValueError) as exc_info: + validate_isb1_matrix_entry(valid_isb1_matrix_entry) + assert "failed validation" in str(exc_info.value) + + # 
============================================================================= # Test validate_runner_config function # ============================================================================= @@ -823,6 +1146,224 @@ def test_validation_runs_by_default(self, tmp_path): assert "failed validation" in str(exc_info.value) +class TestLoadISB1ConfigFiles: + """Tests for load_isb1_config_files function.""" + + def test_load_single_file_with_validation(self, tmp_path, valid_isb1_master_config): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], + ) + + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + result = load_isb1_config_files([str(config_file)]) + assert "dsr1-isb1-h200-vllm" in result + assert result["dsr1-isb1-h200-vllm"]["benchmark-type"] == "isb1_replay" + + def test_export_contract_rejects_mismatched_support_status( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "support-status" in str(exc_info.value) + assert "Available support tiers" in str(exc_info.value) + + def test_export_contract_requires_dataset_replay_verified_certification( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], + benchmark_certification_status="pending_review", + ) + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "benchmark_certification_status" in str(exc_info.value) + assert "dataset_replay_verified" in str(exc_info.value) + + def test_export_contract_requires_max_model_len_for_preview_style_export( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/offload_core/" + "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" + ), + "support-status": "reviewed_preview", + } + ], + } + del preview_config["max-model-len"] + + _write_isb1_export_fixture( + tmp_path, + 
preview_config["replay-configs"][0]["export-file"], + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "max-model-len" in str(exc_info.value) + + def test_export_contract_accepts_preview_style_export_with_explicit_max_model_len( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h100_sxm_80gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 524288, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" + ), + "support-status": "reviewed_preview", + } + ], + } + + _write_isb1_export_fixture( + tmp_path, + preview_config["replay-configs"][0]["export-file"], + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + result = load_isb1_config_files([str(config_file)]) + assert "preview-row" in result + + def test_export_contract_warns_when_manifest_max_model_len_mismatches_config( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h100_sxm_80gb", + "canonical-model-id": "qwen3_5_397b_a17b", + "max-model-len": 524288, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" + ), + "support-status": "reviewed_preview", + } + ], + } + + export_file = preview_config["replay-configs"][0]["export-file"] + _write_isb1_export_fixture( + tmp_path, + export_file, + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + _write_manifest_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json", + export_file=export_file, + max_model_len=1048576, + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + with pytest.warns(UserWarning, match="max-model-len"): + result = load_isb1_config_files([str(config_file)]) + assert "preview-row" in result + + def test_load_single_file_without_validation(self, tmp_path): + config_file = tmp_path / "isb1-config.yaml" + config_file.write_text(""" +test-isb1: + image: test-image + benchmark-type: isb1_replay +""") + result = load_isb1_config_files([str(config_file)], validate=False) + assert "test-isb1" in result + assert result["test-isb1"]["benchmark-type"] == "isb1_replay" + + def test_validation_runs_by_default(self, tmp_path): + config_file = tmp_path / "isb1-config.yaml" + config_file.write_text(""" +invalid-isb1: + image: 
test-image + benchmark-type: isb1_replay +""") + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "failed validation" in str(exc_info.value) + + def test_duplicate_keys_raise_error(self, tmp_path): + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +duplicate-key: + benchmark-type: isb1_replay +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +duplicate-key: + benchmark-type: isb1_replay +""") + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config1), str(config2)], validate=False) + assert "Duplicate configuration keys" in str(exc_info.value) + + def test_nonexistent_file_raises_error(self): + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files(["nonexistent-isb1.yaml"]) + assert "does not exist" in str(exc_info.value) + + # ============================================================================= # Test load_runner_file # ============================================================================= diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 312952b96..331e374b4 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -2,8 +2,12 @@ from typing import List, Optional, Union, Literal from enum import Enum +import json import pprint +import re +import warnings import yaml +from pathlib import Path """ The below class defines the field names expected to be present in the JSON entries @@ -55,6 +59,31 @@ class Fields(Enum): RUN_EVAL = 'run-eval' EVAL_ONLY = 'eval-only' + # ISB1 replay fields + BENCHMARK_TYPE = 'benchmark-type' + EXPORT_FILE = 'export-file' + RUNTIME_STACK_ID = 'runtime-stack-id' + HARDWARE_PROFILE_ID = 'hardware-profile-id' + CANONICAL_MODEL_ID = 'canonical-model-id' + REQUEST_MODE = 'request-mode' + MAX_CONCURRENCY = 'max-concurrency' + SUPPORT_STATUS = 'support-status' + MAX_SESSIONS = 'max-sessions' + MAX_TURNS_PER_SESSION = 'max-turns-per-session' + MAX_OUTPUT_LEN = 'max-output-len' + NUM_WARMUP_SESSIONS = 'num-warmup-sessions' + IGNORE_WAITS = 'ignore-waits' + IGNORE_EOS = 'ignore-eos' + REPLAY_CONFIGS = 'replay-configs' + KV_STRESS_CONFIGS = 'kv-stress-configs' + OFFLOAD_MODE = 'offload-mode' + OFFLOAD_MODES = 'offload-modes' + KV_CACHE_DTYPE = 'kv-cache-dtype' + DISABLE_PREFIX_CACHING = 'disable-prefix-caching' + USERS = 'users' + DURATION_S = 'duration-s' + WORKLOAD_TYPE = 'workload-type' + """ Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., @@ -147,6 +176,119 @@ def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: return entry +class ISB1ReplayMatrixEntry(BaseModel): + """Pydantic model for validating ISB1 replay matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_replay"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + request_mode: str = 
Field(alias=Fields.REQUEST_MODE.value) + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + max_sessions: Optional[int] = Field( + default=None, alias=Fields.MAX_SESSIONS.value, gt=0 + ) + max_turns_per_session: Optional[int] = Field( + default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 + ) + max_output_len: Optional[int] = Field( + default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 + ) + num_warmup_sessions: int = Field( + default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 + ) + ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) + ignore_eos: bool = Field(default=False, alias=Fields.IGNORE_EOS.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( + default=None, alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( + default=None, alias=Fields.KV_CACHE_DTYPE.value + ) + disable_prefix_caching: Optional[bool] = Field( + default=None, alias=Fields.DISABLE_PREFIX_CACHING.value + ) + benchmark_duration_s: Optional[int] = Field( + default=None, alias='benchmark-duration-s', gt=0 + ) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +def validate_isb1_matrix_entry(entry: dict) -> dict: + """Validate that ISB1 replay matrix entries match the expected structure.""" + try: + ISB1ReplayMatrixEntry(**entry) + except ValidationError as e: + raise ValueError( + f"The following ISB1 matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" + ) + return entry + + +class ISB1KVStressMatrixEntry(BaseModel): + """Pydantic model for validating ISB1 KV stress matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_kv_stress"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + offload_mode: Literal["on", "off", "noprefix", "legacy"] = Field( + alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) + disable_prefix_caching: bool = Field(alias=Fields.DISABLE_PREFIX_CACHING.value) + benchmark_duration_s: int = Field(alias='benchmark-duration-s', gt=0) + workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) + tp: Optional[int] = Field(default=None, alias=Fields.TP.value, gt=0) + ep: Optional[int] = Field(default=None, alias=Fields.EP.value, gt=0) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +def validate_isb1_kv_stress_matrix_entry(entry: dict) -> dict: + """Validate that ISB1 KV stress matrix entries match the expected structure.""" + try: + ISB1KVStressMatrixEntry(**entry) + except 
ValidationError as e: + raise ValueError( + f"The following ISB1 KV stress matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" + ) + return entry + + """ Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e., the master configuration files found in .github/configs. The validation enforces a strict set of @@ -237,6 +379,89 @@ def validate_conc_fields(self): return _validate_conc_fields(self) +class ISB1ReplaySearchSpaceEntry(BaseModel): + """ISB1 replay search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + max_sessions: Optional[int] = Field( + default=None, alias=Fields.MAX_SESSIONS.value, gt=0 + ) + max_turns_per_session: Optional[int] = Field( + default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 + ) + max_output_len: Optional[int] = Field( + default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 + ) + num_warmup_sessions: int = Field( + default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 + ) + ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) + ignore_eos: bool = Field(default=False, alias=Fields.IGNORE_EOS.value) + benchmark_duration_s: Optional[int] = Field( + default=None, alias='benchmark-duration-s', gt=0 + ) + + +class ISB1ReplayConfigEntry(BaseModel): + """Per-export replay configuration for ISB1.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + search_space: List[ISB1ReplaySearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value, min_length=1 + ) + + +class ISB1KVStressSearchSpaceEntry(BaseModel): + """ISB1 KV stress search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + users: List[int] = Field(alias=Fields.USERS.value, min_length=1) + offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( + alias=Fields.OFFLOAD_MODES.value, + min_length=1, + ) + duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) + + +class ISB1KVStressTPConfig(BaseModel): + """Per-TP KV stress configuration for ISB1 parity sweeps.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + tp: int = Field(gt=0) + ep: int = Field(default=1, gt=0) + users: List[int] = Field(alias=Fields.USERS.value, min_length=1) + offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( + alias=Fields.OFFLOAD_MODES.value, + min_length=1, + ) + duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) + + +class ISB1KVStressConfigEntry(BaseModel): + """Per-export KV stress configuration for ISB1.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) + search_space: List[ISB1KVStressSearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value, min_length=1 + ) + tp_configs: Optional[List[ISB1KVStressTPConfig]] = 
Field( + default=None, + alias='tp-configs', + ) + + +class SingleNodeSeqLenConfig(BaseModel): """Single node sequence length configuration.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) @@ -289,6 +514,335 @@ class MultiNodeMasterConfigEntry(BaseModel): alias=Fields.SEQ_LEN_CONFIGS.value) + +class ISB1MasterConfigEntry(BaseModel): + """Top-level ISB1 replay master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_replay"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( + default=None, alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( + default=None, alias=Fields.KV_CACHE_DTYPE.value + ) + disable_prefix_caching: Optional[bool] = Field( + default=None, alias=Fields.DISABLE_PREFIX_CACHING.value + ) + replay_configs: List[ISB1ReplayConfigEntry] = Field( + alias=Fields.REPLAY_CONFIGS.value, min_length=1 + ) + + +class ISB1KVStressMasterConfigEntry(BaseModel): + """Top-level ISB1 KV stress master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_kv_stress"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) + kv_stress_configs: List[ISB1KVStressConfigEntry] = Field( + alias=Fields.KV_STRESS_CONFIGS.value, + min_length=1, + ) + + +ISB1_SHAPE_STEM_RE = re.compile(r"(?P<isl>\d+)k(?P<osl>\d+)k") +ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] + + +def _candidate_config_roots(config_file: str) -> list[Path]: + """Return candidate repo roots for resolving relative export-file paths.""" + config_path = Path(config_file).resolve() + parent_candidates = [config_path.parents[i] for i in range(min(3, len(config_path.parents)))] + candidates = [ + config_path.parent, + *parent_candidates, + Path.cwd().resolve(), + ] + + unique_candidates: list[Path] = [] + for candidate in candidates: + if candidate not in unique_candidates: + unique_candidates.append(candidate) + return unique_candidates + + +def _resolve_export_path(config_file: str, export_file: str) -> Path: + """Resolve an export file relative to the config file or current repo root.""" + export_path = Path(export_file) + if export_path.is_absolute(): + return export_path + + candidate_roots = _candidate_config_roots(config_file) + for candidate_root in candidate_roots: + candidate = candidate_root / export_path + if candidate.exists(): + return candidate + + return candidate_roots[0] / export_path
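+# Resolution example (paths illustrative): for a config at
+# ".github/configs/isb1.yaml" the candidate roots are ".github/configs",
+# ".github", the repository root, and the current working directory, so a
+# relative "datasets/isb1/exports/..." path normally resolves from the repo
+# root. When no candidate exists on disk the first root is returned, letting
+# _load_export_payload raise its "does not exist" error with a concrete path.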
+ + +def _load_export_payload(export_path: Path) -> dict: + """Load an ISB1 export payload from disk.""" + try: + with export_path.open("r") as handle: + payload = json.load(handle) + except FileNotFoundError as exc: + raise ValueError(f"Referenced ISB1 export file does not exist: '{export_path}'.") from exc + except json.JSONDecodeError as exc: + raise ValueError(f"Referenced ISB1 export file is not valid JSON: '{export_path}'.") from exc + + exports = payload.get("exports") + if not isinstance(exports, list) or not exports: + raise ValueError( + f"Referenced ISB1 export file must contain a non-empty 'exports' list: '{export_path}'." + ) + return payload + + +def _identity_cells(payload: dict, entry: dict) -> list[dict]: + """Return export cells matching the configured runtime/hardware/model identity.""" + return [ + cell + for cell in payload["exports"] + if cell.get("runtime_stack_id") == entry[Fields.RUNTIME_STACK_ID.value] + and cell.get("hardware_profile_id") == entry[Fields.HARDWARE_PROFILE_ID.value] + and cell.get("canonical_model_id") == entry[Fields.CANONICAL_MODEL_ID.value] + ] + + +def _warn_manifest_max_model_len_mismatch( + *, + export_path: Path, + export_file: str, + max_model_len: Optional[int], + key: str, +) -> None: + """Emit advisory warning if sibling manifest max_model_len disagrees with config.""" + if max_model_len is None: + return + + for manifest_path in sorted(export_path.parent.glob("manifest*.json")): + try: + manifest_payload = json.loads(manifest_path.read_text()) + except (OSError, json.JSONDecodeError): + continue + + manifest_exports = manifest_payload.get("exports") + if isinstance(manifest_exports, list): + export_files = { + item.get("export_file") + for item in manifest_exports + if isinstance(item, dict) and isinstance(item.get("export_file"), str) + } + if export_files and export_file not in export_files: + continue + + manifest_max_model_len = manifest_payload.get("max_model_len") + if manifest_max_model_len is None: + continue + + try: + manifest_max_model_len = int(manifest_max_model_len) + except (TypeError, ValueError): + continue + + if manifest_max_model_len != max_model_len: + warnings.warn( + f"ISB1 master config entry '{key}' sets '{Fields.MAX_MODEL_LEN.value}'=" + f"{max_model_len} for export '{export_file}', but sibling manifest " + f"'{manifest_path}' declares max_model_len={manifest_max_model_len}.", + stacklevel=2, + ) + + +def certify_isb1_replay_contract(master_configs: dict, config_file: str) -> dict: + """Validate that every replay-config resolves to a real, runnable export selection.""" + for key, entry in master_configs.items(): + max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) + + for replay_config in entry[Fields.REPLAY_CONFIGS.value]: + export_file = replay_config[Fields.EXPORT_FILE.value] + support_status = replay_config.get(Fields.SUPPORT_STATUS.value) + export_path = _resolve_export_path(config_file, export_file) + payload = _load_export_payload(export_path) + _warn_manifest_max_model_len_mismatch( + export_path=export_path, + export_file=export_file, + max_model_len=max_model_len, + key=key, + ) + + if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: + raise ValueError( + f"ISB1 master config entry '{key}' references mixed-shape export " + f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'."
+ ) + + identity_cells = _identity_cells(payload, entry) + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise ValueError( + f"ISB1 master config entry '{key}' must pin " + f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " + f"Matching cells span multiple tiers: {identity_statuses}." + ) + + if not matching_cells: + available_statuses = identity_statuses or [""] + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + f"with support-status '{support_status}', but no export cell matches " + f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " + f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " + f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " + f"Available support tiers for that identity: {available_statuses}." + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + "but the selected export cells do not declare " + "'benchmark_certification_status'." + ) + if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + "with runnable support tier selection, but the selected export cells " + f"have benchmark_certification_status values {certification_statuses}. " + "Current InferenceX consumer lanes only accept " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." + ) + + return master_configs + + +def certify_isb1_kv_stress_contract(master_configs: dict, config_file: str) -> dict: + """Validate that every kv-stress-config resolves to a real, runnable export selection.""" + for key, entry in master_configs.items(): + max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) + + for kv_stress_config in entry[Fields.KV_STRESS_CONFIGS.value]: + export_file = kv_stress_config[Fields.EXPORT_FILE.value] + support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) + export_path = _resolve_export_path(config_file, export_file) + payload = _load_export_payload(export_path) + _warn_manifest_max_model_len_mismatch( + export_path=export_path, + export_file=export_file, + max_model_len=max_model_len, + key=key, + ) + + if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: + raise ValueError( + f"ISB1 KV stress config entry '{key}' references mixed-shape export " + f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'." + ) + + identity_cells = _identity_cells(payload, entry) + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise ValueError( + f"ISB1 KV stress config entry '{key}' must pin " + f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " + f"Matching cells span multiple tiers: {identity_statuses}." 
+ ) + + if not matching_cells: + available_statuses = identity_statuses or [""] + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + f"with support-status '{support_status}', but no export cell matches " + f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " + f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " + f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " + f"Available support tiers for that identity: {available_statuses}." + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + "but the selected export cells do not declare " + "'benchmark_certification_status'." + ) + if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + "with runnable support tier selection, but the selected export cells " + f"have benchmark_certification_status values {certification_statuses}. " + "Current InferenceX consumer lanes only accept " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." + ) + + return master_configs + + def validate_master_config(master_configs: dict) -> List[dict]: """Validate input master configuration structure.""" for key, entry in master_configs.items(): @@ -304,6 +858,30 @@ def validate_master_config(master_configs: dict) -> List[dict]: f"Master config entry '{key}' failed validation:\n{e}") return master_configs + +def validate_isb1_master_config(master_configs: dict) -> List[dict]: + """Validate ISB1 replay master configuration structure.""" + for key, entry in master_configs.items(): + try: + ISB1MasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"ISB1 master config entry '{key}' failed validation:\n{e}" + ) + return master_configs + + +def validate_isb1_kv_stress_master_config(master_configs: dict) -> List[dict]: + """Validate ISB1 KV stress master configuration structure.""" + for key, entry in master_configs.items(): + try: + ISB1KVStressMasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"ISB1 KV stress master config entry '{key}' failed validation:\n{e}" + ) + return master_configs + # Runner Config Validation @@ -371,26 +949,17 @@ class ChangelogMatrixEntry(BaseModel): # ============================================================================= -def load_config_files(config_files: List[str], validate: bool = True) -> dict: - """Load and merge configuration files. - - Args: - config_files: List of paths to YAML configuration files. - validate: If True, run validate_master_config on loaded data. Defaults to True. - - Returns: - Merged configuration dictionary. - - Raises: - ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. 
- """ +def _load_and_merge_yaml_files(config_files: List[str]) -> dict: + """Load and merge YAML configuration files.""" all_config_data = {} for config_file in config_files: try: with open(config_file, 'r') as f: config_data = yaml.safe_load(f) - assert isinstance( - config_data, dict), f"Config file '{config_file}' must contain a dictionary" + if not isinstance(config_data, dict): + raise ValueError( + f"Config file '{config_file}' must contain a dictionary" + ) # Don't allow '*' wildcard in master config keys as we need to reserve these # for expansion in process_changelog.py @@ -411,12 +980,60 @@ def load_config_files(config_files: List[str], validate: bool = True) -> dict: except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") + return all_config_data + + +def load_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge throughput configuration files. + + Args: + config_files: List of paths to YAML configuration files. + validate: If True, run validate_master_config on loaded data. Defaults to True. + + Returns: + Merged configuration dictionary. + + Raises: + ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. + """ + all_config_data = _load_and_merge_yaml_files(config_files) + if validate: validate_master_config(all_config_data) return all_config_data +def load_isb1_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge ISB1 replay configuration files.""" + all_config_data = _load_and_merge_yaml_files(config_files) + + if validate: + validate_isb1_master_config(all_config_data) + for config_file in config_files: + certify_isb1_replay_contract( + _load_and_merge_yaml_files([config_file]), + config_file=config_file, + ) + + return all_config_data + + +def load_isb1_kv_stress_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge ISB1 KV stress configuration files.""" + all_config_data = _load_and_merge_yaml_files(config_files) + + if validate: + validate_isb1_kv_stress_master_config(all_config_data) + for config_file in config_files: + certify_isb1_kv_stress_contract( + _load_and_merge_yaml_files([config_file]), + config_file=config_file, + ) + + return all_config_data + + def load_runner_file(runner_file: str, validate: bool = True) -> dict: """Load runner configuration file. diff --git a/utils/process_result.py b/utils/process_result.py index 0a84a1f18..e680239d1 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,6 +4,15 @@ from pathlib import Path +def fail_if_isb1_replay_requested(): + """Guard against sending ISB1 replay results through the throughput processor.""" + if os.environ.get('BENCHMARK_TYPE') == 'isb1_replay': + raise SystemExit( + 'process_result.py does not support ISB1 replay results. ' + 'Use utils/process_result_isb1.py instead.' 
+ ) + + +def get_required_env_vars(required_vars): """Load and validate required environment variables.""" env_values = {} @@ -22,6 +31,8 @@ def get_required_env_vars(required_vars): return env_values +fail_if_isb1_replay_requested() + # Base required env vars base_env = get_required_env_vars([ 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', @@ -42,6 +53,12 @@ def get_required_env_vars(required_vars): with open(f'{result_filename}.json') as f: bmk_result = json.load(f) +if 'aggregate_metrics' in bmk_result and 'total_token_throughput_tps' in bmk_result['aggregate_metrics']: + raise SystemExit( + 'Detected an ISB1 replay-style result payload in process_result.py. ' + 'Use utils/process_result_isb1.py instead.' + ) + data = { 'hw': hw, 'conc': int(bmk_result['max_concurrency']), diff --git a/utils/process_result_isb1.py b/utils/process_result_isb1.py new file mode 100644 index 000000000..7f338ab2c --- /dev/null +++ b/utils/process_result_isb1.py @@ -0,0 +1,490 @@ +import json +import os +import re +import sys +from pathlib import Path +from typing import Any, Optional, Tuple + +ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] + + +def get_required_env_vars(required_vars): + """Load and validate required environment variables.""" + env_values = {} + missing_env_vars = [] + + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}" + ) + + return env_values + + +def parse_export_shape(export_file: str) -> Tuple[int, int, Optional[str], str, dict[str, Any]]: + """Derive ISL/OSL plus export lane/surface and preview metadata from the export path/file.""" + export_path = Path(export_file) + match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) + + isl = int(os.environ.get("ISL", "0") or 0) + osl = int(os.environ.get("OSL", "0") or 0) + surface = export_path.stem + metadata: dict[str, Any] = {} + + if match: + isl = int(match.group("isl")) * 1024 + osl = int(match.group("osl")) * 1024 + surface = export_path.stem[: match.start()].rstrip("_-") or export_path.stem + + lane = None + if "exports" in export_path.parts: + exports_idx = export_path.parts.index("exports") + if exports_idx + 1 < len(export_path.parts): + lane = export_path.parts[exports_idx + 1] + if lane == "preview" and exports_idx + 2 < len(export_path.parts): + lane = f"preview/{export_path.parts[exports_idx + 2]}" + + try: + payload = json.loads(export_path.read_text()) + except (FileNotFoundError, json.JSONDecodeError): + payload = None + + if payload is not None: + served_shape = payload.get("served_shape") or {} + isl = int(served_shape.get("isl", isl) or isl) + osl = int(served_shape.get("osl", osl) or osl) + surface = payload.get("surface") or payload.get("adapter_surface") or surface + + context_bands = sorted( + { + cell.get("context_band") + for cell in payload.get("exports", []) + if cell.get("context_band") + } + ) + metadata = { + "adapter_id": payload.get("adapter_id"), + "bundle_id": payload.get("bundle_id"), + "profile_id": payload.get("profile_id"), + "duration_tier": payload.get("duration_tier"), + "context_bands": context_bands, + "adapter_support_status": payload.get("adapter_support_status"), + "profile_tier": payload.get("tier"), + } + producer_handoff = payload.get("producer_handoff_metadata") or {} + if producer_handoff: + metadata["producer_handoff_class"] = producer_handoff.get("class") + metadata["producer_claim_boundary"] = producer_handoff.get("claim_boundary") + + # Extract producer KV expectations from first export cell trace_metadata + first_cell = (payload.get("exports") or [{}])[0] if payload.get("exports") else {} + trace_metadata = first_cell.get("trace_metadata", {}) + if trace_metadata: + metadata["producer_estimated_kv_bytes_peak"] = trace_metadata.get("estimated_kv_bytes_peak") + pressure_profile = trace_metadata.get("context_pressure_profile", {}) + metadata["producer_expected_offload_mode"] = ( + pressure_profile.get("expected_offload_mode") + or trace_metadata.get("expected_offload_mode") + ) + + return isl, osl, lane, surface, metadata
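+# Illustrative call (the path mirrors the test fixtures; the result shown is a
+# sketch under those assumptions, not captured output):
+#   parse_export_shape("datasets/isb1/exports/core/chat_8k1k.json")
+#   -> (8192, 1024, "core", "chat", {...})
+# The "8k1k" stem drives ISL/OSL; stems without an <isl>k<osl>k token fall
+# back to the ISL/OSL environment variables, and metadata is only populated
+# when the export JSON itself is readable on disk.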
producer_handoff.get("class") + metadata["producer_claim_boundary"] = producer_handoff.get("claim_boundary") + + # Extract producer KV expectations from first export cell trace_metadata + first_cell = (payload.get("exports") or [{}])[0] if payload.get("exports") else {} + trace_metadata = first_cell.get("trace_metadata", {}) + if trace_metadata: + metadata["producer_estimated_kv_bytes_peak"] = trace_metadata.get("estimated_kv_bytes_peak") + pressure_profile = trace_metadata.get("context_pressure_profile", {}) + metadata["producer_expected_offload_mode"] = ( + pressure_profile.get("expected_offload_mode") + or trace_metadata.get("expected_offload_mode") + ) + + return isl, osl, lane, surface, metadata + + +def validate_support_status_selection( + expected_support_status: Optional[str], selection: dict[str, Any] +) -> None: + """Ensure processed ISB1 output is labeled with the tier actually selected by the harness.""" + if not expected_support_status: + return + + selected_statuses = selection.get("support_statuses") or [] + if not selected_statuses: + raise ValueError( + "ISB1 replay result is missing selection.support_statuses; " + "cannot certify the processed support tier." + ) + + unique_statuses = sorted(set(selected_statuses)) + if unique_statuses != [expected_support_status]: + raise ValueError( + "ISB1 replay result support-status mismatch: " + f"workflow requested '{expected_support_status}' but harness selected {unique_statuses}." + ) + + +def validate_certification_selection(selection: dict[str, Any]) -> None: + """Ensure processed ISB1 output carries the expected runnable certification.""" + selected_statuses = selection.get("benchmark_certification_statuses") or [] + if not selected_statuses: + raise ValueError( + "ISB1 replay result is missing selection.benchmark_certification_statuses; " + "cannot certify the processed replay result." + ) + + unique_statuses = sorted(set(selected_statuses)) + if unique_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + "ISB1 replay result benchmark-certification mismatch: " + "current consumer lanes require " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}, but harness selected {unique_statuses}." 
+ ) + + +def build_context_pressure_signal( + context_pressure_class: str, + kv_offload_observed: bool, + peak_cpu_cache_usage: float, + cpu_cache_metric_available: bool, + depth_coverage_ratio: Optional[float] = None, + max_actual_context_len: Optional[int] = None, +) -> dict[str, Any]: + """Emit a machine-readable status for preview-lane context-pressure validation.""" + if context_pressure_class == "standard": + status = "not_applicable" + reason = "standard_context" + requires_log_review = False + elif depth_coverage_ratio is not None and depth_coverage_ratio < 0.1: + status = "depth_mismatch" + reason = "configured_depth_not_exercised" + requires_log_review = True + elif not cpu_cache_metric_available: + status = "observability_gap" + reason = "no_direct_cpu_cache_metric" + requires_log_review = True + elif not kv_offload_observed and peak_cpu_cache_usage == 0.0: + status = "suspicious" + reason = "high_context_without_cpu_cache_usage" + requires_log_review = True + else: + status = "ok" + reason = "cpu_cache_signal_present" + requires_log_review = False + + result = { + "status": status, + "reason": reason, + "requires_log_review": requires_log_review, + "cpu_cache_metric_available": cpu_cache_metric_available, + } + if depth_coverage_ratio is not None: + result["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) + if max_actual_context_len is not None: + result["max_actual_context_len"] = max_actual_context_len + return result + + +def build_runtime_overrides(replay_result: dict[str, Any]) -> dict[str, Optional[str]]: + """Return a stable runtime-overrides payload for aggregated ISB1 results.""" + override_mapping = { + "vllm_cpu_offload_gb": "VLLM_CPU_OFFLOAD_GB", + "vllm_swap_space_gb": "VLLM_SWAP_SPACE_GB", + "sglang_mem_fraction_override": "SGLANG_MEM_FRACTION_OVERRIDE", + "sglang_chunked_prefill_override": "SGLANG_CHUNKED_PREFILL_OVERRIDE", + } + runtime_overrides: dict[str, Optional[str]] = {} + + for result_key, env_var in override_mapping.items(): + value = replay_result.get(result_key) + if value in (None, ""): + value = os.environ.get(env_var) + runtime_overrides[result_key] = value if value not in (None, "") else None + + return runtime_overrides + + +def build_artifact_stems(result_filename: str) -> dict[str, str]: + """Return artifact names emitted by benchmark-isb1-tmpl.yml for this result stem.""" + return { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + } + + +def build_dispatch_ref() -> Optional[str]: + """Return the best available workflow dispatch ref for traceability.""" + for env_var in ("DISPATCH_REF", "INPUT_REF", "GITHUB_REF"): + value = os.environ.get(env_var) + if value not in (None, ""): + return value + return None + + +base_env = get_required_env_vars( + [ + "RUNNER_TYPE", + "FRAMEWORK", + "PRECISION", + "RESULT_FILENAME", + "MODEL_PREFIX", + "IMAGE", + "TP", + "EP_SIZE", + "DP_ATTENTION", + "BENCHMARK_TYPE", + "EXPORT_FILE", + "RUNTIME_STACK_ID", + "HARDWARE_PROFILE_ID", + "CANONICAL_MODEL_ID", + "REQUEST_MODE", + "MAX_CONCURRENCY", + ] +) + +result_filename = base_env["RESULT_FILENAME"] +with open(f"{result_filename}.json") as f: + replay_result = json.load(f) + +aggregate = replay_result["aggregate_metrics"] +tp_size = int(base_env["TP"]) +ep_size = int(base_env["EP_SIZE"]) +validate_support_status_selection( + os.environ.get("SUPPORT_STATUS") or None, + replay_result.get("selection", {}), +) 
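+# The selection gates on either side of this comment read the harness-written
+# "selection" block. A minimal payload that passes both (illustrative values,
+# not an exhaustive schema):
+#   "selection": {
+#     "support_statuses": ["supported"],
+#     "benchmark_certification_statuses": ["dataset_replay_verified"]
+#   }
+# Any other certification mix aborts processing before metrics are shaped.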
+validate_certification_selection(replay_result.get("selection", {})) +isl, osl, export_lane, benchmark_surface, export_metadata = parse_export_shape( + base_env["EXPORT_FILE"] +) + +total_tput = float(aggregate["total_token_throughput_tps"]) +output_tput = float(aggregate["output_throughput_tps"]) + +server_metrics_summary = replay_result.get("server_metrics_summary", {}) +cpu_cache_metric_available_raw = server_metrics_summary.get("cpu_cache_metric_available") +cpu_cache_metric_available = bool(cpu_cache_metric_available_raw) +if cpu_cache_metric_available_raw is None: + # Backward-compatibility shim for older replay outputs that predate the + # explicit availability field. Presence of the metric name/fields is a + # better signal than the sampled value because a real metric can be present + # and legitimately report 0.0. + cpu_cache_metric_available = bool(server_metrics_summary.get("cpu_cache_metric_name")) or any( + metric_name in server_metrics_summary + for metric_name in ("cpu_cache_usage_avg", "cpu_cache_usage_peak") + ) + +data = { + "hw": base_env["RUNNER_TYPE"], + "conc": int(replay_result.get("max_concurrency", base_env["MAX_CONCURRENCY"])), + "image": base_env["IMAGE"], + "model": replay_result["model_id"], + "infmax_model_prefix": base_env["MODEL_PREFIX"], + "framework": base_env["FRAMEWORK"], + "precision": base_env["PRECISION"], + "spec_decoding": os.environ.get("SPEC_DECODING", "none"), + "disagg": False, + "isl": isl, + "osl": osl, + "is_multinode": False, + "tp": tp_size, + "ep": ep_size, + "dp_attention": base_env["DP_ATTENTION"], + "tput_per_gpu": total_tput / tp_size, + "output_tput_per_gpu": output_tput / tp_size, + "input_tput_per_gpu": (total_tput - output_tput) / tp_size, + "benchmark_type": base_env["BENCHMARK_TYPE"], + "result_filename": result_filename, + "artifact_stems": build_artifact_stems(result_filename), + "dispatch_ref": build_dispatch_ref(), + "export_file": base_env["EXPORT_FILE"], + "export_lane": export_lane, + "benchmark_surface": benchmark_surface, + "adapter_id": export_metadata.get("adapter_id"), + "bundle_id": export_metadata.get("bundle_id"), + "profile_id": export_metadata.get("profile_id"), + "duration_tier": export_metadata.get("duration_tier"), + "context_bands": export_metadata.get("context_bands", []), + "adapter_support_status": export_metadata.get("adapter_support_status"), + "profile_tier": export_metadata.get("profile_tier"), + "producer_handoff_class": export_metadata.get("producer_handoff_class"), + "producer_claim_boundary": export_metadata.get("producer_claim_boundary"), + "runtime_stack_id": base_env["RUNTIME_STACK_ID"], + "hardware_profile_id": base_env["HARDWARE_PROFILE_ID"], + "canonical_model_id": base_env["CANONICAL_MODEL_ID"], + "support_status": os.environ.get("SUPPORT_STATUS") or None, + "benchmark_certification_status": replay_result.get("selection", {}).get( + "benchmark_certification_statuses", [None] + )[0], + "request_mode": base_env["REQUEST_MODE"], + "workload_type": os.environ.get("WORKLOAD_TYPE") or benchmark_surface, + "benchmark_duration_s": ( + float(os.environ["BENCHMARK_DURATION_S"]) + if os.environ.get("BENCHMARK_DURATION_S") not in (None, "") + else None + ), + "campaign_class": ( + "kv_stress" + if base_env["BENCHMARK_TYPE"] == "isb1_kv_stress" + else "replay" + ), + "harness_request_mode": replay_result.get("harness_request_mode", "auto"), + "mode": replay_result.get("mode"), + "selection": replay_result.get("selection", {}), + "aggregate_metrics": aggregate, + "per_turn_metrics": 
replay_result.get("per_turn_metrics", {}), + "server_metrics_summary": server_metrics_summary, + "cache_observability_status": server_metrics_summary.get("observability_status"), + "gpu_cache_metric_name": server_metrics_summary.get("gpu_cache_metric_name"), + "cpu_cache_metric_name": server_metrics_summary.get("cpu_cache_metric_name"), + "cpu_cache_metric_available": cpu_cache_metric_available, + "kv_offload_observed": bool(server_metrics_summary.get("kv_offload_observed", False)), + "peak_gpu_cache_usage": float(server_metrics_summary.get("gpu_cache_usage_peak", 0.0)), + "peak_cpu_cache_usage": float(server_metrics_summary.get("cpu_cache_usage_peak", 0.0)), + "session_throughput_sps": float(aggregate.get("session_throughput_sps", 0.0)), + "completed_sessions": int(aggregate.get("completed_sessions", 0)), + "total_sessions": int(aggregate.get("total_sessions", 0)), + "num_sessions": replay_result.get("num_sessions"), + "max_turns": replay_result.get("max_turns"), + "num_warmup_sessions": replay_result.get( + "num_warmup_sessions", int(os.environ.get("NUM_WARMUP_SESSIONS", "0") or 0) + ), + "max_model_len": ( + int(os.environ["MAX_MODEL_LEN"]) + if os.environ.get("MAX_MODEL_LEN") not in (None, "") + else None + ), + "max_sessions": ( + int(os.environ["MAX_SESSIONS"]) + if os.environ.get("MAX_SESSIONS") not in (None, "") + else None + ), + "max_turns_per_session": ( + int(os.environ["MAX_TURNS_PER_SESSION"]) + if os.environ.get("MAX_TURNS_PER_SESSION") not in (None, "") + else None + ), + "max_output_len": ( + int(os.environ["MAX_OUTPUT_LEN"]) + if os.environ.get("MAX_OUTPUT_LEN") not in (None, "") + else None + ), + "ignore_waits": os.environ.get("IGNORE_WAITS", "false").lower() == "true", + "ignore_eos": os.environ.get("IGNORE_EOS", "false").lower() == "true", + "offload_mode": os.environ.get("OFFLOAD_MODE") or None, + "kv_cache_dtype": os.environ.get("KV_CACHE_DTYPE") or None, + "disable_prefix_caching": os.environ.get("DISABLE_PREFIX_CACHING", "false").lower() == "true", + "runtime_overrides": build_runtime_overrides(replay_result), +} + +effective_max_context_depth = data["max_model_len"] or (isl + osl + 200) +data["effective_max_context_depth"] = effective_max_context_depth +if effective_max_context_depth > 600000: + data["context_pressure_class"] = "extended_1m" +elif effective_max_context_depth > 200000: + data["context_pressure_class"] = "extended_500k" +else: + data["context_pressure_class"] = "standard" + +# Depth telemetry: actual vs configured context depth +depth_telemetry = replay_result.get("depth_telemetry", {}) +max_actual_context_len = int(depth_telemetry.get("max_actual_context_len_per_turn") or 0) or None +total_actual_input_tokens = int(depth_telemetry.get("total_actual_input_tokens") or 0) or None +depth_coverage_ratio = None +if max_actual_context_len and effective_max_context_depth > 0: + depth_coverage_ratio = max_actual_context_len / effective_max_context_depth + +data["total_actual_input_tokens"] = total_actual_input_tokens +data["max_actual_context_len_per_turn"] = max_actual_context_len +data["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) if depth_coverage_ratio is not None else None +data["depth_gap_tokens"] = ( + effective_max_context_depth - max_actual_context_len + if max_actual_context_len is not None else None +) + +# Depth coverage classification +if depth_coverage_ratio is not None: + if depth_coverage_ratio >= 0.9: + data["depth_coverage_class"] = "full" + elif depth_coverage_ratio >= 0.5: + data["depth_coverage_class"] = "partial" + elif 
depth_coverage_ratio >= 0.1: + data["depth_coverage_class"] = "bounded_preview" + else: + data["depth_coverage_class"] = "configuration_only" +else: + data["depth_coverage_class"] = None + +# Producer expectation comparison +producer_estimated_kv_bytes_peak = export_metadata.get("producer_estimated_kv_bytes_peak") +producer_expected_offload_mode = export_metadata.get("producer_expected_offload_mode") +data["producer_estimated_kv_bytes_peak"] = producer_estimated_kv_bytes_peak +data["producer_expected_offload_mode"] = producer_expected_offload_mode + +offload_mode_match = None +if producer_expected_offload_mode and data["context_pressure_class"] != "standard": + if producer_expected_offload_mode in ("hard_offload", "soft_offload"): + offload_mode_match = data["kv_offload_observed"] + elif producer_expected_offload_mode == "none": + offload_mode_match = True +data["producer_expectation_validation"] = { + "offload_mode_match": offload_mode_match, + "kv_bytes_validation": "not_available", + "depth_exercised": bool(depth_coverage_ratio and depth_coverage_ratio >= 0.5), +} + +# Preemption count from server metrics +data["preemption_count"] = int( + server_metrics_summary.get("preemption_count") + or replay_result.get("preemption_count") + or 0 +) + +context_pressure_signal = build_context_pressure_signal( + context_pressure_class=data["context_pressure_class"], + kv_offload_observed=data["kv_offload_observed"], + peak_cpu_cache_usage=data["peak_cpu_cache_usage"], + cpu_cache_metric_available=data["cpu_cache_metric_available"], + depth_coverage_ratio=depth_coverage_ratio, + max_actual_context_len=max_actual_context_len, +) +data["context_pressure_signal"] = context_pressure_signal +data["context_pressure_suspicious"] = context_pressure_signal["status"] == "suspicious" + +if data["context_pressure_suspicious"]: + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} saw no CPU cache usage. " + "The server may have silently capped context or failed to activate KV offload. " + "Check server.log for OOM or context truncation.", + file=sys.stderr, + ) +elif context_pressure_signal["status"] == "depth_mismatch": + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} had max actual context of " + f"{max_actual_context_len} tokens (depth_coverage_ratio=" + f"{depth_coverage_ratio:.4f}). The server was configured for " + f"{data['context_pressure_class'].replace('extended_', '')} but requests only exercised " + f"{max_actual_context_len} tokens. This is expected for file-backed replay previews; " + "it does not prove KV pressure at the configured depth.", + file=sys.stderr, + ) +elif context_pressure_signal["status"] == "observability_gap": + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} lacks a direct CPU cache metric " + "for this framework. 
Inspect server.log and operator tuning notes before " + "treating the run as credible long-context evidence.", + file=sys.stderr, + ) + +for key, value in aggregate.items(): + if key.endswith("_ms"): + data[key.replace("_ms", "")] = float(value) / 1000.0 + if "tpot" in key: + metric_value = float(value) + data[key.replace("_ms", "").replace("tpot", "intvty")] = ( + 1000.0 / metric_value if metric_value > 0 else 0.0 + ) + +print(json.dumps(data, indent=2)) + +with open(f"agg_{result_filename}.json", "w") as f: + json.dump(data, f, indent=2) diff --git a/utils/summarize_isb1.py b/utils/summarize_isb1.py new file mode 100644 index 000000000..3c2428a4b --- /dev/null +++ b/utils/summarize_isb1.py @@ -0,0 +1,238 @@ +import argparse +import json +from pathlib import Path +from typing import Any + +try: + from tabulate import tabulate as _tabulate +except ImportError: # pragma: no cover - fallback for minimal local environments + _tabulate = None + + +SUPPORT_STATUS_ORDER = { + "supported": 0, + "reviewed_preview": 1, + "gated": 2, + "artifact_only": 3, + "unsupported": 4, + None: 5, +} + + +def load_isb1_rows(results_dir: Path) -> list[dict[str, Any]]: + """Load processed ISB1 rows from a results directory.""" + rows: list[dict[str, Any]] = [] + for result_path in results_dir.rglob("*.json"): + try: + payload = json.loads(result_path.read_text()) + except (OSError, json.JSONDecodeError): + continue + + candidates = payload if isinstance(payload, list) else [payload] + for candidate in candidates: + if isinstance(candidate, dict) and candidate.get("benchmark_type") == "isb1_replay": + rows.append(candidate) + return rows + + +def sort_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Sort rows in an operator-friendly order.""" + return sorted( + rows, + key=lambda row: ( + SUPPORT_STATUS_ORDER.get(row.get("support_status"), 99), + row.get("infmax_model_prefix", ""), + row.get("hw", ""), + row.get("framework", ""), + row.get("effective_max_context_depth", 0) or 0, + row.get("result_filename", ""), + ), + ) + + +def format_float(value: Any, precision: int = 2) -> str: + """Format a numeric value for markdown tables.""" + if value is None: + return "-" + try: + return f"{float(value):.{precision}f}" + except (TypeError, ValueError): + return str(value) + + +def format_bool(value: Any) -> str: + """Format a truthy value as yes/no for operators.""" + return "yes" if bool(value) else "no" + + +def render_table(headers: list[str], rows: list[list[Any]], tablefmt: str) -> str: + """Render a markdown/plain table with a lightweight fallback if tabulate is absent.""" + normalized_rows = [[str(cell) for cell in row] for row in rows] + if _tabulate is not None: + return _tabulate(normalized_rows, headers=headers, tablefmt=tablefmt) + + widths = [len(header) for header in headers] + for row in normalized_rows: + for index, cell in enumerate(row): + widths[index] = max(widths[index], len(cell)) + + def render_row(row: list[str]) -> str: + cells = [cell.ljust(widths[index]) for index, cell in enumerate(row)] + return f"| {' | '.join(cells)} |" + + divider = f"| {' | '.join('-' * width for width in widths)} |" + lines = [render_row(headers), divider] + lines.extend(render_row(row) for row in normalized_rows) + return "\n".join(lines) + + +def build_lane_summary_table(rows: list[dict[str, Any]], tablefmt: str) -> str: + """Render the main operator lane summary table.""" + headers = [ + "Lane", + "Model", + "HW", + "Framework", + "Support", + "Cert", + "Max Ctx", + "Context Class", + "Sessions", + 
"Session Tput", + "TTFT Median (s)", + "Ctx Pressure", + "Log Review", + "KV Offload", + "GPU Cache Peak", + "CPU Cache Peak", + ] + table_rows = [ + [ + row.get("result_filename", "-"), + row.get("infmax_model_prefix", "-"), + row.get("hw", "-"), + row.get("framework", "-"), + row.get("support_status", "-"), + row.get("benchmark_certification_status", "-"), + row.get("effective_max_context_depth", "-"), + row.get("context_pressure_class", "-"), + f"{row.get('completed_sessions', 0)}/{row.get('total_sessions', 0)}", + format_float(row.get("session_throughput_sps"), 2), + format_float(row.get("median_ttft"), 3), + (row.get("context_pressure_signal") or {}).get("status", "-"), + format_bool((row.get("context_pressure_signal") or {}).get("requires_log_review")), + format_bool(row.get("kv_offload_observed")), + format_float(row.get("peak_gpu_cache_usage"), 2), + format_float(row.get("peak_cpu_cache_usage"), 2), + ] + for row in rows + ] + return render_table(headers, table_rows, tablefmt) + + +def build_runtime_override_table(rows: list[dict[str, Any]], tablefmt: str) -> str | None: + """Render the runtime override table when any override is present.""" + override_rows = [] + for row in rows: + runtime_overrides = row.get("runtime_overrides") or {} + if not any(value not in (None, "") for value in runtime_overrides.values()): + continue + override_rows.append( + [ + row.get("result_filename", "-"), + row.get("infmax_model_prefix", "-"), + row.get("hw", "-"), + row.get("framework", "-"), + runtime_overrides.get("vllm_cpu_offload_gb") or "-", + runtime_overrides.get("vllm_swap_space_gb") or "-", + runtime_overrides.get("sglang_mem_fraction_override") or "-", + runtime_overrides.get("sglang_chunked_prefill_override") or "-", + row.get("dispatch_ref") or "-", + ] + ) + + if not override_rows: + return None + + headers = [ + "Lane", + "Model", + "HW", + "Framework", + "VLLM CPU Offload GB", + "VLLM Swap GB", + "SGLang Mem Fraction", + "SGLang Chunked Prefill", + "Dispatch Ref", + ] + return render_table(headers, override_rows, tablefmt) + + +def build_action_items(rows: list[dict[str, Any]]) -> list[str]: + """Build operator action items for suspicious or manual-review rows.""" + items: list[str] = [] + for row in rows: + signal = row.get("context_pressure_signal") or {} + if not row.get("context_pressure_suspicious") and not signal.get("requires_log_review"): + continue + + artifact_stems = row.get("artifact_stems") or {} + items.append( + "- " + f"`{row.get('result_filename', 'unknown')}` ({row.get('infmax_model_prefix', '-')}/" + f"{row.get('hw', '-')}/{row.get('framework', '-')}) " + f"requires follow-up: context pressure `{signal.get('status', 'unknown')}`; " + f"review replay `{artifact_stems.get('raw_replay', '-')}`, " + f"logs `{artifact_stems.get('server_logs', '-')}`, " + f"GPU metrics `{artifact_stems.get('gpu_metrics', '-')}`" + + ( + f", dispatch `{row.get('dispatch_ref')}`" + if row.get("dispatch_ref") + else "" + ) + + "." 
+ ) + return items + + +def generate_summary(results_dir: Path, tablefmt: str = "github") -> str: + """Generate an ISB1-specific operator summary in markdown/plain text.""" + rows = sort_rows(load_isb1_rows(results_dir)) + sections = ["## ISB1 Operator Summary", ""] + + if not rows: + sections.append("No ISB1 replay rows found.") + return "\n".join(sections).rstrip() + "\n" + + sections.extend(["### Lane Summary", "", build_lane_summary_table(rows, tablefmt), ""]) + + runtime_override_table = build_runtime_override_table(rows, tablefmt) + if runtime_override_table: + sections.extend(["### Runtime Overrides", "", runtime_override_table, ""]) + + action_items = build_action_items(rows) + sections.append("### Action Items") + sections.append("") + if action_items: + sections.extend(action_items) + else: + sections.append("- None. No suspicious or manual-log-review rows were detected.") + + return "\n".join(sections).rstrip() + "\n" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate an ISB1-specific operator summary.") + parser.add_argument("results_dir", type=Path) + parser.add_argument("--format", choices=["github", "plain"], default="github") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + print(generate_summary(args.results_dir, tablefmt=args.format)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/utils/test_benchmark_export_replay.py b/utils/test_benchmark_export_replay.py new file mode 100644 index 000000000..31e4dc656 --- /dev/null +++ b/utils/test_benchmark_export_replay.py @@ -0,0 +1,766 @@ +import asyncio +import json +from pathlib import Path + +from aiohttp import web + +from bench_serving.benchmark_export_replay import ( + load_replay_sessions, + run_export_replay_benchmark, +) + + +def _count_tokens(text: str) -> int: + return max(1, len((text or "").split())) if text else 0 + + +def _multiturn_payload(runtime_stack_id: str = "standalone:sglang") -> dict: + return { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": "trace-chat-1", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_30b_a3b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "session": { + "session_id": "session-chat-1", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Investigate the flaky test."} + ], + } + ], + "expected_output_tokens": 8, + "wait_before_ms": 0, + }, + { + "turn_idx": 1, + "turn_id": 1, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Investigate the flaky test."} + ], + }, + { + "role": "assistant", + "content_blocks": [ + {"type": "text", "text": "I found a race in the setup."} + ], + }, + { + "role": "tool", + "content_blocks": [ + {"type": "log", "text": "pytest -k flaky_test -> failed"} + ], + }, + ], + "expected_output_tokens": 6, + "wait_before_ms": 10, + }, + ], + }, + } + ], + } + + +def _trace_replay_payload(runtime_stack_id: str = "standalone:trt_llm") -> dict: + return { + "adapter_id": "inferencex_trace_replay", + "exports": [ + { + "trace_id": "trace-replay-1", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "gpt_oss_120b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "trace_metadata": 
{"session_id": "session-replay-1"}, + "events": [ + { + "turn_id": 0, + "arrival_time_offset_ms": 0, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Summarize the incident report."} + ], + } + ], + "target_output_tokens": 7, + }, + { + "turn_id": 1, + "arrival_time_offset_ms": 25, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Summarize the incident report."} + ], + }, + { + "role": "assistant", + "content_blocks": [ + {"type": "text", "text": "The outage started after deploy."} + ], + }, + ], + "target_output_tokens": 5, + }, + ], + } + ], + } + + +async def _start_mock_server( + sse_mode: str = "normal", + metrics_text: str | None = None, +) -> tuple[web.AppRunner, str]: + """Start a mock OpenAI-compatible server. + + sse_mode controls how SSE frames are written to the wire: + - "normal": one data frame per write (default) + - "multiline": multiple data frames packed into a single write + - "split": a single data frame split across two writes + """ + + async def _stream_response(request: web.Request, chunks: list[dict]) -> web.StreamResponse: + response = web.StreamResponse( + status=200, + headers={"Content-Type": "text/event-stream"}, + ) + await response.prepare(request) + + if sse_mode == "multiline": + # Pack ALL data frames into a single TCP write + blob = b"" + for chunk in chunks: + blob += f"data: {json.dumps(chunk)}\n\n".encode() + blob += b"data: [DONE]\n\n" + await response.write(blob) + elif sse_mode == "split": + # Split the first frame across two writes + for idx, chunk in enumerate(chunks): + frame = f"data: {json.dumps(chunk)}\n\n".encode() + if idx == 0: + mid = len(frame) // 2 + await response.write(frame[:mid]) + await asyncio.sleep(0.005) + await response.write(frame[mid:]) + else: + await response.write(frame) + await asyncio.sleep(0.005) + await response.write(b"data: [DONE]\n\n") + else: + for chunk in chunks: + await response.write(f"data: {json.dumps(chunk)}\n\n".encode()) + await asyncio.sleep(0.005) + await response.write(b"data: [DONE]\n\n") + + await response.write_eof() + return response + + async def chat_handler(request: web.Request) -> web.StreamResponse: + payload = await request.json() + # Verify the fallback from max_completion_tokens -> max_tokens. 
+ if "max_completion_tokens" in payload: + return web.json_response({"error": "unsupported field"}, status=400) + assert payload["messages"] + return await _stream_response( + request, + [ + {"choices": [{"delta": {"content": "patched"}}]}, + {"usage": {"completion_tokens": 2}}, + ], + ) + + async def completions_handler(request: web.Request) -> web.StreamResponse: + payload = await request.json() + assert payload["prompt"].startswith("USER:") + return await _stream_response( + request, + [ + {"choices": [{"text": "resolved"}]}, + {"usage": {"completion_tokens": 2}}, + ], + ) + + async def metrics_handler(_: web.Request) -> web.Response: + return web.Response( + text=metrics_text + or ( + "vllm:gpu_cache_usage_perc 0.42\n" + "vllm:cpu_cache_usage_perc 0.25\n" + "sglang:cache_hit_rate 0.8\n" + ) + ) + + app = web.Application() + app.router.add_post("/v1/chat/completions", chat_handler) + app.router.add_post("/v1/completions", completions_handler) + app.router.add_get("/metrics", metrics_handler) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, host="127.0.0.1", port=0) + await site.start() + sockets = getattr(site, "_server").sockets + port = sockets[0].getsockname()[1] + return runner, f"http://127.0.0.1:{port}" + + +def test_load_replay_sessions_multiturn_chat(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="auto", + ignore_waits=False, + ) + + assert len(sessions) == 1 + assert sessions[0].request_mode == "chat" + assert sessions[0].turns[1].wait_before_s == 0.01 + assert selection["support_statuses"] == ["supported"] + assert selection["support_status_counts"] == {"supported": 1} + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + assert selection["benchmark_certification_status_counts"] == { + "dataset_replay_verified": 1 + } + assert selection["request_mode_mix"] == {"chat": 1} + + +def test_load_replay_sessions_trace_replay_auto_uses_completions(tmp_path: Path) -> None: + export_file = tmp_path / "trace_replay.json" + export_file.write_text(json.dumps(_trace_replay_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="auto", + ) + + assert len(sessions) == 1 + assert sessions[0].request_mode == "completions" + assert sessions[0].turns[1].wait_before_s == 0.025 + assert sessions[0].turns[0].completion_prompt.startswith("USER:") + assert selection["support_statuses"] == ["supported"] + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + assert selection["request_mode_mix"] == {"completions": 1} + + +def test_load_replay_sessions_support_status_filter(tmp_path: Path) -> None: + payload = _multiturn_payload() + payload["exports"].append( + { + **payload["exports"][0], + "trace_id": "trace-chat-preview", + "support_status": "reviewed_preview", + } + ) + export_file = tmp_path / "multiturn_mixed_status.json" + export_file.write_text(json.dumps(payload)) + + sessions, selection = load_replay_sessions( + 
export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + support_statuses={"supported"}, + request_mode="auto", + ignore_waits=False, + ) + + assert [session.trace_id for session in sessions] == ["trace-chat-1"] + assert selection["support_statuses"] == ["supported"] + assert selection["support_status_counts"] == {"supported": 1} + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + + +def test_run_export_replay_benchmark_chat(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=1, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert result["selection"]["request_mode_mix"] == {"chat": 1} + assert result["server_metrics_summary"]["samples"] >= 0 + assert result["server_metrics_summary"]["gpu_cache_usage_peak"] == 0.42 + assert result["server_metrics_summary"]["cpu_cache_usage_peak"] == 0.25 + assert result["server_metrics_summary"]["gpu_cache_metric_name"] == "vllm:gpu_cache_usage_perc" + assert result["server_metrics_summary"]["cpu_cache_metric_name"] == "vllm:cpu_cache_usage_perc" + assert result["server_metrics_summary"]["cpu_cache_metric_available"] is True + assert result["server_metrics_summary"]["observability_status"] == "direct_cpu_cache_metric" + assert result["server_metrics_summary"]["kv_offload_observed"] is True + + +def test_run_export_replay_benchmark_completions(tmp_path: Path) -> None: + export_file = tmp_path / "trace_replay.json" + export_file.write_text(json.dumps(_trace_replay_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="completions", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="gpt-oss-120b", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert result["selection"]["request_mode_mix"] == {"completions": 1} + + +def 
test_run_export_replay_benchmark_sglang_token_usage_metrics(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn_sglang_metrics.json" + export_file.write_text(json.dumps(_multiturn_payload(runtime_stack_id="standalone:sglang"))) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server( + metrics_text=( + 'sglang:token_usage{model_name="Qwen/Qwen3-30B-A3B"} 0.61\n' + 'sglang:cache_hit_rate{model_name="Qwen/Qwen3-30B-A3B"} 0.8\n' + ) + ) + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + summary = result["server_metrics_summary"] + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert summary["samples"] >= 0 + assert summary["gpu_cache_usage_peak"] == 0.61 + assert summary["gpu_cache_metric_name"] == "sglang:token_usage" + assert summary["cpu_cache_metric_name"] is None + assert summary["cpu_cache_metric_available"] is False + assert summary["cache_hit_rate_avg"] == 0.8 + assert summary["observability_status"] == "indirect_without_cpu_cache_metric" + assert summary["kv_offload_observed"] is False + + +def test_sse_multiline_chunks(tmp_path: Path) -> None: + """Verify replay works when the server packs multiple SSE frames into one TCP write.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server(sse_mode="multiline") + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + + +def test_sse_split_across_chunks(tmp_path: Path) -> None: + """Verify replay works when a single SSE frame is split across TCP writes.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await 
_start_mock_server(sse_mode="split") + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + + +def test_empty_content_no_phantom_itl(tmp_path: Path) -> None: + """Verify that SSE chunks with empty/null content don't inflate ITL counts.""" + export_file = tmp_path / "multiturn.json" + # Use a single-turn export to isolate ITL counting + single_turn_payload = { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": "trace-itl-1", + "runtime_stack_id": "standalone:sglang", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_30b_a3b", + "support_status": "supported", + "session": { + "session_id": "session-itl-1", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Hello"} + ], + } + ], + "expected_output_tokens": 4, + "wait_before_ms": 0, + }, + ], + }, + } + ], + } + export_file.write_text(json.dumps(single_turn_payload)) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + # Custom server that sends empty-content chunks between real ones + async def _chat_with_empty(request: web.Request) -> web.StreamResponse: + payload = await request.json() + if "max_completion_tokens" in payload: + return web.json_response({"error": "unsupported"}, status=400) + + response = web.StreamResponse( + status=200, + headers={"Content-Type": "text/event-stream"}, + ) + await response.prepare(request) + # Frame 1: real content + await response.write( + f'data: {{"choices": [{{"delta": {{"content": "hello"}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Frame 2: empty content (should not generate ITL entry) + await response.write( + f'data: {{"choices": [{{"delta": {{"content": ""}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Frame 3: null content (should not generate ITL entry) + await response.write( + f'data: {{"choices": [{{"delta": {{}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Frame 4: real content + await response.write( + f'data: {{"choices": [{{"delta": {{"content": " world"}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Usage frame + await response.write( + f'data: {{"usage": {{"completion_tokens": 2}}}}\n\n'.encode() + ) + await response.write(b"data: [DONE]\n\n") + await response.write_eof() + return response + + # aiohttp route handlers must be coroutine functions, so serve /metrics + # from a small async stub instead of a bare lambda. + async def _empty_metrics(_: web.Request) -> web.Response: + return web.Response(text="") + + app = web.Application() + app.router.add_post("/v1/chat/completions", _chat_with_empty) + app.router.add_get("/metrics", _empty_metrics) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, host="127.0.0.1", port=0) + await site.start() + sockets = getattr(site, "_server").sockets + port = sockets[0].getsockname()[1] + base_url = f"http://127.0.0.1:{port}" + + try: + return await run_export_replay_benchmark( + sessions=sessions, 
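+ # Of the four delta frames the mock streams, only the two with real text + # should contribute TTFT/ITL samples.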
selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + agg = result["aggregate_metrics"] + assert agg["completed_sessions"] == 1 + # With 2 real content chunks, ITL should have exactly 1 entry + # (first content = TTFT, second content = 1 ITL). Empty/null chunks + # must not inflate this count. + turn_metrics = result["per_turn_metrics"]["turn_1"] + assert turn_metrics["completed"] == 1 + + +def test_actual_context_len_for_file_backed_assets(tmp_path: Path) -> None: + """Verify that actual_context_len counts rendered payload tokens, not asset metadata.""" + payload = { + "adapter_id": "inferencex_trace_replay", + "exports": [ + { + "trace_id": "test-asset-trace", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "gpt_oss_120b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "xlc2_384k_512k", + "trace_metadata": { + "session_id": "test-session", + "estimated_kv_bytes_peak": 27000000000, + "expected_offload_mode": "soft_offload", + }, + "events": [ + { + "event_id": "evt-0", + "trace_id": "test-asset-trace", + "session_id": "test-session", + "turn_id": 0, + "arrival_time_offset_ms": 0, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Analyze this codebase"}, + { + "type": "table", + "text": None, + "asset_path": "synthetic_v0/context_assets/big_file.md", + "asset_token_count": 500000, + "asset_byte_count": 2500000, + }, + ], + } + ], + "output": {"output_token_count": 100}, + } + ], + } + ], + } + export_file = tmp_path / "asset_test.json" + export_file.write_text(json.dumps(payload)) + + sessions, _ = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:vllm"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="chat", + ignore_waits=True, + ) + + assert len(sessions) == 1 + turn = sessions[0].turns[0] + + # Estimated context_len should include the 500k asset_token_count + assert turn.context_len >= 500000 + + # Actual context_len should be much smaller — just the rendered text + # "[TABLE]" is ~1 token + "Analyze this codebase" is ~3 tokens + assert turn.actual_context_len < 100 + assert turn.actual_context_len > 0 + + # The gap proves the measurement works + assert turn.context_len > turn.actual_context_len * 100 + + +def test_depth_telemetry_in_benchmark_result(tmp_path: Path) -> None: + """Verify depth_telemetry block is emitted in benchmark results.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + 
model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + + # depth_telemetry block must exist + assert "depth_telemetry" in result + dt = result["depth_telemetry"] + assert "total_estimated_input_tokens" in dt + assert "total_actual_input_tokens" in dt + assert "max_actual_context_len_per_turn" in dt + assert dt["total_actual_input_tokens"] > 0 + assert dt["max_actual_context_len_per_turn"] > 0 + + # Aggregate metrics must also carry actual input tokens + agg = result["aggregate_metrics"] + assert "total_actual_input_tokens" in agg + assert "max_actual_context_len_per_turn" in agg + + # Per-turn metrics should have actual context length + for turn_key, turn_metrics in result["per_turn_metrics"].items(): + assert "mean_actual_context_len" in turn_metrics diff --git a/utils/test_gate_isb1.py b/utils/test_gate_isb1.py new file mode 100644 index 000000000..3a9e590e0 --- /dev/null +++ b/utils/test_gate_isb1.py @@ -0,0 +1,218 @@ +import json +from pathlib import Path + +from gate_isb1 import build_gate_report, load_rows, main + + +def make_row( + *, + result_filename: str, + model: str, + hw: str, + framework: str, + support_status: str, + effective_max_context_depth: int, + context_pressure_class: str, + context_status: str, + requires_log_review: bool = False, + context_pressure_suspicious: bool = False, + completed_sessions: int = 2, + total_sessions: int = 2, + session_throughput_sps: float = 1.0, + benchmark_certification_status: str = "dataset_replay_verified", +): + return { + "benchmark_type": "isb1_replay", + "result_filename": result_filename, + "artifact_stems": { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + }, + "infmax_model_prefix": model, + "hw": hw, + "framework": framework, + "support_status": support_status, + "effective_max_context_depth": effective_max_context_depth, + "context_pressure_class": context_pressure_class, + "context_pressure_signal": { + "status": context_status, + "requires_log_review": requires_log_review, + }, + "context_pressure_suspicious": context_pressure_suspicious, + "completed_sessions": completed_sessions, + "total_sessions": total_sessions, + "session_throughput_sps": session_throughput_sps, + "benchmark_certification_status": benchmark_certification_status, + } + + +def test_build_gate_report_passes_with_sglang_observability_gap(): + rows = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + ), + make_row( + result_filename="gptoss_control_h100_vllm", + model="gptoss", + hw="h100-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + ), + ] + + for hw in ("b200-cw-1", "h100-cw-1", "h200-cw-1"): + for framework in ("vllm", "sglang"): + rows.append( + make_row( + result_filename=f"qwen_131k_{hw}_{framework}", + model="qwen3.5", + hw=hw, + framework=framework, + support_status="reviewed_preview", + effective_max_context_depth=131272, + 
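+ # Depth bookkeeping: the 9416 control depth above is 8192 isl + 1024 osl + # + 200 headroom; 131272 here looks like the 131072-token window plus the + # same 200-token margin.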
context_pressure_class="standard", + context_status="not_applicable", + ) + ) + rows.append( + make_row( + result_filename=f"qwen_500k_{hw}_{framework}", + model="qwen3.5", + hw=hw, + framework=framework, + support_status="reviewed_preview", + effective_max_context_depth=524288, + context_pressure_class="extended_500k", + context_status="ok" if framework == "vllm" else "observability_gap", + requires_log_review=framework == "sglang", + ) + ) + + rows.extend( + [ + make_row( + result_filename="qwen_1m_b200_vllm", + model="qwen3.5", + hw="b200-cw-1", + framework="vllm", + support_status="reviewed_preview", + effective_max_context_depth=1048576, + context_pressure_class="extended_1m", + context_status="ok", + ), + make_row( + result_filename="qwen_1m_b200_sglang", + model="qwen3.5", + hw="b200-cw-1", + framework="sglang", + support_status="reviewed_preview", + effective_max_context_depth=1048576, + context_pressure_class="extended_1m", + context_status="observability_gap", + requires_log_review=True, + ), + ] + ) + + report = build_gate_report(rows) + + assert report["overall"] == "pass" + assert all(gate["status"] == "pass" for gate in report["gates"]) + qwen_500k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_500k") + assert qwen_500k_gate["review_required_rows"] + assert any( + row["result_filename"] == "qwen_500k_b200-cw-1_sglang" + for row in qwen_500k_gate["review_required_rows"] + ) + + +def test_build_gate_report_fails_control_lane_and_preserves_artifact_refs(): + rows = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + completed_sessions=1, + total_sessions=2, + session_throughput_sps=0.0, + ) + ] + + report = build_gate_report(rows) + + assert report["overall"] == "fail" + control_gate = next(gate for gate in report["gates"] if gate["id"] == "control_lanes") + assert control_gate["status"] == "fail" + assert control_gate["failing_rows"][0]["result_filename"] == "dsr1_control_b200_vllm" + assert control_gate["failing_rows"][0]["artifact_stems"]["server_logs"] == "server_logs_dsr1_control_b200_vllm" + assert "completed_sessions == total_sessions" in control_gate["failing_rows"][0]["failed_criteria"] + assert "session_throughput_sps > 0" in control_gate["failing_rows"][0]["failed_criteria"] + + +def test_build_gate_report_fails_when_qwen_131k_coverage_is_missing(): + rows = [ + make_row( + result_filename="qwen_131k_b200_vllm", + model="qwen3.5", + hw="b200-cw-1", + framework="vllm", + support_status="reviewed_preview", + effective_max_context_depth=131272, + context_pressure_class="standard", + context_status="not_applicable", + ) + ] + + report = build_gate_report(rows) + + assert report["overall"] == "fail" + qwen_131k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_131k") + assert qwen_131k_gate["status"] == "fail" + assert ["b200", "sglang"] in qwen_131k_gate["missing_coverage"] + assert ["h200", "vllm"] in qwen_131k_gate["missing_coverage"] + + +def test_build_gate_report_handles_no_rows(): + report = build_gate_report([]) + + assert report["overall"] == "partial" + assert all(gate["status"] == "no_rows" for gate in report["gates"]) + + +def test_gate_main_strict_returns_nonzero_on_failure(tmp_path): + payload = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + 
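+ # completed_sessions=1 of total_sessions=2 below trips the control-lane + # gate, so main() with --strict must return a non-zero exit code.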
support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + completed_sessions=1, + total_sessions=2, + ) + ] + report_path = tmp_path / "agg_isb1.json" + report_path.write_text(json.dumps(payload)) + + assert load_rows(report_path)[0]["result_filename"] == "dsr1_control_b200_vllm" + assert main([str(report_path), "--strict"]) == 1 diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 2a6389a78..8bc51d593 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -47,6 +47,7 @@ def base_env_vars(): "OSL": "1024", "DISAGG": "false", "MODEL_PREFIX": "dsr1", + "IMAGE": "lmsysorg/sglang:v0.4.6.post5-cu126", } @@ -299,6 +300,32 @@ def test_missing_result_file(self, tmp_path, single_node_env_vars): assert result.returncode != 0 + def test_isb1_replay_env_guard(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """ISB1 replay runs should fail fast with a helpful processor redirect.""" + env = single_node_env_vars.copy() + env["BENCHMARK_TYPE"] = "isb1_replay" + + result = run_script(tmp_path, env, sample_benchmark_result) + + assert result.returncode != 0 + assert "Use utils/process_result_isb1.py instead" in result.stderr + + def test_isb1_replay_payload_guard(self, tmp_path, single_node_env_vars): + """Replay-shaped payloads should be rejected even without BENCHMARK_TYPE set.""" + replay_like_result = { + "model_id": "test-model", + "max_concurrency": 4, + "aggregate_metrics": { + "total_token_throughput_tps": 1000.0, + "output_throughput_tps": 800.0, + }, + } + + result = run_script(tmp_path, single_node_env_vars, replay_like_result) + + assert result.returncode != 0 + assert "Detected an ISB1 replay-style result payload" in result.stderr + # ============================================================================= # Test latency and throughput calculations diff --git a/utils/test_process_result_isb1.py b/utils/test_process_result_isb1.py new file mode 100644 index 000000000..f2a4f06fb --- /dev/null +++ b/utils/test_process_result_isb1.py @@ -0,0 +1,1006 @@ +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +SCRIPT_PATH = Path(__file__).parent / "process_result_isb1.py" + + +def write_export_fixture(tmp_path: Path, relative_path: str, payload: dict) -> str: + export_path = tmp_path / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text(json.dumps(payload)) + return str(export_path.relative_to(tmp_path)) + + +@pytest.fixture +def sample_replay_result(): + return { + "model_id": "deepseek-ai/DeepSeek-R1-0528", + "mode": "export_replay", + "max_concurrency": 8, + "num_sessions": 2, + "max_turns": 4, + "num_warmup_sessions": 1, + "harness_request_mode": "auto", + "selection": { + "adapter_id": "inferencex_multiturn", + "selected_sessions": 2, + "runtime_stack_ids": ["vllm-0.8.5-h200"], + "hardware_profile_ids": ["h200-8gpu"], + "canonical_model_ids": ["deepseek-r1-0528"], + "support_statuses": ["supported"], + "support_status_counts": {"supported": 2}, + "benchmark_certification_statuses": ["dataset_replay_verified"], + "benchmark_certification_status_counts": { + "dataset_replay_verified": 2 + }, + "request_mode_mix": {"chat": 2}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.45, + "cache_hit_rate_avg": 0.15, + "gpu_cache_usage_avg": 0.45, + "gpu_cache_usage_peak": 0.78, + "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", + "cpu_cache_usage_avg": 0.12, + 
"cpu_cache_usage_peak": 0.31, + "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": True, + "samples": 5, + }, + "per_turn_metrics": { + "turn_1": { + "completed": 2, + "mean_context_len": 8192.0, + "mean_ttft_ms": 180.0, + "p99_ttft_ms": 300.0, + "mean_e2el_ms": 1000.0, + } + }, + "aggregate_metrics": { + "completed_sessions": 2, + "total_sessions": 2, + "total_input_tokens": 1000, + "total_output_tokens": 300, + "total_wall_time_s": 2.0, + "session_throughput_sps": 1.0, + "output_throughput_tps": 150.0, + "total_token_throughput_tps": 650.0, + "mean_ttft_ms": 200.0, + "median_ttft_ms": 180.0, + "p99_ttft_ms": 500.0, + "mean_tpot_ms": 20.0, + "median_tpot_ms": 25.0, + "p99_tpot_ms": 50.0, + "mean_e2el_ms": 1200.0, + "median_e2el_ms": 1100.0, + "p99_e2el_ms": 2000.0, + }, + } + + +@pytest.fixture +def base_env(): + return { + "RUNNER_TYPE": "h200-cw-1", + "FRAMEWORK": "vllm", + "PRECISION": "fp8", + "RESULT_FILENAME": "isb1_result", + "MODEL_PREFIX": "dsr1", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "TP": "8", + "EP_SIZE": "1", + "DP_ATTENTION": "false", + "BENCHMARK_TYPE": "isb1_replay", + "EXPORT_FILE": "datasets/isb1/exports/core/chat_8k1k.json", + "RUNTIME_STACK_ID": "vllm-0.8.5-h200", + "HARDWARE_PROFILE_ID": "h200-8gpu", + "CANONICAL_MODEL_ID": "deepseek-r1-0528", + "SUPPORT_STATUS": "supported", + "REQUEST_MODE": "multi-turn", + "MAX_CONCURRENCY": "8", + "SPEC_DECODING": "none", + "IGNORE_WAITS": "true", + "GITHUB_REF": "refs/heads/test-isb1-traceability", + } + + +def run_script(tmp_path, env, replay_result, result_filename="isb1_result"): + result_file = tmp_path / f"{result_filename}.json" + result_file.write_text(json.dumps(replay_result)) + + env = env.copy() + env["RESULT_FILENAME"] = result_filename + + return subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env=env, + capture_output=True, + text=True, + ) + + +def assert_traceability_fields( + output_data: dict, result_filename: str, dispatch_ref: str = "refs/heads/test-isb1-traceability" +): + assert output_data["result_filename"] == result_filename + assert output_data["artifact_stems"] == { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + } + assert output_data["dispatch_ref"] == dispatch_ref + + +def test_isb1_replay_processing(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "bundle_id": "bundle-core-chat", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + + result = run_script(tmp_path, env, sample_replay_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + assert output_data["benchmark_type"] == "isb1_replay" + assert output_data["request_mode"] == "multi-turn" + assert output_data["harness_request_mode"] == "auto" + assert output_data["isl"] == 8192 + assert output_data["osl"] == 1024 + assert output_data["export_lane"] == "core" + assert output_data["benchmark_surface"] == "chat" + assert 
output_data["support_status"] == "supported" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["effective_max_context_depth"] == 8192 + 1024 + 200 + assert output_data["context_pressure_class"] == "standard" + assert output_data["context_pressure_signal"]["status"] == "not_applicable" + assert output_data["context_pressure_suspicious"] is False + assert output_data["completed_sessions"] == 2 + assert output_data["session_throughput_sps"] == pytest.approx(1.0) + assert output_data["tput_per_gpu"] == pytest.approx(650.0 / 8) + assert output_data["output_tput_per_gpu"] == pytest.approx(150.0 / 8) + assert output_data["input_tput_per_gpu"] == pytest.approx((650.0 - 150.0) / 8) + assert output_data["median_ttft"] == pytest.approx(0.18) + assert output_data["median_intvty"] == pytest.approx(40.0) + assert output_data["median_e2el"] == pytest.approx(1.1) + assert output_data["kv_offload_observed"] is True + assert output_data["peak_gpu_cache_usage"] == pytest.approx(0.78) + assert output_data["peak_cpu_cache_usage"] == pytest.approx(0.31) + assert output_data["selection"]["request_mode_mix"] == {"chat": 2} + assert output_data["selection"]["support_status_counts"] == {"supported": 2} + assert output_data["per_turn_metrics"]["turn_1"]["completed"] == 2 + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": None, + "sglang_chunked_prefill_override": None, + } + assert_traceability_fields(output_data, "isb1_result") + + output_file = tmp_path / "agg_isb1_result.json" + assert output_file.exists() + persisted_output = json.loads(output_file.read_text()) + assert_traceability_fields(persisted_output, "isb1_result") + + +def test_offload_mode_env_propagation(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["OFFLOAD_MODE"] = "noprefix" + env["KV_CACHE_DTYPE"] = "fp8" + env["DISABLE_PREFIX_CACHING"] = "true" + + result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_offload_env") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["offload_mode"] == "noprefix" + assert output_data["kv_cache_dtype"] == "fp8" + assert output_data["disable_prefix_caching"] is True + + +def test_support_status_mismatch_fails(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + replay_result = { + **sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "support_statuses": ["supported"], + "support_status_counts": {"supported": 2}, + }, + } + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["SUPPORT_STATUS"] = "reviewed_preview" + + result = 
run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch") + assert result.returncode != 0 + assert "support-status mismatch" in result.stderr + + +def test_certification_status_mismatch_fails(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + } + ], + }, + ) + replay_result = { + **sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "benchmark_certification_statuses": ["pending_review"], + "benchmark_certification_status_counts": {"pending_review": 2}, + }, + } + env = base_env.copy() + env["EXPORT_FILE"] = export_file + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_cert_mismatch") + assert result.returncode != 0 + assert "benchmark-certification mismatch" in result.stderr + + +def test_missing_required_env_vars_fails(tmp_path, sample_replay_result): + result_file = tmp_path / "isb1_result.json" + result_file.write_text(json.dumps(sample_replay_result)) + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env={"PATH": "/usr/bin", "RESULT_FILENAME": "isb1_result"}, + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + +def test_dispatch_ref_prefers_explicit_override(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "bundle_id": "bundle-core-chat", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["DISPATCH_REF"] = "refs/tags/isb1-dispatch-override" + + result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_dispatch_override") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert_traceability_fields( + output_data, + "isb1_dispatch_override", + dispatch_ref="refs/tags/isb1-dispatch-override", + ) + + +def test_preview_offload_core_processing(tmp_path, sample_replay_result, base_env): + preview_export = ( + write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/offload_core/" + "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json", + { + "adapter_id": "inferencex_multiturn", + "profile_id": "chat_hopper_blackwell_offload_core_v1", + "duration_tier": "smoke", + "adapter_surface": "chat", + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "lc1_8k_16k", + }, + { + "context_band": "lc3_96k_128k", + }, + ], + "producer_handoff_metadata": { + "class": "phase_2_offload_core_preview", + "claim_boundary": "Not blanket certification.", + }, + }, + ) + ) + + env = base_env.copy() + env["EXPORT_FILE"] = preview_export + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "131272" + replay_result = { + 
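+ # Rebuild the selection as reviewed_preview so it matches the + # SUPPORT_STATUS declared in the environment above; a mismatch would make + # the processor fail fast.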
**sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 2}, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_preview") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/offload_core" + assert output_data["benchmark_surface"] == "chat" + assert output_data["profile_id"] == "chat_hopper_blackwell_offload_core_v1" + assert output_data["duration_tier"] == "smoke" + assert output_data["context_bands"] == ["lc1_8k_16k", "lc3_96k_128k"] + assert output_data["producer_handoff_class"] == "phase_2_offload_core_preview" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["isl"] == 0 + assert output_data["osl"] == 0 + assert_traceability_fields(output_data, "isb1_preview") + + +def test_qwen_500k_preview_processing_preserves_served_shape_and_context_band( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "producer_handoff_metadata": { + "class": "bounded_500k_class", + "claim_boundary": "Replay-derived 500k preview only.", + }, + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h100_sxm_80gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + "VLLM_CPU_OFFLOAD_GB": "120", + "VLLM_SWAP_SPACE_GB": "24", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "vllm_cpu_offload_gb": "128", + "vllm_swap_space_gb": "32", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": 
["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 3}, + "request_mode_mix": {"code": 3}, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/long_context_500k" + assert output_data["benchmark_surface"] == "code" + assert output_data["profile_id"] == "coding_qwen3.5_xlc2_500k_preview_v1" + assert output_data["context_bands"] == ["xlc2_384k_512k"] + assert output_data["producer_handoff_class"] == "bounded_500k_class" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["isl"] == 131072 + assert output_data["osl"] == 1024 + assert output_data["max_model_len"] == 524288 + assert output_data["effective_max_context_depth"] == 524288 + assert output_data["context_pressure_class"] == "extended_500k" + assert output_data["context_pressure_signal"]["status"] == "ok" + assert output_data["context_pressure_suspicious"] is False + assert output_data["kv_offload_observed"] is True + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": "128", + "vllm_swap_space_gb": "32", + "sglang_mem_fraction_override": None, + "sglang_chunked_prefill_override": None, + } + assert_traceability_fields(output_data, "isb1_qwen_500k") + + +def test_qwen_1m_preview_processing_preserves_8k_served_shape_and_offload_metadata( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_1m_vllm_code_ulc2_qwen3_5", + "profile_id": "coding_qwen3.5_ulc2_1m_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "producer_handoff_metadata": { + "class": "bounded_1m_class", + "claim_boundary": "Manual 1M preview only.", + }, + "exports": [ + { + "context_band": "ulc2_1m_plus", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "1048576", + "MAX_SESSIONS": "1", + "MAX_TURNS_PER_SESSION": "3", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + 
} + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/long_context_1m" + assert output_data["benchmark_surface"] == "code" + assert output_data["profile_id"] == "coding_qwen3.5_ulc2_1m_preview_v1" + assert output_data["context_bands"] == ["ulc2_1m_plus"] + assert output_data["producer_handoff_class"] == "bounded_1m_class" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["isl"] == 8192 + assert output_data["osl"] == 1024 + assert output_data["max_model_len"] == 1048576 + assert output_data["effective_max_context_depth"] == 1048576 + assert output_data["context_pressure_class"] == "extended_1m" + assert output_data["context_pressure_signal"]["status"] == "ok" + assert output_data["context_pressure_suspicious"] is False + assert output_data["max_sessions"] == 1 + assert output_data["max_turns_per_session"] == 3 + assert output_data["kv_offload_observed"] is True + assert_traceability_fields(output_data, "isb1_qwen_1m") + + +def test_context_pressure_warning_on_high_context_without_cpu_cache( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.45, + "cache_hit_rate_avg": 0.15, + "gpu_cache_usage_avg": 0.45, + "gpu_cache_usage_peak": 0.91, + "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", + "cpu_cache_usage_avg": 0.0, + "cpu_cache_usage_peak": 0.0, + "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": False, + 
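+ # cpu_cache_metric_available=True yet zero CPU usage at 500k depth is the + # contradiction that should trip the "suspicious" signal: either the server + # capped context or offload never engaged.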
"samples": 5, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_warn") + assert result.returncode == 0, f"Script failed: {result.stderr}" + assert "saw no CPU cache usage" in result.stderr + + output_data = json.loads(result.stdout) + assert output_data["context_pressure_signal"]["status"] == "suspicious" + assert output_data["context_pressure_suspicious"] is True + assert_traceability_fields(output_data, "isb1_qwen_500k_warn") + + +def test_context_pressure_signal_marks_sglang_observability_gap( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_sglang_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:sglang", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "sglang", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "lmsysorg/sglang:v0.5.9-cu130", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:sglang", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + "SGLANG_MEM_FRACTION_OVERRIDE": "0.77", + "SGLANG_CHUNKED_PREFILL_OVERRIDE": "65536", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:sglang"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.52, + "cache_hit_rate_avg": 0.23, + "gpu_cache_usage_avg": 0.52, + "gpu_cache_usage_peak": 0.88, + "gpu_cache_metric_name": "sglang:token_usage", + "cpu_cache_usage_avg": 0.0, + "cpu_cache_usage_peak": 0.0, + "cpu_cache_metric_name": None, + "cpu_cache_metric_available": False, + "observability_status": "indirect_without_cpu_cache_metric", + "kv_offload_observed": False, + "samples": 5, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_sglang") + assert result.returncode == 0, f"Script failed: {result.stderr}" + assert "lacks a direct CPU cache metric" in result.stderr + + output_data = json.loads(result.stdout) + assert output_data["context_pressure_signal"]["status"] == "observability_gap" + assert output_data["context_pressure_signal"]["requires_log_review"] is True + assert output_data["context_pressure_suspicious"] is False + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": "0.77", + 
"sglang_chunked_prefill_override": "65536", + } + assert_traceability_fields(output_data, "isb1_qwen_500k_sglang") + + +def test_depth_coverage_ratio_for_500k_preview(tmp_path, base_env, sample_replay_result): + """Verify depth coverage ratio and class for a 500k preview with 131k actual tokens.""" + export_payload = { + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "surface": "code", + "exports": [ + { + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "xlc2_384k_512k", + "trace_metadata": { + "estimated_kv_bytes_peak": 27294647296, + "context_pressure_profile": { + "expected_offload_mode": "soft_offload", + }, + "expected_offload_mode": "soft_offload", + }, + } + ], + } + export_file = write_export_fixture( + tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_500k.json", export_payload + ) + + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["MODEL_PREFIX"] = "qwen3.5" + env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b" + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "524288" + env["FRAMEWORK"] = "vllm" + + replay_result = sample_replay_result.copy() + replay_result["selection"] = { + **replay_result["selection"], + "support_statuses": ["reviewed_preview"], + } + replay_result["server_metrics_summary"] = { + "gpu_cache_usage_avg": 0.35, + "gpu_cache_usage_peak": 0.42, + "cpu_cache_usage_avg": 0.15, + "cpu_cache_usage_peak": 0.25, + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": True, + "samples": 10, + } + replay_result["depth_telemetry"] = { + "total_estimated_input_tokens": 500000, + "total_actual_input_tokens": 131072, + "max_actual_context_len_per_turn": 131072, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_depth") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Depth coverage ratio: 131072 / 524288 ≈ 0.25 + assert output_data["depth_coverage_ratio"] is not None + assert 0.24 < output_data["depth_coverage_ratio"] < 0.26 + assert output_data["depth_coverage_class"] == "bounded_preview" + assert output_data["max_actual_context_len_per_turn"] == 131072 + assert output_data["depth_gap_tokens"] == 524288 - 131072 + + # Producer expectation validation + assert output_data["producer_estimated_kv_bytes_peak"] == 27294647296 + assert output_data["producer_expected_offload_mode"] == "soft_offload" + assert output_data["producer_expectation_validation"]["offload_mode_match"] is True + assert output_data["producer_expectation_validation"]["depth_exercised"] is False + + # Preemption count + assert output_data["preemption_count"] == 0 + + +def test_depth_mismatch_warning_for_configuration_only(tmp_path, base_env, sample_replay_result): + """Verify depth_mismatch status when actual context is <10% of configured.""" + export_payload = { + "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, + "surface": "code", + "exports": [ + { + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "ulc2_1m_plus", + "trace_metadata": { + "estimated_kv_bytes_peak": 
+                    "expected_offload_mode": "hard_offload",
+                },
+            }
+        ],
+    }
+    export_file = write_export_fixture(
+        tmp_path, "datasets/isb1/exports/preview/long_context_1m/test_1m.json", export_payload
+    )
+
+    env = base_env.copy()
+    env["EXPORT_FILE"] = export_file
+    env["MODEL_PREFIX"] = "qwen3.5"
+    env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b"
+    env["SUPPORT_STATUS"] = "reviewed_preview"
+    env["MAX_MODEL_LEN"] = "1048576"
+    env["FRAMEWORK"] = "vllm"
+
+    replay_result = sample_replay_result.copy()
+    replay_result["selection"] = {
+        **replay_result["selection"],
+        "support_statuses": ["reviewed_preview"],
+    }
+    replay_result["server_metrics_summary"] = {
+        "gpu_cache_usage_avg": 0.10,
+        "gpu_cache_usage_peak": 0.15,
+        "cpu_cache_usage_avg": 0.05,
+        "cpu_cache_usage_peak": 0.10,
+        "cpu_cache_metric_available": True,
+        "observability_status": "direct_cpu_cache_metric",
+        "kv_offload_observed": True,
+        "samples": 5,
+    }
+    # 1M preview sends only 8k actual tokens
+    replay_result["depth_telemetry"] = {
+        "total_estimated_input_tokens": 1600000,
+        "total_actual_input_tokens": 8192,
+        "max_actual_context_len_per_turn": 8192,
+    }
+
+    result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m_depth")
+    assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+    output_data = json.loads(result.stdout)
+
+    # 8192 / 1048576 ≈ 0.0078, well below the 0.1 "configuration_only" threshold
+    assert output_data["depth_coverage_ratio"] < 0.01
+    assert output_data["depth_coverage_class"] == "configuration_only"
+    assert output_data["context_pressure_signal"]["status"] == "depth_mismatch"
+    assert output_data["context_pressure_signal"]["reason"] == "configured_depth_not_exercised"
+    assert "depth_coverage_ratio" in output_data["context_pressure_signal"]
+    assert "configured for" in result.stderr
+
+
+def test_producer_expectation_offload_mismatch(tmp_path, base_env, sample_replay_result):
+    """Verify producer expectation validation when offload is expected but not observed."""
+    export_payload = {
+        "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024},
+        "surface": "code",
+        "exports": [
+            {
+                "runtime_stack_id": "standalone:vllm",
+                "hardware_profile_id": "h200-8gpu",
+                "canonical_model_id": "gpt_oss_120b",
+                "support_status": "reviewed_preview",
+                "benchmark_certification_status": "dataset_replay_verified",
+                "context_band": "xlc2_384k_512k",
+                "trace_metadata": {
+                    "estimated_kv_bytes_peak": 27000000000,
+                    "context_pressure_profile": {
+                        "expected_offload_mode": "hard_offload",
+                    },
+                },
+            }
+        ],
+    }
+    export_file = write_export_fixture(
+        tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_mismatch.json", export_payload
+    )
+
+    env = base_env.copy()
+    env["EXPORT_FILE"] = export_file
+    env["MODEL_PREFIX"] = "gptoss"
+    env["CANONICAL_MODEL_ID"] = "gpt_oss_120b"
+    env["SUPPORT_STATUS"] = "reviewed_preview"
+    env["MAX_MODEL_LEN"] = "524288"
+
+    replay_result = sample_replay_result.copy()
+    replay_result["selection"] = {
+        **replay_result["selection"],
+        "support_statuses": ["reviewed_preview"],
+    }
+    replay_result["server_metrics_summary"] = {
+        "gpu_cache_usage_avg": 0.50,
+        "gpu_cache_usage_peak": 0.60,
+        "cpu_cache_usage_avg": 0.0,
+        "cpu_cache_usage_peak": 0.0,
+        "cpu_cache_metric_available": True,
+        "observability_status": "direct_cpu_cache_metric",
+        "kv_offload_observed": False,
+        "samples": 10,
+    }
+    replay_result["depth_telemetry"] = {
+        "total_estimated_input_tokens": 400000,
+        "total_actual_input_tokens": 131072,
+        "max_actual_context_len_per_turn": 131072,
+    }
+
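+    # The export above declares expected_offload_mode "hard_offload", while the
+    # replayed metrics report kv_offload_observed False with zero CPU cache usage,
+    # so the processor should flag the producer expectation as unmet below.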
+    result = run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch")
+    assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+    output_data = json.loads(result.stdout)
+
+    # Producer expected hard_offload, but kv_offload_observed is False
+    assert output_data["producer_expectation_validation"]["offload_mode_match"] is False
+    assert output_data["producer_expected_offload_mode"] == "hard_offload"
+    assert output_data["kv_offload_observed"] is False
diff --git a/utils/test_summarize_isb1.py b/utils/test_summarize_isb1.py
new file mode 100644
index 000000000..3f4320594
--- /dev/null
+++ b/utils/test_summarize_isb1.py
@@ -0,0 +1,105 @@
+import json
+from pathlib import Path
+
+from summarize_isb1 import generate_summary
+
+
+def write_result(path: Path, payload: dict) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload))
+
+
+def make_row(**overrides):
+    row = {
+        "benchmark_type": "isb1_replay",
+        "result_filename": "isb1_control_vllm_b200",
+        "artifact_stems": {
+            "processed": "isb1_isb1_control_vllm_b200",
+            "raw_replay": "replay_isb1_control_vllm_b200",
+            "server_logs": "server_logs_isb1_control_vllm_b200",
+            "gpu_metrics": "gpu_metrics_isb1_control_vllm_b200",
+        },
+        "dispatch_ref": "refs/heads/test-summary",
+        "infmax_model_prefix": "dsr1",
+        "hw": "b200-cw-1",
+        "framework": "vllm",
+        "support_status": "supported",
+        "benchmark_certification_status": "dataset_replay_verified",
+        "effective_max_context_depth": 9416,
+        "context_pressure_class": "standard",
+        "context_pressure_signal": {
+            "status": "not_applicable",
+            "requires_log_review": False,
+        },
+        "context_pressure_suspicious": False,
+        "completed_sessions": 2,
+        "total_sessions": 2,
+        "session_throughput_sps": 1.25,
+        "median_ttft": 0.18,
+        "kv_offload_observed": True,
+        "peak_gpu_cache_usage": 0.78,
+        "peak_cpu_cache_usage": 0.31,
+        "runtime_overrides": {
+            "vllm_cpu_offload_gb": None,
+            "vllm_swap_space_gb": None,
+            "sglang_mem_fraction_override": None,
+            "sglang_chunked_prefill_override": None,
+        },
+    }
+    row.update(overrides)
+    return row
+
+
+def test_generate_summary_surfaces_lane_override_and_action_sections(tmp_path):
+    control_row = make_row()
+    review_row = make_row(
+        result_filename="isb1_qwen_500k_sglang",
+        artifact_stems={
+            "processed": "isb1_isb1_qwen_500k_sglang",
+            "raw_replay": "replay_isb1_qwen_500k_sglang",
+            "server_logs": "server_logs_isb1_qwen_500k_sglang",
+            "gpu_metrics": "gpu_metrics_isb1_qwen_500k_sglang",
+        },
+        infmax_model_prefix="qwen3.5",
+        hw="h200-cw-1",
+        framework="sglang",
+        support_status="reviewed_preview",
+        effective_max_context_depth=524288,
+        context_pressure_class="extended_500k",
+        context_pressure_signal={
+            "status": "observability_gap",
+            "requires_log_review": True,
+        },
+        runtime_overrides={
+            "vllm_cpu_offload_gb": None,
+            "vllm_swap_space_gb": None,
+            "sglang_mem_fraction_override": "0.77",
+            "sglang_chunked_prefill_override": "65536",
+        },
+        kv_offload_observed=False,
+        peak_gpu_cache_usage=0.88,
+        peak_cpu_cache_usage=0.0,
+    )
+    non_isb1_row = {"benchmark_type": "throughput", "ignored": True}
+
+    write_result(tmp_path / "results" / "control.json", control_row)
+    write_result(tmp_path / "results" / "review.json", review_row)
+    write_result(tmp_path / "results" / "non_isb1.json", non_isb1_row)
+
+    summary = generate_summary(tmp_path / "results")
+
+    assert "## ISB1 Operator Summary" in summary
+    assert "### Lane Summary" in summary
+    assert "### Runtime Overrides" in summary
+    assert "### Action Items" in summary
+    assert "isb1_qwen_500k_sglang" in summary
+    assert "observability_gap" in summary
+    assert "65536" in summary
+    assert "server_logs_isb1_qwen_500k_sglang" in summary
+    assert "non_isb1" not in summary
+
+
+def test_generate_summary_handles_empty_results(tmp_path):
+    summary = generate_summary(tmp_path / "results")
+    assert "No ISB1 replay rows found." in summary
+    assert "Lane Summary" not in summary
diff --git a/utils/test_verify_producer_sync.py b/utils/test_verify_producer_sync.py
new file mode 100644
index 000000000..ba42c8586
--- /dev/null
+++ b/utils/test_verify_producer_sync.py
@@ -0,0 +1,64 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+SCRIPT_PATH = Path(__file__).parent / "verify_producer_sync.py"
+
+
+RELEVANT_FILES = {
+    "extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "e131k"},
+    "preview/long_context_500k/manifest_qwen3.5.json": {"name": "500k"},
+    "preview/long_context_1m/manifest.json": {"name": "1m"},
+}
+
+
+def _write_tree(root: Path, files: dict[str, dict]) -> None:
+    for relative_path, payload in files.items():
+        file_path = root / relative_path
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        file_path.write_text(json.dumps(payload, sort_keys=True))
+
+
+def _run_verify(producer_root: Path, consumer_root: Path) -> subprocess.CompletedProcess[str]:
+    return subprocess.run(
+        [
+            sys.executable,
+            str(SCRIPT_PATH),
+            "--producer-root",
+            str(producer_root),
+            "--consumer-root",
+            str(consumer_root),
+        ],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
+
+def test_verify_producer_sync_passes_for_identical_trees(tmp_path: Path) -> None:
+    producer_root = tmp_path / "producer"
+    consumer_root = tmp_path / "consumer"
+    _write_tree(producer_root, RELEVANT_FILES)
+    _write_tree(consumer_root, RELEVANT_FILES)
+
+    result = _run_verify(producer_root, consumer_root)
+
+    assert result.returncode == 0
+    assert "sync check passed" in result.stdout
+
+
+def test_verify_producer_sync_fails_on_content_mismatch(tmp_path: Path) -> None:
+    producer_root = tmp_path / "producer"
+    consumer_root = tmp_path / "consumer"
+    _write_tree(producer_root, RELEVANT_FILES)
+    _write_tree(consumer_root, RELEVANT_FILES)
+
+    mismatched_path = consumer_root / "preview/long_context_500k/manifest_qwen3.5.json"
+    mismatched_path.write_text(json.dumps({"name": "changed"}, sort_keys=True))
+
+    result = _run_verify(producer_root, consumer_root)
+
+    assert result.returncode == 1
+    assert "content_mismatch" in result.stderr
+    assert "preview/long_context_500k/manifest_qwen3.5.json" in result.stderr
diff --git a/utils/verify_producer_sync.py b/utils/verify_producer_sync.py
new file mode 100644
index 000000000..48cdac077
--- /dev/null
+++ b/utils/verify_producer_sync.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""Verify producer/consumer sync for ISB1 preview and extension exports."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+RELEVANT_SUBTREES = (
+    "extension_131k",
+    "preview/long_context_500k",
+    "preview/long_context_1m",
+)
+
+
+@dataclass
+class SyncIssue:
+    kind: str
+    path: str
+
+
+def _json_files(root: Path) -> set[str]:
+    if not root.exists():
+        return set()
+    return {
+        str(path.relative_to(root))
+        for path in root.rglob("*.json")
+        if path.is_file()
+    }
+
+
+def _compare_subtree(producer_root: Path, consumer_root: Path, subtree: str) -> list[SyncIssue]:
+    issues: list[SyncIssue] = []
+
+    producer_subtree = producer_root / subtree
+    consumer_subtree = consumer_root / subtree
+
+    # Bail out on missing subtrees before walking either tree for JSON files.
+    if not producer_subtree.exists():
+        issues.append(SyncIssue("missing_producer_subtree", subtree))
+        return issues
+    if not consumer_subtree.exists():
+        issues.append(SyncIssue("missing_consumer_subtree", subtree))
+        return issues
+
+    producer_files = _json_files(producer_subtree)
+    consumer_files = _json_files(consumer_subtree)
+
+    for relative_path in sorted(producer_files - consumer_files):
+        issues.append(SyncIssue("missing_in_consumer", f"{subtree}/{relative_path}"))
+
+    for relative_path in sorted(consumer_files - producer_files):
+        issues.append(SyncIssue("extra_in_consumer", f"{subtree}/{relative_path}"))
+
+    for relative_path in sorted(producer_files & consumer_files):
+        producer_file = producer_subtree / relative_path
+        consumer_file = consumer_subtree / relative_path
+        if producer_file.read_bytes() != consumer_file.read_bytes():
+            issues.append(SyncIssue("content_mismatch", f"{subtree}/{relative_path}"))
+
+    return issues
+
+
+def verify_sync(producer_root: Path, consumer_root: Path) -> list[SyncIssue]:
+    issues: list[SyncIssue] = []
+    for subtree in RELEVANT_SUBTREES:
+        issues.extend(_compare_subtree(producer_root, consumer_root, subtree))
+    return issues
+
+
+def _default_consumer_root() -> Path:
+    return Path(__file__).resolve().parents[1] / "datasets" / "isb1" / "exports"
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Verify that committed ISB1 consumer preview/extension exports are "
+            "synced with producer exports."
+        )
+    )
+    parser.add_argument(
+        "--producer-root",
+        required=True,
+        type=Path,
+        help="Path to ISB1 producer exports root (…/upstream/inferencex/exports)",
+    )
+    parser.add_argument(
+        "--consumer-root",
+        default=_default_consumer_root(),
+        type=Path,
+        help="Path to InferenceX consumer exports root (default: datasets/isb1/exports)",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = parse_args(argv)
+    issues = verify_sync(args.producer_root.resolve(), args.consumer_root.resolve())
+
+    if not issues:
+        print(
+            "Producer/consumer export sync check passed for: "
+            + ", ".join(RELEVANT_SUBTREES)
+        )
+        return 0
+
+    print("Producer/consumer export sync check failed:", file=sys.stderr)
+    for issue in issues:
+        print(f"- {issue.kind}: {issue.path}", file=sys.stderr)
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
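+
+# Example invocation, assuming a producer checkout sits next to this repo
+# (the producer path below is illustrative; --consumer-root falls back to
+# datasets/isb1/exports when omitted):
+#
+#   python utils/verify_producer_sync.py \
+#       --producer-root ../inferencex/exports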