diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 000000000..476a21b1c
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+datasets/isb1/exports/preview/long_context_1m/*.json filter=lfs diff=lfs merge=lfs -text
+datasets/isb1/exports/**/*.json filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/configs/isb1-kv-stress-pr993.yaml b/.github/configs/isb1-kv-stress-pr993.yaml
new file mode 100644
index 000000000..544ecd9dd
--- /dev/null
+++ b/.github/configs/isb1-kv-stress-pr993.yaml
@@ -0,0 +1,3589 @@
+dsr1-fp4-b200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id001
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id001
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-b200-dynamo-sglang-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id002
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id002
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-b200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id003
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id003
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id004
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id004
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:sglang
+dsr1-fp4-b200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id005
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id005
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp4-b200-trt-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id006
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id006
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-FP4-V2
+  model-prefix: dsr1
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp4-b300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b300_sxm_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id007
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id007
+    workload-type: code
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  precision: fp4
+  runner: b300
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-gb200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: lmsysorg/sglang:v0.5.8-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id008
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id008
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-gb200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id009
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id009
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-gb300-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id010
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id010
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb300
+  runtime-stack-id: dynamo:sglang
+dsr1-fp4-gb300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id011
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id011
+    workload-type: code
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  precision: fp4
+  runner: gb300
+  runtime-stack-id: dynamo:trt
+dsr1-fp4-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id012
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id012
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4-Preview
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp4-mi355x-atom-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.0-ubuntu24.04-pytorch2.9-atom0.1.1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id013
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id013
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp4-mi355x-sglang-disagg-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id014
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id014
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp4-mi355x-sglang-disagg-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id015
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id015
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp4-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id016
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id016
+    workload-type: code
+  model: amd/DeepSeek-R1-0528-MXFP4-Preview
+  model-prefix: dsr1
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-b200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id017
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id017
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-b200-dynamo-sglang-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-amd64
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id018
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id018
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-b200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id019
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id019
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id020
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id020
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-b200-sglang-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id021
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id021
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-b200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id022
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id022
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-b200-trt-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6.post3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id023
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id023
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-b300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:b300_sxm_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id024
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id024
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: b300
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-gb200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id025
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id025
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb200
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-gb200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id026
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id026
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb200
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-gb300-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id027
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id027
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb300
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-gb300-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb300_grace_blackwell_288gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id028
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id028
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: gb300
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-h100-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: lmsysorg/sglang:v0.5.8-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id029
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id029
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h100-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-h100-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post3
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id030
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id030
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h100-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-h200-dynamo-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-sglang
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: lmsysorg/sglang:v0.5.8.post1-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id031
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id031
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200-multinode
+  runtime-stack-id: dynamo:sglang
+dsr1-fp8-h200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id032
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id032
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200-multinode
+  runtime-stack-id: dynamo:trt
+dsr1-fp8-h200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: lmsysorg/sglang:v0.5.9-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id033
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id033
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-h200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id034
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id034
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-h200-trt-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id035
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id035
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:trt
+dsr1-fp8-mi300x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi300x_192gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id036
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id036
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi300x
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-mi325x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi325x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id037
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id037
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi325x
+  runtime-stack-id: standalone:sglang
+dsr1-fp8-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id038
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id038
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp8-mi355x-atom-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id039
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id039
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+dsr1-fp8-mi355x-sglang-disagg-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id040
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id040
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp8-mi355x-sglang-disagg-mtp-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang-disagg
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id041
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id041
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x-disagg
+  runtime-stack-id: standalone:sglang-disagg
+dsr1-fp8-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: deepseek_r1_0528
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm700-mi35x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id042
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id042
+    workload-type: code
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+glm5-fp4-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id043
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id043
+    workload-type: code
+  model: nvidia/GLM-5-NVFP4
+  model-prefix: glm5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:sglang
+glm5-fp8-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-cu13-20260317-1eea7448
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id044
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id044
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+glm5-fp8-h200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: lmsysorg/sglang:glm5-hopper
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id045
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id045
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:sglang
+glm5-fp8-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2.post
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id046
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id046
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+glm5-fp8-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: glm_5
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260219
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id047
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id047
+    workload-type: code
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+gptoss-fp4-b200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: trt
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc2.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id048
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id048
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:trt
+gptoss-fp4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.15.1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id049
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id049
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-gb200-dynamo-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: dynamo-trt
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.7.0.post2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id050
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id050
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:trt
+gptoss-fp4-h100-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id051
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id051
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: h100
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-h200-trt-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: trt
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc11
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id052
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id052
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: h200
+  runtime-stack-id: standalone:trt
+gptoss-fp4-h200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id053
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id053
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: h200
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-mi300x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: amd:mi300x_192gb
+  image: vllm/vllm-openai-rocm:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id054
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id054
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi300x
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-mi325x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: amd:mi325x_288gb
+  image: vllm/vllm-openai-rocm:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id055
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id055
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi325x
+  runtime-stack-id: standalone:vllm
+gptoss-fp4-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id056
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id056
+    workload-type: code
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+gptoss-fp4-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: gpt_oss_120b
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id057
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id057
+    workload-type: code
+  model: amd/gpt-oss-120b-w-mxfp4-a-fp8
+  model-prefix: gptoss
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+kimik2.5-fp4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.17.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id058
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id058
+    workload-type: code
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+kimik2.5-fp4-gb200-dynamo-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: dynamo-vllm
+  hardware-profile-id: nvidia:gb200_grace_blackwell_192gb
+  image: vllm/vllm-openai:v0.18.0-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id059
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id059
+    workload-type: code
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: gb200
+  runtime-stack-id: dynamo:vllm
+kimik2.5-fp4-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id060
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id060
+    workload-type: code
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+kimik2.5-fp4-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id061
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id061
+    workload-type: code
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.15.1
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id062
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id062
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-h200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: vllm/vllm-openai:v0.16.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id063
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id063
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: h200
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-mi300x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi300x_192gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id064
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id064
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: mi300x
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-mi325x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi325x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id065
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id065
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: mi325x
+  runtime-stack-id: standalone:vllm
+kimik2.5-int4-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: kimi_k2_5
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id066
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id066
+    workload-type: code
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  precision: int4
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp4-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.19.0-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id067
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id067
+    workload-type: code
+  model: nvidia/MiniMax-M2.5-NVFP4
+  model-prefix: minimaxm2.5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-b200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: vllm/vllm-openai:v0.19.0-cu130
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id068
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id068
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-h100-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id069
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id069
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: h100
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-h200-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: nvidia:h200_sxm_141gb
+  image: vllm/vllm-openai:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id070
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id070
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: h200
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-mi300x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: amd:mi300x_192gb
+  image: vllm/vllm-openai-rocm:v0.16.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id071
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id071
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi300x
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-mi325x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: amd:mi325x_288gb
+  image: vllm/vllm-openai-rocm:v0.18.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id072
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id072
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi325x
+  runtime-stack-id: standalone:vllm
+minimaxm2.5-fp8-mi355x-atom-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: atom
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id073
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id073
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:atom
+minimaxm2.5-fp8-mi355x-vllm-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: minimax_m2_5
+  framework: vllm
+  hardware-profile-id: amd:mi355x_288gb
+  image: vllm/vllm-openai-rocm:v0.19.0
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id074
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: unsupported
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id074
+    workload-type: code
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  precision: fp8
+  runner: mi355x
+  runtime-stack-id: standalone:vllm
+qwen3.5-bf16-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id075
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id075
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: b200
+  runtime-stack-id: standalone:sglang
+qwen3.5-bf16-mi300x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi300x_192gb
+  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id076
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id076
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: mi300x
+  runtime-stack-id: standalone:sglang
+qwen3.5-bf16-mi325x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi325x_288gb
+  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id077
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id077
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: mi325x
+  runtime-stack-id: standalone:sglang
+qwen3.5-bf16-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id078
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id078
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B
+  model-prefix: qwen3.5
+  precision: bf16
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp4-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:nightly-dev-20260402-d7256eb6
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id079
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id079
+    workload-type: code
+  model: nvidia/Qwen3.5-397B-A17B-NVFP4
+  model-prefix: qwen3.5
+  precision: fp4
+  runner: b200
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp4-mi355x-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: amd:mi355x_288gb
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi35x
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id080
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+      - 512
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id080
+    workload-type: code
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  precision: fp4
+  runner: mi355x
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp8-b200-sglang-isb1-kv-stress:
+  benchmark-type: isb1_kv_stress
+  canonical-model-id: qwen3_5_397b_a17b
+  framework: sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  image: lmsysorg/sglang:v0.5.9-cu130-amd64
+  kv-cache-dtype: fp8
+  kv-stress-configs:
+  - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+    request-mode: multi-turn
+    search-space:
+    - duration-s: 1800
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      users: &id081
+      - 2
+      - 4
+      - 8
+      - 16
+      - 32
+      - 64
+      - 128
+      - 256
+    support-status: reviewed_preview
+    tp-configs:
+    - duration-s: 1800
+      ep: 1
+      offload-modes:
+      - 'on'
+      - 'off'
+      - noprefix
+      tp: 8
+      users: *id081
+    workload-type: code
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  precision: fp8
+  runner: b200
+  runtime-stack-id: standalone:sglang
+qwen3.5-fp8-b200-sglang-mtp-isb1-kv-stress:
benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:b200_sxm_180gb + image: lmsysorg/sglang:v0.5.9-cu130 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id082 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id082 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: b200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.9-cu129-amd64 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id083 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id083 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-h200-sglang-mtp-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: nvidia:h200_sxm_141gb + image: lmsysorg/sglang:v0.5.10.post1 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id084 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id084 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: h200 + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi300x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi300x_192gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id085 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id085 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi300x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi325x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi325x_288gb + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + kv-cache-dtype: fp8 + kv-stress-configs: + - 
export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id086 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id086 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi325x + runtime-stack-id: standalone:sglang +qwen3.5-fp8-mi355x-sglang-isb1-kv-stress: + benchmark-type: isb1_kv_stress + canonical-model-id: qwen3_5_397b_a17b + framework: sglang + hardware-profile-id: amd:mi355x_288gb + image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json + request-mode: multi-turn + search-space: + - duration-s: 1800 + offload-modes: + - 'on' + - 'off' + - noprefix + users: &id087 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + support-status: reviewed_preview + tp-configs: + - duration-s: 1800 + ep: 1 + offload-modes: + - 'on' + - 'off' + - noprefix + tp: 8 + users: *id087 + workload-type: code + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + runner: mi355x + runtime-stack-id: standalone:sglang diff --git a/.github/configs/isb1-kv-stress.yaml b/.github/configs/isb1-kv-stress.yaml new file mode 100644 index 000000000..9ee07ef5d --- /dev/null +++ b/.github/configs/isb1-kv-stress.yaml @@ -0,0 +1,96 @@ +# Dedicated ISB1 KV cache stress sweeps (CTO-approved schema). +# +# This file is intentionally separate from isb1-master.yaml and uses +# benchmark-type: isb1_kv_stress with kv-stress-configs. 
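+#
+# Shape note (illustration only; the authoritative expansion lives in the ISB1
+# workflow, not in this file): assuming the runner sweeps the cross-product of
+# users x offload-modes, a search-space entry such as
+#
+#   - users: [2, 4, 8, 16, 32, 64, 128, 256]
+#     offload-modes: ["on", "off", "noprefix"]
+#     duration-s: 1800
+#
+# would expand to 8 x 3 = 24 sweep points of 1800 s each per kv-stress config.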
+ +gptoss-fp4-h200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +gptoss-fp4-b200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +qwen3.5-fp8-h200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 + +qwen3.5-fp8-b200-isb1-kv-stress-vllm-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_kv_stress + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 131272 + kv-cache-dtype: fp8 + kv-stress-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + workload-type: code + search-space: + - users: [2, 4, 8, 16, 32, 64, 128, 256] + offload-modes: ["on", "off", "noprefix"] + duration-s: 1800 diff --git a/.github/configs/isb1-master.yaml b/.github/configs/isb1-master.yaml new file mode 100644 index 000000000..99c111967 --- /dev/null +++ b/.github/configs/isb1-master.yaml @@ -0,0 +1,1723 @@ +# PR2 packaged the core 8k1k replay bundles. +# PR4 adds truthful long-context extension replay lanes using only the materialized +# extension_32k / extension_64k / extension_131k code bundles. +# These extension lanes are served-shape replay artifacts derived from larger source +# workloads; they are not native 500k+/1M+ InferenceX served-lane claims. +# +# Core entries keep an explicit 8k1k max-model-len. Extension entries intentionally +# omit max-model-len so the ISB1 workflow derives the served-shape value from the +# export stem (32k1k / 64k1k / 131k1k) at execution time. 
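+# For example (the derivation itself belongs to the workflow; this is only the
+# stem reading): datasets/isb1/exports/extension_64k/sglang/code_64k1k.json has
+# stem 64k1k, i.e. a roughly 64k-input / 1k-output served shape, so the lane is
+# sized for that shape at run time, the same way the core entries pin the 8k1k
+# shape explicitly via max-model-len: 10240.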
+# +# Official replay-configs pin support-status: supported so the workflow only replays +# the supported subset of mixed-status export bundles. +# All currently runnable rows also resolve to +# benchmark_certification_status=dataset_replay_verified. +# Phase 2 adds truthful chat-extension widening plus bounded preview/offload +# lanes. Preview rows stay explicit via support-status: reviewed_preview and the +# dedicated preview export paths. The current replay closure covers dsr1, +# gptoss, and qwen3.5 across core 8k1k plus extension bands, with bounded +# 500k code preview for gptoss and qwen3.5 on standalone sglang/vllm across +# b200/h100/h200. + +dsr1-fp8-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-b200-isb1-vllm: + image: vllm/vllm-openai:v0.19.0-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + 
num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h100-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-b200-isb1-vllm: + # Keep the existing B200 GPT-OSS vLLM pin from the official throughput lane. 
+ image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptoss-fp4-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-b200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h100-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + 
support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h200-isb1-sglang: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/sglang/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/sglang/code_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-b200-isb1-vllm: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +qwen3.5-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1-fp8-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 
+ model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.19.0-cu130 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + 
support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +dsr1-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: 
multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h100-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: 
datasets/isb1/exports/extension_131k/sglang/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +gptoss-fp4-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + 
+gptoss-fp4-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + - export-file: datasets/isb1/exports/extension_131k/vllm/chat_131k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h100-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: 
datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h200-isb1-sglang-extension: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/sglang/code_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/sglang/chat_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/sglang/chat_64k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/sglang/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 
4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_32k/vllm/chat_32k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_64k/vllm/chat_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + - max-concurrency: 4 + +qwen3.5-fp8-b200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: 
qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h100-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-b200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h100-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +qwen3.5-fp8-h200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + 
runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-sglang-500k-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: 
nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-vllm-500k-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 524288 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-sglang-offload-core-preview-chat: + image: lmsysorg/sglang:v0.5.9-cu130 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: sglang + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: 
datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-b200-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.15.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h100-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + max-turns-per-session: 6 + num-warmup-sessions: 0 + +gptoss-fp4-h200-isb1-vllm-offload-core-preview-code: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 131272 + replay-configs: + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__smoke.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 2 + max-turns-per-session: 4 + num-warmup-sessions: 0 + - export-file: datasets/isb1/exports/preview/offload_core/inferencex_trace_replay__coding_hopper_blackwell_offload_core_v1__standard.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 4 + 
max-turns-per-session: 6 + num-warmup-sessions: 0 diff --git a/.github/configs/isb1-qwen-1m-preview.yaml b/.github/configs/isb1-qwen-1m-preview.yaml new file mode 100644 index 000000000..1de9c7339 --- /dev/null +++ b/.github/configs/isb1-qwen-1m-preview.yaml @@ -0,0 +1,53 @@ +# Manual-only gated Qwen 1M preview surface. +# The selected export cells remain support-status=reviewed_preview and +# benchmark_certification_status=dataset_replay_verified, but this file is +# intentionally separate from isb1-master.yaml so the lane stays out of the +# ordinary runnable support statement. +# +# Use only for explicit validation dispatches while KV-offload observability and +# correctness remain under review. Running this file does not imply native 1M +# served-lane support or KV-offload certification. + +qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code: + image: lmsysorg/sglang:v0.5.9-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: sglang + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:sglang + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 1048576 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 1 + max-turns-per-session: 3 + num-warmup-sessions: 0 + +qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code: + image: vllm/vllm-openai:v0.19.0-cu130 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + precision: fp8 + framework: vllm + runner: b200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:b200_sxm_180gb + canonical-model-id: qwen3_5_397b_a17b + max-model-len: 1048576 + replay-configs: + - export-file: datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 1 + max-sessions: 1 + max-turns-per-session: 3 + num-warmup-sessions: 0 diff --git a/.github/configs/isb1-triattn-preview.yaml b/.github/configs/isb1-triattn-preview.yaml new file mode 100644 index 000000000..629cb8fe9 --- /dev/null +++ b/.github/configs/isb1-triattn-preview.yaml @@ -0,0 +1,291 @@ +# TriAttention KV-compression preview lanes for ISB1 replay benchmarks. +# +# These entries deploy vLLM with the TriAttention plugin enabled for runtime +# KV-cache compression on H100/H200 Hopper-class GPUs. The plugin uses env +# vars TRIATTN_RUNTIME_KV_BUDGET and TRIATTN_RUNTIME_SPARSE_STATS_PATH, +# configured in the benchmark scripts. +# +# Key differences from baseline vLLM ISB1 entries: +# - model-prefix includes "triattn" suffix to route to dedicated scripts +# - Prefix caching disabled (incompatible with KV compression) +# - max-num-batched-tokens lowered to 1024 (prevents OOM from large prefills) +# - KV budget auto-detected: 2048 for code workloads, 12000 for chat workloads +# +# This file is intentionally separate from isb1-master.yaml — TriAttention +# preview lanes stay out of the ordinary runnable support statement. +# Use only for explicit validation dispatches. 
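+# +# For orientation only: a hypothetical launch wrapper (names and paths illustrative, not the committed benchmark scripts) would wire the plugin roughly as +# if [[ "$WORKLOAD_TYPE" == "code" ]]; then kv_budget=2048; else kv_budget=12000; fi +# export TRIATTN_RUNTIME_KV_BUDGET="$kv_budget" +# export TRIATTN_RUNTIME_SPARSE_STATS_PATH=/workspace/triattn_stats/stats.pt +# vllm serve "$MODEL" --no-enable-prefix-caching --max-num-batched-tokens 1024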
+# +# Prerequisites: +# - triattention pip package installed in the container (or installed at runtime) +# - Optional: pre-calibrated stats at /workspace/triattn_stats/_stats.pt + +# --------------------------------------------------------------------------- +# DeepSeek-R1 FP8 — H100/H200 with TriAttention — core 8k1k +# --------------------------------------------------------------------------- + +dsr1triattn-fp8-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +dsr1triattn-fp8-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +# --------------------------------------------------------------------------- +# DeepSeek-R1 FP8 — H100/H200 with TriAttention — long-context extensions +# --------------------------------------------------------------------------- + +dsr1triattn-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +dsr1triattn-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: deepseek_r1_0528 + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - 
export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +# --------------------------------------------------------------------------- +# Qwen 3.5 FP8 — H100/H200 with TriAttention — extension only +# (Qwen 3.5 is not present in core 8k1k exports; only extension 131k) +# --------------------------------------------------------------------------- + +qwen3.5triattn-fp8-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5triattn + precision: fp8 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + +qwen3.5triattn-fp8-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5triattn + precision: fp8 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: qwen3_5_397b_a17b + replay-configs: + - export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k_qwen3.5.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 2 + num-warmup-sessions: 1 + +# --------------------------------------------------------------------------- +# GPT-OSS-120B FP4 — H100/H200 with TriAttention — core 8k1k +# --------------------------------------------------------------------------- + +gptosstriattn-fp4-h100-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +gptosstriattn-fp4-h200-isb1-vllm: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + max-model-len: 10240 + replay-configs: + - export-file: datasets/isb1/exports/core/vllm/chat_8k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + - export-file: datasets/isb1/exports/core/vllm/code_8k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - max-concurrency: 8 + +# --------------------------------------------------------------------------- +# GPT-OSS-120B FP4 — H100/H200 with 
TriAttention — long-context extensions +# --------------------------------------------------------------------------- + +gptosstriattn-fp4-h100-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h100 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h100_sxm_80gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + +gptosstriattn-fp4-h200-isb1-vllm-extension: + image: vllm/vllm-openai:v0.18.0 + model: openai/gpt-oss-120b + model-prefix: gptosstriattn + precision: fp4 + framework: vllm + runner: h200 + benchmark-type: isb1_replay + runtime-stack-id: standalone:vllm + hardware-profile-id: nvidia:h200_sxm_141gb + canonical-model-id: gpt_oss_120b + replay-configs: + - export-file: datasets/isb1/exports/extension_32k/vllm/code_32k1k.json + request-mode: multi-turn + support-status: reviewed_preview + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 + - export-file: datasets/isb1/exports/extension_64k/vllm/code_64k1k.json + request-mode: multi-turn + support-status: supported + search-space: + - max-concurrency: 4 + num-warmup-sessions: 1 diff --git a/.github/workflows/benchmark-isb1-tmpl.yml b/.github/workflows/benchmark-isb1-tmpl.yml new file mode 100644 index 000000000..d152d2062 --- /dev/null +++ b/.github/workflows/benchmark-isb1-tmpl.yml @@ -0,0 +1,451 @@ +name: Template - Benchmark ISB1 +on: + workflow_call: + inputs: + runner: + required: true + type: string + image: + required: true + type: string + model: + required: true + type: string + model-prefix: + required: true + type: string + precision: + required: true + type: string + framework: + required: true + type: string + exp-name: + required: true + type: string + benchmark-type: + required: true + type: string + export-file: + required: true + type: string + runtime-stack-id: + required: true + type: string + hardware-profile-id: + required: true + type: string + canonical-model-id: + required: true + type: string + support-status: + required: false + type: string + default: '' + request-mode: + required: true + type: string + max-concurrency: + required: true + type: string + max-sessions: + required: false + type: string + default: '' + max-turns-per-session: + required: false + type: string + default: '' + max-output-len: + required: false + type: string + default: '' + num-warmup-sessions: + required: false + type: string + default: '0' + ignore-waits: + required: false + type: boolean + default: false + ignore-eos: + required: false + type: boolean + default: false + max-model-len: + required: false + type: string + default: '' + tp-override: + required: false + type: string + default: '' + ep-override: + required: false + type: string + default: '' + trace-source: + required: false + type: string + default: '' + offload-mode: + required: false + type: string + default: '' + kv-cache-dtype: + required: false + type: string + default: '' + disable-prefix-caching: + required: false + type: boolean + default: false + benchmark-duration-s: + required: false + type: string + 
default: '' + workload-type: + required: false + type: string + default: '' + vllm-cpu-offload-gb: + required: false + type: string + default: '' + vllm-swap-space-gb: + required: false + type: string + default: '' + sglang-mem-fraction-override: + required: false + type: string + default: '' + sglang-chunked-prefill-override: + required: false + type: string + default: '' + ref: + description: Git ref (branch/sha) to checkout + required: false + type: string + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + EXP_NAME: ${{ inputs.exp-name }} + MODEL: ${{ inputs.model }} + MODEL_PREFIX: ${{ inputs.model-prefix }} + IMAGE: ${{ inputs.image }} + FRAMEWORK: ${{ inputs.framework }} + PRECISION: ${{ inputs.precision }} + BENCHMARK_TYPE: ${{ inputs.benchmark-type }} + EXPORT_FILE: ${{ inputs.export-file }} + RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }} + HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }} + CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }} + SUPPORT_STATUS: ${{ inputs.support-status }} + REQUEST_MODE: ${{ inputs.request-mode }} + MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + MAX_SESSIONS: ${{ inputs.max-sessions }} + MAX_TURNS_PER_SESSION: ${{ inputs.max-turns-per-session }} + MAX_OUTPUT_LEN: ${{ inputs.max-output-len }} + NUM_WARMUP_SESSIONS: ${{ inputs.num-warmup-sessions }} + IGNORE_WAITS: ${{ inputs.ignore-waits }} + IGNORE_EOS: ${{ inputs.ignore-eos }} + OFFLOAD_MODE: ${{ inputs.offload-mode }} + KV_CACHE_DTYPE: ${{ inputs.kv-cache-dtype }} + DISABLE_PREFIX_CACHING: ${{ inputs.disable-prefix-caching }} + BENCHMARK_DURATION_S: ${{ inputs.benchmark-duration-s }} + WORKLOAD_TYPE: ${{ inputs.workload-type }} + VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} + VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} + SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} + SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} + TP_OVERRIDE: ${{ inputs.tp-override }} + EP_OVERRIDE: ${{ inputs.ep-override }} + TRACE_SOURCE: ${{ inputs.trace-source }} + PYTHONDONTWRITEBYTECODE: '1' + PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache + +permissions: + contents: read + +jobs: + benchmark: + runs-on: ${{ inputs.runner }} + timeout-minutes: 300 + name: "${{ inputs.exp-name }} ${{ inputs.precision }} ${{ inputs.runner }} ${{ inputs.framework }} | ${{ inputs.benchmark-type }} conc-${{ inputs.max-concurrency }}" + steps: + - name: Resource cleanup (pre-run) + run: &resource-cleanup | + if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done + fi + + if command -v squeue >/dev/null 2>&1; then + if [[ "${{ runner.name }}" == h100-* || "${{ runner.name }}" == h200-* || "${{ runner.name }}" == b200-* ]]; then + echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + scancel --name="${{ runner.name }}" || true + while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --name="${{ runner.name }}" + sleep 5 + done + else + echo "[Slurm] Cleaning up jobs for user: $USER ..." 
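+ # Non-dedicated runners fall back to per-user cleanup: cancel everything queued under the CI user (assumes the Slurm account on these hosts is used only by CI).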
+ scancel -u "$USER" || true + while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do + squeue -u "$USER" + sleep 5 + done + fi + fi + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + clean: false + + - name: Certify ISB1 export contract + env: + INPUT_EXPORT_FILE: ${{ inputs.export-file }} + INPUT_RUNTIME_STACK_ID: ${{ inputs.runtime-stack-id }} + INPUT_HARDWARE_PROFILE_ID: ${{ inputs.hardware-profile-id }} + INPUT_CANONICAL_MODEL_ID: ${{ inputs.canonical-model-id }} + INPUT_SUPPORT_STATUS: ${{ inputs.support-status }} + INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }} + run: | + python3 - <<'PY' + import json + import os + import re + from pathlib import Path + + export_path = Path(os.environ["INPUT_EXPORT_FILE"]) + if not export_path.exists(): + raise SystemExit(f"Missing ISB1 export file: {export_path}") + + payload = json.loads(export_path.read_text()) + exports = payload.get("exports") + if not isinstance(exports, list) or not exports: + raise SystemExit( + f"ISB1 export file must contain a non-empty 'exports' list: {export_path}" + ) + + support_status = os.environ.get("INPUT_SUPPORT_STATUS", "").strip() or None + explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip() + if not re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) and not explicit_max_model_len: + raise SystemExit( + "Mixed-shape ISB1 exports require explicit max-model-len in the workflow input. " + f"Missing for '{export_path}'." + ) + + identity_cells = [ + cell + for cell in exports + if cell.get("runtime_stack_id") == os.environ["INPUT_RUNTIME_STACK_ID"] + and cell.get("hardware_profile_id") == os.environ["INPUT_HARDWARE_PROFILE_ID"] + and cell.get("canonical_model_id") == os.environ["INPUT_CANONICAL_MODEL_ID"] + ] + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise SystemExit( + f"Ambiguous ISB1 support tier for {export_path}; identity spans {identity_statuses}. " + "Pin support-status explicitly." + ) + if not matching_cells: + raise SystemExit( + "No ISB1 export cell matches the requested workflow identity/tier for " + f"{export_path}. Available tiers for that identity: {identity_statuses or ['']}" + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise SystemExit( + "Selected ISB1 export cells must declare benchmark_certification_status. " + f"Missing for '{export_path}'." + ) + if certification_statuses != ["dataset_replay_verified"]: + raise SystemExit( + "Current InferenceX ISB1 consumer lanes only accept " + "benchmark_certification_status=dataset_replay_verified. " + f"Selected cells for '{export_path}' resolved to {certification_statuses}."
+ ) + + print( + "Certified ISB1 export contract for " + f"{export_path} with support-status={support_status or ''} " + f"and benchmark_certification_status={certification_statuses[0]}" + ) + PY + + - name: Derive ISB1 runner env + env: + INPUT_RUNNER: ${{ inputs.runner }} + INPUT_EXPORT_FILE: ${{ inputs.export-file }} + INPUT_MAX_MODEL_LEN: ${{ inputs.max-model-len }} + INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + INPUT_TP_OVERRIDE: ${{ inputs.tp-override }} + run: | + python3 - <<'PY' >> "$GITHUB_ENV" + import json + import os + import re + from pathlib import Path + + runner = os.environ["INPUT_RUNNER"].lower() + export_file = os.environ["INPUT_EXPORT_FILE"] + explicit_max_model_len = os.environ.get("INPUT_MAX_MODEL_LEN", "").strip() + max_concurrency = os.environ["INPUT_MAX_CONCURRENCY"] + + if runner.startswith(("h100", "h200", "b200")): + tp = 8 + else: + raise SystemExit( + f"ISB1 replay lane is NVIDIA-first in PR1b; unsupported runner '{runner}'." + ) + + tp_override = os.environ.get("INPUT_TP_OVERRIDE", "").strip() + if tp_override: + tp = int(tp_override) + + if tp < 8: + raise SystemExit( + f"ISB1 replay requires TP=8 on NVIDIA runners; derived TP={tp} for runner '{runner}'." + ) + + export_path = Path(export_file) + match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) + + if match: + isl = int(match.group("isl")) * 1024 + osl = int(match.group("osl")) * 1024 + else: + try: + payload = json.loads(export_path.read_text()) + except Exception as exc: + raise SystemExit( + f"Could not inspect preview export metadata from '{export_file}': {exc}" + ) + served_shape = payload.get("served_shape") or {} + isl = int(served_shape.get("isl", 0) or 0) + osl = int(served_shape.get("osl", 0) or 0) + if not explicit_max_model_len: + raise SystemExit( + "Mixed-shape preview exports require explicit max-model-len in the ISB1 config. " + f"Missing for '{export_file}'." + ) + + if explicit_max_model_len: + max_model_len = int(explicit_max_model_len) + else: + max_model_len = isl + osl + (200 if max(isl, osl) >= 8192 else 20) + + print(f"TP={tp}") + print("EP_SIZE=1") + print("DP_ATTENTION=false") + print("SPEC_DECODING=none") + print("DISAGG=false") + print(f"CONC={max_concurrency}") + print(f"ISL={isl}") + print(f"OSL={osl}") + print(f"MAX_MODEL_LEN={max_model_len}") + print("RANDOM_RANGE_RATIO=1.0") + print(f"EXPORT_STEM={Path(export_file).stem}") + PY + + - id: launch + name: Launch job script + env: + RUNNER_NAME: ${{ runner.name }} + RUNNER_TYPE: ${{ inputs.runner }} + run: | + RESULT_FILENAME="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_${BENCHMARK_TYPE}_${EXPORT_STEM}_conc${MAX_CONCURRENCY}_${RUNNER_NAME}" + echo "RESULT_FILENAME=${RESULT_FILENAME}" >> "$GITHUB_ENV" + echo "result_filename=${RESULT_FILENAME}" >> "$GITHUB_OUTPUT" + bash ./runners/launch_${RUNNER_NAME%%_*}.sh + + FOUND_RESULT_FILE= + for i in {1..10}; do + if [ -f "$RESULT_FILENAME.json" ]; then + FOUND_RESULT_FILE=true + break + fi + echo "Waiting for result file... (attempt $i)" + sleep 1 + done + + if [ -z "$FOUND_RESULT_FILE" ]; then + echo "Run failed: Replay result $RESULT_FILENAME.json not found."
>&2 + exit 1 + fi + + - name: Process result + run: | + python3 utils/process_result_isb1.py + + - name: Upload result + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_${{ steps.launch.outputs.result_filename }} + path: agg_${{ steps.launch.outputs.result_filename }}.json + + - name: Upload raw replay result + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: replay_${{ steps.launch.outputs.result_filename }} + path: ${{ steps.launch.outputs.result_filename }}.json + if-no-files-found: ignore + + - name: Upload server logs + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: server_logs_${{ steps.launch.outputs.result_filename }} + path: server.log + if-no-files-found: ignore + + - name: Upload GPU metrics + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: gpu_metrics_${{ steps.launch.outputs.result_filename }} + path: gpu_metrics.csv + if-no-files-found: ignore + + - name: Upload KV metrics + if: always() + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: kv_metrics_${{ steps.launch.outputs.result_filename }} + path: kv_metrics.csv + if-no-files-found: ignore + + - name: Resource cleanup (post-run) + if: always() + run: *resource-cleanup diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 353918609..6582914ca 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -29,6 +29,7 @@ jobs: pattern: ${{ inputs.result-prefix && format('{0}_*', inputs.result-prefix) || '*' }} - name: Print summary + if: inputs.result-prefix != 'isb1' run: | pip install tabulate python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY @@ -38,8 +39,29 @@ jobs: pip install tabulate python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }} + - name: ISB1 operator summary + if: inputs.result-prefix == 'isb1' + run: | + pip install tabulate + python3 utils/summarize_isb1.py results/ >> $GITHUB_STEP_SUMMARY + + - name: ISB1 gate report + if: inputs.result-prefix == 'isb1' + run: | + AGGREGATE_PATH="agg_${{ inputs.result-prefix }}.json" + python3 utils/gate_isb1.py "$AGGREGATE_PATH" | tee isb1_gate_report.json + python3 utils/gate_isb1.py "$AGGREGATE_PATH" --format markdown >> $GITHUB_STEP_SUMMARY + - name: Upload aggregated results uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: results_${{ inputs.result-prefix || 'all' }} path: agg_${{ inputs.result-prefix || 'all' }}.json + + - name: Upload ISB1 gate report + if: inputs.result-prefix == 'isb1' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_gate_report + path: isb1_gate_report.json + if-no-files-found: ignore diff --git a/.github/workflows/run-isb1-kv-stress-sweep.yml b/.github/workflows/run-isb1-kv-stress-sweep.yml new file mode 100644 index 000000000..f72ef3307 --- /dev/null +++ b/.github/workflows/run-isb1-kv-stress-sweep.yml @@ -0,0 +1,110 @@ +name: Run ISB1 KV Stress Sweep +run-name: ISB1 KV Stress - ${{ github.event.inputs.config-file || '.github/configs/isb1-kv-stress.yaml' }} + +on: + workflow_dispatch: + inputs: + config-file: + description: ISB1 KV stress config file path + required: true + default: .github/configs/isb1-kv-stress.yaml + runner-type: + description: Optional space-separated 
runner filters (e.g. h200 b200) + required: false + default: '' + runner-config: + description: Runner config YAML + required: false + default: .github/configs/runners.yaml + ref: + description: Git ref to checkout + required: false + default: '' + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + kv-stress-matrix: ${{ steps.generate.outputs.kv-stress-matrix }} + has-matrix: ${{ steps.generate.outputs.has-matrix }} + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: pip install pydantic pyyaml + + - id: generate + env: + CONFIG_FILE: ${{ inputs.config-file }} + RUNNER_CONFIG: ${{ inputs.runner-config }} + RUNNER_TYPE: ${{ inputs.runner-type }} + run: | + if [ ! -f "$CONFIG_FILE" ]; then + echo "Missing ISB1 KV stress config file: $CONFIG_FILE" >&2 + exit 1 + fi + + cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-kv-stress-sweep --config-files "$CONFIG_FILE" --runner-config "$RUNNER_CONFIG") + + if [ -n "$RUNNER_TYPE" ]; then + read -r -a runner_types <<< "$RUNNER_TYPE" + cmd+=(--runner-type "${runner_types[@]}") + fi + + matrix_json="$("${cmd[@]}")" + compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" + has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" + + { + echo "kv-stress-matrix=$compact_matrix" + echo "has-matrix=$has_matrix" + } >> "$GITHUB_OUTPUT" + + sweep: + needs: setup + if: ${{ needs.setup.outputs.has-matrix == 'true' }} + uses: ./.github/workflows/benchmark-isb1-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.kv-stress-matrix) }} + secrets: inherit + with: + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + precision: ${{ matrix.config.precision }} + framework: ${{ matrix.config.framework }} + exp-name: ${{ matrix.config.exp-name }} + benchmark-type: ${{ matrix.config.benchmark-type }} + export-file: ${{ matrix.config.export-file }} + runtime-stack-id: ${{ matrix.config.runtime-stack-id }} + hardware-profile-id: ${{ matrix.config.hardware-profile-id }} + canonical-model-id: ${{ matrix.config.canonical-model-id }} + support-status: ${{ matrix.config.support-status || '' }} + request-mode: ${{ matrix.config.request-mode }} + max-concurrency: ${{ matrix.config.max-concurrency }} + max-model-len: ${{ matrix.config.max-model-len || '' }} + tp-override: ${{ matrix.config.tp || '' }} + ep-override: ${{ matrix.config.ep || '' }} + trace-source: ${{ matrix.config.trace-source || '' }} + offload-mode: ${{ matrix.config.offload-mode }} + kv-cache-dtype: ${{ matrix.config.kv-cache-dtype }} + disable-prefix-caching: ${{ matrix.config.disable-prefix-caching }} + benchmark-duration-s: ${{ matrix.config.benchmark-duration-s }} + workload-type: ${{ matrix.config.workload-type }} + ref: ${{ inputs.ref || github.ref }} + + collect-results: + needs: [setup, sweep] + if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + result-prefix: isb1 diff --git a/.github/workflows/run-isb1-sweep.yml b/.github/workflows/run-isb1-sweep.yml new file mode 100644 index 000000000..a8f3177de 
--- /dev/null +++ b/.github/workflows/run-isb1-sweep.yml @@ -0,0 +1,256 @@ +name: Run ISB1 Sweep +run-name: ISB1 Sweep - ${{ github.event.inputs.config-files || '.github/configs/isb1-master.yaml' }} + +on: + workflow_dispatch: + inputs: + config-files: + description: Space-separated ISB1 config file paths + required: true + default: .github/configs/isb1-master.yaml + runner-config: + description: Runner config YAML + required: false + default: .github/configs/runners.yaml + model-prefix: + description: Optional space-separated model-prefix filters + required: false + default: '' + precision: + description: Optional space-separated precision filters + required: false + default: '' + framework: + description: Optional space-separated framework filters + required: false + default: '' + runner-type: + description: Optional space-separated runner filters + required: false + default: '' + runner-node-filter: + description: Optional runner-node substring filter + required: false + default: '' + max-concurrency: + description: Optional cap applied to replay max-concurrency + required: false + default: '' + vllm-cpu-offload-gb: + description: Optional vLLM CPU offload budget in GB for long-context runs + required: false + default: '' + vllm-swap-space-gb: + description: Optional vLLM swap-space budget in GB for long-context runs + required: false + default: '' + sglang-mem-fraction-override: + description: Optional SGLang mem-fraction-static override for long-context runs + required: false + default: '' + sglang-chunked-prefill-override: + description: Optional SGLang chunked-prefill-size override for long-context runs + required: false + default: '' + ref: + description: Git ref to checkout + required: false + default: '' + +jobs: + setup: + runs-on: ubuntu-latest + outputs: + replay-matrix: ${{ steps.generate.outputs.replay-matrix }} + has-matrix: ${{ steps.generate.outputs.has-matrix }} + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + ref: ${{ inputs.ref || github.ref }} + + - name: Install dependencies + run: pip install pydantic pyyaml + + - id: generate + env: + CONFIG_FILES: ${{ inputs.config-files }} + RUNNER_CONFIG: ${{ inputs.runner-config }} + MODEL_PREFIX: ${{ inputs.model-prefix }} + PRECISION: ${{ inputs.precision }} + FRAMEWORK: ${{ inputs.framework }} + RUNNER_TYPE: ${{ inputs.runner-type }} + RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} + MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + run: | + read -r -a config_files <<< "$CONFIG_FILES" + + for config_file in "${config_files[@]}"; do + if [ ! -f "$config_file" ]; then + echo "Missing ISB1 config file: $config_file" >&2 + echo "PR1b adds the workflow lane only; the committed config arrives in PR2." 
>&2 + exit 1 + fi + done + + cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-sweep --config-files "${config_files[@]}" --runner-config "$RUNNER_CONFIG") + + if [ -n "$MODEL_PREFIX" ]; then + read -r -a model_prefixes <<< "$MODEL_PREFIX" + cmd+=(--model-prefix "${model_prefixes[@]}") + fi + if [ -n "$PRECISION" ]; then + read -r -a precisions <<< "$PRECISION" + cmd+=(--precision "${precisions[@]}") + fi + if [ -n "$FRAMEWORK" ]; then + read -r -a frameworks <<< "$FRAMEWORK" + cmd+=(--framework "${frameworks[@]}") + fi + if [ -n "$RUNNER_TYPE" ]; then + read -r -a runner_types <<< "$RUNNER_TYPE" + cmd+=(--runner-type "${runner_types[@]}") + fi + if [ -n "$RUNNER_NODE_FILTER" ]; then + cmd+=(--runner-node-filter "$RUNNER_NODE_FILTER") + fi + if [ -n "$MAX_CONCURRENCY" ]; then + cmd+=(--max-concurrency "$MAX_CONCURRENCY") + fi + + matrix_json="$("${cmd[@]}")" + compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')" + has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')" + + { + echo "replay-matrix=$compact_matrix" + echo "has-matrix=$has_matrix" + } >> "$GITHUB_OUTPUT" + + - name: Write ISB1 preflight run manifest + env: + REPLAY_MATRIX: ${{ steps.generate.outputs.replay-matrix }} + HAS_MATRIX: ${{ steps.generate.outputs.has-matrix }} + INPUT_CONFIG_FILES: ${{ inputs.config-files }} + INPUT_RUNNER_CONFIG: ${{ inputs.runner-config }} + INPUT_MODEL_PREFIX: ${{ inputs.model-prefix }} + INPUT_PRECISION: ${{ inputs.precision }} + INPUT_FRAMEWORK: ${{ inputs.framework }} + INPUT_RUNNER_TYPE: ${{ inputs.runner-type }} + INPUT_RUNNER_NODE_FILTER: ${{ inputs.runner-node-filter }} + INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }} + INPUT_VLLM_CPU_OFFLOAD_GB: ${{ inputs.vllm-cpu-offload-gb }} + INPUT_VLLM_SWAP_SPACE_GB: ${{ inputs.vllm-swap-space-gb }} + INPUT_SGLANG_MEM_FRACTION_OVERRIDE: ${{ inputs.sglang-mem-fraction-override }} + INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE: ${{ inputs.sglang-chunked-prefill-override }} + INPUT_REF: ${{ inputs.ref || github.ref }} + WORKFLOW_RUN_ID: ${{ github.run_id }} + WORKFLOW_RUN_ATTEMPT: ${{ github.run_attempt }} + WORKFLOW_SHA: ${{ github.sha }} + run: | + python3 - <<'PY' + import json + import os + from collections import Counter + + matrix_rows = json.loads(os.environ.get("REPLAY_MATRIX") or "[]") + + def count_by(field: str) -> dict[str, int]: + values = [row.get(field) for row in matrix_rows] + normalized = ["" if value is None else str(value) for value in values] + return dict(sorted(Counter(normalized).items())) + + manifest = { + "dispatch_inputs": { + "config-files": os.environ.get("INPUT_CONFIG_FILES", ""), + "runner-config": os.environ.get("INPUT_RUNNER_CONFIG", ""), + "model-prefix": os.environ.get("INPUT_MODEL_PREFIX", ""), + "precision": os.environ.get("INPUT_PRECISION", ""), + "framework": os.environ.get("INPUT_FRAMEWORK", ""), + "runner-type": os.environ.get("INPUT_RUNNER_TYPE", ""), + "runner-node-filter": os.environ.get("INPUT_RUNNER_NODE_FILTER", ""), + "max-concurrency": os.environ.get("INPUT_MAX_CONCURRENCY", ""), + "vllm-cpu-offload-gb": os.environ.get("INPUT_VLLM_CPU_OFFLOAD_GB", ""), + "vllm-swap-space-gb": os.environ.get("INPUT_VLLM_SWAP_SPACE_GB", ""), + "sglang-mem-fraction-override": os.environ.get("INPUT_SGLANG_MEM_FRACTION_OVERRIDE", ""), + "sglang-chunked-prefill-override": os.environ.get("INPUT_SGLANG_CHUNKED_PREFILL_OVERRIDE", ""), + "ref": os.environ.get("INPUT_REF", ""), + }, 
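+ # Aggregated counts below summarize the generated matrix so operators can sanity-check the sweep shape without reading the full matrix_rows dump.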
+ "matrix_summary": { + "has_matrix": os.environ.get("HAS_MATRIX", "false"), + "total_cells": len(matrix_rows), + "by_model_prefix": count_by("model-prefix"), + "by_framework": count_by("framework"), + "by_runner": count_by("runner"), + "by_support_status": count_by("support-status"), + }, + "workflow_context": { + "run_id": os.environ.get("WORKFLOW_RUN_ID", ""), + "run_attempt": os.environ.get("WORKFLOW_RUN_ATTEMPT", ""), + "sha": os.environ.get("WORKFLOW_SHA", ""), + }, + "matrix_rows": matrix_rows, + } + + with open("isb1_run_manifest.json", "w", encoding="utf-8") as fh: + json.dump(manifest, fh, indent=2, sort_keys=True) + PY + + - name: Upload ISB1 run manifest + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: isb1_run_manifest + path: isb1_run_manifest.json + if-no-files-found: error + + sweep: + needs: setup + if: ${{ needs.setup.outputs.has-matrix == 'true' }} + uses: ./.github/workflows/benchmark-isb1-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.replay-matrix) }} + secrets: inherit + with: + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + precision: ${{ matrix.config.precision }} + framework: ${{ matrix.config.framework }} + exp-name: ${{ matrix.config.exp-name }} + benchmark-type: ${{ matrix.config.benchmark-type }} + export-file: ${{ matrix.config.export-file }} + runtime-stack-id: ${{ matrix.config.runtime-stack-id }} + hardware-profile-id: ${{ matrix.config.hardware-profile-id }} + canonical-model-id: ${{ matrix.config.canonical-model-id }} + support-status: ${{ matrix.config.support-status || '' }} + request-mode: ${{ matrix.config.request-mode }} + max-concurrency: ${{ matrix.config.max-concurrency }} + max-sessions: ${{ matrix.config.max-sessions || '' }} + max-turns-per-session: ${{ matrix.config.max-turns-per-session || '' }} + max-output-len: ${{ matrix.config.max-output-len || '' }} + num-warmup-sessions: ${{ matrix.config.num-warmup-sessions || '0' }} + ignore-waits: ${{ matrix.config.ignore-waits || false }} + ignore-eos: ${{ matrix.config.ignore-eos || false }} + max-model-len: ${{ matrix.config.max-model-len || '' }} + offload-mode: ${{ matrix.config.offload-mode || '' }} + kv-cache-dtype: ${{ matrix.config.kv-cache-dtype || '' }} + disable-prefix-caching: ${{ matrix.config.disable-prefix-caching || false }} + benchmark-duration-s: ${{ matrix.config.benchmark-duration-s || '' }} + vllm-cpu-offload-gb: ${{ inputs.vllm-cpu-offload-gb || '' }} + vllm-swap-space-gb: ${{ inputs.vllm-swap-space-gb || '' }} + sglang-mem-fraction-override: ${{ inputs.sglang-mem-fraction-override || '' }} + sglang-chunked-prefill-override: ${{ inputs.sglang-chunked-prefill-override || '' }} + ref: ${{ inputs.ref || github.ref }} + + collect-results: + needs: [setup, sweep] + if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + result-prefix: isb1 diff --git a/.gitignore b/.gitignore index 03d36472a..1b87019c5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ **/__pycache__/** -**/.coverage \ No newline at end of file +**/.coverage +**/.DS_Store +prompt-exports/ +.claude \ No newline at end of file diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 535313252..ea35df323 100644 --- a/benchmarks/benchmark_lib.sh +++ 
b/benchmarks/benchmark_lib.sh @@ -66,6 +66,304 @@ stop_gpu_monitor() { GPU_MONITOR_PID="" } +KV_METRICS_PID="" +KV_METRICS_CSV="/workspace/kv_metrics.csv" +VLLM_OFFLOAD_EXTRA_ARGS="" +VLLM_EXTRA_ARGS="" +SGLANG_EXTRA_ARGS="" + +build_yarn_override_json() { + local max_model_len="${1:?}" + local factor="2.0" + if (( max_model_len > 600000 )); then + factor="4.0" + fi + echo "{\"text_config\":{\"rope_parameters\":{\"mrope_interleaved\":true,\"mrope_section\":[11,11,10],\"rope_type\":\"yarn\",\"rope_theta\":10000000,\"partial_rotary_factor\":0.25,\"factor\":${factor},\"original_max_position_embeddings\":262144}}}" +} + +apply_yarn_config_if_needed() { + local model="${1:?}" + local max_model_len="${2:?}" + if [[ "$model" == *"Qwen3.5"* || "$model" == *"qwen3.5"* || "$model" == *"Qwen3_5"* ]] && (( max_model_len > 262144 )); then + YARN_OVERRIDE_JSON=$(build_yarn_override_json "$max_model_len") + export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 + echo "YaRN enabled: factor=$(echo "$YARN_OVERRIDE_JSON" | grep -o '"factor":[0-9.]*' | cut -d: -f2) for max-model-len=$max_model_len" + fi +} + +_append_config_kv_once() { + local key="$1" + local value="$2" + + if [[ ! -f config.yaml ]]; then + return 0 + fi + + if ! grep -Eq "^${key}:" config.yaml; then + echo "${key}: ${value}" >> config.yaml + fi +} + +_remove_config_kv() { + local key="$1" + + if [[ ! -f config.yaml ]]; then + return 0 + fi + + local tmp_file + tmp_file=$(mktemp) + grep -Ev "^${key}:" config.yaml > "$tmp_file" + mv "$tmp_file" config.yaml +} + +_detect_total_cpu_dram_gb() { + if [[ -n "${TOTAL_CPU_DRAM_GB:-}" ]]; then + echo "${TOTAL_CPU_DRAM_GB}" + return 0 + fi + + if [[ -f /proc/meminfo ]]; then + awk '/MemTotal/{printf "%.0f", $2/1048576}' /proc/meminfo + return 0 + fi + + if command -v sysctl >/dev/null 2>&1; then + local mem_bytes + mem_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo "") + if [[ -n "$mem_bytes" ]]; then + awk -v bytes="$mem_bytes" 'BEGIN {printf "%.0f", bytes/1073741824}' + return 0 + fi + fi + + echo "64" +} + +apply_vllm_offload_config() { + local mode="${OFFLOAD_MODE:-legacy}" + local detected_dram_gb="" + + VLLM_OFFLOAD_EXTRA_ARGS="" + VLLM_EXTRA_ARGS="" + + case "$mode" in + on) + PREFIX_CACHING_CONFIG="" + _remove_config_kv "no-enable-prefix-caching" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + detected_dram_gb="$(_detect_total_cpu_dram_gb)" + VLLM_OFFLOAD_EXTRA_ARGS="--kv_offloading_backend native --kv_offloading_size ${detected_dram_gb} --disable-hybrid-kv-cache-manager" + ;; + off) + PREFIX_CACHING_CONFIG="" + _remove_config_kv "no-enable-prefix-caching" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + ;; + noprefix) + PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" + _remove_config_kv "cpu-offload-gb" + _remove_config_kv "swap-space" + _append_config_kv_once "no-enable-prefix-caching" "true" + ;; + legacy|"") + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" + fi + ;; + *) + echo "WARN: Unknown OFFLOAD_MODE='${mode}', falling back to legacy behavior" >&2 + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + _append_config_kv_once "cpu-offload-gb" "${VLLM_CPU_OFFLOAD_GB}" + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + _append_config_kv_once "swap-space" "${VLLM_SWAP_SPACE_GB}" + fi + ;; + esac + + if [[ 
"${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" + _append_config_kv_once "no-enable-prefix-caching" "true" + fi + + if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then + _append_config_kv_once "kv-cache-dtype" "fp8" + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + VLLM_EXTRA_ARGS="${VLLM_EXTRA_ARGS:-} --hf-overrides '${YARN_OVERRIDE_JSON}'" + fi +} + +apply_sglang_offload_config() { + local mode="${OFFLOAD_MODE:-legacy}" + + SGLANG_EXTRA_ARGS="" + + case "$mode" in + on) + echo "WARN: OFFLOAD_MODE=on requested for SGLang, but native KV offload is not supported. Leaving cache mode unchanged." >&2 + ;; + off) + RADIX_CACHE_ARGS="" + ;; + noprefix) + RADIX_CACHE_ARGS="--disable-radix-cache" + ;; + legacy|"") + ;; + *) + echo "WARN: Unknown OFFLOAD_MODE='${mode}' for SGLang; leaving radix cache args unchanged." >&2 + ;; + esac + + if [[ "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + RADIX_CACHE_ARGS="--disable-radix-cache" + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + SGLANG_EXTRA_ARGS="${SGLANG_EXTRA_ARGS:-} --json-model-override-args '${YARN_OVERRIDE_JSON}'" + fi +} + +# launch_vllm_server [extra args...] +# Sets: SERVER_PID, SERVER_LOG +launch_vllm_server() { + local model="$1" + local port="$2" + local config_yaml_path="$3" + shift 3 || true + local extra_args=("$@") + + if [[ -z "$model" || -z "$port" || -z "$config_yaml_path" ]]; then + echo "launch_vllm_server requires: model port config_yaml_path" >&2 + return 1 + fi + + hf download "$model" + apply_vllm_offload_config + + SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" + + local vllm_max_num_seqs="${VLLM_MAX_NUM_SEQS:-}" + if [[ -z "$vllm_max_num_seqs" ]]; then + local conc_value="${CONC:-256}" + if [[ "$conc_value" =~ ^[0-9]+$ ]] && (( conc_value > 256 )); then + vllm_max_num_seqs="$conc_value" + else + vllm_max_num_seqs="256" + fi + fi + + local vllm_tp="${TP:-1}" + local vllm_gpu_mem_util="${VLLM_GPU_MEMORY_UTILIZATION:-0.9}" + + local offload_args=() + if [[ -n "$VLLM_OFFLOAD_EXTRA_ARGS" ]]; then + # shellcheck disable=SC2206 + offload_args=($VLLM_OFFLOAD_EXTRA_ARGS) + fi + + PYTHONNOUSERSITE=1 vllm serve "$model" --host 0.0.0.0 --port "$port" \ + --config "$config_yaml_path" \ + --gpu-memory-utilization "$vllm_gpu_mem_util" \ + --tensor-parallel-size "$vllm_tp" \ + --max-num-seqs "$vllm_max_num_seqs" \ + "${extra_args[@]}" \ + "${offload_args[@]}" \ + > "$SERVER_LOG" 2>&1 & + + SERVER_PID=$! + export SERVER_PID + export SERVER_LOG +} + +# launch_sglang_server [extra args...] +# Sets: SERVER_PID, SERVER_LOG +launch_sglang_server() { + local model="$1" + local port="$2" + shift 2 || true + local extra_args=("$@") + + if [[ -z "$model" || -z "$port" ]]; then + echo "launch_sglang_server requires: model port" >&2 + return 1 + fi + + hf download "$model" + if [[ -n "${OFFLOAD_MODE:-}" || "${DISABLE_PREFIX_CACHING:-false}" == "true" ]]; then + apply_sglang_offload_config + fi + + SERVER_LOG="${SERVER_LOG:-/workspace/server.log}" + + local sglang_tp="${TP:-1}" + local sglang_dp="${DP_SIZE:-1}" + + PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ + --model-path "$model" \ + --host 0.0.0.0 \ + --port "$port" \ + --tensor-parallel-size "$sglang_tp" \ + --data-parallel-size "$sglang_dp" \ + "${extra_args[@]}" \ + > "$SERVER_LOG" 2>&1 & + + SERVER_PID=$! 
+ export SERVER_PID + export SERVER_LOG +} + +start_kv_metrics_collector() { + local port="${1:-8888}" + local output="${2:-$KV_METRICS_CSV}" + local interval="${3:-2.0}" + local collector_script + + collector_script="$(cd "$(dirname "${BASH_SOURCE[0]}")/../datasets/isb1/scripts" && pwd)/metrics_collector.py" + + if [[ ! -f "$collector_script" ]]; then + echo "[KV Metrics] Collector script not found at $collector_script, skipping" + return 0 + fi + + if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then + echo "[KV Metrics] Collector already running (PID=$KV_METRICS_PID)" + return 0 + fi + + KV_METRICS_CSV="$output" + python3 "$collector_script" \ + --metrics-url "http://0.0.0.0:${port}/metrics" \ + --output "$output" \ + --interval "$interval" >/tmp/kv_metrics_collector.log 2>&1 & + KV_METRICS_PID=$! + + echo "[KV Metrics] Started (PID=$KV_METRICS_PID, interval=${interval}s, output=$output)" +} + +stop_kv_metrics_collector() { + if [[ -n "$KV_METRICS_PID" ]] && kill -0 "$KV_METRICS_PID" 2>/dev/null; then + kill "$KV_METRICS_PID" 2>/dev/null || true + wait "$KV_METRICS_PID" 2>/dev/null || true + echo "[KV Metrics] Stopped (PID=$KV_METRICS_PID)" + if [[ -f "$KV_METRICS_CSV" ]]; then + local lines + lines=$(wc -l < "$KV_METRICS_CSV") + echo "[KV Metrics] Collected $lines rows -> $KV_METRICS_CSV" + fi + fi + KV_METRICS_PID="" +} + # Check if required environment variables are set # Usage: check_env_vars VAR1 VAR2 VAR3 ... # Exits with code 1 if any variable is not set @@ -395,6 +693,194 @@ run_benchmark_serving() { return $benchmark_exit_code } +is_isb1_replay_benchmark() { + [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] +} + +is_isb1_kv_stress_benchmark() { + [[ "${BENCHMARK_TYPE:-}" == "isb1_kv_stress" ]] +} + +resolve_replay_request_mode_for_harness() { + local requested_mode="${1:-auto}" + + case "$requested_mode" in + ""|auto|chat|completions) + printf '%s' "${requested_mode:-auto}" + ;; + multi-turn|multi_turn|multiturn) + printf 'auto' + ;; + *) + echo "WARN: Unsupported replay request mode '$requested_mode'; using 'auto' for the harness boundary" >&2 + printf 'auto' + ;; + esac +} + +run_isb1_kv_stress_campaign_cell() { + check_env_vars \ + BENCHMARK_TYPE \ + EXPORT_FILE \ + MAX_CONCURRENCY \ + OFFLOAD_MODE \ + BENCHMARK_DURATION_S \ + KV_CACHE_DTYPE \ + WORKLOAD_TYPE + + if ! is_isb1_kv_stress_benchmark; then + echo "Error: run_isb1_kv_stress_campaign_cell called with BENCHMARK_TYPE='${BENCHMARK_TYPE:-}'" >&2 + return 1 + fi + + local port="${PORT:-8888}" + local kv_metrics_output="/workspace/kv_metrics.csv" + local metadata_path="/workspace/kv_stress_campaign_metadata.json" + local replay_exit_code=0 + + start_gpu_monitor + start_kv_metrics_collector "$port" "$kv_metrics_output" 2.0 + + run_benchmark_export_replay "$@" || replay_exit_code=$? 
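+  # Tear the collectors down even when the replay fails; the captured exit
+  # code is propagated after the campaign metadata snapshot below.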
+ + stop_kv_metrics_collector + stop_gpu_monitor + + python3 - <<'PY' +import json +import os +import time + +metadata = { + "benchmark_type": os.getenv("BENCHMARK_TYPE", ""), + "export_file": os.getenv("EXPORT_FILE", ""), + "runtime_stack_id": os.getenv("RUNTIME_STACK_ID", ""), + "hardware_profile_id": os.getenv("HARDWARE_PROFILE_ID", ""), + "canonical_model_id": os.getenv("CANONICAL_MODEL_ID", ""), + "request_mode": os.getenv("REQUEST_MODE", ""), + "max_concurrency": os.getenv("MAX_CONCURRENCY", ""), + "offload_mode": os.getenv("OFFLOAD_MODE", ""), + "disable_prefix_caching": os.getenv("DISABLE_PREFIX_CACHING", ""), + "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE", ""), + "benchmark_duration_s": os.getenv("BENCHMARK_DURATION_S", ""), + "workload_type": os.getenv("WORKLOAD_TYPE", ""), + "metrics_files": { + "gpu": "/workspace/gpu_metrics.csv", + "kv": "/workspace/kv_metrics.csv", + }, + "captured_at_epoch_s": int(time.time()), +} +with open("/workspace/kv_stress_campaign_metadata.json", "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2, sort_keys=True) +PY + + echo "[KV Stress] Campaign metadata written to $metadata_path" + return "$replay_exit_code" +} + +run_single_node_benchmark() { + if ! is_isb1_replay_benchmark && ! is_isb1_kv_stress_benchmark; then + run_benchmark_serving "$@" + return $? + fi + + set +x + local model="" + local port="" + local result_filename="" + local result_dir="" + local workspace_dir="" + local trust_remote_code=false + local server_pid="" + + while [[ $# -gt 0 ]]; do + case $1 in + --model) model="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --result-filename) result_filename="$2"; shift 2 ;; + --result-dir) result_dir="$2"; shift 2 ;; + --bench-serving-dir) workspace_dir="$2"; shift 2 ;; + --trust-remote-code) trust_remote_code=true; shift ;; + --server-pid) server_pid="$2"; shift 2 ;; + --backend|--input-len|--output-len|--random-range-ratio|--num-prompts|--max-concurrency) + shift 2 + ;; + --use-chat-template) + shift + ;; + *) + echo "Unknown parameter: $1" + return 1 + ;; + esac + done + + if [[ -z "$model" ]]; then + echo "Error: --model is required" + return 1 + fi + if [[ -z "$port" ]]; then + echo "Error: --port is required" + return 1 + fi + if [[ -z "$result_filename" ]]; then + echo "Error: --result-filename is required" + return 1 + fi + if [[ -z "$result_dir" ]]; then + echo "Error: --result-dir is required" + return 1 + fi + + local replay_args=( + --model "$model" + --port "$port" + --export-file "${EXPORT_FILE}" + --runtime-stack-id "${RUNTIME_STACK_ID}" + --hardware-profile-id "${HARDWARE_PROFILE_ID}" + --canonical-model-id "${CANONICAL_MODEL_ID}" + --request-mode "${REQUEST_MODE:-auto}" + --max-concurrency "${MAX_CONCURRENCY}" + --num-warmup-sessions "${NUM_WARMUP_SESSIONS:-0}" + --result-filename "$result_filename" + --result-dir "$result_dir" + ) + + if [[ -n "$workspace_dir" ]]; then + replay_args+=(--bench-serving-dir "$workspace_dir") + fi + if [[ -n "${MAX_SESSIONS:-}" ]]; then + replay_args+=(--max-sessions "${MAX_SESSIONS}") + fi + if [[ -n "${SUPPORT_STATUS:-}" ]]; then + replay_args+=(--support-status "${SUPPORT_STATUS}") + fi + if [[ -n "${MAX_TURNS_PER_SESSION:-}" ]]; then + replay_args+=(--max-turns-per-session "${MAX_TURNS_PER_SESSION}") + fi + if [[ -n "${MAX_OUTPUT_LEN:-}" ]]; then + replay_args+=(--max-output-len "${MAX_OUTPUT_LEN}") + fi + if [[ "${IGNORE_WAITS:-false}" == "true" ]]; then + replay_args+=(--ignore-waits) + fi + if [[ "${IGNORE_EOS:-false}" == "true" ]]; then + replay_args+=(--ignore-eos) + 
fi + if [[ "$trust_remote_code" == true ]]; then + replay_args+=(--trust-remote-code) + fi + if [[ -n "$server_pid" ]]; then + replay_args+=(--server-pid "$server_pid") + fi + + if is_isb1_kv_stress_benchmark; then + run_isb1_kv_stress_campaign_cell "${replay_args[@]}" + else + run_benchmark_export_replay "${replay_args[@]}" + fi +} + # -------------------------------- # Profiling trace helpers @@ -805,3 +1291,215 @@ run_eval() { fi return $eval_rc } + + +# --------------------------------------------------------------------------- +# Multi-turn benchmark wrapper +# --------------------------------------------------------------------------- + +# Run multi-turn chat benchmark with standardized parameters. +# Exercises growing KV cache across conversation turns via /v1/chat/completions. +# +# IMPORTANT: The server MUST be started with prefix/radix caching ENABLED +# for meaningful multi-turn results. Do NOT use --disable-radix-cache or +# --no-enable-prefix-caching with multi-turn benchmarks. +# Replay ISB1 export sessions/events against a running server. +# +# Supports: +# - inferencex_multiturn exports via /v1/chat/completions (standalone vLLM/SGLang) +# - inferencex_trace_replay exports via either chat or projected completions +# mode (useful for TRT / Dynamo-style cells) +# +# Parameters: +# --model: Model name sent to the target server +# --port: Server port +# --export-file: Path to export JSON +# --runtime-stack-id: Filter selected export cells to one runtime stack +# --hardware-profile-id: Filter selected export cells to one hardware row +# --canonical-model-id: Filter selected export cells to one canonical model row +# --request-mode: auto|chat|completions (default: auto) +# --max-concurrency: Max concurrent replay sessions +# --num-warmup-sessions: Warmup sessions before measurement +# --result-filename: Result filename without extension +# --result-dir: Result directory +# --max-sessions: Optional session limit for smoke runs +# --max-turns-per-session: Optional turn cap for smoke runs +# --max-output-len: Optional per-turn output cap +# --ignore-waits: Ignore inter-turn wait gaps from export metadata +# --trust-remote-code: Optional flag +# --server-pid: Optional server process ID to monitor +run_benchmark_export_replay() { + set +x + local model="" + local port="" + local export_file="" + local runtime_stack_id="" + local hardware_profile_id="" + local canonical_model_id="" + local trace_id="" + local support_status="" + local request_mode="auto" + local max_concurrency="8" + local num_warmup_sessions="1" + local result_filename="" + local result_dir="" + local workspace_dir="" + local max_sessions="" + local max_turns_per_session="" + local max_output_len="" + local ignore_waits=false + local trust_remote_code=false + local ignore_eos=false + local server_pid="" + + while [[ $# -gt 0 ]]; do + case $1 in + --model) model="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --export-file) export_file="$2"; shift 2 ;; + --runtime-stack-id) runtime_stack_id="$2"; shift 2 ;; + --hardware-profile-id) hardware_profile_id="$2"; shift 2 ;; + --canonical-model-id) canonical_model_id="$2"; shift 2 ;; + --trace-id) trace_id="$2"; shift 2 ;; + --support-status) support_status="$2"; shift 2 ;; + --request-mode) request_mode="$2"; shift 2 ;; + --max-concurrency) max_concurrency="$2"; shift 2 ;; + --num-warmup-sessions) num_warmup_sessions="$2"; shift 2 ;; + --result-filename) result_filename="$2"; shift 2 ;; + --result-dir) result_dir="$2"; shift 2 ;; + --bench-serving-dir) workspace_dir="$2"; 
shift 2 ;; + --max-sessions) max_sessions="$2"; shift 2 ;; + --max-turns-per-session) max_turns_per_session="$2"; shift 2 ;; + --max-output-len) max_output_len="$2"; shift 2 ;; + --ignore-waits) ignore_waits=true; shift ;; + --trust-remote-code) trust_remote_code=true; shift ;; + --ignore-eos) ignore_eos=true; shift ;; + --server-pid) server_pid="$2"; shift 2 ;; + *) echo "Unknown parameter: $1"; return 1 ;; + esac + done + + if [[ -z "$model" ]]; then echo "Error: --model is required"; return 1; fi + if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi + if [[ -z "$export_file" ]]; then echo "Error: --export-file is required"; return 1; fi + if [[ -z "$result_filename" ]]; then echo "Error: --result-filename is required"; return 1; fi + if [[ -z "$result_dir" ]]; then echo "Error: --result-dir is required"; return 1; fi + + if [[ -z "$workspace_dir" ]]; then + workspace_dir=$(pwd) + fi + + local requested_request_mode="$request_mode" + local harness_request_mode + harness_request_mode=$(resolve_replay_request_mode_for_harness "$request_mode") + + local benchmark_cmd=( + python3 "$workspace_dir/utils/bench_serving/benchmark_export_replay.py" + --model "$model" + --base-url "http://0.0.0.0:$port" + --export-file "$export_file" + --request-mode "$harness_request_mode" + --max-concurrency "$max_concurrency" + --num-warmup-sessions "$num_warmup_sessions" + --save-result + --result-dir "$result_dir" + --result-filename "$result_filename.json" + --metadata + "benchmark_type=${BENCHMARK_TYPE:-isb1_replay}" + "export_file=$export_file" + "runtime_stack_id=$runtime_stack_id" + "hardware_profile_id=$hardware_profile_id" + "canonical_model_id=$canonical_model_id" + "request_mode=$requested_request_mode" + "harness_request_mode=$harness_request_mode" + ) + + if [[ -n "${WORKLOAD_TYPE:-}" ]]; then + benchmark_cmd+=(--metadata "workload_type=${WORKLOAD_TYPE}") + fi + if [[ -n "${BENCHMARK_DURATION_S:-}" ]]; then + benchmark_cmd+=(--metadata "benchmark_duration_s=${BENCHMARK_DURATION_S}") + fi + if [[ -n "${OFFLOAD_MODE:-}" ]]; then + benchmark_cmd+=(--metadata "offload_mode=${OFFLOAD_MODE}") + fi + if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then + benchmark_cmd+=(--metadata "kv_cache_dtype=${KV_CACHE_DTYPE}") + fi + if [[ -n "${DISABLE_PREFIX_CACHING:-}" ]]; then + benchmark_cmd+=(--metadata "disable_prefix_caching=${DISABLE_PREFIX_CACHING}") + fi + + if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=${VLLM_CPU_OFFLOAD_GB}") + fi + if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + benchmark_cmd+=(--metadata "vllm_swap_space_gb=${VLLM_SWAP_SPACE_GB}") + fi + if [[ -n "${SGLANG_MEM_FRACTION_OVERRIDE:-}" ]]; then + benchmark_cmd+=(--metadata "sglang_mem_fraction_override=${SGLANG_MEM_FRACTION_OVERRIDE}") + fi + if [[ -n "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-}" ]]; then + benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=${SGLANG_CHUNKED_PREFILL_OVERRIDE}") + fi + + if [[ -n "$runtime_stack_id" ]]; then + benchmark_cmd+=(--runtime-stack-id "$runtime_stack_id") + fi + if [[ -n "$hardware_profile_id" ]]; then + benchmark_cmd+=(--hardware-profile-id "$hardware_profile_id") + fi + if [[ -n "$canonical_model_id" ]]; then + benchmark_cmd+=(--canonical-model-id "$canonical_model_id") + fi + if [[ -n "$trace_id" ]]; then + benchmark_cmd+=(--trace-id "$trace_id") + fi + if [[ -n "$support_status" ]]; then + benchmark_cmd+=(--support-status "$support_status") + fi + if [[ -n "$max_sessions" ]]; then + benchmark_cmd+=(--max-sessions "$max_sessions") + fi + 
if [[ -n "$max_turns_per_session" ]]; then + benchmark_cmd+=(--max-turns-per-session "$max_turns_per_session") + fi + if [[ -n "$max_output_len" ]]; then + benchmark_cmd+=(--max-output-len "$max_output_len") + fi + if [[ "$ignore_waits" == true ]]; then + benchmark_cmd+=(--ignore-waits) + fi + if [[ "$trust_remote_code" == true ]]; then + benchmark_cmd+=(--trust-remote-code) + fi + if [[ "$ignore_eos" == true ]]; then + benchmark_cmd+=(--ignore-eos) + fi + + set -x + if [[ -n "$server_pid" ]]; then + "${benchmark_cmd[@]}" & + local benchmark_pid=$! + + while kill -0 "$benchmark_pid" 2>/dev/null; do + if ! kill -0 "$server_pid" 2>/dev/null; then + echo "ERROR: Server process $server_pid died during export replay benchmark" + kill "$benchmark_pid" 2>/dev/null + wait "$benchmark_pid" 2>/dev/null + set +x + return 1 + fi + sleep 2 + done + + wait "$benchmark_pid" + local benchmark_exit_code=$? + else + "${benchmark_cmd[@]}" + local benchmark_exit_code=$? + fi + set +x + + return $benchmark_exit_code +} diff --git a/benchmarks/single_node/dsr1_fp4_b200.sh b/benchmarks/single_node/dsr1_fp4_b200.sh index d88941628..e11290b95 100644 --- a/benchmarks/single_node/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/dsr1_fp4_b200.sh @@ -31,13 +31,26 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -45,7 +58,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0. --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ ---enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-symm-mem $RADIX_CACHE_ARGS --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
@@ -54,7 +67,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -64,7 +77,8 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -73,5 +87,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/dsr1_fp8_b200.sh index e6d8a0e9c..0fbe9bd6c 100644 --- a/benchmarks/single_node/dsr1_fp8_b200.sh +++ b/benchmarks/single_node/dsr1_fp8_b200.sh @@ -38,9 +38,9 @@ if [[ $TP -eq 8 ]]; then MAX_RUNNING_REQUESTS=128 CUDA_GRAPH_MAX_BATCH_SIZE=128 - MEM_FRAC_STATIC=0.82 - CHUNKED_PREFILL_SIZE=32768 - MAX_PREFILL_TOKENS=32768 + MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" + CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" elif [[ $TP -eq 4 ]]; then if [[ $ISL -ne 8192 ]] || [[ $OSL -ne 1024 ]]; then echo "TP=4 not yet supported for ISL=$ISL OSL=$OSL!" @@ -52,9 +52,9 @@ elif [[ $TP -eq 4 ]]; then MAX_RUNNING_REQUESTS=32 CUDA_GRAPH_MAX_BATCH_SIZE=32 - MEM_FRAC_STATIC=0.95 - CHUNKED_PREFILL_SIZE=8192 - MAX_PREFILL_TOKENS=8192 + MEM_FRAC_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.95}" + CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-8192}" + MAX_PREFILL_TOKENS="$CHUNKED_PREFILL_SIZE" SCHEDULER_RECV_INTERVAL=10 else @@ -63,21 +63,34 @@ else fi echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ --tensor-parallel-size=$TP --data-parallel-size=1 \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ ---enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ ---attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend flashinfer_trtllm --quantization fp8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL $RADIX_CACHE_ARGS \ +--attention-backend trtllm_mla --stream-interval 30 --ep-size $EP_SIZE --moe-runner-backend 
flashinfer_trtllm --quantization fp8 $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -86,7 +99,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -96,7 +109,8 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -105,5 +119,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor -set +x \ No newline at end of file +set +x diff --git a/benchmarks/single_node/dsr1_fp8_b200_vllm.sh b/benchmarks/single_node/dsr1_fp8_b200_vllm.sh new file mode 100644 index 000000000..5c3639fa9 --- /dev/null +++ b/benchmarks/single_node/dsr1_fp8_b200_vllm.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
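+# With OFFLOAD_MODE=on, apply_vllm_offload_config above fills VLLM_OFFLOAD_EXTRA_ARGS,
+# so the serve line effectively gains (size = detected host DRAM in GB):
+#   --kv_offloading_backend native --kv_offloading_size <dram_gb> --disable-hybrid-kv-cache-manager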
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200.sh b/benchmarks/single_node/dsr1_fp8_h200.sh index c820d180b..a9730917a 100644 --- a/benchmarks/single_node/dsr1_fp8_h200.sh +++ b/benchmarks/single_node/dsr1_fp8_h200.sh @@ -23,34 +23,50 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi export TORCH_CUDA_ARCH_LIST="9.0" -EVAL_CONTEXT_ARGS="" +RUNTIME_CONTEXT_ARGS="" +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + RUNTIME_CONTEXT_ARGS="--context-length $MAX_MODEL_LEN" +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" + RUNTIME_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.82}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" set -x if [[ $ISL -eq 1024 && $OSL -eq 1024 ]]; then PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - --disable-radix-cache --max-running-requests 512 --cuda-graph-max-bs 512 \ - --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ + $RADIX_CACHE_ARGS --max-running-requests 512 --cuda-graph-max-bs 512 \ + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & else PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL \ --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ - --disable-radix-cache --max-running-requests 256 --cuda-graph-max-bs 256 \ - --chunked-prefill-size 32768 --max-prefill-tokens 32768 --mem-fraction-static 0.82 \ + $RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ --attention-backend flashinfer --stream-interval 10 \ --decode-log-interval 1 \ - $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + $RUNTIME_CONTEXT_ARGS > $SERVER_LOG 2>&1 & fi SERVER_PID=$! @@ -58,7 +74,7 @@ SERVER_PID=$! 
# Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -68,7 +84,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -77,5 +94,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/dsr1_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1_fp8_h200_vllm.sh new file mode 100644 index 000000000..65348e831 --- /dev/null +++ b/benchmarks/single_node/dsr1_fp8_h200_vllm.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh new file mode 100755 index 000000000..60f06b13e --- /dev/null +++ b/benchmarks/single_node/dsr1triattn_fp8_h100_vllm.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H100. 
+# +# Differences from baseline dsr1_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
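+# TriAttention is applied inside the serving process: the plugin is assumed to read
+# TRIATTN_RUNTIME_KV_BUDGET and TRIATTN_RUNTIME_SPARSE_STATS_PATH from the environment
+# at startup, so the vllm invocation itself is unchanged from the baseline script.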
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh b/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh new file mode 100755 index 000000000..1c4722964 --- /dev/null +++ b/benchmarks/single_node/dsr1triattn_fp8_h200_vllm.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for DeepSeek-R1 FP8 on H200. +# +# Differences from baseline dsr1_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/deepseek_r1_0528_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size $TP \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index f6a6f72e9..95240230e 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -34,15 +34,33 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + cat > config.yaml << EOF kv-cache-dtype: fp8 compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 @@ -52,6 +70,9 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x vllm serve $MODEL --host 0.0.0.0 --port $PORT \ @@ -59,7 +80,8 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests $VLLM_OFFLOAD_EXTRA_ARGS \ +> 
$SERVER_LOG 2>&1 & SERVER_PID=$! @@ -68,7 +90,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -78,7 +100,8 @@ run_benchmark_serving \ --num-prompts $(( CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -87,5 +110,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_b200_sglang.sh b/benchmarks/single_node/gptoss_fp4_b200_sglang.sh new file mode 100644 index 000000000..f3d9ad82c --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_b200_sglang.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export NCCL_NVLS_ENABLE=1 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true +export PYTHONUNBUFFERED=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ +--trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +--cuda-graph-max-bs 128 --max-running-requests 128 \ +--mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 \ +--context-length "$CONTEXT_LENGTH" --kv-cache-dtype fp8_e4m3 \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ +--reasoning-parser gpt-oss --tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
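+# When OFFLOAD_MODE is set, the collector started above samples the server's /metrics
+# endpoint every 2s into /workspace/kv_metrics.csv; a quick post-run sanity check:
+#   tail -n 5 /workspace/kv_metrics.csv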
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 8d0e773a2..dc5baf287 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -17,20 +17,42 @@ fi hf download "$MODEL" -MAX_MODEL_LEN=10240 +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN}" +else + MAX_MODEL_LEN=10240 +fi if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark; then + PREFIX_CACHING_CONFIG="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + cat > config.yaml << EOF -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $MAX_MODEL_LEN EOF +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 SERVER_LOG=/workspace/server.log @@ -38,13 +60,17 @@ PORT=${PORT:-8888} # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi set -x vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --config config.yaml \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ ---max-num-seqs=$CONC > $SERVER_LOG 2>&1 & +--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & SERVER_PID=$! 
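# Note: $VLLM_OFFLOAD_EXTRA_ARGS is expanded unquoted on purpose, so an empty value
# contributes no argument and a populated value word-splits into separate flags.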
@@ -53,7 +79,7 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -63,7 +89,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -72,5 +99,8 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi stop_gpu_monitor set +x diff --git a/benchmarks/single_node/gptoss_fp4_h100_sglang.sh b/benchmarks/single_node/gptoss_fp4_h100_sglang.sh new file mode 100644 index 000000000..a045cd99c --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_h100_sglang.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 2a9359b96..9be9959bf 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -18,7 +18,9 @@ fi hf download "$MODEL" # Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor +if ! 
is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi set -x pip install datasets pandas @@ -37,14 +39,21 @@ if [ "${EVAL_ONLY}" = "true" ]; then CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" fi +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + # Create config.yaml cat > config.yaml << EOF -no-enable-prefix-caching: true +$PREFIX_CACHING_CONFIG max-cudagraph-capture-size: 2048 max-num-batched-tokens: 8192 max-model-len: $CALCULATED_MAX_MODEL_LEN EOF +apply_vllm_offload_config + SERVER_LOG=/workspace/server.log export TORCH_CUDA_ARCH_LIST="9.0" PORT=${PORT:-8888} @@ -55,14 +64,15 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --config config.yaml \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ - --max-num-seqs $CONC > $SERVER_LOG 2>&1 & + --max-num-seqs $CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & SERVER_PID=$! # Wait for server to be ready wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -run_benchmark_serving \ +run_single_node_benchmark \ --model "$MODEL" \ --port "$PORT" \ --backend vllm \ @@ -72,7 +82,8 @@ run_benchmark_serving \ --num-prompts $(( $CONC * 10 )) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then @@ -81,5 +92,7 @@ if [ "${RUN_EVAL}" = "true" ]; then fi # Stop GPU monitoring -stop_gpu_monitor +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi set +x diff --git a/benchmarks/single_node/gptoss_fp4_h200_sglang.sh b/benchmarks/single_node/gptoss_fp4_h200_sglang.sh new file mode 100644 index 000000000..069b1a452 --- /dev/null +++ b/benchmarks/single_node/gptoss_fp4_h200_sglang.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 \ +$RADIX_CACHE_ARGS --max-running-requests 256 --cuda-graph-max-bs 256 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens 32768 --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" --reasoning-parser gpt-oss --stream-interval 10 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
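+# Replay and KV-stress cells keep the radix cache enabled so multi-turn sessions can
+# reuse prefixes; OFFLOAD_MODE=noprefix restores --disable-radix-cache via
+# apply_sglang_offload_config above.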
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh new file mode 100755 index 000000000..cfff2a12d --- /dev/null +++ b/benchmarks/single_node/gptosstriattn_fp4_h100_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H100. +# +# Differences from baseline gptoss_fp4_h100.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN}" +else + MAX_MODEL_LEN=10240 +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export PYTHONNOUSERSITE=1 +export VLLM_MXFP4_USE_MARLIN=1 +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--config config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh b/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh new file mode 100755 index 000000000..fc6f465bc --- /dev/null +++ b/benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for GPT-OSS-120B FP4 on H200. +# +# Differences from baseline gptoss_fp4_h100.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. 
+TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/gpt_oss_120b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +if is_isb1_replay_benchmark && [ -n "${MAX_MODEL_LEN:-}" ]; then + MAX_MODEL_LEN="${MAX_MODEL_LEN}" +else + MAX_MODEL_LEN=10240 +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export PYTHONNOUSERSITE=1 +export VLLM_MXFP4_USE_MARLIN=1 +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--config config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC $VLLM_OFFLOAD_EXTRA_ARGS \ +> $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
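+# The workload-derived default can be pinned explicitly (illustrative value):
+#   TRIATTN_RUNTIME_KV_BUDGET=4096 bash benchmarks/single_node/gptosstriattn_fp4_h200_vllm.sh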
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( $CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh new file mode 100755 index 000000000..97fb5127c --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b200_sglang.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export NCCL_NVLS_ENABLE=1 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true +export PYTHONUNBUFFERED=1 +export TORCH_CUDA_ARCH_LIST="10.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \ +--trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--quantization fp8 --kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ +--cuda-graph-max-bs "$CONC" --max-running-requests 128 \ +--mem-fraction-static "$MEM_FRACTION_STATIC" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --max-prefill-tokens "$CHUNKED_PREFILL_SIZE" \ +--context-length "$CONTEXT_LENGTH" \ +--attention-backend trtllm_mha --moe-runner-backend flashinfer_trtllm \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--tokenizer-worker-num 6 --stream-interval 30 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh new file mode 100755 index 000000000..e48c56700 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_b200_vllm.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 +export VLLM_FLASHINFER_ALLREDUCE_BACKEND=mnnvl + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! 
is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh new file mode 100755 index 000000000..61df75cff --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h100_sglang.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--quantization fp8 --kv-cache-dtype fp8_e4m3 \ +--mamba-ssm-dtype bfloat16 \ +$RADIX_CACHE_ARGS --enable-flashinfer-allreduce-fusion \ +--max-running-requests 128 --cuda-graph-max-bs 128 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --mem-fraction-static "$MEM_FRACTION_STATIC" \ +--context-length "$CONTEXT_LENGTH" \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--attention-backend flashinfer \ +--stream-interval 30 --tokenizer-worker-num 6 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_single_node_benchmark \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --server-pid "$SERVER_PID"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+    stop_kv_metrics_collector
+fi
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh
new file mode 100755
index 000000000..6f576ea0f
--- /dev/null
+++ b/benchmarks/single_node/qwen3.5_fp8_h100_vllm.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))}
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+fi
+
+PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true"
+if is_isb1_replay_benchmark; then
+    PREFIX_CACHING_CONFIG=""
+fi
+
+cat > config.yaml << EOF
+kv-cache-dtype: fp8
+$PREFIX_CACHING_CONFIG
+max-cudagraph-capture-size: 2048
+max-num-batched-tokens: 8192
+max-model-len: $CALCULATED_MAX_MODEL_LEN
+EOF
+
+if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
+    echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml
+fi
+if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
+    echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml
+fi
+# Apply offload config only after config.yaml exists: the heredoc above
+# truncates the file, so anything written before it would be lost.
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+    apply_vllm_offload_config
+fi
+
+export TORCH_CUDA_ARCH_LIST="9.0"
+export PYTHONNOUSERSITE=1
+
+start_gpu_monitor
+if [[ -n "${OFFLOAD_MODE:-}" ]]; then
+    start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0
+fi
+
+set -x
+vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
+--config config.yaml \
+--gpu-memory-utilization 0.9 \
+--tensor-parallel-size "$TP" \
+--max-num-seqs 256 \
+--disable-log-requests \
+--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \
+> "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh b/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh new file mode 100755 index 000000000..b3d5ea50b --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h200_sglang.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +export TORCH_CUDA_ARCH_LIST="9.0" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CONTEXT_LENGTH="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CONTEXT_LENGTH" + +MEM_FRACTION_STATIC="${SGLANG_MEM_FRACTION_OVERRIDE:-0.8}" +CHUNKED_PREFILL_SIZE="${SGLANG_CHUNKED_PREFILL_OVERRIDE:-16384}" + +RADIX_CACHE_ARGS="--disable-radix-cache" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + RADIX_CACHE_ARGS="" +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL" \ +--host 0.0.0.0 --port "$PORT" --trust-remote-code \ +--tensor-parallel-size="$TP" --data-parallel-size=1 --ep-size 1 \ +--reasoning-parser qwen3 --tool-call-parser qwen3_coder \ +--enable-flashinfer-allreduce-fusion \ +--max-running-requests 128 \ +--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ +--mem-fraction-static "$MEM_FRACTION_STATIC" \ +--cuda-graph-max-bs 128 \ +--context-length "$CONTEXT_LENGTH" \ +--kv-cache-dtype fp8_e4m3 \ +--quantization fp8 \ +--attention-backend flashinfer \ +--stream-interval 30 \ +--tokenizer-worker-num 6 \ +--mamba-ssm-dtype bfloat16 \ +$RADIX_CACHE_ARGS \ +$SGLANG_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! 
is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh new file mode 100755 index 000000000..de5c66c44 --- /dev/null +++ b/benchmarks/single_node/qwen3.5_fp8_h200_vllm.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +apply_yarn_config_if_needed "$MODEL" "$CALCULATED_MAX_MODEL_LEN" + +PREFIX_CACHING_CONFIG="no-enable-prefix-caching: true" +if is_isb1_replay_benchmark || is_isb1_kv_stress_benchmark; then + PREFIX_CACHING_CONFIG="" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +$PREFIX_CACHING_CONFIG +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +apply_vllm_offload_config + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +if ! is_isb1_kv_stress_benchmark; then + start_gpu_monitor +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS $VLLM_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if ! is_isb1_kv_stress_benchmark; then + stop_gpu_monitor +fi +set +x diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh new file mode 100755 index 000000000..87e81ab22 --- /dev/null +++ b/benchmarks/single_node/qwen3.5triattn_fp8_h100_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H100. 
+# +# Differences from baseline qwen3.5_fp8_h100_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." +fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh b/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh new file mode 100755 index 000000000..83fb3b8c6 --- /dev/null +++ b/benchmarks/single_node/qwen3.5triattn_fp8_h200_vllm.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# TriAttention-enabled vLLM benchmark for Qwen 3.5 FP8 on H200. +# +# Differences from baseline qwen3.5_fp8_h200_vllm.sh: +# - Installs triattention vLLM plugin +# - Sets TRIATTN_RUNTIME_KV_BUDGET (2048 for code, 12000 for chat workloads) +# - Sets TRIATTN_RUNTIME_SPARSE_STATS_PATH when calibrated stats are available +# - Lowers max-num-batched-tokens to 1024 (prevents OOM from large prefill chunks) +# - Explicitly disables prefix caching (incompatible with KV compression) + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +# --- TriAttention plugin setup --- +pip install -q triattention 2>/dev/null || echo "[TriAttention] Package not pre-installed; relying on container image." + +# Auto-detect KV budget from export filename: chat workloads get larger budget. +TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-2048}" +if [[ "${EXPORT_FILE:-}" == *chat_* ]]; then + TRIATTN_KV_BUDGET="${TRIATTN_RUNTIME_KV_BUDGET:-12000}" +fi +export TRIATTN_RUNTIME_KV_BUDGET="$TRIATTN_KV_BUDGET" + +# Use pre-calibrated sparse stats if available on the runner. +TRIATTN_STATS="/workspace/triattn_stats/qwen3_5_397b_a17b_stats.pt" +if [[ -f "$TRIATTN_STATS" ]]; then + export TRIATTN_RUNTIME_SPARSE_STATS_PATH="$TRIATTN_STATS" + echo "[TriAttention] Using calibrated stats: $TRIATTN_STATS" +else + echo "[TriAttention] No calibrated stats found at $TRIATTN_STATS; using budget-only compression." 
+fi + +export ENABLE_TRIATTENTION=1 +echo "[TriAttention] KV_BUDGET=$TRIATTN_KV_BUDGET STATS=${TRIATTN_RUNTIME_SPARSE_STATS_PATH:-}" +# --- End TriAttention setup --- + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-$((ISL + OSL + 200))} +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + CALCULATED_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +enable-prefix-caching: false +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 1024 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then + echo "cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB}" >> config.yaml +fi +if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then + echo "swap-space: ${VLLM_SWAP_SPACE_GB}" >> config.yaml +fi +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_vllm_offload_config +fi + +export TORCH_CUDA_ARCH_LIST="9.0" +export PYTHONNOUSERSITE=1 + +start_gpu_monitor +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + start_kv_metrics_collector "${PORT:-8888}" /workspace/kv_metrics.csv 2.0 +fi + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +--config config.yaml \ +--gpu-memory-utilization 0.9 \ +--tensor-parallel-size "$TP" \ +--max-num-seqs 256 \ +--disable-log-requests \ +--trust-remote-code $VLLM_OFFLOAD_EXTRA_ARGS \ +> "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_single_node_benchmark \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --server-pid "$SERVER_PID" \ + --trust-remote-code + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + stop_kv_metrics_collector +fi +stop_gpu_monitor +set +x diff --git a/datasets/isb1/.gitattributes b/datasets/isb1/.gitattributes new file mode 100644 index 000000000..d7fa37c52 --- /dev/null +++ b/datasets/isb1/.gitattributes @@ -0,0 +1,2 @@ +exports/**/*.json linguist-generated=true +exports/**/*.json text eol=lf diff --git a/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md new file mode 100644 index 000000000..175765ab1 --- /dev/null +++ b/datasets/isb1/COEXISTENCE_WITH_KV_CACHE_TESTER.md @@ -0,0 +1,122 @@ +--- +version: 1.0.0 +date: 2026-04-14 +author: William Chen +status: proposed +--- + +# ISB1 ↔ kv-cache-tester Coexistence Plan + +## The Two Systems + +| | kv-cache-tester (Cameron's) | ISB1 (ours) | +|---|---|---| +| **Location** | `experimental/multiturn/vllm_benchmark/kv-cache-tester/` | `datasets/isb1/exports/` | +| **Traces** | 522 real Claude Code sessions | 35 synthetic multi-turn traces | +| **Source** | Real production agentic workloads | Synthetic with controlled stress patterns | +| **Replay** | `trace_replay_tester.py` | `benchmark_export_replay.py` | +| **Config** | `multiturn-agentic-trace.yaml` | `isb1-kv-stress-pr993.yaml` | +| **Metrics** | Prometheus sidecar (`metrics_collector.py`) | `process_result_isb1.py` | + +## Why Both Are Needed + +**kv-cache-tester** shows how chips perform under **real workloads** — actual Claude Code +sessions with natural token distributions. 
This is the ground truth for "how does inference +actually work in production?" + +**ISB1** shows how chips perform under **controlled stress conditions** — specific KV cache +behaviors that real workloads rarely trigger but production systems must handle: + +| Stress Pattern | kv-cache-tester | ISB1 | +|---|---|---| +| Natural agentic workload distribution | ✅ (522 real traces) | ❌ | +| Targeted prefix reuse testing | ❌ | ✅ (high_prefix stress class) | +| Forced KV offload cliff | ❌ (depends on trace) | ✅ (offload_cliff stress, 128K-1M context) | +| Session reactivation after idle | ❌ | ✅ (reactivation stress, idle windows) | +| KV compaction under long sessions | ❌ | ✅ (compaction_heavy stress, 25+ turns) | +| Shared prefix fanout | ❌ | ✅ (fanout stress, branching requests) | +| 500K-1M context depth | ❌ (real traces are shorter) | ✅ (xlc2/ulc1/ulc2 bands) | + +Together they give the Pareto frontier Cameron wants: kv-cache-tester at realistic operating +points, ISB1 at stress-test extremes. + +## How They Coexist in PR #993 + +### Configs (no conflict) +```yaml +# Cameron's existing config — uses kv-cache-tester traces +# .github/configs/multiturn-agentic-trace.yaml +h200-fp8-llama70b: + trace-file: experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/... + +# Our config — uses ISB1 export traces +# .github/configs/isb1-kv-stress-pr993.yaml +dsr1-fp8-h200-isb1-kv-stress-vllm-pr993: + export-file: datasets/isb1/exports/extension_131k/vllm/code_131k1k.json +``` + +### Workflows (no conflict) +```yaml +# Cameron's workflow +# .github/workflows/multiturn-sweep.yml → benchmark-multiturn-tmpl.yml +# Uses: trace_replay_tester.py + +# Our workflow +# .github/workflows/run-isb1-sweep.yml → benchmark-isb1-tmpl.yml +# Uses: benchmark_export_replay.py +``` + +### Data directories (no conflict) +``` +experimental/multiturn/vllm_benchmark/ ← Cameron's (untouched) + kv-cache-tester/ 522 real traces + replayer + aiperf/ AIPerf submodule + bench/metrics_collector.py Prometheus sidecar + analysis/plot_pareto.py Pareto charts + +datasets/isb1/ ← Ours (separate directory) + exports/ ISB1 replay bundles + extension_131k/ 131K context (DSR1, GPT-OSS, Qwen) + preview/long_context_500k/ 500K Qwen preview + preview/long_context_1m/ 1M Qwen preview +``` + +### Shared infrastructure we USE from PR #993 +- vLLM offload API flags (`--kv_offloading_backend native`, etc.) +- Prometheus metrics collector (could share `metrics_collector.py`) +- Offload mode sweep pattern (on/off/noprefix) +- Runner launch scripts (`runners/launch_*.sh`) +- Concurrency sweep structure + +### What we DO NOT touch +- `experimental/multiturn/vllm_benchmark/` — entirely Cameron's +- `kv-cache-tester/` submodule — real traces, don't modify +- `aiperf/` submodule — alternative benchmark, don't modify +- `benchmark-multiturn-tmpl.yml` — Cameron's workflow template + +## Recommended PR Structure + +### Option A: Single PR with two benchmark lanes (cleanest) +PR #993 ships with BOTH: +- Lane 1: kv-cache-tester (real traces) — Cameron's existing work +- Lane 2: ISB1 (synthetic stress traces) — our addition + +Both use the same vLLM server configs, offload modes, and concurrency sweeps. +Results are compared side by side — real vs stress. + +### Option B: ISB1 as follow-up PR (safest) +PR #993 ships with kv-cache-tester only (Cameron's work). +We submit a follow-up PR that adds ISB1 as a second benchmark lane. +Uses the same runner infrastructure and offload configs. 
+
+### Recommendation: Option A
+Cameron explicitly asked for "realistic multi-turn benchmarks" at GTC. Having both
+real traces AND synthetic stress traces in the same PR makes a stronger story:
+"Here's how chips perform under real workloads AND here's where they break under
+targeted KV stress." That's the complete Pareto frontier.
+
+## What We Need From Cameron's Team
+1. Confirm ISB1 configs don't conflict with multiturn-agentic-trace.yaml
+2. Confirm datasets/isb1/exports/ is the right location for our files
+3. Decide: do we share metrics_collector.py or use process_result_isb1.py?
+4. Agree on result format for combined Pareto visualization
diff --git a/datasets/isb1/GMI_EXECUTION_PLAN.md b/datasets/isb1/GMI_EXECUTION_PLAN.md
new file mode 100644
index 000000000..1ae696acd
--- /dev/null
+++ b/datasets/isb1/GMI_EXECUTION_PLAN.md
@@ -0,0 +1,175 @@
+# ISB1 KV Cache Benchmark — GMI Cloud Execution Plan
+
+## Available Hardware
+
+| GPU | HBM | Available | Max Context Before Offload |
+|-----|-----|-----------|---------------------------|
+| **GB200** | 192GB HBM3e | ✅ | ~384K tokens (FP8 KV) |
+| **H100** | 80GB HBM3 | ✅ | ~128K tokens (FP8 KV) |
+
+## Execution Order
+
+Run benchmarks in this order — cheapest/fastest first to validate the setup works.
+
+### Phase 1: Validation Run (1 hour)
+
+Prove the pipeline works end-to-end before burning GPU hours.
+
+```bash
+# On H100 — single model, single concurrency, 5 min duration
+export MODEL=deepseek-ai/DeepSeek-R1-0528
+export TP=8
+export EXPORT_FILE=datasets/isb1/exports/extension_131k/vllm/code_131k1k.json
+
+# Launch server
+bash benchmarks/single_node/dsr1_fp8_h100_vllm.sh
+
+# Run ONE cell: 2 users, offload=off, 300s
+python utils/bench_serving/benchmark_export_replay.py \
+  --export-file $EXPORT_FILE \
+  --max-concurrency 2 \
+  --duration 300 \
+  --request-mode multi-turn
+
+# Verify result has actual_context_len > 0
+python utils/process_result_isb1.py --result-file results/*.json
+```
+
+**Pass criteria:** TTFT and throughput numbers appear. `actual_context_len` > 100K.
+
+### Phase 2: H100 KV Stress Sweep (9 hours)
+
+H100 80GB is the interesting GPU — KV cache fills up first.
+
+```bash
+# Models to test:
+# 1. DeepSeek-R1 FP8 (TP8)
+# 2. GPT-OSS FP4 (TP8)
+
+# Sweep per model:
+# users: [2, 4, 8, 16, 32, 64]  # H100 can't do 128+ at 131K
+# offload-modes: [on, off, noprefix]
+# duration: 1800s (30 min)
+# export: extension_131k/vllm/code_131k1k.json
+
+# Total cells: 2 models × 6 concurrency × 3 offload = 36 cells
+# Time: 36 × 30min = 18 hours of runtime → ~9 hours wall-clock with the
+# two models run in parallel on separate H100 nodes
+```
+
+**What to look for:**
+- Offload cliff: at what concurrency does offload=on start helping?
+- Prefix cache hit rate: does it stay >50% under load?
+- Preemption count: how many requests get evicted?
+- TTFT degradation: when does p99 TTFT exceed 10s?
+
+### Phase 3: GB200 KV Stress Sweep (18 hours)
+
+GB200 192GB has 2.4x more HBM — the cliff comes later.
+
+```bash
+# Same sweep but higher concurrency (more HBM room):
+# users: [2, 4, 8, 16, 32, 64, 128, 256]
+# offload-modes: [on, off, noprefix]
+# duration: 1800s
+
+# Add Qwen 3.5 (needs more memory for MoE):
+# 3 models × 8 concurrency × 3 offload = 72 cells
+# Time: 72 × 30min = 36 hours → cut duration to 900s to land at ~18 hours
+```
+
+**What to look for:**
+- At what concurrency does GB200 hit its offload cliff?
+- Is the cliff ~3x H100's, as the max-context table implies once the fixed
+  weight footprint is subtracted from HBM? (See the sketch below.)
+- Does 192GB allow prefix caching to stay effective longer?
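+
+A back-of-envelope KV sizing model explains why the capacity ratio should exceed
+the raw 2.4x HBM ratio. The sketch below is illustrative only: the attention
+dimensions and the 40GB per-GPU weight footprint are placeholder assumptions,
+not measured values for any model in this plan.
+
+```python
+# Hypothetical KV-capacity estimate; every model dimension here is an assumption.
+def max_kv_tokens(hbm_gb, weights_gb=40, layers=72, kv_heads=8, head_dim=256,
+                  dtype_bytes=1, overhead_frac=0.10):
+    """Tokens of KV cache that fit after weights and runtime overhead."""
+    free_bytes = (hbm_gb - weights_gb) * (1 - overhead_frac) * 1024**3
+    bytes_per_token = 2 * layers * kv_heads * head_dim * dtype_bytes  # K and V
+    return int(free_bytes / bytes_per_token)
+
+# Weights are a fixed cost, so KV capacity grows faster than HBM:
+print(max_kv_tokens(80))    # H100:  131072 tokens (~128K)
+print(max_kv_tokens(192))   # GB200: ~498K tokens, ~3.8x under these assumptions
+```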
+ +### Phase 4: Long Context Preview (4 hours, GB200 only) + +500K and 1M token traces — only GB200 has enough memory. + +```bash +# 500K preview (Qwen 3.5 only): +export EXPORT_FILE=datasets/isb1/exports/preview/long_context_500k/\ +inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json + +# 1M preview (Qwen 3.5 only): +export EXPORT_FILE=datasets/isb1/exports/preview/long_context_1m/\ +inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json + +# Low concurrency (these are HUGE contexts): +# users: [1, 2, 4] +# offload-modes: [on, off] +# duration: 900s +``` + +**What to look for:** +- Can GB200 serve 1M context at all? +- What's the TTFT for a 1M token prefill? +- Does KV offload work at this scale? + +## Estimated GPU Time + +| Phase | GPU | Duration | Cost (est) | +|-------|-----|----------|------------| +| 1. Validation | H100 | 1 hour | ~$3 | +| 2. H100 sweep | H100 | 9 hours | ~$27 | +| 3. GB200 sweep | GB200 | 18 hours | ~$90 | +| 4. Long context | GB200 | 4 hours | ~$20 | +| **Total** | | **32 hours** | **~$140** | + +## Portable Run Script + +Use `gmi_portable_benchmark.sh` for manual runs without GitHub Actions: + +```bash +# Set GMI-specific env vars +export GMI_API_KEY="..." +export HF_TOKEN="..." +export MODEL=deepseek-ai/DeepSeek-R1-0528 +export GPU_TYPE=h100 # or gb200 + +# Run the portable benchmark +bash datasets/isb1/scripts/gmi_portable_benchmark.sh \ + --model $MODEL \ + --gpu $GPU_TYPE \ + --export-file datasets/isb1/exports/extension_131k/vllm/code_131k1k.json \ + --users 2,4,8,16,32,64 \ + --offload-modes on,off,noprefix \ + --duration 1800 +``` + +## Result Collection + +After each phase, results go to: +``` +results/ +├── h100_dsr1_fp8_kv_stress/ +│ ├── users_2_offload_on.json +│ ├── users_2_offload_off.json +│ └── ... +└── gb200_dsr1_fp8_kv_stress/ + └── ... +``` + +Process and visualize: +```bash +# Aggregate results +python datasets/isb1/scripts/collect_sweep_results.py \ + --results-dir results/ \ + --output results/sweep_summary.json + +# Generate Pareto frontier chart +python datasets/isb1/scripts/plot_pareto.py \ + --summary results/sweep_summary.json \ + --output results/pareto_frontier.png +``` + +## What Success Looks Like + +After all phases, we have: +1. **Pareto frontier chart:** throughput vs p99 TTFT for H100 and GB200 +2. **Offload cliff identification:** exact concurrency where offload starts helping +3. **Prefix cache benefit:** measured hit rate under realistic multi-turn load +4. **HBM scaling evidence:** does 2.4x more HBM give 2.4x more capacity? +5. **Long context feasibility:** can GB200 serve 500K/1M context at all? + +These results go into the InferenceX PR as evidence that the benchmark works. diff --git a/datasets/isb1/README.md b/datasets/isb1/README.md new file mode 100644 index 000000000..e3746eb58 --- /dev/null +++ b/datasets/isb1/README.md @@ -0,0 +1,125 @@ +# ISB1 replay artifacts for InferenceX + +This directory is the InferenceX-side consumer package for ISB1 replay. + +InferenceX consumes committed file artifacts only: +- replay export JSON bundles under `datasets/isb1/exports/` +- consumer configs in `.github/configs/isb1-*.yaml` +- replay processing through `utils/bench_serving/benchmark_export_replay.py` +- result normalization through `utils/process_result_isb1.py` + + +## Why not random data? + +Random data benchmarks show worst-case performance. Real inference workloads +have multi-turn conversations where each turn shares context with previous +turns. 
This enables: + +- **Prefix caching** — 60-95% of each request's tokens are shared with the + previous turn. Prefix cache hit rates directly affect throughput. +- **KV cache reuse** — the server reuses computed KV cache entries instead of + recomputing them. This is the biggest performance optimization in production. +- **Realistic offload behavior** — KV cache grows across turns, eventually + exceeding GPU memory and requiring CPU offload. Random data never reaches + this point because each request is independent. + +These traces stress-test the exact KV cache behaviors that determine real +production performance. + +InferenceX does **not** import external runtime code and does **not** make live-serving claims from export-file existence alone. + +--- + +## Current ground truth (verified 2026-04-12) + +The definitive strict audit found: + +- **26 PASSED** +- **0 FAILED** +- **10 N/A** + +Strict audit rule: count only model-architecture-valid cells. + +### Strict verified coverage + +| Model | Chat | Code | +|---|---|---| +| `dsr1` | `8k`, `32k`, `64k`, `131k` | `8k`, `32k`, `64k`, `131k` | +| `gptoss` | `8k`, `32k`, `64k`, `131k` | `8k`, `32k`, `64k`, `131k` | +| `qwen3.5` | `8k`, `32k`, `64k`, `131k`, `500k` | `8k`, `32k`, `64k`, `131k`, `500k` | + +### Existing but excluded from the strict pass count + +- `gptoss` `500k` chat/code preview files exist, but strict coverage stops at `131k` +- `qwen3.5` `1M` chat/code preview files exist, but were excluded from the strict audit +- `dsr1` has no strict `500k` or `1M` lane because the model tops out at `163840` + +--- + +## Inventory + +### Export-file counts + +- **50 export files** +- **3 JSON manifests** +- **53 total JSON files** under `datasets/isb1/exports/` +- **888 total cells** +- **5,094 total turns** +- **13 MB actual message content** +- **All export files are valid JSON** + +### Export-file breakdown + +| Class | Count | +|---|---:| +| Core `8k1k` | 8 | +| Extension `32k1k` | 8 | +| Extension `64k1k` | 8 | +| Extension `131k1k` | 10 | +| Preview `offload_core` | 4 | +| Preview `500k` | 8 | +| Preview `1M` | 4 | +| JSON manifests | 3 | + +--- + +## Claim boundary + +Safe claims: +- InferenceX carries the full audited ISB1 replay corpus described above. +- Strict replay-file coverage is **26 passed / 0 failed / 10 N/A**. +- DSR1 strict coverage stops at `131k`. +- GPT-OSS strict coverage stops at `131k`. +- Qwen strict coverage reaches `500k`. +- GPT-OSS `500k` and Qwen `1M` files exist, but are excluded from the strict pass count. 
+ +Unsafe claims: +- `26/26` valid cells verified (10 N/A due to model `max_position_embeddings` limits: DSR1=163,840, GPT-OSS=131,072, Qwen3.5=1,010,000) +- strict GPT-OSS `500k` coverage +- strict Qwen `1M` coverage +- turning preview-file existence into live benchmark certification + +--- + +## Key docs + +- [`COVERAGE_AUDIT_2026-04-11.md`](COVERAGE_AUDIT_2026-04-11.md) — definitive strict audit, file-path mapping, and N/A rationale +- [`LONG_CONTEXT_TRUTH_MATRIX.md`](LONG_CONTEXT_TRUTH_MATRIX.md) — canonical claim boundary +- [`SUPPORT_MATRIX.md`](SUPPORT_MATRIX.md) — lane-by-lane audited support table +- [`PRODUCER_GAPS.md`](PRODUCER_GAPS.md) — what remains truly open vs no longer applicable +- [`RUNBOOK_EXTERNAL_GMI.md`](RUNBOOK_EXTERNAL_GMI.md) — external operator path +- [`RUNBOOK_INTERNAL_SEMIANALYSIS.md`](RUNBOOK_INTERNAL_SEMIANALYSIS.md) — internal workflow-backed path +- [`INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md`](INVESTIGATION_KV_CACHE_PROFILING_2026-04-11.md) — what the long-context preview paths actually measure + +--- + +## Export roots + +- `datasets/isb1/exports/core/` +- `datasets/isb1/exports/extension_32k/` +- `datasets/isb1/exports/extension_64k/` +- `datasets/isb1/exports/extension_131k/` +- `datasets/isb1/exports/preview/offload_core/` +- `datasets/isb1/exports/preview/long_context_500k/` +- `datasets/isb1/exports/preview/long_context_1m/` + diff --git a/datasets/isb1/exports/core/chat_8k1k.json b/datasets/isb1/exports/core/chat_8k1k.json new file mode 100644 index 000000000..c3c2e1124 --- /dev/null +++ b/datasets/isb1/exports/core/chat_8k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08070a63d22aa247e38475fdd7e206ea41bab731f2499f0d32210b317933b075 +size 3615534 diff --git a/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json b/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json new file mode 100644 index 000000000..243cea119 --- /dev/null +++ b/datasets/isb1/exports/core/chat_8k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d60ff42c01d6bf117a6bddac7eae99cef2d052235101fa540fd3a7eb6466de +size 136407 diff --git a/datasets/isb1/exports/core/code_8k1k.json b/datasets/isb1/exports/core/code_8k1k.json new file mode 100644 index 000000000..1c1dd2461 --- /dev/null +++ b/datasets/isb1/exports/core/code_8k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c746a650eb624d9d40ee19aad4a9d126b4e60602f13793c09a6a8cfde81d6ee +size 2605444 diff --git a/datasets/isb1/exports/core/code_8k1k_qwen3.5.json b/datasets/isb1/exports/core/code_8k1k_qwen3.5.json new file mode 100644 index 000000000..52957e59e --- /dev/null +++ b/datasets/isb1/exports/core/code_8k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e4fc73e3ff51469ad736fda8e15df09a14bd2d430d8a9a1600ae2ca1cd13075 +size 138620 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k.json b/datasets/isb1/exports/extension_131k/chat_131k1k.json new file mode 100644 index 000000000..daefd2dad --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab224b3f15a3118204a912a3e53f3081c96ac2be1f4861b4dda5593580b2da1 +size 1231308 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json b/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json new file mode 100644 index 000000000..e1ce42508 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k_dsr1.json @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:ea824f39557d4bc7cc5a3e09c61815ebd32b2a7c3e78046c62c4d9da340f92d2 +size 312933 diff --git a/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json b/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json new file mode 100644 index 000000000..c25a74094 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/chat_131k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20550fdc8fbb1aeaa9cf2b4fdb7807f4e8abcac5b2f871de573ea061f88e8dc5 +size 312996 diff --git a/datasets/isb1/exports/extension_131k/code_131k1k.json b/datasets/isb1/exports/extension_131k/code_131k1k.json new file mode 100644 index 000000000..99915e4cd --- /dev/null +++ b/datasets/isb1/exports/extension_131k/code_131k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66df69260749a22f4af2d2d25a6dce23b3b466533f75338da599db87ace6e833 +size 5461532 diff --git a/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json new file mode 100644 index 000000000..0b041fb66 --- /dev/null +++ b/datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcd048663de0e325e601cdc44b0683a2dfbeecd53fe277937131250e1a86b3e4 +size 5027435 diff --git a/datasets/isb1/exports/extension_32k/chat_32k1k.json b/datasets/isb1/exports/extension_32k/chat_32k1k.json new file mode 100644 index 000000000..7378882af --- /dev/null +++ b/datasets/isb1/exports/extension_32k/chat_32k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606a6174834ddac7704bd199995d1b3f7c1d34b39ad4a904b80b09a22b1b04dc +size 1390574 diff --git a/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json b/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json new file mode 100644 index 000000000..8fd721f45 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/chat_32k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a44061cd4fac9b02347afcd4cbbfc4e5152020f23d6eccfccf548e198b4b7c70 +size 351049 diff --git a/datasets/isb1/exports/extension_32k/code_32k1k.json b/datasets/isb1/exports/extension_32k/code_32k1k.json new file mode 100644 index 000000000..5a09c88f5 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/code_32k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49442fa6a1ec7114c26da5aa61ec7b7dfc6662f5e636edd95e5a019ae47ca2be +size 1337748 diff --git a/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json b/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json new file mode 100644 index 000000000..a110e6c14 --- /dev/null +++ b/datasets/isb1/exports/extension_32k/code_32k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f74b077263ea89567e9a09cfcecc5ea90040891170d4d65636156f9349733aa +size 337547 diff --git a/datasets/isb1/exports/extension_64k/chat_64k1k.json b/datasets/isb1/exports/extension_64k/chat_64k1k.json new file mode 100644 index 000000000..709a833b2 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/chat_64k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e7fa8895d4774cf36d9d78d9f02a35282f420598e7b373c5378330ea663b05 +size 2473612 diff --git a/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json b/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json new file mode 100644 index 000000000..79ad2cb87 --- 
/dev/null +++ b/datasets/isb1/exports/extension_64k/chat_64k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0533834348310306dc9e56ad4d54671a7615c9d7852fa677320bad51ee2ceaa6 +size 621810 diff --git a/datasets/isb1/exports/extension_64k/code_64k1k.json b/datasets/isb1/exports/extension_64k/code_64k1k.json new file mode 100644 index 000000000..bb1ca8974 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/code_64k1k.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1804919d069fb037802c0d97605fb8bc6b12050f242f9ca00fc7aa7f372db81b +size 788105 diff --git a/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json b/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json new file mode 100644 index 000000000..73beb4b57 --- /dev/null +++ b/datasets/isb1/exports/extension_64k/code_64k1k_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9513a2d11519546a701d6b2889cbf18b01f5ba36abc3b6f8fb34669566e6c311 +size 200074 diff --git a/datasets/isb1/exports/preview/long_context_1m/README.md b/datasets/isb1/exports/preview/long_context_1m/README.md new file mode 100644 index 000000000..3e5ea5af9 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/README.md @@ -0,0 +1,33 @@ +# Gated 1M-class Qwen3.5 preview lane + +This directory carries the committed InferenceX-side Qwen3.5 artifacts for a +bounded `1M`-class ISB1 coding replay preview. + +## What these files are + +- dedicated replay bundles restricted to `qwen3_5_397b_a17b` +- producer cells for standalone `vllm` and standalone `sglang` +- committed bundle coverage for `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` +- restricted to `ulc2_1m_plus` +- restricted to `support_status=reviewed_preview` at the selected export-cell level +- restricted to `benchmark_certification_status=dataset_replay_verified` +- exposed downstream only through the separate manual config + `.github/configs/isb1-qwen-1m-preview.yaml` +- explicit `max-model-len: 1048576` when the manual config is used + +## Current claim boundary + +These files are committed preview artifacts plus a gated/manual validation path. +They do **not** imply ordinary runnable ISB1 support in `isb1-master.yaml`. + +Safe wording: +- InferenceX carries bounded 1M-class Qwen3.5 replay preview artifacts. +- InferenceX carries a separate gated/manual Qwen3.5 1M validation path. + +Unsafe wording: +- native 1M served-lane support +- ordinary/general runnable consumer support +- KV-offload certification + +See `manifest.json` for the exact preview boundary and +`.github/configs/isb1-qwen-1m-preview.yaml` for the manual validation surface. 
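+
+As a pre-flight guard for the manual path, it can help to assert that a bundle
+actually sits inside this boundary before launching a run. This is a minimal
+sketch only: the cell field names (`context_band`, `support_status`,
+`benchmark_certification_status`) are assumptions inferred from the manifest
+wording, not a documented export schema.
+
+```python
+# Hypothetical boundary check for a 1M preview bundle (field names assumed).
+import json
+import sys
+
+with open(sys.argv[1], encoding="utf-8") as f:  # bundle must be pulled from LFS first
+    payload = json.load(f)
+
+cells = payload.get("exports", [])
+for cell in cells:
+    assert cell.get("context_band") == "ulc2_1m_plus", cell
+    assert cell.get("support_status") == "reviewed_preview", cell
+    assert cell.get("benchmark_certification_status") == "dataset_replay_verified", cell
+print(f"OK: {len(cells)} cells inside the 1M preview boundary")
+```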
diff --git a/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json new file mode 100644 index 000000000..a37edd86a --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd16cc4de821cf4803d662e4c5091359b7a5b2b730d03c976eb331be0cd6b1cb +size 286074 diff --git a/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json new file mode 100644 index 000000000..5fd23f78c --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35572a38f071d240519f7fdbd60aa203eb4832d835df97a8a5ef874d5d402456 +size 122465512 diff --git a/datasets/isb1/exports/preview/long_context_1m/manifest.json b/datasets/isb1/exports/preview/long_context_1m/manifest.json new file mode 100644 index 000000000..3c1cfb8db --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_1m/manifest.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e05e30fc8eddf2dd35b21b0575af6943428b2ab7e6ebe5a3df257d0344ad8b +size 2445 diff --git a/datasets/isb1/exports/preview/long_context_500k/README.md b/datasets/isb1/exports/preview/long_context_500k/README.md new file mode 100644 index 000000000..8efb153d5 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/README.md @@ -0,0 +1,45 @@ +# Bounded 500k-class preview lanes + +This directory carries the smallest honest InferenceX consumer handoff for bounded +`500k`-class ISB1 coding replay paths. + +## What these files are + +- dedicated replay bundles derived from committed `131k1k` extension exports +- restricted to `gpt_oss_120b` or `qwen3_5_397b_a17b` +- restricted to `xlc2_384k_512k` +- restricted to standalone `vllm` and standalone `sglang` +- restricted to `nvidia:b200_sxm_180gb`, `nvidia:h100_sxm_80gb`, and `nvidia:h200_sxm_141gb` +- restricted to `support_status=reviewed_preview` +- restricted to `benchmark_certification_status=dataset_replay_verified` +- wired in the consumer with explicit `max-model-len: 524288` + +## What these files are not + +- not a native InferenceX `500k+` served lane +- not a native InferenceX `1M+` served lane +- not a supported-tier long-context expansion +- not a chat preview lane +- not an offload-depth lane +- not a KV-offload certification claim + +## Why the files exist + +The existing `extension_131k/*/code_131k1k.json` and model-scoped +`code_131k1k_qwen3.5.json` bundles already contain honest `xlc2_384k_512k` +replay cells, but they are mixed with lower-band cells. The InferenceX workflow +selects rows by runtime, hardware, model, and support tier — not by +`context_band`. + +These dedicated files isolate only the `xlc2_384k_512k` rows so InferenceX can +run bounded `500k`-class previews without over-selecting lower-band cells. 
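+
+For illustration, the derivation described above amounts to a band filter over a
+mixed `131k1k` bundle. A minimal sketch, with two loud caveats: the
+`context_band` cell field is an assumption inferred from the manifest wording,
+and the output filename is hypothetical.
+
+```python
+# Hypothetical re-derivation of a 500k-class preview bundle from a mixed export.
+import json
+
+src = "datasets/isb1/exports/extension_131k/code_131k1k.json"  # pull from LFS first
+with open(src, encoding="utf-8") as f:
+    payload = json.load(f)
+
+filtered = dict(payload)
+filtered["exports"] = [
+    cell for cell in payload.get("exports", [])
+    if cell.get("context_band") == "xlc2_384k_512k"
+]
+
+with open("code_xlc2_500k_preview.json", "w", encoding="utf-8") as f:
+    json.dump(filtered, f, indent=2)
+```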
+ +## Consumer contract + +- `isb1-master.yaml` pins these rows as `reviewed_preview` +- `isb1-master.yaml` pins `max-model-len: 524288` +- current search space is intentionally bounded to single-concurrency preview execution +- result processing preserves `context_bands`, `profile_id`, and the producer handoff claim boundary + +See `manifest.json` for the GPT-OSS derivation record and `manifest_qwen3.5.json` +for the Qwen derivation record. diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..ed88496d8 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e200fb08b06dffc83189c393c0711e090cf8f579c719e69512e2fcfb3933e33 +size 153848 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..37f8e26a2 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa883fbca2ea93ec4d3cb748265a1c66e98554c658d8a0e51ed877a95e7faf1 +size 150709 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..f996cc838 --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5102d06da0cf4adfc640f1206cb26812369150d888165813012fe85183fec35 +size 157679 diff --git a/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json new file mode 100644 index 000000000..00046987f --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18faa3c3271f2f1acf3892379d3e1d13f1e0e6e1bbefdf00e5e7c5cb54bb3c72 +size 32685533 diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest.json b/datasets/isb1/exports/preview/long_context_500k/manifest.json new file mode 100644 index 000000000..deae83d6d --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/manifest.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fb9e807a7f1c9df7cc0244309f594561913d05aeff434eb3d3e1ee322e0ffd5 +size 2344 diff --git a/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json new file mode 100644 index 000000000..aed23b2db --- /dev/null +++ b/datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:99682e56f2fff3506c27ce5b1e3c61273b7a0bdf9abf70e9a254b4af1cf2b936 +size 2303 diff --git a/datasets/isb1/scripts/adapt_trace_replay_result.py b/datasets/isb1/scripts/adapt_trace_replay_result.py new file mode 100644 index 000000000..445ab7d9c --- /dev/null +++ b/datasets/isb1/scripts/adapt_trace_replay_result.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +from pathlib import Path +from statistics import mean +from typing import Any + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + ordered = sorted(values) + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def _read_csv_rows(path: Path) -> list[dict[str, str]]: + with path.open("r", encoding="utf-8", newline="") as handle: + return list(csv.DictReader(handle)) + + +def _pick(row: dict[str, str], *keys: str) -> float | None: + for key in keys: + if key in row: + value = _to_float(row.get(key)) + if value is not None: + return value + return None + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Adapt kv-cache trace replay CSV output into ISB1 replay JSON schema" + ) + parser.add_argument("--input-dir", default="/workspace", help="Directory containing trace replay outputs") + parser.add_argument( + "--detailed-csv", + default="detailed_results.csv", + help="Detailed replay CSV filename (inside --input-dir)", + ) + parser.add_argument( + "--summary-json", + default=None, + help="Optional summary JSON path (used as supplemental source if present)", + ) + parser.add_argument("--output-json", required=True, help="Output adapted replay JSON path") + parser.add_argument("--model-id", default="", help="Model ID for output metadata") + parser.add_argument("--max-concurrency", type=int, default=1, help="Max concurrency used") + parser.add_argument("--request-mode", default="multi-turn", help="Request mode metadata") + parser.add_argument( + "--benchmark-certification-status", + default="dataset_replay_verified", + help="Benchmark certification status to stamp in selection", + ) + parser.add_argument( + "--support-status", + default="reviewed_preview", + help="Support status to stamp in selection", + ) + parser.add_argument( + "--result-stem", + default="", + help="Optional result stem to infer total wall time from /workspace/.json", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + input_dir = Path(args.input_dir) + detailed_csv_path = input_dir / args.detailed_csv + output_path = Path(args.output_json) + + if not detailed_csv_path.exists(): + raise SystemExit(f"Missing detailed CSV: {detailed_csv_path}") + + rows = _read_csv_rows(detailed_csv_path) + ttft_ms: list[float] = [] + tpot_ms: list[float] = [] + output_tokens: list[float] = [] + prompt_tokens: list[float] = [] + session_ids: set[str] = set() + + for row in rows: + ttft = _pick(row, "ttft_ms", "ttft", "time_to_first_token_ms") + if ttft is not None: + ttft_ms.append(ttft) + + tpot = _pick(row, "tpot_ms", "tpot", "time_per_output_token_ms") + if tpot is not None: + tpot_ms.append(tpot) + + out_tok = _pick(row, "output_tokens", "generated_tokens", "completion_tokens") + if out_tok 
is not None: + output_tokens.append(out_tok) + + in_tok = _pick(row, "input_tokens", "prompt_tokens", "content_token_count") + if in_tok is not None: + prompt_tokens.append(in_tok) + + for key in ("session_id", "session", "conversation_id"): + sid = row.get(key) + if sid: + session_ids.add(str(sid)) + break + + completed_sessions = len(session_ids) if session_ids else len(rows) + total_sessions = completed_sessions + + total_output_tokens = sum(output_tokens) + total_prompt_tokens = sum(prompt_tokens) + total_token_count = total_output_tokens + total_prompt_tokens + + total_wall_time_s = 0.0 + if args.result_stem: + maybe_summary = input_dir / f"{args.result_stem}.json" + if maybe_summary.exists(): + try: + summary = json.loads(maybe_summary.read_text(encoding="utf-8")) + total_wall_time_s = float( + _to_float(summary.get("test_duration_seconds")) + or _to_float(summary.get("duration_s")) + or _to_float(summary.get("total_duration_s")) + or 0.0 + ) + except Exception: + total_wall_time_s = 0.0 + + if total_wall_time_s <= 0 and args.summary_json: + summary_path = Path(args.summary_json) + if summary_path.exists(): + try: + summary = json.loads(summary_path.read_text(encoding="utf-8")) + total_wall_time_s = float( + _to_float(summary.get("test_duration_seconds")) + or _to_float(summary.get("duration_s")) + or _to_float(summary.get("total_duration_s")) + or 0.0 + ) + except Exception: + total_wall_time_s = 0.0 + + if total_wall_time_s <= 0: + total_wall_time_s = 1.0 + + aggregate_metrics = { + "total_token_throughput_tps": total_token_count / total_wall_time_s, + "output_throughput_tps": total_output_tokens / total_wall_time_s, + "mean_ttft_ms": mean(ttft_ms) if ttft_ms else 0.0, + "median_ttft_ms": _percentile(ttft_ms, 0.50), + "p99_ttft_ms": _percentile(ttft_ms, 0.99), + "mean_tpot_ms": mean(tpot_ms) if tpot_ms else 0.0, + "median_tpot_ms": _percentile(tpot_ms, 0.50), + "p99_tpot_ms": _percentile(tpot_ms, 0.99), + "completed_sessions": completed_sessions, + "total_sessions": total_sessions, + "session_throughput_sps": completed_sessions / total_wall_time_s, + "total_wall_time_s": total_wall_time_s, + } + + adapted = { + "model_id": args.model_id, + "max_concurrency": args.max_concurrency, + "request_mode": args.request_mode, + "harness_request_mode": "auto", + "aggregate_metrics": aggregate_metrics, + "selection": { + "support_statuses": [args.support_status], + "benchmark_certification_statuses": [args.benchmark_certification_status], + }, + "server_metrics_summary": { + "observability_status": "unavailable", + "gpu_cache_metric_name": None, + "cpu_cache_metric_name": None, + "gpu_cache_usage_peak": 0.0, + "cpu_cache_usage_peak": 0.0, + "preemption_count": 0, + "kv_offload_observed": False, + "cpu_cache_metric_available": False, + }, + "depth_telemetry": { + "total_actual_input_tokens": int(total_prompt_tokens), + "max_actual_context_len_per_turn": int(max(prompt_tokens) if prompt_tokens else 0), + }, + "num_sessions": total_sessions, + "max_turns": None, + "per_turn_metrics": {}, + } + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(adapted, indent=2, sort_keys=True), encoding="utf-8") + print(f"Wrote adapted replay JSON: {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/analyze_benchmark_distributions.py b/datasets/isb1/scripts/analyze_benchmark_distributions.py new file mode 100644 index 000000000..06c5a65f1 --- /dev/null +++ 
b/datasets/isb1/scripts/analyze_benchmark_distributions.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze ISL/OSL/turn distributions for ISB1 exports or kv-cache traces") + parser.add_argument("--export-file", default=None, help="ISB1 export JSON file") + parser.add_argument("--trace-dir", default=None, help="kv-cache-tester trace directory") + parser.add_argument("--output-dir", required=True, help="Output directory") + return parser.parse_args() + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + ordered = sorted(values) + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def _histogram(values: list[int], bins: list[int]) -> dict[str, int]: + counts: dict[str, int] = {} + for value in values: + placed = False + prev = 0 + for bound in bins: + if value <= bound: + key = f"{prev + 1}-{bound}" + counts[key] = counts.get(key, 0) + 1 + placed = True + break + prev = bound + if not placed: + key = f">{bins[-1]}" + counts[key] = counts.get(key, 0) + 1 + return counts + + +def _extract_isb1(export_payload: dict[str, Any]) -> tuple[list[int], list[int], list[int]]: + isl: list[int] = [] + osl: list[int] = [] + turns_per_session: list[int] = [] + + for cell in export_payload.get("exports", []): + session = cell.get("session") or {} + turns = session.get("turns") or [] + turns_per_session.append(len(turns)) + for turn in turns: + input_tokens = ( + turn.get("actual_input_tokens") + or turn.get("content_token_count") + or turn.get("prompt_tokens") + or turn.get("input_tokens") + or 0 + ) + output_tokens = ( + turn.get("expected_output_tokens") + or turn.get("target_output_tokens") + or turn.get("output_tokens") + or 0 + ) + try: + isl.append(int(input_tokens)) + except Exception: + isl.append(0) + try: + osl.append(int(output_tokens)) + except Exception: + osl.append(0) + + return isl, osl, turns_per_session + + +def _extract_trace_dir(trace_dir: Path) -> tuple[list[int], list[int], list[int]]: + isl: list[int] = [] + osl: list[int] = [] + turns_per_session: list[int] = [] + + files = list(sorted(trace_dir.glob("*.json"))) + if not files: + raise SystemExit(f"No JSON traces found in {trace_dir}") + + for path in files: + payload = json.loads(path.read_text(encoding="utf-8")) + sessions = payload.get("sessions") or [] + for session in sessions: + turns = session.get("turns") or [] + turns_per_session.append(len(turns)) + for turn in turns: + isl.append(int(turn.get("content_token_count", 0) or 0)) + osl.append(int(turn.get("target_output_tokens", 0) or 0)) + + return isl, osl, turns_per_session + + +def build_report(isl: list[int], osl: list[int], turns_per_session: list[int], source: str) -> dict[str, Any]: + return { + "source": source, + "num_sessions": len(turns_per_session), + "num_turns": len(isl), + "isl": { + "p50": _percentile([float(x) for x in isl], 0.50), + "p95": _percentile([float(x) for x in isl], 0.95), + "max": max(isl) if isl else 0, + "histogram": _histogram(isl, [1024, 4096, 8192, 16384, 32768, 65536]), + }, + "osl": { + "p50": _percentile([float(x) for x in osl], 0.50), + "p95": _percentile([float(x) for x in osl], 0.95), + "max": max(osl) if osl else 0, + "histogram": 
_histogram(osl, [64, 128, 256, 512, 1024, 2048, 4096]), + }, + "turns_per_session": { + "p50": _percentile([float(x) for x in turns_per_session], 0.50), + "p95": _percentile([float(x) for x in turns_per_session], 0.95), + "max": max(turns_per_session) if turns_per_session else 0, + "histogram": _histogram(turns_per_session, [2, 4, 8, 16, 32]), + }, + } + + +def main() -> int: + args = parse_args() + if bool(args.export_file) == bool(args.trace_dir): + raise SystemExit("Provide exactly one of --export-file or --trace-dir") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if args.export_file: + export_path = Path(args.export_file) + payload = json.loads(export_path.read_text(encoding="utf-8")) + isl, osl, turns_per_session = _extract_isb1(payload) + report = build_report(isl, osl, turns_per_session, source=str(export_path)) + else: + trace_dir = Path(args.trace_dir) + isl, osl, turns_per_session = _extract_trace_dir(trace_dir) + report = build_report(isl, osl, turns_per_session, source=str(trace_dir)) + + output_path = output_dir / "distribution_report.json" + output_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8") + print(f"Wrote: {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/collect_sweep_results.py b/datasets/isb1/scripts/collect_sweep_results.py new file mode 100644 index 000000000..0d7155428 --- /dev/null +++ b/datasets/isb1/scripts/collect_sweep_results.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Aggregate sweep results from DB or agg_*.json directory") + parser.add_argument("--db-path", default=None, help="SQLite DB path") + parser.add_argument("--json-dir", default=None, help="Directory containing agg_*.json files") + parser.add_argument("--output-dir", required=True, help="Output directory") + parser.add_argument("--cliff-ttft-ms", type=float, default=5000.0, help="TTFT p99 threshold for capacity cliff") + return parser.parse_args() + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def collect_from_db(db_path: Path) -> list[dict[str, Any]]: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + """ + SELECT offload_mode, throughput_tok_s, ttft_p99_ms, max_concurrency, raw_result_json + FROM benchmark_runs + WHERE offload_mode IS NOT NULL + ORDER BY id ASC + """ + ).fetchall() + conn.close() + + out: list[dict[str, Any]] = [] + for row in rows: + concurrency = row["max_concurrency"] + if concurrency in (None, "") and row["raw_result_json"]: + try: + payload = json.loads(row["raw_result_json"]) + concurrency = payload.get("conc") or payload.get("max_concurrency") + except Exception: + pass + out.append( + { + "offload_mode": row["offload_mode"], + "concurrency": _to_int(concurrency), + "throughput_tok_s": _to_float(row["throughput_tok_s"]), + "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), + "source": "db", + } + ) + return out + + +def collect_from_json_dir(json_dir: Path) -> 
list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for path in sorted(json_dir.glob("agg_*.json")): + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + continue + rows.append( + { + "offload_mode": payload.get("offload_mode"), + "concurrency": _to_int(payload.get("conc") or payload.get("max_concurrency")), + "throughput_tok_s": _to_float(payload.get("throughput_tok_s") or payload.get("tput_per_gpu")), + "ttft_p99_ms": _to_float(payload.get("ttft_p99_ms") or payload.get("p99_ttft_ms")), + "source": str(path.name), + } + ) + return rows + + +def compute_capacity_cliff(rows: list[dict[str, Any]], threshold_ms: float) -> dict[str, Any]: + cliff: dict[str, Any] = {} + for mode in sorted({row.get("offload_mode") for row in rows if row.get("offload_mode")}): + mode_rows = sorted( + [r for r in rows if r.get("offload_mode") == mode and r.get("concurrency") is not None], + key=lambda r: r["concurrency"], + ) + cliff_row = None + for row in mode_rows: + if (row.get("ttft_p99_ms") or 0.0) > threshold_ms: + cliff_row = row + break + cliff[str(mode)] = cliff_row + return cliff + + +def compute_offload_benefit(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + by_conc: dict[int, dict[str, dict[str, Any]]] = {} + for row in rows: + conc = row.get("concurrency") + mode = row.get("offload_mode") + if conc is None or mode is None: + continue + by_conc.setdefault(int(conc), {})[str(mode)] = row + + deltas: list[dict[str, Any]] = [] + for conc in sorted(by_conc): + modes = by_conc[conc] + on = modes.get("on") + off = modes.get("off") + if not on or not off: + continue + on_tput = on.get("throughput_tok_s") or 0.0 + off_tput = off.get("throughput_tok_s") or 0.0 + deltas.append( + { + "concurrency": conc, + "throughput_on": on_tput, + "throughput_off": off_tput, + "offload_benefit_delta_tps": on_tput - off_tput, + } + ) + return deltas + + +def write_csv(path: Path, rows: list[dict[str, Any]]) -> None: + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms", "source"]) + for row in rows: + writer.writerow([ + row.get("offload_mode"), + row.get("concurrency"), + row.get("throughput_tok_s"), + row.get("ttft_p99_ms"), + row.get("source"), + ]) + + +def main() -> int: + args = parse_args() + if not args.db_path and not args.json_dir: + raise SystemExit("Provide --db-path or --json-dir") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + rows: list[dict[str, Any]] = [] + if args.db_path: + rows.extend(collect_from_db(Path(args.db_path))) + if args.json_dir: + rows.extend(collect_from_json_dir(Path(args.json_dir))) + + summary = { + "num_rows": len(rows), + "capacity_cliff": compute_capacity_cliff(rows, args.cliff_ttft_ms), + "offload_benefit": compute_offload_benefit(rows), + "rows": rows, + } + + json_path = output_dir / "sweep_aggregate.json" + csv_path = output_dir / "sweep_aggregate.csv" + json_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") + write_csv(csv_path, rows) + + print(f"Wrote: {json_path}") + print(f"Wrote: {csv_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/generate_qwen35_low_band_exports.py b/datasets/isb1/scripts/generate_qwen35_low_band_exports.py new file mode 100755 index 000000000..51be8b531 --- /dev/null +++ b/datasets/isb1/scripts/generate_qwen35_low_band_exports.py @@ -0,0 +1,98 @@ 
+#!/usr/bin/env python3 +"""Generate dedicated Qwen 3.5 ISB1 export bundles for 8k/32k/64k lanes. + +These files are derived from the committed generic export bundles by selecting only +GPT-OSS cells that are actually runnable (`supported` or `reviewed_preview`), then +rewriting model identity fields to the Qwen 3.5 replay identity while keeping trace +payloads unchanged. +""" + +from __future__ import annotations + +import json +from copy import deepcopy +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[3] +EXPORT_ROOT = ROOT / "datasets" / "isb1" / "exports" + +QWEN_MODEL_ID = "qwen3_5_397b_a17b" +GPTOSS_MODEL_ID = "gpt_oss_120b" +ALLOWED_SUPPORT_STATUSES = {"supported", "reviewed_preview"} + +TARGETS = [ + ("core", "8k1k", "chat", "vllm"), + ("core", "8k1k", "chat", "sglang"), + ("core", "8k1k", "code", "vllm"), + ("core", "8k1k", "code", "sglang"), + ("extension_32k", "32k1k", "chat", "vllm"), + ("extension_32k", "32k1k", "chat", "sglang"), + ("extension_32k", "32k1k", "code", "vllm"), + ("extension_32k", "32k1k", "code", "sglang"), + ("extension_64k", "64k1k", "chat", "vllm"), + ("extension_64k", "64k1k", "chat", "sglang"), + ("extension_64k", "64k1k", "code", "vllm"), + ("extension_64k", "64k1k", "code", "sglang"), +] + + +def _source_path(lane: str, shape: str, surface: str, engine: str) -> Path: + return EXPORT_ROOT / lane / engine / f"{surface}_{shape}.json" + + +def _target_path(lane: str, shape: str, surface: str, engine: str) -> Path: + return EXPORT_ROOT / lane / engine / f"{surface}_{shape}_qwen3.5.json" + + +def _rewrite_bundle_id(bundle_id: str, lane: str, engine: str, surface: str, shape: str) -> str: + expected_prefix = f"isb1_{lane}_{engine}_{surface}_{shape}" + if bundle_id != expected_prefix: + raise ValueError( + f"Unexpected bundle_id {bundle_id!r}; expected {expected_prefix!r} for {lane}/{engine}/{surface}_{shape}" + ) + return f"{bundle_id}_qwen3_5" + + +def _rewrite_cell(cell: dict) -> dict: + rewritten = deepcopy(cell) + rewritten["canonical_model_id"] = QWEN_MODEL_ID + rewritten["thinking_history_policy"] = "strip_reasoning" + rewritten["history_projection_mode"] = "strip_reasoning_history" + rewritten["support_status"] = "reviewed_preview" + return rewritten + + +def build_export(lane: str, shape: str, surface: str, engine: str) -> tuple[Path, int]: + source_path = _source_path(lane, shape, surface, engine) + target_path = _target_path(lane, shape, surface, engine) + + payload = json.loads(source_path.read_text()) + exports = payload.get("exports") + if not isinstance(exports, list): + raise ValueError(f"Missing exports list in {source_path}") + + filtered = [ + _rewrite_cell(cell) + for cell in exports + if cell.get("canonical_model_id") == GPTOSS_MODEL_ID + and cell.get("support_status") in ALLOWED_SUPPORT_STATUSES + ] + if not filtered: + raise ValueError(f"No runnable GPT-OSS cells found in {source_path}") + + payload["bundle_id"] = _rewrite_bundle_id(payload.get("bundle_id"), lane, engine, surface, shape) + payload["exports"] = filtered + + target_path.write_text(json.dumps(payload, indent=2) + "\n") + return target_path, len(filtered) + + +def main() -> int: + for lane, shape, surface, engine in TARGETS: + target_path, count = build_export(lane, shape, surface, engine) + print(f"wrote {target_path.relative_to(ROOT)} ({count} cells)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_analyze_sweep.py b/datasets/isb1/scripts/gmi_analyze_sweep.py new file mode 100644 index 
000000000..d0c3465b2 --- /dev/null +++ b/datasets/isb1/scripts/gmi_analyze_sweep.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +import subprocess +import sys +from pathlib import Path +from statistics import median +from typing import Any + +from isb1_results_db import render_table + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Analyze KV sweep runs from ISB1 SQLite results.") + parser.add_argument("--db-path", required=True, help="Path to SQLite DB (isb1_results.db)") + parser.add_argument("--output-dir", default=".", help="Directory to write summary outputs") + parser.add_argument("--pareto", action="store_true", help="Also run plot_pareto.py") + parser.add_argument( + "--distributions", + action="store_true", + help="Also run analyze_benchmark_distributions.py", + ) + parser.add_argument("--export-file", default=None, help="Export JSON for --distributions") + parser.add_argument("--trace-dir", default=None, help="Trace directory for --distributions") + return parser.parse_args() + + +def _to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def _extract_concurrency(raw_result_json: str | None) -> int | None: + if not raw_result_json: + return None + try: + payload = json.loads(raw_result_json) + except json.JSONDecodeError: + return None + return _to_int(payload.get("conc") or payload.get("max_concurrency")) + + +def percentile(values: list[float], p: float) -> float | None: + if not values: + return None + ordered = sorted(values) + if len(ordered) == 1: + return ordered[0] + idx = (len(ordered) - 1) * p + lo = int(idx) + hi = min(lo + 1, len(ordered) - 1) + frac = idx - lo + return ordered[lo] * (1 - frac) + ordered[hi] * frac + + +def load_rows(db_path: Path) -> list[dict[str, Any]]: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + """ + SELECT + id, + offload_mode, + ttft_p50_ms, + ttft_p99_ms, + throughput_tok_s, + preemption_count, + status, + raw_result_json + FROM benchmark_runs + WHERE offload_mode IS NOT NULL + ORDER BY id ASC + """ + ).fetchall() + conn.close() + + normalized: list[dict[str, Any]] = [] + for row in rows: + concurrency = _extract_concurrency(row["raw_result_json"]) + normalized.append( + { + "offload_mode": row["offload_mode"], + "concurrency": concurrency, + "ttft_p50_ms": _to_float(row["ttft_p50_ms"]), + "ttft_p99_ms": _to_float(row["ttft_p99_ms"]), + "throughput_tok_s": _to_float(row["throughput_tok_s"]), + "preemption_count": _to_int(row["preemption_count"]) or 0, + "status": row["status"], + } + ) + return normalized + + +def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]: + grouped: dict[tuple[str, int], list[dict[str, Any]]] = {} + for row in rows: + if row["concurrency"] is None: + continue + key = (row["offload_mode"], row["concurrency"]) + grouped.setdefault(key, []).append(row) + + summary_rows: list[dict[str, Any]] = [] + for (offload_mode, concurrency), items in sorted(grouped.items(), key=lambda x: (x[0][0], x[0][1])): + ttft_p50_values = [x["ttft_p50_ms"] for x in items if x["ttft_p50_ms"] is not None] + ttft_p99_values = [x["ttft_p99_ms"] for x in items if x["ttft_p99_ms"] is not None] + 
throughput_values = [x["throughput_tok_s"] for x in items if x["throughput_tok_s"] is not None] + preemptions = [x["preemption_count"] for x in items] + success_count = sum(1 for x in items if x["status"] == "success") + + summary_rows.append( + { + "offload_mode": offload_mode, + "concurrency": concurrency, + "runs": len(items), + "success_runs": success_count, + "ttft_p50_ms": median(ttft_p50_values) if ttft_p50_values else None, + "ttft_p99_ms": percentile(ttft_p99_values, 0.99), + "throughput_tok_s": median(throughput_values) if throughput_values else None, + "preemptions": int(median(preemptions)) if preemptions else 0, + } + ) + + return { + "total_rows": len(rows), + "grouped_rows": len(summary_rows), + "summary": summary_rows, + } + + +def write_summary_json(output_dir: Path, summary: dict[str, Any]) -> Path: + output_path = output_dir / "sweep_summary.json" + output_path.write_text(json.dumps(summary, indent=2)) + return output_path + + +def write_pareto_csv(output_dir: Path, summary: dict[str, Any]) -> Path: + output_path = output_dir / "pareto_data.csv" + with output_path.open("w", newline="") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms"]) + for row in summary["summary"]: + writer.writerow( + [ + row["offload_mode"], + row["concurrency"], + row["throughput_tok_s"], + row["ttft_p99_ms"], + ] + ) + return output_path + + +def print_console_summary(summary: dict[str, Any]) -> None: + headers = [ + "offload_mode", + "concurrency", + "runs", + "success_runs", + "ttft_p50_ms", + "ttft_p99_ms", + "throughput_tok_s", + "preemptions", + ] + rows = [ + [ + row["offload_mode"], + row["concurrency"], + row["runs"], + row["success_runs"], + row["ttft_p50_ms"], + row["ttft_p99_ms"], + row["throughput_tok_s"], + row["preemptions"], + ] + for row in summary["summary"] + ] + + print(f"Total rows: {summary['total_rows']}") + print(f"Grouped rows: {summary['grouped_rows']}") + if rows: + print(render_table(headers, rows)) + else: + print("No sweep rows with offload_mode + concurrency found.") + + +def main() -> int: + args = parse_args() + db_path = Path(args.db_path) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + rows = load_rows(db_path) + summary = summarize(rows) + summary_path = write_summary_json(output_dir, summary) + pareto_path = write_pareto_csv(output_dir, summary) + + print_console_summary(summary) + print(f"Wrote: {summary_path}") + print(f"Wrote: {pareto_path}") + + script_dir = Path(__file__).resolve().parent + + if args.pareto: + pareto_cmd = [ + sys.executable, + str(script_dir / "plot_pareto.py"), + "--db-path", + str(db_path), + "--output-dir", + str(output_dir), + ] + subprocess.run(pareto_cmd, check=True) + + if args.distributions: + dist_cmd = [ + sys.executable, + str(script_dir / "analyze_benchmark_distributions.py"), + "--output-dir", + str(output_dir), + ] + if args.export_file: + dist_cmd.extend(["--export-file", args.export_file]) + elif args.trace_dir: + dist_cmd.extend(["--trace-dir", args.trace_dir]) + else: + raise SystemExit("--distributions requires --export-file or --trace-dir") + subprocess.run(dist_cmd, check=True) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/gmi_full_suite.sh b/datasets/isb1/scripts/gmi_full_suite.sh new file mode 100755 index 000000000..fad23efc1 --- /dev/null +++ b/datasets/isb1/scripts/gmi_full_suite.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash +set -Eeuo 
pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh" + +usage() { + echo "Usage: gmi_full_suite.sh --gpu-type <h100|h200|b200> [--db-path <path>]" +} + +GPU_TYPE="" +DB_PATH="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) + GPU_TYPE="$2" + shift 2 + ;; + --db-path) + DB_PATH="$2" + shift 2 + ;; + --help|-h) + usage + exit 0 + ;; + *) + echo "Unknown: $1" >&2 + exit 1 + ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || { + usage >&2 + exit 1 +} + +case "$GPU_TYPE" in + h100|h200|b200) ;; + *) + echo "Unsupported --gpu-type: $GPU_TYPE" >&2 + exit 1 + ;; +esac + +[[ -x "$PORTABLE_SCRIPT" ]] || { + echo "Expected executable helper at $PORTABLE_SCRIPT" >&2 + exit 1 +} + +if [[ -n "$DB_PATH" ]]; then + export ISB1_RESULTS_DB_PATH="$DB_PATH" +fi + +PASSED=0 +FAILED=0 +SKIPPED=0 + +run_combo() { + local model="$1" + local engine="$2" + local band="$3" + local workload="${4:-code}" + + echo "=========================================" + echo ">>> $model × $engine × $band × $workload on $GPU_TYPE" + echo "=========================================" + + if "$PORTABLE_SCRIPT" \ + --gpu-type "$GPU_TYPE" \ + --model "$model" \ + --engine "$engine" \ + --context-band "$band" \ + --workload "$workload"; then + ((PASSED++)) || true + else + echo "FAILED: $model × $engine × $band × $workload" >&2 + ((FAILED++)) || true + fi +} + +# Core 8k — all models × all engines × chat + code +for model in qwen3.5 gptoss dsr1; do + for engine in vllm sglang; do + for workload in chat code; do + run_combo "$model" "$engine" 8k "$workload" + done + done +done + +# 131k — all models × all engines × chat + code +for model in qwen3.5 gptoss dsr1; do + for engine in vllm sglang; do + for workload in chat code; do + run_combo "$model" "$engine" 131k "$workload" + done + done +done + +# 500k — qwen3.5 + gptoss only (DSR1 max context=164k, exceeds model capability) +for model in qwen3.5 gptoss; do + for engine in vllm sglang; do + for workload in chat code; do + run_combo "$model" "$engine" 500k "$workload" + done + done +done + +# 1m — qwen3.5 only (only model supporting 1M context), b200 only +if [[ "$GPU_TYPE" == "b200" ]]; then + for engine in vllm sglang; do + for workload in chat code; do + run_combo qwen3.5 "$engine" 1m "$workload" + done + done +else + SKIPPED=4 +fi + +echo +echo "=========================================" +echo "SUITE COMPLETE: passed=$PASSED failed=$FAILED skipped=$SKIPPED" +echo "=========================================" + +if command -v python3 >/dev/null 2>&1; then + summary_cmd=(python3 "$SCRIPT_DIR/isb1_results_db.py" summary) + if [[ -n "$DB_PATH" ]]; then + summary_cmd+=(--db-path "$DB_PATH") + fi + "${summary_cmd[@]}" 2>/dev/null || true +fi + +[[ "$FAILED" -eq 0 ]] diff --git a/datasets/isb1/scripts/gmi_kv_sweep.sh b/datasets/isb1/scripts/gmi_kv_sweep.sh new file mode 100644 index 000000000..e953aba1a --- /dev/null +++ b/datasets/isb1/scripts/gmi_kv_sweep.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh" + +usage() { + cat <<'EOF' +Usage: + gmi_kv_sweep.sh \ + --gpu-type <h100|h200|b200> \ + --model <qwen3.5|gptoss|dsr1> \ + --engine <vllm|sglang> \ + --context-band <8k|32k|64k|131k|500k|1m> \ + --workload <chat|code> \ + [--users "2,4,8,16,32,64"] \ + [--offload-modes "on,off,noprefix"] \ + [--kv-cache-dtype <auto|fp8>] \ + [--benchmark-duration-s <seconds>] \ + [--disable-prefix-caching] \ + [--total-cpu-dram-gb <gb>] \ + [--trace-source <isb1|kv_cache_tester|aiperf>] \ + [--db-path <path>] +EOF +} + +die() { + echo "ERROR: $*" >&2 
+ exit 1 +} + +trim() { + local x="$1" + x="${x#${x%%[![:space:]]*}}" + x="${x%${x##*[![:space:]]}}" + printf '%s' "$x" +} + +GPU_TYPE="" +MODEL="" +ENGINE="" +CONTEXT_BAND="" +WORKLOAD="" +USERS="2,4,8,16,32,64" +OFFLOAD_MODES="on,off,noprefix" +KV_CACHE_DTYPE="" +BENCHMARK_DURATION_S="1800" +DISABLE_PREFIX_CACHING=false +TOTAL_CPU_DRAM_GB="" +TRACE_SOURCE="isb1" +DB_PATH="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) GPU_TYPE="$2"; shift 2 ;; + --model) MODEL="$2"; shift 2 ;; + --engine) ENGINE="$2"; shift 2 ;; + --context-band) CONTEXT_BAND="$2"; shift 2 ;; + --workload) WORKLOAD="$2"; shift 2 ;; + --users) USERS="$2"; shift 2 ;; + --offload-modes) OFFLOAD_MODES="$2"; shift 2 ;; + --kv-cache-dtype) KV_CACHE_DTYPE="$2"; shift 2 ;; + --benchmark-duration-s) BENCHMARK_DURATION_S="$2"; shift 2 ;; + --disable-prefix-caching) DISABLE_PREFIX_CACHING=true; shift ;; + --total-cpu-dram-gb) TOTAL_CPU_DRAM_GB="$2"; shift 2 ;; + --trace-source) TRACE_SOURCE="$2"; shift 2 ;; + --db-path) DB_PATH="$2"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) die "Unknown argument: $1" ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required" +[[ -n "$MODEL" ]] || die "--model is required" +[[ -n "$ENGINE" ]] || die "--engine is required" +[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required" +[[ -n "$WORKLOAD" ]] || die "--workload is required" +[[ -x "$PORTABLE_SCRIPT" ]] || die "Expected executable script: $PORTABLE_SCRIPT" + +case "$ENGINE" in + vllm|sglang) ;; + *) die "Unsupported --engine: $ENGINE" ;; +esac + +case "$TRACE_SOURCE" in + isb1|kv_cache_tester|aiperf) ;; + *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; +esac + +IFS=',' read -r -a user_list <<< "$USERS" +IFS=',' read -r -a mode_list <<< "$OFFLOAD_MODES" + +[[ "${#user_list[@]}" -gt 0 ]] || die "--users cannot be empty" +[[ "${#mode_list[@]}" -gt 0 ]] || die "--offload-modes cannot be empty" + +TOTAL=0 +PASSED=0 +FAILED=0 + +for raw_mode in "${mode_list[@]}"; do + mode=$(trim "$raw_mode") + [[ -n "$mode" ]] || continue + + case "$mode" in + on|off|noprefix|legacy) ;; + *) die "Unsupported offload mode in --offload-modes: $mode" ;; + esac + + if [[ "$ENGINE" == "sglang" && "$mode" == "on" ]]; then + echo "Skipping mode=on for SGLang (no native offload support)" + continue + fi + + for raw_users in "${user_list[@]}"; do + users=$(trim "$raw_users") + [[ "$users" =~ ^[0-9]+$ ]] || die "Invalid user concurrency: $users" + + TOTAL=$((TOTAL + 1)) + echo "========================================================" + echo "Run $TOTAL: model=$MODEL engine=$ENGINE users=$users mode=$mode" + echo "========================================================" + + cmd=( + "$PORTABLE_SCRIPT" + --gpu-type "$GPU_TYPE" + --model "$MODEL" + --engine "$ENGINE" + --context-band "$CONTEXT_BAND" + --workload "$WORKLOAD" + --benchmark-type isb1_kv_stress + --benchmark-duration-s "$BENCHMARK_DURATION_S" + --max-concurrency "$users" + --trace-source "$TRACE_SOURCE" + --offload-mode "$mode" + ) + + if [[ -n "$KV_CACHE_DTYPE" ]]; then + cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE") + fi + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--disable-prefix-caching) + fi + if [[ -n "$TOTAL_CPU_DRAM_GB" ]]; then + cmd+=(--total-cpu-dram-gb "$TOTAL_CPU_DRAM_GB") + fi + if [[ -n "$DB_PATH" ]]; then + if ISB1_RESULTS_DB_PATH="$DB_PATH" "${cmd[@]}"; then + PASSED=$((PASSED + 1)) + echo "PASS users=$users mode=$mode" + else + FAILED=$((FAILED + 1)) + echo "FAIL users=$users mode=$mode" >&2 + fi + else + if "${cmd[@]}"; then + 
PASSED=$((PASSED + 1)) + echo "PASS users=$users mode=$mode" + else + FAILED=$((FAILED + 1)) + echo "FAIL users=$users mode=$mode" >&2 + fi + fi + done +done + +echo +echo "KV sweep complete" +echo " total: $TOTAL" +echo " passed: $PASSED" +echo " failed: $FAILED" + +if [[ -n "$DB_PATH" && -f "$DB_PATH" ]]; then + echo " db: $DB_PATH" +fi + +[[ "$FAILED" -eq 0 ]] diff --git a/datasets/isb1/scripts/gmi_portable_benchmark.sh b/datasets/isb1/scripts/gmi_portable_benchmark.sh new file mode 100755 index 000000000..f41722e36 --- /dev/null +++ b/datasets/isb1/scripts/gmi_portable_benchmark.sh @@ -0,0 +1,1019 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +usage() { + cat <<'EOF' +Usage: + gmi_portable_benchmark.sh \ + --gpu-type <h100|h200|b200> \ + --model <qwen3.5|gptoss|dsr1> \ + --engine <vllm|sglang> \ + --context-band <8k|32k|64k|131k|500k|1m> \ + --workload <chat|code> \ + [--benchmark-type <isb1_replay|isb1_kv_stress>] \ + [--offload-mode <on|off|noprefix|legacy>] \ + [--kv-cache-dtype <auto|fp8>] \ + [--disable-prefix-caching] \ + [--total-cpu-dram-gb <gb>] \ + [--benchmark-duration-s <seconds>] \ + [--max-concurrency <n>] \ + [--trace-source <isb1|kv_cache_tester|aiperf>] + +Required environment: + HF_TOKEN or HUGGING_FACE_HUB_TOKEN Hugging Face token for model access + +Optional environment: + PORT API port (default: 8000) + TP Tensor parallelism (default: 8) + HEALTH_TIMEOUT_S Readiness timeout in seconds (default: 1800) + HEALTH_POLL_INTERVAL_S Readiness poll interval (default: 10) + BENCHMARK_OUTPUT_ROOT Output root (default: <repo-root>/datasets/isb1/results/gmi) + GMI_RUN_LABEL Optional suffix added to result names +EOF +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || die "Missing required command: $1" +} + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +REPO_ROOT=$(cd "$SCRIPT_DIR/../../.." && pwd) +source "$REPO_ROOT/benchmarks/benchmark_lib.sh" +PORT=${PORT:-8000} +TP=${TP:-8} +HEALTH_TIMEOUT_S=${HEALTH_TIMEOUT_S:-1800} +HEALTH_POLL_INTERVAL_S=${HEALTH_POLL_INTERVAL_S:-10} +BENCHMARK_OUTPUT_ROOT=${BENCHMARK_OUTPUT_ROOT:-"$REPO_ROOT/datasets/isb1/results/gmi"} +REQUEST_MODE=multi-turn +HARNESS_REQUEST_MODE=auto +IGNORE_WAITS=true + +GPU_TYPE="" +MODEL_KEY="" +ENGINE="" +CONTEXT_BAND="" +WORKLOAD="" +BENCHMARK_TYPE="isb1_replay" +OFFLOAD_MODE="" +KV_CACHE_DTYPE="" +DISABLE_PREFIX_CACHING=false +TOTAL_CPU_DRAM_GB="" +BENCHMARK_DURATION_S="" +MAX_CONCURRENCY_OVERRIDE="" +TRACE_SOURCE="isb1" + +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) + GPU_TYPE="$2" + shift 2 + ;; + --model) + MODEL_KEY="$2" + shift 2 + ;; + --engine) + ENGINE="$2" + shift 2 + ;; + --context-band) + CONTEXT_BAND="$2" + shift 2 + ;; + --workload) + WORKLOAD="$2" + shift 2 + ;; + --benchmark-type) + BENCHMARK_TYPE="$2" + shift 2 + ;; + --offload-mode) + OFFLOAD_MODE="$2" + shift 2 + ;; + --kv-cache-dtype) + KV_CACHE_DTYPE="$2" + shift 2 + ;; + --disable-prefix-caching) + DISABLE_PREFIX_CACHING=true + shift + ;; + --total-cpu-dram-gb) + TOTAL_CPU_DRAM_GB="$2" + shift 2 + ;; + --benchmark-duration-s) + BENCHMARK_DURATION_S="$2" + shift 2 + ;; + --max-concurrency) + MAX_CONCURRENCY_OVERRIDE="$2" + shift 2 + ;; + --trace-source) + TRACE_SOURCE="$2" + shift 2 + ;; + -h|--help) + usage + exit 0 + ;; + *) + die "Unknown argument: $1" + ;; + esac +done + +[[ -n "$GPU_TYPE" ]] || die "--gpu-type is required" +[[ -n "$MODEL_KEY" ]] || die "--model is required" +[[ -n "$ENGINE" ]] || die "--engine is required" +[[ -n "$CONTEXT_BAND" ]] || die "--context-band is required" +[[ -n "$WORKLOAD" ]] || die "--workload is required" + +case "$GPU_TYPE" in + h100|h200|b200) ;; + *) die "Unsupported --gpu-type: $GPU_TYPE" ;; +esac + +case 
"$ENGINE" in + vllm|sglang) ;; + *) die "Unsupported --engine: $ENGINE" ;; +esac + +case "$CONTEXT_BAND" in + 8k|32k|64k|131k|500k|1m) ;; + *) die "Unsupported --context-band: $CONTEXT_BAND" ;; +esac + +case "$WORKLOAD" in + chat|code) ;; + *) die "Unsupported --workload: $WORKLOAD (must be chat or code)" ;; +esac + +case "$BENCHMARK_TYPE" in + isb1_replay|isb1_kv_stress) ;; + *) die "Unsupported --benchmark-type: $BENCHMARK_TYPE" ;; +esac + +case "$TRACE_SOURCE" in + isb1|kv_cache_tester|aiperf) ;; + *) die "Unsupported --trace-source: $TRACE_SOURCE" ;; +esac + +case "${OFFLOAD_MODE:-}" in + ""|on|off|noprefix|legacy) ;; + *) die "Unsupported --offload-mode: $OFFLOAD_MODE" ;; +esac + +case "${KV_CACHE_DTYPE:-}" in + ""|auto|fp8) ;; + *) die "Unsupported --kv-cache-dtype: $KV_CACHE_DTYPE" ;; +esac + +if [[ -n "$TOTAL_CPU_DRAM_GB" ]] && ! [[ "$TOTAL_CPU_DRAM_GB" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + die "--total-cpu-dram-gb must be numeric" +fi +if [[ -n "$MAX_CONCURRENCY_OVERRIDE" ]] && ! [[ "$MAX_CONCURRENCY_OVERRIDE" =~ ^[0-9]+$ ]]; then + die "--max-concurrency must be a positive integer" +fi +if [[ -n "$BENCHMARK_DURATION_S" ]] && ! [[ "$BENCHMARK_DURATION_S" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + die "--benchmark-duration-s must be numeric" +fi + +require_cmd docker +require_cmd curl +require_cmd python3 +require_cmd nvidia-smi + +HF_TOKEN_VALUE=${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-}} +[[ -n "$HF_TOKEN_VALUE" ]] || die "Set HF_TOKEN or HUGGING_FACE_HUB_TOKEN before running" + +if [[ -z "$TOTAL_CPU_DRAM_GB" ]]; then + if [[ -r /proc/meminfo ]]; then + TOTAL_CPU_DRAM_GB=$(awk '/MemTotal:/ {printf "%.0f", $2/1048576}' /proc/meminfo) + else + TOTAL_CPU_DRAM_GB=0 + fi +fi + +case "$MODEL_KEY" in + qwen3.5) + MODEL_HF_ID="Qwen/Qwen3.5-397B-A17B-FP8" + MODEL_PREFIX="qwen3.5" + CANONICAL_MODEL_ID="qwen3_5_397b_a17b" + PRECISION="fp8" + ;; + gptoss) + MODEL_HF_ID="openai/gpt-oss-120b" + MODEL_PREFIX="gptoss" + CANONICAL_MODEL_ID="gpt_oss_120b" + PRECISION="fp4" + ;; + dsr1) + MODEL_HF_ID="deepseek-ai/DeepSeek-R1-0528" + MODEL_PREFIX="dsr1" + CANONICAL_MODEL_ID="deepseek_r1_0528" + PRECISION="fp8" + ;; + *) + die "Unsupported --model: $MODEL_KEY" + ;; +esac + +case "$GPU_TYPE" in + b200) + HARDWARE_PROFILE_ID="nvidia:b200_sxm_180gb" + RUNNER_TYPE="b200-gmi-baremetal" + ;; + h100) + HARDWARE_PROFILE_ID="nvidia:h100_sxm_80gb" + RUNNER_TYPE="h100-gmi-baremetal" + ;; + h200) + HARDWARE_PROFILE_ID="nvidia:h200_sxm_141gb" + RUNNER_TYPE="h200-gmi-baremetal" + ;; +esac + +case "$ENGINE" in + vllm) + RUNTIME_STACK_ID="standalone:vllm" + if [[ "$GPU_TYPE" == "b200" ]]; then + IMAGE="vllm/vllm-openai:v0.19.0-cu130" + else + IMAGE="vllm/vllm-openai:v0.18.0" + fi + ;; + sglang) + RUNTIME_STACK_ID="standalone:sglang" + IMAGE="lmsysorg/sglang:v0.5.9-cu130" + ;; +esac + +case "$CONTEXT_BAND" in + 8k) + MAX_MODEL_LEN=10240 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=8192 + MAX_ACTIVE_REQUESTS=128 + ;; + 32k) + MAX_MODEL_LEN=33792 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=8192 + MAX_ACTIVE_REQUESTS=64 + ;; + 64k) + MAX_MODEL_LEN=66560 + MAX_CONCURRENCY=4 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=4096 + MAX_ACTIVE_REQUESTS=64 + ;; + 131k) + MAX_MODEL_LEN=132296 + MAX_CONCURRENCY=2 + NUM_WARMUP_SESSIONS=1 + MAX_SESSIONS="" + MAX_TURNS_PER_SESSION="" + MAX_NUM_BATCHED_TOKENS=2048 + MAX_ACTIVE_REQUESTS=32 + ;; + 500k) + 
MAX_MODEL_LEN=524288 + MAX_CONCURRENCY=1 + NUM_WARMUP_SESSIONS=0 + MAX_SESSIONS=2 + MAX_TURNS_PER_SESSION=4 + MAX_NUM_BATCHED_TOKENS=1024 + MAX_ACTIVE_REQUESTS=8 + ;; + 1m) + MAX_MODEL_LEN=1048576 + MAX_CONCURRENCY=1 + NUM_WARMUP_SESSIONS=0 + MAX_SESSIONS=1 + MAX_TURNS_PER_SESSION=3 + MAX_NUM_BATCHED_TOKENS=1024 + MAX_ACTIVE_REQUESTS=4 + ;; +esac + +if [[ -n "$MAX_CONCURRENCY_OVERRIDE" ]]; then + MAX_CONCURRENCY="$MAX_CONCURRENCY_OVERRIDE" +fi + +select_export_file() { + case "$MODEL_KEY:$CONTEXT_BAND:$ENGINE:$WORKLOAD" in + # ── Chat exports (committed at 8k–131k) ────────────────────── + qwen3.5:8k:*:chat) + printf 'datasets/isb1/exports/core/%s/chat_8k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:32k:*:chat) + printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:64k:*:chat) + printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k_qwen3.5.json\n' "$ENGINE" + ;; + *:8k:*:chat) + printf 'datasets/isb1/exports/core/%s/chat_8k1k.json\n' "$ENGINE" + ;; + *:32k:*:chat) + printf 'datasets/isb1/exports/extension_32k/%s/chat_32k1k.json\n' "$ENGINE" + ;; + *:64k:*:chat) + printf 'datasets/isb1/exports/extension_64k/%s/chat_64k1k.json\n' "$ENGINE" + ;; + gptoss:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k.json\n' "$ENGINE" + ;; + qwen3.5:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_qwen3.5.json\n' "$ENGINE" + ;; + dsr1:131k:*:chat) + printf 'datasets/isb1/exports/extension_131k/%s/chat_131k1k_dsr1.json\n' "$ENGINE" + ;; + gptoss:500k:*:chat) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + qwen3.5:500k:*:chat) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__chat_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + # dsr1:500k:chat — model max 164k, exceeds capability + qwen3.5:1m:*:chat) + printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__chat_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" + ;; + # dsr1:1m:chat, gptoss:1m:chat — models don't support 1M context + + # ── Code exports ────────────────────────────────────────────── + qwen3.5:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k_qwen3.5.json\n' "$ENGINE" + ;; + qwen3.5:500k:*:code) + printf 'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + qwen3.5:1m:*:code) + printf 'datasets/isb1/exports/preview/long_context_1m/inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__%s.json\n' "$ENGINE" + ;; + gptoss:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" + ;; + gptoss:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" + ;; + gptoss:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" + ;; + gptoss:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" + ;; + gptoss:500k:*:code) + printf 
'datasets/isb1/exports/preview/long_context_500k/inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__%s.json\n' "$ENGINE" + ;; + # gptoss:1m — GPT-OSS max_position_embeddings=131072; 1M exceeds model capability + dsr1:8k:*:code) + printf 'datasets/isb1/exports/core/%s/code_8k1k.json\n' "$ENGINE" + ;; + dsr1:32k:*:code) + printf 'datasets/isb1/exports/extension_32k/%s/code_32k1k.json\n' "$ENGINE" + ;; + dsr1:64k:*:code) + printf 'datasets/isb1/exports/extension_64k/%s/code_64k1k.json\n' "$ENGINE" + ;; + dsr1:131k:*:code) + printf 'datasets/isb1/exports/extension_131k/%s/code_131k1k.json\n' "$ENGINE" + ;; + # dsr1:500k/1m — DeepSeek R1 max_position_embeddings=163840; 500k/1M exceed model capability + *) + return 1 + ;; + esac +} + +TRACE_DIR="" +TRACE_REPLAY_SUMMARY_JSON="" +if [[ "$TRACE_SOURCE" == "isb1" ]]; then + EXPORT_FILE=$(select_export_file) || die "No committed ISB1 export for model=$MODEL_KEY engine=$ENGINE context=$CONTEXT_BAND workload=$WORKLOAD" + EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" + [[ -f "$EXPORT_PATH" ]] || die "Export file not found: $EXPORT_FILE" + + readarray -t EXPORT_METADATA < <( + python3 - "$EXPORT_PATH" "$RUNTIME_STACK_ID" "$HARDWARE_PROFILE_ID" "$CANONICAL_MODEL_ID" <<'PY' +import json +import sys +from pathlib import Path + +export_path = Path(sys.argv[1]) +runtime_stack_id = sys.argv[2] +hardware_profile_id = sys.argv[3] +canonical_model_id = sys.argv[4] +payload = json.loads(export_path.read_text()) +matches = [ + cell + for cell in payload.get("exports", []) + if cell.get("runtime_stack_id") == runtime_stack_id + and cell.get("hardware_profile_id") == hardware_profile_id + and cell.get("canonical_model_id") == canonical_model_id +] +if not matches: + raise SystemExit( + f"No matching export cells for runtime={runtime_stack_id} hardware={hardware_profile_id} model={canonical_model_id}" + ) +support_statuses = sorted({cell.get("support_status") for cell in matches if cell.get("support_status")}) +cert_statuses = sorted( + {cell.get("benchmark_certification_status") for cell in matches if cell.get("benchmark_certification_status")} +) +trace_ids = sorted({cell.get("trace_id") for cell in matches if cell.get("trace_id")}) +if len(support_statuses) > 1: + raise SystemExit(f"Ambiguous support statuses: {support_statuses}") +if len(cert_statuses) > 1: + raise SystemExit(f"Ambiguous certification statuses: {cert_statuses}") +print(support_statuses[0] if support_statuses else "") +print(cert_statuses[0] if cert_statuses else "") +print(",".join(trace_ids)) +print(len(matches)) +PY + ) + + SUPPORT_STATUS=${EXPORT_METADATA[0]} + BENCHMARK_CERTIFICATION_STATUS=${EXPORT_METADATA[1]} + TRACE_IDS=${EXPORT_METADATA[2]} + MATCHED_CELL_COUNT=${EXPORT_METADATA[3]} +else + SUPPORT_STATUS=${SUPPORT_STATUS:-reviewed_preview} + BENCHMARK_CERTIFICATION_STATUS=${BENCHMARK_CERTIFICATION_STATUS:-dataset_replay_verified} + TRACE_IDS="$TRACE_SOURCE" + MATCHED_CELL_COUNT="n/a" + if [[ "$TRACE_SOURCE" == "kv_cache_tester" ]]; then + TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces"} + EXPORT_FILE="experimental/multiturn/vllm_benchmark/trace_source_kv_cache_tester.json" + else + TRACE_DIR=${TRACE_DIR:-"$REPO_ROOT/experimental/multiturn/vllm_benchmark/aiperf_traces"} + EXPORT_FILE="experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json" + fi + EXPORT_PATH="$REPO_ROOT/$EXPORT_FILE" +fi + +case "$ENGINE" in + vllm) + VLLM_CPU_OFFLOAD_GB="" + VLLM_SWAP_SPACE_GB="" + if [[ "$CONTEXT_BAND" == "500k" ]]; then + 
VLLM_CPU_OFFLOAD_GB=40 + VLLM_SWAP_SPACE_GB=32 + elif [[ "$CONTEXT_BAND" == "1m" ]]; then + VLLM_CPU_OFFLOAD_GB=80 + VLLM_SWAP_SPACE_GB=64 + fi + case "$CONTEXT_BAND" in + 8k|32k) VLLM_MAX_NUM_SEQS=128 ;; + 64k) VLLM_MAX_NUM_SEQS=64 ;; + 131k) VLLM_MAX_NUM_SEQS=32 ;; + 500k) VLLM_MAX_NUM_SEQS=8 ;; + 1m) VLLM_MAX_NUM_SEQS=4 ;; + esac + ;; + sglang) + case "$GPU_TYPE" in + h100) + SGLANG_MEM_FRACTION_STATIC=0.80 + SGLANG_CHUNKED_PREFILL_SIZE=8192 + ;; + h200) + SGLANG_MEM_FRACTION_STATIC=0.82 + SGLANG_CHUNKED_PREFILL_SIZE=16384 + ;; + b200) + SGLANG_MEM_FRACTION_STATIC=0.85 + SGLANG_CHUNKED_PREFILL_SIZE=32768 + ;; + esac + if [[ "$CONTEXT_BAND" == "500k" || "$CONTEXT_BAND" == "1m" ]]; then + SGLANG_MEM_FRACTION_STATIC=0.85 + SGLANG_CHUNKED_PREFILL_SIZE=8192 + fi + ;; +esac + +DATE_STAMP=$(date +%Y%m%d-%H%M%S) +SAFE_CONTEXT=${CONTEXT_BAND//[^[:alnum:]]/_} +SAFE_MODEL=${MODEL_KEY//[^[:alnum:]._-]/_} +SAFE_ENGINE=${ENGINE//[^[:alnum:]._-]/_} +SAFE_GPU=${GPU_TYPE//[^[:alnum:]._-]/_} +SAFE_WORKLOAD=${WORKLOAD//[^[:alnum:]._-]/_} +RUN_LABEL=${GMI_RUN_LABEL:-} +if [[ -n "$RUN_LABEL" ]]; then + RUN_LABEL="-${RUN_LABEL//[^[:alnum:]._-]/_}" +fi +RESULT_STEM="gmi-${SAFE_GPU}-${SAFE_MODEL}-${SAFE_ENGINE}-${SAFE_WORKLOAD}-${SAFE_CONTEXT}-${DATE_STAMP}${RUN_LABEL}" +RUN_DIR="$BENCHMARK_OUTPUT_ROOT/$RESULT_STEM" +SERVER_LOG="$RUN_DIR/server.log" +SUMMARY_JSON="$RUN_DIR/agg_${RESULT_STEM}.json" +TRACE_REPLAY_SUMMARY_JSON="$RUN_DIR/trace_replay_summary.json" +GPU_PROFILE_CSV="$RUN_DIR/${RESULT_STEM}_gpu_profile.csv" +GPU_PROFILER_PID="" +GPU_MEM_PEAK=0 +GPU_MEM_AVG=0 +GPU_UTIL_AVG=0 +mkdir -p "$RUN_DIR" +mkdir -p "$HOME/.cache/huggingface" + +CONTAINER_NAME="isb1-${RESULT_STEM}" +LOG_TAIL_PID="" +CONTAINER_ID="" +ISB1_RESULTS_DB_PATH=${ISB1_RESULTS_DB_PATH:-} + +stop_gpu_profiler() { + if [[ -n "$GPU_PROFILER_PID" ]]; then + kill "$GPU_PROFILER_PID" >/dev/null 2>&1 || true + wait "$GPU_PROFILER_PID" >/dev/null 2>&1 || true + GPU_PROFILER_PID="" + fi +} + +cleanup() { + local exit_code=$? 
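+ # Best-effort teardown: stop the GPU profiler and log tail, force-remove the + # server container, then exit with the benchmark's original status.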
+ set +e + stop_gpu_profiler + if [[ -n "$LOG_TAIL_PID" ]]; then + kill "$LOG_TAIL_PID" >/dev/null 2>&1 || true + fi + if [[ -n "$CONTAINER_NAME" ]]; then + docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true + fi + exit $exit_code +} +trap cleanup EXIT + +launch_server() { + # Apply YaRN for Qwen long-context + apply_yarn_config_if_needed "$MODEL_HF_ID" "$MAX_MODEL_LEN" 2>/dev/null || true + + local docker_cmd=() + docker_cmd=( + docker run -d --rm + --name "$CONTAINER_NAME" + --gpus all + --ipc host + --network host + --shm-size 16g + -e HF_TOKEN="$HF_TOKEN_VALUE" + -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN_VALUE" + -e NVIDIA_VISIBLE_DEVICES=all + -e PYTHONUNBUFFERED=1 + -v "$HOME/.cache/huggingface:/root/.cache/huggingface" + -v "$REPO_ROOT:/workspace" + -w /workspace + ) + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + docker_cmd+=(-e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1) + docker_cmd+=(-e SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1) + fi + + if [[ "$ENGINE" == "vllm" ]]; then + local cmd=( + vllm serve "$MODEL_HF_ID" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size "$TP" + --gpu-memory-utilization 0.90 + --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" + --max-model-len "$MAX_MODEL_LEN" + --max-num-seqs "$VLLM_MAX_NUM_SEQS" + --disable-log-requests + --trust-remote-code + ) + + case "${OFFLOAD_MODE:-}" in + on) + cmd+=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + off) + ;; + noprefix) + cmd+=(--no-enable-prefix-caching) + ;; + legacy|"") + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + cmd+=(--cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB") + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + cmd+=(--swap-space "$VLLM_SWAP_SPACE_GB") + fi + ;; + esac + + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--no-enable-prefix-caching) + fi + + if [[ "${KV_CACHE_DTYPE:-}" == "fp8" ]]; then + cmd+=(--kv-cache-dtype fp8) + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + cmd+=(--hf-overrides "$YARN_OVERRIDE_JSON") + fi + + CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") + else + local cmd=( + python3 -m sglang.launch_server + --model-path "$MODEL_HF_ID" + --host 0.0.0.0 + --port "$PORT" + --trust-remote-code + --tensor-parallel-size "$TP" + --data-parallel-size 1 + --context-length "$MAX_MODEL_LEN" + --max-running-requests "$MAX_ACTIVE_REQUESTS" + --cuda-graph-max-bs "$MAX_ACTIVE_REQUESTS" + --chunked-prefill-size "$SGLANG_CHUNKED_PREFILL_SIZE" + --max-prefill-tokens "$SGLANG_CHUNKED_PREFILL_SIZE" + --mem-fraction-static "$SGLANG_MEM_FRACTION_STATIC" + --attention-backend flashinfer + --stream-interval 10 + --decode-log-interval 1 + ) + + case "${OFFLOAD_MODE:-}" in + on) + echo "WARNING: OFFLOAD_MODE=on is not supported for SGLang; continuing without native offload" >&2 + ;; + noprefix) + cmd+=(--disable-radix-cache) + ;; + off|legacy|"") + ;; + esac + + if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then + cmd+=(--disable-radix-cache) + fi + + if [[ -n "${YARN_OVERRIDE_JSON:-}" ]]; then + cmd+=(--json-model-override-args "$YARN_OVERRIDE_JSON") + fi + + CONTAINER_ID=$("${docker_cmd[@]}" "$IMAGE" bash -lc "$(printf '%q ' "${cmd[@]}")") + fi + + [[ -n "$CONTAINER_ID" ]] || die "Failed to start Docker container" + docker logs -f "$CONTAINER_NAME" > "$SERVER_LOG" 2>&1 & + LOG_TAIL_PID=$! +} + +wait_for_server_ready() { + local deadline=$((SECONDS + HEALTH_TIMEOUT_S)) + until curl --output /dev/null --silent --fail "http://127.0.0.1:${PORT}/health"; do + if ! 
docker ps --format '{{.Names}}' | grep -Fxq "$CONTAINER_NAME"; then + echo "Container exited before becoming healthy. Recent logs:" >&2 + docker logs "$CONTAINER_NAME" >&2 || true + return 1 + fi + if (( SECONDS >= deadline )); then + echo "Timed out waiting for http://127.0.0.1:${PORT}/health" >&2 + docker logs "$CONTAINER_NAME" | tail -n 200 >&2 || true + return 1 + fi + sleep "$HEALTH_POLL_INTERVAL_S" + done +} + +echo "==> GMI portable benchmark" +echo "repo: $REPO_ROOT" +echo "gpu-type: $GPU_TYPE" +echo "model: $MODEL_KEY ($MODEL_HF_ID)" +echo "engine: $ENGINE" +echo "context-band: $CONTEXT_BAND" +echo "workload: $WORKLOAD" +echo "benchmark-type: $BENCHMARK_TYPE" +echo "trace-source: $TRACE_SOURCE" +echo "max-concurrency: $MAX_CONCURRENCY" +echo "max-model-len: $MAX_MODEL_LEN" +echo "docker image: $IMAGE" +echo "export-file: $EXPORT_FILE" +if [[ "$TRACE_SOURCE" != "isb1" ]]; then + echo "trace-dir: $TRACE_DIR" +fi +echo "runtime-stack-id: $RUNTIME_STACK_ID" +echo "hardware-profile-id: $HARDWARE_PROFILE_ID" +echo "canonical-model-id: $CANONICAL_MODEL_ID" +echo "support-status: ${SUPPORT_STATUS:-}" +echo "certification: ${BENCHMARK_CERTIFICATION_STATUS:-}" +echo "matched export cells: $MATCHED_CELL_COUNT" +echo "trace-ids: ${TRACE_IDS:-}" +echo "output dir: $RUN_DIR" +echo "offload-mode: ${OFFLOAD_MODE:-legacy}" +echo "kv-cache-dtype: ${KV_CACHE_DTYPE:-auto}" +echo "disable-prefix-cache: $DISABLE_PREFIX_CACHING" +echo "total-cpu-dram-gb: $TOTAL_CPU_DRAM_GB" +if [[ "$ENGINE" == "vllm" ]]; then + echo "vllm cpu-offload-gb: ${VLLM_CPU_OFFLOAD_GB:-0}" + echo "vllm swap-space-gb: ${VLLM_SWAP_SPACE_GB:-0}" +else + echo "sglang mem fraction: $SGLANG_MEM_FRACTION_STATIC" + echo "sglang chunked pf: $SGLANG_CHUNKED_PREFILL_SIZE" +fi + +"$SCRIPT_DIR/gpu_profile_collector.sh" --output "$GPU_PROFILE_CSV" --interval 2 & +GPU_PROFILER_PID=$! 
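+# The collector helper is invoked as a black box here: the summary step later +# in this script only assumes it appends CSV rows with mem_used_mb and +# gpu_util_pct columns at the requested interval. A minimal sketch of such a +# collector loop, assuming nvidia-smi's CSV query interface ($OUT and +# $INTERVAL stand in for the --output and --interval arguments): +# echo "ts,mem_used_mb,gpu_util_pct" > "$OUT" +# while sleep "$INTERVAL"; do +# nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv,noheader,nounits \ +# | awk -v ts="$(date -u +%FT%TZ)" -F', *' '{ print ts "," $1 "," $2 }' >> "$OUT" +# done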
+ +launch_server +wait_for_server_ready + +if [[ "$TRACE_SOURCE" == "isb1" ]]; then + echo "==> Server is healthy; starting export replay" + + benchmark_cmd=( + python3 "$REPO_ROOT/utils/bench_serving/benchmark_export_replay.py" + --model "$MODEL_HF_ID" + --base-url "http://127.0.0.1:${PORT}" + --export-file "$EXPORT_PATH" + --request-mode "$HARNESS_REQUEST_MODE" + --max-concurrency "$MAX_CONCURRENCY" + --num-warmup-sessions "$NUM_WARMUP_SESSIONS" + --save-result + --result-dir "$RUN_DIR" + --result-filename "$RESULT_STEM.json" + --runtime-stack-id "$RUNTIME_STACK_ID" + --hardware-profile-id "$HARDWARE_PROFILE_ID" + --canonical-model-id "$CANONICAL_MODEL_ID" + --metadata "benchmark_type=$BENCHMARK_TYPE" + --metadata "export_file=$EXPORT_FILE" + --metadata "runtime_stack_id=$RUNTIME_STACK_ID" + --metadata "hardware_profile_id=$HARDWARE_PROFILE_ID" + --metadata "canonical_model_id=$CANONICAL_MODEL_ID" + --metadata "request_mode=$REQUEST_MODE" + --metadata "gmi_gpu_type=$GPU_TYPE" + --metadata "gmi_engine=$ENGINE" + --metadata "gmi_context_band=$CONTEXT_BAND" + --metadata "gmi_workload=$WORKLOAD" + --trust-remote-code + ) + if [[ -n "$BENCHMARK_DURATION_S" ]]; then + benchmark_cmd+=(--metadata "benchmark_duration_s=$BENCHMARK_DURATION_S") + fi + if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then + benchmark_cmd+=(--metadata "campaign_class=kv_stress") + fi + if [[ -n "$SUPPORT_STATUS" ]]; then + benchmark_cmd+=(--support-status "$SUPPORT_STATUS") + fi + if [[ -n "$MAX_SESSIONS" ]]; then + benchmark_cmd+=(--max-sessions "$MAX_SESSIONS") + fi + if [[ -n "$MAX_TURNS_PER_SESSION" ]]; then + benchmark_cmd+=(--max-turns-per-session "$MAX_TURNS_PER_SESSION") + fi + if [[ "$IGNORE_WAITS" == "true" ]]; then + benchmark_cmd+=(--ignore-waits) + fi + if [[ "$ENGINE" == "vllm" ]]; then + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + benchmark_cmd+=(--metadata "vllm_cpu_offload_gb=$VLLM_CPU_OFFLOAD_GB") + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + benchmark_cmd+=(--metadata "vllm_swap_space_gb=$VLLM_SWAP_SPACE_GB") + fi + else + benchmark_cmd+=(--metadata "sglang_mem_fraction_override=$SGLANG_MEM_FRACTION_STATIC") + benchmark_cmd+=(--metadata "sglang_chunked_prefill_override=$SGLANG_CHUNKED_PREFILL_SIZE") + fi + + "${benchmark_cmd[@]}" +else + echo "==> Server is healthy; starting trace replay ($TRACE_SOURCE)" + + trace_cmd=( + python3 "$REPO_ROOT/experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py" + --api-endpoint "http://localhost:$PORT" + --trace-directory "$TRACE_DIR" + --output-dir "$RUN_DIR" + --start-users "$MAX_CONCURRENCY" + --max-users "$MAX_CONCURRENCY" + --test-duration "${BENCHMARK_DURATION_S:-1800}" + --seed 42 + --no-color + ) + + "${trace_cmd[@]}" + + python3 "$SCRIPT_DIR/adapt_trace_replay_result.py" \ + --input-dir "$RUN_DIR" \ + --detailed-csv detailed_results.csv \ + --summary-json "$TRACE_REPLAY_SUMMARY_JSON" \ + --output-json "$RUN_DIR/${RESULT_STEM}.json" \ + --model-id "$MODEL_HF_ID" \ + --max-concurrency "$MAX_CONCURRENCY" \ + --request-mode "$REQUEST_MODE" \ + --support-status "$SUPPORT_STATUS" \ + --benchmark-certification-status "$BENCHMARK_CERTIFICATION_STATUS" \ + --result-stem "$RESULT_STEM" +fi + +echo "==> Processing ISB1 result" +( + cd "$RUN_DIR" + export RUNNER_TYPE="$RUNNER_TYPE" + export FRAMEWORK="$ENGINE" + export PRECISION="$PRECISION" + export RESULT_FILENAME="$RESULT_STEM" + export MODEL_PREFIX="$MODEL_PREFIX" + export IMAGE="$IMAGE" + export TP="$TP" + export EP_SIZE=1 + export DP_ATTENTION=false + export 
BENCHMARK_TYPE="$BENCHMARK_TYPE" + export EXPORT_FILE="$EXPORT_FILE" + export RUNTIME_STACK_ID="$RUNTIME_STACK_ID" + export HARDWARE_PROFILE_ID="$HARDWARE_PROFILE_ID" + export CANONICAL_MODEL_ID="$CANONICAL_MODEL_ID" + export REQUEST_MODE="$REQUEST_MODE" + export TRACE_SOURCE="$TRACE_SOURCE" + export WORKLOAD_TYPE="$WORKLOAD" + export MAX_CONCURRENCY="$MAX_CONCURRENCY" + export IGNORE_WAITS="$IGNORE_WAITS" + export DISPATCH_REF="manual:gmi-portable" + export MAX_MODEL_LEN="$MAX_MODEL_LEN" + export OFFLOAD_MODE="${OFFLOAD_MODE:-}" + export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-}" + export DISABLE_PREFIX_CACHING="$DISABLE_PREFIX_CACHING" + if [[ -n "$BENCHMARK_DURATION_S" ]]; then + export BENCHMARK_DURATION_S="$BENCHMARK_DURATION_S" + fi + if [[ -n "$SUPPORT_STATUS" ]]; then + export SUPPORT_STATUS="$SUPPORT_STATUS" + fi + if [[ -n "$VLLM_CPU_OFFLOAD_GB" ]]; then + export VLLM_CPU_OFFLOAD_GB="$VLLM_CPU_OFFLOAD_GB" + fi + if [[ -n "$VLLM_SWAP_SPACE_GB" ]]; then + export VLLM_SWAP_SPACE_GB="$VLLM_SWAP_SPACE_GB" + fi + if [[ -n "${SGLANG_MEM_FRACTION_STATIC:-}" ]]; then + export SGLANG_MEM_FRACTION_OVERRIDE="$SGLANG_MEM_FRACTION_STATIC" + fi + if [[ -n "${SGLANG_CHUNKED_PREFILL_SIZE:-}" ]]; then + export SGLANG_CHUNKED_PREFILL_OVERRIDE="$SGLANG_CHUNKED_PREFILL_SIZE" + fi + python3 "$REPO_ROOT/utils/process_result_isb1.py" | tee "$SUMMARY_JSON" +) + +stop_gpu_profiler + +if [[ -f "$GPU_PROFILE_CSV" ]]; then + GPU_STATS=$(python3 - "$GPU_PROFILE_CSV" <<'PY' +import csv +import sys + +with open(sys.argv[1], newline="") as handle: + rows = list(csv.DictReader(handle)) + +if rows: + mems = [float(row.get("mem_used_mb", "0") or 0) for row in rows] + utils = [float(row.get("gpu_util_pct", "0") or 0) for row in rows] + print(f"{max(mems) / 1024:.2f} {sum(mems) / len(mems) / 1024:.2f} {sum(utils) / len(utils):.1f}") +else: + print("0 0 0") +PY + 2>/dev/null) || GPU_STATS="0 0 0" + read -r GPU_MEM_PEAK GPU_MEM_AVG GPU_UTIL_AVG <<< "$GPU_STATS" +fi + +if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then + CAMPAIGN_METADATA_JSON="$RUN_DIR/kv_stress_campaign_metadata.json" + python3 - \ + "$CAMPAIGN_METADATA_JSON" \ + "$BENCHMARK_TYPE" \ + "$WORKLOAD" \ + "$MAX_CONCURRENCY" \ + "${OFFLOAD_MODE:-}" \ + "${KV_CACHE_DTYPE:-}" \ + "$DISABLE_PREFIX_CACHING" \ + "${BENCHMARK_DURATION_S:-}" <<'PY' +import json +import sys + +payload = { + "benchmark_type": sys.argv[2], + "campaign_class": "kv_stress", + "workload_type": sys.argv[3], + "max_concurrency": sys.argv[4], + "offload_mode": sys.argv[5] or None, + "kv_cache_dtype": sys.argv[6] or None, + "disable_prefix_caching": sys.argv[7], + "benchmark_duration_s": sys.argv[8] or None, +} +with open(sys.argv[1], "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, sort_keys=True) +PY +fi + +if [[ -f "$SUMMARY_JSON" ]] && command -v python3 >/dev/null 2>&1; then + db_ingest_cmd=( + python3 "$SCRIPT_DIR/isb1_results_db.py" ingest "$SUMMARY_JSON" + --gpu-type "$GPU_TYPE" + --model "$MODEL_KEY" + --engine "$ENGINE" + --context-band "$CONTEXT_BAND" + --workload-type "$WORKLOAD" + --trace-source "$TRACE_SOURCE" + --max-model-len "$MAX_MODEL_LEN" + --tp "$TP" + --gpu-mem-peak-gb "${GPU_MEM_PEAK:-0}" + --gpu-mem-avg-gb "${GPU_MEM_AVG:-0}" + --gpu-util-avg-pct "${GPU_UTIL_AVG:-0}" + --gpu-profile-csv "$GPU_PROFILE_CSV" + ) + if [[ -n "$ISB1_RESULTS_DB_PATH" ]]; then + db_ingest_cmd+=(--db-path "$ISB1_RESULTS_DB_PATH") + fi + if [[ -n "${OFFLOAD_MODE:-}" ]]; then + db_ingest_cmd+=(--offload-mode "$OFFLOAD_MODE") + fi + if [[ -n "${KV_CACHE_DTYPE:-}" ]]; then + 
+    db_ingest_cmd+=(--kv-cache-dtype "$KV_CACHE_DTYPE")
+  fi
+  if [[ "$DISABLE_PREFIX_CACHING" == "true" ]]; then
+    db_ingest_cmd+=(--disable-prefix-caching 1)
+  fi
+  if [[ -n "$BENCHMARK_DURATION_S" ]]; then
+    db_ingest_cmd+=(--benchmark-duration-s "$BENCHMARK_DURATION_S")
+  fi
+  if [[ "$BENCHMARK_TYPE" == "isb1_kv_stress" ]]; then
+    db_ingest_cmd+=(--campaign-class kv_stress)
+  fi
+  if [[ "$ENGINE" == "vllm" ]]; then
+    if [[ -n "${VLLM_CPU_OFFLOAD_GB:-}" ]]; then
+      db_ingest_cmd+=(--vllm-cpu-offload-gb "$VLLM_CPU_OFFLOAD_GB")
+    fi
+    if [[ -n "${VLLM_SWAP_SPACE_GB:-}" ]]; then
+      db_ingest_cmd+=(--vllm-swap-space-gb "$VLLM_SWAP_SPACE_GB")
+    fi
+  else
+    db_ingest_cmd+=(--sglang-mem-fraction "$SGLANG_MEM_FRACTION_STATIC")
+    db_ingest_cmd+=(--sglang-chunked-prefill "$SGLANG_CHUNKED_PREFILL_SIZE")
+  fi
+  "${db_ingest_cmd[@]}" 2>/dev/null || echo "WARNING: DB ingest failed" >&2
+fi
+
+python3 - "$SUMMARY_JSON" <<'PY'
+import json
+import sys
+from pathlib import Path
+
+summary = json.loads(Path(sys.argv[1]).read_text())
+print("==> Summary")
+for key, value in [
+    ("result_filename", summary.get("result_filename")),
+    ("support_status", summary.get("support_status")),
+    ("benchmark_certification_status", summary.get("benchmark_certification_status")),
+    ("completed_sessions", f"{summary.get('completed_sessions')}/{summary.get('total_sessions')}"),
+    ("effective_max_context_depth", summary.get("effective_max_context_depth")),
+    ("context_pressure_class", summary.get("context_pressure_class")),
+    ("context_pressure_signal", summary.get("context_pressure_signal", {}).get("status")),
+    ("depth_coverage_ratio", summary.get("depth_coverage_ratio")),
+    ("depth_coverage_class", summary.get("depth_coverage_class")),
+    ("max_actual_context_len", summary.get("max_actual_context_len_per_turn")),
+    ("preemption_count", summary.get("preemption_count")),
+    ("session_throughput_sps", summary.get("session_throughput_sps")),
+    ("tput_per_gpu", summary.get("tput_per_gpu")),
+    ("output_tput_per_gpu", summary.get("output_tput_per_gpu")),
+    ("mean_ttft_s", summary.get("mean_ttft")),
+    ("p99_ttft_s", summary.get("p99_ttft")),
+    ("server_logs", Path(sys.argv[1]).with_name("server.log")),
+    ("raw_replay_result", Path(sys.argv[1]).with_name(summary.get("result_filename", "run") + ".json")),
+    ("processed_result", Path(sys.argv[1])),
+]:
+    print(f"  {key}: {value}")
+PY
diff --git a/datasets/isb1/scripts/gmi_test_matrix.sh b/datasets/isb1/scripts/gmi_test_matrix.sh
new file mode 100755
index 000000000..5deadb072
--- /dev/null
+++ b/datasets/isb1/scripts/gmi_test_matrix.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+usage() {
+  cat <<'EOF'
+Usage:
+  gmi_test_matrix.sh --gpu-type <h100|h200|b200>
+
+Runs a curated GMI Cloud matrix:
+  - Qwen3.5 × vllm × 8k (chat)
+  - Qwen3.5 × vllm × 131k/500k/1m (code)
+  - Qwen3.5 × sglang × 500k (chat)
+  - GPT-OSS × vllm × 131k (code, chat), 500k (chat)
+  - DSR1 × sglang × 131k (code, chat)
+EOF
+}
+
+GPU_TYPE=""
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpu-type)
+      GPU_TYPE="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+[[ -n "$GPU_TYPE" ]] || {
+  usage >&2
+  exit 1
+}
+
+case "$GPU_TYPE" in
+  h100|h200|b200) ;;
+  *)
+    echo "Unsupported --gpu-type: $GPU_TYPE" >&2
+    exit 1
+    ;;
+esac
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+PORTABLE_SCRIPT="$SCRIPT_DIR/gmi_portable_benchmark.sh"
+[[ -x "$PORTABLE_SCRIPT" ]] || {
+  echo "Expected executable helper at $PORTABLE_SCRIPT" >&2
+  exit 1
+}
+
+run_case() {
+  local model="$1"
+  local engine="$2"
+  local 
context_band="$3" + local workload="${4:-code}" + + echo + echo "============================================================" + echo "Running: gpu=${GPU_TYPE} model=${model} engine=${engine} context=${context_band} workload=${workload}" + echo "============================================================" + + "$PORTABLE_SCRIPT" \ + --gpu-type "$GPU_TYPE" \ + --model "$model" \ + --engine "$engine" \ + --context-band "$context_band" \ + --workload "$workload" +} + +run_case qwen3.5 vllm 8k chat +run_case qwen3.5 vllm 131k code +run_case qwen3.5 vllm 500k code +run_case qwen3.5 sglang 500k chat +run_case gptoss vllm 131k code +run_case gptoss vllm 131k chat +run_case gptoss vllm 500k chat +run_case dsr1 sglang 131k code +run_case dsr1 sglang 131k chat +run_case qwen3.5 vllm 1m code + +echo +echo "Curated GMI test matrix completed successfully." diff --git a/datasets/isb1/scripts/gpu_profile_collector.sh b/datasets/isb1/scripts/gpu_profile_collector.sh new file mode 100755 index 000000000..4ba03f223 --- /dev/null +++ b/datasets/isb1/scripts/gpu_profile_collector.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +# Usage: gpu_profile_collector.sh --output /tmp/gpu.csv [--interval 2] +# Runs nvidia-smi polling until killed (SIGTERM/SIGINT) + +OUTPUT="" +INTERVAL=2 + +while [[ $# -gt 0 ]]; do + case "$1" in + --output) + OUTPUT="$2" + shift 2 + ;; + --interval) + INTERVAL="$2" + shift 2 + ;; + *) + echo "Unknown arg: $1" >&2 + exit 1 + ;; + esac +done + +[[ -n "$OUTPUT" ]] || { + echo "ERROR: --output required" >&2 + exit 1 +} + +mkdir -p "$(dirname "$OUTPUT")" +echo "timestamp,gpu_bus_id,gpu_util_pct,mem_util_pct,mem_used_mb,mem_total_mb,temp_c,power_w" > "$OUTPUT" + +trap 'exit 0' SIGTERM SIGINT + +while true; do + nvidia-smi \ + --query-gpu=timestamp,gpu_bus_id,utilization.gpu,utilization.memory,memory.used,memory.total,temperature.gpu,power.draw \ + --format=csv,noheader,nounits >> "$OUTPUT" 2>/dev/null || true + sleep "$INTERVAL" +done diff --git a/datasets/isb1/scripts/isb1_results_db.py b/datasets/isb1/scripts/isb1_results_db.py new file mode 100644 index 000000000..e052fa766 --- /dev/null +++ b/datasets/isb1/scripts/isb1_results_db.py @@ -0,0 +1,816 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import sqlite3 +import sys +import uuid +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Iterable, Sequence + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent.parent.parent +DEFAULT_DB_PATH = REPO_ROOT / "datasets/isb1/results/isb1_results.db" +TABLE_NAME = "benchmark_runs" + +SCHEMA_SQL = f""" +CREATE TABLE IF NOT EXISTS {TABLE_NAME} ( + id INTEGER PRIMARY KEY, + run_id TEXT, + timestamp TEXT, + gpu_type TEXT, + model TEXT, + engine TEXT, + context_band TEXT, + workload_type TEXT, + max_model_len INTEGER, + tp INTEGER, + vllm_cpu_offload_gb REAL, + vllm_swap_space_gb REAL, + sglang_mem_fraction REAL, + sglang_chunked_prefill INTEGER, + ttft_p50_ms REAL, + ttft_p99_ms REAL, + tpot_p50_ms REAL, + tpot_p99_ms REAL, + throughput_tok_s REAL, + total_sessions INTEGER, + completed_sessions INTEGER, + total_turns INTEGER, + completed_turns INTEGER, + preemption_count INTEGER, + gpu_mem_peak_gb REAL, + gpu_mem_avg_gb REAL, + gpu_util_avg_pct REAL, + kv_cache_usage_pct REAL, + server_startup_s REAL, + benchmark_duration_s REAL, + campaign_class TEXT, + trace_source TEXT, + total_actual_input_tokens INTEGER, + max_actual_context_len INTEGER, + 
depth_coverage_ratio REAL, + depth_coverage_class TEXT, + producer_estimated_kv_bytes_peak INTEGER, + producer_expected_offload_mode TEXT, + offload_mode_match INTEGER, + offload_mode TEXT, + kv_cache_dtype TEXT, + disable_prefix_caching INTEGER, + cpu_cache_usage_peak_pct REAL, + raw_result_json TEXT, + status TEXT, + error_message TEXT +) +""" + +INSERT_COLUMNS = [ + "run_id", + "timestamp", + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "max_model_len", + "tp", + "vllm_cpu_offload_gb", + "vllm_swap_space_gb", + "sglang_mem_fraction", + "sglang_chunked_prefill", + "ttft_p50_ms", + "ttft_p99_ms", + "tpot_p50_ms", + "tpot_p99_ms", + "throughput_tok_s", + "total_sessions", + "completed_sessions", + "total_turns", + "completed_turns", + "preemption_count", + "gpu_mem_peak_gb", + "gpu_mem_avg_gb", + "gpu_util_avg_pct", + "kv_cache_usage_pct", + "server_startup_s", + "benchmark_duration_s", + "campaign_class", + "trace_source", + "total_actual_input_tokens", + "max_actual_context_len", + "depth_coverage_ratio", + "depth_coverage_class", + "producer_estimated_kv_bytes_peak", + "producer_expected_offload_mode", + "offload_mode_match", + "offload_mode", + "kv_cache_dtype", + "disable_prefix_caching", + "cpu_cache_usage_peak_pct", + "raw_result_json", + "status", + "error_message", +] + +GROUPABLE_COLUMNS = { + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "status", + "tp", + "max_model_len", + "depth_coverage_class", + "offload_mode", + "campaign_class", + "trace_source", +} + +DEFAULT_QUERY_COLUMNS = [ + "timestamp", + "gpu_type", + "model", + "engine", + "context_band", + "workload_type", + "status", + "ttft_p50_ms", + "ttft_p99_ms", + "throughput_tok_s", + "gpu_mem_peak_gb", + "gpu_util_avg_pct", + "preemption_count", + "depth_coverage_ratio", + "max_actual_context_len", + "depth_coverage_class", + "run_id", +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Store and analyze ISB1 benchmark runs in SQLite.") + subparsers = parser.add_subparsers(dest="command", required=True) + + ingest = subparsers.add_parser("ingest", help="Read a processed ISB1 JSON file and insert a benchmark run.") + ingest.add_argument("json_file", help="Path to utils/process_result_isb1.py output JSON.") + ingest.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + ingest.add_argument("--gpu-type", required=True, choices=["h100", "h200", "b200"]) + ingest.add_argument("--model", required=True, choices=["qwen3.5", "gptoss", "dsr1"]) + ingest.add_argument("--engine", required=True, choices=["vllm", "sglang"]) + ingest.add_argument("--context-band", required=True, choices=["8k", "32k", "64k", "131k", "500k", "1m"]) + ingest.add_argument("--workload-type", choices=["chat", "code"], help="Workload type (chat or code)") + ingest.add_argument("--run-id", help="Optional run UUID. Generated if omitted.") + ingest.add_argument("--timestamp", help="Optional ISO-8601 timestamp. 
Uses current UTC time if omitted.") + ingest.add_argument("--max-model-len", type=int) + ingest.add_argument("--tp", type=int) + ingest.add_argument("--vllm-cpu-offload-gb", type=float) + ingest.add_argument("--vllm-swap-space-gb", type=float) + ingest.add_argument("--sglang-mem-fraction", type=float) + ingest.add_argument("--sglang-chunked-prefill", type=int) + ingest.add_argument("--ttft-p50-ms", type=float) + ingest.add_argument("--ttft-p99-ms", type=float) + ingest.add_argument("--tpot-p50-ms", type=float) + ingest.add_argument("--tpot-p99-ms", type=float) + ingest.add_argument("--throughput-tok-s", type=float) + ingest.add_argument("--total-sessions", type=int) + ingest.add_argument("--completed-sessions", type=int) + ingest.add_argument("--total-turns", type=int) + ingest.add_argument("--completed-turns", type=int) + ingest.add_argument("--preemption-count", type=int) + ingest.add_argument("--gpu-mem-peak-gb", type=float) + ingest.add_argument("--gpu-mem-avg-gb", type=float) + ingest.add_argument("--gpu-util-avg-pct", type=float) + ingest.add_argument("--kv-cache-usage-pct", type=float) + ingest.add_argument("--server-startup-s", type=float) + ingest.add_argument("--benchmark-duration-s", type=float) + ingest.add_argument("--campaign-class") + ingest.add_argument("--trace-source", choices=["isb1", "kv_cache_tester", "aiperf"]) + ingest.add_argument("--offload-mode", choices=["on", "off", "noprefix", "legacy"]) + ingest.add_argument("--kv-cache-dtype", choices=["auto", "fp8"]) + ingest.add_argument("--disable-prefix-caching", type=int, choices=[0, 1]) + ingest.add_argument("--gpu-profile-csv", help="Optional GPU profile CSV path to stash in raw_result_json metadata.") + ingest.add_argument("--status", default="success", choices=["success", "failed", "timeout"]) + ingest.add_argument("--error-message") + + query = subparsers.add_parser("query", help="Print runs or an aggregated grouped view.") + query.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + query.add_argument("--group-by", help="Comma-separated columns to group by, for example gpu_type,context_band.") + + export_csv = subparsers.add_parser("export-csv", help="Export all benchmark rows to CSV.") + export_csv.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + export_csv.add_argument("--output", help="Destination CSV path. 
Defaults to stdout.") + + summary = subparsers.add_parser("summary", help="Print a concise findings summary.") + summary.add_argument("--db-path", default=str(DEFAULT_DB_PATH), help="SQLite DB path.") + + return parser.parse_args() + + +_MIGRATIONS = [ + f"ALTER TABLE {TABLE_NAME} ADD COLUMN total_actual_input_tokens INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN max_actual_context_len INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_ratio REAL", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN depth_coverage_class TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_estimated_kv_bytes_peak INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN producer_expected_offload_mode TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode_match INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN offload_mode TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN kv_cache_dtype TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN disable_prefix_caching INTEGER", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN cpu_cache_usage_peak_pct REAL", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN workload_type TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN campaign_class TEXT", + f"ALTER TABLE {TABLE_NAME} ADD COLUMN trace_source TEXT", +] + + +def ensure_db(conn: sqlite3.Connection) -> None: + conn.execute(SCHEMA_SQL) + conn.execute(f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_run_id ON {TABLE_NAME}(run_id)") + conn.execute( + f"CREATE INDEX IF NOT EXISTS idx_{TABLE_NAME}_grouping " + f"ON {TABLE_NAME}(gpu_type, model, engine, context_band, status)" + ) + # Idempotent migrations for existing databases + for migration_sql in _MIGRATIONS: + try: + conn.execute(migration_sql) + except sqlite3.OperationalError: + pass # Column already exists + conn.commit() + + +def connect_db(db_path: str | Path) -> sqlite3.Connection: + db_path = Path(db_path) + db_path.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + ensure_db(conn) + return conn + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat() + + +def to_float(value: Any) -> float | None: + if value in (None, ""): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def to_int(value: Any) -> int | None: + if value in (None, ""): + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def seconds_to_ms(value: Any) -> float | None: + parsed = to_float(value) + return None if parsed is None else parsed * 1000.0 + + +def choose(*values: Any) -> Any: + for value in values: + if value not in (None, ""): + return value + return None + + +def load_payload(path: str | Path) -> dict[str, Any]: + payload = json.loads(Path(path).read_text()) + if not isinstance(payload, dict): + raise SystemExit(f"Expected a JSON object in {path}") + return payload + + +def derive_total_turns(payload: dict[str, Any], total_sessions: int | None) -> int | None: + max_turns = to_int(payload.get("max_turns")) + if max_turns is not None and total_sessions is not None: + return max_turns * total_sessions + per_turn_metrics = payload.get("per_turn_metrics") or {} + if isinstance(per_turn_metrics, dict) and total_sessions is not None: + return len(per_turn_metrics) * total_sessions + return None + + +def derive_completed_turns(payload: dict[str, Any]) -> int | None: + per_turn_metrics = payload.get("per_turn_metrics") or {} + if not isinstance(per_turn_metrics, dict): + return None + completed = 0 + saw_value = False + for 
turn_metrics in per_turn_metrics.values(): + if not isinstance(turn_metrics, dict): + continue + value = to_int(turn_metrics.get("completed")) + if value is None: + continue + completed += value + saw_value = True + return completed if saw_value else None + + +def build_raw_payload(payload: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]: + enriched = dict(payload) + metadata = { + "source_json": str(Path(args.json_file).resolve()), + "db_path": str(Path(args.db_path).resolve()), + } + if args.gpu_profile_csv: + metadata["gpu_profile_csv"] = str(Path(args.gpu_profile_csv).resolve()) + if args.status != "success": + metadata["status_override"] = args.status + if args.error_message: + metadata["error_message"] = args.error_message + enriched["_isb1_results_db"] = metadata + return enriched + + +def insert_run(args: argparse.Namespace) -> None: + payload = load_payload(args.json_file) + aggregate = payload.get("aggregate_metrics") or {} + runtime_overrides = payload.get("runtime_overrides") or {} + server_metrics_summary = payload.get("server_metrics_summary") or {} + + total_sessions = to_int(choose(args.total_sessions, payload.get("total_sessions"), aggregate.get("total_sessions"))) + completed_sessions = to_int( + choose(args.completed_sessions, payload.get("completed_sessions"), aggregate.get("completed_sessions")) + ) + + gpu_cache_peak = to_float(server_metrics_summary.get("gpu_cache_usage_peak")) + if gpu_cache_peak is None: + gpu_cache_peak = to_float(payload.get("peak_gpu_cache_usage")) + + row = { + "run_id": args.run_id or str(uuid.uuid4()), + "timestamp": args.timestamp or utc_now_iso(), + "gpu_type": args.gpu_type, + "model": args.model, + "engine": args.engine, + "context_band": args.context_band, + "workload_type": choose( + getattr(args, 'workload_type', None), + payload.get("benchmark_surface"), + ), + "max_model_len": to_int(choose(args.max_model_len, payload.get("max_model_len"))), + "tp": to_int(choose(args.tp, payload.get("tp"))), + "vllm_cpu_offload_gb": to_float( + choose( + args.vllm_cpu_offload_gb, + runtime_overrides.get("vllm_cpu_offload_gb"), + payload.get("vllm_cpu_offload_gb"), + ) + ), + "vllm_swap_space_gb": to_float( + choose( + args.vllm_swap_space_gb, + runtime_overrides.get("vllm_swap_space_gb"), + payload.get("vllm_swap_space_gb"), + ) + ), + "sglang_mem_fraction": to_float( + choose( + args.sglang_mem_fraction, + runtime_overrides.get("sglang_mem_fraction_override"), + payload.get("sglang_mem_fraction_override"), + ) + ), + "sglang_chunked_prefill": to_int( + choose( + args.sglang_chunked_prefill, + runtime_overrides.get("sglang_chunked_prefill_override"), + payload.get("sglang_chunked_prefill_override"), + ) + ), + "ttft_p50_ms": to_float( + choose(args.ttft_p50_ms, aggregate.get("median_ttft_ms"), seconds_to_ms(payload.get("median_ttft"))) + ), + "ttft_p99_ms": to_float( + choose(args.ttft_p99_ms, aggregate.get("p99_ttft_ms"), seconds_to_ms(payload.get("p99_ttft"))) + ), + "tpot_p50_ms": to_float( + choose(args.tpot_p50_ms, aggregate.get("median_tpot_ms"), seconds_to_ms(payload.get("median_tpot"))) + ), + "tpot_p99_ms": to_float( + choose(args.tpot_p99_ms, aggregate.get("p99_tpot_ms"), seconds_to_ms(payload.get("p99_tpot"))) + ), + "throughput_tok_s": to_float( + choose(args.throughput_tok_s, aggregate.get("total_token_throughput_tps"), payload.get("throughput_tok_s")) + ), + "total_sessions": total_sessions, + "completed_sessions": completed_sessions, + "total_turns": to_int(choose(args.total_turns, derive_total_turns(payload, 
total_sessions))), + "completed_turns": to_int(choose(args.completed_turns, derive_completed_turns(payload))), + "preemption_count": to_int(choose(args.preemption_count, payload.get("preemption_count"))), + "gpu_mem_peak_gb": to_float(choose(args.gpu_mem_peak_gb, payload.get("gpu_mem_peak_gb"))), + "gpu_mem_avg_gb": to_float(choose(args.gpu_mem_avg_gb, payload.get("gpu_mem_avg_gb"))), + "gpu_util_avg_pct": to_float(choose(args.gpu_util_avg_pct, payload.get("gpu_util_avg_pct"))), + "kv_cache_usage_pct": to_float( + choose(args.kv_cache_usage_pct, payload.get("kv_cache_usage_pct"), gpu_cache_peak * 100.0 if gpu_cache_peak is not None else None) + ), + "server_startup_s": to_float(choose(args.server_startup_s, payload.get("server_startup_s"))), + "benchmark_duration_s": to_float( + choose(args.benchmark_duration_s, payload.get("benchmark_duration_s"), aggregate.get("total_wall_time_s")) + ), + "campaign_class": choose( + getattr(args, 'campaign_class', None), + payload.get("campaign_class"), + ), + "trace_source": choose( + getattr(args, 'trace_source', None), + payload.get("trace_source"), + ), + "total_actual_input_tokens": to_int( + (payload.get("depth_telemetry") or {}).get("total_actual_input_tokens") + or payload.get("total_actual_input_tokens") + ), + "max_actual_context_len": to_int( + (payload.get("depth_telemetry") or {}).get("max_actual_context_len_per_turn") + or payload.get("max_actual_context_len_per_turn") + ), + "depth_coverage_ratio": to_float(payload.get("depth_coverage_ratio")), + "depth_coverage_class": payload.get("depth_coverage_class"), + "producer_estimated_kv_bytes_peak": to_int(payload.get("producer_estimated_kv_bytes_peak")), + "producer_expected_offload_mode": payload.get("producer_expected_offload_mode"), + "offload_mode_match": ( + 1 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is True + else 0 if payload.get("producer_expectation_validation", {}).get("offload_mode_match") is False + else None + ), + "offload_mode": choose(getattr(args, 'offload_mode', None), payload.get("offload_mode")), + "kv_cache_dtype": choose(getattr(args, 'kv_cache_dtype', None), payload.get("kv_cache_dtype")), + "disable_prefix_caching": to_int( + choose( + getattr(args, 'disable_prefix_caching', None), + payload.get("disable_prefix_caching"), + ) + ), + "cpu_cache_usage_peak_pct": to_float( + payload.get("peak_cpu_cache_usage", 0.0) * 100.0 + if payload.get("peak_cpu_cache_usage") is not None else None + ), + "raw_result_json": json.dumps(build_raw_payload(payload, args), sort_keys=True), + "status": args.status, + "error_message": choose(args.error_message, payload.get("error_message")), + } + + conn = connect_db(args.db_path) + placeholders = ", ".join("?" for _ in INSERT_COLUMNS) + sql = f"INSERT INTO {TABLE_NAME} ({', '.join(INSERT_COLUMNS)}) VALUES ({placeholders})" + conn.execute(sql, [row[column] for column in INSERT_COLUMNS]) + conn.commit() + conn.close() + + print( + f"Inserted run {row['run_id']} into {Path(args.db_path)} " + f"({row['gpu_type']} {row['model']} {row['engine']} {row['context_band']}, status={row['status']})." 
+ ) + + +def fetch_rows(conn: sqlite3.Connection, sql: str, params: Sequence[Any] = ()) -> list[sqlite3.Row]: + return list(conn.execute(sql, params)) + + +def stringify(value: Any) -> str: + if value is None: + return "" + if isinstance(value, float): + return f"{value:.2f}" + return str(value) + + +def render_table(headers: Sequence[str], rows: Iterable[Sequence[Any]]) -> str: + normalized_rows = [[stringify(value) for value in row] for row in rows] + widths = [len(header) for header in headers] + for row in normalized_rows: + for idx, value in enumerate(row): + widths[idx] = max(widths[idx], len(value)) + + def fmt_row(row: Sequence[str]) -> str: + return " | ".join(value.ljust(widths[idx]) for idx, value in enumerate(row)) + + divider = "-+-".join("-" * width for width in widths) + lines = [fmt_row(headers), divider] + for row in normalized_rows: + lines.append(fmt_row(row)) + return "\n".join(lines) + + +def print_query(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + + if args.group_by: + group_columns = [column.strip() for column in args.group_by.split(",") if column.strip()] + if not group_columns: + raise SystemExit("--group-by requires at least one column") + invalid = [column for column in group_columns if column not in GROUPABLE_COLUMNS] + if invalid: + raise SystemExit( + f"Unsupported --group-by columns: {', '.join(invalid)}. " + f"Allowed: {', '.join(sorted(GROUPABLE_COLUMNS))}" + ) + + select_prefix = ", ".join(group_columns) + sql = f""" + SELECT + {select_prefix}, + COUNT(*) AS runs, + SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, + SUM(CASE WHEN status != 'success' THEN 1 ELSE 0 END) AS non_success_runs, + ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, + ROUND(AVG(throughput_tok_s), 2) AS avg_throughput_tok_s, + ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, + SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs + FROM {TABLE_NAME} + GROUP BY {select_prefix} + ORDER BY {select_prefix} + """ + rows = fetch_rows(conn, sql) + headers = group_columns + [ + "runs", + "success_runs", + "non_success_runs", + "avg_ttft_p50_ms", + "avg_throughput_tok_s", + "max_gpu_mem_peak_gb", + "preemption_runs", + ] + print(render_table(headers, ([row[header] for header in headers] for row in rows))) + else: + sql = f"SELECT {', '.join(DEFAULT_QUERY_COLUMNS)} FROM {TABLE_NAME} ORDER BY id DESC" + rows = fetch_rows(conn, sql) + print(render_table(DEFAULT_QUERY_COLUMNS, ([row[column] for column in DEFAULT_QUERY_COLUMNS] for row in rows))) + + conn.close() + + +def export_csv_rows(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + rows = fetch_rows(conn, f"SELECT * FROM {TABLE_NAME} ORDER BY id ASC") + headers = [description[0] for description in conn.execute(f"SELECT * FROM {TABLE_NAME} LIMIT 0").description] + + if args.output: + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + handle = output_path.open("w", newline="") + else: + handle = sys.stdout + + try: + writer = csv.writer(handle) + writer.writerow(headers) + for row in rows: + writer.writerow([row[header] for header in headers]) + finally: + if args.output: + handle.close() + print(f"Exported {len(rows)} rows to {args.output}") + + conn.close() + + +def print_summary(args: argparse.Namespace) -> None: + conn = connect_db(args.db_path) + total_runs = conn.execute(f"SELECT COUNT(*) FROM {TABLE_NAME}").fetchone()[0] + if total_runs == 0: + print(f"No runs found in {args.db_path}") + 
conn.close() + return + + status_rows = fetch_rows(conn, f"SELECT status, COUNT(*) AS count FROM {TABLE_NAME} GROUP BY status ORDER BY status") + preemption_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, preemption_count, status + FROM {TABLE_NAME} + WHERE COALESCE(preemption_count, 0) > 0 + ORDER BY preemption_count DESC, id DESC + LIMIT 10 + """, + ) + highest_memory_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, gpu_mem_peak_gb, kv_cache_usage_pct, status + FROM {TABLE_NAME} + WHERE gpu_mem_peak_gb IS NOT NULL + ORDER BY gpu_mem_peak_gb DESC, id DESC + LIMIT 5 + """, + ) + slowest_ttft_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, ttft_p50_ms, ttft_p99_ms, status + FROM {TABLE_NAME} + WHERE ttft_p50_ms IS NOT NULL + ORDER BY ttft_p50_ms DESC, id DESC + LIMIT 5 + """, + ) + highest_kv_rows = fetch_rows( + conn, + f""" + SELECT gpu_type, model, engine, context_band, kv_cache_usage_pct, gpu_mem_peak_gb, status + FROM {TABLE_NAME} + WHERE kv_cache_usage_pct IS NOT NULL + ORDER BY kv_cache_usage_pct DESC, id DESC + LIMIT 5 + """, + ) + long_context_rollup = fetch_rows( + conn, + f""" + SELECT + context_band, + COUNT(*) AS runs, + SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) AS success_runs, + ROUND(AVG(ttft_p50_ms), 2) AS avg_ttft_p50_ms, + ROUND(MAX(gpu_mem_peak_gb), 2) AS max_gpu_mem_peak_gb, + SUM(CASE WHEN COALESCE(preemption_count, 0) > 0 THEN 1 ELSE 0 END) AS preemption_runs + FROM {TABLE_NAME} + WHERE context_band IN ('131k', '500k', '1m') + GROUP BY context_band + ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END + """, + ) + + print(f"ISB1 results summary ({args.db_path})") + print(f"Total runs: {total_runs}") + print(render_table(["status", "count"], ([row["status"], row["count"]] for row in status_rows))) + print() + + if long_context_rollup: + print("Long-context rollup") + print( + render_table( + ["context_band", "runs", "success_runs", "avg_ttft_p50_ms", "max_gpu_mem_peak_gb", "preemption_runs"], + ( + [ + row["context_band"], + row["runs"], + row["success_runs"], + row["avg_ttft_p50_ms"], + row["max_gpu_mem_peak_gb"], + row["preemption_runs"], + ] + for row in long_context_rollup + ), + ) + ) + print() + + # Depth coverage rollup + depth_coverage_rows = fetch_rows( + conn, + f""" + SELECT + context_band, + COUNT(*) AS runs, + ROUND(AVG(depth_coverage_ratio), 4) AS avg_depth_coverage, + MAX(max_actual_context_len) AS max_actual_ctx, + SUM(CASE WHEN depth_coverage_class = 'configuration_only' THEN 1 ELSE 0 END) AS config_only_runs, + SUM(CASE WHEN depth_coverage_class = 'full' THEN 1 ELSE 0 END) AS full_depth_runs + FROM {TABLE_NAME} + WHERE context_band IN ('131k', '500k', '1m') + AND depth_coverage_ratio IS NOT NULL + GROUP BY context_band + ORDER BY CASE context_band WHEN '131k' THEN 1 WHEN '500k' THEN 2 WHEN '1m' THEN 3 ELSE 99 END + """, + ) + if depth_coverage_rows: + print("Depth coverage (actual vs configured)") + print( + render_table( + ["context_band", "runs", "avg_depth_coverage", "max_actual_ctx", "config_only_runs", "full_depth_runs"], + ( + [ + row["context_band"], + row["runs"], + row["avg_depth_coverage"], + row["max_actual_ctx"], + row["config_only_runs"], + row["full_depth_runs"], + ] + for row in depth_coverage_rows + ), + ) + ) + print() + + if preemption_rows: + print("Runs with preemptions") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "preemption_count", "status"], + ( + 
[ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["preemption_count"], + row["status"], + ] + for row in preemption_rows + ), + ) + ) + print() + else: + print("Runs with preemptions: none") + print() + + if highest_memory_rows: + print("Highest peak GPU memory") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "gpu_mem_peak_gb", "kv_cache_usage_pct", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["gpu_mem_peak_gb"], + row["kv_cache_usage_pct"], + row["status"], + ] + for row in highest_memory_rows + ), + ) + ) + print() + + if slowest_ttft_rows: + print("Slowest TTFT p50 runs") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "ttft_p50_ms", "ttft_p99_ms", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["ttft_p50_ms"], + row["ttft_p99_ms"], + row["status"], + ] + for row in slowest_ttft_rows + ), + ) + ) + print() + + if highest_kv_rows: + print("Highest KV-cache usage") + print( + render_table( + ["gpu_type", "model", "engine", "context_band", "kv_cache_usage_pct", "gpu_mem_peak_gb", "status"], + ( + [ + row["gpu_type"], + row["model"], + row["engine"], + row["context_band"], + row["kv_cache_usage_pct"], + row["gpu_mem_peak_gb"], + row["status"], + ] + for row in highest_kv_rows + ), + ) + ) + + conn.close() + + +def main() -> int: + args = parse_args() + if args.command == "ingest": + insert_run(args) + elif args.command == "query": + print_query(args) + elif args.command == "export-csv": + export_csv_rows(args) + elif args.command == "summary": + print_summary(args) + else: + raise SystemExit(f"Unknown command: {args.command}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/datasets/isb1/scripts/metrics_collector.py b/datasets/isb1/scripts/metrics_collector.py new file mode 100644 index 000000000..3de1f7615 --- /dev/null +++ b/datasets/isb1/scripts/metrics_collector.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +"""Prometheus metrics scraper for ISB1 KV stress benchmarks.""" + +from __future__ import annotations + +import argparse +import asyncio +import csv +import json +import re +import signal +import statistics +import time +from pathlib import Path +from typing import Dict +from urllib.request import Request, urlopen + +PROM_LINE_RE = re.compile( + r"^\s*([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{[^}]*\})?\s+([-+]?(?:\d+\.\d*|\d*\.\d+|\d+)(?:[eE][-+]?\d+)?)\s*$" +) + +CANONICAL_METRICS: dict[str, tuple[str, ...]] = { + # Required vLLM metrics + "vllm:gpu_cache_usage_perc": ( + "vllm:gpu_cache_usage_perc", + "vllm_gpu_cache_usage_perc", + ), + "vllm:cpu_cache_usage_perc": ( + "vllm:cpu_cache_usage_perc", + "vllm_cpu_cache_usage_perc", + ), + "vllm:num_preemptions_total": ( + "vllm:num_preemptions_total", + "vllm_num_preemptions_total", + ), + "vllm:num_requests_running": ( + "vllm:num_requests_running", + "vllm_num_requests_running", + ), + "vllm:num_requests_waiting": ( + "vllm:num_requests_waiting", + "vllm_num_requests_waiting", + ), + "vllm:kv_offload_bytes_gpu_to_cpu": ( + "vllm:kv_offload_bytes_gpu_to_cpu", + "vllm_kv_offload_bytes_gpu_to_cpu", + ), + "vllm:kv_offload_bytes_cpu_to_gpu": ( + "vllm:kv_offload_bytes_cpu_to_gpu", + "vllm_kv_offload_bytes_cpu_to_gpu", + ), + "vllm:prompt_tokens_total": ( + "vllm:prompt_tokens_total", + "vllm_prompt_tokens_total", + ), + "vllm:generation_tokens_total": ( + "vllm:generation_tokens_total", + "vllm_generation_tokens_total", + 
), + # Optional but useful in vLLM + "vllm:num_requests_swapped": ( + "vllm:num_requests_swapped", + "vllm_num_requests_swapped", + ), + # PR #993 parity metrics (vLLM) + "vllm:prefix_cache_hit_rate": ( + "vllm:prefix_cache_hit_rate", + "vllm_prefix_cache_hit_rate", + ), + "vllm:cpu_prefix_cache_hit_rate": ( + "vllm:cpu_prefix_cache_hit_rate", + "vllm_cpu_prefix_cache_hit_rate", + ), + "vllm:kv_offload_time_gpu_to_cpu_seconds": ( + "vllm:kv_offload_time_gpu_to_cpu_seconds", + "vllm_kv_offload_time_gpu_to_cpu_seconds", + ), + "vllm:kv_offload_time_cpu_to_gpu_seconds": ( + "vllm:kv_offload_time_cpu_to_gpu_seconds", + "vllm_kv_offload_time_cpu_to_gpu_seconds", + ), + "vllm:prompt_tokens_local_compute": ( + "vllm:prompt_tokens_local_compute", + "vllm_prompt_tokens_local_compute", + ), + "vllm:prompt_tokens_local_cache_hit": ( + "vllm:prompt_tokens_local_cache_hit", + "vllm_prompt_tokens_local_cache_hit", + ), + "vllm:prompt_tokens_external_kv_transfer": ( + "vllm:prompt_tokens_external_kv_transfer", + "vllm_prompt_tokens_external_kv_transfer", + ), + # SGLang equivalents (best-effort) + "sglang:kv_cache_usage": ( + "sglang:kv_cache_usage", + "sglang_kv_cache_usage", + "sglang_kv_cache_utilization", + ), + "sglang:cache_hit_rate": ( + "sglang:cache_hit_rate", + "sglang_cache_hit_rate", + "sglang_radix_cache_hit_rate", + ), + "sglang:num_requests_running": ( + "sglang:num_requests_running", + "sglang_num_requests_running", + "sglang_scheduler_num_running_requests", + ), + "sglang:num_requests_waiting": ( + "sglang:num_requests_waiting", + "sglang_num_requests_waiting", + "sglang_scheduler_num_waiting_requests", + ), + "sglang:prompt_tokens_total": ( + "sglang:prompt_tokens_total", + "sglang_prompt_tokens_total", + "sglang_num_prompt_tokens_total", + ), + "sglang:generation_tokens_total": ( + "sglang:generation_tokens_total", + "sglang_generation_tokens_total", + "sglang_num_generation_tokens_total", + ), + # PR #993 parity metrics (SGLang) + "sglang:num_preemptions_total": ( + "sglang:num_preemptions_total", + "sglang_num_preemptions_total", + ), + "sglang:prefix_cache_queries_total": ( + "sglang:prefix_cache_queries_total", + "sglang_prefix_cache_queries_total", + ), +} + + +def _normalize_name(name: str) -> str: + return name.replace(":", "_") + + +def parse_prometheus_rows(payload: str) -> list[tuple[str, float]]: + rows: list[tuple[str, float]] = [] + for line in payload.splitlines(): + if not line or line.startswith("#"): + continue + match = PROM_LINE_RE.match(line) + if not match: + continue + name, raw_value = match.groups() + try: + rows.append((name, float(raw_value))) + except ValueError: + continue + return rows + + +def parse_prometheus_text(payload: str) -> Dict[str, float]: + samples: Dict[str, float] = {} + for name, value in parse_prometheus_rows(payload): + samples[name] = value + return samples + + +def map_canonical_metrics(samples: Dict[str, float]) -> Dict[str, float]: + mapped: Dict[str, float] = {} + + normalized_index: Dict[str, float] = {} + for key, value in samples.items(): + normalized_index[_normalize_name(key)] = value + + for canonical_name, aliases in CANONICAL_METRICS.items(): + value = None + for alias in aliases: + if alias in samples: + value = samples[alias] + break + alias_norm = _normalize_name(alias) + if alias_norm in normalized_index: + value = normalized_index[alias_norm] + break + if value is not None: + mapped[canonical_name] = value + + return mapped + + +def fetch_metrics(metrics_url: str, timeout_s: float = 5.0) -> str: + request = 
Request(metrics_url, headers={"Accept": "text/plain"}) + with urlopen(request, timeout=timeout_s) as response: # nosec B310 + return response.read().decode("utf-8", errors="replace") + + +def _percentile(values: list[float], p: float) -> float: + if not values: + return 0.0 + if len(values) == 1: + return values[0] + sorted_values = sorted(values) + rank = (len(sorted_values) - 1) * p + lo = int(rank) + hi = min(lo + 1, len(sorted_values) - 1) + frac = rank - lo + return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac + + +def _build_summary(metric_values: dict[str, list[float]]) -> dict[str, dict[str, float]]: + summary: dict[str, dict[str, float]] = {} + for metric_name, values in metric_values.items(): + if not values: + continue + summary[metric_name] = { + "count": float(len(values)), + "min": min(values), + "max": max(values), + "mean": statistics.fmean(values), + "p50": _percentile(values, 0.50), + "p99": _percentile(values, 0.99), + } + return summary + + +async def scrape_loop( + metrics_url: str, + output_path: Path, + interval_s: float, + duration_s: float, + wide: bool, + summary_json_path: Path | None, +) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + + stop_event = asyncio.Event() + + def _request_stop(*_: object) -> None: + stop_event.set() + + try: + loop = asyncio.get_running_loop() + loop.add_signal_handler(signal.SIGINT, _request_stop) + loop.add_signal_handler(signal.SIGTERM, _request_stop) + except NotImplementedError: + pass + + started_at = time.time() + metric_values: dict[str, list[float]] = {} + + wide_path = output_path.with_name("kv_metrics_wide.csv") + + with output_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow(["timestamp", "metric_name", "metric_value"]) + + wide_file = None + wide_writer = None + if wide: + wide_file = wide_path.open("w", newline="", encoding="utf-8") + wide_writer = csv.writer(wide_file) + wide_writer.writerow(["timestamp", "metric_name", "metric_value"]) + + try: + while not stop_event.is_set(): + now = time.time() + if duration_s > 0 and (now - started_at) >= duration_s: + break + + try: + raw_text = await asyncio.to_thread(fetch_metrics, metrics_url) + raw_rows = parse_prometheus_rows(raw_text) + samples = parse_prometheus_text(raw_text) + mapped = map_canonical_metrics(samples) + + if wide_writer is not None: + for raw_metric_name, raw_metric_value in raw_rows: + wide_writer.writerow( + [f"{now:.3f}", raw_metric_name, f"{raw_metric_value:.8f}"] + ) + wide_file.flush() + + for metric_name, metric_value in mapped.items(): + writer.writerow([f"{now:.3f}", metric_name, f"{metric_value:.8f}"]) + metric_values.setdefault(metric_name, []).append(metric_value) + f.flush() + except Exception as exc: + writer.writerow([f"{now:.3f}", "collector:error", repr(exc)]) + f.flush() + + await asyncio.sleep(interval_s) + finally: + if wide_file is not None: + wide_file.close() + + if summary_json_path is not None: + summary_json_path.parent.mkdir(parents=True, exist_ok=True) + summary_json_path.write_text( + json.dumps(_build_summary(metric_values), indent=2, sort_keys=True), + encoding="utf-8", + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Scrape Prometheus metrics into CSV") + parser.add_argument( + "--metrics-url", + default="http://0.0.0.0:8888/metrics", + help="Prometheus endpoint URL", + ) + parser.add_argument( + "--output", + default="kv_metrics.csv", + help="CSV output path", + ) + parser.add_argument( + "--interval", 
+        type=float,
+        default=2.0,
+        help="Scrape interval in seconds",
+    )
+    parser.add_argument(
+        "--duration",
+        type=float,
+        default=0.0,
+        help="Optional max duration in seconds (0 means run until interrupted)",
+    )
+    parser.add_argument(
+        "--wide",
+        action="store_true",
+        help="Also scrape all non-comment Prometheus metric lines into kv_metrics_wide.csv",
+    )
+    parser.add_argument(
+        "--summary-json",
+        nargs="?",
+        const="kv_metrics_summary.json",
+        default=None,
+        help="Write per-metric min/max/mean/p50/p99 summary JSON (default: kv_metrics_summary.json)",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    summary_json_path = Path(args.summary_json) if args.summary_json else None
+    asyncio.run(
+        scrape_loop(
+            metrics_url=args.metrics_url,
+            output_path=Path(args.output),
+            interval_s=max(args.interval, 0.1),
+            duration_s=max(args.duration, 0.0),
+            wide=args.wide,
+            summary_json_path=summary_json_path,
+        )
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/datasets/isb1/scripts/plot_pareto.py b/datasets/isb1/scripts/plot_pareto.py
new file mode 100644
index 000000000..964696ad1
--- /dev/null
+++ b/datasets/isb1/scripts/plot_pareto.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Compute Pareto frontier for KV sweep throughput vs p99 TTFT")
+    parser.add_argument("--db-path", default=None, help="SQLite DB path (benchmark_runs)")
+    parser.add_argument("--json-dir", default=None, help="Directory containing sweep summary JSON files")
+    parser.add_argument("--output-dir", required=True, help="Directory for pareto outputs")
+    return parser.parse_args()
+
+
+def _to_float(value: Any) -> float | None:
+    if value in (None, ""):
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def load_rows_from_db(db_path: Path) -> list[dict[str, Any]]:
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    # benchmark_runs (created by isb1_results_db.py) has no max_concurrency
+    # column, so recover concurrency from the stashed raw_result_json instead
+    # of selecting a nonexistent column.
+    rows = conn.execute(
+        """
+        SELECT offload_mode, ttft_p99_ms, throughput_tok_s, raw_result_json
+        FROM benchmark_runs
+        WHERE offload_mode IS NOT NULL
+          AND ttft_p99_ms IS NOT NULL
+          AND throughput_tok_s IS NOT NULL
+        ORDER BY id ASC
+        """
+    ).fetchall()
+    conn.close()
+
+    normalized: list[dict[str, Any]] = []
+    for row in rows:
+        concurrency = None
+        if row["raw_result_json"]:
+            try:
+                payload = json.loads(row["raw_result_json"])
+                concurrency = payload.get("conc") or payload.get("max_concurrency")
+            except Exception:
+                pass
+        normalized.append(
+            {
+                "offload_mode": row["offload_mode"],
+                "concurrency": int(concurrency) if concurrency not in (None, "") else None,
+                "throughput_tok_s": _to_float(row["throughput_tok_s"]),
+                "ttft_p99_ms": _to_float(row["ttft_p99_ms"]),
+                "source": "db",
+            }
+        )
+    return normalized
+
+
+def load_rows_from_json_dir(json_dir: Path) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    for path in sorted(json_dir.glob("*.json")):
+        try:
+            payload = json.loads(path.read_text(encoding="utf-8"))
+        except Exception:
+            continue
+
+        if isinstance(payload, dict) and isinstance(payload.get("summary"), list):
+            for row in payload["summary"]:
+                rows.append(
+                    {
+                        "offload_mode": row.get("offload_mode"),
+                        "concurrency": row.get("concurrency"),
"throughput_tok_s": _to_float(row.get("throughput_tok_s")), + "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), + "source": str(path.name), + } + ) + elif isinstance(payload, list): + for row in payload: + if isinstance(row, dict): + rows.append( + { + "offload_mode": row.get("offload_mode"), + "concurrency": row.get("concurrency"), + "throughput_tok_s": _to_float(row.get("throughput_tok_s")), + "ttft_p99_ms": _to_float(row.get("ttft_p99_ms")), + "source": str(path.name), + } + ) + return rows + + +def compute_pareto_frontier(points: list[dict[str, Any]]) -> list[dict[str, Any]]: + valid = [p for p in points if p["throughput_tok_s"] is not None and p["ttft_p99_ms"] is not None] + if not valid: + return [] + + # maximize throughput, minimize ttft_p99_ms + sorted_points = sorted(valid, key=lambda p: (p["throughput_tok_s"], -p["ttft_p99_ms"]), reverse=True) + frontier: list[dict[str, Any]] = [] + best_latency = float("inf") + for point in sorted_points: + latency = point["ttft_p99_ms"] + if latency <= best_latency: + frontier.append(point) + best_latency = latency + return sorted(frontier, key=lambda p: (p["throughput_tok_s"], p["ttft_p99_ms"])) + + +def write_csv(path: Path, rows: list[dict[str, Any]], frontier_keys: set[tuple[str, int | None, float, float]]) -> None: + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle) + writer.writerow(["offload_mode", "concurrency", "throughput_tok_s", "ttft_p99_ms", "is_frontier", "source"]) + for row in rows: + key = (row.get("offload_mode") or "", row.get("concurrency"), row.get("throughput_tok_s") or 0.0, row.get("ttft_p99_ms") or 0.0) + writer.writerow([ + row.get("offload_mode"), + row.get("concurrency"), + row.get("throughput_tok_s"), + row.get("ttft_p99_ms"), + key in frontier_keys, + row.get("source"), + ]) + + +def maybe_write_plot(output_path: Path, grouped_frontiers: dict[str, list[dict[str, Any]]]) -> bool: + try: + import matplotlib.pyplot as plt # type: ignore + except Exception: + return False + + plt.figure(figsize=(10, 6)) + for mode, frontier in sorted(grouped_frontiers.items()): + x = [p["throughput_tok_s"] for p in frontier] + y = [p["ttft_p99_ms"] for p in frontier] + if not x: + continue + plt.plot(x, y, marker="o", label=mode) + plt.xlabel("Throughput (tokens/sec)") + plt.ylabel("p99 TTFT (ms)") + plt.title("Pareto Frontier by Offload Mode") + plt.legend() + plt.grid(True, alpha=0.3) + output_path.parent.mkdir(parents=True, exist_ok=True) + plt.tight_layout() + plt.savefig(output_path) + plt.close() + return True + + +def main() -> int: + args = parse_args() + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + if not args.db_path and not args.json_dir: + raise SystemExit("Provide --db-path or --json-dir") + + rows: list[dict[str, Any]] = [] + if args.db_path: + rows.extend(load_rows_from_db(Path(args.db_path))) + if args.json_dir: + rows.extend(load_rows_from_json_dir(Path(args.json_dir))) + + grouped: dict[str, list[dict[str, Any]]] = {} + for row in rows: + mode = row.get("offload_mode") + if not mode: + continue + grouped.setdefault(mode, []).append(row) + + grouped_frontiers: dict[str, list[dict[str, Any]]] = {} + for mode, points in grouped.items(): + grouped_frontiers[mode] = compute_pareto_frontier(points) + + frontier_keys: set[tuple[str, int | None, float, float]] = set() + for mode, frontier in grouped_frontiers.items(): + for point in frontier: + frontier_keys.add((mode, point.get("concurrency"), point.get("throughput_tok_s") or 0.0, 
point.get("ttft_p99_ms") or 0.0)) + + csv_path = output_dir / "pareto_data.csv" + write_csv(csv_path, rows, frontier_keys) + + summary = { + "total_points": len(rows), + "offload_modes": sorted(grouped.keys()), + "frontier": {mode: frontier for mode, frontier in grouped_frontiers.items()}, + } + summary_path = output_dir / "pareto_summary.json" + summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8") + + plot_written = maybe_write_plot(output_dir / "pareto_frontier.png", grouped_frontiers) + + print(f"Wrote: {csv_path}") + print(f"Wrote: {summary_path}") + if plot_written: + print(f"Wrote: {output_dir / 'pareto_frontier.png'}") + else: + print("Skipped pareto_frontier.png (matplotlib unavailable)") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/README.md b/experimental/README.md index f39dfc4af..8ba1ba9b5 100644 --- a/experimental/README.md +++ b/experimental/README.md @@ -1,5 +1,11 @@ # Experimental -This folder contains experimental WIP code that is mostly Claude Code generated. +This folder contains experimental WIP code and planning material. -**Warning:** Code in this directory is very basic and likely contains errors or incomplete implementations. It is not intended for production use or as part of the official InferenceMAX results. +Relevant roadmap docs: + +For the current official ISB1 support statement, use: +- `datasets/isb1/SUPPORT_MATRIX.md` +- `datasets/isb1/README.md` + +**Warning:** code and notes in this directory may be incomplete, experimental, or future-looking. They are not by themselves the official statement of supported InferenceX ISB1 capability. diff --git a/experimental/multiturn/README.md b/experimental/multiturn/README.md index 05b22f67e..fd9114b37 100644 --- a/experimental/multiturn/README.md +++ b/experimental/multiturn/README.md @@ -1,16 +1,27 @@ -## Experimental WIP: Multi turn with/without CPU KVCache Offloading - -lit review -- https://lmsys.org/blog/2025-09-10-sglang-hicache/ -- sglang calls GPU HBM as (L1) and CPU DRAM as (L2) -- https://lmsys.org/images/blog/hicache/mooncake_benchmark.png -- single turn long context Q&A https://arxiv.org/abs/2311.04939 (seems more like an shared prefix style similar to cascade attention (pre cursor to sglang radix attention )) https://flashinfer.ai/2024/02/02/cascade-inference.html -- synethic & sharegpt vllm multi turn datasets https://github.com/vllm-project/vllm/tree/main/benchmarks/multi_turn -- Production Alibiba Multi turn dataset https://arxiv.org/abs/2506.02634 (seem to not provide the acutal prompts and outputs tho, more just prompt lengths and output lengths, etc.) -- sglang synthetic multi turn benchmark script here https://github.com/sgl-project/sglang/tree/main/benchmark/hicache -- interestingly sglang blog simulates PD disagg via just setting OSL as 1 -- MT-bench https://arxiv.org/abs/2402.14762 -```bash -python3 benchmark/hicache/bench_multiturn.py --model-path $MODEL_PATH --disable-random-sample \ ---output-length 1 --request-length 2048 \ # simulate P-D disaggregation -``` +# Experimental multiturn notes + +This directory contains working notes, investigations, and planning material for multiturn and long-context benchmarking. + +## Official ISB1 replay status lives elsewhere + +Do **not** treat this directory as the source of truth for the currently supported InferenceX ISB1 surface. 
+ +For the official, reviewable statement of what is landed now, use: +- `datasets/isb1/SUPPORT_MATRIX.md` +- `datasets/isb1/README.md` +- `.github/configs/isb1-master.yaml` + +## Relevant roadmap docs + +- `ISB1_MULTITURN_LONG_CONTEXT_CANONICAL_SYNTHESIS_2026-04-09.md` — canonical synthesis for next implementation phases; use this first for planning context. +- `ISB1_INFERENCEX_PHASED_PR_ROADMAP_2026-04-09.md` — phased landing plan used to split schema/workflow/data/extension/polish work into mergeable stages. + +## Scope warning + +Files in this directory may discuss future or experimental directions such as: +- KV offload investigations +- synthetic multiturn ideas +- broader long-context expansion +- experiments outside the currently merged official replay lane + +Those notes are useful for planning, but they are **not** themselves an official support claim. diff --git a/experimental/multiturn/vllm_benchmark/.gitignore b/experimental/multiturn/vllm_benchmark/.gitignore new file mode 100644 index 000000000..5c371b81e --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/.gitignore @@ -0,0 +1,7 @@ +# Python +__pycache__/ +*.pyc + +# Generated artifacts +*.log +*.tmp diff --git a/experimental/multiturn/vllm_benchmark/README.md b/experimental/multiturn/vllm_benchmark/README.md new file mode 100644 index 000000000..b2ea6f175 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/README.md @@ -0,0 +1,33 @@ +# vLLM Benchmark (Experimental) + +This directory tracks the PR #993 parity surface for multi-turn trace replay and KV stress experiments. + +## Trace sources + +- **ISB-1 exports**: existing committed replay exports. +- **kv-cache-tester**: `kv-cache-tester/` is a placeholder for the external trace replay repo. +- **AIPerf synthetic traces**: `aiperf_traces/` provides fallback synthetic traces. + +## Analysis tools + +The parity analysis scripts live under `datasets/isb1/scripts/`: + +- `plot_pareto.py` +- `analyze_benchmark_distributions.py` +- `collect_sweep_results.py` +- `adapt_trace_replay_result.py` + +## LMCache variants + +LMCache launch helpers are under `launch/`: + +- `lmcache_vllm_h200.sh` +- `lmcache_vllm_b200.sh` + +## Per-hardware replay scripts + +Trace replay scripts are under `scripts/` for per-model/per-engine/per-hardware combinations. + +--- + +**Experimental infrastructure. 
Not part of official ISB-1 support matrix.** diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json b/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json new file mode 100644 index 000000000..683556038 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf_traces/aiperf_synthetic_traces.json @@ -0,0 +1,5559 @@ +{ + "sessions": [ + { + "turns": [ + { + "role": "user", + "content_token_count": 4355, + "target_output_tokens": 229 + }, + { + "role": "user", + "content_token_count": 13955, + "target_output_tokens": 384 + }, + { + "role": "user", + "content_token_count": 1941, + "target_output_tokens": 89 + }, + { + "role": "user", + "content_token_count": 11403, + "target_output_tokens": 2247 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13567, + "target_output_tokens": 663 + }, + { + "role": "user", + "content_token_count": 49742, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 13186, + "target_output_tokens": 686 + }, + { + "role": "user", + "content_token_count": 7600, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 5978, + "target_output_tokens": 385 + }, + { + "role": "user", + "content_token_count": 1998, + "target_output_tokens": 706 + }, + { + "role": "user", + "content_token_count": 1582, + "target_output_tokens": 667 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14644, + "target_output_tokens": 467 + }, + { + "role": "user", + "content_token_count": 20321, + "target_output_tokens": 971 + }, + { + "role": "user", + "content_token_count": 2950, + "target_output_tokens": 274 + }, + { + "role": "user", + "content_token_count": 4932, + "target_output_tokens": 680 + }, + { + "role": "user", + "content_token_count": 9971, + "target_output_tokens": 706 + }, + { + "role": "user", + "content_token_count": 3348, + "target_output_tokens": 440 + }, + { + "role": "user", + "content_token_count": 13343, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 6230, + "target_output_tokens": 2231 + }, + { + "role": "user", + "content_token_count": 8168, + "target_output_tokens": 421 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1487, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 2684, + "target_output_tokens": 549 + }, + { + "role": "user", + "content_token_count": 3065, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 12135, + "target_output_tokens": 1145 + }, + { + "role": "user", + "content_token_count": 14716, + "target_output_tokens": 1074 + }, + { + "role": "user", + "content_token_count": 16644, + "target_output_tokens": 1062 + }, + { + "role": "user", + "content_token_count": 12355, + "target_output_tokens": 285 + }, + { + "role": "user", + "content_token_count": 3108, + "target_output_tokens": 291 + }, + { + "role": "user", + "content_token_count": 7234, + "target_output_tokens": 1235 + }, + { + "role": "user", + "content_token_count": 25179, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 6480, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 13902, + "target_output_tokens": 652 + }, + { + "role": "user", + "content_token_count": 6014, + "target_output_tokens": 1037 + }, + { + "role": "user", + "content_token_count": 41352, + "target_output_tokens": 649 + }, + { + "role": "user", + 
"content_token_count": 8852, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 8795, + "target_output_tokens": 736 + }, + { + "role": "user", + "content_token_count": 27778, + "target_output_tokens": 373 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6962, + "target_output_tokens": 1351 + }, + { + "role": "user", + "content_token_count": 2614, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 11529, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 5165, + "target_output_tokens": 653 + }, + { + "role": "user", + "content_token_count": 2132, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 5290, + "target_output_tokens": 614 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23469, + "target_output_tokens": 546 + }, + { + "role": "user", + "content_token_count": 7665, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 27018, + "target_output_tokens": 1332 + }, + { + "role": "user", + "content_token_count": 1887, + "target_output_tokens": 326 + }, + { + "role": "user", + "content_token_count": 5249, + "target_output_tokens": 346 + }, + { + "role": "user", + "content_token_count": 7443, + "target_output_tokens": 828 + }, + { + "role": "user", + "content_token_count": 6496, + "target_output_tokens": 100 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9221, + "target_output_tokens": 430 + }, + { + "role": "user", + "content_token_count": 7697, + "target_output_tokens": 1197 + }, + { + "role": "user", + "content_token_count": 5421, + "target_output_tokens": 277 + }, + { + "role": "user", + "content_token_count": 8799, + "target_output_tokens": 540 + }, + { + "role": "user", + "content_token_count": 14993, + "target_output_tokens": 768 + }, + { + "role": "user", + "content_token_count": 28612, + "target_output_tokens": 581 + }, + { + "role": "user", + "content_token_count": 42160, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 9846, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 15085, + "target_output_tokens": 302 + }, + { + "role": "user", + "content_token_count": 8267, + "target_output_tokens": 596 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23256, + "target_output_tokens": 821 + }, + { + "role": "user", + "content_token_count": 36819, + "target_output_tokens": 183 + }, + { + "role": "user", + "content_token_count": 1590, + "target_output_tokens": 2201 + }, + { + "role": "user", + "content_token_count": 12229, + "target_output_tokens": 1265 + }, + { + "role": "user", + "content_token_count": 7483, + "target_output_tokens": 1819 + }, + { + "role": "user", + "content_token_count": 2288, + "target_output_tokens": 970 + }, + { + "role": "user", + "content_token_count": 33871, + "target_output_tokens": 703 + }, + { + "role": "user", + "content_token_count": 8650, + "target_output_tokens": 147 + }, + { + "role": "user", + "content_token_count": 10018, + "target_output_tokens": 487 + }, + { + "role": "user", + "content_token_count": 21103, + "target_output_tokens": 805 + }, + { + "role": "user", + "content_token_count": 17500, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 1678, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 29345, + "target_output_tokens": 303 + }, + { + "role": "user", + 
"content_token_count": 4555, + "target_output_tokens": 483 + }, + { + "role": "user", + "content_token_count": 39008, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 3284, + "target_output_tokens": 142 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7400, + "target_output_tokens": 948 + }, + { + "role": "user", + "content_token_count": 3992, + "target_output_tokens": 387 + }, + { + "role": "user", + "content_token_count": 8450, + "target_output_tokens": 313 + }, + { + "role": "user", + "content_token_count": 8606, + "target_output_tokens": 89 + }, + { + "role": "user", + "content_token_count": 4775, + "target_output_tokens": 3004 + }, + { + "role": "user", + "content_token_count": 44546, + "target_output_tokens": 758 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10548, + "target_output_tokens": 522 + }, + { + "role": "user", + "content_token_count": 23492, + "target_output_tokens": 463 + }, + { + "role": "user", + "content_token_count": 2803, + "target_output_tokens": 3146 + }, + { + "role": "user", + "content_token_count": 2080, + "target_output_tokens": 257 + }, + { + "role": "user", + "content_token_count": 8416, + "target_output_tokens": 1401 + }, + { + "role": "user", + "content_token_count": 3410, + "target_output_tokens": 4096 + }, + { + "role": "user", + "content_token_count": 20886, + "target_output_tokens": 246 + }, + { + "role": "user", + "content_token_count": 16891, + "target_output_tokens": 111 + }, + { + "role": "user", + "content_token_count": 4933, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 5560, + "target_output_tokens": 634 + }, + { + "role": "user", + "content_token_count": 8380, + "target_output_tokens": 158 + }, + { + "role": "user", + "content_token_count": 17894, + "target_output_tokens": 278 + }, + { + "role": "user", + "content_token_count": 4907, + "target_output_tokens": 312 + }, + { + "role": "user", + "content_token_count": 5810, + "target_output_tokens": 1418 + }, + { + "role": "user", + "content_token_count": 6056, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 6750, + "target_output_tokens": 279 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6845, + "target_output_tokens": 83 + }, + { + "role": "user", + "content_token_count": 3847, + "target_output_tokens": 2093 + }, + { + "role": "user", + "content_token_count": 2327, + "target_output_tokens": 926 + }, + { + "role": "user", + "content_token_count": 11838, + "target_output_tokens": 453 + }, + { + "role": "user", + "content_token_count": 5787, + "target_output_tokens": 1590 + }, + { + "role": "user", + "content_token_count": 16091, + "target_output_tokens": 84 + }, + { + "role": "user", + "content_token_count": 15625, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 24568, + "target_output_tokens": 789 + }, + { + "role": "user", + "content_token_count": 25763, + "target_output_tokens": 605 + }, + { + "role": "user", + "content_token_count": 20307, + "target_output_tokens": 570 + }, + { + "role": "user", + "content_token_count": 6868, + "target_output_tokens": 294 + }, + { + "role": "user", + "content_token_count": 18094, + "target_output_tokens": 170 + }, + { + "role": "user", + "content_token_count": 4778, + "target_output_tokens": 511 + }, + { + "role": "user", + "content_token_count": 3934, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 12163, 
+ "target_output_tokens": 795 + }, + { + "role": "user", + "content_token_count": 12752, + "target_output_tokens": 3072 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17618, + "target_output_tokens": 1691 + }, + { + "role": "user", + "content_token_count": 12217, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 31341, + "target_output_tokens": 777 + }, + { + "role": "user", + "content_token_count": 2248, + "target_output_tokens": 1106 + }, + { + "role": "user", + "content_token_count": 11819, + "target_output_tokens": 812 + }, + { + "role": "user", + "content_token_count": 5636, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 5477, + "target_output_tokens": 403 + }, + { + "role": "user", + "content_token_count": 19604, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 8663, + "target_output_tokens": 865 + }, + { + "role": "user", + "content_token_count": 16969, + "target_output_tokens": 407 + }, + { + "role": "user", + "content_token_count": 22672, + "target_output_tokens": 371 + }, + { + "role": "user", + "content_token_count": 4500, + "target_output_tokens": 257 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6952, + "target_output_tokens": 1454 + }, + { + "role": "user", + "content_token_count": 21170, + "target_output_tokens": 1383 + }, + { + "role": "user", + "content_token_count": 9252, + "target_output_tokens": 209 + }, + { + "role": "user", + "content_token_count": 6023, + "target_output_tokens": 155 + }, + { + "role": "user", + "content_token_count": 30200, + "target_output_tokens": 2025 + }, + { + "role": "user", + "content_token_count": 8146, + "target_output_tokens": 132 + }, + { + "role": "user", + "content_token_count": 15151, + "target_output_tokens": 300 + }, + { + "role": "user", + "content_token_count": 6381, + "target_output_tokens": 739 + }, + { + "role": "user", + "content_token_count": 3225, + "target_output_tokens": 454 + }, + { + "role": "user", + "content_token_count": 5177, + "target_output_tokens": 2094 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17308, + "target_output_tokens": 484 + }, + { + "role": "user", + "content_token_count": 27306, + "target_output_tokens": 413 + }, + { + "role": "user", + "content_token_count": 24589, + "target_output_tokens": 1070 + }, + { + "role": "user", + "content_token_count": 7202, + "target_output_tokens": 256 + }, + { + "role": "user", + "content_token_count": 6018, + "target_output_tokens": 200 + }, + { + "role": "user", + "content_token_count": 3867, + "target_output_tokens": 593 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 16341, + "target_output_tokens": 1754 + }, + { + "role": "user", + "content_token_count": 4374, + "target_output_tokens": 1779 + }, + { + "role": "user", + "content_token_count": 5850, + "target_output_tokens": 290 + }, + { + "role": "user", + "content_token_count": 5391, + "target_output_tokens": 2242 + }, + { + "role": "user", + "content_token_count": 18534, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 1541, + "target_output_tokens": 1352 + }, + { + "role": "user", + "content_token_count": 512, + "target_output_tokens": 917 + }, + { + "role": "user", + "content_token_count": 6840, + "target_output_tokens": 397 + }, + { + "role": "user", + "content_token_count": 4664, + "target_output_tokens": 585 + }, + { + "role": "user", + "content_token_count": 
7184, + "target_output_tokens": 846 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7488, + "target_output_tokens": 545 + }, + { + "role": "user", + "content_token_count": 6149, + "target_output_tokens": 180 + }, + { + "role": "user", + "content_token_count": 18544, + "target_output_tokens": 1062 + }, + { + "role": "user", + "content_token_count": 23779, + "target_output_tokens": 962 + }, + { + "role": "user", + "content_token_count": 7158, + "target_output_tokens": 624 + }, + { + "role": "user", + "content_token_count": 5401, + "target_output_tokens": 264 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6126, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 10891, + "target_output_tokens": 787 + }, + { + "role": "user", + "content_token_count": 7206, + "target_output_tokens": 446 + }, + { + "role": "user", + "content_token_count": 14885, + "target_output_tokens": 534 + }, + { + "role": "user", + "content_token_count": 16761, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 8153, + "target_output_tokens": 322 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6173, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 7491, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 11004, + "target_output_tokens": 522 + }, + { + "role": "user", + "content_token_count": 30822, + "target_output_tokens": 733 + }, + { + "role": "user", + "content_token_count": 16828, + "target_output_tokens": 660 + }, + { + "role": "user", + "content_token_count": 10930, + "target_output_tokens": 2180 + }, + { + "role": "user", + "content_token_count": 9511, + "target_output_tokens": 182 + }, + { + "role": "user", + "content_token_count": 9162, + "target_output_tokens": 683 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 28818, + "target_output_tokens": 245 + }, + { + "role": "user", + "content_token_count": 6134, + "target_output_tokens": 472 + }, + { + "role": "user", + "content_token_count": 6634, + "target_output_tokens": 813 + }, + { + "role": "user", + "content_token_count": 10762, + "target_output_tokens": 182 + }, + { + "role": "user", + "content_token_count": 5519, + "target_output_tokens": 1891 + }, + { + "role": "user", + "content_token_count": 9813, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 27459, + "target_output_tokens": 1087 + }, + { + "role": "user", + "content_token_count": 11085, + "target_output_tokens": 192 + }, + { + "role": "user", + "content_token_count": 13108, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 24568, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 12813, + "target_output_tokens": 800 + }, + { + "role": "user", + "content_token_count": 6876, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 9155, + "target_output_tokens": 4096 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5653, + "target_output_tokens": 908 + }, + { + "role": "user", + "content_token_count": 2275, + "target_output_tokens": 410 + }, + { + "role": "user", + "content_token_count": 3348, + "target_output_tokens": 708 + }, + { + "role": "user", + "content_token_count": 7689, + "target_output_tokens": 448 + }, + { + "role": "user", + "content_token_count": 8998, + "target_output_tokens": 1126 + }, + { + "role": "user", 
+ "content_token_count": 1847, + "target_output_tokens": 1767 + }, + { + "role": "user", + "content_token_count": 5015, + "target_output_tokens": 484 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 37087, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 9919, + "target_output_tokens": 3052 + }, + { + "role": "user", + "content_token_count": 3728, + "target_output_tokens": 265 + }, + { + "role": "user", + "content_token_count": 13398, + "target_output_tokens": 274 + }, + { + "role": "user", + "content_token_count": 5429, + "target_output_tokens": 994 + }, + { + "role": "user", + "content_token_count": 998, + "target_output_tokens": 116 + }, + { + "role": "user", + "content_token_count": 1326, + "target_output_tokens": 718 + }, + { + "role": "user", + "content_token_count": 9401, + "target_output_tokens": 712 + }, + { + "role": "user", + "content_token_count": 9097, + "target_output_tokens": 84 + }, + { + "role": "user", + "content_token_count": 5568, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 29693, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 4150, + "target_output_tokens": 804 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13188, + "target_output_tokens": 1389 + }, + { + "role": "user", + "content_token_count": 20963, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 15129, + "target_output_tokens": 325 + }, + { + "role": "user", + "content_token_count": 7575, + "target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 20166, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 7192, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 10367, + "target_output_tokens": 610 + }, + { + "role": "user", + "content_token_count": 5248, + "target_output_tokens": 157 + }, + { + "role": "user", + "content_token_count": 9240, + "target_output_tokens": 216 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2873, + "target_output_tokens": 154 + }, + { + "role": "user", + "content_token_count": 10140, + "target_output_tokens": 2818 + }, + { + "role": "user", + "content_token_count": 4864, + "target_output_tokens": 1018 + }, + { + "role": "user", + "content_token_count": 10400, + "target_output_tokens": 210 + }, + { + "role": "user", + "content_token_count": 9931, + "target_output_tokens": 431 + }, + { + "role": "user", + "content_token_count": 19920, + "target_output_tokens": 1335 + }, + { + "role": "user", + "content_token_count": 12765, + "target_output_tokens": 479 + }, + { + "role": "user", + "content_token_count": 16121, + "target_output_tokens": 634 + }, + { + "role": "user", + "content_token_count": 16426, + "target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 8657, + "target_output_tokens": 606 + }, + { + "role": "user", + "content_token_count": 3219, + "target_output_tokens": 126 + }, + { + "role": "user", + "content_token_count": 3934, + "target_output_tokens": 90 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 29139, + "target_output_tokens": 283 + }, + { + "role": "user", + "content_token_count": 11018, + "target_output_tokens": 2117 + }, + { + "role": "user", + "content_token_count": 12413, + "target_output_tokens": 123 + }, + { + "role": "user", + "content_token_count": 4620, + "target_output_tokens": 1279 + }, + { + "role": "user", 
+ "content_token_count": 14998, + "target_output_tokens": 857 + }, + { + "role": "user", + "content_token_count": 6874, + "target_output_tokens": 377 + }, + { + "role": "user", + "content_token_count": 9962, + "target_output_tokens": 369 + }, + { + "role": "user", + "content_token_count": 35116, + "target_output_tokens": 178 + }, + { + "role": "user", + "content_token_count": 9970, + "target_output_tokens": 516 + }, + { + "role": "user", + "content_token_count": 11643, + "target_output_tokens": 543 + }, + { + "role": "user", + "content_token_count": 14700, + "target_output_tokens": 547 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1351, + "target_output_tokens": 2192 + }, + { + "role": "user", + "content_token_count": 23550, + "target_output_tokens": 200 + }, + { + "role": "user", + "content_token_count": 2511, + "target_output_tokens": 347 + }, + { + "role": "user", + "content_token_count": 20677, + "target_output_tokens": 589 + }, + { + "role": "user", + "content_token_count": 3425, + "target_output_tokens": 1138 + }, + { + "role": "user", + "content_token_count": 22755, + "target_output_tokens": 1462 + }, + { + "role": "user", + "content_token_count": 6087, + "target_output_tokens": 840 + }, + { + "role": "user", + "content_token_count": 9876, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 5481, + "target_output_tokens": 787 + }, + { + "role": "user", + "content_token_count": 4935, + "target_output_tokens": 471 + }, + { + "role": "user", + "content_token_count": 4601, + "target_output_tokens": 373 + }, + { + "role": "user", + "content_token_count": 7449, + "target_output_tokens": 1129 + }, + { + "role": "user", + "content_token_count": 7437, + "target_output_tokens": 664 + }, + { + "role": "user", + "content_token_count": 18022, + "target_output_tokens": 609 + }, + { + "role": "user", + "content_token_count": 6651, + "target_output_tokens": 593 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3803, + "target_output_tokens": 185 + }, + { + "role": "user", + "content_token_count": 4171, + "target_output_tokens": 471 + }, + { + "role": "user", + "content_token_count": 2991, + "target_output_tokens": 2486 + }, + { + "role": "user", + "content_token_count": 11107, + "target_output_tokens": 846 + }, + { + "role": "user", + "content_token_count": 12672, + "target_output_tokens": 1246 + }, + { + "role": "user", + "content_token_count": 9802, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 7244, + "target_output_tokens": 665 + }, + { + "role": "user", + "content_token_count": 11618, + "target_output_tokens": 1037 + }, + { + "role": "user", + "content_token_count": 4494, + "target_output_tokens": 365 + }, + { + "role": "user", + "content_token_count": 3666, + "target_output_tokens": 262 + }, + { + "role": "user", + "content_token_count": 10055, + "target_output_tokens": 395 + }, + { + "role": "user", + "content_token_count": 5900, + "target_output_tokens": 778 + }, + { + "role": "user", + "content_token_count": 2260, + "target_output_tokens": 112 + }, + { + "role": "user", + "content_token_count": 3803, + "target_output_tokens": 1263 + }, + { + "role": "user", + "content_token_count": 38195, + "target_output_tokens": 1187 + }, + { + "role": "user", + "content_token_count": 15430, + "target_output_tokens": 304 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 15126, + "target_output_tokens": 363 + }, + { + "role": "user", + 
"content_token_count": 11997, + "target_output_tokens": 65 + }, + { + "role": "user", + "content_token_count": 12124, + "target_output_tokens": 304 + }, + { + "role": "user", + "content_token_count": 2942, + "target_output_tokens": 722 + }, + { + "role": "user", + "content_token_count": 10438, + "target_output_tokens": 1058 + }, + { + "role": "user", + "content_token_count": 11401, + "target_output_tokens": 517 + }, + { + "role": "user", + "content_token_count": 22839, + "target_output_tokens": 1334 + }, + { + "role": "user", + "content_token_count": 4480, + "target_output_tokens": 409 + }, + { + "role": "user", + "content_token_count": 8627, + "target_output_tokens": 625 + }, + { + "role": "user", + "content_token_count": 2553, + "target_output_tokens": 1775 + }, + { + "role": "user", + "content_token_count": 5008, + "target_output_tokens": 1304 + }, + { + "role": "user", + "content_token_count": 14883, + "target_output_tokens": 920 + }, + { + "role": "user", + "content_token_count": 14845, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 7446, + "target_output_tokens": 116 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1555, + "target_output_tokens": 87 + }, + { + "role": "user", + "content_token_count": 4544, + "target_output_tokens": 466 + }, + { + "role": "user", + "content_token_count": 3256, + "target_output_tokens": 560 + }, + { + "role": "user", + "content_token_count": 3753, + "target_output_tokens": 201 + }, + { + "role": "user", + "content_token_count": 12476, + "target_output_tokens": 1849 + }, + { + "role": "user", + "content_token_count": 8975, + "target_output_tokens": 1635 + }, + { + "role": "user", + "content_token_count": 2877, + "target_output_tokens": 355 + }, + { + "role": "user", + "content_token_count": 4514, + "target_output_tokens": 181 + }, + { + "role": "user", + "content_token_count": 5382, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 3729, + "target_output_tokens": 292 + }, + { + "role": "user", + "content_token_count": 23202, + "target_output_tokens": 850 + }, + { + "role": "user", + "content_token_count": 6266, + "target_output_tokens": 373 + }, + { + "role": "user", + "content_token_count": 2491, + "target_output_tokens": 651 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5699, + "target_output_tokens": 448 + }, + { + "role": "user", + "content_token_count": 8399, + "target_output_tokens": 96 + }, + { + "role": "user", + "content_token_count": 24606, + "target_output_tokens": 892 + }, + { + "role": "user", + "content_token_count": 1881, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 14270, + "target_output_tokens": 302 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2662, + "target_output_tokens": 159 + }, + { + "role": "user", + "content_token_count": 27451, + "target_output_tokens": 742 + }, + { + "role": "user", + "content_token_count": 6138, + "target_output_tokens": 752 + }, + { + "role": "user", + "content_token_count": 3040, + "target_output_tokens": 95 + }, + { + "role": "user", + "content_token_count": 3937, + "target_output_tokens": 394 + }, + { + "role": "user", + "content_token_count": 10143, + "target_output_tokens": 205 + }, + { + "role": "user", + "content_token_count": 4055, + "target_output_tokens": 665 + }, + { + "role": "user", + "content_token_count": 4486, + "target_output_tokens": 491 + } + ] + }, + { + "turns": [ + { + "role": "user", + 
"content_token_count": 11225, + "target_output_tokens": 3158 + }, + { + "role": "user", + "content_token_count": 5709, + "target_output_tokens": 206 + }, + { + "role": "user", + "content_token_count": 8289, + "target_output_tokens": 2061 + }, + { + "role": "user", + "content_token_count": 11501, + "target_output_tokens": 625 + }, + { + "role": "user", + "content_token_count": 3024, + "target_output_tokens": 131 + }, + { + "role": "user", + "content_token_count": 6949, + "target_output_tokens": 743 + }, + { + "role": "user", + "content_token_count": 3555, + "target_output_tokens": 205 + }, + { + "role": "user", + "content_token_count": 4155, + "target_output_tokens": 478 + }, + { + "role": "user", + "content_token_count": 11184, + "target_output_tokens": 279 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 15198, + "target_output_tokens": 865 + }, + { + "role": "user", + "content_token_count": 27300, + "target_output_tokens": 352 + }, + { + "role": "user", + "content_token_count": 4084, + "target_output_tokens": 694 + }, + { + "role": "user", + "content_token_count": 2879, + "target_output_tokens": 643 + }, + { + "role": "user", + "content_token_count": 8411, + "target_output_tokens": 1094 + }, + { + "role": "user", + "content_token_count": 3496, + "target_output_tokens": 845 + }, + { + "role": "user", + "content_token_count": 14540, + "target_output_tokens": 288 + }, + { + "role": "user", + "content_token_count": 4651, + "target_output_tokens": 385 + }, + { + "role": "user", + "content_token_count": 14792, + "target_output_tokens": 842 + }, + { + "role": "user", + "content_token_count": 6271, + "target_output_tokens": 317 + }, + { + "role": "user", + "content_token_count": 7613, + "target_output_tokens": 763 + }, + { + "role": "user", + "content_token_count": 5852, + "target_output_tokens": 418 + }, + { + "role": "user", + "content_token_count": 11166, + "target_output_tokens": 2196 + }, + { + "role": "user", + "content_token_count": 19005, + "target_output_tokens": 1055 + }, + { + "role": "user", + "content_token_count": 5886, + "target_output_tokens": 492 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4062, + "target_output_tokens": 1211 + }, + { + "role": "user", + "content_token_count": 2190, + "target_output_tokens": 717 + }, + { + "role": "user", + "content_token_count": 7556, + "target_output_tokens": 257 + }, + { + "role": "user", + "content_token_count": 5768, + "target_output_tokens": 1324 + }, + { + "role": "user", + "content_token_count": 5463, + "target_output_tokens": 1404 + }, + { + "role": "user", + "content_token_count": 19173, + "target_output_tokens": 808 + }, + { + "role": "user", + "content_token_count": 7797, + "target_output_tokens": 808 + }, + { + "role": "user", + "content_token_count": 4039, + "target_output_tokens": 414 + }, + { + "role": "user", + "content_token_count": 2391, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 1957, + "target_output_tokens": 1098 + }, + { + "role": "user", + "content_token_count": 16198, + "target_output_tokens": 852 + }, + { + "role": "user", + "content_token_count": 3101, + "target_output_tokens": 532 + }, + { + "role": "user", + "content_token_count": 4035, + "target_output_tokens": 833 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1220, + "target_output_tokens": 138 + }, + { + "role": "user", + "content_token_count": 14648, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 
8228, + "target_output_tokens": 537 + }, + { + "role": "user", + "content_token_count": 2352, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 7794, + "target_output_tokens": 259 + }, + { + "role": "user", + "content_token_count": 2734, + "target_output_tokens": 819 + }, + { + "role": "user", + "content_token_count": 17235, + "target_output_tokens": 1471 + }, + { + "role": "user", + "content_token_count": 1357, + "target_output_tokens": 762 + }, + { + "role": "user", + "content_token_count": 10804, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 16389, + "target_output_tokens": 983 + }, + { + "role": "user", + "content_token_count": 5074, + "target_output_tokens": 431 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10280, + "target_output_tokens": 119 + }, + { + "role": "user", + "content_token_count": 4370, + "target_output_tokens": 817 + }, + { + "role": "user", + "content_token_count": 6854, + "target_output_tokens": 1795 + }, + { + "role": "user", + "content_token_count": 15223, + "target_output_tokens": 543 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6116, + "target_output_tokens": 309 + }, + { + "role": "user", + "content_token_count": 6257, + "target_output_tokens": 1301 + }, + { + "role": "user", + "content_token_count": 16623, + "target_output_tokens": 1520 + }, + { + "role": "user", + "content_token_count": 9563, + "target_output_tokens": 1403 + }, + { + "role": "user", + "content_token_count": 9134, + "target_output_tokens": 840 + }, + { + "role": "user", + "content_token_count": 6453, + "target_output_tokens": 388 + }, + { + "role": "user", + "content_token_count": 2951, + "target_output_tokens": 376 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3444, + "target_output_tokens": 414 + }, + { + "role": "user", + "content_token_count": 2321, + "target_output_tokens": 901 + }, + { + "role": "user", + "content_token_count": 3638, + "target_output_tokens": 1425 + }, + { + "role": "user", + "content_token_count": 7123, + "target_output_tokens": 1696 + }, + { + "role": "user", + "content_token_count": 2057, + "target_output_tokens": 351 + }, + { + "role": "user", + "content_token_count": 18346, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 9716, + "target_output_tokens": 640 + }, + { + "role": "user", + "content_token_count": 6768, + "target_output_tokens": 388 + }, + { + "role": "user", + "content_token_count": 3788, + "target_output_tokens": 250 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2734, + "target_output_tokens": 1979 + }, + { + "role": "user", + "content_token_count": 4136, + "target_output_tokens": 2452 + }, + { + "role": "user", + "content_token_count": 7721, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 1881, + "target_output_tokens": 648 + }, + { + "role": "user", + "content_token_count": 6673, + "target_output_tokens": 406 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6955, + "target_output_tokens": 1459 + }, + { + "role": "user", + "content_token_count": 1014, + "target_output_tokens": 1007 + }, + { + "role": "user", + "content_token_count": 13098, + "target_output_tokens": 1459 + }, + { + "role": "user", + "content_token_count": 4876, + "target_output_tokens": 947 + }, + { + "role": "user", + "content_token_count": 9889, + "target_output_tokens": 1563 + }, + { + "role": "user", + 
"content_token_count": 2544, + "target_output_tokens": 3149 + }, + { + "role": "user", + "content_token_count": 9006, + "target_output_tokens": 245 + }, + { + "role": "user", + "content_token_count": 18694, + "target_output_tokens": 1384 + }, + { + "role": "user", + "content_token_count": 1467, + "target_output_tokens": 1471 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 17406, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 3679, + "target_output_tokens": 636 + }, + { + "role": "user", + "content_token_count": 2184, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 7967, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 6174, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 7180, + "target_output_tokens": 270 + }, + { + "role": "user", + "content_token_count": 10946, + "target_output_tokens": 95 + }, + { + "role": "user", + "content_token_count": 2518, + "target_output_tokens": 430 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6603, + "target_output_tokens": 646 + }, + { + "role": "user", + "content_token_count": 10518, + "target_output_tokens": 1096 + }, + { + "role": "user", + "content_token_count": 14848, + "target_output_tokens": 408 + }, + { + "role": "user", + "content_token_count": 2262, + "target_output_tokens": 499 + }, + { + "role": "user", + "content_token_count": 6591, + "target_output_tokens": 662 + }, + { + "role": "user", + "content_token_count": 5042, + "target_output_tokens": 540 + }, + { + "role": "user", + "content_token_count": 14974, + "target_output_tokens": 3408 + }, + { + "role": "user", + "content_token_count": 5658, + "target_output_tokens": 1060 + }, + { + "role": "user", + "content_token_count": 5558, + "target_output_tokens": 1785 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3100, + "target_output_tokens": 849 + }, + { + "role": "user", + "content_token_count": 12776, + "target_output_tokens": 945 + }, + { + "role": "user", + "content_token_count": 2376, + "target_output_tokens": 1003 + }, + { + "role": "user", + "content_token_count": 6865, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 3111, + "target_output_tokens": 509 + }, + { + "role": "user", + "content_token_count": 16078, + "target_output_tokens": 342 + }, + { + "role": "user", + "content_token_count": 16493, + "target_output_tokens": 733 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8957, + "target_output_tokens": 307 + }, + { + "role": "user", + "content_token_count": 19094, + "target_output_tokens": 427 + }, + { + "role": "user", + "content_token_count": 2869, + "target_output_tokens": 405 + }, + { + "role": "user", + "content_token_count": 18384, + "target_output_tokens": 185 + }, + { + "role": "user", + "content_token_count": 6443, + "target_output_tokens": 1522 + }, + { + "role": "user", + "content_token_count": 5348, + "target_output_tokens": 662 + }, + { + "role": "user", + "content_token_count": 3869, + "target_output_tokens": 175 + }, + { + "role": "user", + "content_token_count": 5106, + "target_output_tokens": 761 + }, + { + "role": "user", + "content_token_count": 16260, + "target_output_tokens": 2221 + }, + { + "role": "user", + "content_token_count": 3983, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 2900, + "target_output_tokens": 809 + } + ] + }, + { + "turns": [ + 
{ + "role": "user", + "content_token_count": 4829, + "target_output_tokens": 226 + }, + { + "role": "user", + "content_token_count": 2384, + "target_output_tokens": 491 + }, + { + "role": "user", + "content_token_count": 26292, + "target_output_tokens": 659 + }, + { + "role": "user", + "content_token_count": 12843, + "target_output_tokens": 692 + }, + { + "role": "user", + "content_token_count": 3004, + "target_output_tokens": 300 + }, + { + "role": "user", + "content_token_count": 21070, + "target_output_tokens": 1321 + }, + { + "role": "user", + "content_token_count": 12368, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 6159, + "target_output_tokens": 1480 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5460, + "target_output_tokens": 249 + }, + { + "role": "user", + "content_token_count": 9185, + "target_output_tokens": 229 + }, + { + "role": "user", + "content_token_count": 29343, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 7542, + "target_output_tokens": 1027 + }, + { + "role": "user", + "content_token_count": 3182, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 9888, + "target_output_tokens": 1865 + }, + { + "role": "user", + "content_token_count": 7401, + "target_output_tokens": 854 + }, + { + "role": "user", + "content_token_count": 6561, + "target_output_tokens": 654 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6488, + "target_output_tokens": 77 + }, + { + "role": "user", + "content_token_count": 6158, + "target_output_tokens": 374 + }, + { + "role": "user", + "content_token_count": 12575, + "target_output_tokens": 1325 + }, + { + "role": "user", + "content_token_count": 18730, + "target_output_tokens": 325 + }, + { + "role": "user", + "content_token_count": 2581, + "target_output_tokens": 1027 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 1888 + }, + { + "role": "user", + "content_token_count": 1787, + "target_output_tokens": 970 + }, + { + "role": "user", + "content_token_count": 7304, + "target_output_tokens": 181 + }, + { + "role": "user", + "content_token_count": 4038, + "target_output_tokens": 2854 + }, + { + "role": "user", + "content_token_count": 9441, + "target_output_tokens": 985 + }, + { + "role": "user", + "content_token_count": 5386, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 895, + "target_output_tokens": 550 + }, + { + "role": "user", + "content_token_count": 3238, + "target_output_tokens": 467 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9749, + "target_output_tokens": 594 + }, + { + "role": "user", + "content_token_count": 6586, + "target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 13734, + "target_output_tokens": 1592 + }, + { + "role": "user", + "content_token_count": 4723, + "target_output_tokens": 2155 + }, + { + "role": "user", + "content_token_count": 19342, + "target_output_tokens": 161 + }, + { + "role": "user", + "content_token_count": 7921, + "target_output_tokens": 130 + }, + { + "role": "user", + "content_token_count": 26045, + "target_output_tokens": 613 + }, + { + "role": "user", + "content_token_count": 9327, + "target_output_tokens": 158 + }, + { + "role": "user", + "content_token_count": 5054, + "target_output_tokens": 652 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 753 + }, + { + "role": "user", + 
"content_token_count": 13763, + "target_output_tokens": 501 + }, + { + "role": "user", + "content_token_count": 7809, + "target_output_tokens": 618 + }, + { + "role": "user", + "content_token_count": 1780, + "target_output_tokens": 1609 + }, + { + "role": "user", + "content_token_count": 13566, + "target_output_tokens": 219 + }, + { + "role": "user", + "content_token_count": 8244, + "target_output_tokens": 707 + }, + { + "role": "user", + "content_token_count": 3690, + "target_output_tokens": 2575 + }, + { + "role": "user", + "content_token_count": 8579, + "target_output_tokens": 289 + }, + { + "role": "user", + "content_token_count": 13461, + "target_output_tokens": 835 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7460, + "target_output_tokens": 564 + }, + { + "role": "user", + "content_token_count": 12306, + "target_output_tokens": 643 + }, + { + "role": "user", + "content_token_count": 4237, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 2239, + "target_output_tokens": 1437 + }, + { + "role": "user", + "content_token_count": 4323, + "target_output_tokens": 1610 + }, + { + "role": "user", + "content_token_count": 8322, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 8307, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 8038, + "target_output_tokens": 221 + }, + { + "role": "user", + "content_token_count": 9312, + "target_output_tokens": 119 + }, + { + "role": "user", + "content_token_count": 8570, + "target_output_tokens": 1070 + }, + { + "role": "user", + "content_token_count": 43634, + "target_output_tokens": 801 + }, + { + "role": "user", + "content_token_count": 9896, + "target_output_tokens": 559 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11595, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 8292, + "target_output_tokens": 942 + }, + { + "role": "user", + "content_token_count": 3946, + "target_output_tokens": 490 + }, + { + "role": "user", + "content_token_count": 2955, + "target_output_tokens": 712 + }, + { + "role": "user", + "content_token_count": 4839, + "target_output_tokens": 272 + }, + { + "role": "user", + "content_token_count": 4011, + "target_output_tokens": 335 + }, + { + "role": "user", + "content_token_count": 5086, + "target_output_tokens": 315 + }, + { + "role": "user", + "content_token_count": 5209, + "target_output_tokens": 764 + }, + { + "role": "user", + "content_token_count": 6710, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 2382, + "target_output_tokens": 277 + }, + { + "role": "user", + "content_token_count": 18762, + "target_output_tokens": 312 + }, + { + "role": "user", + "content_token_count": 3554, + "target_output_tokens": 393 + }, + { + "role": "user", + "content_token_count": 10240, + "target_output_tokens": 130 + }, + { + "role": "user", + "content_token_count": 10301, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 4008, + "target_output_tokens": 461 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 21422, + "target_output_tokens": 346 + }, + { + "role": "user", + "content_token_count": 5246, + "target_output_tokens": 217 + }, + { + "role": "user", + "content_token_count": 13646, + "target_output_tokens": 499 + }, + { + "role": "user", + "content_token_count": 5532, + "target_output_tokens": 249 + }, + { + "role": "user", + "content_token_count": 5178, + 
"target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 1034, + "target_output_tokens": 316 + }, + { + "role": "user", + "content_token_count": 3570, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 9334, + "target_output_tokens": 1761 + }, + { + "role": "user", + "content_token_count": 4071, + "target_output_tokens": 227 + }, + { + "role": "user", + "content_token_count": 11734, + "target_output_tokens": 340 + }, + { + "role": "user", + "content_token_count": 5927, + "target_output_tokens": 302 + }, + { + "role": "user", + "content_token_count": 7918, + "target_output_tokens": 337 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2647, + "target_output_tokens": 301 + }, + { + "role": "user", + "content_token_count": 14271, + "target_output_tokens": 1313 + }, + { + "role": "user", + "content_token_count": 5670, + "target_output_tokens": 954 + }, + { + "role": "user", + "content_token_count": 5014, + "target_output_tokens": 2103 + }, + { + "role": "user", + "content_token_count": 14137, + "target_output_tokens": 997 + }, + { + "role": "user", + "content_token_count": 8872, + "target_output_tokens": 1332 + }, + { + "role": "user", + "content_token_count": 2096, + "target_output_tokens": 4096 + }, + { + "role": "user", + "content_token_count": 16766, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 5742, + "target_output_tokens": 493 + }, + { + "role": "user", + "content_token_count": 21664, + "target_output_tokens": 696 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3432, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 4013, + "target_output_tokens": 79 + }, + { + "role": "user", + "content_token_count": 23484, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 1546, + "target_output_tokens": 289 + }, + { + "role": "user", + "content_token_count": 4542, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 5260, + "target_output_tokens": 378 + }, + { + "role": "user", + "content_token_count": 5487, + "target_output_tokens": 654 + }, + { + "role": "user", + "content_token_count": 7881, + "target_output_tokens": 380 + }, + { + "role": "user", + "content_token_count": 3358, + "target_output_tokens": 687 + }, + { + "role": "user", + "content_token_count": 11898, + "target_output_tokens": 180 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 38833, + "target_output_tokens": 534 + }, + { + "role": "user", + "content_token_count": 5781, + "target_output_tokens": 725 + }, + { + "role": "user", + "content_token_count": 7261, + "target_output_tokens": 165 + }, + { + "role": "user", + "content_token_count": 1280, + "target_output_tokens": 129 + }, + { + "role": "user", + "content_token_count": 5792, + "target_output_tokens": 466 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10544, + "target_output_tokens": 692 + }, + { + "role": "user", + "content_token_count": 15136, + "target_output_tokens": 836 + }, + { + "role": "user", + "content_token_count": 5686, + "target_output_tokens": 1758 + }, + { + "role": "user", + "content_token_count": 12712, + "target_output_tokens": 2240 + }, + { + "role": "user", + "content_token_count": 4875, + "target_output_tokens": 482 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 60523, + "target_output_tokens": 271 + }, + { + "role": "user", + 
"content_token_count": 10297, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 16059, + "target_output_tokens": 648 + }, + { + "role": "user", + "content_token_count": 20684, + "target_output_tokens": 487 + }, + { + "role": "user", + "content_token_count": 6343, + "target_output_tokens": 637 + }, + { + "role": "user", + "content_token_count": 29821, + "target_output_tokens": 436 + }, + { + "role": "user", + "content_token_count": 2615, + "target_output_tokens": 187 + }, + { + "role": "user", + "content_token_count": 4564, + "target_output_tokens": 980 + }, + { + "role": "user", + "content_token_count": 7889, + "target_output_tokens": 907 + }, + { + "role": "user", + "content_token_count": 14777, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 5646, + "target_output_tokens": 1521 + }, + { + "role": "user", + "content_token_count": 13268, + "target_output_tokens": 554 + }, + { + "role": "user", + "content_token_count": 10637, + "target_output_tokens": 1013 + }, + { + "role": "user", + "content_token_count": 5757, + "target_output_tokens": 1339 + }, + { + "role": "user", + "content_token_count": 5184, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 12479, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 18012, + "target_output_tokens": 167 + }, + { + "role": "user", + "content_token_count": 14643, + "target_output_tokens": 532 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1938, + "target_output_tokens": 1098 + }, + { + "role": "user", + "content_token_count": 685, + "target_output_tokens": 986 + }, + { + "role": "user", + "content_token_count": 3023, + "target_output_tokens": 292 + }, + { + "role": "user", + "content_token_count": 26370, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 7935, + "target_output_tokens": 179 + }, + { + "role": "user", + "content_token_count": 2052, + "target_output_tokens": 99 + }, + { + "role": "user", + "content_token_count": 5165, + "target_output_tokens": 747 + }, + { + "role": "user", + "content_token_count": 13734, + "target_output_tokens": 435 + }, + { + "role": "user", + "content_token_count": 979, + "target_output_tokens": 760 + }, + { + "role": "user", + "content_token_count": 4084, + "target_output_tokens": 604 + }, + { + "role": "user", + "content_token_count": 19546, + "target_output_tokens": 183 + }, + { + "role": "user", + "content_token_count": 1609, + "target_output_tokens": 191 + }, + { + "role": "user", + "content_token_count": 3857, + "target_output_tokens": 1024 + }, + { + "role": "user", + "content_token_count": 21131, + "target_output_tokens": 1830 + }, + { + "role": "user", + "content_token_count": 4129, + "target_output_tokens": 343 + }, + { + "role": "user", + "content_token_count": 30740, + "target_output_tokens": 635 + }, + { + "role": "user", + "content_token_count": 10871, + "target_output_tokens": 995 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8416, + "target_output_tokens": 664 + }, + { + "role": "user", + "content_token_count": 6856, + "target_output_tokens": 360 + }, + { + "role": "user", + "content_token_count": 12991, + "target_output_tokens": 1554 + }, + { + "role": "user", + "content_token_count": 2681, + "target_output_tokens": 1392 + }, + { + "role": "user", + "content_token_count": 2083, + "target_output_tokens": 1322 + }, + { + "role": "user", + "content_token_count": 2529, + 
"target_output_tokens": 862 + }, + { + "role": "user", + "content_token_count": 4854, + "target_output_tokens": 412 + }, + { + "role": "user", + "content_token_count": 5826, + "target_output_tokens": 904 + }, + { + "role": "user", + "content_token_count": 1412, + "target_output_tokens": 197 + }, + { + "role": "user", + "content_token_count": 16884, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 2209, + "target_output_tokens": 370 + }, + { + "role": "user", + "content_token_count": 6010, + "target_output_tokens": 1294 + }, + { + "role": "user", + "content_token_count": 19805, + "target_output_tokens": 2855 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7510, + "target_output_tokens": 354 + }, + { + "role": "user", + "content_token_count": 20508, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 14364, + "target_output_tokens": 234 + }, + { + "role": "user", + "content_token_count": 5578, + "target_output_tokens": 672 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7461, + "target_output_tokens": 2138 + }, + { + "role": "user", + "content_token_count": 8915, + "target_output_tokens": 721 + }, + { + "role": "user", + "content_token_count": 827, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 5858, + "target_output_tokens": 252 + }, + { + "role": "user", + "content_token_count": 3199, + "target_output_tokens": 864 + }, + { + "role": "user", + "content_token_count": 17479, + "target_output_tokens": 387 + }, + { + "role": "user", + "content_token_count": 6488, + "target_output_tokens": 768 + }, + { + "role": "user", + "content_token_count": 11265, + "target_output_tokens": 797 + }, + { + "role": "user", + "content_token_count": 6991, + "target_output_tokens": 802 + }, + { + "role": "user", + "content_token_count": 12962, + "target_output_tokens": 559 + }, + { + "role": "user", + "content_token_count": 6638, + "target_output_tokens": 2509 + }, + { + "role": "user", + "content_token_count": 2297, + "target_output_tokens": 803 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11614, + "target_output_tokens": 248 + }, + { + "role": "user", + "content_token_count": 3234, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 18001, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 17797, + "target_output_tokens": 792 + }, + { + "role": "user", + "content_token_count": 15525, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 11380, + "target_output_tokens": 308 + }, + { + "role": "user", + "content_token_count": 20150, + "target_output_tokens": 336 + }, + { + "role": "user", + "content_token_count": 10705, + "target_output_tokens": 149 + }, + { + "role": "user", + "content_token_count": 5871, + "target_output_tokens": 432 + }, + { + "role": "user", + "content_token_count": 5526, + "target_output_tokens": 406 + }, + { + "role": "user", + "content_token_count": 7675, + "target_output_tokens": 1587 + }, + { + "role": "user", + "content_token_count": 2277, + "target_output_tokens": 1478 + }, + { + "role": "user", + "content_token_count": 9244, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 9135, + "target_output_tokens": 141 + }, + { + "role": "user", + "content_token_count": 6477, + "target_output_tokens": 847 + }, + { + "role": "user", + "content_token_count": 5213, + "target_output_tokens": 381 + 
} + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 11902, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 4133, + "target_output_tokens": 763 + }, + { + "role": "user", + "content_token_count": 34974, + "target_output_tokens": 595 + }, + { + "role": "user", + "content_token_count": 3005, + "target_output_tokens": 748 + }, + { + "role": "user", + "content_token_count": 13140, + "target_output_tokens": 1585 + }, + { + "role": "user", + "content_token_count": 10800, + "target_output_tokens": 451 + }, + { + "role": "user", + "content_token_count": 7703, + "target_output_tokens": 308 + }, + { + "role": "user", + "content_token_count": 6180, + "target_output_tokens": 421 + }, + { + "role": "user", + "content_token_count": 7095, + "target_output_tokens": 2469 + }, + { + "role": "user", + "content_token_count": 27521, + "target_output_tokens": 645 + }, + { + "role": "user", + "content_token_count": 14207, + "target_output_tokens": 615 + }, + { + "role": "user", + "content_token_count": 7467, + "target_output_tokens": 736 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 20561, + "target_output_tokens": 111 + }, + { + "role": "user", + "content_token_count": 1000, + "target_output_tokens": 934 + }, + { + "role": "user", + "content_token_count": 32461, + "target_output_tokens": 115 + }, + { + "role": "user", + "content_token_count": 7010, + "target_output_tokens": 128 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 567 + }, + { + "role": "user", + "content_token_count": 9176, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 11138, + "target_output_tokens": 2089 + }, + { + "role": "user", + "content_token_count": 24757, + "target_output_tokens": 204 + }, + { + "role": "user", + "content_token_count": 6580, + "target_output_tokens": 1229 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4856, + "target_output_tokens": 587 + }, + { + "role": "user", + "content_token_count": 4192, + "target_output_tokens": 631 + }, + { + "role": "user", + "content_token_count": 7377, + "target_output_tokens": 358 + }, + { + "role": "user", + "content_token_count": 4030, + "target_output_tokens": 437 + }, + { + "role": "user", + "content_token_count": 8482, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 10934, + "target_output_tokens": 397 + }, + { + "role": "user", + "content_token_count": 5271, + "target_output_tokens": 105 + }, + { + "role": "user", + "content_token_count": 1504, + "target_output_tokens": 207 + }, + { + "role": "user", + "content_token_count": 12542, + "target_output_tokens": 497 + }, + { + "role": "user", + "content_token_count": 3169, + "target_output_tokens": 418 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 34022, + "target_output_tokens": 920 + }, + { + "role": "user", + "content_token_count": 4306, + "target_output_tokens": 383 + }, + { + "role": "user", + "content_token_count": 3490, + "target_output_tokens": 1086 + }, + { + "role": "user", + "content_token_count": 3939, + "target_output_tokens": 1038 + }, + { + "role": "user", + "content_token_count": 26508, + "target_output_tokens": 1136 + }, + { + "role": "user", + "content_token_count": 7044, + "target_output_tokens": 3317 + }, + { + "role": "user", + "content_token_count": 2441, + "target_output_tokens": 962 + }, + { + "role": "user", + "content_token_count": 2360, + "target_output_tokens": 442 + 
} + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 13707, + "target_output_tokens": 159 + }, + { + "role": "user", + "content_token_count": 3362, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 3014, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 9534, + "target_output_tokens": 430 + }, + { + "role": "user", + "content_token_count": 8037, + "target_output_tokens": 724 + }, + { + "role": "user", + "content_token_count": 12462, + "target_output_tokens": 814 + }, + { + "role": "user", + "content_token_count": 18227, + "target_output_tokens": 371 + }, + { + "role": "user", + "content_token_count": 2077, + "target_output_tokens": 867 + }, + { + "role": "user", + "content_token_count": 10950, + "target_output_tokens": 412 + }, + { + "role": "user", + "content_token_count": 12169, + "target_output_tokens": 331 + }, + { + "role": "user", + "content_token_count": 4436, + "target_output_tokens": 260 + }, + { + "role": "user", + "content_token_count": 2961, + "target_output_tokens": 952 + }, + { + "role": "user", + "content_token_count": 21323, + "target_output_tokens": 1066 + }, + { + "role": "user", + "content_token_count": 14035, + "target_output_tokens": 1134 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14500, + "target_output_tokens": 1813 + }, + { + "role": "user", + "content_token_count": 4751, + "target_output_tokens": 1726 + }, + { + "role": "user", + "content_token_count": 14083, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 2668, + "target_output_tokens": 199 + }, + { + "role": "user", + "content_token_count": 6391, + "target_output_tokens": 3392 + }, + { + "role": "user", + "content_token_count": 33050, + "target_output_tokens": 2319 + }, + { + "role": "user", + "content_token_count": 19617, + "target_output_tokens": 401 + }, + { + "role": "user", + "content_token_count": 9052, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 21741, + "target_output_tokens": 1047 + }, + { + "role": "user", + "content_token_count": 19064, + "target_output_tokens": 340 + }, + { + "role": "user", + "content_token_count": 1184, + "target_output_tokens": 804 + }, + { + "role": "user", + "content_token_count": 50708, + "target_output_tokens": 1268 + }, + { + "role": "user", + "content_token_count": 1043, + "target_output_tokens": 528 + }, + { + "role": "user", + "content_token_count": 7976, + "target_output_tokens": 600 + }, + { + "role": "user", + "content_token_count": 2967, + "target_output_tokens": 193 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4241, + "target_output_tokens": 1292 + }, + { + "role": "user", + "content_token_count": 8073, + "target_output_tokens": 1244 + }, + { + "role": "user", + "content_token_count": 21650, + "target_output_tokens": 603 + }, + { + "role": "user", + "content_token_count": 30704, + "target_output_tokens": 109 + }, + { + "role": "user", + "content_token_count": 3793, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 455 + }, + { + "role": "user", + "content_token_count": 12867, + "target_output_tokens": 244 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5205, + "target_output_tokens": 190 + }, + { + "role": "user", + "content_token_count": 9530, + "target_output_tokens": 323 + }, + { + "role": "user", + "content_token_count": 5813, + "target_output_tokens": 
662 + }, + { + "role": "user", + "content_token_count": 6079, + "target_output_tokens": 710 + }, + { + "role": "user", + "content_token_count": 3766, + "target_output_tokens": 319 + }, + { + "role": "user", + "content_token_count": 10983, + "target_output_tokens": 419 + }, + { + "role": "user", + "content_token_count": 38098, + "target_output_tokens": 897 + }, + { + "role": "user", + "content_token_count": 7410, + "target_output_tokens": 1273 + }, + { + "role": "user", + "content_token_count": 6534, + "target_output_tokens": 439 + }, + { + "role": "user", + "content_token_count": 2603, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 4395, + "target_output_tokens": 72 + }, + { + "role": "user", + "content_token_count": 6739, + "target_output_tokens": 424 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 23588, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 17832, + "target_output_tokens": 506 + }, + { + "role": "user", + "content_token_count": 22461, + "target_output_tokens": 198 + }, + { + "role": "user", + "content_token_count": 10329, + "target_output_tokens": 1380 + }, + { + "role": "user", + "content_token_count": 16613, + "target_output_tokens": 523 + }, + { + "role": "user", + "content_token_count": 18924, + "target_output_tokens": 1091 + }, + { + "role": "user", + "content_token_count": 6640, + "target_output_tokens": 936 + }, + { + "role": "user", + "content_token_count": 5752, + "target_output_tokens": 1079 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 16422, + "target_output_tokens": 611 + }, + { + "role": "user", + "content_token_count": 8736, + "target_output_tokens": 1393 + }, + { + "role": "user", + "content_token_count": 30989, + "target_output_tokens": 357 + }, + { + "role": "user", + "content_token_count": 32378, + "target_output_tokens": 365 + }, + { + "role": "user", + "content_token_count": 4826, + "target_output_tokens": 1142 + }, + { + "role": "user", + "content_token_count": 7705, + "target_output_tokens": 2254 + }, + { + "role": "user", + "content_token_count": 1630, + "target_output_tokens": 1219 + }, + { + "role": "user", + "content_token_count": 5323, + "target_output_tokens": 838 + }, + { + "role": "user", + "content_token_count": 21581, + "target_output_tokens": 654 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8355, + "target_output_tokens": 529 + }, + { + "role": "user", + "content_token_count": 33639, + "target_output_tokens": 650 + }, + { + "role": "user", + "content_token_count": 9794, + "target_output_tokens": 355 + }, + { + "role": "user", + "content_token_count": 5952, + "target_output_tokens": 608 + }, + { + "role": "user", + "content_token_count": 7696, + "target_output_tokens": 163 + }, + { + "role": "user", + "content_token_count": 8151, + "target_output_tokens": 108 + }, + { + "role": "user", + "content_token_count": 11377, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 2795, + "target_output_tokens": 765 + }, + { + "role": "user", + "content_token_count": 8478, + "target_output_tokens": 361 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3254, + "target_output_tokens": 524 + }, + { + "role": "user", + "content_token_count": 13573, + "target_output_tokens": 1371 + }, + { + "role": "user", + "content_token_count": 4347, + "target_output_tokens": 538 + }, + { + "role": "user", + "content_token_count": 52807, + 
"target_output_tokens": 1303 + }, + { + "role": "user", + "content_token_count": 6319, + "target_output_tokens": 278 + }, + { + "role": "user", + "content_token_count": 4295, + "target_output_tokens": 640 + }, + { + "role": "user", + "content_token_count": 2030, + "target_output_tokens": 358 + }, + { + "role": "user", + "content_token_count": 13300, + "target_output_tokens": 504 + }, + { + "role": "user", + "content_token_count": 4151, + "target_output_tokens": 1040 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10729, + "target_output_tokens": 621 + }, + { + "role": "user", + "content_token_count": 6674, + "target_output_tokens": 433 + }, + { + "role": "user", + "content_token_count": 11618, + "target_output_tokens": 156 + }, + { + "role": "user", + "content_token_count": 13713, + "target_output_tokens": 934 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 9731, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 507 + }, + { + "role": "user", + "content_token_count": 3019, + "target_output_tokens": 450 + }, + { + "role": "user", + "content_token_count": 10288, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 22301, + "target_output_tokens": 815 + }, + { + "role": "user", + "content_token_count": 5283, + "target_output_tokens": 275 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3544, + "target_output_tokens": 843 + }, + { + "role": "user", + "content_token_count": 7783, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 2684, + "target_output_tokens": 845 + }, + { + "role": "user", + "content_token_count": 10549, + "target_output_tokens": 275 + }, + { + "role": "user", + "content_token_count": 9460, + "target_output_tokens": 608 + }, + { + "role": "user", + "content_token_count": 3164, + "target_output_tokens": 542 + }, + { + "role": "user", + "content_token_count": 3760, + "target_output_tokens": 494 + }, + { + "role": "user", + "content_token_count": 5991, + "target_output_tokens": 458 + }, + { + "role": "user", + "content_token_count": 3873, + "target_output_tokens": 800 + }, + { + "role": "user", + "content_token_count": 4054, + "target_output_tokens": 400 + }, + { + "role": "user", + "content_token_count": 3102, + "target_output_tokens": 2786 + }, + { + "role": "user", + "content_token_count": 5452, + "target_output_tokens": 3343 + }, + { + "role": "user", + "content_token_count": 2904, + "target_output_tokens": 483 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2269, + "target_output_tokens": 738 + }, + { + "role": "user", + "content_token_count": 18252, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 16077, + "target_output_tokens": 369 + }, + { + "role": "user", + "content_token_count": 2591, + "target_output_tokens": 1498 + }, + { + "role": "user", + "content_token_count": 955, + "target_output_tokens": 964 + }, + { + "role": "user", + "content_token_count": 15421, + "target_output_tokens": 1148 + }, + { + "role": "user", + "content_token_count": 26417, + "target_output_tokens": 282 + }, + { + "role": "user", + "content_token_count": 2450, + "target_output_tokens": 641 + }, + { + "role": "user", + "content_token_count": 3723, + "target_output_tokens": 1544 + }, + { + "role": "user", + "content_token_count": 24848, + "target_output_tokens": 1652 + }, + { + "role": "user", + "content_token_count": 1198, + 
"target_output_tokens": 303 + }, + { + "role": "user", + "content_token_count": 3660, + "target_output_tokens": 378 + }, + { + "role": "user", + "content_token_count": 8385, + "target_output_tokens": 971 + }, + { + "role": "user", + "content_token_count": 17089, + "target_output_tokens": 146 + }, + { + "role": "user", + "content_token_count": 13626, + "target_output_tokens": 1436 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6980, + "target_output_tokens": 779 + }, + { + "role": "user", + "content_token_count": 14266, + "target_output_tokens": 998 + }, + { + "role": "user", + "content_token_count": 19395, + "target_output_tokens": 931 + }, + { + "role": "user", + "content_token_count": 27605, + "target_output_tokens": 864 + }, + { + "role": "user", + "content_token_count": 7245, + "target_output_tokens": 462 + }, + { + "role": "user", + "content_token_count": 3242, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 2781, + "target_output_tokens": 1296 + }, + { + "role": "user", + "content_token_count": 1676, + "target_output_tokens": 1609 + }, + { + "role": "user", + "content_token_count": 9287, + "target_output_tokens": 1339 + }, + { + "role": "user", + "content_token_count": 7842, + "target_output_tokens": 686 + }, + { + "role": "user", + "content_token_count": 7397, + "target_output_tokens": 133 + }, + { + "role": "user", + "content_token_count": 12946, + "target_output_tokens": 579 + }, + { + "role": "user", + "content_token_count": 6842, + "target_output_tokens": 1282 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14195, + "target_output_tokens": 466 + }, + { + "role": "user", + "content_token_count": 4463, + "target_output_tokens": 558 + }, + { + "role": "user", + "content_token_count": 1089, + "target_output_tokens": 2126 + }, + { + "role": "user", + "content_token_count": 9114, + "target_output_tokens": 483 + }, + { + "role": "user", + "content_token_count": 4745, + "target_output_tokens": 810 + }, + { + "role": "user", + "content_token_count": 11648, + "target_output_tokens": 395 + }, + { + "role": "user", + "content_token_count": 2438, + "target_output_tokens": 444 + }, + { + "role": "user", + "content_token_count": 15094, + "target_output_tokens": 357 + }, + { + "role": "user", + "content_token_count": 5004, + "target_output_tokens": 1692 + }, + { + "role": "user", + "content_token_count": 17422, + "target_output_tokens": 161 + }, + { + "role": "user", + "content_token_count": 18830, + "target_output_tokens": 350 + }, + { + "role": "user", + "content_token_count": 3203, + "target_output_tokens": 1336 + }, + { + "role": "user", + "content_token_count": 4912, + "target_output_tokens": 1071 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 10200, + "target_output_tokens": 315 + }, + { + "role": "user", + "content_token_count": 43481, + "target_output_tokens": 953 + }, + { + "role": "user", + "content_token_count": 6381, + "target_output_tokens": 473 + }, + { + "role": "user", + "content_token_count": 2352, + "target_output_tokens": 361 + }, + { + "role": "user", + "content_token_count": 11246, + "target_output_tokens": 486 + }, + { + "role": "user", + "content_token_count": 38916, + "target_output_tokens": 252 + }, + { + "role": "user", + "content_token_count": 29292, + "target_output_tokens": 332 + }, + { + "role": "user", + "content_token_count": 7163, + "target_output_tokens": 737 + }, + { + "role": "user", + "content_token_count": 4145, + "target_output_tokens": 
316 + }, + { + "role": "user", + "content_token_count": 4769, + "target_output_tokens": 298 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5594, + "target_output_tokens": 1686 + }, + { + "role": "user", + "content_token_count": 4311, + "target_output_tokens": 398 + }, + { + "role": "user", + "content_token_count": 13684, + "target_output_tokens": 419 + }, + { + "role": "user", + "content_token_count": 33855, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 2118, + "target_output_tokens": 1128 + }, + { + "role": "user", + "content_token_count": 2030, + "target_output_tokens": 184 + }, + { + "role": "user", + "content_token_count": 10739, + "target_output_tokens": 561 + }, + { + "role": "user", + "content_token_count": 5555, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 16640, + "target_output_tokens": 668 + }, + { + "role": "user", + "content_token_count": 23253, + "target_output_tokens": 884 + }, + { + "role": "user", + "content_token_count": 3965, + "target_output_tokens": 740 + }, + { + "role": "user", + "content_token_count": 8551, + "target_output_tokens": 1807 + }, + { + "role": "user", + "content_token_count": 3578, + "target_output_tokens": 766 + }, + { + "role": "user", + "content_token_count": 4639, + "target_output_tokens": 1157 + }, + { + "role": "user", + "content_token_count": 6212, + "target_output_tokens": 437 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5004, + "target_output_tokens": 178 + }, + { + "role": "user", + "content_token_count": 5596, + "target_output_tokens": 867 + }, + { + "role": "user", + "content_token_count": 12366, + "target_output_tokens": 1221 + }, + { + "role": "user", + "content_token_count": 5092, + "target_output_tokens": 167 + }, + { + "role": "user", + "content_token_count": 11259, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 18357, + "target_output_tokens": 1419 + }, + { + "role": "user", + "content_token_count": 12445, + "target_output_tokens": 425 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1753, + "target_output_tokens": 457 + }, + { + "role": "user", + "content_token_count": 4410, + "target_output_tokens": 138 + }, + { + "role": "user", + "content_token_count": 3759, + "target_output_tokens": 295 + }, + { + "role": "user", + "content_token_count": 11816, + "target_output_tokens": 830 + }, + { + "role": "user", + "content_token_count": 16209, + "target_output_tokens": 141 + }, + { + "role": "user", + "content_token_count": 46023, + "target_output_tokens": 2056 + }, + { + "role": "user", + "content_token_count": 5420, + "target_output_tokens": 422 + }, + { + "role": "user", + "content_token_count": 2445, + "target_output_tokens": 2119 + }, + { + "role": "user", + "content_token_count": 3724, + "target_output_tokens": 1277 + }, + { + "role": "user", + "content_token_count": 3168, + "target_output_tokens": 391 + }, + { + "role": "user", + "content_token_count": 9061, + "target_output_tokens": 1199 + }, + { + "role": "user", + "content_token_count": 4255, + "target_output_tokens": 1880 + }, + { + "role": "user", + "content_token_count": 20542, + "target_output_tokens": 449 + }, + { + "role": "user", + "content_token_count": 18541, + "target_output_tokens": 211 + }, + { + "role": "user", + "content_token_count": 17405, + "target_output_tokens": 878 + }, + { + "role": "user", + "content_token_count": 7086, + "target_output_tokens": 396 + }, + { + 
"role": "user", + "content_token_count": 4469, + "target_output_tokens": 189 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4594, + "target_output_tokens": 567 + }, + { + "role": "user", + "content_token_count": 15961, + "target_output_tokens": 276 + }, + { + "role": "user", + "content_token_count": 18817, + "target_output_tokens": 296 + }, + { + "role": "user", + "content_token_count": 8980, + "target_output_tokens": 446 + }, + { + "role": "user", + "content_token_count": 13739, + "target_output_tokens": 476 + }, + { + "role": "user", + "content_token_count": 4954, + "target_output_tokens": 1124 + }, + { + "role": "user", + "content_token_count": 7155, + "target_output_tokens": 2553 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 8108, + "target_output_tokens": 337 + }, + { + "role": "user", + "content_token_count": 7213, + "target_output_tokens": 198 + }, + { + "role": "user", + "content_token_count": 6441, + "target_output_tokens": 932 + }, + { + "role": "user", + "content_token_count": 25889, + "target_output_tokens": 494 + }, + { + "role": "user", + "content_token_count": 5672, + "target_output_tokens": 322 + }, + { + "role": "user", + "content_token_count": 6174, + "target_output_tokens": 984 + }, + { + "role": "user", + "content_token_count": 13080, + "target_output_tokens": 594 + }, + { + "role": "user", + "content_token_count": 23119, + "target_output_tokens": 64 + }, + { + "role": "user", + "content_token_count": 10812, + "target_output_tokens": 939 + }, + { + "role": "user", + "content_token_count": 27801, + "target_output_tokens": 925 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 3640, + "target_output_tokens": 108 + }, + { + "role": "user", + "content_token_count": 2053, + "target_output_tokens": 655 + }, + { + "role": "user", + "content_token_count": 16255, + "target_output_tokens": 1911 + }, + { + "role": "user", + "content_token_count": 13439, + "target_output_tokens": 629 + }, + { + "role": "user", + "content_token_count": 25472, + "target_output_tokens": 1323 + }, + { + "role": "user", + "content_token_count": 10114, + "target_output_tokens": 674 + }, + { + "role": "user", + "content_token_count": 1708, + "target_output_tokens": 1493 + }, + { + "role": "user", + "content_token_count": 5384, + "target_output_tokens": 1587 + }, + { + "role": "user", + "content_token_count": 6730, + "target_output_tokens": 408 + }, + { + "role": "user", + "content_token_count": 1746, + "target_output_tokens": 413 + }, + { + "role": "user", + "content_token_count": 1684, + "target_output_tokens": 1349 + }, + { + "role": "user", + "content_token_count": 22551, + "target_output_tokens": 426 + }, + { + "role": "user", + "content_token_count": 10297, + "target_output_tokens": 772 + }, + { + "role": "user", + "content_token_count": 13002, + "target_output_tokens": 1444 + }, + { + "role": "user", + "content_token_count": 16737, + "target_output_tokens": 1199 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 7675, + "target_output_tokens": 354 + }, + { + "role": "user", + "content_token_count": 5654, + "target_output_tokens": 220 + }, + { + "role": "user", + "content_token_count": 946, + "target_output_tokens": 515 + }, + { + "role": "user", + "content_token_count": 6573, + "target_output_tokens": 1712 + }, + { + "role": "user", + "content_token_count": 47344, + "target_output_tokens": 554 + }, + { + "role": "user", + "content_token_count": 10099, + "target_output_tokens": 1064 + } + 
] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4184, + "target_output_tokens": 213 + }, + { + "role": "user", + "content_token_count": 20020, + "target_output_tokens": 727 + }, + { + "role": "user", + "content_token_count": 5788, + "target_output_tokens": 464 + }, + { + "role": "user", + "content_token_count": 16426, + "target_output_tokens": 188 + }, + { + "role": "user", + "content_token_count": 6170, + "target_output_tokens": 1080 + }, + { + "role": "user", + "content_token_count": 12316, + "target_output_tokens": 659 + }, + { + "role": "user", + "content_token_count": 2817, + "target_output_tokens": 148 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14649, + "target_output_tokens": 769 + }, + { + "role": "user", + "content_token_count": 13707, + "target_output_tokens": 314 + }, + { + "role": "user", + "content_token_count": 1901, + "target_output_tokens": 480 + }, + { + "role": "user", + "content_token_count": 4892, + "target_output_tokens": 562 + }, + { + "role": "user", + "content_token_count": 18481, + "target_output_tokens": 195 + }, + { + "role": "user", + "content_token_count": 3762, + "target_output_tokens": 564 + }, + { + "role": "user", + "content_token_count": 8463, + "target_output_tokens": 286 + }, + { + "role": "user", + "content_token_count": 11078, + "target_output_tokens": 90 + }, + { + "role": "user", + "content_token_count": 1106, + "target_output_tokens": 2149 + }, + { + "role": "user", + "content_token_count": 3393, + "target_output_tokens": 1477 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 285 + }, + { + "role": "user", + "content_token_count": 11370, + "target_output_tokens": 417 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 19821, + "target_output_tokens": 217 + }, + { + "role": "user", + "content_token_count": 20454, + "target_output_tokens": 689 + }, + { + "role": "user", + "content_token_count": 6158, + "target_output_tokens": 495 + }, + { + "role": "user", + "content_token_count": 10407, + "target_output_tokens": 172 + }, + { + "role": "user", + "content_token_count": 6777, + "target_output_tokens": 244 + }, + { + "role": "user", + "content_token_count": 52928, + "target_output_tokens": 476 + }, + { + "role": "user", + "content_token_count": 42478, + "target_output_tokens": 223 + }, + { + "role": "user", + "content_token_count": 4347, + "target_output_tokens": 593 + }, + { + "role": "user", + "content_token_count": 12237, + "target_output_tokens": 123 + }, + { + "role": "user", + "content_token_count": 17586, + "target_output_tokens": 598 + }, + { + "role": "user", + "content_token_count": 2461, + "target_output_tokens": 501 + }, + { + "role": "user", + "content_token_count": 4825, + "target_output_tokens": 168 + }, + { + "role": "user", + "content_token_count": 2679, + "target_output_tokens": 2852 + }, + { + "role": "user", + "content_token_count": 7837, + "target_output_tokens": 492 + }, + { + "role": "user", + "content_token_count": 65536, + "target_output_tokens": 277 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5214, + "target_output_tokens": 2004 + }, + { + "role": "user", + "content_token_count": 11163, + "target_output_tokens": 2005 + }, + { + "role": "user", + "content_token_count": 25193, + "target_output_tokens": 211 + }, + { + "role": "user", + "content_token_count": 2010, + "target_output_tokens": 256 + }, + { + "role": "user", + "content_token_count": 9992, + "target_output_tokens": 1115 + 
}, + { + "role": "user", + "content_token_count": 12896, + "target_output_tokens": 623 + }, + { + "role": "user", + "content_token_count": 3791, + "target_output_tokens": 998 + }, + { + "role": "user", + "content_token_count": 8003, + "target_output_tokens": 338 + }, + { + "role": "user", + "content_token_count": 4495, + "target_output_tokens": 552 + }, + { + "role": "user", + "content_token_count": 1634, + "target_output_tokens": 2271 + }, + { + "role": "user", + "content_token_count": 5760, + "target_output_tokens": 97 + }, + { + "role": "user", + "content_token_count": 10434, + "target_output_tokens": 609 + }, + { + "role": "user", + "content_token_count": 23376, + "target_output_tokens": 112 + }, + { + "role": "user", + "content_token_count": 8046, + "target_output_tokens": 544 + }, + { + "role": "user", + "content_token_count": 1341, + "target_output_tokens": 1666 + }, + { + "role": "user", + "content_token_count": 12979, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 8061, + "target_output_tokens": 463 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 14288, + "target_output_tokens": 1379 + }, + { + "role": "user", + "content_token_count": 7502, + "target_output_tokens": 164 + }, + { + "role": "user", + "content_token_count": 2894, + "target_output_tokens": 68 + }, + { + "role": "user", + "content_token_count": 28437, + "target_output_tokens": 318 + }, + { + "role": "user", + "content_token_count": 9110, + "target_output_tokens": 780 + }, + { + "role": "user", + "content_token_count": 7833, + "target_output_tokens": 1300 + }, + { + "role": "user", + "content_token_count": 35537, + "target_output_tokens": 227 + }, + { + "role": "user", + "content_token_count": 6575, + "target_output_tokens": 341 + }, + { + "role": "user", + "content_token_count": 5057, + "target_output_tokens": 747 + }, + { + "role": "user", + "content_token_count": 1020, + "target_output_tokens": 566 + }, + { + "role": "user", + "content_token_count": 29797, + "target_output_tokens": 461 + }, + { + "role": "user", + "content_token_count": 6275, + "target_output_tokens": 244 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 5975, + "target_output_tokens": 713 + }, + { + "role": "user", + "content_token_count": 4182, + "target_output_tokens": 813 + }, + { + "role": "user", + "content_token_count": 31157, + "target_output_tokens": 394 + }, + { + "role": "user", + "content_token_count": 5352, + "target_output_tokens": 628 + }, + { + "role": "user", + "content_token_count": 5323, + "target_output_tokens": 468 + }, + { + "role": "user", + "content_token_count": 8404, + "target_output_tokens": 603 + }, + { + "role": "user", + "content_token_count": 10457, + "target_output_tokens": 528 + }, + { + "role": "user", + "content_token_count": 21616, + "target_output_tokens": 1002 + }, + { + "role": "user", + "content_token_count": 11231, + "target_output_tokens": 266 + }, + { + "role": "user", + "content_token_count": 3555, + "target_output_tokens": 981 + }, + { + "role": "user", + "content_token_count": 2347, + "target_output_tokens": 311 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 512, + "target_output_tokens": 1289 + }, + { + "role": "user", + "content_token_count": 14824, + "target_output_tokens": 595 + }, + { + "role": "user", + "content_token_count": 2459, + "target_output_tokens": 491 + }, + { + "role": "user", + "content_token_count": 5155, + "target_output_tokens": 854 + }, + { + "role": "user", + 
"content_token_count": 1706, + "target_output_tokens": 335 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4693, + "target_output_tokens": 552 + }, + { + "role": "user", + "content_token_count": 3717, + "target_output_tokens": 321 + }, + { + "role": "user", + "content_token_count": 11640, + "target_output_tokens": 525 + }, + { + "role": "user", + "content_token_count": 7120, + "target_output_tokens": 1424 + }, + { + "role": "user", + "content_token_count": 6218, + "target_output_tokens": 1656 + }, + { + "role": "user", + "content_token_count": 11256, + "target_output_tokens": 3945 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 6313, + "target_output_tokens": 1528 + }, + { + "role": "user", + "content_token_count": 5148, + "target_output_tokens": 196 + }, + { + "role": "user", + "content_token_count": 15406, + "target_output_tokens": 461 + }, + { + "role": "user", + "content_token_count": 2451, + "target_output_tokens": 404 + }, + { + "role": "user", + "content_token_count": 9688, + "target_output_tokens": 847 + }, + { + "role": "user", + "content_token_count": 14736, + "target_output_tokens": 366 + }, + { + "role": "user", + "content_token_count": 8049, + "target_output_tokens": 1021 + }, + { + "role": "user", + "content_token_count": 5751, + "target_output_tokens": 3843 + }, + { + "role": "user", + "content_token_count": 11137, + "target_output_tokens": 390 + }, + { + "role": "user", + "content_token_count": 34636, + "target_output_tokens": 895 + }, + { + "role": "user", + "content_token_count": 11915, + "target_output_tokens": 599 + }, + { + "role": "user", + "content_token_count": 8409, + "target_output_tokens": 86 + }, + { + "role": "user", + "content_token_count": 3406, + "target_output_tokens": 2233 + }, + { + "role": "user", + "content_token_count": 15118, + "target_output_tokens": 677 + }, + { + "role": "user", + "content_token_count": 11251, + "target_output_tokens": 203 + }, + { + "role": "user", + "content_token_count": 7848, + "target_output_tokens": 198 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 19708, + "target_output_tokens": 526 + }, + { + "role": "user", + "content_token_count": 6199, + "target_output_tokens": 262 + }, + { + "role": "user", + "content_token_count": 5688, + "target_output_tokens": 957 + }, + { + "role": "user", + "content_token_count": 8993, + "target_output_tokens": 1558 + }, + { + "role": "user", + "content_token_count": 14718, + "target_output_tokens": 207 + }, + { + "role": "user", + "content_token_count": 10274, + "target_output_tokens": 744 + }, + { + "role": "user", + "content_token_count": 10756, + "target_output_tokens": 330 + }, + { + "role": "user", + "content_token_count": 55245, + "target_output_tokens": 171 + }, + { + "role": "user", + "content_token_count": 14177, + "target_output_tokens": 343 + }, + { + "role": "user", + "content_token_count": 11266, + "target_output_tokens": 370 + }, + { + "role": "user", + "content_token_count": 5359, + "target_output_tokens": 1273 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 1649, + "target_output_tokens": 218 + }, + { + "role": "user", + "content_token_count": 8871, + "target_output_tokens": 629 + }, + { + "role": "user", + "content_token_count": 11623, + "target_output_tokens": 247 + }, + { + "role": "user", + "content_token_count": 17643, + "target_output_tokens": 536 + }, + { + "role": "user", + "content_token_count": 1355, + "target_output_tokens": 127 + }, + { + "role": 
"user", + "content_token_count": 10824, + "target_output_tokens": 363 + }, + { + "role": "user", + "content_token_count": 3760, + "target_output_tokens": 810 + }, + { + "role": "user", + "content_token_count": 13120, + "target_output_tokens": 179 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 2614, + "target_output_tokens": 270 + }, + { + "role": "user", + "content_token_count": 4555, + "target_output_tokens": 271 + }, + { + "role": "user", + "content_token_count": 5387, + "target_output_tokens": 216 + }, + { + "role": "user", + "content_token_count": 3338, + "target_output_tokens": 694 + }, + { + "role": "user", + "content_token_count": 9274, + "target_output_tokens": 488 + }, + { + "role": "user", + "content_token_count": 41006, + "target_output_tokens": 1179 + }, + { + "role": "user", + "content_token_count": 11764, + "target_output_tokens": 336 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4551, + "target_output_tokens": 391 + }, + { + "role": "user", + "content_token_count": 7744, + "target_output_tokens": 590 + }, + { + "role": "user", + "content_token_count": 6922, + "target_output_tokens": 1285 + }, + { + "role": "user", + "content_token_count": 15085, + "target_output_tokens": 881 + }, + { + "role": "user", + "content_token_count": 23696, + "target_output_tokens": 380 + }, + { + "role": "user", + "content_token_count": 13825, + "target_output_tokens": 1441 + }, + { + "role": "user", + "content_token_count": 7353, + "target_output_tokens": 686 + } + ] + }, + { + "turns": [ + { + "role": "user", + "content_token_count": 4844, + "target_output_tokens": 520 + }, + { + "role": "user", + "content_token_count": 11126, + "target_output_tokens": 170 + }, + { + "role": "user", + "content_token_count": 2742, + "target_output_tokens": 549 + }, + { + "role": "user", + "content_token_count": 4533, + "target_output_tokens": 309 + } + ] + } + ] +} \ No newline at end of file diff --git a/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py new file mode 100644 index 000000000..ccc51ca7a --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/aiperf_traces/generate_aiperf_traces.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +"""Generate synthetic AIPerf-style trace sessions for kv-cache-tester-compatible replay.""" + +from __future__ import annotations + +import argparse +import json +import math +import random +from pathlib import Path + + +def lognormal_sigma(p50: float, p95: float) -> float: + return math.log(p95 / p50) / 1.645 + + +def sample_tokens(rng: random.Random, p50: float, p95: float, min_v: int, max_v: int) -> int: + sigma = lognormal_sigma(p50, p95) + mu = math.log(p50) + sampled = int(round(rng.lognormvariate(mu, sigma))) + return max(min_v, min(max_v, sampled)) + + +def generate_sessions(count: int, seed: int) -> dict: + rng = random.Random(seed) + sessions = [] + + # Target coding-workload distributions: + # ISL p50~8k, p95~32k + # OSL p50~512, p95~2k + for _ in range(count): + num_turns = rng.randint(4, 18) + turns = [] + for _ in range(num_turns): + turns.append( + { + "role": "user", + "content_token_count": sample_tokens( + rng, + p50=8000, + p95=32000, + min_v=512, + max_v=65536, + ), + "target_output_tokens": sample_tokens( + rng, + p50=512, + p95=2000, + min_v=64, + max_v=4096, + ), + } + ) + sessions.append({"turns": turns}) + + return {"sessions": sessions} + + +def parse_args() -> argparse.Namespace: + 
parser = argparse.ArgumentParser(description="Generate synthetic AIPerf traces") + parser.add_argument("--sessions", type=int, default=100, help="Number of sessions") + parser.add_argument("--seed", type=int, default=993, help="Random seed") + parser.add_argument( + "--output", + type=Path, + default=Path(__file__).with_name("aiperf_synthetic_traces.json"), + help="Output JSON path", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + payload = generate_sessions(args.sessions, args.seed) + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(payload, indent=2), encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md b/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md new file mode 100644 index 000000000..94731fd42 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/kv-cache-tester/README.md @@ -0,0 +1,11 @@ +# kv-cache-tester placeholder + +This directory should be populated with the external `kv-cache-tester` repository. + +The expected structure includes the trace-replay tooling and the real trace assets used by the experimental multiturn benchmarks. + +## Initialization + +Once access is available, initialize this directory by checking out the kv-cache-tester repository contents here (for example, via an approved submodule setup or a direct-clone workflow owned by the maintainers). + +Do not replace this placeholder with unapproved external URLs in this branch. diff --git a/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep b/experimental/multiturn/vllm_benchmark/kv-cache-tester/traces/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/experimental/multiturn/vllm_benchmark/launch/README.md b/experimental/multiturn/vllm_benchmark/launch/README.md new file mode 100644 index 000000000..00d33ecba --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/launch/README.md @@ -0,0 +1,19 @@ +# LMCache launch scripts (experimental) + +These scripts launch vLLM with LMCache KV transfer enabled: + +- `lmcache_vllm_h200.sh` +- `lmcache_vllm_b200.sh` + +They are experimental parity utilities and are not wired into the standard InferenceX benchmark dispatch lanes.
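+
+## Example invocation (sketch)
+
+The snippet below is a minimal, hypothetical sketch, not a wired-in dispatch entry: `MODEL` and `TP` are required by `check_env_vars`; `PORT`, `KV_CACHE_DTYPE`, and `MAX_MODEL_LEN` are optional overrides whose defaults are shown in the scripts; the model ID is illustrative only. The scripts also assume the container workspace layout used by the runners (e.g. `/workspace/server.log`).
+
+```bash
+# Hypothetical values; run from the repository root inside the benchmark container.
+MODEL=deepseek-ai/DeepSeek-R1-0528 TP=8 PORT=8888 \
+KV_CACHE_DTYPE=fp8 MAX_MODEL_LEN=131272 \
+bash experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh
+```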
diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh new file mode 100755 index 000000000..f83b4b7f2 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_b200.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP + +PORT=${PORT:-8888} +SERVER_LOG=/workspace/server.log +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} + +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +python3 -m pip install -q lmcache + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)" +wait "$SERVER_PID" diff --git a/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh new file mode 100755 index 000000000..f83b4b7f2 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/launch/lmcache_vllm_h200.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP + +PORT=${PORT:-8888} +SERVER_LOG=/workspace/server.log +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} + +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +python3 -m pip install -q lmcache + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code --kv-transfer-config '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}' + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +echo "LMCache vLLM server running (PID=$SERVER_PID, log=$SERVER_LOG)" +wait "$SERVER_PID" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir 
/workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_dsr1_fp8_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" 
--max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} 
+TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_gptoss_fp4_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" 
--support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_b200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x 
+python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh new file mode 100755 index 000000000..7c46b0c31 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_sglang.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} +BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CONTEXT_LENGTH=${MAX_MODEL_LEN:-131272} +RADIX_CACHE_ARGS="" +if [[ -n "${OFFLOAD_MODE:-}" ]]; then + apply_sglang_offload_config +fi + +launch_sglang_server "$MODEL" "$PORT" --trust-remote-code --ep-size "${EP_SIZE:-1}" --reasoning-parser "${SGLANG_REASONING_PARSER:-gpt-oss}" --max-running-requests "${SGLANG_MAX_RUNNING_REQUESTS:-256}" --cuda-graph-max-bs "${SGLANG_CUDA_GRAPH_MAX_BS:-256}" --chunked-prefill-size "${SGLANG_CHUNKED_PREFILL_OVERRIDE:-32768}" --max-prefill-tokens "${SGLANG_MAX_PREFILL_TOKENS:-32768}" --mem-fraction-static "${SGLANG_MEM_FRACTION_OVERRIDE:-0.85}" --context-length "$CONTEXT_LENGTH" --stream-interval "${SGLANG_STREAM_INTERVAL:-10}" ${RADIX_CACHE_ARGS} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh new file mode 100755 index 000000000..f917c03c3 --- /dev/null +++ b/experimental/multiturn/vllm_benchmark/scripts/trace_replay_qwen3.5_fp8_h200_vllm.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../../../benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC RESULT_FILENAME + +PORT=${PORT:-8888} +TRACE_DIR=${TRACE_DIR:-experimental/multiturn/vllm_benchmark/kv-cache-tester/traces} 
+BENCHMARK_DURATION_S=${BENCHMARK_DURATION_S:-1800} +SERVER_LOG=/workspace/server.log + +CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-131272} +cat > config.yaml << EOF +kv-cache-dtype: ${KV_CACHE_DTYPE:-fp8} +max-cudagraph-capture-size: 2048 +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +launch_vllm_server "$MODEL" "$PORT" config.yaml --disable-log-requests --trust-remote-code + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +start_gpu_monitor +start_kv_metrics_collector "$PORT" /workspace/kv_metrics.csv 2.0 + +set -x +python3 experimental/multiturn/vllm_benchmark/kv-cache-tester/trace_replay_tester.py --api-endpoint "http://localhost:$PORT" --trace-directory "$TRACE_DIR" --output-dir /workspace/ --start-users "$CONC" --max-users "$CONC" --test-duration "$BENCHMARK_DURATION_S" --seed 42 --no-color +set +x + +stop_kv_metrics_collector +stop_gpu_monitor + +python3 datasets/isb1/scripts/adapt_trace_replay_result.py --input-dir /workspace --detailed-csv detailed_results.csv --output-json "/workspace/${RESULT_FILENAME}.json" --model-id "$MODEL" --max-concurrency "$CONC" --request-mode "${REQUEST_MODE:-multi-turn}" --support-status "${SUPPORT_STATUS:-reviewed_preview}" --result-stem "$RESULT_FILENAME" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 847b7ee80..644b2c3a4 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for B200 DGXC Slurm cluster SLURM_PARTITION="gpu" SLURM_ACCOUNT="benchmark" @@ -215,8 +217,7 @@ else HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') - SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 LOCK_FILE="${SQUASH_FILE}.lock" salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" @@ -243,5 +244,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + bash "$SCRIPT_PATH" + + scancel $JOB_ID fi diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f8c614936..caa1e8364 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -1,8 +1,9 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PORT=8888 # Create unique cache directory based on model parameters @@ -30,13 +31,17 @@ docker run --rm --init --network host --name $server_name \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e 
DP_ATTENTION \ +-e SPEC_DECODING -e DISAGG \ +-e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ +-e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ +-e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh" +"$SCRIPT_PATH" # Try graceful first docker stop -t 90 "$server_name" || true diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..cbcc7469b 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -1,9 +1,10 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" PARTITION="main" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "b200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 UCX_NET_DEVICES=eth0 @@ -17,4 +18,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file +bash "$SCRIPT_PATH" \ No newline at end of file diff --git a/runners/launch_h100-cr.sh b/runners/launch_h100-cr.sh index 5100419b9..44c46600d 100644 --- a/runners/launch_h100-cr.sh +++ b/runners/launch_h100-cr.sh @@ -1,7 +1,10 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + HF_HUB_CACHE_MOUNT="/home/ubuntu/hf_hub_cache/" PORT=8888 +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 server_name="bmk-server" @@ -10,9 +13,13 @@ docker run --rm --network=host --name=$server_name \ --runtime=nvidia --gpus=all --ipc=host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ --e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e EP_SIZE -e DP_ATTENTION -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e RUN_EVAL -e EVAL_ONLY -e RUNNER_TYPE -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e PORT=$PORT \ +-e SPEC_DECODING -e DISAGG \ +-e BENCHMARK_TYPE -e EXPORT_FILE -e RUNTIME_STACK_ID -e HARDWARE_PROFILE_ID -e CANONICAL_MODEL_ID -e REQUEST_MODE -e MAX_CONCURRENCY \ +-e SUPPORT_STATUS -e VLLM_CPU_OFFLOAD_GB -e VLLM_SWAP_SPACE_GB -e SGLANG_MEM_FRACTION_OVERRIDE -e SGLANG_CHUNKED_PREFILL_OVERRIDE \ +-e MAX_SESSIONS -e MAX_TURNS_PER_SESSION -e 
MAX_OUTPUT_LEN -e NUM_WARMUP_SESSIONS -e IGNORE_WAITS -e IGNORE_EOS \ -e PROFILE -e SGLANG_TORCH_PROFILER_DIR -e VLLM_TORCH_PROFILER_DIR -e VLLM_RPC_TIMEOUT \ -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e TORCH_CUDA_ARCH_LIST="9.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $IMAGE \ -benchmarks/single_node/"${EXP_NAME%%_*}_${PRECISION}_h100.sh" +"$SCRIPT_PATH" diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 49a42e981..bb10dcb6d 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -1,9 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" +SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 set -x @@ -31,7 +34,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ -bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh +bash "$SCRIPT_PATH" rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..11570289a 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for H100 DGXC Slurm cluster SLURM_PARTITION="hpc-gpu-1" SLURM_ACCOUNT="customer" @@ -230,6 +232,7 @@ else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h100" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -247,7 +250,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h100.sh + bash "$SCRIPT_PATH" scancel $JOB_ID diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 657f84792..5a49efcc6 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -1,11 +1,12 @@ #!/usr/bin/env bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PARTITION="h200" SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" @@ -44,7 +45,7 @@ srun --jobid=$JOB_ID \ --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash "$SCRIPT_PATH" rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID diff 
--git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 9b3b771a5..a6f4d2986 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -1,5 +1,7 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + # System-specific configuration for H200 DGXC Slurm cluster SLURM_PARTITION="main" SLURM_ACCOUNT="sa-shared" @@ -233,6 +235,7 @@ else # Convert pyxis image format (nvcr.io#path) to docker format (nvcr.io/path) for enroot import DOCKER_IMAGE=$(echo "$IMAGE" | sed 's/#/\//g') LOCK_FILE="${SQUASH_FILE}.lock" + SCRIPT_PATH=$(resolve_single_node_benchmark_script "${EXP_NAME%%_*}" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -258,7 +261,7 @@ else --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ - bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_h200$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt')$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp').sh + bash "$SCRIPT_PATH" scancel $JOB_ID diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh index 9d157a858..3b697fb51 100644 --- a/runners/launch_h200-nb.sh +++ b/runners/launch_h200-nb.sh @@ -1,11 +1,12 @@ #!/usr/bin/bash +source "$(dirname "$0")/lib_single_node_script.sh" + export HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" -FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') -SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') +SCRIPT_PATH=$(resolve_single_node_benchmark_script "$MODEL_CODE" "$PRECISION" "h200" "$FRAMEWORK" "${SPEC_DECODING:-none}") || exit 1 PARTITION="main" @@ -19,4 +20,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" --container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ -bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh +bash "$SCRIPT_PATH" diff --git a/runners/lib_single_node_script.sh b/runners/lib_single_node_script.sh new file mode 100644 index 000000000..194668856 --- /dev/null +++ b/runners/lib_single_node_script.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +resolve_single_node_benchmark_script() { + local model_code="$1" + local precision="$2" + local runner_code="$3" + local framework="${4:-}" + local spec_decoding="${5:-}" + local script_base="benchmarks/single_node/${model_code}_${precision}_${runner_code}" + + if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then + local runtime_candidate="${script_base}_${framework}.sh" + if [[ -f "$runtime_candidate" ]]; then + printf '%s\n' "$runtime_candidate" + return 0 + fi + fi + + local framework_suffix="" + local spec_suffix="" + if [[ "$framework" == "trt" ]]; then + framework_suffix="_trt" + fi + if [[ "$spec_decoding" == "mtp" ]]; then + spec_suffix="_mtp" + fi + + local legacy_candidate="${script_base}${framework_suffix}${spec_suffix}.sh" + if [[ -f "$legacy_candidate" ]]; then + printf '%s\n' "$legacy_candidate" + return 0 + fi + + echo "ERROR: Could not resolve single-node benchmark script." 
>&2 + echo " model=$model_code precision=$precision runner=$runner_code framework=${framework:-} spec_decoding=${spec_decoding:-} benchmark_type=${BENCHMARK_TYPE:-}" >&2 + if [[ "${BENCHMARK_TYPE:-}" == "isb1_replay" ]] && [[ "$framework" == "sglang" || "$framework" == "vllm" ]]; then + echo " checked runtime-aware candidate: ${script_base}_${framework}.sh" >&2 + fi + echo " checked legacy candidate: $legacy_candidate" >&2 + return 1 +} diff --git a/utils/bench_serving/benchmark_export_replay.py b/utils/bench_serving/benchmark_export_replay.py new file mode 100644 index 000000000..c67a5fd41 --- /dev/null +++ b/utils/bench_serving/benchmark_export_replay.py @@ -0,0 +1,1536 @@ +# SPDX-License-Identifier: Apache-2.0 +r"""Replay ISB1 export sessions against OpenAI-compatible inference servers. + +Supported export formats: + - ``inferencex_multiturn`` (direct-ingest session turns) + - ``inferencex_trace_replay`` (event-based trace replay) + +Supported request modes: + - ``chat``: send full message history to ``/v1/chat/completions`` + - ``completions``: project the message history into a single tagged prompt + and send it to ``/v1/completions`` + - ``auto``: prefer chat for standalone vLLM/SGLang cells and completions + for TRT / Dynamo projection cells +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import math +import os +import random +import sys +import time +import warnings +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Optional + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60, sock_read=5 * 60) +DEFAULT_IMAGE_TOKEN_ESTIMATE = 2048 +DEFAULT_FALLBACK_OUTPUT_LEN = 256 +CHAT_NATIVE_RUNTIMES = {"standalone:vllm", "standalone:sglang"} +COMPLETIONS_PREFERRED_RUNTIMES = { + "standalone:trt_llm", + "dynamo:vllm", + "dynamo:sglang", + "dynamo:trt_llm", +} +ROLE_LABELS = { + "system": "SYSTEM", + "user": "USER", + "assistant": "ASSISTANT", + "tool": "TOOL", + "retrieval": "RETRIEVAL", + "execution": "EXECUTION", +} +MODULE_DIR = Path(__file__).resolve().parent +if str(MODULE_DIR) not in sys.path: + sys.path.insert(0, str(MODULE_DIR)) + + +@dataclass +class TurnResult: + turn_idx: int + context_len: int + output_len: int + ttft: float = 0.0 + tpot: float = 0.0 + e2el: float = 0.0 + itl: list[float] = field(default_factory=list) + success: bool = True + error: str = "" + request_mode: str = "chat" + actual_context_len: int = 0 + + +@dataclass +class SessionResult: + session_id: str + turns: list[TurnResult] = field(default_factory=list) + total_input_tokens: int = 0 + total_actual_input_tokens: int = 0 + total_output_tokens: int = 0 + total_duration: float = 0.0 + + +@dataclass +class ReplayTurn: + turn_idx: int + turn_id: Any + output_len: int + wait_before_s: float + context_len: int + actual_context_len: int + chat_messages: list[dict[str, Any]] + completion_prompt: str + + +@dataclass +class ReplaySession: + session_id: str + trace_id: str + runtime_stack_id: str + hardware_profile_id: str + canonical_model_id: str + support_status: str + benchmark_certification_status: str + request_mode: str + adapter_id: str + turns: list[ReplayTurn] + + +def _csv_values(raw: Optional[str]) -> set[str] | None: + if raw is None: + return None + values = 
{item.strip() for item in raw.split(",") if item.strip()} + return values or None + + +def _matches_filter(value: str, allowed: set[str] | None) -> bool: + return allowed is None or value in allowed + + +def _fallback_text_token_count(text: str) -> int: + stripped = (text or "").strip() + if not stripped: + return 0 + return max(1, math.ceil(len(stripped) / 4)) + + +def build_text_token_counter( + tokenizer_id: Optional[str], + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, +) -> Callable[[str], int]: + if not tokenizer_id: + return _fallback_text_token_count + + try: + from backend_request_func import get_tokenizer + + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + ) + except Exception as exc: + warnings.warn( + "Falling back to approximate token counting because tokenizer load " + f"failed for {tokenizer_id!r}: {exc}", + stacklevel=2, + ) + return _fallback_text_token_count + + def _count(text: str) -> int: + return len(tokenizer.encode(text or "", add_special_tokens=False)) + + return _count + + +def _render_block_as_text(block: dict[str, Any]) -> str: + block_type = str(block.get("type", "text")) + text = (block.get("text") or "").strip() + if block_type == "text": + return text + if block_type == "code": + return f"[CODE]\n{text}" if text else "[CODE]" + if block_type == "log": + return f"[LOG]\n{text}" if text else "[LOG]" + if block_type == "document": + label = block.get("asset_path") or block.get("uri") or "" + if text and label: + return f"[DOCUMENT: {label}]\n{text}" + if text: + return f"[DOCUMENT]\n{text}" + return f"[DOCUMENT: {label}]" if label else "[DOCUMENT]" + if block_type == "table": + return f"[TABLE]\n{text}" if text else "[TABLE]" + if block_type == "image": + label = block.get("uri") or block.get("asset_path") or text or "image" + return f"[IMAGE: {label}]" + return text or f"[{block_type.upper()}]" + + +def _extract_message_text(message: dict[str, Any]) -> str: + if isinstance(message.get("content"), str): + body = message["content"] + elif isinstance(message.get("content"), list): + parts: list[str] = [] + for part in message["content"]: + part_type = str(part.get("type", "text")) + if part_type == "text": + parts.append((part.get("text") or "").strip()) + elif part_type == "image_url": + url = "" + if isinstance(part.get("image_url"), dict): + url = part["image_url"].get("url") or "" + parts.append(f"[IMAGE: {url or 'image'}]") + body = "\n\n".join(item for item in parts if item) + else: + content_blocks = message.get("content_blocks") or [] + body = "\n\n".join( + filter(None, (_render_block_as_text(block) for block in content_blocks)) + ) + + role = str(message.get("role", "user")) + if role in {"tool", "retrieval", "execution"}: + prefix = f"[{ROLE_LABELS.get(role, role.upper())} RESULT]" + return f"{prefix}\n{body}" if body else prefix + return body + + +def _message_to_chat_payload(message: dict[str, Any]) -> dict[str, Any]: + role = str(message.get("role", "user")) + projected_role = role if role in {"system", "user", "assistant"} else "user" + content_blocks = message.get("content_blocks") or [] + + if not content_blocks: + return {"role": projected_role, "content": _extract_message_text(message)} + + parts: list[dict[str, Any]] = [] + if role not in {"system", "user", "assistant"}: + parts.append( + { + "type": "text", + "text": f"[{ROLE_LABELS.get(role, role.upper())} RESULT]", + } + ) + + for block in content_blocks: + block_type = str(block.get("type", "text")) + 
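+ # Image blocks that carry a URI pass through as OpenAI-style image_url parts below; every other block type is rendered to tagged plain text.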
if block_type == "image" and block.get("uri"): + parts.append( + { + "type": "image_url", + "image_url": {"url": block["uri"]}, + } + ) + continue + + text = _render_block_as_text(block) + if text: + parts.append({"type": "text", "text": text}) + + if not parts: + return {"role": projected_role, "content": ""} + if len(parts) == 1 and parts[0]["type"] == "text": + return {"role": projected_role, "content": parts[0]["text"]} + return {"role": projected_role, "content": parts} + + +def _message_token_estimate( + message: dict[str, Any], + count_text_tokens: Callable[[str], int], + image_token_estimate: int, +) -> int: + content_blocks = message.get("content_blocks") or [] + if not content_blocks: + return count_text_tokens(_extract_message_text(message)) + + total = 0 + role = str(message.get("role", "user")) + if role in {"tool", "retrieval", "execution"}: + total += count_text_tokens(f"[{ROLE_LABELS.get(role, role.upper())} RESULT]") + + for block in content_blocks: + block_type = str(block.get("type", "text")) + if block_type == "image": + total += int( + block.get("asset_token_count") + or block.get("metadata", {}).get("token_count") + or image_token_estimate + ) + continue + if block.get("asset_token_count") and block.get("asset_path"): + total += int(block["asset_token_count"]) + continue + total += count_text_tokens(_render_block_as_text(block)) + return total + + +def _chat_payload_token_count( + chat_messages: list[dict[str, Any]], + count_text_tokens: Callable[[str], int], +) -> int: + """Count tokens in the rendered chat payload that will actually be sent over HTTP.""" + total = 0 + for msg in chat_messages: + content = msg.get("content", "") + if isinstance(content, str): + total += count_text_tokens(content) + elif isinstance(content, list): + for part in content: + if part.get("type") == "text": + total += count_text_tokens(part.get("text", "")) + elif part.get("type") == "image_url": + total += DEFAULT_IMAGE_TOKEN_ESTIMATE + return total + + +def _messages_to_completion_prompt(messages: list[dict[str, Any]]) -> str: + prompt_parts: list[str] = [] + for message in messages: + role = ROLE_LABELS.get(str(message.get("role", "user")), "USER") + body = _extract_message_text(message).strip() + prompt_parts.append(f"{role}:\n{body}" if body else f"{role}:") + prompt_parts.append("ASSISTANT:\n") + return "\n\n".join(prompt_parts) + + +def resolve_request_mode(runtime_stack_id: str, requested_mode: str) -> str: + if requested_mode != "auto": + return requested_mode + if runtime_stack_id in CHAT_NATIVE_RUNTIMES: + return "chat" + if runtime_stack_id in COMPLETIONS_PREFERRED_RUNTIMES: + return "completions" + return "chat" + + +def _parse_prometheus_sample(line: str) -> tuple[str, float] | None: + """Parse a Prometheus sample line into ``(metric_name, value)``.""" + raw_line = line.strip() + if not raw_line or raw_line.startswith("#"): + return None + + try: + metric_with_labels, raw_value = raw_line.rsplit(maxsplit=1) + metric_name = metric_with_labels.split("{", 1)[0] + return metric_name, float(raw_value) + except (TypeError, ValueError): + return None + + +def _resolve_output_len( + raw_output_len: Any, + fallback_output_len: int, + output_len_cap: Optional[int], +) -> int: + try: + output_len = int(raw_output_len) + except (TypeError, ValueError): + output_len = fallback_output_len + if output_len <= 0: + output_len = fallback_output_len + if output_len_cap is not None: + output_len = min(output_len, output_len_cap) + return output_len + + +def _build_turn_from_messages( + turn_idx: 
int, + turn_id: Any, + messages: list[dict[str, Any]], + output_len: int, + wait_before_s: float, + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, +) -> ReplayTurn: + chat_messages = [_message_to_chat_payload(message) for message in messages] + completion_prompt = _messages_to_completion_prompt(messages) + if request_mode == "chat": + context_len = sum( + _message_token_estimate(message, count_text_tokens, image_token_estimate) + for message in messages + ) + actual_context_len = _chat_payload_token_count(chat_messages, count_text_tokens) + else: + context_len = count_text_tokens(completion_prompt) + actual_context_len = context_len # completions mode already uses rendered text + return ReplayTurn( + turn_idx=turn_idx, + turn_id=turn_id, + output_len=output_len, + wait_before_s=wait_before_s, + context_len=context_len, + actual_context_len=actual_context_len, + chat_messages=chat_messages, + completion_prompt=completion_prompt, + ) + + +def _build_session_from_multiturn_cell( + cell: dict[str, Any], + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, + ignore_waits: bool, + fallback_output_len: int, + output_len_cap: Optional[int], + max_turns_per_session: Optional[int], +) -> ReplaySession: + session = cell["session"] + turns: list[ReplayTurn] = [] + for raw_turn in session.get("turns", []): + turns.append( + _build_turn_from_messages( + turn_idx=int(raw_turn.get("turn_idx", len(turns))), + turn_id=raw_turn.get("turn_id"), + messages=list(raw_turn.get("messages", [])), + output_len=_resolve_output_len( + raw_turn.get("expected_output_tokens"), + fallback_output_len, + output_len_cap, + ), + wait_before_s=0.0 + if ignore_waits + else float(raw_turn.get("wait_before_ms", 0)) / 1000.0, + request_mode=request_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ) + ) + if max_turns_per_session is not None and len(turns) >= max_turns_per_session: + break + + return ReplaySession( + session_id=str(session.get("session_id", cell["trace_id"])), + trace_id=str(cell["trace_id"]), + runtime_stack_id=str(cell["runtime_stack_id"]), + hardware_profile_id=str(cell["hardware_profile_id"]), + canonical_model_id=str(cell["canonical_model_id"]), + support_status=str(cell.get("support_status", "unknown")), + benchmark_certification_status=str( + cell.get("benchmark_certification_status", "unknown") + ), + request_mode=request_mode, + adapter_id="inferencex_multiturn", + turns=turns, + ) + + +def _build_session_from_trace_replay_cell( + cell: dict[str, Any], + request_mode: str, + count_text_tokens: Callable[[str], int], + image_token_estimate: int, + ignore_waits: bool, + fallback_output_len: int, + output_len_cap: Optional[int], + max_turns_per_session: Optional[int], +) -> ReplaySession: + turns: list[ReplayTurn] = [] + prior_offset_ms = 0 + for index, event in enumerate(cell.get("events", [])): + offset_ms = int(event.get("arrival_time_offset_ms", 0) or 0) + wait_before_ms = 0 if index == 0 else max(0, offset_ms - prior_offset_ms) + prior_offset_ms = offset_ms + turns.append( + _build_turn_from_messages( + turn_idx=index, + turn_id=event.get("turn_id"), + messages=list(event.get("input_messages", [])), + output_len=_resolve_output_len( + event.get("target_output_tokens"), + fallback_output_len, + output_len_cap, + ), + wait_before_s=0.0 if ignore_waits else wait_before_ms / 1000.0, + request_mode=request_mode, + count_text_tokens=count_text_tokens, + 
image_token_estimate=image_token_estimate, + ) + ) + if max_turns_per_session is not None and len(turns) >= max_turns_per_session: + break + + return ReplaySession( + session_id=str(cell.get("trace_metadata", {}).get("session_id", cell["trace_id"])), + trace_id=str(cell["trace_id"]), + runtime_stack_id=str(cell["runtime_stack_id"]), + hardware_profile_id=str(cell["hardware_profile_id"]), + canonical_model_id=str(cell["canonical_model_id"]), + support_status=str(cell.get("support_status", "unknown")), + benchmark_certification_status=str( + cell.get("benchmark_certification_status", "unknown") + ), + request_mode=request_mode, + adapter_id="inferencex_trace_replay", + turns=turns, + ) + + +def load_replay_sessions( + export_file: str, + count_text_tokens: Callable[[str], int], + runtime_stack_ids: set[str] | None = None, + hardware_profile_ids: set[str] | None = None, + canonical_model_ids: set[str] | None = None, + trace_ids: set[str] | None = None, + support_statuses: set[str] | None = None, + request_mode: str = "auto", + image_token_estimate: int = DEFAULT_IMAGE_TOKEN_ESTIMATE, + ignore_waits: bool = False, + fallback_output_len: int = DEFAULT_FALLBACK_OUTPUT_LEN, + output_len_cap: Optional[int] = None, + session_offset: int = 0, + max_sessions: Optional[int] = None, + max_turns_per_session: Optional[int] = None, + shuffle_sessions: bool = False, + seed: int = 0, + allow_mixed_selection: bool = False, +) -> tuple[list[ReplaySession], dict[str, Any]]: + payload = json.loads(Path(export_file).read_text()) + adapter_id = str(payload.get("adapter_id", "unknown")) + export_cells = list(payload.get("exports", [])) + if adapter_id not in {"inferencex_multiturn", "inferencex_trace_replay"}: + raise ValueError( + f"Unsupported export adapter {adapter_id!r}. Expected " + "'inferencex_multiturn' or 'inferencex_trace_replay'." + ) + + selected_cells = [ + cell + for cell in export_cells + if _matches_filter(str(cell.get("runtime_stack_id", "")), runtime_stack_ids) + and _matches_filter(str(cell.get("hardware_profile_id", "")), hardware_profile_ids) + and _matches_filter(str(cell.get("canonical_model_id", "")), canonical_model_ids) + and _matches_filter(str(cell.get("trace_id", "")), trace_ids) + and _matches_filter(str(cell.get("support_status", "")), support_statuses) + ] + if not selected_cells: + raise ValueError( + "No export cells matched the requested filters. " + "Check runtime_stack_id / hardware_profile_id / canonical_model_id / " + "trace_id / support_status." + ) + + if shuffle_sessions: + random.Random(seed).shuffle(selected_cells) + + if session_offset: + selected_cells = selected_cells[session_offset:] + if max_sessions is not None: + selected_cells = selected_cells[:max_sessions] + if not selected_cells: + raise ValueError("Selection became empty after applying session_offset/max_sessions.") + + uniqueness = { + "runtime_stack_id": sorted({str(cell["runtime_stack_id"]) for cell in selected_cells}), + "hardware_profile_id": sorted({str(cell["hardware_profile_id"]) for cell in selected_cells}), + "canonical_model_id": sorted({str(cell["canonical_model_id"]) for cell in selected_cells}), + } + if not allow_mixed_selection: + mixed_fields = [field for field, values in uniqueness.items() if len(values) > 1] + if mixed_fields: + details = ", ".join(f"{field}={uniqueness[field]}" for field in mixed_fields) + raise ValueError( + "Selected export cells span multiple target server identities; " + f"filter more narrowly or pass --allow-mixed-selection. 
Mixed fields: {details}" + ) + + sessions: list[ReplaySession] = [] + for cell in selected_cells: + resolved_mode = resolve_request_mode(str(cell["runtime_stack_id"]), request_mode) + if adapter_id == "inferencex_multiturn": + sessions.append( + _build_session_from_multiturn_cell( + cell=cell, + request_mode=resolved_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ignore_waits=ignore_waits, + fallback_output_len=fallback_output_len, + output_len_cap=output_len_cap, + max_turns_per_session=max_turns_per_session, + ) + ) + else: + sessions.append( + _build_session_from_trace_replay_cell( + cell=cell, + request_mode=resolved_mode, + count_text_tokens=count_text_tokens, + image_token_estimate=image_token_estimate, + ignore_waits=ignore_waits, + fallback_output_len=fallback_output_len, + output_len_cap=output_len_cap, + max_turns_per_session=max_turns_per_session, + ) + ) + + selection_metadata = { + "adapter_id": adapter_id, + "export_file": str(export_file), + "selected_sessions": len(sessions), + "trace_ids": [session.trace_id for session in sessions], + "runtime_stack_ids": sorted({session.runtime_stack_id for session in sessions}), + "hardware_profile_ids": sorted({session.hardware_profile_id for session in sessions}), + "canonical_model_ids": sorted({session.canonical_model_id for session in sessions}), + "support_statuses": sorted({session.support_status for session in sessions}), + "support_status_counts": { + status: sum(1 for session in sessions if session.support_status == status) + for status in sorted({session.support_status for session in sessions}) + }, + "benchmark_certification_statuses": sorted( + {session.benchmark_certification_status for session in sessions} + ), + "benchmark_certification_status_counts": { + status: sum( + 1 + for session in sessions + if session.benchmark_certification_status == status + ) + for status in sorted( + {session.benchmark_certification_status for session in sessions} + ) + }, + "request_mode_mix": { + mode: sum(1 for session in sessions if session.request_mode == mode) + for mode in sorted({session.request_mode for session in sessions}) + }, + } + return sessions, selection_metadata + + +async def _iter_sse_lines( + response: aiohttp.ClientResponse, +): + """Yield individual SSE data payloads from a streaming response. + + Buffers partial lines across TCP chunks and splits multi-line chunks. + Handles the common case where multiple ``data: {...}`` frames arrive + in a single TCP read, or a single frame is split across reads. 
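+ Yields the payload strings with the leading ``data:`` prefix removed, stopping at the ``[DONE]`` sentinel.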
+ """ + buffer = b"" + async for chunk in response.content: + buffer += chunk + while b"\n" in buffer: + line, buffer = buffer.split(b"\n", 1) + line = line.strip() + if not line: + continue + decoded = line.decode("utf-8") + if decoded.startswith(":"): + continue # SSE comment / keep-alive + if decoded.startswith("data: "): + payload_str = decoded[6:].strip() + elif decoded.startswith("data:"): + payload_str = decoded[5:].strip() + else: + continue + if payload_str == "[DONE]": + return + yield payload_str + # Flush remaining buffer + remaining = buffer.strip() + if remaining: + decoded = remaining.decode("utf-8") + for prefix in ("data: ", "data:"): + if decoded.startswith(prefix): + payload_str = decoded[len(prefix):].strip() + if payload_str and payload_str != "[DONE]": + yield payload_str + break + + +async def _stream_chat_request( + api_url: str, + payload: dict[str, Any], + headers: dict[str, str], + context_len: int, + count_text_tokens: Callable[[str], int], + request_mode: str, +) -> tuple[TurnResult, int]: + turn = TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + request_mode=request_mode, + ) + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_text = (await response.text()).strip() + turn.error = f"HTTP {response.status}: {error_text or response.reason}" + return turn, response.status + + async for sse_payload in _iter_sse_lines(response): + data = json.loads(sse_payload) + if choices := data.get("choices"): + delta = choices[0].get("delta", {}) + content = delta.get("content") + if isinstance(content, list): + content = "".join( + part.get("text", "") + for part in content + if isinstance(part, dict) and part.get("type") == "text" + ) + if content: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = timestamp - st + turn.ttft = ttft + else: + turn.itl.append(timestamp - most_recent_timestamp) + most_recent_timestamp = timestamp + generated_text += content + elif usage := data.get("usage"): + turn.output_len = int(usage.get("completion_tokens") or 0) + + turn.e2el = max(0.0, most_recent_timestamp - st) + turn.success = True + if turn.output_len == 0 and generated_text: + turn.output_len = count_text_tokens(generated_text) + if turn.output_len > 1: + turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) + return turn, 200 + + +async def _send_chat_turn( + chat_messages: list[dict[str, Any]], + model_id: str, + model_name: Optional[str], + api_url: str, + output_len: int, + context_len: int, + count_text_tokens: Callable[[str], int], + ignore_eos: bool = False, +) -> TurnResult: + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", + } + payload_base = { + "model": model_name or model_id, + "messages": chat_messages, + "temperature": 0.0, + "stream": True, + "stream_options": {"include_usage": True}, + } + if ignore_eos: + payload_base["ignore_eos"] = True + + errors: list[str] = [] + for max_tokens_key in ("max_completion_tokens", "max_tokens"): + payload = {**payload_base, max_tokens_key: output_len} + turn, status = await _stream_chat_request( + api_url=api_url, + payload=payload, + headers=headers, + context_len=context_len, + count_text_tokens=count_text_tokens, + request_mode="chat", + ) + if 
turn.success: + return turn + errors.append(turn.error) + if status not in {400, 404, 422}: + break + + return TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + error=" | ".join(error for error in errors if error), + request_mode="chat", + ) + + +async def _send_completion_turn( + prompt: str, + model_id: str, + model_name: Optional[str], + api_url: str, + output_len: int, + context_len: int, + count_text_tokens: Callable[[str], int], + ignore_eos: bool = False, +) -> TurnResult: + payload = { + "model": model_name or model_id, + "prompt": prompt, + "temperature": 0.0, + "max_tokens": output_len, + "stream": True, + "stream_options": {"include_usage": True}, + } + if ignore_eos: + payload["ignore_eos"] = True + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', 'EMPTY')}", + } + + turn = TurnResult( + turn_idx=-1, + context_len=context_len, + output_len=0, + success=False, + request_mode="completions", + ) + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + + try: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: + async with session.post(url=api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_text = (await response.text()).strip() + turn.error = f"HTTP {response.status}: {error_text or response.reason}" + return turn + + async for sse_payload in _iter_sse_lines(response): + data = json.loads(sse_payload) + if choices := data.get("choices"): + choice = choices[0] + content = choice.get("text") + if content is None: + delta = choice.get("delta", {}) + content = delta.get("content") + if isinstance(content, list): + content = "".join( + part.get("text", "") + for part in content + if isinstance(part, dict) and part.get("type") == "text" + ) + if content: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = timestamp - st + turn.ttft = ttft + else: + turn.itl.append(timestamp - most_recent_timestamp) + most_recent_timestamp = timestamp + generated_text += content + elif usage := data.get("usage"): + turn.output_len = int(usage.get("completion_tokens") or 0) + except Exception as exc: + turn.error = str(exc) + return turn + + turn.e2el = max(0.0, most_recent_timestamp - st) + turn.success = True + if turn.output_len == 0 and generated_text: + turn.output_len = count_text_tokens(generated_text) + if turn.output_len > 1: + turn.tpot = (turn.e2el - turn.ttft) / (turn.output_len - 1) + return turn + + +async def poll_server_metrics(api_url: str, interval: float = 2.0) -> list[dict[str, float]]: + """Poll ``/metrics`` periodically to capture KV / cache status.""" + import urllib.parse + + parsed = urllib.parse.urlparse(api_url) + metrics_url = f"{parsed.scheme}://{parsed.netloc}/metrics" + metrics_history: list[dict[str, float]] = [] + + try: + async with aiohttp.ClientSession(trust_env=True) as session: + while True: + try: + async with session.get(metrics_url, timeout=aiohttp.ClientTimeout(total=5.0)) as response: + if response.status == 200: + text = await response.text() + snapshot: dict[str, float] = {} + for line in text.split("\n"): + parsed_line = _parse_prometheus_sample(line) + if parsed_line is None: + continue + metric_name, metric_value = parsed_line + if metric_name == "vllm:gpu_cache_usage_perc": + snapshot["vllm_gpu_cache_usage"] = metric_value + elif metric_name == "vllm:cpu_cache_usage_perc": + snapshot["vllm_cpu_cache_usage"] = metric_value + elif metric_name 
== "sglang:cache_hit_rate": + snapshot["sglang_cache_hit_rate"] = metric_value + elif metric_name == "sglang:kv_cache_usage": + snapshot["sglang_kv_cache_usage"] = metric_value + elif metric_name == "sglang:token_usage": + snapshot["sglang_token_usage"] = metric_value + elif metric_name == "vllm:num_preemptions_total": + snapshot["vllm_preemptions_total"] = metric_value + elif metric_name == "vllm:num_requests_running": + snapshot["vllm_requests_running"] = metric_value + elif metric_name == "vllm:num_requests_waiting": + snapshot["vllm_requests_waiting"] = metric_value + if snapshot: + metrics_history.append(snapshot) + except Exception: + pass + await asyncio.sleep(interval) + except asyncio.CancelledError: + pass + + return metrics_history + + +def _percentile(values: list[float], percentile: float) -> float: + if not values: + return 0.0 + return float(np.percentile(values, percentile)) + + +def calculate_multiturn_metrics( + session_results: list[SessionResult], + max_turns: int, + selected_percentiles: list[float], +) -> dict[str, Any]: + ms = 1000.0 + per_turn: dict[str, dict[str, Any]] = {} + + for turn_index in range(max_turns): + ttfts: list[float] = [] + tpots: list[float] = [] + e2els: list[float] = [] + context_lens: list[int] = [] + actual_context_lens: list[int] = [] + output_lens: list[int] = [] + successes = 0 + for session in session_results: + if turn_index < len(session.turns): + turn = session.turns[turn_index] + if turn.success: + ttfts.append(turn.ttft) + tpots.append(turn.tpot) + e2els.append(turn.e2el) + context_lens.append(turn.context_len) + actual_context_lens.append(turn.actual_context_len) + output_lens.append(turn.output_len) + successes += 1 + + key = f"turn_{turn_index + 1}" + metrics: dict[str, Any] = { + "completed": successes, + "mean_context_len": float(np.mean(context_lens)) if context_lens else 0.0, + "mean_actual_context_len": float(np.mean(actual_context_lens)) if actual_context_lens else 0.0, + "mean_output_len": float(np.mean(output_lens)) if output_lens else 0.0, + } + for label, values in (("ttft", ttfts), ("tpot", tpots), ("e2el", e2els)): + metrics[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 + metrics[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 + metrics[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 + for percentile in selected_percentiles: + percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) + metrics[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms + per_turn[key] = metrics + + all_ttfts: list[float] = [] + all_tpots: list[float] = [] + all_e2els: list[float] = [] + total_input = 0 + total_actual_input = 0 + total_output = 0 + completed_sessions = 0 + total_wall = 0.0 + max_actual_context_per_turn = 0 + + for session in session_results: + if session.turns and all(turn.success for turn in session.turns): + completed_sessions += 1 + total_input += session.total_input_tokens + total_actual_input += session.total_actual_input_tokens + total_output += session.total_output_tokens + total_wall = max(total_wall, session.total_duration) + for turn in session.turns: + if turn.success: + all_ttfts.append(turn.ttft) + all_tpots.append(turn.tpot) + all_e2els.append(turn.e2el) + if turn.actual_context_len > max_actual_context_per_turn: + max_actual_context_per_turn = turn.actual_context_len + + aggregate: dict[str, Any] = { + "completed_sessions": completed_sessions, + "total_sessions": len(session_results), + 
"total_input_tokens": total_input, + "total_actual_input_tokens": total_actual_input, + "max_actual_context_len_per_turn": max_actual_context_per_turn, + "total_output_tokens": total_output, + "total_wall_time_s": total_wall, + "session_throughput_sps": completed_sessions / total_wall if total_wall > 0 else 0.0, + "output_throughput_tps": total_output / total_wall if total_wall > 0 else 0.0, + "total_token_throughput_tps": (total_input + total_output) / total_wall if total_wall > 0 else 0.0, + } + for label, values in (("ttft", all_ttfts), ("tpot", all_tpots), ("e2el", all_e2els)): + aggregate[f"mean_{label}_ms"] = float(np.mean(values)) * ms if values else 0.0 + aggregate[f"median_{label}_ms"] = float(np.median(values)) * ms if values else 0.0 + aggregate[f"std_{label}_ms"] = float(np.std(values)) * ms if values else 0.0 + for percentile in selected_percentiles: + percentile_label = str(int(percentile)) if int(percentile) == percentile else str(percentile) + aggregate[f"p{percentile_label}_{label}_ms"] = _percentile(values, percentile) * ms + + return {"per_turn_metrics": per_turn, "aggregate_metrics": aggregate} + + +async def _run_replay_session( + session: ReplaySession, + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + pbar: Optional[tqdm], + ignore_eos: bool, +) -> SessionResult: + result = SessionResult(session_id=session.session_id) + start = time.perf_counter() + + for replay_turn in session.turns: + if replay_turn.wait_before_s > 0: + await asyncio.sleep(replay_turn.wait_before_s) + + if session.request_mode == "chat": + turn_result = await _send_chat_turn( + chat_messages=replay_turn.chat_messages, + model_id=model_id, + model_name=model_name, + api_url=chat_api_url, + output_len=replay_turn.output_len, + context_len=replay_turn.context_len, + count_text_tokens=count_text_tokens, + ignore_eos=ignore_eos, + ) + else: + turn_result = await _send_completion_turn( + prompt=replay_turn.completion_prompt, + model_id=model_id, + model_name=model_name, + api_url=completion_api_url, + output_len=replay_turn.output_len, + context_len=replay_turn.context_len, + count_text_tokens=count_text_tokens, + ignore_eos=ignore_eos, + ) + + turn_result.turn_idx = replay_turn.turn_idx + turn_result.actual_context_len = replay_turn.actual_context_len + result.turns.append(turn_result) + if turn_result.success: + result.total_input_tokens += turn_result.context_len + result.total_actual_input_tokens += turn_result.actual_context_len + result.total_output_tokens += turn_result.output_len + if pbar is not None: + pbar.update(1) + + result.total_duration = time.perf_counter() - start + return result + + +async def _run_warmup_sessions( + sessions: list[ReplaySession], + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + num_warmup_sessions: int, + ignore_eos: bool, +) -> None: + if num_warmup_sessions <= 0 or not sessions: + return + + print(f"Running {num_warmup_sessions} warmup session(s) (results discarded) ...") + warmup_jobs: list[asyncio.Task[SessionResult]] = [] + for index in range(num_warmup_sessions): + source = sessions[index % len(sessions)] + warmup_turns = [ + ReplayTurn( + turn_idx=turn.turn_idx, + turn_id=turn.turn_id, + output_len=turn.output_len, + wait_before_s=0.0, + context_len=turn.context_len, + actual_context_len=turn.actual_context_len, + chat_messages=turn.chat_messages, + 
completion_prompt=turn.completion_prompt, + ) + for turn in source.turns[: min(2, len(source.turns))] + ] + warmup_jobs.append( + asyncio.create_task( + _run_replay_session( + session=ReplaySession( + session_id=f"warmup-{index}", + trace_id=source.trace_id, + runtime_stack_id=source.runtime_stack_id, + hardware_profile_id=source.hardware_profile_id, + canonical_model_id=source.canonical_model_id, + support_status=source.support_status, + benchmark_certification_status=source.benchmark_certification_status, + request_mode=source.request_mode, + adapter_id=source.adapter_id, + turns=warmup_turns, + ), + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + pbar=None, + ignore_eos=ignore_eos, + ) + ) + ) + + results = await asyncio.gather(*warmup_jobs, return_exceptions=True) + succeeded = sum( + 1 + for result in results + if isinstance(result, SessionResult) and any(turn.success for turn in result.turns) + ) + failed = num_warmup_sessions - succeeded + if failed: + print( + f" ⚠️ {failed}/{num_warmup_sessions} warmup session(s) failed. " + "Check the server endpoint and selected export cell." + ) + else: + print(f" ✅ {succeeded} warmup session(s) completed successfully.") + print() + + +async def run_export_replay_benchmark( + sessions: list[ReplaySession], + selection_metadata: dict[str, Any], + model_id: str, + model_name: Optional[str], + chat_api_url: str, + completion_api_url: str, + count_text_tokens: Callable[[str], int], + max_concurrency: int, + selected_percentiles: list[float], + disable_tqdm: bool, + num_warmup_sessions: int = 1, + ignore_eos: bool = False, +) -> dict[str, Any]: + if not sessions: + raise ValueError("No replay sessions were selected.") + + max_turns = max(len(session.turns) for session in sessions) + total_turns = sum(len(session.turns) for session in sessions) + + print("============================================================") + print(" Export Replay Selection") + print("============================================================") + print(f" Adapter: {selection_metadata['adapter_id']}") + print(f" Sessions selected: {selection_metadata['selected_sessions']}") + print(f" Runtime stack(s): {', '.join(selection_metadata['runtime_stack_ids'])}") + print(f" Hardware profile(s): {', '.join(selection_metadata['hardware_profile_ids'])}") + print(f" Canonical model(s): {', '.join(selection_metadata['canonical_model_ids'])}") + print( + " Support status(es): " + f"{', '.join(selection_metadata['support_statuses'])}" + ) + print( + " Certification status: " + f"{', '.join(selection_metadata['benchmark_certification_statuses'])}" + ) + print(f" Request mode mix: {selection_metadata['request_mode_mix']}") + print(f" Total turns: {total_turns}") + print("============================================================") + print() + + await _run_warmup_sessions( + sessions=sessions, + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + num_warmup_sessions=num_warmup_sessions, + ignore_eos=ignore_eos, + ) + + pbar = None if disable_tqdm else tqdm(total=total_turns, desc="turns") + semaphore = asyncio.Semaphore(max_concurrency) + + async def _limited_run(session: ReplaySession) -> SessionResult: + async with semaphore: + return await _run_replay_session( + session=session, + model_id=model_id, + model_name=model_name, + chat_api_url=chat_api_url, + 
completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + pbar=pbar, + ignore_eos=ignore_eos, + ) + + print( + f"Starting export replay benchmark: {len(sessions)} sessions, " + f"max_turns={max_turns}, max_concurrency={max_concurrency}" + ) + benchmark_start = time.perf_counter() + metrics_task = asyncio.create_task(poll_server_metrics(chat_api_url, interval=2.0)) + jobs = [asyncio.create_task(_limited_run(session)) for session in sessions] + session_results = await asyncio.gather(*jobs) + benchmark_duration = time.perf_counter() - benchmark_start + + metrics_task.cancel() + try: + server_metrics = await metrics_task + except asyncio.CancelledError: + server_metrics = [] + + if pbar is not None: + pbar.close() + + metrics = calculate_multiturn_metrics( + session_results=session_results, + max_turns=max_turns, + selected_percentiles=selected_percentiles, + ) + aggregate = metrics["aggregate_metrics"] + per_turn = metrics["per_turn_metrics"] + + cache_usage_avg = 0.0 + cache_hit_rate_avg = 0.0 + gpu_cache_usage_avg = 0.0 + gpu_cache_usage_peak = 0.0 + cpu_cache_usage_avg = 0.0 + cpu_cache_usage_peak = 0.0 + gpu_cache_metric_name: str | None = None + cpu_cache_metric_name: str | None = None + observability_status = "no_cache_metrics" + cpu_samples: list[float] = [] + kv_offload_observed = False + if server_metrics: + vllm_gpu_samples = [ + item["vllm_gpu_cache_usage"] + for item in server_metrics + if "vllm_gpu_cache_usage" in item + ] + sglang_gpu_samples: list[float] = [] + saw_sglang_kv_metric = False + saw_sglang_token_metric = False + for item in server_metrics: + if "sglang_kv_cache_usage" in item: + sglang_gpu_samples.append(item["sglang_kv_cache_usage"]) + saw_sglang_kv_metric = True + elif "sglang_token_usage" in item: + sglang_gpu_samples.append(item["sglang_token_usage"]) + saw_sglang_token_metric = True + + if saw_sglang_kv_metric: + gpu_cache_metric_name = "sglang:kv_cache_usage" + elif saw_sglang_token_metric: + gpu_cache_metric_name = "sglang:token_usage" + + if vllm_gpu_samples: + gpu_samples = vllm_gpu_samples + gpu_cache_metric_name = "vllm:gpu_cache_usage_perc" + else: + gpu_samples = sglang_gpu_samples + + cpu_samples = [ + item["vllm_cpu_cache_usage"] + for item in server_metrics + if "vllm_cpu_cache_usage" in item + ] + if cpu_samples: + cpu_cache_metric_name = "vllm:cpu_cache_usage_perc" + cache_hit_samples = [ + item["sglang_cache_hit_rate"] + for item in server_metrics + if "sglang_cache_hit_rate" in item + ] + + if gpu_samples: + gpu_cache_usage_avg = float(np.mean(gpu_samples)) + gpu_cache_usage_peak = float(np.max(gpu_samples)) + cache_usage_avg = gpu_cache_usage_avg + if cpu_samples: + cpu_cache_usage_avg = float(np.mean(cpu_samples)) + cpu_cache_usage_peak = float(np.max(cpu_samples)) + kv_offload_observed = any(sample > 0.0 for sample in cpu_samples) + if cache_hit_samples: + cache_hit_rate_avg = float(np.mean(cache_hit_samples)) + if cpu_samples: + observability_status = "direct_cpu_cache_metric" + elif gpu_samples or cache_hit_samples: + observability_status = "indirect_without_cpu_cache_metric" + + print() + print("{s:{c}^{n}}".format(s=" Export Replay Benchmark Result ", n=60, c="=")) + print(f" {'Completed sessions:':<35} {aggregate['completed_sessions']}/{aggregate['total_sessions']}") + print(f" {'Benchmark duration (s):':<35} {benchmark_duration:.2f}") + print(f" {'Total input tokens (estimated):':<35} {aggregate['total_input_tokens']}") + print(f" {'Total input tokens (actual sent):':<35} {aggregate['total_actual_input_tokens']}") 
+ print(f" {'Max actual context/turn:':<35} {aggregate['max_actual_context_len_per_turn']}") + print(f" {'Total output tokens:':<35} {aggregate['total_output_tokens']}") + print(f" {'Session throughput (sessions/s):':<35} {aggregate['session_throughput_sps']:.2f}") + print(f" {'Output throughput (tok/s):':<35} {aggregate['output_throughput_tps']:.2f}") + print(f" {'Total throughput (tok/s):':<35} {aggregate['total_token_throughput_tps']:.2f}") + if server_metrics: + print() + print(f" {'Server KV Cache Usage (avg):':<35} {cache_usage_avg:.1%}") + if cpu_cache_metric_name: + print(f" {'Server CPU Cache Usage (avg):':<35} {cpu_cache_usage_avg:.1%}") + if cache_hit_rate_avg > 0: + print(f" {'Prefix Cache Hit Rate (avg):':<35} {cache_hit_rate_avg:.1%}") + if observability_status == "indirect_without_cpu_cache_metric": + print( + f" {'Offload observability:':<35} " + "indirect only (no direct CPU cache metric)" + ) + print() + print("{s:{c}^{n}}".format(s=" Per-Turn TTFT Progression ", n=60, c="-")) + print(f" {'Turn':<8} {'Est Ctx':<10} {'Act Ctx':<10} {'Mean TTFT':<14} {'P99 TTFT':<14} {'Mean E2EL':<14}") + print(f" {'─'*8} {'─'*10} {'─'*10} {'─'*14} {'─'*14} {'─'*14}") + for turn_index in range(max_turns): + key = f"turn_{turn_index + 1}" + if key not in per_turn: + continue + turn_metrics = per_turn[key] + print( + f" {turn_index + 1:<8} " + f"{turn_metrics['mean_context_len']:<10.0f} " + f"{turn_metrics.get('mean_actual_context_len', 0.0):<10.0f} " + f"{turn_metrics['mean_ttft_ms']:<14.1f} " + f"{turn_metrics.get('p99_ttft_ms', 0.0):<14.1f} " + f"{turn_metrics['mean_e2el_ms']:<14.1f}" + ) + print("=" * 60) + + return { + "mode": "export_replay", + "adapter_id": selection_metadata["adapter_id"], + "selection": selection_metadata, + "duration": benchmark_duration, + "num_sessions": len(sessions), + "max_turns": max_turns, + "max_concurrency": max_concurrency, + "num_warmup_sessions": num_warmup_sessions, + "server_metrics_summary": { + "cache_usage_avg": cache_usage_avg, + "cache_hit_rate_avg": cache_hit_rate_avg, + "gpu_cache_usage_avg": gpu_cache_usage_avg, + "gpu_cache_usage_peak": gpu_cache_usage_peak, + "gpu_cache_metric_name": gpu_cache_metric_name, + "cpu_cache_usage_avg": cpu_cache_usage_avg, + "cpu_cache_usage_peak": cpu_cache_usage_peak, + "cpu_cache_metric_name": cpu_cache_metric_name, + "cpu_cache_metric_available": bool(cpu_samples), + "observability_status": observability_status, + # Observability-only signal; not a certification or quality claim. 
+ "kv_offload_observed": kv_offload_observed, + "samples": len(server_metrics), + "preemption_count": int( + max( + (item.get("vllm_preemptions_total", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0, + "peak_requests_running": float( + max( + (item.get("vllm_requests_running", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0.0, + "peak_requests_waiting": float( + max( + (item.get("vllm_requests_waiting", 0.0) for item in server_metrics), + default=0.0, + ) + ) if server_metrics else 0.0, + }, + "depth_telemetry": { + "total_estimated_input_tokens": aggregate["total_input_tokens"], + "total_actual_input_tokens": aggregate["total_actual_input_tokens"], + "max_actual_context_len_per_turn": aggregate["max_actual_context_len_per_turn"], + }, + **metrics, + } + + +def main(args: argparse.Namespace) -> None: + random.seed(args.seed) + np.random.seed(args.seed) + + base_url = args.base_url or f"http://{args.host}:{args.port}" + base_url = base_url.rstrip("/") + chat_api_url = args.chat_api_url or f"{base_url}{args.chat_endpoint}" + completion_api_url = args.completion_api_url or f"{base_url}{args.completion_endpoint}" + + tokenizer_id = None if args.skip_tokenizer_load else (args.tokenizer or args.model) + count_text_tokens = build_text_token_counter( + tokenizer_id=tokenizer_id, + tokenizer_mode=args.tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) + sessions, selection_metadata = load_replay_sessions( + export_file=args.export_file, + count_text_tokens=count_text_tokens, + runtime_stack_ids=_csv_values(args.runtime_stack_id), + hardware_profile_ids=_csv_values(args.hardware_profile_id), + canonical_model_ids=_csv_values(args.canonical_model_id), + trace_ids=_csv_values(args.trace_id), + support_statuses=_csv_values(args.support_status), + request_mode=args.request_mode, + image_token_estimate=args.image_token_estimate, + ignore_waits=args.ignore_waits, + fallback_output_len=args.fallback_output_len, + output_len_cap=args.max_output_len, + session_offset=args.session_offset, + max_sessions=args.max_sessions, + max_turns_per_session=args.max_turns_per_session, + shuffle_sessions=args.shuffle_sessions, + seed=args.seed, + allow_mixed_selection=args.allow_mixed_selection, + ) + + result = asyncio.run( + run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection_metadata, + model_id=args.model, + model_name=args.served_model_name, + chat_api_url=chat_api_url, + completion_api_url=completion_api_url, + count_text_tokens=count_text_tokens, + max_concurrency=args.max_concurrency, + selected_percentiles=[float(item) for item in args.metric_percentiles.split(",")], + disable_tqdm=args.disable_tqdm, + num_warmup_sessions=args.num_warmup_sessions, + ignore_eos=args.ignore_eos, + ) + ) + + if args.save_result: + result_json: dict[str, Any] = { + "date": datetime.now().strftime("%Y%m%d-%H%M%S"), + "model_id": args.model, + } + if tokenizer_id is not None: + result_json["tokenizer_id"] = tokenizer_id + if args.metadata: + for item in args.metadata: + if "=" in item: + key, value = item.split("=", 1) + result_json[key.strip()] = value.strip() + result_json = {**result_json, **result} + + file_name = args.result_filename or f"export-replay-{Path(args.export_file).stem}.json" + if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) + file_name = os.path.join(args.result_dir, file_name) + + with open(file_name, "w", encoding="utf-8") as handle: + json.dump(result_json, handle, indent=2) + 
print(f"\nResults saved to {file_name}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description=( + "Replay ISB1 export sessions against an OpenAI-compatible server. " + "Supports chat-completions replay for standalone vLLM/SGLang and " + "prompt-projected completions replay for TRT / Dynamo-style cells." + ) + ) + + parser.add_argument("--export-file", type=str, required=True, + help="Path to an inferencex_multiturn or inferencex_trace_replay export JSON") + parser.add_argument("--base-url", type=str, default=None, + help="Server base URL, e.g. http://0.0.0.0:8000") + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--chat-endpoint", type=str, default="/v1/chat/completions") + parser.add_argument("--completion-endpoint", type=str, default="/v1/completions") + parser.add_argument("--chat-api-url", type=str, default=None, + help="Override the full chat endpoint URL") + parser.add_argument("--completion-api-url", type=str, default=None, + help="Override the full completions endpoint URL") + + parser.add_argument("--model", type=str, required=True, + help="Model identifier sent to the target server") + parser.add_argument("--served-model-name", type=str, default=None, + help="Served model name if different from --model") + parser.add_argument("--tokenizer", type=str, default=None, + help="Tokenizer name/path if different from --model") + parser.add_argument("--tokenizer-mode", type=str, default="auto", + choices=["auto", "slow", "mistral", "custom"]) + parser.add_argument("--trust-remote-code", action="store_true") + parser.add_argument("--skip-tokenizer-load", action="store_true", + help="Use approximate token counting instead of loading a tokenizer") + + parser.add_argument("--runtime-stack-id", type=str, default=None, + help="Comma-separated runtime_stack_id filter(s)") + parser.add_argument("--hardware-profile-id", type=str, default=None, + help="Comma-separated hardware_profile_id filter(s)") + parser.add_argument("--canonical-model-id", type=str, default=None, + help="Comma-separated canonical_model_id filter(s)") + parser.add_argument("--trace-id", type=str, default=None, + help="Comma-separated trace_id filter(s)") + parser.add_argument("--support-status", type=str, default=None, + help="Comma-separated support_status filter(s)") + parser.add_argument("--request-mode", type=str, default="auto", + choices=["auto", "chat", "completions"]) + parser.add_argument("--allow-mixed-selection", action="store_true", + help="Allow multiple runtime/model/hardware identities in one run") + parser.add_argument("--shuffle-sessions", action="store_true") + parser.add_argument("--session-offset", type=int, default=0) + parser.add_argument("--max-sessions", type=int, default=None) + parser.add_argument("--max-turns-per-session", type=int, default=None) + parser.add_argument("--ignore-waits", action="store_true", + help="Ignore export wait_before/arrival-time gaps") + parser.add_argument("--fallback-output-len", type=int, default=DEFAULT_FALLBACK_OUTPUT_LEN, + help="Fallback output length when export metadata is missing") + parser.add_argument("--max-output-len", type=int, default=None, + help="Optional cap applied to each exported target output length") + parser.add_argument("--image-token-estimate", type=int, default=DEFAULT_IMAGE_TOKEN_ESTIMATE, + help="Approximate token cost for image blocks when no explicit token count exists") + + parser.add_argument("--max-concurrency", type=int, 
default=8, + help="Maximum concurrently active replay sessions") + parser.add_argument("--num-warmup-sessions", type=int, default=1, + help="Warmup sessions to prime KV/prefix cache before measurement") + parser.add_argument("--ignore-eos", action="store_true") + + parser.add_argument("--save-result", action="store_true") + parser.add_argument("--result-dir", type=str, default=None) + parser.add_argument("--result-filename", type=str, default=None) + parser.add_argument("--metadata", metavar="KEY=VALUE", nargs="*") + parser.add_argument("--metric-percentiles", type=str, default="90,99,99.9") + + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--disable-tqdm", action="store_true") + + main(parser.parse_args()) diff --git a/utils/gate_isb1.py b/utils/gate_isb1.py new file mode 100644 index 000000000..e223e8c29 --- /dev/null +++ b/utils/gate_isb1.py @@ -0,0 +1,298 @@ +import argparse +import json +from pathlib import Path +from typing import Any, Callable + + +Row = dict[str, Any] +Criterion = tuple[str, Callable[[Row], bool]] + +EXPECTED_131K_COVERAGE = { + ("b200", "vllm"), + ("b200", "sglang"), + ("h100", "vllm"), + ("h100", "sglang"), + ("h200", "vllm"), + ("h200", "sglang"), +} +EXPECTED_1M_COVERAGE = { + ("b200", "vllm"), + ("b200", "sglang"), +} + + +def normalize_hw_label(hw: str | None) -> str: + """Normalize runner labels like h200-cw-1 to coverage labels like h200.""" + if not hw: + return "" + return hw.split("-", 1)[0] + + +def load_rows(report_path: Path) -> list[Row]: + """Load aggregated ISB1 rows from JSON.""" + payload = json.loads(report_path.read_text()) + if isinstance(payload, list): + return [row for row in payload if isinstance(row, dict)] + if isinstance(payload, dict): + return [payload] + raise ValueError(f"Unsupported ISB1 payload type: {type(payload)!r}") + + +def build_row_reference(row: Row, failed_criteria: list[str] | None = None) -> Row: + """Build a concise row reference for gate reports.""" + reference: Row = { + "result_filename": row.get("result_filename"), + "artifact_stems": row.get("artifact_stems") or {}, + "hw": row.get("hw"), + "framework": row.get("framework"), + "infmax_model_prefix": row.get("infmax_model_prefix"), + "support_status": row.get("support_status"), + "context_pressure_status": (row.get("context_pressure_signal") or {}).get("status"), + } + if failed_criteria: + reference["failed_criteria"] = failed_criteria + return reference + + +def completed_sessions_match(row: Row) -> bool: + return row.get("completed_sessions") == row.get("total_sessions") + + +def throughput_positive(row: Row) -> bool: + return float(row.get("session_throughput_sps") or 0.0) > 0.0 + + +def certification_verified(row: Row) -> bool: + return row.get("benchmark_certification_status") == "dataset_replay_verified" + + +def context_not_suspicious(row: Row) -> bool: + return not bool(row.get("context_pressure_suspicious")) + + +def vllm_context_ok(row: Row) -> bool: + if row.get("framework") != "vllm": + return True + signal = row.get("context_pressure_signal") or {} + return signal.get("status") == "ok" and not bool(row.get("context_pressure_suspicious")) + + +def get_present_coverage(rows: list[Row]) -> set[tuple[str, str]]: + return { + (normalize_hw_label(row.get("hw")), row.get("framework", "")) + for row in rows + } + + +def evaluate_gate( + gate_id: str, + label: str, + rows: list[Row], + criteria: list[Criterion], + *, + expected_coverage: set[tuple[str, str]] | None = None, + exact_coverage: bool = False, +) -> Row: + """Evaluate a 
gate definition over matching rows.""" + if not rows: + return { + "id": gate_id, + "label": label, + "status": "no_rows", + "matched_rows": 0, + "failing_rows": [], + "review_required_rows": [], + "missing_coverage": [], + "unexpected_coverage": [], + } + + failing_rows = [] + review_required_rows = [] + for row in rows: + failed_criteria = [description for description, checker in criteria if not checker(row)] + if failed_criteria: + failing_rows.append(build_row_reference(row, failed_criteria)) + signal = row.get("context_pressure_signal") or {} + if signal.get("requires_log_review"): + review_required_rows.append(build_row_reference(row)) + + missing_coverage: list[list[str]] = [] + unexpected_coverage: list[list[str]] = [] + if expected_coverage is not None: + present_coverage = get_present_coverage(rows) + missing_coverage = [list(item) for item in sorted(expected_coverage - present_coverage)] + if exact_coverage: + unexpected_coverage = [list(item) for item in sorted(present_coverage - expected_coverage)] + + status = "pass" + if failing_rows or missing_coverage or unexpected_coverage: + status = "fail" + + return { + "id": gate_id, + "label": label, + "status": status, + "matched_rows": len(rows), + "failing_rows": failing_rows, + "review_required_rows": review_required_rows, + "missing_coverage": missing_coverage, + "unexpected_coverage": unexpected_coverage, + } + + +def build_gate_report(rows: list[Row], advisory: bool = True) -> Row: + """Build the full advisory gate report for an aggregated ISB1 result set.""" + gates = [ + evaluate_gate( + "control_lanes", + "DSR1/GPT-OSS control lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") in {"dsr1", "gptoss"} + and row.get("support_status") == "supported" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("session_throughput_sps > 0", throughput_positive), + ], + ), + evaluate_gate( + "qwen_131k", + "Qwen 131k preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("support_status") == "reviewed_preview" + and (row.get("effective_max_context_depth") or 0) < 200000 + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("session_throughput_sps > 0", throughput_positive), + ], + expected_coverage=EXPECTED_131K_COVERAGE, + ), + evaluate_gate( + "qwen_500k", + "Qwen 500k preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("effective_max_context_depth") == 524288 + and row.get("context_pressure_class") == "extended_500k" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ( + "benchmark_certification_status == dataset_replay_verified", + certification_verified, + ), + ("context_pressure_suspicious == false", context_not_suspicious), + ("vllm context_pressure_signal.status == ok", vllm_context_ok), + ], + ), + evaluate_gate( + "qwen_1m", + "Qwen 1M preview lanes", + [ + row + for row in rows + if row.get("infmax_model_prefix") == "qwen3.5" + and row.get("effective_max_context_depth") == 1048576 + and row.get("context_pressure_class") == "extended_1m" + ], + [ + ("completed_sessions == total_sessions", completed_sessions_match), + ("context_pressure_suspicious == false", context_not_suspicious), + ("vllm context_pressure_signal.status == ok", vllm_context_ok), + ], + expected_coverage=EXPECTED_1M_COVERAGE, + exact_coverage=True, + ), + ] + + statuses = {gate["status"] for gate in gates} + if "fail" in statuses: + overall = "fail" + 
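+    # Hedged reading of the resolution below: any single failing gate forces
+    # the whole report to "fail"; only a uniform {"pass"} set yields "pass";
+    # a mix that includes "no_rows" gates therefore lands on "partial".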
elif statuses == {"pass"}: + overall = "pass" + else: + overall = "partial" + + return { + "gates": gates, + "overall": overall, + "advisory": advisory, + } + + +def render_markdown(report: Row) -> str: + """Render a concise markdown advisory summary for workflow step summaries.""" + lines = [ + "## ISB1 Advisory Gates", + "", + f"Overall: **{report['overall'].upper()}** ({'advisory' if report['advisory'] else 'strict'})", + "", + ] + + for gate in report["gates"]: + lines.append(f"### {gate['label']} — {gate['status'].upper()}") + lines.append("") + lines.append(f"- Matched rows: {gate['matched_rows']}") + if gate["missing_coverage"]: + formatted = ", ".join(f"{hw}/{framework}" for hw, framework in gate["missing_coverage"]) + lines.append(f"- Missing coverage: {formatted}") + if gate["unexpected_coverage"]: + formatted = ", ".join( + f"{hw}/{framework}" for hw, framework in gate["unexpected_coverage"] + ) + lines.append(f"- Unexpected coverage: {formatted}") + if gate["failing_rows"]: + lines.append("- Failing rows:") + for row in gate["failing_rows"]: + failed_criteria = ", ".join(row.get("failed_criteria", [])) or "unknown" + lines.append( + f" - `{row.get('result_filename', 'unknown')}` ({row.get('hw', '-')}/" + f"{row.get('framework', '-')}) failed: {failed_criteria}" + ) + elif gate["matched_rows"]: + lines.append("- No failing rows.") + if gate["review_required_rows"]: + review_rows = ", ".join( + f"`{row.get('result_filename', 'unknown')}`" for row in gate["review_required_rows"] + ) + lines.append( + "- Manual log review still required for: " + f"{review_rows}" + ) + lines.append("") + + return "\n".join(lines).rstrip() + "\n" + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Evaluate advisory ISB1 gates.") + parser.add_argument("report_path", type=Path) + parser.add_argument("--strict", action="store_true") + parser.add_argument("--format", choices=["json", "markdown"], default="json") + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + report = build_gate_report(load_rows(args.report_path), advisory=not args.strict) + + if args.format == "markdown": + print(render_markdown(report)) + else: + print(json.dumps(report, indent=2)) + + if args.strict and report["overall"] == "fail": + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index bc4562415..14c69d3e9 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -10,7 +10,11 @@ from validation import ( validate_matrix_entry, + validate_isb1_matrix_entry, + validate_isb1_kv_stress_matrix_entry, load_config_files, + load_isb1_config_files, + load_isb1_kv_stress_config_files, load_runner_file, Fields ) @@ -374,6 +378,243 @@ def generate_full_sweep(args, all_config_data, runner_data): return matrix_values +def generate_isb1_sweep(args, all_config_data, runner_data): + """Generate ISB1 replay sweep configurations with optional filtering.""" + if args.runner_type: + valid_runner_types = set(runner_data.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. 
" + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" + ) + + matrix_values = [] + + for _, val in all_config_data.items(): + if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: + continue + + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + benchmark_type = val[Fields.BENCHMARK_TYPE.value] + runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] + hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] + canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] + max_model_len = val.get(Fields.MAX_MODEL_LEN.value) + + runner_nodes_to_use = None + if args.runner_node_filter: + runner_nodes = runner_data.get(runner, []) + runner_nodes_to_use = [ + node for node in runner_nodes if args.runner_node_filter in node + ] + if not runner_nodes_to_use: + continue + + replay_configs = val[Fields.REPLAY_CONFIGS.value] + for replay_config in replay_configs: + export_file = replay_config[Fields.EXPORT_FILE.value] + request_mode = replay_config[Fields.REQUEST_MODE.value] + support_status = replay_config.get(Fields.SUPPORT_STATUS.value) + + for replay_space in replay_config[Fields.SEARCH_SPACE.value]: + max_concurrency = replay_space[Fields.MAX_CONCURRENCY.value] + + if args.max_concurrency is not None: + if args.max_concurrency <= 0: + continue + max_concurrency = min(max_concurrency, args.max_concurrency) + + runners_for_entry = ( + runner_nodes_to_use if runner_nodes_to_use else [runner] + ) + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.BENCHMARK_TYPE.value: benchmark_type, + Fields.EXPORT_FILE.value: export_file, + Fields.RUNTIME_STACK_ID.value: runtime_stack_id, + Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, + Fields.CANONICAL_MODEL_ID.value: canonical_model_id, + Fields.SUPPORT_STATUS.value: support_status, + Fields.REQUEST_MODE.value: request_mode, + Fields.MAX_CONCURRENCY.value: max_concurrency, + Fields.MAX_SESSIONS.value: replay_space.get(Fields.MAX_SESSIONS.value), + Fields.MAX_TURNS_PER_SESSION.value: replay_space.get(Fields.MAX_TURNS_PER_SESSION.value), + Fields.MAX_OUTPUT_LEN.value: replay_space.get(Fields.MAX_OUTPUT_LEN.value), + Fields.NUM_WARMUP_SESSIONS.value: replay_space.get( + Fields.NUM_WARMUP_SESSIONS.value, 0 + ), + Fields.IGNORE_WAITS.value: replay_space.get( + Fields.IGNORE_WAITS.value, False + ), + Fields.IGNORE_EOS.value: replay_space.get( + Fields.IGNORE_EOS.value, False + ), + Fields.MAX_MODEL_LEN.value: max_model_len, + Fields.OFFLOAD_MODE.value: val.get(Fields.OFFLOAD_MODE.value), + Fields.KV_CACHE_DTYPE.value: val.get(Fields.KV_CACHE_DTYPE.value), + Fields.DISABLE_PREFIX_CACHING.value: val.get( + Fields.DISABLE_PREFIX_CACHING.value + ), + 'benchmark-duration-s': replay_space.get('benchmark-duration-s'), + Fields.EXP_NAME.value: f"{model_code}_isb1", + } + validate_isb1_matrix_entry(entry) + matrix_values.append(entry) + + return matrix_values + + 
+def generate_isb1_kv_stress_sweep(args, all_config_data, runner_data): + """Generate ISB1 KV stress sweep configurations with optional filtering.""" + if args.runner_type: + valid_runner_types = set(runner_data.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. " + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}" + ) + + matrix_values = [] + + for _, val in all_config_data.items(): + if args.model_prefix and val[Fields.MODEL_PREFIX.value] not in args.model_prefix: + continue + + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + model_code = val[Fields.MODEL_PREFIX.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + benchmark_type = val[Fields.BENCHMARK_TYPE.value] + runtime_stack_id = val[Fields.RUNTIME_STACK_ID.value] + hardware_profile_id = val[Fields.HARDWARE_PROFILE_ID.value] + canonical_model_id = val[Fields.CANONICAL_MODEL_ID.value] + max_model_len = val.get(Fields.MAX_MODEL_LEN.value) + kv_cache_dtype = val[Fields.KV_CACHE_DTYPE.value] + + runner_nodes_to_use = None + if args.runner_node_filter: + runner_nodes = runner_data.get(runner, []) + runner_nodes_to_use = [ + node for node in runner_nodes if args.runner_node_filter in node + ] + if not runner_nodes_to_use: + continue + + kv_stress_configs = val[Fields.KV_STRESS_CONFIGS.value] + for kv_stress_config in kv_stress_configs: + export_file = kv_stress_config[Fields.EXPORT_FILE.value] + request_mode = kv_stress_config[Fields.REQUEST_MODE.value] + support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) + workload_type = kv_stress_config[Fields.WORKLOAD_TYPE.value] + + runners_for_entry = ( + runner_nodes_to_use if runner_nodes_to_use else [runner] + ) + + def _append_kv_stress_entry( + max_concurrency: int, + offload_mode: str, + duration_s: int, + *, + tp: int | None = None, + ep: int | None = None, + ) -> None: + disable_prefix_caching = offload_mode == "noprefix" + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.BENCHMARK_TYPE.value: benchmark_type, + Fields.EXPORT_FILE.value: export_file, + Fields.RUNTIME_STACK_ID.value: runtime_stack_id, + Fields.HARDWARE_PROFILE_ID.value: hardware_profile_id, + Fields.CANONICAL_MODEL_ID.value: canonical_model_id, + Fields.SUPPORT_STATUS.value: support_status, + Fields.REQUEST_MODE.value: request_mode, + Fields.MAX_CONCURRENCY.value: max_concurrency, + Fields.OFFLOAD_MODE.value: offload_mode, + Fields.KV_CACHE_DTYPE.value: kv_cache_dtype, + Fields.DISABLE_PREFIX_CACHING.value: disable_prefix_caching, + 'benchmark-duration-s': duration_s, + Fields.WORKLOAD_TYPE.value: workload_type, + Fields.MAX_MODEL_LEN.value: max_model_len, + Fields.EXP_NAME.value: f"{model_code}_isb1_kv_stress", + } + if tp is not None: + entry[Fields.TP.value] = tp + if ep is not None: + entry[Fields.EP.value] = ep + validate_isb1_kv_stress_matrix_entry(entry) + matrix_values.append(entry) + + tp_configs = 
kv_stress_config.get('tp-configs') + if tp_configs: + for tp_config in tp_configs: + tp_value = tp_config[Fields.TP.value] + ep_value = tp_config.get(Fields.EP.value, 1) + users = tp_config[Fields.USERS.value] + offload_modes = tp_config[Fields.OFFLOAD_MODES.value] + duration_s = tp_config[Fields.DURATION_S.value] + + for max_concurrency in users: + for offload_mode in offload_modes: + _append_kv_stress_entry( + max_concurrency, + offload_mode, + duration_s, + tp=tp_value, + ep=ep_value, + ) + else: + for stress_space in kv_stress_config[Fields.SEARCH_SPACE.value]: + users = stress_space[Fields.USERS.value] + offload_modes = stress_space[Fields.OFFLOAD_MODES.value] + duration_s = stress_space[Fields.DURATION_S.value] + + for max_concurrency in users: + for offload_mode in offload_modes: + _append_kv_stress_entry(max_concurrency, offload_mode, duration_s) + + return matrix_values + + def generate_runner_model_sweep_config(args, all_config_data, runner_data): """Generate runner-model sweep configurations. @@ -885,6 +1126,86 @@ def main(): help='Show this help message and exit' ) + # Subcommand: isb1-sweep + isb1_sweep_parser = subparsers.add_parser( + 'isb1-sweep', + parents=[parent_parser], + add_help=False, + help='Generate ISB1 replay sweep configurations' + ) + isb1_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' + ) + isb1_sweep_parser.add_argument( + '--max-concurrency', + type=int, + required=False, + help='Maximum replay concurrency value to include (caps higher values)' + ) + isb1_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + # Subcommand: isb1-kv-stress-sweep + isb1_kv_stress_sweep_parser = subparsers.add_parser( + 'isb1-kv-stress-sweep', + parents=[parent_parser], + add_help=False, + help='Generate ISB1 KV stress sweep configurations' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, b200) (optional, can specify multiple)' + ) + isb1_kv_stress_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + # Subcommand: test-config test_config_keys_parser = subparsers.add_parser( 'test-config', @@ -915,7 +1236,12 @@ def main(): apply_node_type_defaults(args) # Load and validate configuration files (validation happens by default in load functions) - all_config_data = 
load_config_files(args.config_files) + if args.command == 'isb1-sweep': + all_config_data = load_isb1_config_files(args.config_files) + elif args.command == 'isb1-kv-stress-sweep': + all_config_data = load_isb1_kv_stress_config_files(args.config_files) + else: + all_config_data = load_config_files(args.config_files) runner_data = load_runner_file(args.runner_config) # Route to appropriate function based on subcommand @@ -924,13 +1250,17 @@ def main(): elif args.command == 'runner-model-sweep': matrix_values = generate_runner_model_sweep_config( args, all_config_data, runner_data) + elif args.command == 'isb1-sweep': + matrix_values = generate_isb1_sweep(args, all_config_data, runner_data) + elif args.command == 'isb1-kv-stress-sweep': + matrix_values = generate_isb1_kv_stress_sweep(args, all_config_data, runner_data) elif args.command == 'test-config': matrix_values = generate_test_config_sweep(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") # Handle eval options (mutually exclusive: --no-evals or --evals-only) - if not args.no_evals: + if args.command not in ('isb1-sweep', 'isb1-kv-stress-sweep') and not args.no_evals: matrix_values = mark_eval_entries(matrix_values) if args.evals_only: matrix_values = [e for e in matrix_values if e.get(Fields.RUN_EVAL.value, False)] diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index d05299472..cbee3f0a6 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -1,22 +1,73 @@ """Comprehensive tests for generate_sweep_configs.py""" import pytest import argparse +import json +from pathlib import Path from generate_sweep_configs import ( seq_len_stoi, seq_len_itos, seq_len_to_str, generate_full_sweep, + generate_isb1_sweep, + generate_isb1_kv_stress_sweep, generate_runner_model_sweep_config, apply_node_type_defaults, expand_config_keys, mark_eval_entries, ) +from validation import ( + load_config_files, + load_isb1_config_files, + load_isb1_kv_stress_config_files, +) # ============================================================================= # Test Fixtures # ============================================================================= + +def _write_isb1_export_fixture( + root: Path, + relative_path: str, + *, + runtime_stack_id: str, + hardware_profile_id: str, + canonical_model_id: str, + support_status: str, + benchmark_certification_status: str = "dataset_replay_verified", +) -> None: + export_path = root / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text( + json.dumps( + { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": f"{export_path.stem}-trace", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": hardware_profile_id, + "canonical_model_id": canonical_model_id, + "support_status": support_status, + "benchmark_certification_status": benchmark_certification_status, + "session": { + "session_id": "fixture-session", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [{"role": "user", "content": "hi"}], + "expected_output_tokens": 8, + } + ], + }, + } + ], + } + ) + ) + @pytest.fixture def sample_single_node_config(): """Single node config based on dsr1-fp8-mi300x-sglang.""" @@ -149,6 +200,161 @@ def full_sweep_args_multi_node(): return args +@pytest.fixture +def sample_isb1_config(): + """ISB1 replay config based on NVIDIA H200 replay lane.""" + return { + "dsr1-isb1-h200-vllm": { + "image": 
"vllm/vllm-openai:v0.8.5", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_replay", + "runtime-stack-id": "vllm-0.8.5-h200", + "hardware-profile-id": "h200-8gpu", + "canonical-model-id": "deepseek-r1-0528", + "max-model-len": 16384, + "replay-configs": [ + { + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + { + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + }, + {"max-concurrency": 8}, + {"max-concurrency": 16}, + ], + }, + { + "export-file": "datasets/isb1/exports/core/code_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + {"max-concurrency": 4}, + {"max-concurrency": 8}, + ], + }, + ], + } + } + + +@pytest.fixture +def isb1_sweep_args(): + """Args for isb1-sweep command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.max_concurrency = None + args.runner_node_filter = None + return args + + +@pytest.fixture +def sample_isb1_kv_stress_config(): + """ISB1 KV stress config with users/offload-mode search space.""" + return { + "gptoss-fp4-h200-isb1-kv-stress-vllm-code": { + "image": "vllm/vllm-openai:v0.18.0", + "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", + "precision": "fp4", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_kv_stress", + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h200_sxm_141gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 131272, + "kv-cache-dtype": "fp8", + "kv-stress-configs": [ + { + "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", + "request-mode": "multi-turn", + "support-status": "reviewed_preview", + "workload-type": "code", + "search-space": [ + { + "users": [2, 4, 8], + "offload-modes": ["on", "off", "noprefix"], + "duration-s": 1800, + } + ], + } + ], + } + } + + +@pytest.fixture +def sample_isb1_kv_stress_tp_config(): + """ISB1 KV stress config using per-TP expansion.""" + return { + "gptoss-fp4-h200-isb1-kv-stress-vllm-code-tp": { + "image": "vllm/vllm-openai:v0.18.0", + "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", + "precision": "fp4", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_kv_stress", + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h200_sxm_141gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 131272, + "kv-cache-dtype": "fp8", + "kv-stress-configs": [ + { + "export-file": "datasets/isb1/exports/extension_131k/vllm/code_131k1k.json", + "request-mode": "multi-turn", + "support-status": "reviewed_preview", + "workload-type": "code", + "search-space": [ + { + "users": [1], + "offload-modes": ["off"], + "duration-s": 10, + } + ], + "tp-configs": [ + { + "tp": 8, + "ep": 1, + "users": [2, 4, 8], + "offload-modes": ["on", "off", "noprefix"], + "duration-s": 1800, + } + ], + } + ], + } + } + + +@pytest.fixture +def isb1_kv_stress_sweep_args(): + """Args for isb1-kv-stress-sweep command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.runner_node_filter = None + return args + + # 
============================================================================= # Test seq_len mappings # ============================================================================= @@ -181,6 +387,573 @@ def test_unknown_sequence_lengths(self): assert seq_len_to_str(4096, 1024) == "4096_1024" +# ============================================================================= +# Test generate_isb1_sweep +# ============================================================================= + +class TestGenerateISB1Sweep: + """Tests for generate_isb1_sweep.""" + + def test_basic_sweep_generation(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + def test_matrix_entry_structure(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + entry = result[0] + assert entry["benchmark-type"] == "isb1_replay" + assert entry["export-file"].endswith("chat_8k1k.json") + assert entry["runtime-stack-id"] == "vllm-0.8.5-h200" + assert entry["hardware-profile-id"] == "h200-8gpu" + assert entry["canonical-model-id"] == "deepseek-r1-0528" + assert entry["support-status"] == "supported" + assert entry["request-mode"] == "multi-turn" + assert entry["max-concurrency"] == 4 + assert entry["max-sessions"] == 2 + assert entry["max-turns-per-session"] == 6 + assert entry["max-output-len"] == 512 + assert entry["num-warmup-sessions"] == 1 + assert entry["ignore-waits"] is True + assert entry["ignore-eos"] is False + assert entry["max-model-len"] == 16384 + assert entry["exp-name"] == "dsr1_isb1" + assert "run-eval" not in entry + + def test_filter_by_model_prefix(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.model_prefix = ["dsr1"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.model_prefix = ["gptoss"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_precision(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.precision = ["fp8"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.precision = ["fp4"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_framework(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.framework = ["vllm"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.framework = ["sglang"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_filter_by_runner_type(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_type = ["h200"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + + isb1_sweep_args.runner_type = ["h100"] + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def 
test_invalid_runner_type_raises_error(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_type = ["not-a-runner"] + with pytest.raises(ValueError, match="Invalid runner type"): + generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + + def test_max_concurrency_cap(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.max_concurrency = 6 + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 5 + assert sorted(entry["max-concurrency"] for entry in result) == [4, 4, 6, 6, 6] + + def test_non_positive_max_concurrency_skips_all(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.max_concurrency = 0 + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_max_model_len_passthrough_optional(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert all(entry["max-model-len"] == 16384 for entry in result) + + sample_isb1_config["dsr1-isb1-h200-vllm"].pop("max-model-len") + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert all(entry["max-model-len"] is None for entry in result) + + def test_runner_node_filter_expands_runner_nodes(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_node_filter = "cw" + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert len(result) == 10 + assert all(entry["runner"].startswith("h200-cw") for entry in result) + + def test_runner_node_filter_no_match_returns_empty(self, sample_isb1_config, sample_runner_config, isb1_sweep_args): + isb1_sweep_args.runner_node_filter = "does-not-exist" + result = generate_isb1_sweep( + isb1_sweep_args, + sample_isb1_config, + sample_runner_config, + ) + assert result == [] + + def test_main_routes_isb1_sweep(self, tmp_path, sample_isb1_config, sample_runner_config, monkeypatch): + import yaml + import sys + from generate_sweep_configs import main + + sample_entry = sample_isb1_config["dsr1-isb1-h200-vllm"] + for replay_config in sample_entry["replay-configs"]: + _write_isb1_export_fixture( + tmp_path, + replay_config["export-file"], + runtime_stack_id=sample_entry["runtime-stack-id"], + hardware_profile_id=sample_entry["hardware-profile-id"], + canonical_model_id=sample_entry["canonical-model-id"], + support_status=replay_config["support-status"], + ) + + config_file = tmp_path / "isb1.yaml" + runner_file = tmp_path / "runners.yaml" + config_file.write_text(yaml.dump(sample_isb1_config)) + runner_file.write_text(yaml.dump(sample_runner_config)) + + monkeypatch.setattr( + sys, + "argv", + [ + "generate_sweep_configs.py", + "isb1-sweep", + "--config-files", + str(config_file), + "--runner-config", + str(runner_file), + ], + ) + + result = main() + assert len(result) == 5 + assert all(entry["benchmark-type"] == "isb1_replay" for entry in result) + + +class TestKVStressSweep: + """Tests for generate_isb1_kv_stress_sweep.""" + + def test_basic_kv_stress_sweep_generation( + self, + sample_isb1_kv_stress_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_config, + sample_runner_config, + ) 
+ # users(3) * offload-modes(3) = 9 flattened rows + assert len(result) == 9 + + def test_flatten_users_x_offload_modes( + self, + sample_isb1_kv_stress_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_config, + sample_runner_config, + ) + + assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in result) + assert all(isinstance(entry["max-concurrency"], int) for entry in result) + assert all(isinstance(entry["offload-mode"], str) for entry in result) + assert all(entry["benchmark-duration-s"] == 1800 for entry in result) + assert all(entry["kv-cache-dtype"] == "fp8" for entry in result) + assert all(entry["workload-type"] == "code" for entry in result) + + pairs = {(entry["max-concurrency"], entry["offload-mode"]) for entry in result} + assert pairs == { + (2, "on"), + (2, "off"), + (2, "noprefix"), + (4, "on"), + (4, "off"), + (4, "noprefix"), + (8, "on"), + (8, "off"), + (8, "noprefix"), + } + + def test_tp_config_expansion_produces_expected_rows( + self, + sample_isb1_kv_stress_tp_config, + sample_runner_config, + isb1_kv_stress_sweep_args, + ): + result = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + sample_isb1_kv_stress_tp_config, + sample_runner_config, + ) + + # users(3) * offload-modes(3) = 9 rows from tp-configs expansion + assert len(result) == 9 + assert {entry["tp"] for entry in result} == {8} + assert {entry["ep"] for entry in result} == {1} + + def test_repo_kv_stress_config_loads_and_expands(self, isb1_kv_stress_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_kv_stress_config_files( + [str(repo_root / ".github/configs/isb1-kv-stress.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_kv_stress_sweep( + isb1_kv_stress_sweep_args, + config_data, + runner_data, + ) + + # 4 configs (gptoss/qwen * b200/h200) * 8 users * 3 offload modes + assert len(matrix) == 96 + assert all(entry["benchmark-type"] == "isb1_kv_stress" for entry in matrix) + assert all("tp" not in entry for entry in matrix) + assert all("ep" not in entry for entry in matrix) + + +class TestISB1SweepIsolation: + """Tests for ISB1 sweep isolation from throughput config lane.""" + + def test_repo_isb1_master_includes_runtime_expansion_cells(self, isb1_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_config_files( + [str(repo_root / ".github/configs/isb1-master.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h100": ["h100-cw_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) + config_keys = set(config_data) + matrix_key_triples = { + (entry["model-prefix"], entry["framework"], entry["runner"]) + for entry in matrix + } + + assert "dsr1-fp8-b200-isb1-vllm" in config_keys + assert "dsr1-fp8-h200-isb1-vllm" in config_keys + assert "gptoss-fp4-b200-isb1-sglang" in config_keys + assert "gptoss-fp4-h100-isb1-sglang" in config_keys + assert "gptoss-fp4-h200-isb1-sglang" in config_keys + assert "gptoss-fp4-h100-isb1-sglang-offload-core-preview-chat" in config_keys + assert "gptoss-fp4-h100-isb1-vllm-offload-core-preview-code" in config_keys + assert "gptoss-fp4-h100-isb1-sglang-500k-preview-code" in config_keys + assert "gptoss-fp4-h100-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-sglang-500k-preview-code" in config_keys + assert 
"qwen3.5-fp8-h100-isb1-sglang-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h200-isb1-sglang-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm-500k-preview-code" in config_keys + assert "qwen3.5-fp8-b200-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-h100-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-h200-isb1-sglang-extension" in config_keys + assert "qwen3.5-fp8-b200-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h100-isb1-vllm-extension" in config_keys + assert "qwen3.5-fp8-h200-isb1-vllm-extension" in config_keys + + assert ("dsr1", "vllm", "b200") in matrix_key_triples + assert ("dsr1", "vllm", "h200") in matrix_key_triples + assert ("gptoss", "sglang", "b200") in matrix_key_triples + assert ("gptoss", "sglang", "h100") in matrix_key_triples + assert ("gptoss", "sglang", "h200") in matrix_key_triples + assert ("qwen3.5", "sglang", "b200") in matrix_key_triples + assert ("qwen3.5", "sglang", "h100") in matrix_key_triples + assert ("qwen3.5", "sglang", "h200") in matrix_key_triples + assert ("qwen3.5", "vllm", "b200") in matrix_key_triples + assert ("qwen3.5", "vllm", "h100") in matrix_key_triples + assert ("qwen3.5", "vllm", "h200") in matrix_key_triples + + assert "dsr1-fp8-h100-isb1-sglang" not in config_keys + assert "dsr1-fp8-h100-isb1-vllm" not in config_keys + + assert any( + entry["export-file"].endswith("extension_32k/vllm/chat_32k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("core/vllm/code_8k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert not any( + entry["export-file"].endswith("core/vllm/code_8k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_32k/vllm/code_32k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_64k/vllm/code_64k1k.json") + and entry["support-status"] == "supported" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_64k/sglang/chat_64k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + "preview/offload_core/inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" + in entry["export-file"] + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/sglang/chat_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/sglang/code_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/vllm/chat_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + assert any( + entry["export-file"].endswith("extension_131k/vllm/code_131k1k.json") + and entry["support-status"] == "reviewed_preview" + for entry in matrix + ) + qwen_sglang_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "extension_131k/sglang/code_131k1k_qwen3.5.json" + ) + ] + assert len(qwen_sglang_entries) == 6 + assert all(entry["model-prefix"] == "qwen3.5" 
for entry in qwen_sglang_entries) + assert all(entry["framework"] == "sglang" for entry in qwen_sglang_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_entries) + assert {entry["max-concurrency"] for entry in qwen_sglang_entries} == {2, 4} + + qwen_vllm_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "extension_131k/vllm/code_131k1k_qwen3.5.json" + ) + ] + assert len(qwen_vllm_entries) == 6 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_entries) + assert all(entry["framework"] == "vllm" for entry in qwen_vllm_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_entries) + assert {entry["max-concurrency"] for entry in qwen_vllm_entries} == {2, 4} + + sglang_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__sglang.json" + ) + ] + assert len(sglang_500k_entries) == 3 + assert all(entry["support-status"] == "reviewed_preview" for entry in sglang_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in sglang_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in sglang_500k_entries) + + vllm_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" + ) + ] + assert len(vllm_500k_entries) == 3 + assert all(entry["support-status"] == "reviewed_preview" for entry in vllm_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in vllm_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in vllm_500k_entries) + + qwen_sglang_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json" + ) + ] + assert len(qwen_sglang_500k_entries) == 3 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_sglang_500k_entries) + assert all(entry["framework"] == "sglang" for entry in qwen_sglang_500k_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_sglang_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in qwen_sglang_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in qwen_sglang_500k_entries) + + qwen_vllm_500k_entries = [ + entry + for entry in matrix + if entry["export-file"].endswith( + "preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" + ) + ] + assert len(qwen_vllm_500k_entries) == 3 + assert all(entry["model-prefix"] == "qwen3.5" for entry in qwen_vllm_500k_entries) + assert all(entry["framework"] == "vllm" for entry in qwen_vllm_500k_entries) + assert all(entry["support-status"] == "reviewed_preview" for entry in qwen_vllm_500k_entries) + assert all(entry["max-model-len"] == 524288 for entry in qwen_vllm_500k_entries) + assert all(entry["max-concurrency"] == 1 for entry in qwen_vllm_500k_entries) + + assert not any( + entry["export-file"].endswith( + "preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json" + ) + or entry["export-file"].endswith( + "preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json" + ) + for entry in matrix + ) + + def test_repo_qwen_1m_preview_config_is_manual_and_separate(self, 
isb1_sweep_args): + repo_root = Path(__file__).resolve().parents[2] + config_data = load_isb1_config_files( + [str(repo_root / ".github/configs/isb1-qwen-1m-preview.yaml")] + ) + runner_data = { + "b200": ["b200-nb_0"], + "h100": ["h100-cw_0"], + "h200": ["h200-cw_2"], + } + + matrix = generate_isb1_sweep(isb1_sweep_args, config_data, runner_data) + config_keys = set(config_data) + + assert config_keys == { + "qwen3.5-fp8-b200-isb1-sglang-1m-gated-preview-code", + "qwen3.5-fp8-b200-isb1-vllm-1m-gated-preview-code", + } + assert len(matrix) == 2 + assert {entry["runner"] for entry in matrix} == {"b200"} + assert {entry["framework"] for entry in matrix} == {"sglang", "vllm"} + assert {entry["model-prefix"] for entry in matrix} == {"qwen3.5"} + assert {entry["support-status"] for entry in matrix} == {"reviewed_preview"} + assert {entry["max-model-len"] for entry in matrix} == {1048576} + assert {entry["max-concurrency"] for entry in matrix} == {1} + assert {entry["max-sessions"] for entry in matrix} == {1} + assert {entry["max-turns-per-session"] for entry in matrix} == {3} + assert { + entry["canonical-model-id"] for entry in matrix + } == {"qwen3_5_397b_a17b"} + assert { + entry["export-file"] for entry in matrix + } == { + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__sglang.json", + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", + } + assert all((repo_root / entry["export-file"]).exists() for entry in matrix) + + + def test_isb1_config_does_not_validate_as_throughput(self, tmp_path, sample_isb1_config): + import yaml + + config_file = tmp_path / "isb1.yaml" + config_file.write_text(yaml.dump(sample_isb1_config)) + + with pytest.raises(ValueError): + load_config_files([str(config_file)]) + + def test_throughput_config_does_not_validate_as_isb1(self, tmp_path, sample_single_node_config): + import yaml + + config_file = tmp_path / "throughput.yaml" + config_file.write_text(yaml.dump(sample_single_node_config)) + + with pytest.raises(ValueError): + load_isb1_config_files([str(config_file)]) + + # ============================================================================= # Test generate_full_sweep for single-node # ============================================================================= diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index 0f1f44c27..06267da22 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -1,20 +1,31 @@ """Comprehensive tests for validation.py""" +import json +from pathlib import Path + import pytest +import yaml from validation import ( Fields, SingleNodeMatrixEntry, MultiNodeMatrixEntry, + ISB1ReplayMatrixEntry, WorkerConfig, SingleNodeSearchSpaceEntry, MultiNodeSearchSpaceEntry, + ISB1ReplaySearchSpaceEntry, + ISB1ReplayConfigEntry, SingleNodeSeqLenConfig, MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, + ISB1MasterConfigEntry, validate_matrix_entry, + validate_isb1_matrix_entry, validate_master_config, + validate_isb1_master_config, validate_runner_config, load_config_files, + load_isb1_config_files, load_runner_file, ) @@ -23,6 +34,68 @@ # Test Fixtures # ============================================================================= + +def _write_isb1_export_fixture( + root: Path, + relative_path: str, + *, + runtime_stack_id: str, + hardware_profile_id: str, + canonical_model_id: str, + 
support_status: str, + benchmark_certification_status: str = "dataset_replay_verified", +) -> None: + export_path = root / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text( + json.dumps( + { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": f"{export_path.stem}-trace", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": hardware_profile_id, + "canonical_model_id": canonical_model_id, + "support_status": support_status, + "benchmark_certification_status": benchmark_certification_status, + "session": { + "session_id": "fixture-session", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [{"role": "user", "content": "hello"}], + "expected_output_tokens": 8, + } + ], + }, + } + ], + } + ) + ) + + +def _write_manifest_fixture( + root: Path, + relative_path: str, + *, + export_file: str, + max_model_len: int, +) -> None: + manifest_path = root / relative_path + manifest_path.parent.mkdir(parents=True, exist_ok=True) + manifest_path.write_text( + json.dumps( + { + "manifest_version": "0.1.0", + "max_model_len": max_model_len, + "exports": [{"export_file": export_file}], + } + ) + ) + @pytest.fixture def valid_single_node_matrix_entry(): """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" @@ -159,6 +232,74 @@ def valid_multinode_master_config(): } +@pytest.fixture +def valid_isb1_master_config(): + """Valid ISB1 replay master config for NVIDIA PR1a.""" + return { + "image": "vllm/vllm-openai:v0.8.5", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "benchmark-type": "isb1_replay", + "runtime-stack-id": "vllm-0.8.5-h200", + "hardware-profile-id": "h200-8gpu", + "canonical-model-id": "deepseek-r1-0528", + "max-model-len": 16384, + "replay-configs": [ + { + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [ + { + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + }, + { + "max-concurrency": 8, + }, + ], + } + ], + } + + +@pytest.fixture +def valid_isb1_matrix_entry(valid_isb1_master_config): + """Valid ISB1 replay matrix entry.""" + return { + "image": valid_isb1_master_config["image"], + "model": valid_isb1_master_config["model"], + "model-prefix": valid_isb1_master_config["model-prefix"], + "precision": valid_isb1_master_config["precision"], + "framework": valid_isb1_master_config["framework"], + "runner": valid_isb1_master_config["runner"], + "benchmark-type": valid_isb1_master_config["benchmark-type"], + "export-file": valid_isb1_master_config["replay-configs"][0]["export-file"], + "runtime-stack-id": valid_isb1_master_config["runtime-stack-id"], + "hardware-profile-id": valid_isb1_master_config["hardware-profile-id"], + "canonical-model-id": valid_isb1_master_config["canonical-model-id"], + "support-status": valid_isb1_master_config["replay-configs"][0]["support-status"], + "request-mode": valid_isb1_master_config["replay-configs"][0]["request-mode"], + "max-concurrency": 4, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": False, + "max-model-len": valid_isb1_master_config["max-model-len"], + "exp-name": "dsr1_isb1", + } + + @pytest.fixture def valid_runner_config(): """Valid runner 
config based on .github/configs/runners.yaml.""" @@ -193,6 +334,10 @@ def test_key_fields_exist(self): assert Fields.SPEC_DECODING.value == "spec-decoding" assert Fields.PREFILL.value == "prefill" assert Fields.DECODE.value == "decode" + assert Fields.BENCHMARK_TYPE.value == "benchmark-type" + assert Fields.SUPPORT_STATUS.value == "support-status" + assert Fields.MAX_CONCURRENCY.value == "max-concurrency" + assert Fields.REPLAY_CONFIGS.value == "replay-configs" # ============================================================================= @@ -658,6 +803,153 @@ def test_disagg_default_false(self, valid_single_node_master_config): assert config.disagg is False +# ============================================================================= +# Test ISB1 replay models +# ============================================================================= + +class TestISB1ReplaySearchSpaceEntry: + """Tests for ISB1ReplaySearchSpaceEntry model.""" + + def test_valid_with_required_only(self): + config = ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 4, + }) + assert config.max_concurrency == 4 + assert config.num_warmup_sessions == 0 + assert config.ignore_waits is False + assert config.ignore_eos is False + + def test_valid_with_all_fields(self): + config = ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 8, + "max-sessions": 2, + "max-turns-per-session": 6, + "max-output-len": 512, + "num-warmup-sessions": 1, + "ignore-waits": True, + "ignore-eos": True, + }) + assert config.max_sessions == 2 + assert config.max_turns_per_session == 6 + assert config.max_output_len == 512 + assert config.num_warmup_sessions == 1 + assert config.ignore_waits is True + assert config.ignore_eos is True + + def test_missing_required_field(self): + with pytest.raises(Exception): + ISB1ReplaySearchSpaceEntry(**{ + "max-sessions": 2, + }) + + def test_extra_field_forbidden(self): + with pytest.raises(Exception): + ISB1ReplaySearchSpaceEntry(**{ + "max-concurrency": 4, + "unknown-field": "value", + }) + + +class TestISB1ReplayConfigEntry: + """Tests for ISB1ReplayConfigEntry model.""" + + def test_valid_entry(self): + config = ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "supported", + "search-space": [{"max-concurrency": 4}], + }) + assert config.export_file.endswith("chat_8k1k.json") + assert config.request_mode == "multi-turn" + assert config.support_status == "supported" + assert len(config.search_space) == 1 + + def test_invalid_support_status(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "support-status": "definitely_supported", + "search-space": [{"max-concurrency": 4}], + }) + + def test_missing_export_file(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "request-mode": "multi-turn", + "search-space": [{"max-concurrency": 4}], + }) + + def test_missing_request_mode(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "search-space": [{"max-concurrency": 4}], + }) + + def test_empty_search_space(self): + with pytest.raises(Exception): + ISB1ReplayConfigEntry(**{ + "export-file": "datasets/isb1/exports/core/chat_8k1k.json", + "request-mode": "multi-turn", + "search-space": [], + }) + + +class TestISB1MasterConfigEntry: + """Tests for ISB1MasterConfigEntry model.""" + + def 
test_valid_isb1_master_config(self, valid_isb1_master_config): + config = ISB1MasterConfigEntry(**valid_isb1_master_config) + assert config.benchmark_type == "isb1_replay" + assert config.model_prefix == "dsr1" + assert config.runner == "h200" + assert config.max_model_len == 16384 + assert len(config.replay_configs) == 1 + + def test_max_model_len_optional(self, valid_isb1_master_config): + del valid_isb1_master_config["max-model-len"] + config = ISB1MasterConfigEntry(**valid_isb1_master_config) + assert config.max_model_len is None + + def test_benchmark_type_must_match(self, valid_isb1_master_config): + valid_isb1_master_config["benchmark-type"] = "throughput" + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + def test_throughput_only_field_rejected(self, valid_isb1_master_config): + valid_isb1_master_config["multinode"] = False + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + def test_missing_required_field(self, valid_isb1_master_config): + del valid_isb1_master_config["runtime-stack-id"] + with pytest.raises(Exception): + ISB1MasterConfigEntry(**valid_isb1_master_config) + + +class TestISB1ReplayMatrixEntry: + """Tests for ISB1ReplayMatrixEntry model.""" + + def test_valid_entry(self, valid_isb1_matrix_entry): + entry = ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + assert entry.benchmark_type == "isb1_replay" + assert entry.support_status == "supported" + assert entry.max_concurrency == 4 + assert entry.exp_name == "dsr1_isb1" + + def test_missing_required_field(self, valid_isb1_matrix_entry): + del valid_isb1_matrix_entry["export-file"] + with pytest.raises(Exception): + ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + + def test_extra_throughput_field_forbidden(self, valid_isb1_matrix_entry): + valid_isb1_matrix_entry["tp"] = 8 + with pytest.raises(Exception): + ISB1ReplayMatrixEntry(**valid_isb1_matrix_entry) + + # ============================================================================= # Test validate_master_config function # ============================================================================= @@ -696,6 +988,37 @@ def test_invalid_config_raises_valueerror(self, valid_single_node_master_config) assert "failed validation" in str(exc_info.value) +class TestValidateISB1MasterConfig: + """Tests for validate_isb1_master_config function.""" + + def test_valid_isb1_config(self, valid_isb1_master_config): + configs = {"dsr1-isb1-h200-vllm": valid_isb1_master_config} + result = validate_isb1_master_config(configs) + assert result == configs + + def test_invalid_isb1_config_raises_valueerror(self, valid_isb1_master_config): + del valid_isb1_master_config["model"] + configs = {"broken-isb1-config": valid_isb1_master_config} + with pytest.raises(ValueError) as exc_info: + validate_isb1_master_config(configs) + assert "broken-isb1-config" in str(exc_info.value) + assert "failed validation" in str(exc_info.value) + + +class TestValidateISB1MatrixEntry: + """Tests for validate_isb1_matrix_entry function.""" + + def test_valid_entry(self, valid_isb1_matrix_entry): + result = validate_isb1_matrix_entry(valid_isb1_matrix_entry) + assert result == valid_isb1_matrix_entry + + def test_invalid_entry_raises_valueerror(self, valid_isb1_matrix_entry): + del valid_isb1_matrix_entry["benchmark-type"] + with pytest.raises(ValueError) as exc_info: + validate_isb1_matrix_entry(valid_isb1_matrix_entry) + assert "failed validation" in str(exc_info.value) + + # 
============================================================================= # Test validate_runner_config function # ============================================================================= @@ -823,6 +1146,224 @@ def test_validation_runs_by_default(self, tmp_path): assert "failed validation" in str(exc_info.value) +class TestLoadISB1ConfigFiles: + """Tests for load_isb1_config_files function.""" + + def test_load_single_file_with_validation(self, tmp_path, valid_isb1_master_config): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], + ) + + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + result = load_isb1_config_files([str(config_file)]) + assert "dsr1-isb1-h200-vllm" in result + assert result["dsr1-isb1-h200-vllm"]["benchmark-type"] == "isb1_replay" + + def test_export_contract_rejects_mismatched_support_status( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "support-status" in str(exc_info.value) + assert "Available support tiers" in str(exc_info.value) + + def test_export_contract_requires_dataset_replay_verified_certification( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + _write_isb1_export_fixture( + tmp_path, + valid_isb1_master_config["replay-configs"][0]["export-file"], + runtime_stack_id=valid_isb1_master_config["runtime-stack-id"], + hardware_profile_id=valid_isb1_master_config["hardware-profile-id"], + canonical_model_id=valid_isb1_master_config["canonical-model-id"], + support_status=valid_isb1_master_config["replay-configs"][0]["support-status"], + benchmark_certification_status="pending_review", + ) + config_file.write_text( + yaml.dump({"dsr1-isb1-h200-vllm": valid_isb1_master_config}) + ) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "benchmark_certification_status" in str(exc_info.value) + assert "dataset_replay_verified" in str(exc_info.value) + + def test_export_contract_requires_max_model_len_for_preview_style_export( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/offload_core/" + "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json" + ), + "support-status": "reviewed_preview", + } + ], + } + del preview_config["max-model-len"] + + _write_isb1_export_fixture( + tmp_path, + 
preview_config["replay-configs"][0]["export-file"], + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "max-model-len" in str(exc_info.value) + + def test_export_contract_accepts_preview_style_export_with_explicit_max_model_len( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h100_sxm_80gb", + "canonical-model-id": "gpt_oss_120b", + "max-model-len": 524288, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_gptoss_xlc2_500k_preview_v1__vllm.json" + ), + "support-status": "reviewed_preview", + } + ], + } + + _write_isb1_export_fixture( + tmp_path, + preview_config["replay-configs"][0]["export-file"], + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + result = load_isb1_config_files([str(config_file)]) + assert "preview-row" in result + + def test_export_contract_warns_when_manifest_max_model_len_mismatches_config( + self, tmp_path, valid_isb1_master_config + ): + config_file = tmp_path / "isb1-config.yaml" + preview_config = { + **valid_isb1_master_config, + "runtime-stack-id": "standalone:vllm", + "hardware-profile-id": "nvidia:h100_sxm_80gb", + "canonical-model-id": "qwen3_5_397b_a17b", + "max-model-len": 524288, + "replay-configs": [ + { + **valid_isb1_master_config["replay-configs"][0], + "export-file": ( + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json" + ), + "support-status": "reviewed_preview", + } + ], + } + + export_file = preview_config["replay-configs"][0]["export-file"] + _write_isb1_export_fixture( + tmp_path, + export_file, + runtime_stack_id=preview_config["runtime-stack-id"], + hardware_profile_id=preview_config["hardware-profile-id"], + canonical_model_id=preview_config["canonical-model-id"], + support_status="reviewed_preview", + ) + _write_manifest_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/manifest_qwen3.5.json", + export_file=export_file, + max_model_len=1048576, + ) + config_file.write_text(yaml.dump({"preview-row": preview_config})) + + with pytest.warns(UserWarning, match="max-model-len"): + result = load_isb1_config_files([str(config_file)]) + assert "preview-row" in result + + def test_load_single_file_without_validation(self, tmp_path): + config_file = tmp_path / "isb1-config.yaml" + config_file.write_text(""" +test-isb1: + image: test-image + benchmark-type: isb1_replay +""") + result = load_isb1_config_files([str(config_file)], validate=False) + assert "test-isb1" in result + assert result["test-isb1"]["benchmark-type"] == "isb1_replay" + + def test_validation_runs_by_default(self, tmp_path): + config_file = tmp_path / "isb1-config.yaml" + config_file.write_text(""" +invalid-isb1: + image: 
test-image + benchmark-type: isb1_replay +""") + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config_file)]) + assert "failed validation" in str(exc_info.value) + + def test_duplicate_keys_raise_error(self, tmp_path): + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +duplicate-key: + benchmark-type: isb1_replay +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +duplicate-key: + benchmark-type: isb1_replay +""") + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files([str(config1), str(config2)], validate=False) + assert "Duplicate configuration keys" in str(exc_info.value) + + def test_nonexistent_file_raises_error(self): + with pytest.raises(ValueError) as exc_info: + load_isb1_config_files(["nonexistent-isb1.yaml"]) + assert "does not exist" in str(exc_info.value) + + # ============================================================================= # Test load_runner_file # ============================================================================= diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 312952b96..331e374b4 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -2,8 +2,12 @@ from typing import List, Optional, Union, Literal from enum import Enum +import json import pprint +import re +import warnings import yaml +from pathlib import Path """ The below class defines the field names expected to be present in the JSON entries @@ -55,6 +59,31 @@ class Fields(Enum): RUN_EVAL = 'run-eval' EVAL_ONLY = 'eval-only' + # ISB1 replay fields + BENCHMARK_TYPE = 'benchmark-type' + EXPORT_FILE = 'export-file' + RUNTIME_STACK_ID = 'runtime-stack-id' + HARDWARE_PROFILE_ID = 'hardware-profile-id' + CANONICAL_MODEL_ID = 'canonical-model-id' + REQUEST_MODE = 'request-mode' + MAX_CONCURRENCY = 'max-concurrency' + SUPPORT_STATUS = 'support-status' + MAX_SESSIONS = 'max-sessions' + MAX_TURNS_PER_SESSION = 'max-turns-per-session' + MAX_OUTPUT_LEN = 'max-output-len' + NUM_WARMUP_SESSIONS = 'num-warmup-sessions' + IGNORE_WAITS = 'ignore-waits' + IGNORE_EOS = 'ignore-eos' + REPLAY_CONFIGS = 'replay-configs' + KV_STRESS_CONFIGS = 'kv-stress-configs' + OFFLOAD_MODE = 'offload-mode' + OFFLOAD_MODES = 'offload-modes' + KV_CACHE_DTYPE = 'kv-cache-dtype' + DISABLE_PREFIX_CACHING = 'disable-prefix-caching' + USERS = 'users' + DURATION_S = 'duration-s' + WORKLOAD_TYPE = 'workload-type' + """ Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., @@ -147,6 +176,119 @@ def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: return entry +class ISB1ReplayMatrixEntry(BaseModel): + """Pydantic model for validating ISB1 replay matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_replay"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + request_mode: str = 
Field(alias=Fields.REQUEST_MODE.value) + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + max_sessions: Optional[int] = Field( + default=None, alias=Fields.MAX_SESSIONS.value, gt=0 + ) + max_turns_per_session: Optional[int] = Field( + default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 + ) + max_output_len: Optional[int] = Field( + default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 + ) + num_warmup_sessions: int = Field( + default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 + ) + ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) + ignore_eos: bool = Field(default=False, alias=Fields.IGNORE_EOS.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( + default=None, alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( + default=None, alias=Fields.KV_CACHE_DTYPE.value + ) + disable_prefix_caching: Optional[bool] = Field( + default=None, alias=Fields.DISABLE_PREFIX_CACHING.value + ) + benchmark_duration_s: Optional[int] = Field( + default=None, alias='benchmark-duration-s', gt=0 + ) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +def validate_isb1_matrix_entry(entry: dict) -> dict: + """Validate that ISB1 replay matrix entries match the expected structure.""" + try: + ISB1ReplayMatrixEntry(**entry) + except ValidationError as e: + raise ValueError( + f"The following ISB1 matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" + ) + return entry + + +class ISB1KVStressMatrixEntry(BaseModel): + """Pydantic model for validating ISB1 KV stress matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_kv_stress"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + offload_mode: Literal["on", "off", "noprefix", "legacy"] = Field( + alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) + disable_prefix_caching: bool = Field(alias=Fields.DISABLE_PREFIX_CACHING.value) + benchmark_duration_s: int = Field(alias='benchmark-duration-s', gt=0) + workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) + tp: Optional[int] = Field(default=None, alias=Fields.TP.value, gt=0) + ep: Optional[int] = Field(default=None, alias=Fields.EP.value, gt=0) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +def validate_isb1_kv_stress_matrix_entry(entry: dict) -> dict: + """Validate that ISB1 KV stress matrix entries match the expected structure.""" + try: + ISB1KVStressMatrixEntry(**entry) + except 
ValidationError as e: + raise ValueError( + f"The following ISB1 KV stress matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}" + ) + return entry + + """ Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e., the master configuration files found in .github/configs. The validation enforces a strict set of @@ -237,6 +379,89 @@ def validate_conc_fields(self): return _validate_conc_fields(self) +class ISB1ReplaySearchSpaceEntry(BaseModel): + """ISB1 replay search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + max_concurrency: int = Field(alias=Fields.MAX_CONCURRENCY.value, gt=0) + max_sessions: Optional[int] = Field( + default=None, alias=Fields.MAX_SESSIONS.value, gt=0 + ) + max_turns_per_session: Optional[int] = Field( + default=None, alias=Fields.MAX_TURNS_PER_SESSION.value, gt=0 + ) + max_output_len: Optional[int] = Field( + default=None, alias=Fields.MAX_OUTPUT_LEN.value, gt=0 + ) + num_warmup_sessions: int = Field( + default=0, alias=Fields.NUM_WARMUP_SESSIONS.value, ge=0 + ) + ignore_waits: bool = Field(default=False, alias=Fields.IGNORE_WAITS.value) + ignore_eos: bool = Field(default=False, alias=Fields.IGNORE_EOS.value) + benchmark_duration_s: Optional[int] = Field( + default=None, alias='benchmark-duration-s', gt=0 + ) + + +class ISB1ReplayConfigEntry(BaseModel): + """Per-export replay configuration for ISB1.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + search_space: List[ISB1ReplaySearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value, min_length=1 + ) + + +class ISB1KVStressSearchSpaceEntry(BaseModel): + """ISB1 KV stress search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + users: List[int] = Field(alias=Fields.USERS.value, min_length=1) + offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( + alias=Fields.OFFLOAD_MODES.value, + min_length=1, + ) + duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) + + +class ISB1KVStressTPConfig(BaseModel): + """Per-TP KV stress configuration for ISB1 parity sweeps.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + tp: int = Field(gt=0) + ep: int = Field(default=1, gt=0) + users: List[int] = Field(alias=Fields.USERS.value, min_length=1) + offload_modes: List[Literal["on", "off", "noprefix", "legacy"]] = Field( + alias=Fields.OFFLOAD_MODES.value, + min_length=1, + ) + duration_s: int = Field(alias=Fields.DURATION_S.value, gt=0) + + +class ISB1KVStressConfigEntry(BaseModel): + """Per-export KV stress configuration for ISB1.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + export_file: str = Field(alias=Fields.EXPORT_FILE.value) + request_mode: str = Field(alias=Fields.REQUEST_MODE.value) + support_status: Optional[ + Literal["supported", "reviewed_preview", "gated", "artifact_only", "unsupported"] + ] = Field(default=None, alias=Fields.SUPPORT_STATUS.value) + workload_type: Literal["chat", "code"] = Field(alias=Fields.WORKLOAD_TYPE.value) + search_space: List[ISB1KVStressSearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value, min_length=1 + ) + tp_configs: Optional[List[ISB1KVStressTPConfig]] = 
Field( + default=None, + alias='tp-configs', + ) + + +class SingleNodeSeqLenConfig(BaseModel): """Single node sequence length configuration.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) @@ -289,6 +514,335 @@ class MultiNodeMasterConfigEntry(BaseModel): alias=Fields.SEQ_LEN_CONFIGS.value) + +class ISB1MasterConfigEntry(BaseModel): + """Top-level ISB1 replay master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_replay"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + offload_mode: Optional[Literal["on", "off", "noprefix", "legacy"]] = Field( + default=None, alias=Fields.OFFLOAD_MODE.value + ) + kv_cache_dtype: Optional[Literal["auto", "fp8"]] = Field( + default=None, alias=Fields.KV_CACHE_DTYPE.value + ) + disable_prefix_caching: Optional[bool] = Field( + default=None, alias=Fields.DISABLE_PREFIX_CACHING.value + ) + replay_configs: List[ISB1ReplayConfigEntry] = Field( + alias=Fields.REPLAY_CONFIGS.value, min_length=1 + ) + + +class ISB1KVStressMasterConfigEntry(BaseModel): + """Top-level ISB1 KV stress master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + benchmark_type: Literal["isb1_kv_stress"] = Field( + alias=Fields.BENCHMARK_TYPE.value + ) + runtime_stack_id: str = Field(alias=Fields.RUNTIME_STACK_ID.value) + hardware_profile_id: str = Field(alias=Fields.HARDWARE_PROFILE_ID.value) + canonical_model_id: str = Field(alias=Fields.CANONICAL_MODEL_ID.value) + max_model_len: Optional[int] = Field( + default=None, alias=Fields.MAX_MODEL_LEN.value, gt=0 + ) + kv_cache_dtype: Literal["auto", "fp8"] = Field(alias=Fields.KV_CACHE_DTYPE.value) + kv_stress_configs: List[ISB1KVStressConfigEntry] = Field( + alias=Fields.KV_STRESS_CONFIGS.value, + min_length=1, + ) + + +ISB1_SHAPE_STEM_RE = re.compile(r"(?P<isl>\d+)k(?P<osl>\d+)k") +ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] + + +def _candidate_config_roots(config_file: str) -> list[Path]: + """Return candidate repo roots for resolving relative export-file paths.""" + config_path = Path(config_file).resolve() + parent_candidates = [config_path.parents[i] for i in range(min(3, len(config_path.parents)))] + candidates = [ + config_path.parent, + *parent_candidates, + Path.cwd().resolve(), + ] + + unique_candidates: list[Path] = [] + for candidate in candidates: + if candidate not in unique_candidates: + unique_candidates.append(candidate) + return unique_candidates + + +def _resolve_export_path(config_file: str, export_file: str) -> Path: + """Resolve an export file relative to the config file or current repo root.""" + export_path = Path(export_file) + if export_path.is_absolute(): + return export_path + + candidate_roots = _candidate_config_roots(config_file) + for candidate_root in candidate_roots: + candidate = candidate_root / export_path + if candidate.exists(): + return candidate + + return candidate_roots[0] / export_path
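+# Resolution example (paths illustrative): for a config at
+# ".github/configs/isb1.yaml" the candidate roots are ".github/configs",
+# ".github", the repository root, and the current working directory, so a
+# relative "datasets/isb1/exports/..." path normally resolves from the repo
+# root. When no candidate exists on disk the first root is returned, letting
+# _load_export_payload raise its "does not exist" error with a concrete path.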
+ + +def _load_export_payload(export_path: Path) -> dict: + """Load an ISB1 export payload from disk.""" + try: + with export_path.open("r") as handle: + payload = json.load(handle) + except FileNotFoundError as exc: + raise ValueError(f"Referenced ISB1 export file does not exist: '{export_path}'.") from exc + except json.JSONDecodeError as exc: + raise ValueError(f"Referenced ISB1 export file is not valid JSON: '{export_path}'.") from exc + + exports = payload.get("exports") + if not isinstance(exports, list) or not exports: + raise ValueError( + f"Referenced ISB1 export file must contain a non-empty 'exports' list: '{export_path}'." + ) + return payload + + +def _identity_cells(payload: dict, entry: dict) -> list[dict]: + """Return export cells matching the configured runtime/hardware/model identity.""" + return [ + cell + for cell in payload["exports"] + if cell.get("runtime_stack_id") == entry[Fields.RUNTIME_STACK_ID.value] + and cell.get("hardware_profile_id") == entry[Fields.HARDWARE_PROFILE_ID.value] + and cell.get("canonical_model_id") == entry[Fields.CANONICAL_MODEL_ID.value] + ] + + +def _warn_manifest_max_model_len_mismatch( + *, + export_path: Path, + export_file: str, + max_model_len: Optional[int], + key: str, +) -> None: + """Emit advisory warning if sibling manifest max_model_len disagrees with config.""" + if max_model_len is None: + return + + for manifest_path in sorted(export_path.parent.glob("manifest*.json")): + try: + manifest_payload = json.loads(manifest_path.read_text()) + except (OSError, json.JSONDecodeError): + continue + + manifest_exports = manifest_payload.get("exports") + if isinstance(manifest_exports, list): + export_files = { + item.get("export_file") + for item in manifest_exports + if isinstance(item, dict) and isinstance(item.get("export_file"), str) + } + if export_files and export_file not in export_files: + continue + + manifest_max_model_len = manifest_payload.get("max_model_len") + if manifest_max_model_len is None: + continue + + try: + manifest_max_model_len = int(manifest_max_model_len) + except (TypeError, ValueError): + continue + + if manifest_max_model_len != max_model_len: + warnings.warn( + f"ISB1 master config entry '{key}' sets '{Fields.MAX_MODEL_LEN.value}'=" + f"{max_model_len} for export '{export_file}', but sibling manifest " + f"'{manifest_path}' declares max_model_len={manifest_max_model_len}.", + stacklevel=2, + ) + + +def certify_isb1_replay_contract(master_configs: dict, config_file: str) -> dict: + """Validate that every replay-config resolves to a real, runnable export selection.""" + for key, entry in master_configs.items(): + max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) + + for replay_config in entry[Fields.REPLAY_CONFIGS.value]: + export_file = replay_config[Fields.EXPORT_FILE.value] + support_status = replay_config.get(Fields.SUPPORT_STATUS.value) + export_path = _resolve_export_path(config_file, export_file) + payload = _load_export_payload(export_path) + _warn_manifest_max_model_len_mismatch( + export_path=export_path, + export_file=export_file, + max_model_len=max_model_len, + key=key, + ) + + if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: + raise ValueError( + f"ISB1 master config entry '{key}' references mixed-shape export " + f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'."
+ ) + + identity_cells = _identity_cells(payload, entry) + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise ValueError( + f"ISB1 master config entry '{key}' must pin " + f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " + f"Matching cells span multiple tiers: {identity_statuses}." + ) + + if not matching_cells: + available_statuses = identity_statuses or [""] + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + f"with support-status '{support_status}', but no export cell matches " + f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " + f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " + f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " + f"Available support tiers for that identity: {available_statuses}." + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + "but the selected export cells do not declare " + "'benchmark_certification_status'." + ) + if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + f"ISB1 master config entry '{key}' requests export '{export_file}' " + "with runnable support tier selection, but the selected export cells " + f"have benchmark_certification_status values {certification_statuses}. " + "Current InferenceX consumer lanes only accept " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." + ) + + return master_configs + + +def certify_isb1_kv_stress_contract(master_configs: dict, config_file: str) -> dict: + """Validate that every kv-stress-config resolves to a real, runnable export selection.""" + for key, entry in master_configs.items(): + max_model_len = entry.get(Fields.MAX_MODEL_LEN.value) + + for kv_stress_config in entry[Fields.KV_STRESS_CONFIGS.value]: + export_file = kv_stress_config[Fields.EXPORT_FILE.value] + support_status = kv_stress_config.get(Fields.SUPPORT_STATUS.value) + export_path = _resolve_export_path(config_file, export_file) + payload = _load_export_payload(export_path) + _warn_manifest_max_model_len_mismatch( + export_path=export_path, + export_file=export_file, + max_model_len=max_model_len, + key=key, + ) + + if not ISB1_SHAPE_STEM_RE.search(export_path.stem) and max_model_len is None: + raise ValueError( + f"ISB1 KV stress config entry '{key}' references mixed-shape export " + f"'{export_file}' without '{Fields.MAX_MODEL_LEN.value}'." + ) + + identity_cells = _identity_cells(payload, entry) + identity_statuses = sorted( + { + cell.get("support_status") + for cell in identity_cells + if cell.get("support_status") is not None + } + ) + matching_cells = [ + cell + for cell in identity_cells + if support_status is None or cell.get("support_status") == support_status + ] + + if support_status is None and len(identity_statuses) > 1: + raise ValueError( + f"ISB1 KV stress config entry '{key}' must pin " + f"'{Fields.SUPPORT_STATUS.value}' for export '{export_file}'. " + f"Matching cells span multiple tiers: {identity_statuses}." 
+ ) + + if not matching_cells: + available_statuses = identity_statuses or [""] + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + f"with support-status '{support_status}', but no export cell matches " + f"runtime_stack_id='{entry[Fields.RUNTIME_STACK_ID.value]}', " + f"hardware_profile_id='{entry[Fields.HARDWARE_PROFILE_ID.value]}', " + f"canonical_model_id='{entry[Fields.CANONICAL_MODEL_ID.value]}'. " + f"Available support tiers for that identity: {available_statuses}." + ) + + certification_statuses = sorted( + { + cell.get("benchmark_certification_status") + for cell in matching_cells + if cell.get("benchmark_certification_status") is not None + } + ) + if not certification_statuses: + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + "but the selected export cells do not declare " + "'benchmark_certification_status'." + ) + if certification_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + f"ISB1 KV stress config entry '{key}' requests export '{export_file}' " + "with runnable support tier selection, but the selected export cells " + f"have benchmark_certification_status values {certification_statuses}. " + "Current InferenceX consumer lanes only accept " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}." + ) + + return master_configs + + def validate_master_config(master_configs: dict) -> List[dict]: """Validate input master configuration structure.""" for key, entry in master_configs.items(): @@ -304,6 +858,30 @@ def validate_master_config(master_configs: dict) -> List[dict]: f"Master config entry '{key}' failed validation:\n{e}") return master_configs + +def validate_isb1_master_config(master_configs: dict) -> List[dict]: + """Validate ISB1 replay master configuration structure.""" + for key, entry in master_configs.items(): + try: + ISB1MasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"ISB1 master config entry '{key}' failed validation:\n{e}" + ) + return master_configs + + +def validate_isb1_kv_stress_master_config(master_configs: dict) -> List[dict]: + """Validate ISB1 KV stress master configuration structure.""" + for key, entry in master_configs.items(): + try: + ISB1KVStressMasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"ISB1 KV stress master config entry '{key}' failed validation:\n{e}" + ) + return master_configs + # Runner Config Validation @@ -371,26 +949,17 @@ class ChangelogMatrixEntry(BaseModel): # ============================================================================= -def load_config_files(config_files: List[str], validate: bool = True) -> dict: - """Load and merge configuration files. - - Args: - config_files: List of paths to YAML configuration files. - validate: If True, run validate_master_config on loaded data. Defaults to True. - - Returns: - Merged configuration dictionary. - - Raises: - ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. 
- """ +def _load_and_merge_yaml_files(config_files: List[str]) -> dict: + """Load and merge YAML configuration files.""" all_config_data = {} for config_file in config_files: try: with open(config_file, 'r') as f: config_data = yaml.safe_load(f) - assert isinstance( - config_data, dict), f"Config file '{config_file}' must contain a dictionary" + if not isinstance(config_data, dict): + raise ValueError( + f"Config file '{config_file}' must contain a dictionary" + ) # Don't allow '*' wildcard in master config keys as we need to reserve these # for expansion in process_changelog.py @@ -411,12 +980,60 @@ def load_config_files(config_files: List[str], validate: bool = True) -> dict: except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") + return all_config_data + + +def load_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge throughput configuration files. + + Args: + config_files: List of paths to YAML configuration files. + validate: If True, run validate_master_config on loaded data. Defaults to True. + + Returns: + Merged configuration dictionary. + + Raises: + ValueError: If file doesn't exist, isn't a dict, or has duplicate keys. + """ + all_config_data = _load_and_merge_yaml_files(config_files) + if validate: validate_master_config(all_config_data) return all_config_data +def load_isb1_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge ISB1 replay configuration files.""" + all_config_data = _load_and_merge_yaml_files(config_files) + + if validate: + validate_isb1_master_config(all_config_data) + for config_file in config_files: + certify_isb1_replay_contract( + _load_and_merge_yaml_files([config_file]), + config_file=config_file, + ) + + return all_config_data + + +def load_isb1_kv_stress_config_files(config_files: List[str], validate: bool = True) -> dict: + """Load and merge ISB1 KV stress configuration files.""" + all_config_data = _load_and_merge_yaml_files(config_files) + + if validate: + validate_isb1_kv_stress_master_config(all_config_data) + for config_file in config_files: + certify_isb1_kv_stress_contract( + _load_and_merge_yaml_files([config_file]), + config_file=config_file, + ) + + return all_config_data + + def load_runner_file(runner_file: str, validate: bool = True) -> dict: """Load runner configuration file. diff --git a/utils/process_result.py b/utils/process_result.py index 0a84a1f18..e680239d1 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,6 +4,15 @@ from pathlib import Path +def fail_if_isb1_replay_requested(): + """Guard against sending ISB1 replay results through the throughput processor.""" + if os.environ.get('BENCHMARK_TYPE') == 'isb1_replay': + raise SystemExit( + 'process_result.py does not support ISB1 replay results. ' + 'Use utils/process_result_isb1.py instead.' 
+ ) + + +def get_required_env_vars(required_vars): """Load and validate required environment variables.""" env_values = {} @@ -22,6 +31,8 @@ def get_required_env_vars(required_vars): return env_values +fail_if_isb1_replay_requested() + # Base required env vars base_env = get_required_env_vars([ 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', @@ -42,6 +53,12 @@ def get_required_env_vars(required_vars): with open(f'{result_filename}.json') as f: bmk_result = json.load(f) +if 'aggregate_metrics' in bmk_result and 'total_token_throughput_tps' in bmk_result['aggregate_metrics']: + raise SystemExit( + 'Detected an ISB1 replay-style result payload in process_result.py. ' + 'Use utils/process_result_isb1.py instead.' + ) + data = { 'hw': hw, 'conc': int(bmk_result['max_concurrency']), diff --git a/utils/process_result_isb1.py b/utils/process_result_isb1.py new file mode 100644 index 000000000..7f338ab2c --- /dev/null +++ b/utils/process_result_isb1.py @@ -0,0 +1,490 @@ +import json +import os +import re +import sys +from pathlib import Path +from typing import Any, Optional, Tuple + +ISB1_RUNNABLE_CERTIFICATION_STATUSES = ["dataset_replay_verified"] + + +def get_required_env_vars(required_vars): + """Load and validate required environment variables.""" + env_values = {} + missing_env_vars = [] + + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}" + ) + + return env_values + + +def parse_export_shape(export_file: str) -> Tuple[int, int, Optional[str], str, dict[str, Any]]: + """Derive ISL/OSL plus export lane/surface and preview metadata from the export path/file.""" + export_path = Path(export_file) + match = re.search(r"(?P<isl>\d+)k(?P<osl>\d+)k", export_path.stem) + + isl = int(os.environ.get("ISL", "0") or 0) + osl = int(os.environ.get("OSL", "0") or 0) + surface = export_path.stem + metadata: dict[str, Any] = {} + + if match: + isl = int(match.group("isl")) * 1024 + osl = int(match.group("osl")) * 1024 + surface = export_path.stem[: match.start()].rstrip("_-") or export_path.stem + + lane = None + if "exports" in export_path.parts: + exports_idx = export_path.parts.index("exports") + if exports_idx + 1 < len(export_path.parts): + lane = export_path.parts[exports_idx + 1] + if lane == "preview" and exports_idx + 2 < len(export_path.parts): + lane = f"preview/{export_path.parts[exports_idx + 2]}" + + try: + payload = json.loads(export_path.read_text()) + except (FileNotFoundError, json.JSONDecodeError): + payload = None + + if payload is not None: + served_shape = payload.get("served_shape") or {} + isl = int(served_shape.get("isl", isl) or isl) + osl = int(served_shape.get("osl", osl) or osl) + surface = payload.get("surface") or payload.get("adapter_surface") or surface + + context_bands = sorted( + { + cell.get("context_band") + for cell in payload.get("exports", []) + if cell.get("context_band") + } + ) + metadata = { + "adapter_id": payload.get("adapter_id"), + "bundle_id": payload.get("bundle_id"), + "profile_id": payload.get("profile_id"), + "duration_tier": payload.get("duration_tier"), + "context_bands": context_bands, + "adapter_support_status": payload.get("adapter_support_status"), + "profile_tier": payload.get("tier"), + } + producer_handoff = payload.get("producer_handoff_metadata") or {} + if producer_handoff: + metadata["producer_handoff_class"] = producer_handoff.get("class") + metadata["producer_claim_boundary"] = producer_handoff.get("claim_boundary") + + # Extract producer KV expectations from first export cell trace_metadata + first_cell = (payload.get("exports") or [{}])[0] if payload.get("exports") else {} + trace_metadata = first_cell.get("trace_metadata", {}) + if trace_metadata: + metadata["producer_estimated_kv_bytes_peak"] = trace_metadata.get("estimated_kv_bytes_peak") + pressure_profile = trace_metadata.get("context_pressure_profile", {}) + metadata["producer_expected_offload_mode"] = ( + pressure_profile.get("expected_offload_mode") + or trace_metadata.get("expected_offload_mode") + ) + + return isl, osl, lane, surface, metadata
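+# Illustrative call (the path mirrors the test fixtures; the result shown is a
+# sketch under those assumptions, not captured output):
+#   parse_export_shape("datasets/isb1/exports/core/chat_8k1k.json")
+#   -> (8192, 1024, "core", "chat", {...})
+# The "8k1k" stem drives ISL/OSL; stems without an <isl>k<osl>k token fall
+# back to the ISL/OSL environment variables, and metadata is only populated
+# when the export JSON itself is readable on disk.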
producer_handoff.get("class") + metadata["producer_claim_boundary"] = producer_handoff.get("claim_boundary") + + # Extract producer KV expectations from first export cell trace_metadata + first_cell = (payload.get("exports") or [{}])[0] if payload.get("exports") else {} + trace_metadata = first_cell.get("trace_metadata", {}) + if trace_metadata: + metadata["producer_estimated_kv_bytes_peak"] = trace_metadata.get("estimated_kv_bytes_peak") + pressure_profile = trace_metadata.get("context_pressure_profile", {}) + metadata["producer_expected_offload_mode"] = ( + pressure_profile.get("expected_offload_mode") + or trace_metadata.get("expected_offload_mode") + ) + + return isl, osl, lane, surface, metadata + + +def validate_support_status_selection( + expected_support_status: Optional[str], selection: dict[str, Any] +) -> None: + """Ensure processed ISB1 output is labeled with the tier actually selected by the harness.""" + if not expected_support_status: + return + + selected_statuses = selection.get("support_statuses") or [] + if not selected_statuses: + raise ValueError( + "ISB1 replay result is missing selection.support_statuses; " + "cannot certify the processed support tier." + ) + + unique_statuses = sorted(set(selected_statuses)) + if unique_statuses != [expected_support_status]: + raise ValueError( + "ISB1 replay result support-status mismatch: " + f"workflow requested '{expected_support_status}' but harness selected {unique_statuses}." + ) + + +def validate_certification_selection(selection: dict[str, Any]) -> None: + """Ensure processed ISB1 output carries the expected runnable certification.""" + selected_statuses = selection.get("benchmark_certification_statuses") or [] + if not selected_statuses: + raise ValueError( + "ISB1 replay result is missing selection.benchmark_certification_statuses; " + "cannot certify the processed replay result." + ) + + unique_statuses = sorted(set(selected_statuses)) + if unique_statuses != ISB1_RUNNABLE_CERTIFICATION_STATUSES: + raise ValueError( + "ISB1 replay result benchmark-certification mismatch: " + "current consumer lanes require " + f"{ISB1_RUNNABLE_CERTIFICATION_STATUSES}, but harness selected {unique_statuses}." 
+ ) + + +def build_context_pressure_signal( + context_pressure_class: str, + kv_offload_observed: bool, + peak_cpu_cache_usage: float, + cpu_cache_metric_available: bool, + depth_coverage_ratio: Optional[float] = None, + max_actual_context_len: Optional[int] = None, +) -> dict[str, Any]: + """Emit a machine-readable status for preview-lane context-pressure validation.""" + if context_pressure_class == "standard": + status = "not_applicable" + reason = "standard_context" + requires_log_review = False + elif depth_coverage_ratio is not None and depth_coverage_ratio < 0.1: + status = "depth_mismatch" + reason = "configured_depth_not_exercised" + requires_log_review = True + elif not cpu_cache_metric_available: + status = "observability_gap" + reason = "no_direct_cpu_cache_metric" + requires_log_review = True + elif not kv_offload_observed and peak_cpu_cache_usage == 0.0: + status = "suspicious" + reason = "high_context_without_cpu_cache_usage" + requires_log_review = True + else: + status = "ok" + reason = "cpu_cache_signal_present" + requires_log_review = False + + result = { + "status": status, + "reason": reason, + "requires_log_review": requires_log_review, + "cpu_cache_metric_available": cpu_cache_metric_available, + } + if depth_coverage_ratio is not None: + result["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) + if max_actual_context_len is not None: + result["max_actual_context_len"] = max_actual_context_len + return result + + +def build_runtime_overrides(replay_result: dict[str, Any]) -> dict[str, Optional[str]]: + """Return a stable runtime-overrides payload for aggregated ISB1 results.""" + override_mapping = { + "vllm_cpu_offload_gb": "VLLM_CPU_OFFLOAD_GB", + "vllm_swap_space_gb": "VLLM_SWAP_SPACE_GB", + "sglang_mem_fraction_override": "SGLANG_MEM_FRACTION_OVERRIDE", + "sglang_chunked_prefill_override": "SGLANG_CHUNKED_PREFILL_OVERRIDE", + } + runtime_overrides: dict[str, Optional[str]] = {} + + for result_key, env_var in override_mapping.items(): + value = replay_result.get(result_key) + if value in (None, ""): + value = os.environ.get(env_var) + runtime_overrides[result_key] = value if value not in (None, "") else None + + return runtime_overrides + + +def build_artifact_stems(result_filename: str) -> dict[str, str]: + """Return artifact names emitted by benchmark-isb1-tmpl.yml for this result stem.""" + return { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + } + + +def build_dispatch_ref() -> Optional[str]: + """Return the best available workflow dispatch ref for traceability.""" + for env_var in ("DISPATCH_REF", "INPUT_REF", "GITHUB_REF"): + value = os.environ.get(env_var) + if value not in (None, ""): + return value + return None + + +base_env = get_required_env_vars( + [ + "RUNNER_TYPE", + "FRAMEWORK", + "PRECISION", + "RESULT_FILENAME", + "MODEL_PREFIX", + "IMAGE", + "TP", + "EP_SIZE", + "DP_ATTENTION", + "BENCHMARK_TYPE", + "EXPORT_FILE", + "RUNTIME_STACK_ID", + "HARDWARE_PROFILE_ID", + "CANONICAL_MODEL_ID", + "REQUEST_MODE", + "MAX_CONCURRENCY", + ] +) + +result_filename = base_env["RESULT_FILENAME"] +with open(f"{result_filename}.json") as f: + replay_result = json.load(f) + +aggregate = replay_result["aggregate_metrics"] +tp_size = int(base_env["TP"]) +ep_size = int(base_env["EP_SIZE"]) +validate_support_status_selection( + os.environ.get("SUPPORT_STATUS") or None, + replay_result.get("selection", {}), +) 
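+# The selection gates on either side of this comment read the harness-written
+# "selection" block. A minimal payload that passes both (illustrative values,
+# not an exhaustive schema):
+#   "selection": {
+#     "support_statuses": ["supported"],
+#     "benchmark_certification_statuses": ["dataset_replay_verified"]
+#   }
+# Any other certification mix aborts processing before metrics are shaped.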
+validate_certification_selection(replay_result.get("selection", {})) +isl, osl, export_lane, benchmark_surface, export_metadata = parse_export_shape( + base_env["EXPORT_FILE"] +) + +total_tput = float(aggregate["total_token_throughput_tps"]) +output_tput = float(aggregate["output_throughput_tps"]) + +server_metrics_summary = replay_result.get("server_metrics_summary", {}) +cpu_cache_metric_available_raw = server_metrics_summary.get("cpu_cache_metric_available") +cpu_cache_metric_available = bool(cpu_cache_metric_available_raw) +if cpu_cache_metric_available_raw is None: + # Backward-compatibility shim for older replay outputs that predate the + # explicit availability field. Presence of the metric name/fields is a + # better signal than the sampled value because a real metric can be present + # and legitimately report 0.0. + cpu_cache_metric_available = bool(server_metrics_summary.get("cpu_cache_metric_name")) or any( + metric_name in server_metrics_summary + for metric_name in ("cpu_cache_usage_avg", "cpu_cache_usage_peak") + ) + +data = { + "hw": base_env["RUNNER_TYPE"], + "conc": int(replay_result.get("max_concurrency", base_env["MAX_CONCURRENCY"])), + "image": base_env["IMAGE"], + "model": replay_result["model_id"], + "infmax_model_prefix": base_env["MODEL_PREFIX"], + "framework": base_env["FRAMEWORK"], + "precision": base_env["PRECISION"], + "spec_decoding": os.environ.get("SPEC_DECODING", "none"), + "disagg": False, + "isl": isl, + "osl": osl, + "is_multinode": False, + "tp": tp_size, + "ep": ep_size, + "dp_attention": base_env["DP_ATTENTION"], + "tput_per_gpu": total_tput / tp_size, + "output_tput_per_gpu": output_tput / tp_size, + "input_tput_per_gpu": (total_tput - output_tput) / tp_size, + "benchmark_type": base_env["BENCHMARK_TYPE"], + "result_filename": result_filename, + "artifact_stems": build_artifact_stems(result_filename), + "dispatch_ref": build_dispatch_ref(), + "export_file": base_env["EXPORT_FILE"], + "export_lane": export_lane, + "benchmark_surface": benchmark_surface, + "adapter_id": export_metadata.get("adapter_id"), + "bundle_id": export_metadata.get("bundle_id"), + "profile_id": export_metadata.get("profile_id"), + "duration_tier": export_metadata.get("duration_tier"), + "context_bands": export_metadata.get("context_bands", []), + "adapter_support_status": export_metadata.get("adapter_support_status"), + "profile_tier": export_metadata.get("profile_tier"), + "producer_handoff_class": export_metadata.get("producer_handoff_class"), + "producer_claim_boundary": export_metadata.get("producer_claim_boundary"), + "runtime_stack_id": base_env["RUNTIME_STACK_ID"], + "hardware_profile_id": base_env["HARDWARE_PROFILE_ID"], + "canonical_model_id": base_env["CANONICAL_MODEL_ID"], + "support_status": os.environ.get("SUPPORT_STATUS") or None, + "benchmark_certification_status": replay_result.get("selection", {}).get( + "benchmark_certification_statuses", [None] + )[0], + "request_mode": base_env["REQUEST_MODE"], + "workload_type": os.environ.get("WORKLOAD_TYPE") or benchmark_surface, + "benchmark_duration_s": ( + float(os.environ["BENCHMARK_DURATION_S"]) + if os.environ.get("BENCHMARK_DURATION_S") not in (None, "") + else None + ), + "campaign_class": ( + "kv_stress" + if base_env["BENCHMARK_TYPE"] == "isb1_kv_stress" + else "replay" + ), + "harness_request_mode": replay_result.get("harness_request_mode", "auto"), + "mode": replay_result.get("mode"), + "selection": replay_result.get("selection", {}), + "aggregate_metrics": aggregate, + "per_turn_metrics": 
replay_result.get("per_turn_metrics", {}), + "server_metrics_summary": server_metrics_summary, + "cache_observability_status": server_metrics_summary.get("observability_status"), + "gpu_cache_metric_name": server_metrics_summary.get("gpu_cache_metric_name"), + "cpu_cache_metric_name": server_metrics_summary.get("cpu_cache_metric_name"), + "cpu_cache_metric_available": cpu_cache_metric_available, + "kv_offload_observed": bool(server_metrics_summary.get("kv_offload_observed", False)), + "peak_gpu_cache_usage": float(server_metrics_summary.get("gpu_cache_usage_peak", 0.0)), + "peak_cpu_cache_usage": float(server_metrics_summary.get("cpu_cache_usage_peak", 0.0)), + "session_throughput_sps": float(aggregate.get("session_throughput_sps", 0.0)), + "completed_sessions": int(aggregate.get("completed_sessions", 0)), + "total_sessions": int(aggregate.get("total_sessions", 0)), + "num_sessions": replay_result.get("num_sessions"), + "max_turns": replay_result.get("max_turns"), + "num_warmup_sessions": replay_result.get( + "num_warmup_sessions", int(os.environ.get("NUM_WARMUP_SESSIONS", "0") or 0) + ), + "max_model_len": ( + int(os.environ["MAX_MODEL_LEN"]) + if os.environ.get("MAX_MODEL_LEN") not in (None, "") + else None + ), + "max_sessions": ( + int(os.environ["MAX_SESSIONS"]) + if os.environ.get("MAX_SESSIONS") not in (None, "") + else None + ), + "max_turns_per_session": ( + int(os.environ["MAX_TURNS_PER_SESSION"]) + if os.environ.get("MAX_TURNS_PER_SESSION") not in (None, "") + else None + ), + "max_output_len": ( + int(os.environ["MAX_OUTPUT_LEN"]) + if os.environ.get("MAX_OUTPUT_LEN") not in (None, "") + else None + ), + "ignore_waits": os.environ.get("IGNORE_WAITS", "false").lower() == "true", + "ignore_eos": os.environ.get("IGNORE_EOS", "false").lower() == "true", + "offload_mode": os.environ.get("OFFLOAD_MODE") or None, + "kv_cache_dtype": os.environ.get("KV_CACHE_DTYPE") or None, + "disable_prefix_caching": os.environ.get("DISABLE_PREFIX_CACHING", "false").lower() == "true", + "runtime_overrides": build_runtime_overrides(replay_result), +} + +effective_max_context_depth = data["max_model_len"] or (isl + osl + 200) +data["effective_max_context_depth"] = effective_max_context_depth +if effective_max_context_depth > 600000: + data["context_pressure_class"] = "extended_1m" +elif effective_max_context_depth > 200000: + data["context_pressure_class"] = "extended_500k" +else: + data["context_pressure_class"] = "standard" + +# Depth telemetry: actual vs configured context depth +depth_telemetry = replay_result.get("depth_telemetry", {}) +max_actual_context_len = int(depth_telemetry.get("max_actual_context_len_per_turn") or 0) or None +total_actual_input_tokens = int(depth_telemetry.get("total_actual_input_tokens") or 0) or None +depth_coverage_ratio = None +if max_actual_context_len and effective_max_context_depth > 0: + depth_coverage_ratio = max_actual_context_len / effective_max_context_depth + +data["total_actual_input_tokens"] = total_actual_input_tokens +data["max_actual_context_len_per_turn"] = max_actual_context_len +data["depth_coverage_ratio"] = round(depth_coverage_ratio, 4) if depth_coverage_ratio is not None else None +data["depth_gap_tokens"] = ( + effective_max_context_depth - max_actual_context_len + if max_actual_context_len is not None else None +) + +# Depth coverage classification +if depth_coverage_ratio is not None: + if depth_coverage_ratio >= 0.9: + data["depth_coverage_class"] = "full" + elif depth_coverage_ratio >= 0.5: + data["depth_coverage_class"] = "partial" + elif 
depth_coverage_ratio >= 0.1: + data["depth_coverage_class"] = "bounded_preview" + else: + data["depth_coverage_class"] = "configuration_only" +else: + data["depth_coverage_class"] = None + +# Producer expectation comparison +producer_estimated_kv_bytes_peak = export_metadata.get("producer_estimated_kv_bytes_peak") +producer_expected_offload_mode = export_metadata.get("producer_expected_offload_mode") +data["producer_estimated_kv_bytes_peak"] = producer_estimated_kv_bytes_peak +data["producer_expected_offload_mode"] = producer_expected_offload_mode + +offload_mode_match = None +if producer_expected_offload_mode and data["context_pressure_class"] != "standard": + if producer_expected_offload_mode in ("hard_offload", "soft_offload"): + offload_mode_match = data["kv_offload_observed"] + elif producer_expected_offload_mode == "none": + offload_mode_match = True +data["producer_expectation_validation"] = { + "offload_mode_match": offload_mode_match, + "kv_bytes_validation": "not_available", + "depth_exercised": bool(depth_coverage_ratio and depth_coverage_ratio >= 0.5), +} + +# Preemption count from server metrics +data["preemption_count"] = int( + server_metrics_summary.get("preemption_count") + or replay_result.get("preemption_count") + or 0 +) + +context_pressure_signal = build_context_pressure_signal( + context_pressure_class=data["context_pressure_class"], + kv_offload_observed=data["kv_offload_observed"], + peak_cpu_cache_usage=data["peak_cpu_cache_usage"], + cpu_cache_metric_available=data["cpu_cache_metric_available"], + depth_coverage_ratio=depth_coverage_ratio, + max_actual_context_len=max_actual_context_len, +) +data["context_pressure_signal"] = context_pressure_signal +data["context_pressure_suspicious"] = context_pressure_signal["status"] == "suspicious" + +if data["context_pressure_suspicious"]: + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} saw no CPU cache usage. " + "The server may have silently capped context or failed to activate KV offload. " + "Check server.log for OOM or context truncation.", + file=sys.stderr, + ) +elif context_pressure_signal["status"] == "depth_mismatch": + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} had max actual context of " + f"{max_actual_context_len} tokens (depth_coverage_ratio=" + f"{depth_coverage_ratio:.4f}). The server was configured for " + f"{data['context_pressure_class'].replace('extended_', '')} but requests only exercised " + f"{max_actual_context_len} tokens. This is expected for file-backed replay previews; " + "it does not prove KV pressure at the configured depth.", + file=sys.stderr, + ) +elif context_pressure_signal["status"] == "observability_gap": + print( + "WARNING: Preview lane at " + f"max-model-len={effective_max_context_depth} lacks a direct CPU cache metric " + "for this framework. 
Inspect server.log and operator tuning notes before " + "treating the run as credible long-context evidence.", + file=sys.stderr, + ) + +for key, value in aggregate.items(): + if key.endswith("_ms"): + data[key.replace("_ms", "")] = float(value) / 1000.0 + if "tpot" in key: + metric_value = float(value) + data[key.replace("_ms", "").replace("tpot", "intvty")] = ( + 1000.0 / metric_value if metric_value > 0 else 0.0 + ) + +print(json.dumps(data, indent=2)) + +with open(f"agg_{result_filename}.json", "w") as f: + json.dump(data, f, indent=2) diff --git a/utils/summarize_isb1.py b/utils/summarize_isb1.py new file mode 100644 index 000000000..3c2428a4b --- /dev/null +++ b/utils/summarize_isb1.py @@ -0,0 +1,238 @@ +import argparse +import json +from pathlib import Path +from typing import Any + +try: + from tabulate import tabulate as _tabulate +except ImportError: # pragma: no cover - fallback for minimal local environments + _tabulate = None + + +SUPPORT_STATUS_ORDER = { + "supported": 0, + "reviewed_preview": 1, + "gated": 2, + "artifact_only": 3, + "unsupported": 4, + None: 5, +} + + +def load_isb1_rows(results_dir: Path) -> list[dict[str, Any]]: + """Load processed ISB1 rows from a results directory.""" + rows: list[dict[str, Any]] = [] + for result_path in results_dir.rglob("*.json"): + try: + payload = json.loads(result_path.read_text()) + except (OSError, json.JSONDecodeError): + continue + + candidates = payload if isinstance(payload, list) else [payload] + for candidate in candidates: + if isinstance(candidate, dict) and candidate.get("benchmark_type") == "isb1_replay": + rows.append(candidate) + return rows + + +def sort_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Sort rows in an operator-friendly order.""" + return sorted( + rows, + key=lambda row: ( + SUPPORT_STATUS_ORDER.get(row.get("support_status"), 99), + row.get("infmax_model_prefix", ""), + row.get("hw", ""), + row.get("framework", ""), + row.get("effective_max_context_depth", 0) or 0, + row.get("result_filename", ""), + ), + ) + + +def format_float(value: Any, precision: int = 2) -> str: + """Format a numeric value for markdown tables.""" + if value is None: + return "-" + try: + return f"{float(value):.{precision}f}" + except (TypeError, ValueError): + return str(value) + + +def format_bool(value: Any) -> str: + """Format a truthy value as yes/no for operators.""" + return "yes" if bool(value) else "no" + + +def render_table(headers: list[str], rows: list[list[Any]], tablefmt: str) -> str: + """Render a markdown/plain table with a lightweight fallback if tabulate is absent.""" + normalized_rows = [[str(cell) for cell in row] for row in rows] + if _tabulate is not None: + return _tabulate(normalized_rows, headers=headers, tablefmt=tablefmt) + + widths = [len(header) for header in headers] + for row in normalized_rows: + for index, cell in enumerate(row): + widths[index] = max(widths[index], len(cell)) + + def render_row(row: list[str]) -> str: + cells = [cell.ljust(widths[index]) for index, cell in enumerate(row)] + return f"| {' | '.join(cells)} |" + + divider = f"| {' | '.join('-' * width for width in widths)} |" + lines = [render_row(headers), divider] + lines.extend(render_row(row) for row in normalized_rows) + return "\n".join(lines) + + +def build_lane_summary_table(rows: list[dict[str, Any]], tablefmt: str) -> str: + """Render the main operator lane summary table.""" + headers = [ + "Lane", + "Model", + "HW", + "Framework", + "Support", + "Cert", + "Max Ctx", + "Context Class", + "Sessions", + 
"Session Tput", + "TTFT Median (s)", + "Ctx Pressure", + "Log Review", + "KV Offload", + "GPU Cache Peak", + "CPU Cache Peak", + ] + table_rows = [ + [ + row.get("result_filename", "-"), + row.get("infmax_model_prefix", "-"), + row.get("hw", "-"), + row.get("framework", "-"), + row.get("support_status", "-"), + row.get("benchmark_certification_status", "-"), + row.get("effective_max_context_depth", "-"), + row.get("context_pressure_class", "-"), + f"{row.get('completed_sessions', 0)}/{row.get('total_sessions', 0)}", + format_float(row.get("session_throughput_sps"), 2), + format_float(row.get("median_ttft"), 3), + (row.get("context_pressure_signal") or {}).get("status", "-"), + format_bool((row.get("context_pressure_signal") or {}).get("requires_log_review")), + format_bool(row.get("kv_offload_observed")), + format_float(row.get("peak_gpu_cache_usage"), 2), + format_float(row.get("peak_cpu_cache_usage"), 2), + ] + for row in rows + ] + return render_table(headers, table_rows, tablefmt) + + +def build_runtime_override_table(rows: list[dict[str, Any]], tablefmt: str) -> str | None: + """Render the runtime override table when any override is present.""" + override_rows = [] + for row in rows: + runtime_overrides = row.get("runtime_overrides") or {} + if not any(value not in (None, "") for value in runtime_overrides.values()): + continue + override_rows.append( + [ + row.get("result_filename", "-"), + row.get("infmax_model_prefix", "-"), + row.get("hw", "-"), + row.get("framework", "-"), + runtime_overrides.get("vllm_cpu_offload_gb") or "-", + runtime_overrides.get("vllm_swap_space_gb") or "-", + runtime_overrides.get("sglang_mem_fraction_override") or "-", + runtime_overrides.get("sglang_chunked_prefill_override") or "-", + row.get("dispatch_ref") or "-", + ] + ) + + if not override_rows: + return None + + headers = [ + "Lane", + "Model", + "HW", + "Framework", + "VLLM CPU Offload GB", + "VLLM Swap GB", + "SGLang Mem Fraction", + "SGLang Chunked Prefill", + "Dispatch Ref", + ] + return render_table(headers, override_rows, tablefmt) + + +def build_action_items(rows: list[dict[str, Any]]) -> list[str]: + """Build operator action items for suspicious or manual-review rows.""" + items: list[str] = [] + for row in rows: + signal = row.get("context_pressure_signal") or {} + if not row.get("context_pressure_suspicious") and not signal.get("requires_log_review"): + continue + + artifact_stems = row.get("artifact_stems") or {} + items.append( + "- " + f"`{row.get('result_filename', 'unknown')}` ({row.get('infmax_model_prefix', '-')}/" + f"{row.get('hw', '-')}/{row.get('framework', '-')}) " + f"requires follow-up: context pressure `{signal.get('status', 'unknown')}`; " + f"review replay `{artifact_stems.get('raw_replay', '-')}`, " + f"logs `{artifact_stems.get('server_logs', '-')}`, " + f"GPU metrics `{artifact_stems.get('gpu_metrics', '-')}`" + + ( + f", dispatch `{row.get('dispatch_ref')}`" + if row.get("dispatch_ref") + else "" + ) + + "." 
+ ) + return items + + +def generate_summary(results_dir: Path, tablefmt: str = "github") -> str: + """Generate an ISB1-specific operator summary in markdown/plain text.""" + rows = sort_rows(load_isb1_rows(results_dir)) + sections = ["## ISB1 Operator Summary", ""] + + if not rows: + sections.append("No ISB1 replay rows found.") + return "\n".join(sections).rstrip() + "\n" + + sections.extend(["### Lane Summary", "", build_lane_summary_table(rows, tablefmt), ""]) + + runtime_override_table = build_runtime_override_table(rows, tablefmt) + if runtime_override_table: + sections.extend(["### Runtime Overrides", "", runtime_override_table, ""]) + + action_items = build_action_items(rows) + sections.append("### Action Items") + sections.append("") + if action_items: + sections.extend(action_items) + else: + sections.append("- None. No suspicious or manual-log-review rows were detected.") + + return "\n".join(sections).rstrip() + "\n" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate an ISB1-specific operator summary.") + parser.add_argument("results_dir", type=Path) + parser.add_argument("--format", choices=["github", "plain"], default="github") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + print(generate_summary(args.results_dir, tablefmt=args.format)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/utils/test_benchmark_export_replay.py b/utils/test_benchmark_export_replay.py new file mode 100644 index 000000000..31e4dc656 --- /dev/null +++ b/utils/test_benchmark_export_replay.py @@ -0,0 +1,766 @@ +import asyncio +import json +from pathlib import Path + +from aiohttp import web + +from bench_serving.benchmark_export_replay import ( + load_replay_sessions, + run_export_replay_benchmark, +) + + +def _count_tokens(text: str) -> int: + return max(1, len((text or "").split())) if text else 0 + + +def _multiturn_payload(runtime_stack_id: str = "standalone:sglang") -> dict: + return { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": "trace-chat-1", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_30b_a3b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "session": { + "session_id": "session-chat-1", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Investigate the flaky test."} + ], + } + ], + "expected_output_tokens": 8, + "wait_before_ms": 0, + }, + { + "turn_idx": 1, + "turn_id": 1, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Investigate the flaky test."} + ], + }, + { + "role": "assistant", + "content_blocks": [ + {"type": "text", "text": "I found a race in the setup."} + ], + }, + { + "role": "tool", + "content_blocks": [ + {"type": "log", "text": "pytest -k flaky_test -> failed"} + ], + }, + ], + "expected_output_tokens": 6, + "wait_before_ms": 10, + }, + ], + }, + } + ], + } + + +def _trace_replay_payload(runtime_stack_id: str = "standalone:trt_llm") -> dict: + return { + "adapter_id": "inferencex_trace_replay", + "exports": [ + { + "trace_id": "trace-replay-1", + "runtime_stack_id": runtime_stack_id, + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "gpt_oss_120b", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + "trace_metadata": 
{"session_id": "session-replay-1"}, + "events": [ + { + "turn_id": 0, + "arrival_time_offset_ms": 0, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Summarize the incident report."} + ], + } + ], + "target_output_tokens": 7, + }, + { + "turn_id": 1, + "arrival_time_offset_ms": 25, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Summarize the incident report."} + ], + }, + { + "role": "assistant", + "content_blocks": [ + {"type": "text", "text": "The outage started after deploy."} + ], + }, + ], + "target_output_tokens": 5, + }, + ], + } + ], + } + + +async def _start_mock_server( + sse_mode: str = "normal", + metrics_text: str | None = None, +) -> tuple[web.AppRunner, str]: + """Start a mock OpenAI-compatible server. + + sse_mode controls how SSE frames are written to the wire: + - "normal": one data frame per write (default) + - "multiline": multiple data frames packed into a single write + - "split": a single data frame split across two writes + """ + + async def _stream_response(request: web.Request, chunks: list[dict]) -> web.StreamResponse: + response = web.StreamResponse( + status=200, + headers={"Content-Type": "text/event-stream"}, + ) + await response.prepare(request) + + if sse_mode == "multiline": + # Pack ALL data frames into a single TCP write + blob = b"" + for chunk in chunks: + blob += f"data: {json.dumps(chunk)}\n\n".encode() + blob += b"data: [DONE]\n\n" + await response.write(blob) + elif sse_mode == "split": + # Split the first frame across two writes + for idx, chunk in enumerate(chunks): + frame = f"data: {json.dumps(chunk)}\n\n".encode() + if idx == 0: + mid = len(frame) // 2 + await response.write(frame[:mid]) + await asyncio.sleep(0.005) + await response.write(frame[mid:]) + else: + await response.write(frame) + await asyncio.sleep(0.005) + await response.write(b"data: [DONE]\n\n") + else: + for chunk in chunks: + await response.write(f"data: {json.dumps(chunk)}\n\n".encode()) + await asyncio.sleep(0.005) + await response.write(b"data: [DONE]\n\n") + + await response.write_eof() + return response + + async def chat_handler(request: web.Request) -> web.StreamResponse: + payload = await request.json() + # Verify the fallback from max_completion_tokens -> max_tokens. 
+ if "max_completion_tokens" in payload: + return web.json_response({"error": "unsupported field"}, status=400) + assert payload["messages"] + return await _stream_response( + request, + [ + {"choices": [{"delta": {"content": "patched"}}]}, + {"usage": {"completion_tokens": 2}}, + ], + ) + + async def completions_handler(request: web.Request) -> web.StreamResponse: + payload = await request.json() + assert payload["prompt"].startswith("USER:") + return await _stream_response( + request, + [ + {"choices": [{"text": "resolved"}]}, + {"usage": {"completion_tokens": 2}}, + ], + ) + + async def metrics_handler(_: web.Request) -> web.Response: + return web.Response( + text=metrics_text + or ( + "vllm:gpu_cache_usage_perc 0.42\n" + "vllm:cpu_cache_usage_perc 0.25\n" + "sglang:cache_hit_rate 0.8\n" + ) + ) + + app = web.Application() + app.router.add_post("/v1/chat/completions", chat_handler) + app.router.add_post("/v1/completions", completions_handler) + app.router.add_get("/metrics", metrics_handler) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, host="127.0.0.1", port=0) + await site.start() + sockets = getattr(site, "_server").sockets + port = sockets[0].getsockname()[1] + return runner, f"http://127.0.0.1:{port}" + + +def test_load_replay_sessions_multiturn_chat(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="auto", + ignore_waits=False, + ) + + assert len(sessions) == 1 + assert sessions[0].request_mode == "chat" + assert sessions[0].turns[1].wait_before_s == 0.01 + assert selection["support_statuses"] == ["supported"] + assert selection["support_status_counts"] == {"supported": 1} + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + assert selection["benchmark_certification_status_counts"] == { + "dataset_replay_verified": 1 + } + assert selection["request_mode_mix"] == {"chat": 1} + + +def test_load_replay_sessions_trace_replay_auto_uses_completions(tmp_path: Path) -> None: + export_file = tmp_path / "trace_replay.json" + export_file.write_text(json.dumps(_trace_replay_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="auto", + ) + + assert len(sessions) == 1 + assert sessions[0].request_mode == "completions" + assert sessions[0].turns[1].wait_before_s == 0.025 + assert sessions[0].turns[0].completion_prompt.startswith("USER:") + assert selection["support_statuses"] == ["supported"] + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + assert selection["request_mode_mix"] == {"completions": 1} + + +def test_load_replay_sessions_support_status_filter(tmp_path: Path) -> None: + payload = _multiturn_payload() + payload["exports"].append( + { + **payload["exports"][0], + "trace_id": "trace-chat-preview", + "support_status": "reviewed_preview", + } + ) + export_file = tmp_path / "multiturn_mixed_status.json" + export_file.write_text(json.dumps(payload)) + + sessions, selection = load_replay_sessions( + 
export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + support_statuses={"supported"}, + request_mode="auto", + ignore_waits=False, + ) + + assert [session.trace_id for session in sessions] == ["trace-chat-1"] + assert selection["support_statuses"] == ["supported"] + assert selection["support_status_counts"] == {"supported": 1} + assert selection["benchmark_certification_statuses"] == ["dataset_replay_verified"] + + +def test_run_export_replay_benchmark_chat(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=1, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert result["selection"]["request_mode_mix"] == {"chat": 1} + assert result["server_metrics_summary"]["samples"] >= 0 + assert result["server_metrics_summary"]["gpu_cache_usage_peak"] == 0.42 + assert result["server_metrics_summary"]["cpu_cache_usage_peak"] == 0.25 + assert result["server_metrics_summary"]["gpu_cache_metric_name"] == "vllm:gpu_cache_usage_perc" + assert result["server_metrics_summary"]["cpu_cache_metric_name"] == "vllm:cpu_cache_usage_perc" + assert result["server_metrics_summary"]["cpu_cache_metric_available"] is True + assert result["server_metrics_summary"]["observability_status"] == "direct_cpu_cache_metric" + assert result["server_metrics_summary"]["kv_offload_observed"] is True + + +def test_run_export_replay_benchmark_completions(tmp_path: Path) -> None: + export_file = tmp_path / "trace_replay.json" + export_file.write_text(json.dumps(_trace_replay_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:trt_llm"}, + hardware_profile_ids={"nvidia:b200_sxm_180gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="completions", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="gpt-oss-120b", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert result["selection"]["request_mode_mix"] == {"completions": 1} + + +def 
test_run_export_replay_benchmark_sglang_token_usage_metrics(tmp_path: Path) -> None: + export_file = tmp_path / "multiturn_sglang_metrics.json" + export_file.write_text(json.dumps(_multiturn_payload(runtime_stack_id="standalone:sglang"))) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server( + metrics_text=( + 'sglang:token_usage{model_name="Qwen/Qwen3-30B-A3B"} 0.61\n' + 'sglang:cache_hit_rate{model_name="Qwen/Qwen3-30B-A3B"} 0.8\n' + ) + ) + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + summary = result["server_metrics_summary"] + assert result["aggregate_metrics"]["completed_sessions"] == 1 + assert summary["samples"] >= 0 + assert summary["gpu_cache_usage_peak"] == 0.61 + assert summary["gpu_cache_metric_name"] == "sglang:token_usage" + assert summary["cpu_cache_metric_name"] is None + assert summary["cpu_cache_metric_available"] is False + assert summary["cache_hit_rate_avg"] == 0.8 + assert summary["observability_status"] == "indirect_without_cpu_cache_metric" + assert summary["kv_offload_observed"] is False + + +def test_sse_multiline_chunks(tmp_path: Path) -> None: + """Verify replay works when the server packs multiple SSE frames into one TCP write.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server(sse_mode="multiline") + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + + +def test_sse_split_across_chunks(tmp_path: Path) -> None: + """Verify replay works when a single SSE frame is split across TCP writes.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await 
_start_mock_server(sse_mode="split") + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + assert result["aggregate_metrics"]["completed_sessions"] == 1 + + +def test_empty_content_no_phantom_itl(tmp_path: Path) -> None: + """Verify that SSE chunks with empty/null content don't inflate ITL counts.""" + export_file = tmp_path / "multiturn.json" + # Use a single-turn export to isolate ITL counting + single_turn_payload = { + "adapter_id": "inferencex_multiturn", + "exports": [ + { + "trace_id": "trace-itl-1", + "runtime_stack_id": "standalone:sglang", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_30b_a3b", + "support_status": "supported", + "session": { + "session_id": "session-itl-1", + "turns": [ + { + "turn_idx": 0, + "turn_id": 0, + "messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Hello"} + ], + } + ], + "expected_output_tokens": 4, + "wait_before_ms": 0, + }, + ], + }, + } + ], + } + export_file.write_text(json.dumps(single_turn_payload)) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + # Custom server that sends empty-content chunks between real ones + async def _chat_with_empty(request: web.Request) -> web.StreamResponse: + payload = await request.json() + if "max_completion_tokens" in payload: + return web.json_response({"error": "unsupported"}, status=400) + + response = web.StreamResponse( + status=200, + headers={"Content-Type": "text/event-stream"}, + ) + await response.prepare(request) + # Frame 1: real content + await response.write( + f'data: {{"choices": [{{"delta": {{"content": "hello"}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Frame 2: empty content (should not generate ITL entry) + await response.write( + f'data: {{"choices": [{{"delta": {{"content": ""}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Frame 3: null content (should not generate ITL entry) + await response.write( + f'data: {{"choices": [{{"delta": {{}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Frame 4: real content + await response.write( + f'data: {{"choices": [{{"delta": {{"content": " world"}}}}]}}\n\n'.encode() + ) + await asyncio.sleep(0.005) + # Usage frame + await response.write( + f'data: {{"usage": {{"completion_tokens": 2}}}}\n\n'.encode() + ) + await response.write(b"data: [DONE]\n\n") + await response.write_eof() + return response + + # aiohttp route handlers must be coroutine functions, so serve /metrics + # from a small async stub instead of a bare lambda. + async def _empty_metrics(_: web.Request) -> web.Response: + return web.Response(text="") + + app = web.Application() + app.router.add_post("/v1/chat/completions", _chat_with_empty) + app.router.add_get("/metrics", _empty_metrics) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, host="127.0.0.1", port=0) + await site.start() + sockets = getattr(site, "_server").sockets + port = sockets[0].getsockname()[1] + base_url = f"http://127.0.0.1:{port}" + + try: + return await run_export_replay_benchmark( + sessions=sessions, 
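+ # Of the four delta frames the mock streams, only the two with real text + # should contribute TTFT/ITL samples.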
selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + agg = result["aggregate_metrics"] + assert agg["completed_sessions"] == 1 + # With 2 real content chunks, ITL should have exactly 1 entry + # (first content = TTFT, second content = 1 ITL). Empty/null chunks + # must not inflate this count. + turn_metrics = result["per_turn_metrics"]["turn_1"] + assert turn_metrics["completed"] == 1 + + +def test_actual_context_len_for_file_backed_assets(tmp_path: Path) -> None: + """Verify that actual_context_len counts rendered payload tokens, not asset metadata.""" + payload = { + "adapter_id": "inferencex_trace_replay", + "exports": [ + { + "trace_id": "test-asset-trace", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "gpt_oss_120b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "xlc2_384k_512k", + "trace_metadata": { + "session_id": "test-session", + "estimated_kv_bytes_peak": 27000000000, + "expected_offload_mode": "soft_offload", + }, + "events": [ + { + "event_id": "evt-0", + "trace_id": "test-asset-trace", + "session_id": "test-session", + "turn_id": 0, + "arrival_time_offset_ms": 0, + "input_messages": [ + { + "role": "user", + "content_blocks": [ + {"type": "text", "text": "Analyze this codebase"}, + { + "type": "table", + "text": None, + "asset_path": "synthetic_v0/context_assets/big_file.md", + "asset_token_count": 500000, + "asset_byte_count": 2500000, + }, + ], + } + ], + "output": {"output_token_count": 100}, + } + ], + } + ], + } + export_file = tmp_path / "asset_test.json" + export_file.write_text(json.dumps(payload)) + + sessions, _ = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:vllm"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"gpt_oss_120b"}, + request_mode="chat", + ignore_waits=True, + ) + + assert len(sessions) == 1 + turn = sessions[0].turns[0] + + # Estimated context_len should include the 500k asset_token_count + assert turn.context_len >= 500000 + + # Actual context_len should be much smaller — just the rendered text + # "[TABLE]" is ~1 token + "Analyze this codebase" is ~3 tokens + assert turn.actual_context_len < 100 + assert turn.actual_context_len > 0 + + # The gap proves the measurement works + assert turn.context_len > turn.actual_context_len * 100 + + +def test_depth_telemetry_in_benchmark_result(tmp_path: Path) -> None: + """Verify depth_telemetry block is emitted in benchmark results.""" + export_file = tmp_path / "multiturn.json" + export_file.write_text(json.dumps(_multiturn_payload())) + + sessions, selection = load_replay_sessions( + export_file=str(export_file), + count_text_tokens=_count_tokens, + runtime_stack_ids={"standalone:sglang"}, + hardware_profile_ids={"nvidia:h200_sxm_141gb"}, + canonical_model_ids={"qwen3_30b_a3b"}, + request_mode="chat", + ignore_waits=True, + ) + + async def _run() -> dict: + runner, base_url = await _start_mock_server() + try: + return await run_export_replay_benchmark( + sessions=sessions, + selection_metadata=selection, + model_id="Qwen/Qwen3-30B-A3B", + 
model_name=None, + chat_api_url=f"{base_url}/v1/chat/completions", + completion_api_url=f"{base_url}/v1/completions", + count_text_tokens=_count_tokens, + max_concurrency=1, + selected_percentiles=[99], + disable_tqdm=True, + num_warmup_sessions=0, + ) + finally: + await runner.cleanup() + + result = asyncio.run(_run()) + + # depth_telemetry block must exist + assert "depth_telemetry" in result + dt = result["depth_telemetry"] + assert "total_estimated_input_tokens" in dt + assert "total_actual_input_tokens" in dt + assert "max_actual_context_len_per_turn" in dt + assert dt["total_actual_input_tokens"] > 0 + assert dt["max_actual_context_len_per_turn"] > 0 + + # Aggregate metrics must also carry actual input tokens + agg = result["aggregate_metrics"] + assert "total_actual_input_tokens" in agg + assert "max_actual_context_len_per_turn" in agg + + # Per-turn metrics should have actual context length + for turn_key, turn_metrics in result["per_turn_metrics"].items(): + assert "mean_actual_context_len" in turn_metrics diff --git a/utils/test_gate_isb1.py b/utils/test_gate_isb1.py new file mode 100644 index 000000000..3a9e590e0 --- /dev/null +++ b/utils/test_gate_isb1.py @@ -0,0 +1,218 @@ +import json +from pathlib import Path + +from gate_isb1 import build_gate_report, load_rows, main + + +def make_row( + *, + result_filename: str, + model: str, + hw: str, + framework: str, + support_status: str, + effective_max_context_depth: int, + context_pressure_class: str, + context_status: str, + requires_log_review: bool = False, + context_pressure_suspicious: bool = False, + completed_sessions: int = 2, + total_sessions: int = 2, + session_throughput_sps: float = 1.0, + benchmark_certification_status: str = "dataset_replay_verified", +): + return { + "benchmark_type": "isb1_replay", + "result_filename": result_filename, + "artifact_stems": { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + }, + "infmax_model_prefix": model, + "hw": hw, + "framework": framework, + "support_status": support_status, + "effective_max_context_depth": effective_max_context_depth, + "context_pressure_class": context_pressure_class, + "context_pressure_signal": { + "status": context_status, + "requires_log_review": requires_log_review, + }, + "context_pressure_suspicious": context_pressure_suspicious, + "completed_sessions": completed_sessions, + "total_sessions": total_sessions, + "session_throughput_sps": session_throughput_sps, + "benchmark_certification_status": benchmark_certification_status, + } + + +def test_build_gate_report_passes_with_sglang_observability_gap(): + rows = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + ), + make_row( + result_filename="gptoss_control_h100_vllm", + model="gptoss", + hw="h100-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + ), + ] + + for hw in ("b200-cw-1", "h100-cw-1", "h200-cw-1"): + for framework in ("vllm", "sglang"): + rows.append( + make_row( + result_filename=f"qwen_131k_{hw}_{framework}", + model="qwen3.5", + hw=hw, + framework=framework, + support_status="reviewed_preview", + effective_max_context_depth=131272, + 
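+ # Depth bookkeeping: the 9416 control depth above is 8192 isl + 1024 osl + # + 200 headroom; 131272 here looks like the 131072-token window plus the + # same 200-token margin.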
context_pressure_class="standard", + context_status="not_applicable", + ) + ) + rows.append( + make_row( + result_filename=f"qwen_500k_{hw}_{framework}", + model="qwen3.5", + hw=hw, + framework=framework, + support_status="reviewed_preview", + effective_max_context_depth=524288, + context_pressure_class="extended_500k", + context_status="ok" if framework == "vllm" else "observability_gap", + requires_log_review=framework == "sglang", + ) + ) + + rows.extend( + [ + make_row( + result_filename="qwen_1m_b200_vllm", + model="qwen3.5", + hw="b200-cw-1", + framework="vllm", + support_status="reviewed_preview", + effective_max_context_depth=1048576, + context_pressure_class="extended_1m", + context_status="ok", + ), + make_row( + result_filename="qwen_1m_b200_sglang", + model="qwen3.5", + hw="b200-cw-1", + framework="sglang", + support_status="reviewed_preview", + effective_max_context_depth=1048576, + context_pressure_class="extended_1m", + context_status="observability_gap", + requires_log_review=True, + ), + ] + ) + + report = build_gate_report(rows) + + assert report["overall"] == "pass" + assert all(gate["status"] == "pass" for gate in report["gates"]) + qwen_500k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_500k") + assert qwen_500k_gate["review_required_rows"] + assert any( + row["result_filename"] == "qwen_500k_b200-cw-1_sglang" + for row in qwen_500k_gate["review_required_rows"] + ) + + +def test_build_gate_report_fails_control_lane_and_preserves_artifact_refs(): + rows = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + completed_sessions=1, + total_sessions=2, + session_throughput_sps=0.0, + ) + ] + + report = build_gate_report(rows) + + assert report["overall"] == "fail" + control_gate = next(gate for gate in report["gates"] if gate["id"] == "control_lanes") + assert control_gate["status"] == "fail" + assert control_gate["failing_rows"][0]["result_filename"] == "dsr1_control_b200_vllm" + assert control_gate["failing_rows"][0]["artifact_stems"]["server_logs"] == "server_logs_dsr1_control_b200_vllm" + assert "completed_sessions == total_sessions" in control_gate["failing_rows"][0]["failed_criteria"] + assert "session_throughput_sps > 0" in control_gate["failing_rows"][0]["failed_criteria"] + + +def test_build_gate_report_fails_when_qwen_131k_coverage_is_missing(): + rows = [ + make_row( + result_filename="qwen_131k_b200_vllm", + model="qwen3.5", + hw="b200-cw-1", + framework="vllm", + support_status="reviewed_preview", + effective_max_context_depth=131272, + context_pressure_class="standard", + context_status="not_applicable", + ) + ] + + report = build_gate_report(rows) + + assert report["overall"] == "fail" + qwen_131k_gate = next(gate for gate in report["gates"] if gate["id"] == "qwen_131k") + assert qwen_131k_gate["status"] == "fail" + assert ["b200", "sglang"] in qwen_131k_gate["missing_coverage"] + assert ["h200", "vllm"] in qwen_131k_gate["missing_coverage"] + + +def test_build_gate_report_handles_no_rows(): + report = build_gate_report([]) + + assert report["overall"] == "partial" + assert all(gate["status"] == "no_rows" for gate in report["gates"]) + + +def test_gate_main_strict_returns_nonzero_on_failure(tmp_path): + payload = [ + make_row( + result_filename="dsr1_control_b200_vllm", + model="dsr1", + hw="b200-cw-1", + framework="vllm", + 
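+ # completed_sessions=1 of total_sessions=2 below trips the control-lane + # gate, so main() with --strict must return a non-zero exit code.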
support_status="supported", + effective_max_context_depth=9416, + context_pressure_class="standard", + context_status="not_applicable", + completed_sessions=1, + total_sessions=2, + ) + ] + report_path = tmp_path / "agg_isb1.json" + report_path.write_text(json.dumps(payload)) + + assert load_rows(report_path)[0]["result_filename"] == "dsr1_control_b200_vllm" + assert main([str(report_path), "--strict"]) == 1 diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 2a6389a78..8bc51d593 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -47,6 +47,7 @@ def base_env_vars(): "OSL": "1024", "DISAGG": "false", "MODEL_PREFIX": "dsr1", + "IMAGE": "lmsysorg/sglang:v0.4.6.post5-cu126", } @@ -299,6 +300,32 @@ def test_missing_result_file(self, tmp_path, single_node_env_vars): assert result.returncode != 0 + def test_isb1_replay_env_guard(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """ISB1 replay runs should fail fast with a helpful processor redirect.""" + env = single_node_env_vars.copy() + env["BENCHMARK_TYPE"] = "isb1_replay" + + result = run_script(tmp_path, env, sample_benchmark_result) + + assert result.returncode != 0 + assert "Use utils/process_result_isb1.py instead" in result.stderr + + def test_isb1_replay_payload_guard(self, tmp_path, single_node_env_vars): + """Replay-shaped payloads should be rejected even without BENCHMARK_TYPE set.""" + replay_like_result = { + "model_id": "test-model", + "max_concurrency": 4, + "aggregate_metrics": { + "total_token_throughput_tps": 1000.0, + "output_throughput_tps": 800.0, + }, + } + + result = run_script(tmp_path, single_node_env_vars, replay_like_result) + + assert result.returncode != 0 + assert "Detected an ISB1 replay-style result payload" in result.stderr + # ============================================================================= # Test latency and throughput calculations diff --git a/utils/test_process_result_isb1.py b/utils/test_process_result_isb1.py new file mode 100644 index 000000000..f2a4f06fb --- /dev/null +++ b/utils/test_process_result_isb1.py @@ -0,0 +1,1006 @@ +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +SCRIPT_PATH = Path(__file__).parent / "process_result_isb1.py" + + +def write_export_fixture(tmp_path: Path, relative_path: str, payload: dict) -> str: + export_path = tmp_path / relative_path + export_path.parent.mkdir(parents=True, exist_ok=True) + export_path.write_text(json.dumps(payload)) + return str(export_path.relative_to(tmp_path)) + + +@pytest.fixture +def sample_replay_result(): + return { + "model_id": "deepseek-ai/DeepSeek-R1-0528", + "mode": "export_replay", + "max_concurrency": 8, + "num_sessions": 2, + "max_turns": 4, + "num_warmup_sessions": 1, + "harness_request_mode": "auto", + "selection": { + "adapter_id": "inferencex_multiturn", + "selected_sessions": 2, + "runtime_stack_ids": ["vllm-0.8.5-h200"], + "hardware_profile_ids": ["h200-8gpu"], + "canonical_model_ids": ["deepseek-r1-0528"], + "support_statuses": ["supported"], + "support_status_counts": {"supported": 2}, + "benchmark_certification_statuses": ["dataset_replay_verified"], + "benchmark_certification_status_counts": { + "dataset_replay_verified": 2 + }, + "request_mode_mix": {"chat": 2}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.45, + "cache_hit_rate_avg": 0.15, + "gpu_cache_usage_avg": 0.45, + "gpu_cache_usage_peak": 0.78, + "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", + "cpu_cache_usage_avg": 0.12, + 
"cpu_cache_usage_peak": 0.31, + "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": True, + "samples": 5, + }, + "per_turn_metrics": { + "turn_1": { + "completed": 2, + "mean_context_len": 8192.0, + "mean_ttft_ms": 180.0, + "p99_ttft_ms": 300.0, + "mean_e2el_ms": 1000.0, + } + }, + "aggregate_metrics": { + "completed_sessions": 2, + "total_sessions": 2, + "total_input_tokens": 1000, + "total_output_tokens": 300, + "total_wall_time_s": 2.0, + "session_throughput_sps": 1.0, + "output_throughput_tps": 150.0, + "total_token_throughput_tps": 650.0, + "mean_ttft_ms": 200.0, + "median_ttft_ms": 180.0, + "p99_ttft_ms": 500.0, + "mean_tpot_ms": 20.0, + "median_tpot_ms": 25.0, + "p99_tpot_ms": 50.0, + "mean_e2el_ms": 1200.0, + "median_e2el_ms": 1100.0, + "p99_e2el_ms": 2000.0, + }, + } + + +@pytest.fixture +def base_env(): + return { + "RUNNER_TYPE": "h200-cw-1", + "FRAMEWORK": "vllm", + "PRECISION": "fp8", + "RESULT_FILENAME": "isb1_result", + "MODEL_PREFIX": "dsr1", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "TP": "8", + "EP_SIZE": "1", + "DP_ATTENTION": "false", + "BENCHMARK_TYPE": "isb1_replay", + "EXPORT_FILE": "datasets/isb1/exports/core/chat_8k1k.json", + "RUNTIME_STACK_ID": "vllm-0.8.5-h200", + "HARDWARE_PROFILE_ID": "h200-8gpu", + "CANONICAL_MODEL_ID": "deepseek-r1-0528", + "SUPPORT_STATUS": "supported", + "REQUEST_MODE": "multi-turn", + "MAX_CONCURRENCY": "8", + "SPEC_DECODING": "none", + "IGNORE_WAITS": "true", + "GITHUB_REF": "refs/heads/test-isb1-traceability", + } + + +def run_script(tmp_path, env, replay_result, result_filename="isb1_result"): + result_file = tmp_path / f"{result_filename}.json" + result_file.write_text(json.dumps(replay_result)) + + env = env.copy() + env["RESULT_FILENAME"] = result_filename + + return subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env=env, + capture_output=True, + text=True, + ) + + +def assert_traceability_fields( + output_data: dict, result_filename: str, dispatch_ref: str = "refs/heads/test-isb1-traceability" +): + assert output_data["result_filename"] == result_filename + assert output_data["artifact_stems"] == { + "processed": f"isb1_{result_filename}", + "raw_replay": f"replay_{result_filename}", + "server_logs": f"server_logs_{result_filename}", + "gpu_metrics": f"gpu_metrics_{result_filename}", + } + assert output_data["dispatch_ref"] == dispatch_ref + + +def test_isb1_replay_processing(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "bundle_id": "bundle-core-chat", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + + result = run_script(tmp_path, env, sample_replay_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + assert output_data["benchmark_type"] == "isb1_replay" + assert output_data["request_mode"] == "multi-turn" + assert output_data["harness_request_mode"] == "auto" + assert output_data["isl"] == 8192 + assert output_data["osl"] == 1024 + assert output_data["export_lane"] == "core" + assert output_data["benchmark_surface"] == "chat" + assert 
output_data["support_status"] == "supported" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["effective_max_context_depth"] == 8192 + 1024 + 200 + assert output_data["context_pressure_class"] == "standard" + assert output_data["context_pressure_signal"]["status"] == "not_applicable" + assert output_data["context_pressure_suspicious"] is False + assert output_data["completed_sessions"] == 2 + assert output_data["session_throughput_sps"] == pytest.approx(1.0) + assert output_data["tput_per_gpu"] == pytest.approx(650.0 / 8) + assert output_data["output_tput_per_gpu"] == pytest.approx(150.0 / 8) + assert output_data["input_tput_per_gpu"] == pytest.approx((650.0 - 150.0) / 8) + assert output_data["median_ttft"] == pytest.approx(0.18) + assert output_data["median_intvty"] == pytest.approx(40.0) + assert output_data["median_e2el"] == pytest.approx(1.1) + assert output_data["kv_offload_observed"] is True + assert output_data["peak_gpu_cache_usage"] == pytest.approx(0.78) + assert output_data["peak_cpu_cache_usage"] == pytest.approx(0.31) + assert output_data["selection"]["request_mode_mix"] == {"chat": 2} + assert output_data["selection"]["support_status_counts"] == {"supported": 2} + assert output_data["per_turn_metrics"]["turn_1"]["completed"] == 2 + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": None, + "sglang_chunked_prefill_override": None, + } + assert_traceability_fields(output_data, "isb1_result") + + output_file = tmp_path / "agg_isb1_result.json" + assert output_file.exists() + persisted_output = json.loads(output_file.read_text()) + assert_traceability_fields(persisted_output, "isb1_result") + + +def test_offload_mode_env_propagation(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["OFFLOAD_MODE"] = "noprefix" + env["KV_CACHE_DTYPE"] = "fp8" + env["DISABLE_PREFIX_CACHING"] = "true" + + result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_offload_env") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["offload_mode"] == "noprefix" + assert output_data["kv_cache_dtype"] == "fp8" + assert output_data["disable_prefix_caching"] is True + + +def test_support_status_mismatch_fails(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + replay_result = { + **sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "support_statuses": ["supported"], + "support_status_counts": {"supported": 2}, + }, + } + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["SUPPORT_STATUS"] = "reviewed_preview" + + result = 
run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch") + assert result.returncode != 0 + assert "support-status mismatch" in result.stderr + + +def test_certification_status_mismatch_fails(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + "benchmark_certification_status": "dataset_replay_verified", + } + ], + }, + ) + replay_result = { + **sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "benchmark_certification_statuses": ["pending_review"], + "benchmark_certification_status_counts": {"pending_review": 2}, + }, + } + env = base_env.copy() + env["EXPORT_FILE"] = export_file + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_cert_mismatch") + assert result.returncode != 0 + assert "benchmark-certification mismatch" in result.stderr + + +def test_missing_required_env_vars_fails(tmp_path, sample_replay_result): + result_file = tmp_path / "isb1_result.json" + result_file.write_text(json.dumps(sample_replay_result)) + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env={"PATH": "/usr/bin", "RESULT_FILENAME": "isb1_result"}, + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + +def test_dispatch_ref_prefers_explicit_override(tmp_path, sample_replay_result, base_env): + export_file = write_export_fixture( + tmp_path, + "datasets/isb1/exports/core/chat_8k1k.json", + { + "adapter_id": "inferencex_multiturn", + "bundle_id": "bundle-core-chat", + "surface": "chat", + "exports": [ + { + "trace_id": "trace-1", + "runtime_stack_id": "vllm-0.8.5-h200", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "deepseek-r1-0528", + "support_status": "supported", + } + ], + }, + ) + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["DISPATCH_REF"] = "refs/tags/isb1-dispatch-override" + + result = run_script(tmp_path, env, sample_replay_result, result_filename="isb1_dispatch_override") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert_traceability_fields( + output_data, + "isb1_dispatch_override", + dispatch_ref="refs/tags/isb1-dispatch-override", + ) + + +def test_preview_offload_core_processing(tmp_path, sample_replay_result, base_env): + preview_export = ( + write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/offload_core/" + "inferencex_multiturn__chat_hopper_blackwell_offload_core_v1__smoke.json", + { + "adapter_id": "inferencex_multiturn", + "profile_id": "chat_hopper_blackwell_offload_core_v1", + "duration_tier": "smoke", + "adapter_surface": "chat", + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "lc1_8k_16k", + }, + { + "context_band": "lc3_96k_128k", + }, + ], + "producer_handoff_metadata": { + "class": "phase_2_offload_core_preview", + "claim_boundary": "Not blanket certification.", + }, + }, + ) + ) + + env = base_env.copy() + env["EXPORT_FILE"] = preview_export + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "131272" + replay_result = { + 
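+ # Rebuild the selection as reviewed_preview so it matches the + # SUPPORT_STATUS declared in the environment above; a mismatch would make + # the processor fail fast.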
**sample_replay_result, + "selection": { + **sample_replay_result["selection"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 2}, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_preview") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/offload_core" + assert output_data["benchmark_surface"] == "chat" + assert output_data["profile_id"] == "chat_hopper_blackwell_offload_core_v1" + assert output_data["duration_tier"] == "smoke" + assert output_data["context_bands"] == ["lc1_8k_16k", "lc3_96k_128k"] + assert output_data["producer_handoff_class"] == "phase_2_offload_core_preview" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["isl"] == 0 + assert output_data["osl"] == 0 + assert_traceability_fields(output_data, "isb1_preview") + + +def test_qwen_500k_preview_processing_preserves_served_shape_and_context_band( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "producer_handoff_metadata": { + "class": "bounded_500k_class", + "claim_boundary": "Replay-derived 500k preview only.", + }, + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h100_sxm_80gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:h200_sxm_141gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + }, + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + "VLLM_CPU_OFFLOAD_GB": "120", + "VLLM_SWAP_SPACE_GB": "24", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "vllm_cpu_offload_gb": "128", + "vllm_swap_space_gb": "32", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": 
["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 3}, + "request_mode_mix": {"code": 3}, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/long_context_500k" + assert output_data["benchmark_surface"] == "code" + assert output_data["profile_id"] == "coding_qwen3.5_xlc2_500k_preview_v1" + assert output_data["context_bands"] == ["xlc2_384k_512k"] + assert output_data["producer_handoff_class"] == "bounded_500k_class" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["isl"] == 131072 + assert output_data["osl"] == 1024 + assert output_data["max_model_len"] == 524288 + assert output_data["effective_max_context_depth"] == 524288 + assert output_data["context_pressure_class"] == "extended_500k" + assert output_data["context_pressure_signal"]["status"] == "ok" + assert output_data["context_pressure_suspicious"] is False + assert output_data["kv_offload_observed"] is True + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": "128", + "vllm_swap_space_gb": "32", + "sglang_mem_fraction_override": None, + "sglang_chunked_prefill_override": None, + } + assert_traceability_fields(output_data, "isb1_qwen_500k") + + +def test_qwen_1m_preview_processing_preserves_8k_served_shape_and_offload_metadata( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_1m/" + "inferencex_trace_replay__coding_qwen3.5_ulc2_1m_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_1m_vllm_code_ulc2_qwen3_5", + "profile_id": "coding_qwen3.5_ulc2_1m_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "producer_handoff_metadata": { + "class": "bounded_1m_class", + "claim_boundary": "Manual 1M preview only.", + }, + "exports": [ + { + "context_band": "ulc2_1m_plus", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "1048576", + "MAX_SESSIONS": "1", + "MAX_TURNS_PER_SESSION": "3", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + 
} + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["export_lane"] == "preview/long_context_1m" + assert output_data["benchmark_surface"] == "code" + assert output_data["profile_id"] == "coding_qwen3.5_ulc2_1m_preview_v1" + assert output_data["context_bands"] == ["ulc2_1m_plus"] + assert output_data["producer_handoff_class"] == "bounded_1m_class" + assert output_data["support_status"] == "reviewed_preview" + assert output_data["benchmark_certification_status"] == "dataset_replay_verified" + assert output_data["isl"] == 8192 + assert output_data["osl"] == 1024 + assert output_data["max_model_len"] == 1048576 + assert output_data["effective_max_context_depth"] == 1048576 + assert output_data["context_pressure_class"] == "extended_1m" + assert output_data["context_pressure_signal"]["status"] == "ok" + assert output_data["context_pressure_suspicious"] is False + assert output_data["max_sessions"] == 1 + assert output_data["max_turns_per_session"] == 3 + assert output_data["kv_offload_observed"] is True + assert_traceability_fields(output_data, "isb1_qwen_1m") + + +def test_context_pressure_warning_on_high_context_without_cpu_cache( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__vllm.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_vllm_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "vllm", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "vllm/vllm-openai:v0.8.5", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:vllm", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:vllm"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.45, + "cache_hit_rate_avg": 0.15, + "gpu_cache_usage_avg": 0.45, + "gpu_cache_usage_peak": 0.91, + "gpu_cache_metric_name": "vllm:gpu_cache_usage_perc", + "cpu_cache_usage_avg": 0.0, + "cpu_cache_usage_peak": 0.0, + "cpu_cache_metric_name": "vllm:cpu_cache_usage_perc", + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": False, + 
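+ # cpu_cache_metric_available=True yet zero CPU usage at 500k depth is the + # contradiction that should trip the "suspicious" signal: either the server + # capped context or offload never engaged.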
"samples": 5, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_warn") + assert result.returncode == 0, f"Script failed: {result.stderr}" + assert "saw no CPU cache usage" in result.stderr + + output_data = json.loads(result.stdout) + assert output_data["context_pressure_signal"]["status"] == "suspicious" + assert output_data["context_pressure_suspicious"] is True + assert_traceability_fields(output_data, "isb1_qwen_500k_warn") + + +def test_context_pressure_signal_marks_sglang_observability_gap( + tmp_path, sample_replay_result, base_env +): + preview_export = write_export_fixture( + tmp_path, + "datasets/isb1/exports/preview/long_context_500k/" + "inferencex_trace_replay__coding_qwen3.5_xlc2_500k_preview_v1__sglang.json", + { + "adapter_id": "inferencex_trace_replay", + "bundle_id": "isb1_preview_long_context_500k_sglang_code_xlc2_qwen3_5", + "profile_id": "coding_qwen3.5_xlc2_500k_preview_v1", + "duration_tier": "standard", + "surface": "code", + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "tier": "reviewed_preview", + "adapter_support_status": "reviewed_preview", + "exports": [ + { + "context_band": "xlc2_384k_512k", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "runtime_stack_id": "standalone:sglang", + "hardware_profile_id": "nvidia:b200_sxm_180gb", + "canonical_model_id": "qwen3_5_397b_a17b", + "kv_mode": "offload_cliff", + } + ], + }, + ) + + env = base_env.copy() + env.update( + { + "RUNNER_TYPE": "b200-cw-1", + "FRAMEWORK": "sglang", + "MODEL_PREFIX": "qwen3.5", + "IMAGE": "lmsysorg/sglang:v0.5.9-cu130", + "EXPORT_FILE": preview_export, + "RUNTIME_STACK_ID": "standalone:sglang", + "HARDWARE_PROFILE_ID": "nvidia:b200_sxm_180gb", + "CANONICAL_MODEL_ID": "qwen3_5_397b_a17b", + "SUPPORT_STATUS": "reviewed_preview", + "MAX_MODEL_LEN": "524288", + "SGLANG_MEM_FRACTION_OVERRIDE": "0.77", + "SGLANG_CHUNKED_PREFILL_OVERRIDE": "65536", + } + ) + replay_result = { + **sample_replay_result, + "model_id": "Qwen/Qwen3.5-397B-A17B-FP8", + "selection": { + **sample_replay_result["selection"], + "runtime_stack_ids": ["standalone:sglang"], + "hardware_profile_ids": ["nvidia:b200_sxm_180gb"], + "canonical_model_ids": ["qwen3_5_397b_a17b"], + "support_statuses": ["reviewed_preview"], + "support_status_counts": {"reviewed_preview": 1}, + "request_mode_mix": {"code": 1}, + }, + "server_metrics_summary": { + "cache_usage_avg": 0.52, + "cache_hit_rate_avg": 0.23, + "gpu_cache_usage_avg": 0.52, + "gpu_cache_usage_peak": 0.88, + "gpu_cache_metric_name": "sglang:token_usage", + "cpu_cache_usage_avg": 0.0, + "cpu_cache_usage_peak": 0.0, + "cpu_cache_metric_name": None, + "cpu_cache_metric_available": False, + "observability_status": "indirect_without_cpu_cache_metric", + "kv_offload_observed": False, + "samples": 5, + }, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_sglang") + assert result.returncode == 0, f"Script failed: {result.stderr}" + assert "lacks a direct CPU cache metric" in result.stderr + + output_data = json.loads(result.stdout) + assert output_data["context_pressure_signal"]["status"] == "observability_gap" + assert output_data["context_pressure_signal"]["requires_log_review"] is True + assert output_data["context_pressure_suspicious"] is False + assert output_data["runtime_overrides"] == { + "vllm_cpu_offload_gb": None, + "vllm_swap_space_gb": None, + "sglang_mem_fraction_override": "0.77", + 
"sglang_chunked_prefill_override": "65536", + } + assert_traceability_fields(output_data, "isb1_qwen_500k_sglang") + + +def test_depth_coverage_ratio_for_500k_preview(tmp_path, base_env, sample_replay_result): + """Verify depth coverage ratio and class for a 500k preview with 131k actual tokens.""" + export_payload = { + "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024}, + "surface": "code", + "exports": [ + { + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "xlc2_384k_512k", + "trace_metadata": { + "estimated_kv_bytes_peak": 27294647296, + "context_pressure_profile": { + "expected_offload_mode": "soft_offload", + }, + "expected_offload_mode": "soft_offload", + }, + } + ], + } + export_file = write_export_fixture( + tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_500k.json", export_payload + ) + + env = base_env.copy() + env["EXPORT_FILE"] = export_file + env["MODEL_PREFIX"] = "qwen3.5" + env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b" + env["SUPPORT_STATUS"] = "reviewed_preview" + env["MAX_MODEL_LEN"] = "524288" + env["FRAMEWORK"] = "vllm" + + replay_result = sample_replay_result.copy() + replay_result["selection"] = { + **replay_result["selection"], + "support_statuses": ["reviewed_preview"], + } + replay_result["server_metrics_summary"] = { + "gpu_cache_usage_avg": 0.35, + "gpu_cache_usage_peak": 0.42, + "cpu_cache_usage_avg": 0.15, + "cpu_cache_usage_peak": 0.25, + "cpu_cache_metric_available": True, + "observability_status": "direct_cpu_cache_metric", + "kv_offload_observed": True, + "samples": 10, + } + replay_result["depth_telemetry"] = { + "total_estimated_input_tokens": 500000, + "total_actual_input_tokens": 131072, + "max_actual_context_len_per_turn": 131072, + } + + result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_500k_depth") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Depth coverage ratio: 131072 / 524288 ≈ 0.25 + assert output_data["depth_coverage_ratio"] is not None + assert 0.24 < output_data["depth_coverage_ratio"] < 0.26 + assert output_data["depth_coverage_class"] == "bounded_preview" + assert output_data["max_actual_context_len_per_turn"] == 131072 + assert output_data["depth_gap_tokens"] == 524288 - 131072 + + # Producer expectation validation + assert output_data["producer_estimated_kv_bytes_peak"] == 27294647296 + assert output_data["producer_expected_offload_mode"] == "soft_offload" + assert output_data["producer_expectation_validation"]["offload_mode_match"] is True + assert output_data["producer_expectation_validation"]["depth_exercised"] is False + + # Preemption count + assert output_data["preemption_count"] == 0 + + +def test_depth_mismatch_warning_for_configuration_only(tmp_path, base_env, sample_replay_result): + """Verify depth_mismatch status when actual context is <10% of configured.""" + export_payload = { + "served_shape": {"shape_family": "8k1k", "isl": 8192, "osl": 1024}, + "surface": "code", + "exports": [ + { + "runtime_stack_id": "standalone:vllm", + "hardware_profile_id": "h200-8gpu", + "canonical_model_id": "qwen3_5_397b_a17b", + "support_status": "reviewed_preview", + "benchmark_certification_status": "dataset_replay_verified", + "context_band": "ulc2_1m_plus", + "trace_metadata": { + "estimated_kv_bytes_peak": 
+                    "expected_offload_mode": "hard_offload",
+                },
+            }
+        ],
+    }
+    export_file = write_export_fixture(
+        tmp_path, "datasets/isb1/exports/preview/long_context_1m/test_1m.json", export_payload
+    )
+
+    env = base_env.copy()
+    env["EXPORT_FILE"] = export_file
+    env["MODEL_PREFIX"] = "qwen3.5"
+    env["CANONICAL_MODEL_ID"] = "qwen3_5_397b_a17b"
+    env["SUPPORT_STATUS"] = "reviewed_preview"
+    env["MAX_MODEL_LEN"] = "1048576"
+    env["FRAMEWORK"] = "vllm"
+
+    replay_result = sample_replay_result.copy()
+    replay_result["selection"] = {
+        **replay_result["selection"],
+        "support_statuses": ["reviewed_preview"],
+    }
+    replay_result["server_metrics_summary"] = {
+        "gpu_cache_usage_avg": 0.10,
+        "gpu_cache_usage_peak": 0.15,
+        "cpu_cache_usage_avg": 0.05,
+        "cpu_cache_usage_peak": 0.10,
+        "cpu_cache_metric_available": True,
+        "observability_status": "direct_cpu_cache_metric",
+        "kv_offload_observed": True,
+        "samples": 5,
+    }
+    # 1M preview sends only 8k actual tokens
+    replay_result["depth_telemetry"] = {
+        "total_estimated_input_tokens": 1600000,
+        "total_actual_input_tokens": 8192,
+        "max_actual_context_len_per_turn": 8192,
+    }
+
+    result = run_script(tmp_path, env, replay_result, result_filename="isb1_qwen_1m_depth")
+    assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+    output_data = json.loads(result.stdout)
+
+    # 8192 / 1048576 ≈ 0.0078, well below the 0.1 "configuration_only" threshold
+    assert output_data["depth_coverage_ratio"] < 0.01
+    assert output_data["depth_coverage_class"] == "configuration_only"
+    assert output_data["context_pressure_signal"]["status"] == "depth_mismatch"
+    assert output_data["context_pressure_signal"]["reason"] == "configured_depth_not_exercised"
+    assert "depth_coverage_ratio" in output_data["context_pressure_signal"]
+    assert "configured for" in result.stderr
+
+
+def test_producer_expectation_offload_mismatch(tmp_path, base_env, sample_replay_result):
+    """Verify producer expectation validation when offload is expected but not observed."""
+    export_payload = {
+        "served_shape": {"shape_family": "131k1k", "isl": 131072, "osl": 1024},
+        "surface": "code",
+        "exports": [
+            {
+                "runtime_stack_id": "standalone:vllm",
+                "hardware_profile_id": "h200-8gpu",
+                "canonical_model_id": "gpt_oss_120b",
+                "support_status": "reviewed_preview",
+                "benchmark_certification_status": "dataset_replay_verified",
+                "context_band": "xlc2_384k_512k",
+                "trace_metadata": {
+                    "estimated_kv_bytes_peak": 27000000000,
+                    "context_pressure_profile": {
+                        "expected_offload_mode": "hard_offload",
+                    },
+                },
+            }
+        ],
+    }
+    export_file = write_export_fixture(
+        tmp_path, "datasets/isb1/exports/preview/long_context_500k/test_mismatch.json", export_payload
+    )
+
+    env = base_env.copy()
+    env["EXPORT_FILE"] = export_file
+    env["MODEL_PREFIX"] = "gptoss"
+    env["CANONICAL_MODEL_ID"] = "gpt_oss_120b"
+    env["SUPPORT_STATUS"] = "reviewed_preview"
+    env["MAX_MODEL_LEN"] = "524288"
+
+    replay_result = sample_replay_result.copy()
+    replay_result["selection"] = {
+        **replay_result["selection"],
+        "support_statuses": ["reviewed_preview"],
+    }
+    replay_result["server_metrics_summary"] = {
+        "gpu_cache_usage_avg": 0.50,
+        "gpu_cache_usage_peak": 0.60,
+        "cpu_cache_usage_avg": 0.0,
+        "cpu_cache_usage_peak": 0.0,
+        "cpu_cache_metric_available": True,
+        "observability_status": "direct_cpu_cache_metric",
+        "kv_offload_observed": False,
+        "samples": 10,
+    }
+    replay_result["depth_telemetry"] = {
+        "total_estimated_input_tokens": 400000,
+        "total_actual_input_tokens": 131072,
+        "max_actual_context_len_per_turn": 131072,
+    }
+
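+    # The export above declares expected_offload_mode "hard_offload", while the
+    # replayed metrics report kv_offload_observed False with zero CPU cache usage,
+    # so the processor should flag the producer expectation as unmet below.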
+    result = run_script(tmp_path, env, replay_result, result_filename="isb1_mismatch")
+    assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+    output_data = json.loads(result.stdout)
+
+    # Producer expected hard_offload, but kv_offload_observed is False
+    assert output_data["producer_expectation_validation"]["offload_mode_match"] is False
+    assert output_data["producer_expected_offload_mode"] == "hard_offload"
+    assert output_data["kv_offload_observed"] is False
diff --git a/utils/test_summarize_isb1.py b/utils/test_summarize_isb1.py
new file mode 100644
index 000000000..3f4320594
--- /dev/null
+++ b/utils/test_summarize_isb1.py
@@ -0,0 +1,105 @@
+import json
+from pathlib import Path
+
+from summarize_isb1 import generate_summary
+
+
+def write_result(path: Path, payload: dict) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload))
+
+
+def make_row(**overrides):
+    row = {
+        "benchmark_type": "isb1_replay",
+        "result_filename": "isb1_control_vllm_b200",
+        "artifact_stems": {
+            "processed": "isb1_isb1_control_vllm_b200",
+            "raw_replay": "replay_isb1_control_vllm_b200",
+            "server_logs": "server_logs_isb1_control_vllm_b200",
+            "gpu_metrics": "gpu_metrics_isb1_control_vllm_b200",
+        },
+        "dispatch_ref": "refs/heads/test-summary",
+        "infmax_model_prefix": "dsr1",
+        "hw": "b200-cw-1",
+        "framework": "vllm",
+        "support_status": "supported",
+        "benchmark_certification_status": "dataset_replay_verified",
+        "effective_max_context_depth": 9416,
+        "context_pressure_class": "standard",
+        "context_pressure_signal": {
+            "status": "not_applicable",
+            "requires_log_review": False,
+        },
+        "context_pressure_suspicious": False,
+        "completed_sessions": 2,
+        "total_sessions": 2,
+        "session_throughput_sps": 1.25,
+        "median_ttft": 0.18,
+        "kv_offload_observed": True,
+        "peak_gpu_cache_usage": 0.78,
+        "peak_cpu_cache_usage": 0.31,
+        "runtime_overrides": {
+            "vllm_cpu_offload_gb": None,
+            "vllm_swap_space_gb": None,
+            "sglang_mem_fraction_override": None,
+            "sglang_chunked_prefill_override": None,
+        },
+    }
+    row.update(overrides)
+    return row
+
+
+def test_generate_summary_surfaces_lane_override_and_action_sections(tmp_path):
+    control_row = make_row()
+    review_row = make_row(
+        result_filename="isb1_qwen_500k_sglang",
+        artifact_stems={
+            "processed": "isb1_isb1_qwen_500k_sglang",
+            "raw_replay": "replay_isb1_qwen_500k_sglang",
+            "server_logs": "server_logs_isb1_qwen_500k_sglang",
+            "gpu_metrics": "gpu_metrics_isb1_qwen_500k_sglang",
+        },
+        infmax_model_prefix="qwen3.5",
+        hw="h200-cw-1",
+        framework="sglang",
+        support_status="reviewed_preview",
+        effective_max_context_depth=524288,
+        context_pressure_class="extended_500k",
+        context_pressure_signal={
+            "status": "observability_gap",
+            "requires_log_review": True,
+        },
+        runtime_overrides={
+            "vllm_cpu_offload_gb": None,
+            "vllm_swap_space_gb": None,
+            "sglang_mem_fraction_override": "0.77",
+            "sglang_chunked_prefill_override": "65536",
+        },
+        kv_offload_observed=False,
+        peak_gpu_cache_usage=0.88,
+        peak_cpu_cache_usage=0.0,
+    )
+    non_isb1_row = {"benchmark_type": "throughput", "ignored": True}
+
+    write_result(tmp_path / "results" / "control.json", control_row)
+    write_result(tmp_path / "results" / "review.json", review_row)
+    write_result(tmp_path / "results" / "non_isb1.json", non_isb1_row)
+
+    summary = generate_summary(tmp_path / "results")
+
+    assert "## ISB1 Operator Summary" in summary
+    assert "### Lane Summary" in summary
+    assert "### Runtime Overrides" in summary
+    assert "### Action Items" in summary
+    assert "isb1_qwen_500k_sglang" in summary
+    assert "observability_gap" in summary
+    assert "65536" in summary
+    assert "server_logs_isb1_qwen_500k_sglang" in summary
+    assert "non_isb1" not in summary
+
+
+def test_generate_summary_handles_empty_results(tmp_path):
+    summary = generate_summary(tmp_path / "results")
+    assert "No ISB1 replay rows found." in summary
+    assert "Lane Summary" not in summary
diff --git a/utils/test_verify_producer_sync.py b/utils/test_verify_producer_sync.py
new file mode 100644
index 000000000..ba42c8586
--- /dev/null
+++ b/utils/test_verify_producer_sync.py
@@ -0,0 +1,64 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+SCRIPT_PATH = Path(__file__).parent / "verify_producer_sync.py"
+
+
+RELEVANT_FILES = {
+    "extension_131k/sglang/code_131k1k_qwen3.5.json": {"name": "e131k"},
+    "preview/long_context_500k/manifest_qwen3.5.json": {"name": "500k"},
+    "preview/long_context_1m/manifest.json": {"name": "1m"},
+}
+
+
+def _write_tree(root: Path, files: dict[str, dict]) -> None:
+    for relative_path, payload in files.items():
+        file_path = root / relative_path
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        file_path.write_text(json.dumps(payload, sort_keys=True))
+
+
+def _run_verify(producer_root: Path, consumer_root: Path) -> subprocess.CompletedProcess[str]:
+    return subprocess.run(
+        [
+            sys.executable,
+            str(SCRIPT_PATH),
+            "--producer-root",
+            str(producer_root),
+            "--consumer-root",
+            str(consumer_root),
+        ],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
+
+def test_verify_producer_sync_passes_for_identical_trees(tmp_path: Path) -> None:
+    producer_root = tmp_path / "producer"
+    consumer_root = tmp_path / "consumer"
+    _write_tree(producer_root, RELEVANT_FILES)
+    _write_tree(consumer_root, RELEVANT_FILES)
+
+    result = _run_verify(producer_root, consumer_root)
+
+    assert result.returncode == 0
+    assert "sync check passed" in result.stdout
+
+
+def test_verify_producer_sync_fails_on_content_mismatch(tmp_path: Path) -> None:
+    producer_root = tmp_path / "producer"
+    consumer_root = tmp_path / "consumer"
+    _write_tree(producer_root, RELEVANT_FILES)
+    _write_tree(consumer_root, RELEVANT_FILES)
+
+    mismatched_path = consumer_root / "preview/long_context_500k/manifest_qwen3.5.json"
+    mismatched_path.write_text(json.dumps({"name": "changed"}, sort_keys=True))
+
+    result = _run_verify(producer_root, consumer_root)
+
+    assert result.returncode == 1
+    assert "content_mismatch" in result.stderr
+    assert "preview/long_context_500k/manifest_qwen3.5.json" in result.stderr
diff --git a/utils/verify_producer_sync.py b/utils/verify_producer_sync.py
new file mode 100644
index 000000000..48cdac077
--- /dev/null
+++ b/utils/verify_producer_sync.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""Verify producer/consumer sync for ISB1 preview and extension exports."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+RELEVANT_SUBTREES = (
+    "extension_131k",
+    "preview/long_context_500k",
+    "preview/long_context_1m",
+)
+
+
+@dataclass
+class SyncIssue:
+    kind: str
+    path: str
+
+
+def _json_files(root: Path) -> set[str]:
+    if not root.exists():
+        return set()
+    return {
+        str(path.relative_to(root))
+        for path in root.rglob("*.json")
+        if path.is_file()
+    }
+
+
+def _compare_subtree(producer_root: Path, consumer_root: Path, subtree: str) -> list[SyncIssue]:
+    issues: list[SyncIssue] = []
+
+    producer_subtree = producer_root / subtree
+    consumer_subtree = consumer_root / subtree
+
+    # Bail out on missing subtrees before walking either tree for JSON files.
+    if not producer_subtree.exists():
+        issues.append(SyncIssue("missing_producer_subtree", subtree))
+        return issues
+    if not consumer_subtree.exists():
+        issues.append(SyncIssue("missing_consumer_subtree", subtree))
+        return issues
+
+    producer_files = _json_files(producer_subtree)
+    consumer_files = _json_files(consumer_subtree)
+
+    for relative_path in sorted(producer_files - consumer_files):
+        issues.append(SyncIssue("missing_in_consumer", f"{subtree}/{relative_path}"))
+
+    for relative_path in sorted(consumer_files - producer_files):
+        issues.append(SyncIssue("extra_in_consumer", f"{subtree}/{relative_path}"))
+
+    for relative_path in sorted(producer_files & consumer_files):
+        producer_file = producer_subtree / relative_path
+        consumer_file = consumer_subtree / relative_path
+        if producer_file.read_bytes() != consumer_file.read_bytes():
+            issues.append(SyncIssue("content_mismatch", f"{subtree}/{relative_path}"))
+
+    return issues
+
+
+def verify_sync(producer_root: Path, consumer_root: Path) -> list[SyncIssue]:
+    issues: list[SyncIssue] = []
+    for subtree in RELEVANT_SUBTREES:
+        issues.extend(_compare_subtree(producer_root, consumer_root, subtree))
+    return issues
+
+
+def _default_consumer_root() -> Path:
+    return Path(__file__).resolve().parents[1] / "datasets" / "isb1" / "exports"
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Verify that committed ISB1 consumer preview/extension exports are "
+            "synced with producer exports."
+        )
+    )
+    parser.add_argument(
+        "--producer-root",
+        required=True,
+        type=Path,
+        help="Path to ISB1 producer exports root (…/upstream/inferencex/exports)",
+    )
+    parser.add_argument(
+        "--consumer-root",
+        default=_default_consumer_root(),
+        type=Path,
+        help="Path to InferenceX consumer exports root (default: datasets/isb1/exports)",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = parse_args(argv)
+    issues = verify_sync(args.producer_root.resolve(), args.consumer_root.resolve())
+
+    if not issues:
+        print(
+            "Producer/consumer export sync check passed for: "
+            + ", ".join(RELEVANT_SUBTREES)
+        )
+        return 0
+
+    print("Producer/consumer export sync check failed:", file=sys.stderr)
+    for issue in issues:
+        print(f"- {issue.kind}: {issue.path}", file=sys.stderr)
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
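+
+# Example invocation, assuming a producer checkout sits next to this repo
+# (the producer path below is illustrative; --consumer-root falls back to
+# datasets/isb1/exports when omitted):
+#
+#   python utils/verify_producer_sync.py \
+#       --producer-root ../inferencex/exports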