From 51b538bcf6c87c765ddce22e08788edd52049ef1 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Fri, 6 Feb 2026 20:49:57 -0600
Subject: [PATCH 1/2] bench: enable PTC + hard-restart by default, fix
 mux-run.sh fatal-on-exit

- Set enabledByDefault=true for programmatic-tool-calling and
  exec-subagent-hard-restart experiments
- CLI buildExperimentsObject now auto-enables default experiments
  when no explicit --experiment flags are passed
- mux-run.sh: replace fatal() on agent non-zero exit with a warning,
  allowing token extraction to run unconditionally
- Propagate agent exit code at end of script
---
 benchmarks/terminal_bench/mux-run.sh | 12 ++++++++++--
 src/cli/run.ts                       | 23 +++++++++++++++++------
 src/common/constants/experiments.ts  |  4 ++--
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/benchmarks/terminal_bench/mux-run.sh b/benchmarks/terminal_bench/mux-run.sh
index b3db7f3714..a148249bb2 100644
--- a/benchmarks/terminal_bench/mux-run.sh
+++ b/benchmarks/terminal_bench/mux-run.sh
@@ -91,9 +91,13 @@ if [[ -n "${MUX_TIMEOUT_MS}" ]]; then
 fi
 
 # Terminal-bench enforces timeouts via --global-agent-timeout-sec
-# Capture output to file while streaming to terminal for token extraction
+# Capture output to file while streaming to terminal for token extraction.
+# Don't exit on failure — always fall through to token extraction so timed-out
+# or crashed runs still get usage data captured.
+mux_exit_code=0
 if ! printf '%s' "${instruction}" | "${cmd[@]}" | tee "${MUX_OUTPUT_FILE}"; then
-  fatal "mux agent session failed"
+  mux_exit_code=${PIPESTATUS[1]} # agent's own status; "$?" is always 0 here because of "if !"
+  log "WARNING: mux agent session exited with code ${mux_exit_code}"
 fi
 
 # Extract usage and cost from the JSONL output.
@@ -145,3 +149,7 @@ result["input"] += subagent_input
 result["output"] += subagent_output
 print(json.dumps(result))
 ' "${MUX_OUTPUT_FILE}" > "${MUX_TOKEN_FILE}" 2>/dev/null || true
+
+# Propagate the agent's exit code so the harness can detect failures
+exit "${mux_exit_code}"
+
diff --git a/src/cli/run.ts b/src/cli/run.ts
index cfe69104dd..5a905150d8 100644
--- a/src/cli/run.ts
+++ b/src/cli/run.ts
@@ -68,7 +68,7 @@ import { DockerRuntime } from "@/node/runtime/DockerRuntime";
 import { runFullInit } from "@/node/runtime/runtimeFactory";
 import { execSync } from "child_process";
 import { getParseOptions } from "./argv";
-import { EXPERIMENT_IDS } from "@/common/constants/experiments";
+import { EXPERIMENT_IDS, EXPERIMENTS } from "@/common/constants/experiments";
 
 // Display labels for CLI help (OFF, LOW, MED, HIGH, MAX)
 const THINKING_LABELS_LIST = Object.values(THINKING_DISPLAY_LABELS).join(", ");
@@ -192,15 +192,26 @@ function collectExperiments(value: string, previous: string[]): string[] {
 
 /**
  * Convert experiment ID array to the experiments object expected by SendMessageOptions.
+ * With no --experiment flags, experiments marked enabledByDefault are used; explicit flags
+ * replace those defaults entirely. Returns undefined when the resulting ID list is empty.
  */
 function buildExperimentsObject(experimentIds: string[]): SendMessageOptions["experiments"] {
-  if (experimentIds.length === 0) return undefined;
+  // Explicit --experiment flags are used verbatim, so a default-on experiment that is not
+  // listed ends up false. With no flags, use every EXPERIMENTS entry with enabledByDefault.
+  const effectiveIds =
+    experimentIds.length > 0
+      ? experimentIds
+      : Object.values(EXPERIMENTS)
+          .filter((exp) => exp.enabledByDefault)
+          .map((exp) => exp.id);
+
+  if (effectiveIds.length === 0) return undefined;
 
   return {
-    programmaticToolCalling: experimentIds.includes("programmatic-tool-calling"),
-    programmaticToolCallingExclusive: experimentIds.includes("programmatic-tool-calling-exclusive"),
-    system1: experimentIds.includes("system-1"),
-    execSubagentHardRestart: experimentIds.includes("exec-subagent-hard-restart"),
+    programmaticToolCalling: effectiveIds.includes("programmatic-tool-calling"),
+    programmaticToolCallingExclusive: effectiveIds.includes("programmatic-tool-calling-exclusive"),
+    system1: effectiveIds.includes("system-1"),
+    execSubagentHardRestart: effectiveIds.includes("exec-subagent-hard-restart"),
   };
 }
 
diff --git a/src/common/constants/experiments.ts b/src/common/constants/experiments.ts
index 264db07005..40d12de1f8 100644
--- a/src/common/constants/experiments.ts
+++ b/src/common/constants/experiments.ts
@@ -43,7 +43,7 @@ export const EXPERIMENTS: Record<ExperimentId, ExperimentDefinition> = {
     id: EXPERIMENT_IDS.PROGRAMMATIC_TOOL_CALLING,
     name: "Programmatic Tool Calling",
     description: "Enable code_execution tool for multi-tool workflows in a sandboxed JS runtime",
-    enabledByDefault: false,
+    enabledByDefault: true,
     userOverridable: true,
     showInSettings: true,
   },
@@ -76,7 +76,7 @@ export const EXPERIMENTS: Record<ExperimentId, ExperimentDefinition> = {
     id: EXPERIMENT_IDS.EXEC_SUBAGENT_HARD_RESTART,
     name: "Exec sub-agent hard restart",
     description: "Hard-restart exec sub-agents on context overflow",
-    enabledByDefault: false,
+    enabledByDefault: true,
     userOverridable: true,
     showInSettings: true,
   },

From 55d38273042dbe11368eda8fe006ca3ba71f7248 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 7 Feb 2026 10:22:48 -0600
Subject: [PATCH 2/2] bench: add --explore-model CLI flag for sub-agent model
 override
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plumbs MUX_EXPLORE_MODEL through mux_agent.py → mux-run.sh → CLI.
Sets config.agentAiDefaults.explore so explore sub-agents use a
fast/cheap model instead of inheriting the expensive parent model.
---
 benchmarks/terminal_bench/mux-run.sh   |  6 ++++++
 benchmarks/terminal_bench/mux_agent.py |  4 ++++
 src/cli/run.ts                         | 22 ++++++++++++++++++++++
 3 files changed, 32 insertions(+)

diff --git a/benchmarks/terminal_bench/mux-run.sh b/benchmarks/terminal_bench/mux-run.sh
index a148249bb2..e4b191162c 100644
--- a/benchmarks/terminal_bench/mux-run.sh
+++ b/benchmarks/terminal_bench/mux-run.sh
@@ -30,6 +30,7 @@ MUX_THINKING_LEVEL="${MUX_THINKING_LEVEL:-high}"
 MUX_MODE="${MUX_MODE:-exec}"
 MUX_RUNTIME="${MUX_RUNTIME:-}"
 MUX_EXPERIMENTS="${MUX_EXPERIMENTS:-}"
+MUX_EXPLORE_MODEL="${MUX_EXPLORE_MODEL:-}"
 
 resolve_project_path() {
   if [[ -n "${MUX_PROJECT_PATH}" ]]; then
@@ -81,6 +82,11 @@ if [[ -n "${MUX_EXPERIMENTS}" ]]; then
   done
 fi
 
+# Set explore sub-agent model (fast/cheap model for read-only investigation)
+if [[ -n "${MUX_EXPLORE_MODEL}" ]]; then
+  cmd+=(--explore-model "${MUX_EXPLORE_MODEL}")
+fi
+
 MUX_OUTPUT_FILE="/tmp/mux-output.jsonl"
 MUX_TOKEN_FILE="/tmp/mux-tokens.json"
 
diff --git a/benchmarks/terminal_bench/mux_agent.py b/benchmarks/terminal_bench/mux_agent.py
index 23c6053fe1..b8f4aaea2f 100644
--- a/benchmarks/terminal_bench/mux_agent.py
+++ b/benchmarks/terminal_bench/mux_agent.py
@@ -68,6 +68,7 @@ class MuxAgent(BaseInstalledAgent):
         "MUX_MODE",
         "MUX_RUNTIME",
         "MUX_EXPERIMENTS",
+        "MUX_EXPLORE_MODEL",
     )
 
     def __init__(
@@ -78,12 +79,15 @@ def __init__(
         thinking_level: str | None = None,
         experiments: str | None = None,
         timeout: int | str | None = None,
+        explore_model: str | None = None,
         **kwargs: Any,
     ) -> None:
         super().__init__(logs_dir=logs_dir, **kwargs)
         # Set MUX_TIMEOUT_MS if timeout is provided via agent kwargs
         if timeout is not None:
             os.environ["MUX_TIMEOUT_MS"] = str(int(timeout) * 1000)
+        if explore_model is not None:
+            os.environ["MUX_EXPLORE_MODEL"] = str(explore_model).strip()
         repo_root_env = os.environ.get("MUX_AGENT_REPO_ROOT")
         repo_root = (
             Path(repo_root_env).resolve()
diff --git a/src/cli/run.ts b/src/cli/run.ts
index 5a905150d8..5571239732 100644
--- a/src/cli/run.ts
+++ b/src/cli/run.ts
@@ -263,6 +263,10 @@ program
   .option("--mcp <server>", "MCP server as name=command (can be repeated)", collectMcpServers, [])
   .option("--no-mcp-config", "ignore global + repo MCP config files (use only --mcp servers)")
   .option("-e, --experiment <id>", "enable experiment (can be repeated)", collectExperiments, [])
+  .option(
+    "--explore-model <model>",
+    "model for explore sub-agents (fast/cheap recommended, e.g. anthropic:claude-haiku-4-5)"
+  )
   .option("-b, --budget <usd>", "stop when session cost exceeds budget (USD)", parseFloat)
   .option("--service-tier <tier>", "OpenAI service tier: auto, default, flex, priority", "auto")
   .addHelpText(
@@ -297,6 +301,7 @@ interface CLIOptions {
   mcp: MCPServerEntry[];
   mcpConfig: boolean;
   experiment: string[];
+  exploreModel?: string;
   budget?: number;
   serviceTier: "auto" | "default" | "flex" | "priority";
 }
@@ -351,6 +356,23 @@ async function main(): Promise<number> {
     fsSync.writeFileSync(secretsFile, JSON.stringify(existingSecrets, null, 2));
   }
 
+  // Persist --explore-model to config.json as agentAiDefaults.explore, keeping other agent defaults.
+  if (opts.exploreModel) {
+    const configFile = path.join(config.rootDir, "config.json");
+    const existing: Record<string, unknown> = fsSync.existsSync(configFile)
+      ? (JSON.parse(fsSync.readFileSync(configFile, "utf-8")) as Record<string, unknown>)
+      : {};
+    const prevDefaults =
+      existing.agentAiDefaults && typeof existing.agentAiDefaults === "object"
+        ? (existing.agentAiDefaults as Record<string, unknown>)
+        : {};
+    existing.agentAiDefaults = {
+      ...prevDefaults,
+      explore: { modelString: resolveModelAlias(opts.exploreModel) },
+    };
+    fsSync.writeFileSync(configFile, JSON.stringify(existing, null, 2));
+  }
+
   const workspaceId = generateWorkspaceId();
   const projectDir = path.resolve(opts.dir);
   await ensureDirectory(projectDir);