From 51b538bcf6c87c765ddce22e08788edd52049ef1 Mon Sep 17 00:00:00 2001
From: Ammar
Date: Fri, 6 Feb 2026 20:49:57 -0600
Subject: [PATCH 1/2] bench: enable PTC + hard-restart by default, fix
 mux-run.sh fatal-on-exit

- Set enabledByDefault=true for programmatic-tool-calling and
  exec-subagent-hard-restart experiments
- CLI buildExperimentsObject now auto-enables default experiments when
  no explicit --experiment flags are passed
- mux-run.sh: replace fatal() on agent non-zero exit with a warning,
  allowing token extraction to run unconditionally
- Propagate agent exit code at end of script
---
 benchmarks/terminal_bench/mux-run.sh | 14 +++++++++++---
 src/cli/run.ts                       | 23 +++++++++++++++++------
 src/common/constants/experiments.ts  |  4 ++--
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/benchmarks/terminal_bench/mux-run.sh b/benchmarks/terminal_bench/mux-run.sh
index b3db7f3714..a148249bb2 100644
--- a/benchmarks/terminal_bench/mux-run.sh
+++ b/benchmarks/terminal_bench/mux-run.sh
@@ -91,9 +91,13 @@ if [[ -n "${MUX_TIMEOUT_MS}" ]]; then
 fi
 
 # Terminal-bench enforces timeouts via --global-agent-timeout-sec
-# Capture output to file while streaming to terminal for token extraction
-if ! printf '%s' "${instruction}" | "${cmd[@]}" | tee "${MUX_OUTPUT_FILE}"; then
-  fatal "mux agent session failed"
+# Capture output to file while streaming to terminal for token extraction.
+# Don't exit on failure: timed-out or crashed runs must still reach token extraction.
+# Record the status with `||`; inside an `if ! ...; then` body `$?` is always 0.
+mux_exit_code=0
+printf '%s' "${instruction}" | "${cmd[@]}" | tee "${MUX_OUTPUT_FILE}" || mux_exit_code=$?
+if [[ "${mux_exit_code}" -ne 0 ]]; then
+  log "WARNING: mux agent session exited with code ${mux_exit_code}"
 fi
 
 # Extract usage and cost from the JSONL output.
@@ -145,3 +149,7 @@ result["input"] += subagent_input result["output"] += subagent_output print(json.dumps(result)) ' "${MUX_OUTPUT_FILE}" > "${MUX_TOKEN_FILE}" 2>/dev/null || true + +# Propagate the agent's exit code so the harness can detect failures +exit "${mux_exit_code}" + diff --git a/src/cli/run.ts b/src/cli/run.ts index cfe69104dd..5a905150d8 100644 --- a/src/cli/run.ts +++ b/src/cli/run.ts @@ -68,7 +68,7 @@ import { DockerRuntime } from "@/node/runtime/DockerRuntime"; import { runFullInit } from "@/node/runtime/runtimeFactory"; import { execSync } from "child_process"; import { getParseOptions } from "./argv"; -import { EXPERIMENT_IDS } from "@/common/constants/experiments"; +import { EXPERIMENT_IDS, EXPERIMENTS } from "@/common/constants/experiments"; // Display labels for CLI help (OFF, LOW, MED, HIGH, MAX) const THINKING_LABELS_LIST = Object.values(THINKING_DISPLAY_LABELS).join(", "); @@ -192,15 +192,26 @@ function collectExperiments(value: string, previous: string[]): string[] { /** * Convert experiment ID array to the experiments object expected by SendMessageOptions. + * Experiments with enabledByDefault=true are included automatically unless the user + * explicitly passes --experiment flags (which override defaults entirely). */ function buildExperimentsObject(experimentIds: string[]): SendMessageOptions["experiments"] { - if (experimentIds.length === 0) return undefined; + // When user passes explicit --experiment flags, use exactly those. + // When no flags are passed, auto-enable experiments that are on by default. + const effectiveIds = + experimentIds.length > 0 + ? 
experimentIds + : Object.values(EXPERIMENTS) + .filter((exp) => exp.enabledByDefault) + .map((exp) => exp.id); + + if (effectiveIds.length === 0) return undefined; return { - programmaticToolCalling: experimentIds.includes("programmatic-tool-calling"), - programmaticToolCallingExclusive: experimentIds.includes("programmatic-tool-calling-exclusive"), - system1: experimentIds.includes("system-1"), - execSubagentHardRestart: experimentIds.includes("exec-subagent-hard-restart"), + programmaticToolCalling: effectiveIds.includes("programmatic-tool-calling"), + programmaticToolCallingExclusive: effectiveIds.includes("programmatic-tool-calling-exclusive"), + system1: effectiveIds.includes("system-1"), + execSubagentHardRestart: effectiveIds.includes("exec-subagent-hard-restart"), }; } diff --git a/src/common/constants/experiments.ts b/src/common/constants/experiments.ts index 264db07005..40d12de1f8 100644 --- a/src/common/constants/experiments.ts +++ b/src/common/constants/experiments.ts @@ -43,7 +43,7 @@ export const EXPERIMENTS: Record = { id: EXPERIMENT_IDS.PROGRAMMATIC_TOOL_CALLING, name: "Programmatic Tool Calling", description: "Enable code_execution tool for multi-tool workflows in a sandboxed JS runtime", - enabledByDefault: false, + enabledByDefault: true, userOverridable: true, showInSettings: true, }, @@ -76,7 +76,7 @@ export const EXPERIMENTS: Record = { id: EXPERIMENT_IDS.EXEC_SUBAGENT_HARD_RESTART, name: "Exec sub-agent hard restart", description: "Hard-restart exec sub-agents on context overflow", - enabledByDefault: false, + enabledByDefault: true, userOverridable: true, showInSettings: true, }, From 55d38273042dbe11368eda8fe006ca3ba71f7248 Mon Sep 17 00:00:00 2001 From: Ammar Date: Sat, 7 Feb 2026 10:22:48 -0600 Subject: [PATCH 2/2] bench: add --explore-model CLI flag for sub-agent model override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plumbs MUX_EXPLORE_MODEL through mux_agent.py → mux-run.sh → CLI. 
Sets config.agentAiDefaults.explore so explore sub-agents use a fast/cheap model instead of inheriting the expensive parent model. --- benchmarks/terminal_bench/mux-run.sh | 6 ++++++ benchmarks/terminal_bench/mux_agent.py | 4 ++++ src/cli/run.ts | 22 ++++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/benchmarks/terminal_bench/mux-run.sh b/benchmarks/terminal_bench/mux-run.sh index a148249bb2..e4b191162c 100644 --- a/benchmarks/terminal_bench/mux-run.sh +++ b/benchmarks/terminal_bench/mux-run.sh @@ -30,6 +30,7 @@ MUX_THINKING_LEVEL="${MUX_THINKING_LEVEL:-high}" MUX_MODE="${MUX_MODE:-exec}" MUX_RUNTIME="${MUX_RUNTIME:-}" MUX_EXPERIMENTS="${MUX_EXPERIMENTS:-}" +MUX_EXPLORE_MODEL="${MUX_EXPLORE_MODEL:-}" resolve_project_path() { if [[ -n "${MUX_PROJECT_PATH}" ]]; then @@ -81,6 +82,11 @@ if [[ -n "${MUX_EXPERIMENTS}" ]]; then done fi +# Set explore sub-agent model (fast/cheap model for read-only investigation) +if [[ -n "${MUX_EXPLORE_MODEL}" ]]; then + cmd+=(--explore-model "${MUX_EXPLORE_MODEL}") +fi + MUX_OUTPUT_FILE="/tmp/mux-output.jsonl" MUX_TOKEN_FILE="/tmp/mux-tokens.json" diff --git a/benchmarks/terminal_bench/mux_agent.py b/benchmarks/terminal_bench/mux_agent.py index 23c6053fe1..b8f4aaea2f 100644 --- a/benchmarks/terminal_bench/mux_agent.py +++ b/benchmarks/terminal_bench/mux_agent.py @@ -68,6 +68,7 @@ class MuxAgent(BaseInstalledAgent): "MUX_MODE", "MUX_RUNTIME", "MUX_EXPERIMENTS", + "MUX_EXPLORE_MODEL", ) def __init__( @@ -78,12 +79,15 @@ def __init__( thinking_level: str | None = None, experiments: str | None = None, timeout: int | str | None = None, + explore_model: str | None = None, **kwargs: Any, ) -> None: super().__init__(logs_dir=logs_dir, **kwargs) # Set MUX_TIMEOUT_MS if timeout is provided via agent kwargs if timeout is not None: os.environ["MUX_TIMEOUT_MS"] = str(int(timeout) * 1000) + if explore_model is not None: + os.environ["MUX_EXPLORE_MODEL"] = str(explore_model).strip() repo_root_env = 
os.environ.get("MUX_AGENT_REPO_ROOT") repo_root = ( Path(repo_root_env).resolve() diff --git a/src/cli/run.ts b/src/cli/run.ts index 5a905150d8..5571239732 100644 --- a/src/cli/run.ts +++ b/src/cli/run.ts @@ -263,6 +263,10 @@ program .option("--mcp ", "MCP server as name=command (can be repeated)", collectMcpServers, []) .option("--no-mcp-config", "ignore global + repo MCP config files (use only --mcp servers)") .option("-e, --experiment ", "enable experiment (can be repeated)", collectExperiments, []) + .option( + "--explore-model ", + "model for explore sub-agents (fast/cheap recommended, e.g. anthropic:claude-haiku-3-5)" + ) .option("-b, --budget ", "stop when session cost exceeds budget (USD)", parseFloat) .option("--service-tier ", "OpenAI service tier: auto, default, flex, priority", "auto") .addHelpText( @@ -297,6 +301,7 @@ interface CLIOptions { mcp: MCPServerEntry[]; mcpConfig: boolean; experiment: string[]; + exploreModel?: string; budget?: number; serviceTier: "auto" | "default" | "flex" | "priority"; } @@ -351,6 +356,23 @@ async function main(): Promise { fsSync.writeFileSync(secretsFile, JSON.stringify(existingSecrets, null, 2)); } + // Set per-agent model defaults (e.g., --explore-model uses a fast model for explore sub-agents) + if (opts.exploreModel) { + const configFile = path.join(config.rootDir, "config.json"); + const existing: Record = fsSync.existsSync(configFile) + ? (JSON.parse(fsSync.readFileSync(configFile, "utf-8")) as Record) + : {}; + const prevDefaults = + existing.agentAiDefaults && typeof existing.agentAiDefaults === "object" + ? (existing.agentAiDefaults as Record) + : {}; + existing.agentAiDefaults = { + ...prevDefaults, + explore: { modelString: resolveModelAlias(opts.exploreModel) }, + }; + fsSync.writeFileSync(configFile, JSON.stringify(existing, null, 2)); + } + const workspaceId = generateWorkspaceId(); const projectDir = path.resolve(opts.dir); await ensureDirectory(projectDir);