diff --git a/benchmarks/terminal_bench/mux-run.sh b/benchmarks/terminal_bench/mux-run.sh
index b3db7f3714..e4b191162c 100644
--- a/benchmarks/terminal_bench/mux-run.sh
+++ b/benchmarks/terminal_bench/mux-run.sh
@@ -30,6 +30,7 @@ MUX_THINKING_LEVEL="${MUX_THINKING_LEVEL:-high}"
 MUX_MODE="${MUX_MODE:-exec}"
 MUX_RUNTIME="${MUX_RUNTIME:-}"
 MUX_EXPERIMENTS="${MUX_EXPERIMENTS:-}"
+MUX_EXPLORE_MODEL="${MUX_EXPLORE_MODEL:-}"
 
 resolve_project_path() {
   if [[ -n "${MUX_PROJECT_PATH}" ]]; then
@@ -81,6 +82,11 @@ if [[ -n "${MUX_EXPERIMENTS}" ]]; then
   done
 fi
 
+# Set explore sub-agent model (fast/cheap model for read-only investigation)
+if [[ -n "${MUX_EXPLORE_MODEL}" ]]; then
+  cmd+=(--explore-model "${MUX_EXPLORE_MODEL}")
+fi
+
 MUX_OUTPUT_FILE="/tmp/mux-output.jsonl"
 MUX_TOKEN_FILE="/tmp/mux-tokens.json"
 
@@ -91,9 +97,13 @@ if [[ -n "${MUX_TIMEOUT_MS}" ]]; then
 fi
 
 # Terminal-bench enforces timeouts via --global-agent-timeout-sec
-# Capture output to file while streaming to terminal for token extraction
-if ! printf '%s' "${instruction}" | "${cmd[@]}" | tee "${MUX_OUTPUT_FILE}"; then
-  fatal "mux agent session failed"
+# Capture output to file while streaming to terminal for token extraction.
+# Don't exit on failure: timed-out or crashed runs must still reach token extraction.
+# Capture the status with `||` — inside `if ! cmd; then`, $? is always 0.
+mux_exit_code=0
+printf '%s' "${instruction}" | "${cmd[@]}" | tee "${MUX_OUTPUT_FILE}" || mux_exit_code=$?
+if [[ "${mux_exit_code}" -ne 0 ]]; then
+  log "WARNING: mux agent session exited with code ${mux_exit_code}"
 fi
 
 # Extract usage and cost from the JSONL output.
@@ -145,3 +155,7 @@ result["input"] += subagent_input
 result["output"] += subagent_output
 print(json.dumps(result))
 ' "${MUX_OUTPUT_FILE}" > "${MUX_TOKEN_FILE}" 2>/dev/null || true
+
+# Propagate the agent's exit code so the harness can detect failures
+exit "${mux_exit_code}"
+
diff --git a/benchmarks/terminal_bench/mux_agent.py b/benchmarks/terminal_bench/mux_agent.py
index 23c6053fe1..b8f4aaea2f 100644
--- a/benchmarks/terminal_bench/mux_agent.py
+++ b/benchmarks/terminal_bench/mux_agent.py
@@ -68,6 +68,7 @@ class MuxAgent(BaseInstalledAgent):
         "MUX_MODE",
         "MUX_RUNTIME",
         "MUX_EXPERIMENTS",
+        "MUX_EXPLORE_MODEL",
     )
 
     def __init__(
@@ -78,12 +79,15 @@ def __init__(
         thinking_level: str | None = None,
         experiments: str | None = None,
         timeout: int | str | None = None,
+        explore_model: str | None = None,
         **kwargs: Any,
     ) -> None:
         super().__init__(logs_dir=logs_dir, **kwargs)
         # Set MUX_TIMEOUT_MS if timeout is provided via agent kwargs
         if timeout is not None:
             os.environ["MUX_TIMEOUT_MS"] = str(int(timeout) * 1000)
+        if explore_model is not None:
+            os.environ["MUX_EXPLORE_MODEL"] = str(explore_model).strip()
         repo_root_env = os.environ.get("MUX_AGENT_REPO_ROOT")
         repo_root = (
             Path(repo_root_env).resolve()
diff --git a/src/cli/run.ts b/src/cli/run.ts
index cfe69104dd..5571239732 100644
--- a/src/cli/run.ts
+++ b/src/cli/run.ts
@@ -68,7 +68,7 @@ import { DockerRuntime } from "@/node/runtime/DockerRuntime";
 import { runFullInit } from "@/node/runtime/runtimeFactory";
 import { execSync } from "child_process";
 import { getParseOptions } from "./argv";
-import { EXPERIMENT_IDS } from "@/common/constants/experiments";
+import { EXPERIMENT_IDS, EXPERIMENTS } from "@/common/constants/experiments";
 
 // Display labels for CLI help (OFF, LOW, MED, HIGH, MAX)
 const THINKING_LABELS_LIST = Object.values(THINKING_DISPLAY_LABELS).join(", ");
@@ -192,15 +192,26 @@ function collectExperiments(value: string, previous: string[]): string[] {
 
 /**
  * Convert experiment ID array to the experiments object expected by SendMessageOptions.
+ * Experiments with enabledByDefault=true are included automatically unless the user
+ * explicitly passes --experiment flags (which override defaults entirely).
  */
 function buildExperimentsObject(experimentIds: string[]): SendMessageOptions["experiments"] {
-  if (experimentIds.length === 0) return undefined;
+  // When user passes explicit --experiment flags, use exactly those.
+  // When no flags are passed, auto-enable experiments that are on by default.
+  const effectiveIds =
+    experimentIds.length > 0
+      ? experimentIds
+      : Object.values(EXPERIMENTS)
+          .filter((exp) => exp.enabledByDefault)
+          .map((exp) => exp.id);
+
+  if (effectiveIds.length === 0) return undefined;
 
   return {
-    programmaticToolCalling: experimentIds.includes("programmatic-tool-calling"),
-    programmaticToolCallingExclusive: experimentIds.includes("programmatic-tool-calling-exclusive"),
-    system1: experimentIds.includes("system-1"),
-    execSubagentHardRestart: experimentIds.includes("exec-subagent-hard-restart"),
+    programmaticToolCalling: effectiveIds.includes("programmatic-tool-calling"),
+    programmaticToolCallingExclusive: effectiveIds.includes("programmatic-tool-calling-exclusive"),
+    system1: effectiveIds.includes("system-1"),
+    execSubagentHardRestart: effectiveIds.includes("exec-subagent-hard-restart"),
   };
 }
 
@@ -252,6 +263,10 @@ program
   .option("--mcp <server>", "MCP server as name=command (can be repeated)", collectMcpServers, [])
   .option("--no-mcp-config", "ignore global + repo MCP config files (use only --mcp servers)")
   .option("-e, --experiment <id>", "enable experiment (can be repeated)", collectExperiments, [])
+  .option(
+    "--explore-model <model>",
+    "model for explore sub-agents (fast/cheap recommended, e.g. anthropic:claude-haiku-3-5)"
+  )
   .option("-b, --budget <usd>", "stop when session cost exceeds budget (USD)", parseFloat)
   .option("--service-tier <tier>", "OpenAI service tier: auto, default, flex, priority", "auto")
   .addHelpText(
@@ -286,6 +301,7 @@ interface CLIOptions {
   mcp: MCPServerEntry[];
   mcpConfig: boolean;
   experiment: string[];
+  exploreModel?: string;
   budget?: number;
   serviceTier: "auto" | "default" | "flex" | "priority";
 }
@@ -340,6 +356,28 @@ async function main(): Promise<void> {
     fsSync.writeFileSync(secretsFile, JSON.stringify(existingSecrets, null, 2));
   }
 
+  // Set per-agent model defaults (e.g., --explore-model uses a fast model for explore sub-agents)
+  if (opts.exploreModel) {
+    const configFile = path.join(config.rootDir, "config.json");
+    const existing: Record<string, unknown> = fsSync.existsSync(configFile)
+      ? (JSON.parse(fsSync.readFileSync(configFile, "utf-8")) as Record<string, unknown>)
+      : {};
+    const prevDefaults =
+      existing.agentAiDefaults && typeof existing.agentAiDefaults === "object"
+        ? (existing.agentAiDefaults as Record<string, unknown>)
+        : {};
+    const prevExplore =
+      prevDefaults.explore && typeof prevDefaults.explore === "object"
+        ? (prevDefaults.explore as Record<string, unknown>)
+        : {};
+    existing.agentAiDefaults = {
+      ...prevDefaults,
+      // Merge rather than replace so other explore settings already in config.json survive.
+      explore: { ...prevExplore, modelString: resolveModelAlias(opts.exploreModel) },
+    };
+    fsSync.writeFileSync(configFile, JSON.stringify(existing, null, 2));
+  }
+
   const workspaceId = generateWorkspaceId();
   const projectDir = path.resolve(opts.dir);
   await ensureDirectory(projectDir);
diff --git a/src/common/constants/experiments.ts b/src/common/constants/experiments.ts
index 264db07005..40d12de1f8 100644
--- a/src/common/constants/experiments.ts
+++ b/src/common/constants/experiments.ts
@@ -43,7 +43,7 @@ export const EXPERIMENTS: Record = {
     id: EXPERIMENT_IDS.PROGRAMMATIC_TOOL_CALLING,
     name: "Programmatic Tool Calling",
     description: "Enable code_execution tool for multi-tool workflows in a sandboxed JS runtime",
-    enabledByDefault: false,
+    enabledByDefault: true,
     userOverridable: true,
     showInSettings: true,
   },
@@ -76,7 +76,7 @@ export const EXPERIMENTS: Record = {
     id: EXPERIMENT_IDS.EXEC_SUBAGENT_HARD_RESTART,
     name: "Exec sub-agent hard restart",
     description: "Hard-restart exec sub-agents on context overflow",
-    enabledByDefault: false,
+    enabledByDefault: true,
     userOverridable: true,
     showInSettings: true,
   },