Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions benchmarks/terminal_bench/mux-run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ MUX_THINKING_LEVEL="${MUX_THINKING_LEVEL:-high}"
MUX_MODE="${MUX_MODE:-exec}"
MUX_RUNTIME="${MUX_RUNTIME:-}"
MUX_EXPERIMENTS="${MUX_EXPERIMENTS:-}"
MUX_EXPLORE_MODEL="${MUX_EXPLORE_MODEL:-}"

resolve_project_path() {
if [[ -n "${MUX_PROJECT_PATH}" ]]; then
Expand Down Expand Up @@ -81,6 +82,11 @@ if [[ -n "${MUX_EXPERIMENTS}" ]]; then
done
fi

# Set explore sub-agent model (fast/cheap model for read-only investigation)
if [[ -n "${MUX_EXPLORE_MODEL}" ]]; then
cmd+=(--explore-model "${MUX_EXPLORE_MODEL}")
fi

MUX_OUTPUT_FILE="/tmp/mux-output.jsonl"
MUX_TOKEN_FILE="/tmp/mux-tokens.json"

Expand All @@ -91,9 +97,13 @@ if [[ -n "${MUX_TIMEOUT_MS}" ]]; then
fi

# Terminal-bench enforces timeouts via --global-agent-timeout-sec
# Capture output to file while streaming to terminal for token extraction
# Capture output to file while streaming to terminal for token extraction.
# Don't exit on failure — always fall through to token extraction so timed-out
# or crashed runs still get usage data captured.
mux_exit_code=0
if ! printf '%s' "${instruction}" | "${cmd[@]}" | tee "${MUX_OUTPUT_FILE}"; then
fatal "mux agent session failed"
mux_exit_code=$?
log "WARNING: mux agent session exited with code ${mux_exit_code}"
Comment on lines 104 to +106

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge: Preserve the failing exit code before `!` negates it

Because the pipeline is wrapped in `if ! …; then`, the `!` operator negates the pipeline status, so inside the `then` block `$?` is always 0 when the command failed. With `set -o pipefail`, this means `mux_exit_code` becomes 0 on failures/timeouts, and the script exits 0 even when the agent crashed, so the harness can’t detect failures. Capture the pipeline status before applying `!` (or use `if …; then …; else mux_exit_code=$?; fi`) to preserve the real exit code.

Useful? React with 👍 / 👎.

fi

# Extract usage and cost from the JSONL output.
Expand Down Expand Up @@ -145,3 +155,7 @@ result["input"] += subagent_input
result["output"] += subagent_output
print(json.dumps(result))
' "${MUX_OUTPUT_FILE}" > "${MUX_TOKEN_FILE}" 2>/dev/null || true

# Propagate the agent's exit code so the harness can detect failures
exit "${mux_exit_code}"

4 changes: 4 additions & 0 deletions benchmarks/terminal_bench/mux_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class MuxAgent(BaseInstalledAgent):
"MUX_MODE",
"MUX_RUNTIME",
"MUX_EXPERIMENTS",
"MUX_EXPLORE_MODEL",
)

def __init__(
Expand All @@ -78,12 +79,15 @@ def __init__(
thinking_level: str | None = None,
experiments: str | None = None,
timeout: int | str | None = None,
explore_model: str | None = None,
**kwargs: Any,
) -> None:
super().__init__(logs_dir=logs_dir, **kwargs)
# Set MUX_TIMEOUT_MS if timeout is provided via agent kwargs
if timeout is not None:
os.environ["MUX_TIMEOUT_MS"] = str(int(timeout) * 1000)
if explore_model is not None:
os.environ["MUX_EXPLORE_MODEL"] = str(explore_model).strip()
repo_root_env = os.environ.get("MUX_AGENT_REPO_ROOT")
repo_root = (
Path(repo_root_env).resolve()
Expand Down
45 changes: 39 additions & 6 deletions src/cli/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ import { DockerRuntime } from "@/node/runtime/DockerRuntime";
import { runFullInit } from "@/node/runtime/runtimeFactory";
import { execSync } from "child_process";
import { getParseOptions } from "./argv";
import { EXPERIMENT_IDS } from "@/common/constants/experiments";
import { EXPERIMENT_IDS, EXPERIMENTS } from "@/common/constants/experiments";

// Display labels for CLI help (OFF, LOW, MED, HIGH, MAX)
const THINKING_LABELS_LIST = Object.values(THINKING_DISPLAY_LABELS).join(", ");
Expand Down Expand Up @@ -192,15 +192,26 @@ function collectExperiments(value: string, previous: string[]): string[] {

/**
* Convert experiment ID array to the experiments object expected by SendMessageOptions.
* Experiments with enabledByDefault=true are included automatically unless the user
* explicitly passes --experiment flags (which override defaults entirely).
*/
function buildExperimentsObject(experimentIds: string[]): SendMessageOptions["experiments"] {
if (experimentIds.length === 0) return undefined;
// When user passes explicit --experiment flags, use exactly those.
// When no flags are passed, auto-enable experiments that are on by default.
const effectiveIds =
experimentIds.length > 0
? experimentIds
: Object.values(EXPERIMENTS)
.filter((exp) => exp.enabledByDefault)
.map((exp) => exp.id);

if (effectiveIds.length === 0) return undefined;

return {
programmaticToolCalling: experimentIds.includes("programmatic-tool-calling"),
programmaticToolCallingExclusive: experimentIds.includes("programmatic-tool-calling-exclusive"),
system1: experimentIds.includes("system-1"),
execSubagentHardRestart: experimentIds.includes("exec-subagent-hard-restart"),
programmaticToolCalling: effectiveIds.includes("programmatic-tool-calling"),
programmaticToolCallingExclusive: effectiveIds.includes("programmatic-tool-calling-exclusive"),
system1: effectiveIds.includes("system-1"),
execSubagentHardRestart: effectiveIds.includes("exec-subagent-hard-restart"),
};
}

Expand Down Expand Up @@ -252,6 +263,10 @@ program
.option("--mcp <server>", "MCP server as name=command (can be repeated)", collectMcpServers, [])
.option("--no-mcp-config", "ignore global + repo MCP config files (use only --mcp servers)")
.option("-e, --experiment <id>", "enable experiment (can be repeated)", collectExperiments, [])
.option(
"--explore-model <model>",
"model for explore sub-agents (fast/cheap recommended, e.g. anthropic:claude-haiku-3-5)"
)
.option("-b, --budget <usd>", "stop when session cost exceeds budget (USD)", parseFloat)
.option("--service-tier <tier>", "OpenAI service tier: auto, default, flex, priority", "auto")
.addHelpText(
Expand Down Expand Up @@ -286,6 +301,7 @@ interface CLIOptions {
mcp: MCPServerEntry[];
mcpConfig: boolean;
experiment: string[];
exploreModel?: string;
budget?: number;
serviceTier: "auto" | "default" | "flex" | "priority";
}
Expand Down Expand Up @@ -340,6 +356,23 @@ async function main(): Promise<number> {
fsSync.writeFileSync(secretsFile, JSON.stringify(existingSecrets, null, 2));
}

// Set per-agent model defaults (e.g., --explore-model uses a fast model for explore sub-agents)
if (opts.exploreModel) {
const configFile = path.join(config.rootDir, "config.json");
const existing: Record<string, unknown> = fsSync.existsSync(configFile)
? (JSON.parse(fsSync.readFileSync(configFile, "utf-8")) as Record<string, unknown>)
: {};
const prevDefaults =
existing.agentAiDefaults && typeof existing.agentAiDefaults === "object"
? (existing.agentAiDefaults as Record<string, unknown>)
: {};
existing.agentAiDefaults = {
...prevDefaults,
explore: { modelString: resolveModelAlias(opts.exploreModel) },
};
fsSync.writeFileSync(configFile, JSON.stringify(existing, null, 2));
}

const workspaceId = generateWorkspaceId();
const projectDir = path.resolve(opts.dir);
await ensureDirectory(projectDir);
Expand Down
4 changes: 2 additions & 2 deletions src/common/constants/experiments.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export const EXPERIMENTS: Record<ExperimentId, ExperimentDefinition> = {
id: EXPERIMENT_IDS.PROGRAMMATIC_TOOL_CALLING,
name: "Programmatic Tool Calling",
description: "Enable code_execution tool for multi-tool workflows in a sandboxed JS runtime",
enabledByDefault: false,
enabledByDefault: true,
userOverridable: true,
showInSettings: true,
},
Expand Down Expand Up @@ -76,7 +76,7 @@ export const EXPERIMENTS: Record<ExperimentId, ExperimentDefinition> = {
id: EXPERIMENT_IDS.EXEC_SUBAGENT_HARD_RESTART,
name: "Exec sub-agent hard restart",
description: "Hard-restart exec sub-agents on context overflow",
enabledByDefault: false,
enabledByDefault: true,
userOverridable: true,
showInSettings: true,
},
Expand Down
Loading