From 7baf33dd0c1dfbe321ea51db07c28864599ecb2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Wed, 25 Mar 2026 14:19:05 +0100 Subject: [PATCH 1/4] feat: add macOS desktop snapshot surfaces --- .github/workflows/macos.yml | 55 +++ README.md | 2 +- .../Sources/AgentDeviceMacOSHelper/main.swift | 426 ++++++++++++++++++ skills/agent-device/SKILL.md | 341 +++----------- .../agent-device/references/macos-desktop.md | 45 +- .../agent-device/references/snapshot-refs.md | 3 + src/core/session-surface.ts | 6 - src/daemon/handlers/__tests__/session.test.ts | 54 ++- .../__tests__/snapshot-handler.test.ts | 95 ++++ src/daemon/handlers/find.ts | 2 +- src/daemon/handlers/interaction-snapshot.ts | 48 +- src/daemon/handlers/interaction-touch.ts | 2 +- src/daemon/handlers/session-open.ts | 17 +- src/daemon/handlers/session.ts | 2 +- src/daemon/handlers/snapshot-capture.ts | 9 +- src/daemon/handlers/snapshot-wait.ts | 61 ++- src/daemon/snapshot-processing.ts | 1 + src/platforms/ios/macos-helper.ts | 35 ++ src/utils/command-schema.ts | 12 +- src/utils/snapshot.ts | 9 +- website/docs/docs/commands.md | 33 +- website/docs/docs/installation.md | 2 +- 22 files changed, 871 insertions(+), 389 deletions(-) create mode 100644 .github/workflows/macos.yml diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 00000000..f0083ff5 --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,55 @@ +name: macOS + +on: + pull_request: + push: + branches: + - main + +permissions: + contents: read + +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + integration-macos: + name: Integration Tests + runs-on: macos-26 + timeout-minutes: 80 + continue-on-error: true + env: + AGENT_DEVICE_DAEMON_TIMEOUT_MS: '300000' + AGENT_DEVICE_IOS_APP_LAUNCH_TIMEOUT_MS: '60000' + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Setup 
toolchain + uses: ./.github/actions/setup-node-pnpm + + - name: Resolve agent-device home + id: macos-agent-home + run: echo "dir=$HOME/.agent-device" >> "$GITHUB_OUTPUT" + + - name: Build macOS XCTest runner + run: pnpm build:xcuitest:macos + + - name: Build macOS helper + run: pnpm build:macos-helper + + - name: Run macOS integration test + run: node --test test/integration/macos.test.ts + + - name: Upload macOS artifacts + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: macos-artifacts + if-no-files-found: ignore + path: | + ${{ steps.macos-agent-home.outputs.dir }}/daemon.log + ${{ steps.macos-agent-home.outputs.dir }}/sessions/** + test/artifacts/** + test/screenshots/** diff --git a/README.md b/README.md index e3afc7d6..f9835ec5 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ For agents: npm install -g agent-device ``` -On macOS, `agent-device` now includes a local `agent-device-macos-helper` source package that is built on demand for desktop permission checks, alert handling, and other host-Mac support paths. Release distribution should use a signed/notarized helper build; source checkouts fall back to a local Swift build. +On macOS, `agent-device` includes a local `agent-device-macos-helper` source package that is built on demand for desktop permission checks, alert handling, and helper-backed desktop snapshot surfaces. Release distribution should use a signed/notarized helper build; source checkouts fall back to a local Swift build. ## Contributing diff --git a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift index 75f599dc..64cc980e 100644 --- a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift +++ b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift @@ -53,6 +53,41 @@ struct AlertResponse: Encodable { let bundleId: String? 
} +struct RectResponse: Encodable { + let x: Double + let y: Double + let width: Double + let height: Double +} + +struct SnapshotNodeResponse: Encodable { + let index: Int + let type: String? + let role: String? + let subrole: String? + let label: String? + let value: String? + let identifier: String? + let rect: RectResponse? + let enabled: Bool? + let selected: Bool? + let hittable: Bool? + let depth: Int + let parentIndex: Int? + let pid: Int32? + let bundleId: String? + let appName: String? + let windowTitle: String? + let surface: String? +} + +struct SnapshotResponse: Encodable { + let surface: String + let nodes: [SnapshotNodeResponse] + let truncated = false + let backend = "macos-helper" +} + @main struct AgentDeviceMacOSHelper { static func main() { @@ -93,6 +128,8 @@ struct AgentDeviceMacOSHelper { return try handlePermission(arguments: Array(arguments.dropFirst())) case "alert": return try handleAlert(arguments: Array(arguments.dropFirst())) + case "snapshot": + return try handleSnapshot(arguments: Array(arguments.dropFirst())) default: throw HelperError.invalidArgs("unknown command: \(command)") } @@ -298,6 +335,28 @@ struct AgentDeviceMacOSHelper { ) ) } + + static func handleSnapshot(arguments: [String]) throws -> any Encodable { + guard let surface = optionValue(arguments: arguments, name: "--surface")? 
+ .trimmingCharacters(in: .whitespacesAndNewlines) + .lowercased(), + !surface.isEmpty + else { + throw HelperError.invalidArgs("snapshot requires --surface ") + } + + switch surface { + case "frontmost-app": + let app = try resolveAlertApplication(bundleId: nil, surface: surface) + return SuccessEnvelope(data: SnapshotResponse(surface: surface, nodes: snapshotFrontmostApp(app))) + case "desktop": + return SuccessEnvelope(data: SnapshotResponse(surface: surface, nodes: snapshotDesktop())) + case "menubar": + return SuccessEnvelope(data: SnapshotResponse(surface: surface, nodes: snapshotMenuBar())) + default: + throw HelperError.invalidArgs("snapshot requires --surface ") + } + } } private func optionValue(arguments: [String], name: String) -> String? { @@ -354,6 +413,334 @@ private func resolveAlertApplication(bundleId: String?, surface: String?) throws throw HelperError.commandFailed("unable to resolve target app") } +private struct SnapshotContext { + let surface: String + let pid: Int32? + let bundleId: String? + let appName: String? + let windowTitle: String? 
+} + +private func snapshotFrontmostApp(_ app: NSRunningApplication) -> [SnapshotNodeResponse] { + let appElement = AXUIElementCreateApplication(app.processIdentifier) + var nodes: [SnapshotNodeResponse] = [] + var visited = Set() + appendElementSnapshot( + appElement, + depth: 0, + parentIndex: nil, + context: SnapshotContext( + surface: "frontmost-app", + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName, + windowTitle: nil + ), + nodes: &nodes, + visited: &visited + ) + return nodes +} + +private func snapshotDesktop() -> [SnapshotNodeResponse] { + var nodes: [SnapshotNodeResponse] = [] + let rootIndex = appendSyntheticSnapshotNode( + into: &nodes, + type: "DesktopSurface", + label: "Desktop", + depth: 0, + parentIndex: nil, + surface: "desktop" + ) + + var runningApps = NSWorkspace.shared.runningApplications.filter { app in + app.activationPolicy != .prohibited + && !app.isTerminated + && (app.bundleIdentifier?.isEmpty == false || app.localizedName?.isEmpty == false) + } + runningApps.sort { left, right in + if left.isActive != right.isActive { + return left.isActive && !right.isActive + } + return (left.localizedName ?? "") < (right.localizedName ?? 
"") + } + + for app in runningApps { + let appElement = AXUIElementCreateApplication(app.processIdentifier) + let appContext = SnapshotContext( + surface: "desktop", + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName, + windowTitle: nil + ) + var appVisited = Set() + let appIndex = appendElementSnapshot( + appElement, + depth: 1, + parentIndex: rootIndex, + context: appContext, + nodes: &nodes, + visited: &appVisited + ) + let visibleWindows = windows(of: appElement).filter(isVisibleSnapshotWindow) + if visibleWindows.isEmpty { + continue + } + var visited = appVisited + for window in visibleWindows { + let windowTitle = stringAttribute(window, attribute: kAXTitleAttribute as String) + appendElementSnapshot( + window, + depth: 2, + parentIndex: appIndex, + context: SnapshotContext( + surface: "desktop", + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName, + windowTitle: windowTitle + ), + nodes: &nodes, + visited: &visited + ) + } + } + + return nodes +} + +private func snapshotMenuBar() -> [SnapshotNodeResponse] { + var nodes: [SnapshotNodeResponse] = [] + let rootIndex = appendSyntheticSnapshotNode( + into: &nodes, + type: "MenuBarSurface", + label: "Menu Bar", + depth: 0, + parentIndex: nil, + surface: "menubar" + ) + + if let frontmost = NSWorkspace.shared.frontmostApplication { + let frontmostElement = AXUIElementCreateApplication(frontmost.processIdentifier) + if let menuBar = elementAttribute(frontmostElement, attribute: kAXMenuBarAttribute as String) { + var frontmostVisited = Set() + appendElementSnapshot( + menuBar, + depth: 1, + parentIndex: rootIndex, + context: SnapshotContext( + surface: "menubar", + pid: Int32(frontmost.processIdentifier), + bundleId: frontmost.bundleIdentifier, + appName: frontmost.localizedName, + windowTitle: frontmost.localizedName + ), + nodes: &nodes, + visited: &frontmostVisited + ) + } + } + + if let systemUiServer = 
NSRunningApplication.runningApplications( + withBundleIdentifier: "com.apple.systemuiserver" + ).first { + let systemUiElement = AXUIElementCreateApplication(systemUiServer.processIdentifier) + if let menuExtras = elementAttribute(systemUiElement, attribute: kAXMenuBarAttribute as String) { + var systemUiVisited = Set() + appendElementSnapshot( + menuExtras, + depth: 1, + parentIndex: rootIndex, + context: SnapshotContext( + surface: "menubar", + pid: Int32(systemUiServer.processIdentifier), + bundleId: systemUiServer.bundleIdentifier, + appName: systemUiServer.localizedName, + windowTitle: "System Menu Extras" + ), + nodes: &nodes, + visited: &systemUiVisited + ) + } + } + + return nodes +} + +@discardableResult +private func appendSyntheticSnapshotNode( + into nodes: inout [SnapshotNodeResponse], + type: String, + label: String, + depth: Int, + parentIndex: Int?, + surface: String +) -> Int { + let index = nodes.count + nodes.append( + SnapshotNodeResponse( + index: index, + type: type, + role: type, + subrole: nil, + label: label, + value: nil, + identifier: "surface:\(surface):\(type.lowercased())", + rect: nil, + enabled: true, + selected: nil, + hittable: false, + depth: depth, + parentIndex: parentIndex, + pid: nil, + bundleId: nil, + appName: nil, + windowTitle: nil, + surface: surface + ) + ) + return index +} + +@discardableResult +private func appendElementSnapshot( + _ element: AXUIElement, + depth: Int, + parentIndex: Int?, + context: SnapshotContext, + nodes: inout [SnapshotNodeResponse], + visited: inout Set, + maxDepth: Int = 12 +) -> Int { + let elementHash = CFHash(element) + if visited.contains(elementHash) { + return parentIndex ?? 
0 + } + visited.insert(elementHash) + + let role = stringAttribute(element, attribute: kAXRoleAttribute as String) + let subrole = stringAttribute(element, attribute: kAXSubroleAttribute as String) + let title = stringAttribute(element, attribute: kAXTitleAttribute as String) + let description = stringAttribute(element, attribute: kAXDescriptionAttribute as String) + let value = stringAttribute(element, attribute: kAXValueAttribute as String) + let identifier = stringAttribute(element, attribute: "AXIdentifier") + let rect = rectAttribute(element) + let enabled = boolAttribute(element, attribute: kAXEnabledAttribute as String) + let selected = boolAttribute(element, attribute: kAXSelectedAttribute as String) + let type = normalizedSnapshotType(role: role, subrole: subrole) + let windowTitle = context.windowTitle ?? inferWindowTitle(for: element) + + let index = nodes.count + nodes.append( + SnapshotNodeResponse( + index: index, + type: type, + role: role, + subrole: subrole, + label: title ?? description ?? value, + value: value, + identifier: identifier, + rect: rect, + enabled: enabled, + selected: selected, + hittable: (enabled ?? true) && rect != nil, + depth: depth, + parentIndex: parentIndex, + pid: context.pid, + bundleId: context.bundleId, + appName: context.appName, + windowTitle: windowTitle, + surface: context.surface + ) + ) + + guard depth < maxDepth else { + return index + } + + for child in children(of: element) { + appendElementSnapshot( + child, + depth: depth + 1, + parentIndex: index, + context: SnapshotContext( + surface: context.surface, + pid: context.pid, + bundleId: context.bundleId, + appName: context.appName, + windowTitle: windowTitle + ), + nodes: &nodes, + visited: &visited, + maxDepth: maxDepth + ) + } + + return index +} + +private func normalizedSnapshotType(role: String?, subrole: String?) -> String? { + switch role { + case "AXApplication": + return "Application" + case "AXWindow": + return subrole == "AXStandardWindow" ? 
"Window" : (subrole ?? "Window") + case "AXSheet": + return "Sheet" + case "AXDialog": + return "Dialog" + case "AXButton": + return "Button" + case "AXStaticText": + return "StaticText" + case "AXTextField": + return "TextField" + case "AXTextArea": + return "TextArea" + case "AXScrollArea": + return "ScrollArea" + case "AXGroup": + return "Group" + case "AXMenuBar": + return "MenuBar" + case "AXMenuBarItem": + return "MenuBarItem" + case "AXMenu": + return "Menu" + case "AXMenuItem": + return "MenuItem" + default: + if let subrole, !subrole.isEmpty { + return subrole + } + return role + } +} + +private func isVisibleSnapshotWindow(_ window: AXUIElement) -> Bool { + guard let rect = rectAttribute(window) else { + return false + } + if rect.width <= 0 || rect.height <= 0 { + return false + } + if boolAttribute(window, attribute: kAXMinimizedAttribute as String) == true { + return false + } + return true +} + +private func inferWindowTitle(for element: AXUIElement) -> String? { + if let title = stringAttribute(element, attribute: kAXTitleAttribute as String) { + return title + } + if let window = elementAttribute(element, attribute: kAXWindowAttribute as String) { + return stringAttribute(window, attribute: kAXTitleAttribute as String) + } + return nil +} + private func validatedBundleId(_ rawBundleId: String) throws -> String { let bundleId = rawBundleId.trimmingCharacters(in: .whitespacesAndNewlines) let pattern = #"^[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)+$"# @@ -375,6 +762,16 @@ private func stringAttribute(_ element: AXUIElement, attribute: String) -> Strin return nil } +private func boolAttribute(_ element: AXUIElement, attribute: String) -> Bool? { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, + let number = value as? NSNumber + else { + return nil + } + return number.boolValue +} + private func elementAttribute(_ element: AXUIElement, attribute: String) -> AXUIElement? { var value: CFTypeRef? 
guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success else { @@ -386,6 +783,35 @@ private func elementAttribute(_ element: AXUIElement, attribute: String) -> AXUI return unsafeBitCast(value, to: AXUIElement.self) } +private func rectAttribute(_ element: AXUIElement) -> RectResponse? { + var positionValue: CFTypeRef? + var sizeValue: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionValue) == .success, + AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeValue) == .success, + let axPosition = positionValue, + let axSize = sizeValue + else { + return nil + } + + var position = CGPoint.zero + var size = CGSize.zero + guard AXValueGetType(axPosition as! AXValue) == .cgPoint, + AXValueGetValue(axPosition as! AXValue, .cgPoint, &position), + AXValueGetType(axSize as! AXValue) == .cgSize, + AXValueGetValue(axSize as! AXValue, .cgSize, &size) + else { + return nil + } + + return RectResponse( + x: Double(position.x), + y: Double(position.y), + width: Double(size.width), + height: Double(size.height) + ) +} + private func children(of element: AXUIElement) -> [AXUIElement] { var value: CFTypeRef? guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success, diff --git a/skills/agent-device/SKILL.md b/skills/agent-device/SKILL.md index 9f4085c8..c45d05bb 100644 --- a/skills/agent-device/SKILL.md +++ b/skills/agent-device/SKILL.md @@ -5,125 +5,84 @@ description: Automates interactions for Apple-platform apps (iOS, tvOS, macOS) a # Apple and Android Automation with agent-device -For exploration, use snapshot refs. For deterministic replay, use selectors. -For structured exploratory QA bug hunts and reporting, use [../dogfood/SKILL.md](../dogfood/SKILL.md). +Use this skill as a router. -## Start Here (Read This First) +Core rule: -Use this skill as a router, not a full manual. 
+- explore with `snapshot -i` and `@ref` +- stabilize with selectors +- use plain `snapshot` when you need to verify whether text is visible +- re-snapshot after every meaningful UI change -1. Pick one mode: - - Normal interaction flow - - Debug/crash flow - - Replay maintenance flow -2. Run one canonical flow below. -3. Open references only if blocked. +For exploratory QA bug hunts and reporting, use [../dogfood/SKILL.md](../dogfood/SKILL.md). -## Decision Map +## Quick route -- No target context yet: `devices` -> pick target -> `open`. -- Normal UI task: `open` -> `snapshot -i` -> `press/click/fill` -> `diff snapshot -i` -> `close` -- Debug/crash (iOS/Android): `open ` -> `logs clear --restart` -> reproduce -> `network dump` -> `logs path` -> targeted `grep` -- Replay drift: `replay -u ` -> verify updated selectors -- Remote multi-tenant run: allocate lease -> point client at remote daemon base URL -> run commands with tenant isolation flags -> heartbeat/release lease -- Device-scope isolation run: set iOS simulator set / Android allowlist -> run selectors within scope only -- macOS desktop task: run the macOS desktop flow, then open [references/macos-desktop.md](references/macos-desktop.md) if context menus, Finder rows, or desktop-specific snapshot behavior matters -- macOS desktop debugging: `open --platform macos` -> `logs clear --restart` -> reproduce -> `network dump` -> `logs path` +- Normal UI task: `open` -> `snapshot -i` -> `click/fill/press` -> `close` +- Debug task: `open` -> `logs clear --restart` -> reproduce -> `network dump` -> `logs path` +- Replay drift: `replay -u ` +- No target context yet: `devices` -> pick target -> `open` -## Target Selection Rules +## Target rules -- iOS local QA: use simulators unless the task explicitly requires a physical device. -- iOS local QA in mixed simulator/device environments: run `ensure-simulator` first and pass `--device`, `--udid`, or `--ios-simulator-device-set` on later commands. 
-- macOS desktop app automation: use `--platform macos`, or `--platform apple --target desktop` when the caller wants one Apple-family selector path. -- For macOS phase-1 non-default session targeting, use `open --platform macos --surface frontmost-app`. -- Android local QA: use `install` or `reinstall` for `.apk`/`.aab` files, then relaunch by installed package name. -- Android React Native + Metro flows: prefer `open --remote-config --relaunch`. -- In mixed-device environments, always pin the exact target with `--serial`, `--device`, `--udid`, or an isolation scope. -- For session-bound automation runs, prefer a pre-bound session/platform instead of repeating selectors on every command: set `AGENT_DEVICE_SESSION`, set `AGENT_DEVICE_PLATFORM`, and the daemon will enforce the shared lock policy across CLI, typed client, and RPC entry points. -- Use `--session-lock reject|strip` (or `AGENT_DEVICE_SESSION_LOCK`) only when you need to override the default reject behavior. Lock mode applies to nested `batch` steps too. 
+- iOS local QA: prefer simulators +- Android binary flow: `install` or `reinstall` first, then `open --relaunch` +- In mixed-device labs, always pin the target with `--device`, `--udid`, `--serial`, or an isolation scope +- For session-bound automation, prefer `AGENT_DEVICE_SESSION` + `AGENT_DEVICE_PLATFORM` -## Canonical Flows +## macOS rules -### 1) Normal Interaction Flow +- Use `open --platform macos` for normal Mac app automation +- Use `open --platform macos --surface frontmost-app|desktop|menubar` when you need desktop-global inspection first +- Use `app` sessions for `click`, `fill`, `press`, `scroll`, `screenshot`, and `record` +- Use `frontmost-app`, `desktop`, and `menubar` mainly for `snapshot`, `get`, `is`, and `wait` +- If you inspect with `desktop` or `menubar` and then need to act inside one app, open that app in a normal `app` session +- Prefer `@ref` or selectors over raw `x y` on macOS +- Use `click --button secondary` for context menus, then run `snapshot -i` again -```bash -agent-device open Settings --platform ios -agent-device snapshot -i -agent-device press @e3 -agent-device diff snapshot -i -agent-device fill @e5 "test" -agent-device close -``` +## Canonical flows -### 1a) Local iOS Simulator QA Flow +### Normal flow ```bash -agent-device ensure-simulator --platform ios --device "iPhone 16" --boot -agent-device open MyApp --platform ios --device "iPhone 16" --session qa-ios --relaunch +agent-device open Settings --platform ios agent-device snapshot -i agent-device press @e3 +agent-device fill @e5 "test" agent-device close ``` -Use this when a physical iPhone is also connected and you want deterministic simulator-only automation. 
- -### 1b) Android React Native + Metro QA Flow +### macOS app flow ```bash -agent-device reinstall MyApp /path/to/app-debug.apk --platform android --serial emulator-5554 -agent-device open com.example.myapp --remote-config ./agent-device.remote.json --relaunch +agent-device open TextEdit --platform macos agent-device snapshot -i +agent-device fill @e3 "desktop smoke test" +agent-device screenshot /tmp/macos-textedit.png agent-device close ``` -Do not use `open --relaunch` on Android. Install/reinstall binaries first, then relaunch by package. - -### 1c) Session-Bound Automation Flow +### macOS desktop-global inspect flow ```bash -export AGENT_DEVICE_SESSION=qa-ios -export AGENT_DEVICE_PLATFORM=ios -export AGENT_DEVICE_SESSION_LOCK=strip - -agent-device open MyApp --relaunch +agent-device open --platform macos --surface desktop agent-device snapshot -i -agent-device batch --steps-file /tmp/qa-steps.json --json +agent-device get attrs @e4 +agent-device is visible 'role="window" label="Notes"' +agent-device wait text "Notes" agent-device close ``` -Use this for orchestrators that must preserve one bound session/device across many plain CLI calls without a wrapper script. In `strip` mode, conflicting selectors such as `--target`, `--device`, `--udid`, `--serial`, and isolation-scope overrides are ignored instead of retargeting the run. - -### 1d) Android Emulator Session-Bound Flow +### Android relaunch flow ```bash -export AGENT_DEVICE_SESSION=qa-android -export AGENT_DEVICE_PLATFORM=android - -agent-device reinstall MyApp /path/to/app-debug.apk --serial emulator-5554 -agent-device --session-lock reject open com.example.myapp --relaunch -agent-device snapshot -i -agent-device close --shutdown -``` - -Use this when an Android emulator session must stay pinned while an agent or test runner issues plain CLI commands over time. 
- -### 1e) macOS Desktop Flow - -```bash -agent-device open TextEdit --platform macos +agent-device reinstall MyApp /path/to/app-debug.apk --platform android --serial emulator-5554 +agent-device open com.example.myapp --remote-config ./agent-device.remote.json --relaunch agent-device snapshot -i -agent-device fill @e3 "desktop smoke test" -agent-device screenshot /tmp/macos-textedit.png agent-device close ``` -Use this for host Mac desktop apps. Prefer the Apple runner interaction flow (`open`, `snapshot`, `press`, `click`, `fill`, `scroll`, `back`, `record`, `screenshot`). macOS also supports `clipboard read|write`, `trigger-app-event`, `logs`, `network dump`, `alert`, `settings appearance`, and `settings permission `. -Source checkouts build `agent-device-macos-helper` on first use for macOS permission/alert support; release builds should ship a signed helper. -Phase 1 exposes `app` and `frontmost-app` session surfaces. Broader `desktop` and `menubar` surfaces remain future work until the desktop-global backend lands. -Prefer selectors or snapshot refs (`@e...`) over raw x/y commands on macOS because the window origin can move between runs. -Open [references/macos-desktop.md](references/macos-desktop.md) when you need Finder-style list traversal, context-menu flows, or macOS-specific snapshot expectations. - -### 2) Debug/Crash Flow +### Debug flow ```bash agent-device open MyApp --platform ios @@ -132,221 +91,27 @@ agent-device network dump 25 agent-device logs path ``` -Logging is off by default. Enable only for debugging windows. -`logs clear --restart` requires an active app session (`open ` first). - -### 3) Replay Maintenance Flow +### Replay maintenance ```bash agent-device replay -u ./session.ad ``` -### 4) Remote Tenant Lease Flow (HTTP JSON-RPC) - -```bash -# Client points directly at the remote daemon HTTP base URL. 
-export AGENT_DEVICE_DAEMON_BASE_URL=http://mac-host.example:4310 -export AGENT_DEVICE_DAEMON_AUTH_TOKEN= - -# Allocate lease -curl -sS "${AGENT_DEVICE_DAEMON_BASE_URL}/rpc" \ - -H "content-type: application/json" \ - -H "Authorization: Bearer " \ - -d '{"jsonrpc":"2.0","id":"alloc-1","method":"agent_device.lease.allocate","params":{"runId":"run-123","tenantId":"acme","ttlMs":60000}}' - -# Use lease in tenant-isolated command execution -agent-device \ - --tenant acme \ - --session-isolation tenant \ - --run-id run-123 \ - --lease-id \ - session list --json - -# Heartbeat and release -curl -sS "${AGENT_DEVICE_DAEMON_BASE_URL}/rpc" \ - -H "content-type: application/json" \ - -H "Authorization: Bearer " \ - -d '{"jsonrpc":"2.0","id":"hb-1","method":"agent_device.lease.heartbeat","params":{"leaseId":"","ttlMs":60000}}' -curl -sS "${AGENT_DEVICE_DAEMON_BASE_URL}/rpc" \ - -H "content-type: application/json" \ - -H "Authorization: Bearer " \ - -d '{"jsonrpc":"2.0","id":"rel-1","method":"agent_device.lease.release","params":{"leaseId":""}}' -``` - -Notes: +## High-value guardrails -- `AGENT_DEVICE_DAEMON_BASE_URL` makes the CLI skip local daemon discovery/startup and call the remote HTTP daemon directly. -- `AGENT_DEVICE_DAEMON_AUTH_TOKEN` is sent in both the JSON-RPC request token and HTTP auth headers. -- In remote daemon mode, `--debug` does not tail a local `daemon.log`; inspect logs on the remote host instead. 
- -## Command Skeleton (Minimal) - -### Session and navigation - -```bash -agent-device devices -agent-device devices --platform ios --ios-simulator-device-set /tmp/tenant-a/simulators -agent-device devices --platform android --android-device-allowlist emulator-5554,device-1234 -agent-device ensure-simulator --device "iPhone 16" --ios-simulator-device-set /tmp/tenant-a/simulators -agent-device ensure-simulator --device "iPhone 16" --runtime com.apple.CoreSimulator.SimRuntime.iOS-18-4 --ios-simulator-device-set /tmp/tenant-a/simulators --boot -agent-device open [app|url] [url] -agent-device open [app] --relaunch -agent-device close [app] -agent-device install -agent-device install-from-source [--header "name:value"] -agent-device reinstall -agent-device session list -``` - -Use `boot` only as fallback when `open` cannot find/connect to a ready target. -If the workspace repeats the same selectors or device/session flags, prefer a checked-in `agent-device.json` or `--config ` over repeating them inline. -Environment-level defaults follow the same fields via `AGENT_DEVICE_*` names, so persistent host-specific values belong there rather than in committed project config. -That includes bound-session defaults such as `sessionLock` / `AGENT_DEVICE_SESSION_LOCK` when automation should consistently reject or strip conflicting device routing flags. -For Android emulators by AVD name, use `boot --platform android --device `. -For Android emulators without GUI, add `--headless`. -Use `--target mobile|tv` with `--platform` (required) to pick phone/tablet vs TV targets (AndroidTV/tvOS). -For Android React Native + Metro flows, install or reinstall the APK first, then use `open --remote-config --relaunch`; do not use `open --relaunch`. -For local iOS QA in mixed simulator/device environments, use `ensure-simulator` and pass `--device` or `--udid` so automation does not attach to a physical device by accident. 
-For session-bound automation, prefer `AGENT_DEVICE_SESSION` + `AGENT_DEVICE_PLATFORM`; that bound-session default now enables lock mode automatically. - -Isolation scoping quick reference: - -- `--ios-simulator-device-set ` scopes iOS simulator discovery + command execution to one simulator set. -- `--android-device-allowlist ` scopes Android discovery/selection to comma/space separated serials. -- Scope is applied before selectors (`--device`, `--udid`, `--serial`); out-of-scope selectors fail with `DEVICE_NOT_FOUND`. -- With iOS simulator-set scope enabled, iOS physical devices are not enumerated. -- In bound-session `strip` mode, conflicting per-call scope/selectors are ignored and the configured binding is restored for the request. Batch steps still inherit the parent `--platform` when they do not set their own. - -Simulator provisioning quick reference: - -- Use `ensure-simulator` to create or reuse a named iOS simulator inside a device set before starting a session. -- `--device ` is required (e.g. `"iPhone 16 Pro"`). `--runtime ` pins the runtime; omit to use the newest compatible one. -- `--boot` boots it immediately. Returns `udid`, `device`, `runtime`, `ios_simulator_device_set`, `created`, `booted`. -- Idempotent: safe to call repeatedly; reuses an existing matching simulator by default. - -TV quick reference: - -- AndroidTV: `open`/`apps` use TV launcher discovery automatically. -- TV target selection works on emulators/simulators and connected physical devices (AndroidTV + AppleTV). -- tvOS: runner-driven interactions and snapshots are supported (`snapshot`, `wait`, `press`, `fill`, `get`, `scroll`, `back`, `home`, `app-switcher`, `record` and related selector flows). -- tvOS `back`/`home`/`app-switcher` map to Siri Remote actions (`menu`, `home`, double-home) in the runner. -- tvOS follows iOS simulator-only command semantics for helpers like `pinch`, `settings`, and `push`. 
- -### Snapshot and targeting - -```bash -agent-device snapshot -i -agent-device diff snapshot -i -agent-device find "Sign In" click -agent-device press @e1 -agent-device fill @e2 "text" -agent-device is visible 'id="anchor"' -``` - -`press` is canonical tap command; `click` is an alias. -On macOS, use `click --button secondary <@ref|selector>` to open a context menu before the next `snapshot -i`. -For desktop-specific heuristics and Finder guidance, see [references/macos-desktop.md](references/macos-desktop.md). - -### Utilities - -```bash -agent-device appstate -agent-device clipboard read -agent-device clipboard write "token" -agent-device keyboard status -agent-device keyboard dismiss -agent-device perf --json -agent-device network dump [limit] [summary|headers|body|all] -agent-device push -agent-device trigger-app-event screenshot_taken '{"source":"qa"}' -agent-device get text @e1 -agent-device screenshot out.png -agent-device settings permission grant notifications -agent-device settings permission reset camera -agent-device settings permission grant accessibility --platform macos -agent-device settings permission reset screen-recording --platform macos -agent-device trace start -agent-device trace stop ./trace.log -``` - -### Batch (when sequence is already known) - -```bash -agent-device batch --steps-file /tmp/batch-steps.json --json -``` - -### Performance Check - -- Use `agent-device perf --json` (or `metrics --json`) after `open`. -- For detailed metric semantics, caveats, and interpretation guidance, see [references/perf-metrics.md](references/perf-metrics.md). - -## Guardrails (High Value Only) - -- Re-snapshot after UI mutations (navigation/modal/list changes). -- Prefer `snapshot -i`; scope/depth only when needed. -- Use refs for discovery, selectors for replay/assertions. -- `find "" click --json` returns `{ ref, locator, query, x, y }` — all derived from the matched snapshot node. 
Do not rely on these fields from raw `press`/`click` responses for observability; use `find` instead. -- Use `fill` for clear-then-type semantics; use `type` for focused append typing. -- Use `install` for in-place app upgrades (keep app data when platform permits), and `reinstall` for deterministic fresh-state runs. -- App binary format support for `install`/`reinstall`: Android `.apk`/`.aab`, iOS `.app`/`.ipa`. -- Android `.aab` requires `bundletool` in `PATH`, or `AGENT_DEVICE_BUNDLETOOL_JAR=` with `java` in `PATH`. -- Android `.aab` optional: set `AGENT_DEVICE_ANDROID_BUNDLETOOL_MODE=` to control bundletool `build-apks --mode` (default: `universal`). -- iOS `.ipa`: extract/install from `Payload/*.app`; when multiple app bundles are present, `` is used as a bundle id/name hint. -- iOS `appstate` is session-scoped; Android `appstate` is live foreground state. iOS responses include `device_udid` and `ios_simulator_device_set` for isolation verification. -- iOS `open` responses include `device_udid` and `ios_simulator_device_set` to confirm which simulator handled the session. -- Clipboard helpers: `clipboard read` / `clipboard write ` are supported on macOS, Android, and iOS simulators; iOS physical devices are not supported yet. -- Android keyboard helpers: `keyboard status|get|dismiss` report keyboard visibility/type and dismiss via keyevent when visible. -- `network dump` is best-effort and parses HTTP(s) entries from the session app log file. -- Biometric settings: iOS simulator supports `settings faceid|touchid `; Android supports `settings fingerprint ` where runtime tooling is available. -- For AndroidTV/tvOS selection, always pair `--target` with `--platform` (`ios`, `android`, or `apple` alias); target-only selection is invalid. -- `push` simulates notification delivery: - - iOS simulator uses APNs-style payload JSON. - - Android uses broadcast action + typed extras (string/boolean/number). 
-- `trigger-app-event` requires app-defined deep-link hooks and URL template configuration (`AGENT_DEVICE_APP_EVENT_URL_TEMPLATE` or platform-specific variants). -- On macOS, set `AGENT_DEVICE_MACOS_APP_EVENT_URL_TEMPLATE` when the desktop app uses a different deep-link template than iOS/Android. -- `trigger-app-event` requires an active session or explicit selectors (`--platform`, `--device`, `--udid`, `--serial`); on iOS physical devices, custom-scheme triggers require active app context. -- Canonical trigger behavior and caveats are documented in [`website/docs/docs/commands.md`](../../website/docs/docs/commands.md) under **App event triggers**. -- Permission settings are app-scoped on iOS/Android and require an active session app: - `settings permission [full|limited]` -- On macOS, use: - `settings permission ` -- macOS permission helpers check/request access and guide the user to System Settings when manual approval is required. -- iOS simulator permission alerts: use `alert wait` then `alert accept/dismiss` — `accept`/`dismiss` retry internally for up to 2 s so you do not need manual sleeps. See [references/permissions.md](references/permissions.md). -- `full|limited` mode applies only to iOS `photos`; other targets reject mode. -- On Android, non-ASCII `fill/type` may require an ADB keyboard IME on some system images; only install IME APKs from trusted sources and verify checksum/signature. -- If using `--save-script`, prefer explicit path syntax (`--save-script=flow.ad` or `./flow.ad`). -- For tenant-isolated remote runs, always pass `--tenant`, `--session-isolation tenant`, `--run-id`, and `--lease-id` together. -- Use short lease TTLs and heartbeat only while work is active; release leases immediately after run completion/failure. -- Env equivalents for scoped runs: `AGENT_DEVICE_IOS_SIMULATOR_DEVICE_SET` (compat `IOS_SIMULATOR_DEVICE_SET`) and - `AGENT_DEVICE_ANDROID_DEVICE_ALLOWLIST` (compat `ANDROID_DEVICE_ALLOWLIST`). 
-- For explicit remote client mode, prefer `AGENT_DEVICE_DAEMON_BASE_URL` / `--daemon-base-url` instead of relying on local daemon metadata or loopback-only ports. - -## Common Failure Patterns - -- `Failed to access Android app sandbox for /path/app-debug.apk`: Android relaunch/runtime-hint flow received an APK path instead of an installed package name. Use `reinstall` first, then `open --relaunch`. -- `mkdir: Needs 1 argument` while writing `ReactNativeDevPrefs.xml`: likely an older `agent-device` build or stale global install is still using the shell-based Android runtime-hint writer. Verify the exact binary being invoked. -- `Failed to terminate iOS app`: the flow may have selected a physical iPhone or an unavailable iOS target. Re-run with `ensure-simulator`, then pin the simulator with `--device` or `--udid`. - -## Security and Trust Notes - -- Prefer a preinstalled `agent-device` binary over on-demand package execution. -- If install is required, pin an exact version (for example: `npx --yes agent-device@ --help`). -- Signing/provisioning environment variables are optional, sensitive, and only for iOS physical-device setup. -- Logs/artifacts are written under `~/.agent-device`; replay scripts write to explicit paths you provide. -- For remote daemon mode, prefer `AGENT_DEVICE_DAEMON_SERVER_MODE=http|dual` on the host plus client-side `AGENT_DEVICE_DAEMON_BASE_URL`, with `AGENT_DEVICE_HTTP_AUTH_HOOK` and tenant-scoped lease admission where needed. -- Keep logging off unless debugging and use least-privilege/isolated environments for autonomous runs. - -## Common Mistakes - -- Mixing debug flow into normal runs (keep logs off unless debugging). -- Continuing to use stale refs after screen transitions. -- Using URL opens with Android `--activity` (unsupported combination). -- Treating `boot` as default first step instead of fallback. 
+- Prefer `snapshot -i`; use `--raw` only for structure debugging +- Use plain `snapshot` to verify text visibility; use `snapshot -i` mainly for interactive exploration and choosing refs +- Use refs for discovery, selectors for replay/assertions +- `fill` clears then types; `type` only types into the focused field +- `network dump` is best-effort and reads from the session app log +- `logs clear --restart` requires an active app session +- On macOS, helper-backed flows cover permissions, alerts, and desktop-global snapshot surfaces +- On macOS, do not assume `desktop` or `menubar` is the best surface for real interactions yet ## References -- [references/snapshot-refs.md](references/snapshot-refs.md) - [references/macos-desktop.md](references/macos-desktop.md) +- [references/snapshot-refs.md](references/snapshot-refs.md) - [references/logs-and-debug.md](references/logs-and-debug.md) - [references/session-management.md](references/session-management.md) - [references/permissions.md](references/permissions.md) diff --git a/skills/agent-device/references/macos-desktop.md b/skills/agent-device/references/macos-desktop.md index b0cead1d..67f7ab60 100644 --- a/skills/agent-device/references/macos-desktop.md +++ b/skills/agent-device/references/macos-desktop.md @@ -2,14 +2,22 @@ Use this reference for host Mac apps such as Finder, TextEdit, System Settings, Preview, or browser apps running as normal desktop windows. +## Start here + +- Use `open <app> --platform macos` when you need to act inside one app. +- Use `open --platform macos --surface frontmost-app|desktop|menubar` when you need to inspect desktop-global UI first. +- Use `app` sessions for `click`, `fill`, `press`, `scroll`, `screenshot`, and `record`. +- Use `frontmost-app`, `desktop`, and `menubar` mainly for `snapshot`, `get`, `is`, and `wait`. +- Prefer `@ref` or selectors. Avoid raw coordinates unless there is no better target. 
+ ## Mental model -- `snapshot -i` should describe UI that is visible to a human in the current front window. +- `snapshot -i` should describe UI visible to a human. - Context menus are not ambient UI. Open them explicitly with `click --button secondary`, then re-snapshot. - Prefer refs for exploration and selectors for deterministic replay/assertions. -- Avoid raw `x y` coordinates unless refs/selectors are impossible. +- If you inspect with `desktop` or `menubar` and then need to act on one app, switch to a normal `app` session. -## Canonical flow +## Canonical app flow ```bash agent-device open Finder --platform macos @@ -19,18 +27,36 @@ agent-device snapshot -i agent-device close ``` +## Canonical desktop-global flow + +```bash +agent-device open --platform macos --surface desktop +agent-device snapshot -i +agent-device get attrs @e4 +agent-device is visible 'role="window" label="Finder"' +agent-device close +``` + Surface variants: ```bash agent-device open --platform macos --surface frontmost-app +agent-device open --platform macos --surface desktop +agent-device open --platform macos --surface menubar ``` -- `frontmost-app` tracks the currently focused app explicitly in the session. -- `desktop` and `menubar` remain planned phase-2 surfaces for broader desktop-global automation work. +- `app`: default session surface; use this for most real interaction work. +- `frontmost-app`: inspect the currently focused app without naming it first. +- `desktop`: inspect visible desktop windows across apps. +- `menubar`: inspect the active app menu bar and system menu extras. + +Use `frontmost-app`, `desktop`, and `menubar` for read/inspect flows first. If the next step is a click/fill/press/scroll in one app, switch back to `app`. ## What to expect from snapshots -- `snapshot -i` prioritizes visible window content over dormant menu infrastructure. +- `app` snapshots should focus on the chosen app window. 
+- `desktop` snapshots can contain multiple windows from multiple apps. +- `menubar` snapshots can contain both app-menu items and system menu extras. - File rows, sidebar items, toolbar controls, search fields, and visible context menus should appear. - Finder and other native apps may expose duplicate-looking structures such as row wrapper nodes, `cell` nodes, and child `text` or `text-field` nodes. - Treat those as distinct AX nodes unless you have a stronger selector anchor. @@ -53,6 +79,8 @@ Expected pattern: Do not expect context-menu items to appear before the menu is opened. +Do not use `longpress` as a substitute for right-click on macOS. + ## Finder-specific guidance - `snapshot -i` should still expose visible folder rows even when nothing is selected. @@ -81,18 +109,21 @@ Good macOS selectors usually anchor on one of: - `label="failed-step.json"` - `role=button label="Search"` - `role=menu-item label="Rename"` +- `role=window label="Notes"` Prefer exact labels when the desktop UI is stable. Use `id=...` when the AX identifier is clearly app-owned and not a framework-generated `_NS:*` value. ## Things not to rely on - Mobile-only helpers like `install`, `reinstall`, and `push` -- Long-press as a substitute for right-click +- Desktop-global click/fill parity from `desktop` or `menubar` sessions - Raw coordinate assumptions across runs; macOS windows can move - Framework-generated `_NS:*` identifiers as stable selectors ## Troubleshooting - If visible window content is missing from `snapshot -i`, re-snapshot once after the UI settles. +- If `desktop` is too broad, retry with `frontmost-app` to narrow the inspect surface. +- If `menubar` is missing the menu you expect, make the target app frontmost first, then re-open the `menubar` surface and snapshot again. - If the wrong menu opened or no menu appeared, retry secondary-clicking the row/cell wrapper instead of the nested text node. 
- If the app has multiple windows, ensure the correct one is frontmost before relying on refs. diff --git a/skills/agent-device/references/snapshot-refs.md b/skills/agent-device/references/snapshot-refs.md index eba0d11e..f1adf1c4 100644 --- a/skills/agent-device/references/snapshot-refs.md +++ b/skills/agent-device/references/snapshot-refs.md @@ -12,6 +12,9 @@ For host Mac desktop apps, pair this reference with [macos-desktop.md](macos-des agent-device snapshot -i ``` +Use plain `snapshot` when you need to verify whether text is visible on screen. +Use `snapshot -i` mainly for interactive exploration and choosing refs. + Output: ``` diff --git a/src/core/session-surface.ts b/src/core/session-surface.ts index 3ebdd98d..0f9429d6 100644 --- a/src/core/session-surface.ts +++ b/src/core/session-surface.ts @@ -9,12 +9,6 @@ export const SESSION_SURFACES: readonly SessionSurface[] = [ 'menubar', ]; -export const PHASE1_MACOS_SESSION_SURFACES: readonly SessionSurface[] = ['app', 'frontmost-app']; - -export function isPhase1MacOsSessionSurface(surface: SessionSurface): boolean { - return PHASE1_MACOS_SESSION_SURFACES.includes(surface); -} - export function parseSessionSurface(value: string | undefined): SessionSurface { const normalized = value?.trim().toLowerCase(); if ( diff --git a/src/daemon/handlers/__tests__/session.test.ts b/src/daemon/handlers/__tests__/session.test.ts index 50396802..5d083871 100644 --- a/src/daemon/handlers/__tests__/session.test.ts +++ b/src/daemon/handlers/__tests__/session.test.ts @@ -2576,7 +2576,7 @@ test('open on existing macOS frontmost-app session preserves surface without --s ); }); -test('open on macOS rejects desktop surface until desktop-global backend lands', async () => { +test('open on macOS stores desktop surface without app context', async () => { const sessionStore = makeSessionStore(); const sessionName = 'macos-desktop-surface'; const response = await handleSessionCommands({ @@ -2606,11 +2606,53 @@ test('open on macOS rejects 
desktop surface until desktop-global backend lands', }), }); - assert.equal(response?.ok, false); - if (response && !response.ok) { - assert.equal(response.error.code, 'INVALID_ARGS'); - assert.match(response.error.message, /not supported yet/i); - assert.match(response.error.message, /app\|frontmost-app/i); + assert.equal(response?.ok, true); + const session = sessionStore.get(sessionName); + assert.equal(session?.surface, 'desktop'); + assert.equal(session?.appBundleId, undefined); + assert.equal(session?.appName, undefined); + if (response && response.ok) { + assert.equal(response.data?.surface, 'desktop'); + assert.equal(response.data?.appBundleId, undefined); + } +}); + +test('open on macOS stores menubar surface without app context', async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-menubar-surface'; + const response = await handleSessionCommands({ + req: { + token: 't', + session: sessionName, + command: 'open', + positionals: [], + flags: { + platform: 'macos', + surface: 'menubar', + }, + }, + sessionName, + logPath: path.join(os.tmpdir(), 'daemon.log'), + sessionStore, + invoke: noopInvoke, + dispatch: async () => ({}), + ensureReady: async () => {}, + resolveTargetDevice: async () => ({ + platform: 'macos', + id: 'host-macos-local', + name: 'Host Mac', + kind: 'device', + target: 'desktop', + booted: true, + }), + }); + + assert.equal(response?.ok, true); + const session = sessionStore.get(sessionName); + assert.equal(session?.surface, 'menubar'); + assert.equal(session?.appBundleId, undefined); + if (response && response.ok) { + assert.equal(response.data?.surface, 'menubar'); } }); diff --git a/src/daemon/handlers/__tests__/snapshot-handler.test.ts b/src/daemon/handlers/__tests__/snapshot-handler.test.ts index c9ee315b..0315fa2b 100644 --- a/src/daemon/handlers/__tests__/snapshot-handler.test.ts +++ b/src/daemon/handlers/__tests__/snapshot-handler.test.ts @@ -159,6 +159,101 @@ test('settings on macOS returns 
helper-backed permission status', async () => { ); }); +test('snapshot on macOS desktop surface uses helper-backed surface snapshot', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"Window","label":"Notes","surface":"desktop","bundleId":"com.apple.Notes","appName":"Notes","windowTitle":"Notes","rect":{"x":32,"y":48,"width":640,"height":480}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-snapshot'; + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-desktop-snapshot-')); + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'desktop', + }); + + try { + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const logged = await fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 'snapshot\n--surface\ndesktop\n'); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + assert.equal(updated?.snapshot?.nodes[0]?.label, 'Desktop'); + assert.equal(updated?.snapshot?.nodes[1]?.windowTitle, 'Notes'); + } finally { + if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + fs.rmSync(tmpDir, 
{ recursive: true, force: true }); + } + }, + ); +}); + +test('wait text on macOS desktop surface polls helper-backed snapshots instead of runner text search', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"StaticText","label":"Accessibility","surface":"desktop"}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-wait'; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'desktop', + }); + + let runnerCalls = 0; + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'wait', + positionals: ['Accessibility', '10'], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + runnerCommand: async () => { + runnerCalls += 1; + return { found: false }; + }, + }); + + assert.equal(response?.ok, true); + assert.equal(runnerCalls, 0); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + }, + ); +}); + test('diff rejects unsupported kind', async () => { const sessionStore = makeSessionStore(); const response = await handleSnapshotCommands({ diff --git a/src/daemon/handlers/find.ts b/src/daemon/handlers/find.ts index 4c13eab1..f51ea802 100644 --- a/src/daemon/handlers/find.ts +++ b/src/daemon/handlers/find.ts @@ -86,7 +86,7 @@ export async function handleFindCommands(params: { })) as { nodes?: RawSnapshotNode[]; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }; const rawNodes = data?.nodes ?? []; const nodes = attachRefs(req.flags?.snapshotRaw ? 
rawNodes : pruneGroupNodes(rawNodes)); diff --git a/src/daemon/handlers/interaction-snapshot.ts b/src/daemon/handlers/interaction-snapshot.ts index 093aae37..d4fc9a1b 100644 --- a/src/daemon/handlers/interaction-snapshot.ts +++ b/src/daemon/handlers/interaction-snapshot.ts @@ -1,10 +1,9 @@ import { dispatchCommand, type CommandFlags } from '../../core/dispatch.ts'; -import { attachRefs, type RawSnapshotNode } from '../../utils/snapshot.ts'; -import { pruneGroupNodes } from '../snapshot-processing.ts'; import type { SessionStore } from '../session-store.ts'; import type { SessionState } from '../types.ts'; import type { SnapshotState } from '../../utils/snapshot.ts'; import type { ContextFromFlags } from './interaction-common.ts'; +import { captureSnapshot } from './snapshot-capture.ts'; export async function captureSnapshotForSession( session: SessionState, @@ -14,29 +13,30 @@ export async function captureSnapshotForSession( options: { interactiveOnly: boolean }, dispatch: typeof dispatchCommand = dispatchCommand, ): Promise { - const data = (await dispatch(session.device, 'snapshot', [], flags?.out, { - ...contextFromFlags( - { - ...(flags ?? {}), - snapshotInteractiveOnly: options.interactiveOnly, - snapshotCompact: options.interactiveOnly, - }, - session.appBundleId, - session.trace?.outPath, - ), - })) as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android'; - }; - const rawNodes = data?.nodes ?? []; - const nodes = attachRefs(flags?.snapshotRaw ? rawNodes : pruneGroupNodes(rawNodes)); - session.snapshot = { - nodes, - truncated: data?.truncated, - createdAt: Date.now(), - backend: data?.backend, + const effectiveFlags = { + ...(flags ?? 
{}), + snapshotInteractiveOnly: options.interactiveOnly, + snapshotCompact: options.interactiveOnly, }; + const dispatchContext = contextFromFlags( + effectiveFlags, + session.appBundleId, + session.trace?.outPath, + ); + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand: dispatch, + device: session.device, + session, + req: { + token: '', + session: session.name, + command: 'snapshot', + positionals: [], + flags: effectiveFlags, + }, + logPath: dispatchContext.logPath ?? '', + }); + session.snapshot = snapshot; sessionStore.set(session.name, session); return session.snapshot; } diff --git a/src/daemon/handlers/interaction-touch.ts b/src/daemon/handlers/interaction-touch.ts index de74c1f9..6982b6f5 100644 --- a/src/daemon/handlers/interaction-touch.ts +++ b/src/daemon/handlers/interaction-touch.ts @@ -36,7 +36,7 @@ type CaptureSnapshotForSession = ( nodes: SnapshotNode[]; truncated?: boolean; createdAt: number; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }>; type ResolveRefTarget = diff --git a/src/daemon/handlers/session-open.ts b/src/daemon/handlers/session-open.ts index 4f31dccb..4885dba6 100644 --- a/src/daemon/handlers/session-open.ts +++ b/src/daemon/handlers/session-open.ts @@ -1,10 +1,6 @@ import { dispatchCommand, resolveTargetDevice } from '../../core/dispatch.ts'; import { isDeepLinkTarget } from '../../core/open-target.ts'; -import { - isPhase1MacOsSessionSurface, - parseSessionSurface, - type SessionSurface, -} from '../../core/session-surface.ts'; +import { parseSessionSurface, type SessionSurface } from '../../core/session-surface.ts'; import { ensureDeviceReady } from '../device-ready.ts'; import { contextFromFlags } from '../context.ts'; import { resolveFrontmostMacOsApp } from '../../platforms/ios/macos-helper.ts'; @@ -117,12 +113,6 @@ function resolveOpenSurface( return 'app'; } const surface = surfaceFlag ? 
parseSessionSurface(surfaceFlag) : 'app'; - if (!isPhase1MacOsSessionSurface(surface)) { - throw new AppError( - 'INVALID_ARGS', - `open --surface ${surface} is planned but not supported yet. Use app|frontmost-app for now.`, - ); - } if (surface !== 'app' && openTarget) { throw new AppError('INVALID_ARGS', `open --surface ${surface} does not accept an app target`); } @@ -148,12 +138,9 @@ function resolveRequestedOpenSurface(params: { async function resolveMacOsSurfaceAppState( surface: SessionSurface, ): Promise<{ appBundleId?: string; appName?: string }> { - if (surface === 'app') { + if (surface === 'app' || surface === 'desktop' || surface === 'menubar') { return {}; } - if (surface !== 'frontmost-app') { - throw new AppError('INVALID_ARGS', `open --surface ${surface} is not supported in phase 1`); - } const frontmost = await resolveFrontmostMacOsApp(); return { appBundleId: frontmost.bundleId, diff --git a/src/daemon/handlers/session.ts b/src/daemon/handlers/session.ts index a72dc810..2518ea9a 100644 --- a/src/daemon/handlers/session.ts +++ b/src/daemon/handlers/session.ts @@ -1454,7 +1454,7 @@ async function captureSnapshotForReplay( })) as { nodes?: RawSnapshotNode[]; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }; const rawNodes = data?.nodes ?? []; const nodes = attachRefs(action.flags?.snapshotRaw ? 
rawNodes : pruneGroupNodes(rawNodes)); diff --git a/src/daemon/handlers/snapshot-capture.ts b/src/daemon/handlers/snapshot-capture.ts index a25af209..29c0ac80 100644 --- a/src/daemon/handlers/snapshot-capture.ts +++ b/src/daemon/handlers/snapshot-capture.ts @@ -1,4 +1,5 @@ import { dispatchCommand } from '../../core/dispatch.ts'; +import { runMacOsSnapshotAction } from '../../platforms/ios/macos-helper.ts'; import { attachRefs, findNodeByRef, @@ -23,6 +24,10 @@ export async function captureSnapshot( params: CaptureSnapshotParams, ): Promise<{ snapshot: SnapshotState }> { const { dispatchSnapshotCommand, device, session, req, logPath, snapshotScope } = params; + if (device.platform === 'macos' && session?.surface && session.surface !== 'app') { + const helperSnapshot = await runMacOsSnapshotAction(session.surface); + return { snapshot: buildSnapshotState(helperSnapshot, req.flags?.snapshotRaw) }; + } const data = (await dispatchSnapshotCommand(device, 'snapshot', [], req.flags?.out, { ...contextFromFlags( logPath, @@ -33,7 +38,7 @@ export async function captureSnapshot( })) as { nodes?: RawSnapshotNode[]; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }; return { snapshot: buildSnapshotState(data, req.flags?.snapshotRaw) }; } @@ -42,7 +47,7 @@ export function buildSnapshotState( data: { nodes?: RawSnapshotNode[]; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }, snapshotRaw: boolean | undefined, ): SnapshotState { diff --git a/src/daemon/handlers/snapshot-wait.ts b/src/daemon/handlers/snapshot-wait.ts index 7bfc427c..6fdaf802 100644 --- a/src/daemon/handlers/snapshot-wait.ts +++ b/src/daemon/handlers/snapshot-wait.ts @@ -3,13 +3,7 @@ import { dispatchCommand } from '../../core/dispatch.ts'; import { runIosRunnerCommand } from '../../platforms/ios/runner-client.ts'; import { snapshotAndroid } from '../../platforms/android/index.ts'; import { 
isApplePlatform } from '../../utils/device.ts'; -import { - attachRefs, - findNodeByRef, - normalizeRef, - type RawSnapshotNode, -} from '../../utils/snapshot.ts'; -import { contextFromFlags } from '../context.ts'; +import { attachRefs, findNodeByRef, normalizeRef } from '../../utils/snapshot.ts'; import { findNodeByLabel, resolveRefLabel } from '../snapshot-processing.ts'; import { SessionStore } from '../session-store.ts'; import { @@ -19,7 +13,7 @@ import { type SelectorChain, } from '../selectors.ts'; import type { DaemonRequest, DaemonResponse, SessionState } from '../types.ts'; -import { buildSnapshotState } from './snapshot-capture.ts'; +import { captureSnapshot } from './snapshot-capture.ts'; import { recordIfSession } from './snapshot-session.ts'; import { DEFAULT_TIMEOUT_MS, parseTimeout, POLL_INTERVAL_MS } from './parse-utils.ts'; @@ -118,6 +112,7 @@ export async function handleWaitCommand(params: HandleWaitCommandParams): Promis const textResult = resolveWaitText(parsed, session); if (!textResult.ok) return textResult.response; return await waitForText({ + dispatchSnapshotCommand, device, logPath, req, @@ -152,26 +147,20 @@ async function waitForSelector(params: { const timeout = parsed.timeoutMs ?? 
DEFAULT_TIMEOUT_MS; const start = Date.now(); while (Date.now() - start < timeout) { - const data = await dispatchSnapshotCommand(device, 'snapshot', [], req.flags?.out, { - ...contextFromFlags( - logPath, - { + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand, + device, + session, + req: { + ...req, + flags: { ...req.flags, snapshotInteractiveOnly: false, snapshotCompact: false, }, - session?.appBundleId, - session?.trace?.outPath, - ), - }); - const snapshot = buildSnapshotState( - data as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android'; }, - req.flags?.snapshotRaw, - ); + logPath, + }); const nodes = snapshot.nodes; if (session) { session.snapshot = snapshot; @@ -258,6 +247,7 @@ function resolveWaitText( } async function waitForText(params: { + dispatchSnapshotCommand: typeof dispatchCommand; device: SessionState['device']; logPath: string; req: DaemonRequest; @@ -271,7 +261,30 @@ async function waitForText(params: { const timeout = timeoutMs ?? 
DEFAULT_TIMEOUT_MS; const start = Date.now(); while (Date.now() - start < timeout) { - if (isApplePlatform(device.platform)) { + if (device.platform === 'macos' && session?.surface && session.surface !== 'app') { + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand: params.dispatchSnapshotCommand, + device, + session, + req: { + ...req, + flags: { + ...req.flags, + snapshotInteractiveOnly: false, + snapshotCompact: false, + }, + }, + logPath, + }); + if (session) { + session.snapshot = snapshot; + sessionStore.set(session.name, session); + } + if (findNodeByLabel(snapshot.nodes, text)) { + recordIfSession(sessionStore, session, req, { text, waitedMs: Date.now() - start }); + return { ok: true, data: { text, waitedMs: Date.now() - start } }; + } + } else if (isApplePlatform(device.platform)) { const result = (await runnerCommand( device, { command: 'findText', text, appBundleId: session?.appBundleId }, diff --git a/src/daemon/snapshot-processing.ts b/src/daemon/snapshot-processing.ts index e4aaefa9..29a0f659 100644 --- a/src/daemon/snapshot-processing.ts +++ b/src/daemon/snapshot-processing.ts @@ -82,6 +82,7 @@ export function normalizeType(type: string): string { let value = type .trim() .replace(/XCUIElementType/gi, '') + .replace(/^AX/gi, '') .toLowerCase(); const lastSeparator = Math.max(value.lastIndexOf('.'), value.lastIndexOf('/')); if (lastSeparator !== -1) { diff --git a/src/platforms/ios/macos-helper.ts b/src/platforms/ios/macos-helper.ts index 3eb76715..51270b36 100644 --- a/src/platforms/ios/macos-helper.ts +++ b/src/platforms/ios/macos-helper.ts @@ -9,6 +9,32 @@ import type { SessionSurface } from '../../core/session-surface.ts'; export type MacOsPermissionTarget = 'accessibility' | 'screen-recording' | 'input-monitoring'; +export type MacOsSnapshotNode = { + index: number; + type?: string; + role?: string; + subrole?: string; + label?: string; + value?: string; + identifier?: string; + rect?: { + x: number; + y: number; + width: 
number; + height: number; + }; + enabled?: boolean; + selected?: boolean; + hittable?: boolean; + depth?: number; + parentIndex?: number; + pid?: number; + bundleId?: string; + appName?: string; + windowTitle?: string; + surface?: string; +}; + type HelperSuccess> = { ok: true; data: T; @@ -249,3 +275,12 @@ export async function runMacOsAlertAction( } return await runMacOsHelper(args); } + +export async function runMacOsSnapshotAction(surface: Exclude): Promise<{ + surface: Exclude; + nodes: MacOsSnapshotNode[]; + truncated: false; + backend: 'macos-helper'; +}> { + return await runMacOsHelper(['snapshot', '--surface', surface]); +} diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index e01e32a4..e65d0503 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -1,5 +1,5 @@ import { SETTINGS_USAGE_OVERRIDE } from '../core/settings-contract.ts'; -import { PHASE1_MACOS_SESSION_SURFACES } from '../core/session-surface.ts'; +import { SESSION_SURFACES } from '../core/session-surface.ts'; export type CliFlags = { json: boolean; @@ -70,7 +70,7 @@ export type CliFlags = { saveScript?: boolean | string; shutdown?: boolean; relaunch?: boolean; - surface?: 'app' | 'frontmost-app'; + surface?: 'app' | 'frontmost-app' | 'desktop' | 'menubar'; headless?: boolean; restart?: boolean; noRecord?: boolean; @@ -305,9 +305,9 @@ const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ key: 'surface', names: ['--surface'], type: 'enum', - enumValues: PHASE1_MACOS_SESSION_SURFACES, - usageLabel: '--surface app|frontmost-app', - usageDescription: 'macOS phase-1 session surface for open (defaults to app)', + enumValues: SESSION_SURFACES, + usageLabel: '--surface app|frontmost-app|desktop|menubar', + usageDescription: 'macOS session surface for open (defaults to app)', }, { key: 'headless', @@ -815,7 +815,7 @@ const COMMAND_SCHEMAS: Record = { }, open: { helpDescription: - 'Boot device/simulator; optionally launch app or deep link URL (macOS also 
supports --surface app|frontmost-app)', + 'Boot device/simulator; optionally launch app or deep link URL (macOS also supports --surface app|frontmost-app|desktop|menubar)', summary: 'Open an app, deep link or URL, save replays', positionalArgs: ['appOrUrl?', 'url?'], allowedFlags: ['activity', 'saveScript', 'relaunch', 'surface'], diff --git a/src/utils/snapshot.ts b/src/utils/snapshot.ts index 9f9e6d04..ae377897 100644 --- a/src/utils/snapshot.ts +++ b/src/utils/snapshot.ts @@ -16,6 +16,8 @@ export type SnapshotOptions = { export type RawSnapshotNode = { index: number; type?: string; + role?: string; + subrole?: string; label?: string; value?: string; identifier?: string; @@ -25,6 +27,11 @@ export type RawSnapshotNode = { hittable?: boolean; depth?: number; parentIndex?: number; + pid?: number; + bundleId?: string; + appName?: string; + windowTitle?: string; + surface?: string; }; export type SnapshotNode = RawSnapshotNode & { @@ -35,7 +42,7 @@ export type SnapshotState = { nodes: SnapshotNode[]; createdAt: number; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }; export function attachRefs(nodes: RawSnapshotNode[]): SnapshotNode[] { diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index faac66a3..b9b7d2d1 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -17,6 +17,7 @@ agent-device boot --platform android agent-device boot --platform android --device Pixel_9_Pro_XL --headless agent-device open [app|url] [url] agent-device open --platform macos --surface frontmost-app +agent-device open --platform macos --surface desktop agent-device close [app] agent-device back agent-device home @@ -33,10 +34,10 @@ agent-device app-switcher - `open [app|url] [url]` already boots/activates the selected target when needed. - `open ` deep links are supported on Android and iOS. - `open ` opens a deep link on iOS. 
-- `open --platform macos --surface app|frontmost-app` selects the macOS phase-1 session surface explicitly. `app` is the default when an app argument is provided. +- `open --platform macos --surface app|frontmost-app|desktop|menubar` selects the macOS session surface explicitly. `app` is the default when an app argument is provided. - On iOS devices, `http(s)://` URLs open in Safari when no app is active. Custom scheme URLs require an active app in the session. - `AGENT_DEVICE_SESSION` and `AGENT_DEVICE_PLATFORM` can pre-bind a default session/platform for CLI automation runs, so normal commands (`open`, `snapshot`, `press`, `fill`, `screenshot`, `devices`, and `batch`) do not need those flags repeated on every call. -- A configured `AGENT_DEVICE_SESSION` now implies bound-session lock mode by default. The CLI forwards that policy to the daemon, which enforces the same conflict handling for CLI, typed client, and direct RPC requests. +- A configured `AGENT_DEVICE_SESSION` implies bound-session lock mode by default. The CLI forwards that policy to the daemon, which enforces the same conflict handling for CLI, typed client, and direct RPC requests. - `--session-lock reject|strip` sets the lock policy for a single CLI invocation, including nested batch steps. - `AGENT_DEVICE_SESSION_LOCK=reject|strip` sets the default lock policy for bound-session automation runs. The older `--session-locked`, `--session-lock-conflicts`, `AGENT_DEVICE_SESSION_LOCKED`, and `AGENT_DEVICE_SESSION_LOCK_CONFLICTS` forms remain supported as compatibility aliases. - Direct RPC callers can pass `meta.lockPolicy` and optional `meta.lockPlatform` on `agent_device.command` requests for the same daemon-enforced behavior. 
@@ -123,18 +124,40 @@ agent-device screenshot apple-tv.png --platform ios --target tv ```bash agent-device devices --platform macos agent-device open TextEdit --platform macos +agent-device open --platform macos --surface desktop agent-device snapshot -i --platform apple --target desktop ``` - `--platform macos` selects the host Mac as a `desktop` target. - `--platform apple --target desktop` selects the same macOS backend through the Apple-family alias. -- macOS uses the same runner-driven interaction/snapshot flow as iOS/tvOS for app-scoped `open`, `appstate`, `snapshot`, `press`, `fill`, `scroll`, `back`, `screenshot`, `record`, and selector-based commands. -- `open --platform macos --surface frontmost-app` stores the currently focused app as the session surface. -- `desktop` and `menubar` remain the planned phase-2 path for broader computer-use support; they are not exposed yet in the phase-1 CLI surface. +- Use `app` sessions for normal app control: `open`, `snapshot`, `click`, `fill`, `press`, `scroll`, `back`, `screenshot`, `record`. +- Use `frontmost-app`, `desktop`, and `menubar` when you need to inspect desktop-global UI before choosing one app. +- `open --platform macos --surface frontmost-app` inspects the currently focused app without naming it first. +- `open --platform macos --surface desktop` inspects visible windows across the desktop. +- `open --platform macos --surface menubar` inspects the active app menu bar and system menu extras. +- Use `frontmost-app`, `desktop`, and `menubar` mainly for `snapshot`, `get`, `is`, and `wait`. +- If you inspect with `desktop` or `menubar` and then need to click or fill inside one app, open that app in a normal `app` session. - macOS also supports `clipboard read|write`, `trigger-app-event`, `logs`, `network dump`, `alert`, `settings appearance`, and `settings permission `. - Prefer selector or `@ref`-driven interactions on macOS. 
Window position can shift between runs, so raw x/y point commands are less stable than snapshot-derived targets. +- Use `click --button secondary` for context menus on macOS, then run `snapshot -i` again. - Mobile-only helpers remain unsupported on macOS: `boot`, `home`, `app-switcher`, `install`, `reinstall`, `install-from-source`, and `push`. +Recommended loops: + +```bash +# One app, full interaction +agent-device open TextEdit --platform macos +agent-device snapshot -i +agent-device fill @e3 "hello" +agent-device close + +# Desktop-global inspection first +agent-device open --platform macos --surface desktop +agent-device snapshot -i +agent-device is visible 'role="window" label="Notes"' +agent-device close +``` + ## Snapshot and inspect ```bash diff --git a/website/docs/docs/installation.md b/website/docs/docs/installation.md index 25285957..d232df50 100644 --- a/website/docs/docs/installation.md +++ b/website/docs/docs/installation.md @@ -25,7 +25,7 @@ npx agent-device open Settings --platform ios ## macOS desktop notes -- The macOS desktop path now uses a local `agent-device-macos-helper` for permission checks (`settings permission ...`), alert handling, and related host-Mac support. +- The macOS desktop path uses a local `agent-device-macos-helper` for permission checks (`settings permission ...`), alert handling, and helper-backed desktop snapshot surfaces (`frontmost-app`, `desktop`, `menubar`). - Source checkouts build the helper lazily on first use and cache it under `~/.agent-device/macos-helper/current/`. - Release distribution should ship a stable signed/notarized helper build so macOS trust/TCC state is tied to a durable code signature instead of an ad-hoc local binary. 
From beb3c502dc002fc4f1d221cee1b15dcce3d58c2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Wed, 25 Mar 2026 14:27:33 +0100 Subject: [PATCH 2/4] fix: tighten macos desktop surface routing --- .../Sources/AgentDeviceMacOSHelper/main.swift | 47 ++++--- src/daemon/handlers/__tests__/find.test.ts | 69 +++++++++ .../handlers/__tests__/interaction.test.ts | 44 ++++++ .../__tests__/snapshot-handler.test.ts | 41 ++++++ src/daemon/handlers/find.ts | 50 +++---- src/daemon/handlers/interaction-touch.ts | 35 +++++ src/daemon/handlers/snapshot-capture.ts | 131 ++++++++++++++++-- src/utils/__tests__/args.test.ts | 5 +- 8 files changed, 353 insertions(+), 69 deletions(-) diff --git a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift index 64cc980e..2de8d986 100644 --- a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift +++ b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift @@ -467,27 +467,23 @@ private func snapshotDesktop() -> [SnapshotNodeResponse] { for app in runningApps { let appElement = AXUIElementCreateApplication(app.processIdentifier) - let appContext = SnapshotContext( - surface: "desktop", - pid: Int32(app.processIdentifier), - bundleId: app.bundleIdentifier, - appName: app.localizedName, - windowTitle: nil - ) - var appVisited = Set() - let appIndex = appendElementSnapshot( - appElement, - depth: 1, - parentIndex: rootIndex, - context: appContext, - nodes: &nodes, - visited: &appVisited - ) let visibleWindows = windows(of: appElement).filter(isVisibleSnapshotWindow) if visibleWindows.isEmpty { continue } - var visited = appVisited + let appIndex = appendSyntheticSnapshotNode( + into: &nodes, + type: "Application", + label: app.localizedName ?? app.bundleIdentifier ?? 
"Application", + depth: 1, + parentIndex: rootIndex, + surface: "desktop", + identifier: app.bundleIdentifier, + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName + ) + var visited = Set() for window in visibleWindows { let windowTitle = stringAttribute(window, attribute: kAXTitleAttribute as String) appendElementSnapshot( @@ -575,7 +571,12 @@ private func appendSyntheticSnapshotNode( label: String, depth: Int, parentIndex: Int?, - surface: String + surface: String, + identifier: String? = nil, + pid: Int32? = nil, + bundleId: String? = nil, + appName: String? = nil, + windowTitle: String? = nil ) -> Int { let index = nodes.count nodes.append( @@ -586,17 +587,17 @@ private func appendSyntheticSnapshotNode( subrole: nil, label: label, value: nil, - identifier: "surface:\(surface):\(type.lowercased())", + identifier: identifier ?? "surface:\(surface):\(type.lowercased())", rect: nil, enabled: true, selected: nil, hittable: false, depth: depth, parentIndex: parentIndex, - pid: nil, - bundleId: nil, - appName: nil, - windowTitle: nil, + pid: pid, + bundleId: bundleId, + appName: appName, + windowTitle: windowTitle, surface: surface ) ) diff --git a/src/daemon/handlers/__tests__/find.test.ts b/src/daemon/handlers/__tests__/find.test.ts index a6b11025..2925fb81 100644 --- a/src/daemon/handlers/__tests__/find.test.ts +++ b/src/daemon/handlers/__tests__/find.test.ts @@ -8,6 +8,7 @@ import { AppError } from '../../../utils/errors.ts'; import { SessionStore } from '../../session-store.ts'; import type { SessionState } from '../../types.ts'; import type { DaemonRequest } from '../../types.ts'; +import { withMockedMacOsHelper } from '../../../platforms/ios/__tests__/macos-helper-test-utils.ts'; function makeSessionStore(): SessionStore { const root = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-find-handler-')); @@ -29,6 +30,22 @@ function makeSession(name: string): SessionState { }; } +function makeMacOsSession(name: 
string): SessionState { + return { + name, + device: { + platform: 'macos', + id: 'macos-host', + name: 'Mac', + kind: 'device', + booted: true, + }, + createdAt: Date.now(), + actions: [], + surface: 'desktop', + }; +} + const INCREMENT_NODE = { type: 'Button', label: 'Increment', @@ -249,3 +266,55 @@ test('handleFindCommands click returns deterministic metadata across locator var }); } }); + +test('handleFindCommands uses helper-backed snapshots for macOS desktop sessions', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"Window","label":"Notes","surface":"desktop","rect":{"x":32,"y":48,"width":640,"height":480}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async ({ tmpDir }) => { + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-find'; + sessionStore.set(sessionName, makeMacOsSession(sessionName)); + let snapshotDispatchCalls = 0; + + try { + const response = await handleFindCommands({ + req: { + token: 't', + session: sessionName, + command: 'find', + positionals: ['label', 'Notes', 'get', 'attrs'], + flags: {}, + }, + sessionName, + logPath: '/tmp/test.log', + sessionStore, + invoke: async () => ({ ok: true }), + dispatch: async (_device, command) => { + if (command === 'snapshot') { + snapshotDispatchCalls += 1; + } + return {}; + }, + }); + + assert.equal(response?.ok, true); + assert.equal(snapshotDispatchCalls, 0); + const logged = await fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 'snapshot\n--surface\ndesktop\n'); + } finally { 
+ if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + } + }, + ); +}); diff --git a/src/daemon/handlers/__tests__/interaction.test.ts b/src/daemon/handlers/__tests__/interaction.test.ts index 7b0000e3..311c98b5 100644 --- a/src/daemon/handlers/__tests__/interaction.test.ts +++ b/src/daemon/handlers/__tests__/interaction.test.ts @@ -47,6 +47,22 @@ function makeAndroidSession(name: string): SessionState { }; } +function makeMacOsDesktopSession(name: string): SessionState { + return { + name, + device: { + platform: 'macos', + id: 'macos-host', + name: 'Mac', + kind: 'device', + booted: true, + }, + createdAt: Date.now(), + actions: [], + surface: 'desktop', + }; +} + const contextFromFlags = (flags: CommandFlags | undefined) => ({ count: flags?.count, intervalMs: flags?.intervalMs, @@ -122,6 +138,34 @@ test('press coordinates dispatches press and records as press', async () => { assert.deepEqual(session?.actions[0]?.positionals, ['100', '200']); }); +test('click rejects macOS desktop surface interactions until helper routing exists', async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-click'; + sessionStore.set(sessionName, makeMacOsDesktopSession(sessionName)); + + const response = await handleInteractionCommands({ + req: { + token: 't', + session: sessionName, + command: 'click', + positionals: ['100', '200'], + flags: {}, + }, + sessionName, + sessionStore, + contextFromFlags, + dispatch: async () => { + throw new Error('dispatch should not be called'); + }, + }); + + assert.equal(response?.ok, false); + if (response && !response.ok) { + assert.equal(response.error.code, 'UNSUPPORTED_OPERATION'); + assert.match(response.error.message, /macOS desktop sessions/); + } +}); + test('press coordinates appends touch-visualization events while recording', async () => { const sessionStore = makeSessionStore(); const sessionName = 
'default'; diff --git a/src/daemon/handlers/__tests__/snapshot-handler.test.ts b/src/daemon/handlers/__tests__/snapshot-handler.test.ts index 0315fa2b..aff505e5 100644 --- a/src/daemon/handlers/__tests__/snapshot-handler.test.ts +++ b/src/daemon/handlers/__tests__/snapshot-handler.test.ts @@ -211,6 +211,47 @@ test('snapshot on macOS desktop surface uses helper-backed surface snapshot', as ); }); +test('snapshot on macOS desktop surface applies scope and depth after helper capture', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"Application","label":"Notes","surface":"desktop","bundleId":"com.apple.Notes","appName":"Notes"},{"index":2,"depth":2,"parentIndex":1,"type":"Window","label":"Notes","surface":"desktop","windowTitle":"Notes","rect":{"x":32,"y":48,"width":640,"height":480}},{"index":3,"depth":3,"parentIndex":2,"type":"StaticText","label":"Pinned","surface":"desktop","rect":{"x":40,"y":60,"width":80,"height":24}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-scoped-snapshot'; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'desktop', + }); + + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: { snapshotScope: 'Notes', snapshotDepth: 0 }, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + assert.equal(updated?.snapshot?.nodes.length, 1); + assert.equal(updated?.snapshot?.nodes[0]?.label, 'Notes'); + 
assert.equal(updated?.snapshot?.nodes[0]?.depth, 0); + assert.equal(updated?.snapshot?.nodes[0]?.parentIndex, undefined); + }, + ); +}); + test('wait text on macOS desktop surface polls helper-backed snapshots instead of runner text search', async () => { await withMockedMacOsHelper( [ diff --git a/src/daemon/handlers/find.ts b/src/daemon/handlers/find.ts index f51ea802..a428bda4 100644 --- a/src/daemon/handlers/find.ts +++ b/src/daemon/handlers/find.ts @@ -1,22 +1,14 @@ import { dispatchCommand, resolveTargetDevice } from '../../core/dispatch.ts'; import { findBestMatchesByLocator, type FindLocator } from '../../utils/finders.ts'; -import { - attachRefs, - centerOfRect, - type RawSnapshotNode, - type SnapshotState, -} from '../../utils/snapshot.ts'; +import { centerOfRect, type SnapshotState } from '../../utils/snapshot.ts'; import { AppError } from '../../utils/errors.ts'; import type { DaemonRequest, DaemonResponse } from '../types.ts'; import { SessionStore } from '../session-store.ts'; import { contextFromFlags } from '../context.ts'; import { ensureDeviceReady } from '../device-ready.ts'; -import { - extractNodeText, - findNearestHittableAncestor, - pruneGroupNodes, -} from '../snapshot-processing.ts'; +import { extractNodeText, findNearestHittableAncestor } from '../snapshot-processing.ts'; import { parseTimeout } from './parse-utils.ts'; +import { captureSnapshot } from './snapshot-capture.ts'; export async function handleFindCommands(params: { req: DaemonRequest; @@ -55,7 +47,6 @@ export async function handleFindCommands(params: { if (!session) { await ensureDeviceReady(device); } - const appBundleId = session?.appBundleId; const scope = shouldScopeFind(locator) ? 
query : undefined; const requiresRect = action === 'click' || action === 'focus' || action === 'fill' || action === 'type'; @@ -71,38 +62,29 @@ export async function handleFindCommands(params: { if (lastNodes && now - lastSnapshotAt < 750) { return { nodes: lastNodes }; } - const data = (await dispatch(device, 'snapshot', [], req.flags?.out, { - ...contextFromFlags( - logPath, - { + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand: dispatch, + device, + session, + req: { + ...req, + flags: { ...req.flags, - snapshotScope: scope, snapshotInteractiveOnly: interactiveOnly, snapshotCompact: interactiveOnly, }, - appBundleId, - session?.trace?.outPath, - ), - })) as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android' | 'macos-helper'; - }; - const rawNodes = data?.nodes ?? []; - const nodes = attachRefs(req.flags?.snapshotRaw ? rawNodes : pruneGroupNodes(rawNodes)); + }, + logPath, + snapshotScope: scope, + }); + const nodes = snapshot.nodes; lastSnapshotAt = now; lastNodes = nodes; if (session) { - const snapshot: SnapshotState = { - nodes, - truncated: data?.truncated, - createdAt: Date.now(), - backend: data?.backend, - }; session.snapshot = snapshot; sessionStore.set(sessionName, session); } - return { nodes, truncated: data?.truncated, backend: data?.backend }; + return { nodes, truncated: snapshot.truncated, backend: snapshot.backend }; }; if (action === 'wait') { const timeout = timeoutMs ?? 10000; diff --git a/src/daemon/handlers/interaction-touch.ts b/src/daemon/handlers/interaction-touch.ts index 6982b6f5..21b1500d 100644 --- a/src/daemon/handlers/interaction-touch.ts +++ b/src/daemon/handlers/interaction-touch.ts @@ -90,6 +90,13 @@ export async function handleTouchInteractionCommands(params: { error: { code: 'SESSION_NOT_FOUND', message: 'No active session. Run open first.' 
}, }; } + const unsupportedSurfaceResponse = unsupportedMacOsDesktopSurfaceInteraction( + session, + commandLabel, + ); + if (unsupportedSurfaceResponse) { + return unsupportedSurfaceResponse; + } if (!isCommandSupportedOnDevice('press', session.device)) { return { ok: false, @@ -300,6 +307,15 @@ export async function handleTouchInteractionCommands(params: { if (command === 'fill') { const session = sessionStore.get(sessionName); + if (session) { + const unsupportedSurfaceResponse = unsupportedMacOsDesktopSurfaceInteraction( + session, + command, + ); + if (unsupportedSurfaceResponse) { + return unsupportedSurfaceResponse; + } + } if (session && !isCommandSupportedOnDevice('fill', session.device)) { return { ok: false, @@ -491,6 +507,25 @@ export async function handleTouchInteractionCommands(params: { return null; } +function unsupportedMacOsDesktopSurfaceInteraction( + session: SessionState, + command: 'click' | 'press' | 'fill', +): DaemonResponse | null { + if (session.device.platform !== 'macos') { + return null; + } + if (session.surface !== 'desktop' && session.surface !== 'menubar') { + return null; + } + return { + ok: false, + error: { + code: 'UNSUPPORTED_OPERATION', + message: `${command} is not supported on macOS ${session.surface} sessions yet. 
Open an app session to act, or use the ${session.surface} surface to inspect.`, + }, + }; +} + function parseCoordinateTarget(positionals: string[]): { x: number; y: number } | null { if (positionals.length < 2) return null; const x = Number(positionals[0]); diff --git a/src/daemon/handlers/snapshot-capture.ts b/src/daemon/handlers/snapshot-capture.ts index 29c0ac80..f16ef7f6 100644 --- a/src/daemon/handlers/snapshot-capture.ts +++ b/src/daemon/handlers/snapshot-capture.ts @@ -9,7 +9,7 @@ import { } from '../../utils/snapshot.ts'; import type { DaemonResponse, DaemonRequest, SessionState } from '../types.ts'; import { contextFromFlags } from '../context.ts'; -import { pruneGroupNodes, resolveRefLabel } from '../snapshot-processing.ts'; +import { findNodeByLabel, pruneGroupNodes, resolveRefLabel } from '../snapshot-processing.ts'; type CaptureSnapshotParams = { dispatchSnapshotCommand: typeof dispatchCommand; @@ -20,27 +20,38 @@ type CaptureSnapshotParams = { snapshotScope?: string; }; +type SnapshotData = { + nodes?: RawSnapshotNode[]; + truncated?: boolean; + backend?: 'xctest' | 'android' | 'macos-helper'; +}; + export async function captureSnapshot( params: CaptureSnapshotParams, ): Promise<{ snapshot: SnapshotState }> { + const { req } = params; + const data = await captureSnapshotData(params); + return { snapshot: buildSnapshotState(data, req.flags?.snapshotRaw) }; +} + +export async function captureSnapshotData(params: CaptureSnapshotParams): Promise { const { dispatchSnapshotCommand, device, session, req, logPath, snapshotScope } = params; if (device.platform === 'macos' && session?.surface && session.surface !== 'app') { const helperSnapshot = await runMacOsSnapshotAction(session.surface); - return { snapshot: buildSnapshotState(helperSnapshot, req.flags?.snapshotRaw) }; + return shapeMacOsSurfaceSnapshot(helperSnapshot, { + snapshotDepth: req.flags?.snapshotDepth, + snapshotInteractiveOnly: req.flags?.snapshotInteractiveOnly, + snapshotScope, + }); } - 
const data = (await dispatchSnapshotCommand(device, 'snapshot', [], req.flags?.out, { + return (await dispatchSnapshotCommand(device, 'snapshot', [], req.flags?.out, { ...contextFromFlags( logPath, { ...req.flags, snapshotScope }, session?.appBundleId, session?.trace?.outPath, ), - })) as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android' | 'macos-helper'; - }; - return { snapshot: buildSnapshotState(data, req.flags?.snapshotRaw) }; + })) as SnapshotData; } export function buildSnapshotState( @@ -61,6 +72,108 @@ export function buildSnapshotState( }; } +function shapeMacOsSurfaceSnapshot( + data: SnapshotData, + options: { + snapshotDepth?: number; + snapshotInteractiveOnly?: boolean; + snapshotScope?: string; + }, +): SnapshotData { + let nodes = data.nodes ?? []; + if (options.snapshotScope) { + nodes = scopeSnapshotNodes(nodes, options.snapshotScope); + } + if (options.snapshotInteractiveOnly) { + nodes = filterInteractiveSnapshotNodes(nodes); + } + if (typeof options.snapshotDepth === 'number') { + nodes = filterSnapshotNodesByDepth(nodes, options.snapshotDepth); + } + return { ...data, nodes }; +} + +function scopeSnapshotNodes(nodes: RawSnapshotNode[], scope: string): RawSnapshotNode[] { + const scopedNodes = attachRefs(nodes); + const match = findNodeByLabel(scopedNodes, scope); + if (!match) { + return []; + } + const startIndex = nodes.findIndex((node) => node.index === match.index); + if (startIndex === -1) { + return []; + } + const startDepth = nodes[startIndex]?.depth ?? 0; + const slice: RawSnapshotNode[] = []; + for (let index = startIndex; index < nodes.length; index += 1) { + const node = nodes[index]; + if (!node) continue; + const depth = node.depth ?? 
0; + if (index > startIndex && depth <= startDepth) { + break; + } + slice.push(node); + } + return reindexSnapshotNodes(slice, startDepth); +} + +function filterInteractiveSnapshotNodes(nodes: RawSnapshotNode[]): RawSnapshotNode[] { + if (nodes.length === 0) { + return nodes; + } + const byIndex = new Map(); + for (const node of nodes) { + byIndex.set(node.index, node); + } + const keepIndexes = new Set(); + for (const node of nodes) { + if (!isInteractiveSnapshotNode(node)) continue; + let current: RawSnapshotNode | undefined = node; + while (current) { + if (keepIndexes.has(current.index)) break; + keepIndexes.add(current.index); + current = + typeof current.parentIndex === 'number' ? byIndex.get(current.parentIndex) : undefined; + } + } + if (keepIndexes.size === 0) { + return nodes; + } + return reindexSnapshotNodes(nodes.filter((node) => keepIndexes.has(node.index))); +} + +function filterSnapshotNodesByDepth(nodes: RawSnapshotNode[], maxDepth: number): RawSnapshotNode[] { + return reindexSnapshotNodes(nodes.filter((node) => (node.depth ?? 0) <= maxDepth)); +} + +function reindexSnapshotNodes(nodes: RawSnapshotNode[], depthOffset = 0): RawSnapshotNode[] { + const indexMap = new Map(); + for (const [index, node] of nodes.entries()) { + indexMap.set(node.index, index); + } + return nodes.map((node, index) => ({ + ...node, + index, + depth: Math.max(0, (node.depth ?? 0) - depthOffset), + parentIndex: typeof node.parentIndex === 'number' ? indexMap.get(node.parentIndex) : undefined, + })); +} + +function isInteractiveSnapshotNode(node: RawSnapshotNode): boolean { + if (node.hittable) return true; + if (node.rect) return true; + const role = `${node.type ?? ''} ${node.role ?? ''} ${node.subrole ?? 
''}`.toLowerCase(); + return ( + role.includes('button') || + role.includes('menu') || + role.includes('textfield') || + role.includes('searchfield') || + role.includes('checkbox') || + role.includes('radio') || + role.includes('switch') + ); +} + export function resolveSnapshotScope( snapshotScope: string | undefined, session: SessionState | undefined, diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index feb45274..c8278f57 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -739,11 +739,10 @@ test('command usage shows command and global flags separately', () => { assert.match(help, /--platform ios\|macos\|android\|apple/); }); -test('open command usage documents macOS surface flag', () => { +test('open command usage documents macOS desktop surface flags', () => { const help = usageForCommand('open'); if (help === null) throw new Error('Expected command help text'); - assert.match(help, /--surface app\|frontmost-app/); - assert.doesNotMatch(help, /desktop\|menubar/); + assert.match(help, /--surface app\|frontmost-app\|desktop\|menubar/); assert.match(help, /macOS also supports --surface/); }); From 9993b67f248d402bf3399816459476f6f514e589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Wed, 25 Mar 2026 14:40:57 +0100 Subject: [PATCH 3/4] refactor: harden macos helper snapshot traversal --- .../SnapshotTraversal.swift | 526 ++++++++++++++++++ .../Sources/AgentDeviceMacOSHelper/main.swift | 462 +-------------- .../handlers/__tests__/interaction.test.ts | 35 ++ .../__tests__/snapshot-handler.test.ts | 49 ++ src/daemon/handlers/find.ts | 12 +- src/daemon/handlers/interaction-snapshot.ts | 9 +- src/daemon/handlers/snapshot-capture.ts | 21 +- src/daemon/handlers/snapshot-wait.ts | 96 ++-- src/daemon/handlers/snapshot.ts | 6 +- src/daemon/snapshot-processing.ts | 10 +- src/platforms/ios/macos-helper.ts | 4 +- 11 files changed, 703 insertions(+), 527 deletions(-) create 
mode 100644 macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift diff --git a/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift b/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift new file mode 100644 index 00000000..4aa82332 --- /dev/null +++ b/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift @@ -0,0 +1,526 @@ +import AppKit +import ApplicationServices +import Foundation + +private enum SnapshotTraversalLimits { + static let maxDesktopApps = 24 + static let maxNodes = 1500 + static let maxDepth = 12 +} + +struct RectResponse: Encodable { + let x: Double + let y: Double + let width: Double + let height: Double +} + +struct SnapshotNodeResponse: Encodable { + let index: Int + let type: String? + let role: String? + let subrole: String? + let label: String? + let value: String? + let identifier: String? + let rect: RectResponse? + let enabled: Bool? + let selected: Bool? + let hittable: Bool? + let depth: Int + let parentIndex: Int? + let pid: Int32? + let bundleId: String? + let appName: String? + let windowTitle: String? + let surface: String? +} + +struct SnapshotResponse: Encodable { + let surface: String + let nodes: [SnapshotNodeResponse] + let truncated: Bool + let backend = "macos-helper" +} + +private struct SnapshotBuildResult { + let nodes: [SnapshotNodeResponse] + let truncated: Bool +} + +private struct SnapshotContext { + let surface: String + let pid: Int32? + let bundleId: String? + let appName: String? + let windowTitle: String? 
+} + +private struct SnapshotTraversalState { + var nodes: [SnapshotNodeResponse] = [] + var visited: [AXUIElement] = [] + var truncated = false +} + +func captureSnapshotResponse(surface: String) throws -> SnapshotResponse { + let result: SnapshotBuildResult + switch surface { + case "frontmost-app": + result = try snapshotFrontmostApp() + case "desktop": + result = snapshotDesktop() + case "menubar": + result = snapshotMenuBar() + default: + throw HelperError.invalidArgs("snapshot requires --surface ") + } + + return SnapshotResponse(surface: surface, nodes: result.nodes, truncated: result.truncated) +} + +private func snapshotFrontmostApp() throws -> SnapshotBuildResult { + let app = try resolveTargetApplication(bundleId: nil, surface: "frontmost-app") + let appElement = AXUIElementCreateApplication(app.processIdentifier) + var state = SnapshotTraversalState() + _ = appendElementSnapshot( + appElement, + depth: 0, + parentIndex: nil, + context: SnapshotContext( + surface: "frontmost-app", + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName, + windowTitle: nil + ), + state: &state + ) + return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated) +} + +private func snapshotDesktop() -> SnapshotBuildResult { + var state = SnapshotTraversalState() + guard + let rootIndex = appendSyntheticSnapshotNode( + into: &state, + type: "DesktopSurface", + label: "Desktop", + depth: 0, + parentIndex: nil, + surface: "desktop" + ) + else { + return SnapshotBuildResult(nodes: state.nodes, truncated: true) + } + + var runningApps = NSWorkspace.shared.runningApplications.filter { app in + app.activationPolicy != .prohibited + && !app.isTerminated + && (app.bundleIdentifier?.isEmpty == false || app.localizedName?.isEmpty == false) + } + runningApps.sort { left, right in + if left.isActive != right.isActive { + return left.isActive && !right.isActive + } + return (left.localizedName ?? "") < (right.localizedName ?? 
"") + } + + var includedApps = 0 + for app in runningApps { + if includedApps >= SnapshotTraversalLimits.maxDesktopApps { + state.truncated = true + break + } + if state.truncated { + break + } + + let appElement = AXUIElementCreateApplication(app.processIdentifier) + let visibleWindows = windows(of: appElement).filter(isVisibleSnapshotWindow) + if visibleWindows.isEmpty { + continue + } + + guard + let appIndex = appendSyntheticSnapshotNode( + into: &state, + type: "Application", + label: app.localizedName ?? app.bundleIdentifier ?? "Application", + depth: 1, + parentIndex: rootIndex, + surface: "desktop", + identifier: app.bundleIdentifier, + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName + ) + else { + break + } + + includedApps += 1 + for window in visibleWindows { + if state.truncated { + break + } + let windowTitle = stringAttribute(window, attribute: kAXTitleAttribute as String) + _ = appendElementSnapshot( + window, + depth: 2, + parentIndex: appIndex, + context: SnapshotContext( + surface: "desktop", + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName, + windowTitle: windowTitle + ), + state: &state + ) + } + } + + return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated) +} + +private func snapshotMenuBar() -> SnapshotBuildResult { + var state = SnapshotTraversalState() + guard + let rootIndex = appendSyntheticSnapshotNode( + into: &state, + type: "MenuBarSurface", + label: "Menu Bar", + depth: 0, + parentIndex: nil, + surface: "menubar" + ) + else { + return SnapshotBuildResult(nodes: state.nodes, truncated: true) + } + + if let frontmost = NSWorkspace.shared.frontmostApplication { + let frontmostElement = AXUIElementCreateApplication(frontmost.processIdentifier) + if let menuBar = elementAttribute(frontmostElement, attribute: kAXMenuBarAttribute as String) { + _ = appendElementSnapshot( + menuBar, + depth: 1, + parentIndex: rootIndex, + 
context: SnapshotContext( + surface: "menubar", + pid: Int32(frontmost.processIdentifier), + bundleId: frontmost.bundleIdentifier, + appName: frontmost.localizedName, + windowTitle: frontmost.localizedName + ), + state: &state + ) + } + } + + if !state.truncated, + let systemUiServer = NSRunningApplication.runningApplications( + withBundleIdentifier: "com.apple.systemuiserver" + ).first + { + let systemUiElement = AXUIElementCreateApplication(systemUiServer.processIdentifier) + if let menuExtras = elementAttribute(systemUiElement, attribute: kAXMenuBarAttribute as String) { + _ = appendElementSnapshot( + menuExtras, + depth: 1, + parentIndex: rootIndex, + context: SnapshotContext( + surface: "menubar", + pid: Int32(systemUiServer.processIdentifier), + bundleId: systemUiServer.bundleIdentifier, + appName: systemUiServer.localizedName, + windowTitle: "System Menu Extras" + ), + state: &state + ) + } + } + + return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated) +} + +@discardableResult +private func appendSyntheticSnapshotNode( + into state: inout SnapshotTraversalState, + type: String, + label: String, + depth: Int, + parentIndex: Int?, + surface: String, + identifier: String? = nil, + pid: Int32? = nil, + bundleId: String? = nil, + appName: String? = nil, + windowTitle: String? = nil +) -> Int? { + guard reserveSnapshotNodeCapacity(&state) else { + return nil + } + + let index = state.nodes.count + state.nodes.append( + SnapshotNodeResponse( + index: index, + type: type, + role: type, + subrole: nil, + label: label, + value: nil, + identifier: identifier ?? 
"surface:\(surface):\(type.lowercased())", + rect: nil, + enabled: true, + selected: nil, + hittable: false, + depth: depth, + parentIndex: parentIndex, + pid: pid, + bundleId: bundleId, + appName: appName, + windowTitle: windowTitle, + surface: surface + ) + ) + return index +} + +@discardableResult +private func appendElementSnapshot( + _ element: AXUIElement, + depth: Int, + parentIndex: Int?, + context: SnapshotContext, + state: inout SnapshotTraversalState, + maxDepth: Int = SnapshotTraversalLimits.maxDepth +) -> Int? { + if state.visited.contains(where: { CFEqual($0, element) }) { + return parentIndex + } + guard reserveSnapshotNodeCapacity(&state) else { + return parentIndex + } + state.visited.append(element) + + let role = stringAttribute(element, attribute: kAXRoleAttribute as String) + let subrole = stringAttribute(element, attribute: kAXSubroleAttribute as String) + let title = stringAttribute(element, attribute: kAXTitleAttribute as String) + let description = stringAttribute(element, attribute: kAXDescriptionAttribute as String) + let value = stringAttribute(element, attribute: kAXValueAttribute as String) + let identifier = stringAttribute(element, attribute: "AXIdentifier") + let rect = rectAttribute(element) + let enabled = boolAttribute(element, attribute: kAXEnabledAttribute as String) + let selected = boolAttribute(element, attribute: kAXSelectedAttribute as String) + let type = normalizedSnapshotType(role: role, subrole: subrole) + let windowTitle = context.windowTitle ?? inferWindowTitle(for: element) + + let index = state.nodes.count + state.nodes.append( + SnapshotNodeResponse( + index: index, + type: type, + role: role, + subrole: subrole, + label: title ?? description ?? value, + value: value, + identifier: identifier, + rect: rect, + enabled: enabled, + selected: selected, + hittable: (enabled ?? 
true) && rect != nil, + depth: depth, + parentIndex: parentIndex, + pid: context.pid, + bundleId: context.bundleId, + appName: context.appName, + windowTitle: windowTitle, + surface: context.surface + ) + ) + + guard depth < maxDepth, !state.truncated else { + return index + } + + for child in children(of: element) { + if state.truncated { + break + } + _ = appendElementSnapshot( + child, + depth: depth + 1, + parentIndex: index, + context: SnapshotContext( + surface: context.surface, + pid: context.pid, + bundleId: context.bundleId, + appName: context.appName, + windowTitle: windowTitle + ), + state: &state, + maxDepth: maxDepth + ) + } + + return index +} + +private func reserveSnapshotNodeCapacity(_ state: inout SnapshotTraversalState) -> Bool { + if state.nodes.count >= SnapshotTraversalLimits.maxNodes { + state.truncated = true + return false + } + return true +} + +private func normalizedSnapshotType(role: String?, subrole: String?) -> String? { + switch role { + case "AXApplication": + return "Application" + case "AXWindow": + return subrole == "AXStandardWindow" ? "Window" : (subrole ?? 
"Window") + case "AXSheet": + return "Sheet" + case "AXDialog": + return "Dialog" + case "AXButton": + return "Button" + case "AXStaticText": + return "StaticText" + case "AXTextField": + return "TextField" + case "AXTextArea": + return "TextArea" + case "AXScrollArea": + return "ScrollArea" + case "AXGroup": + return "Group" + case "AXMenuBar": + return "MenuBar" + case "AXMenuBarItem": + return "MenuBarItem" + case "AXMenu": + return "Menu" + case "AXMenuItem": + return "MenuItem" + default: + if let subrole, !subrole.isEmpty { + return subrole + } + return role + } +} + +private func isVisibleSnapshotWindow(_ window: AXUIElement) -> Bool { + guard let rect = rectAttribute(window) else { + return false + } + if rect.width <= 0 || rect.height <= 0 { + return false + } + if boolAttribute(window, attribute: kAXMinimizedAttribute as String) == true { + return false + } + return true +} + +private func inferWindowTitle(for element: AXUIElement) -> String? { + if let title = stringAttribute(element, attribute: kAXTitleAttribute as String) { + return title + } + if let window = elementAttribute(element, attribute: kAXWindowAttribute as String) { + return stringAttribute(window, attribute: kAXTitleAttribute as String) + } + return nil +} + +func stringAttribute(_ element: AXUIElement, attribute: String) -> String? { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success else { + return nil + } + if let text = value as? String { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? nil : trimmed + } + return nil +} + +func boolAttribute(_ element: AXUIElement, attribute: String) -> Bool? { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, + let number = value as? NSNumber + else { + return nil + } + return number.boolValue +} + +func elementAttribute(_ element: AXUIElement, attribute: String) -> AXUIElement? 
{ + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, + let value, + CFGetTypeID(value) == AXUIElementGetTypeID() + else { + return nil + } + return (value as! AXUIElement) +} + +func rectAttribute(_ element: AXUIElement) -> RectResponse? { + var positionValue: CFTypeRef? + var sizeValue: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionValue) == .success, + AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeValue) == .success, + let positionAxValue = accessibilityAxValue(positionValue), + let sizeAxValue = accessibilityAxValue(sizeValue) + else { + return nil + } + + var position = CGPoint.zero + var size = CGSize.zero + guard AXValueGetType(positionAxValue) == .cgPoint, + AXValueGetValue(positionAxValue, .cgPoint, &position), + AXValueGetType(sizeAxValue) == .cgSize, + AXValueGetValue(sizeAxValue, .cgSize, &size) + else { + return nil + } + + return RectResponse( + x: Double(position.x), + y: Double(position.y), + width: Double(size.width), + height: Double(size.height) + ) +} + +private func accessibilityAxValue(_ value: CFTypeRef?) -> AXValue? { + guard let value, CFGetTypeID(value) == AXValueGetTypeID() else { + return nil + } + return (value as! AXValue) +} + +func children(of element: AXUIElement) -> [AXUIElement] { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success, + let children = value as? [AXUIElement] + else { + return [] + } + return children +} + +func windows(of appElement: AXUIElement) -> [AXUIElement] { + var value: CFTypeRef? + guard AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &value) == .success, + let windows = value as? 
[AXUIElement] + else { + return [] + } + return windows +} diff --git a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift index 2de8d986..90bd4272 100644 --- a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift +++ b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift @@ -53,42 +53,6 @@ struct AlertResponse: Encodable { let bundleId: String? } -struct RectResponse: Encodable { - let x: Double - let y: Double - let width: Double - let height: Double -} - -struct SnapshotNodeResponse: Encodable { - let index: Int - let type: String? - let role: String? - let subrole: String? - let label: String? - let value: String? - let identifier: String? - let rect: RectResponse? - let enabled: Bool? - let selected: Bool? - let hittable: Bool? - let depth: Int - let parentIndex: Int? - let pid: Int32? - let bundleId: String? - let appName: String? - let windowTitle: String? - let surface: String? -} - -struct SnapshotResponse: Encodable { - let surface: String - let nodes: [SnapshotNodeResponse] - let truncated = false - let backend = "macos-helper" -} - -@main struct AgentDeviceMacOSHelper { static func main() { do { @@ -293,7 +257,7 @@ struct AgentDeviceMacOSHelper { } let bundleId = optionValue(arguments: Array(arguments.dropFirst()), name: "--bundle-id") let surface = optionValue(arguments: Array(arguments.dropFirst()), name: "--surface") - let app = try resolveAlertApplication(bundleId: bundleId, surface: surface) + let app = try resolveTargetApplication(bundleId: bundleId, surface: surface) guard let alertElement = findAlertElement(appElement: AXUIElementCreateApplication(app.processIdentifier)) else { throw HelperError.commandFailed( "alert not found", @@ -347,12 +311,9 @@ struct AgentDeviceMacOSHelper { switch surface { case "frontmost-app": - let app = try resolveAlertApplication(bundleId: nil, surface: surface) - return SuccessEnvelope(data: SnapshotResponse(surface: surface, nodes: 
snapshotFrontmostApp(app))) - case "desktop": - return SuccessEnvelope(data: SnapshotResponse(surface: surface, nodes: snapshotDesktop())) - case "menubar": - return SuccessEnvelope(data: SnapshotResponse(surface: surface, nodes: snapshotMenuBar())) + return SuccessEnvelope(data: try captureSnapshotResponse(surface: surface)) + case "desktop", "menubar": + return SuccessEnvelope(data: try captureSnapshotResponse(surface: surface)) default: throw HelperError.invalidArgs("snapshot requires --surface ") } @@ -386,7 +347,7 @@ private func writeJSON(_ value: T) throws { FileHandle.standardOutput.write(Data([0x0A])) } -private func resolveAlertApplication(bundleId: String?, surface: String?) throws -> NSRunningApplication { +func resolveTargetApplication(bundleId: String?, surface: String?) throws -> NSRunningApplication { let normalizedSurface = surface?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() if normalizedSurface == "desktop" || normalizedSurface == "menubar" { throw HelperError.commandFailed( @@ -413,335 +374,6 @@ private func resolveAlertApplication(bundleId: String?, surface: String?) throws throw HelperError.commandFailed("unable to resolve target app") } -private struct SnapshotContext { - let surface: String - let pid: Int32? - let bundleId: String? - let appName: String? - let windowTitle: String? 
-} - -private func snapshotFrontmostApp(_ app: NSRunningApplication) -> [SnapshotNodeResponse] { - let appElement = AXUIElementCreateApplication(app.processIdentifier) - var nodes: [SnapshotNodeResponse] = [] - var visited = Set() - appendElementSnapshot( - appElement, - depth: 0, - parentIndex: nil, - context: SnapshotContext( - surface: "frontmost-app", - pid: Int32(app.processIdentifier), - bundleId: app.bundleIdentifier, - appName: app.localizedName, - windowTitle: nil - ), - nodes: &nodes, - visited: &visited - ) - return nodes -} - -private func snapshotDesktop() -> [SnapshotNodeResponse] { - var nodes: [SnapshotNodeResponse] = [] - let rootIndex = appendSyntheticSnapshotNode( - into: &nodes, - type: "DesktopSurface", - label: "Desktop", - depth: 0, - parentIndex: nil, - surface: "desktop" - ) - - var runningApps = NSWorkspace.shared.runningApplications.filter { app in - app.activationPolicy != .prohibited - && !app.isTerminated - && (app.bundleIdentifier?.isEmpty == false || app.localizedName?.isEmpty == false) - } - runningApps.sort { left, right in - if left.isActive != right.isActive { - return left.isActive && !right.isActive - } - return (left.localizedName ?? "") < (right.localizedName ?? "") - } - - for app in runningApps { - let appElement = AXUIElementCreateApplication(app.processIdentifier) - let visibleWindows = windows(of: appElement).filter(isVisibleSnapshotWindow) - if visibleWindows.isEmpty { - continue - } - let appIndex = appendSyntheticSnapshotNode( - into: &nodes, - type: "Application", - label: app.localizedName ?? app.bundleIdentifier ?? 
"Application", - depth: 1, - parentIndex: rootIndex, - surface: "desktop", - identifier: app.bundleIdentifier, - pid: Int32(app.processIdentifier), - bundleId: app.bundleIdentifier, - appName: app.localizedName - ) - var visited = Set() - for window in visibleWindows { - let windowTitle = stringAttribute(window, attribute: kAXTitleAttribute as String) - appendElementSnapshot( - window, - depth: 2, - parentIndex: appIndex, - context: SnapshotContext( - surface: "desktop", - pid: Int32(app.processIdentifier), - bundleId: app.bundleIdentifier, - appName: app.localizedName, - windowTitle: windowTitle - ), - nodes: &nodes, - visited: &visited - ) - } - } - - return nodes -} - -private func snapshotMenuBar() -> [SnapshotNodeResponse] { - var nodes: [SnapshotNodeResponse] = [] - let rootIndex = appendSyntheticSnapshotNode( - into: &nodes, - type: "MenuBarSurface", - label: "Menu Bar", - depth: 0, - parentIndex: nil, - surface: "menubar" - ) - - if let frontmost = NSWorkspace.shared.frontmostApplication { - let frontmostElement = AXUIElementCreateApplication(frontmost.processIdentifier) - if let menuBar = elementAttribute(frontmostElement, attribute: kAXMenuBarAttribute as String) { - var frontmostVisited = Set() - appendElementSnapshot( - menuBar, - depth: 1, - parentIndex: rootIndex, - context: SnapshotContext( - surface: "menubar", - pid: Int32(frontmost.processIdentifier), - bundleId: frontmost.bundleIdentifier, - appName: frontmost.localizedName, - windowTitle: frontmost.localizedName - ), - nodes: &nodes, - visited: &frontmostVisited - ) - } - } - - if let systemUiServer = NSRunningApplication.runningApplications( - withBundleIdentifier: "com.apple.systemuiserver" - ).first { - let systemUiElement = AXUIElementCreateApplication(systemUiServer.processIdentifier) - if let menuExtras = elementAttribute(systemUiElement, attribute: kAXMenuBarAttribute as String) { - var systemUiVisited = Set() - appendElementSnapshot( - menuExtras, - depth: 1, - parentIndex: rootIndex, - 
context: SnapshotContext( - surface: "menubar", - pid: Int32(systemUiServer.processIdentifier), - bundleId: systemUiServer.bundleIdentifier, - appName: systemUiServer.localizedName, - windowTitle: "System Menu Extras" - ), - nodes: &nodes, - visited: &systemUiVisited - ) - } - } - - return nodes -} - -@discardableResult -private func appendSyntheticSnapshotNode( - into nodes: inout [SnapshotNodeResponse], - type: String, - label: String, - depth: Int, - parentIndex: Int?, - surface: String, - identifier: String? = nil, - pid: Int32? = nil, - bundleId: String? = nil, - appName: String? = nil, - windowTitle: String? = nil -) -> Int { - let index = nodes.count - nodes.append( - SnapshotNodeResponse( - index: index, - type: type, - role: type, - subrole: nil, - label: label, - value: nil, - identifier: identifier ?? "surface:\(surface):\(type.lowercased())", - rect: nil, - enabled: true, - selected: nil, - hittable: false, - depth: depth, - parentIndex: parentIndex, - pid: pid, - bundleId: bundleId, - appName: appName, - windowTitle: windowTitle, - surface: surface - ) - ) - return index -} - -@discardableResult -private func appendElementSnapshot( - _ element: AXUIElement, - depth: Int, - parentIndex: Int?, - context: SnapshotContext, - nodes: inout [SnapshotNodeResponse], - visited: inout Set, - maxDepth: Int = 12 -) -> Int { - let elementHash = CFHash(element) - if visited.contains(elementHash) { - return parentIndex ?? 
0 - } - visited.insert(elementHash) - - let role = stringAttribute(element, attribute: kAXRoleAttribute as String) - let subrole = stringAttribute(element, attribute: kAXSubroleAttribute as String) - let title = stringAttribute(element, attribute: kAXTitleAttribute as String) - let description = stringAttribute(element, attribute: kAXDescriptionAttribute as String) - let value = stringAttribute(element, attribute: kAXValueAttribute as String) - let identifier = stringAttribute(element, attribute: "AXIdentifier") - let rect = rectAttribute(element) - let enabled = boolAttribute(element, attribute: kAXEnabledAttribute as String) - let selected = boolAttribute(element, attribute: kAXSelectedAttribute as String) - let type = normalizedSnapshotType(role: role, subrole: subrole) - let windowTitle = context.windowTitle ?? inferWindowTitle(for: element) - - let index = nodes.count - nodes.append( - SnapshotNodeResponse( - index: index, - type: type, - role: role, - subrole: subrole, - label: title ?? description ?? value, - value: value, - identifier: identifier, - rect: rect, - enabled: enabled, - selected: selected, - hittable: (enabled ?? true) && rect != nil, - depth: depth, - parentIndex: parentIndex, - pid: context.pid, - bundleId: context.bundleId, - appName: context.appName, - windowTitle: windowTitle, - surface: context.surface - ) - ) - - guard depth < maxDepth else { - return index - } - - for child in children(of: element) { - appendElementSnapshot( - child, - depth: depth + 1, - parentIndex: index, - context: SnapshotContext( - surface: context.surface, - pid: context.pid, - bundleId: context.bundleId, - appName: context.appName, - windowTitle: windowTitle - ), - nodes: &nodes, - visited: &visited, - maxDepth: maxDepth - ) - } - - return index -} - -private func normalizedSnapshotType(role: String?, subrole: String?) -> String? { - switch role { - case "AXApplication": - return "Application" - case "AXWindow": - return subrole == "AXStandardWindow" ? 
"Window" : (subrole ?? "Window") - case "AXSheet": - return "Sheet" - case "AXDialog": - return "Dialog" - case "AXButton": - return "Button" - case "AXStaticText": - return "StaticText" - case "AXTextField": - return "TextField" - case "AXTextArea": - return "TextArea" - case "AXScrollArea": - return "ScrollArea" - case "AXGroup": - return "Group" - case "AXMenuBar": - return "MenuBar" - case "AXMenuBarItem": - return "MenuBarItem" - case "AXMenu": - return "Menu" - case "AXMenuItem": - return "MenuItem" - default: - if let subrole, !subrole.isEmpty { - return subrole - } - return role - } -} - -private func isVisibleSnapshotWindow(_ window: AXUIElement) -> Bool { - guard let rect = rectAttribute(window) else { - return false - } - if rect.width <= 0 || rect.height <= 0 { - return false - } - if boolAttribute(window, attribute: kAXMinimizedAttribute as String) == true { - return false - } - return true -} - -private func inferWindowTitle(for element: AXUIElement) -> String? { - if let title = stringAttribute(element, attribute: kAXTitleAttribute as String) { - return title - } - if let window = elementAttribute(element, attribute: kAXWindowAttribute as String) { - return stringAttribute(window, attribute: kAXTitleAttribute as String) - } - return nil -} - private func validatedBundleId(_ rawBundleId: String) throws -> String { let bundleId = rawBundleId.trimmingCharacters(in: .whitespacesAndNewlines) let pattern = #"^[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)+$"# @@ -751,88 +383,6 @@ private func validatedBundleId(_ rawBundleId: String) throws -> String { return bundleId } -private func stringAttribute(_ element: AXUIElement, attribute: String) -> String? { - var value: CFTypeRef? - guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success else { - return nil - } - if let text = value as? String { - let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) - return trimmed.isEmpty ? 
nil : trimmed - } - return nil -} - -private func boolAttribute(_ element: AXUIElement, attribute: String) -> Bool? { - var value: CFTypeRef? - guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success, - let number = value as? NSNumber - else { - return nil - } - return number.boolValue -} - -private func elementAttribute(_ element: AXUIElement, attribute: String) -> AXUIElement? { - var value: CFTypeRef? - guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success else { - return nil - } - guard let value else { - return nil - } - return unsafeBitCast(value, to: AXUIElement.self) -} - -private func rectAttribute(_ element: AXUIElement) -> RectResponse? { - var positionValue: CFTypeRef? - var sizeValue: CFTypeRef? - guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionValue) == .success, - AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeValue) == .success, - let axPosition = positionValue, - let axSize = sizeValue - else { - return nil - } - - var position = CGPoint.zero - var size = CGSize.zero - guard AXValueGetType(axPosition as! AXValue) == .cgPoint, - AXValueGetValue(axPosition as! AXValue, .cgPoint, &position), - AXValueGetType(axSize as! AXValue) == .cgSize, - AXValueGetValue(axSize as! AXValue, .cgSize, &size) - else { - return nil - } - - return RectResponse( - x: Double(position.x), - y: Double(position.y), - width: Double(size.width), - height: Double(size.height) - ) -} - -private func children(of element: AXUIElement) -> [AXUIElement] { - var value: CFTypeRef? - guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success, - let children = value as? [AXUIElement] - else { - return [] - } - return children -} - -private func windows(of appElement: AXUIElement) -> [AXUIElement] { - var value: CFTypeRef? 
- guard AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &value) == .success, - let windows = value as? [AXUIElement] - else { - return [] - } - return windows -} - private func findAlertElement(appElement: AXUIElement) -> AXUIElement? { for window in windows(of: appElement) { if let role = stringAttribute(window, attribute: kAXRoleAttribute as String), @@ -916,3 +466,5 @@ private func resolveAlertActionButton(root: AXUIElement, buttons: [AXUIElement], return action == "accept" ? buttons.first : buttons.last } + +AgentDeviceMacOSHelper.main() diff --git a/src/daemon/handlers/__tests__/interaction.test.ts b/src/daemon/handlers/__tests__/interaction.test.ts index 311c98b5..29302e72 100644 --- a/src/daemon/handlers/__tests__/interaction.test.ts +++ b/src/daemon/handlers/__tests__/interaction.test.ts @@ -63,6 +63,13 @@ function makeMacOsDesktopSession(name: string): SessionState { }; } +function makeMacOsMenubarSession(name: string): SessionState { + return { + ...makeMacOsDesktopSession(name), + surface: 'menubar', + }; +} + const contextFromFlags = (flags: CommandFlags | undefined) => ({ count: flags?.count, intervalMs: flags?.intervalMs, @@ -166,6 +173,34 @@ test('click rejects macOS desktop surface interactions until helper routing exis } }); +test('fill rejects macOS menubar surface interactions until helper routing exists', async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-menubar-fill'; + sessionStore.set(sessionName, makeMacOsMenubarSession(sessionName)); + + const response = await handleInteractionCommands({ + req: { + token: 't', + session: sessionName, + command: 'fill', + positionals: ['@e2', 'hello'], + flags: {}, + }, + sessionName, + sessionStore, + contextFromFlags, + dispatch: async () => { + throw new Error('dispatch should not be called'); + }, + }); + + assert.equal(response?.ok, false); + if (response && !response.ok) { + assert.equal(response.error.code, 'UNSUPPORTED_OPERATION'); + 
assert.match(response.error.message, /macOS menubar sessions/); + } +}); + test('press coordinates appends touch-visualization events while recording', async () => { const sessionStore = makeSessionStore(); const sessionName = 'default'; diff --git a/src/daemon/handlers/__tests__/snapshot-handler.test.ts b/src/daemon/handlers/__tests__/snapshot-handler.test.ts index aff505e5..21344b0b 100644 --- a/src/daemon/handlers/__tests__/snapshot-handler.test.ts +++ b/src/daemon/handlers/__tests__/snapshot-handler.test.ts @@ -252,6 +252,55 @@ test('snapshot on macOS desktop surface applies scope and depth after helper cap ); }); +test('snapshot on macOS menubar surface uses helper-backed surface snapshot', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"menubar","nodes":[{"index":0,"depth":0,"type":"MenuBarSurface","label":"Menu Bar","surface":"menubar"},{"index":1,"depth":1,"parentIndex":0,"type":"MenuBarItem","label":"File","surface":"menubar"}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-menubar-snapshot'; + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-menubar-snapshot-')); + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'menubar', + }); + + try { + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const logged = await fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 
'snapshot\n--surface\nmenubar\n'); + assert.equal(sessionStore.get(sessionName)?.snapshot?.nodes[1]?.label, 'File'); + } finally { + if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, + ); +}); + test('wait text on macOS desktop surface polls helper-backed snapshots instead of runner text search', async () => { await withMockedMacOsHelper( [ diff --git a/src/daemon/handlers/find.ts b/src/daemon/handlers/find.ts index a428bda4..7661b474 100644 --- a/src/daemon/handlers/find.ts +++ b/src/daemon/handlers/find.ts @@ -66,14 +66,12 @@ export async function handleFindCommands(params: { dispatchSnapshotCommand: dispatch, device, session, - req: { - ...req, - flags: { - ...req.flags, - snapshotInteractiveOnly: interactiveOnly, - snapshotCompact: interactiveOnly, - }, + flags: { + ...req.flags, + snapshotInteractiveOnly: interactiveOnly, + snapshotCompact: interactiveOnly, }, + outPath: req.flags?.out, logPath, snapshotScope: scope, }); diff --git a/src/daemon/handlers/interaction-snapshot.ts b/src/daemon/handlers/interaction-snapshot.ts index d4fc9a1b..923fb686 100644 --- a/src/daemon/handlers/interaction-snapshot.ts +++ b/src/daemon/handlers/interaction-snapshot.ts @@ -27,13 +27,8 @@ export async function captureSnapshotForSession( dispatchSnapshotCommand: dispatch, device: session.device, session, - req: { - token: '', - session: session.name, - command: 'snapshot', - positionals: [], - flags: effectiveFlags, - }, + flags: effectiveFlags, + outPath: effectiveFlags.out, logPath: dispatchContext.logPath ?? 
'', }); session.snapshot = snapshot; diff --git a/src/daemon/handlers/snapshot-capture.ts b/src/daemon/handlers/snapshot-capture.ts index f16ef7f6..2f7b703d 100644 --- a/src/daemon/handlers/snapshot-capture.ts +++ b/src/daemon/handlers/snapshot-capture.ts @@ -1,4 +1,4 @@ -import { dispatchCommand } from '../../core/dispatch.ts'; +import { dispatchCommand, type CommandFlags } from '../../core/dispatch.ts'; import { runMacOsSnapshotAction } from '../../platforms/ios/macos-helper.ts'; import { attachRefs, @@ -7,7 +7,7 @@ import { type RawSnapshotNode, type SnapshotState, } from '../../utils/snapshot.ts'; -import type { DaemonResponse, DaemonRequest, SessionState } from '../types.ts'; +import type { DaemonResponse, SessionState } from '../types.ts'; import { contextFromFlags } from '../context.ts'; import { findNodeByLabel, pruneGroupNodes, resolveRefLabel } from '../snapshot-processing.ts'; @@ -15,7 +15,8 @@ type CaptureSnapshotParams = { dispatchSnapshotCommand: typeof dispatchCommand; device: SessionState['device']; session: SessionState | undefined; - req: DaemonRequest; + flags: CommandFlags | undefined; + outPath?: string; logPath: string; snapshotScope?: string; }; @@ -29,25 +30,25 @@ type SnapshotData = { export async function captureSnapshot( params: CaptureSnapshotParams, ): Promise<{ snapshot: SnapshotState }> { - const { req } = params; const data = await captureSnapshotData(params); - return { snapshot: buildSnapshotState(data, req.flags?.snapshotRaw) }; + return { snapshot: buildSnapshotState(data, params.flags?.snapshotRaw) }; } export async function captureSnapshotData(params: CaptureSnapshotParams): Promise { - const { dispatchSnapshotCommand, device, session, req, logPath, snapshotScope } = params; + const { dispatchSnapshotCommand, device, session, flags, outPath, logPath, snapshotScope } = + params; if (device.platform === 'macos' && session?.surface && session.surface !== 'app') { const helperSnapshot = await 
runMacOsSnapshotAction(session.surface); return shapeMacOsSurfaceSnapshot(helperSnapshot, { - snapshotDepth: req.flags?.snapshotDepth, - snapshotInteractiveOnly: req.flags?.snapshotInteractiveOnly, + snapshotDepth: flags?.snapshotDepth, + snapshotInteractiveOnly: flags?.snapshotInteractiveOnly, snapshotScope, }); } - return (await dispatchSnapshotCommand(device, 'snapshot', [], req.flags?.out, { + return (await dispatchSnapshotCommand(device, 'snapshot', [], outPath, { ...contextFromFlags( logPath, - { ...req.flags, snapshotScope }, + { ...flags, snapshotScope }, session?.appBundleId, session?.trace?.outPath, ), diff --git a/src/daemon/handlers/snapshot-wait.ts b/src/daemon/handlers/snapshot-wait.ts index 6fdaf802..b21edf4e 100644 --- a/src/daemon/handlers/snapshot-wait.ts +++ b/src/daemon/handlers/snapshot-wait.ts @@ -147,40 +147,23 @@ async function waitForSelector(params: { const timeout = parsed.timeoutMs ?? DEFAULT_TIMEOUT_MS; const start = Date.now(); while (Date.now() - start < timeout) { - const { snapshot } = await captureSnapshot({ + const snapshot = await captureWaitSnapshot({ dispatchSnapshotCommand, device, - session, - req: { - ...req, - flags: { - ...req.flags, - snapshotInteractiveOnly: false, - snapshotCompact: false, - }, - }, logPath, + req, + session, + sessionName, + sessionStore, }); - const nodes = snapshot.nodes; - if (session) { - session.snapshot = snapshot; - sessionStore.set(sessionName, session); - } - const match = findSelectorChainMatch(nodes, parsed.selector, { + const match = findSelectorChainMatch(snapshot.nodes, parsed.selector, { platform: device.platform, }); if (match) { - recordIfSession(sessionStore, session, req, { + return waitSuccess(sessionStore, session, req, { selector: match.selector.raw, waitedMs: Date.now() - start, }); - return { - ok: true, - data: { - selector: match.selector.raw, - waitedMs: Date.now() - start, - }, - }; } await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS)); } @@ -262,27 +245,20 
@@ async function waitForText(params: { const start = Date.now(); while (Date.now() - start < timeout) { if (device.platform === 'macos' && session?.surface && session.surface !== 'app') { - const { snapshot } = await captureSnapshot({ + const snapshot = await captureWaitSnapshot({ dispatchSnapshotCommand: params.dispatchSnapshotCommand, device, - session, - req: { - ...req, - flags: { - ...req.flags, - snapshotInteractiveOnly: false, - snapshotCompact: false, - }, - }, logPath, + req, + session, + sessionName: session?.name ?? req.session ?? 'default', + sessionStore, }); - if (session) { - session.snapshot = snapshot; - sessionStore.set(session.name, session); - } if (findNodeByLabel(snapshot.nodes, text)) { - recordIfSession(sessionStore, session, req, { text, waitedMs: Date.now() - start }); - return { ok: true, data: { text, waitedMs: Date.now() - start } }; + return waitSuccess(sessionStore, session, req, { + text, + waitedMs: Date.now() - start, + }); } } else if (isApplePlatform(device.platform)) { const result = (await runnerCommand( @@ -313,3 +289,43 @@ async function waitForText(params: { error: { code: 'COMMAND_FAILED', message: `wait timed out for text: ${text}` }, }; } + +async function captureWaitSnapshot(params: { + dispatchSnapshotCommand: typeof dispatchCommand; + device: SessionState['device']; + logPath: string; + req: DaemonRequest; + session: SessionState | undefined; + sessionName: string; + sessionStore: SessionStore; +}): Promise { + const { dispatchSnapshotCommand, device, logPath, req, session, sessionName, sessionStore } = + params; + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand, + device, + session, + flags: { + ...req.flags, + snapshotInteractiveOnly: false, + snapshotCompact: false, + }, + outPath: req.flags?.out, + logPath, + }); + if (session) { + session.snapshot = snapshot; + sessionStore.set(sessionName, session); + } + return snapshot; +} + +function waitSuccess( + sessionStore: SessionStore, + session: 
SessionState | undefined, + req: DaemonRequest, + data: Record, +): DaemonResponse { + recordIfSession(sessionStore, session, req, data); + return { ok: true, data }; +} diff --git a/src/daemon/handlers/snapshot.ts b/src/daemon/handlers/snapshot.ts index bb0de84d..75dd77a2 100644 --- a/src/daemon/handlers/snapshot.ts +++ b/src/daemon/handlers/snapshot.ts @@ -51,7 +51,8 @@ export async function handleSnapshotCommands(params: { dispatchSnapshotCommand, device, session, - req, + flags: req.flags, + outPath: req.flags?.out, logPath, snapshotScope: resolvedScope.scope, }); @@ -110,7 +111,8 @@ export async function handleSnapshotCommands(params: { dispatchSnapshotCommand, device, session, - req, + flags: req.flags, + outPath: req.flags?.out, logPath, snapshotScope: resolvedScope.scope, }); diff --git a/src/daemon/snapshot-processing.ts b/src/daemon/snapshot-processing.ts index 29a0f659..a74f062e 100644 --- a/src/daemon/snapshot-processing.ts +++ b/src/daemon/snapshot-processing.ts @@ -79,11 +79,11 @@ export function pruneGroupNodes(nodes: RawSnapshotNode[]): RawSnapshotNode[] { } export function normalizeType(type: string): string { - let value = type - .trim() - .replace(/XCUIElementType/gi, '') - .replace(/^AX/gi, '') - .toLowerCase(); + let value = type.trim().replace(/XCUIElementType/gi, ''); + if (value.startsWith('AX')) { + value = value.slice(2); + } + value = value.toLowerCase(); const lastSeparator = Math.max(value.lastIndexOf('.'), value.lastIndexOf('/')); if (lastSeparator !== -1) { value = value.slice(lastSeparator + 1); diff --git a/src/platforms/ios/macos-helper.ts b/src/platforms/ios/macos-helper.ts index 51270b36..fa1bf68e 100644 --- a/src/platforms/ios/macos-helper.ts +++ b/src/platforms/ios/macos-helper.ts @@ -9,6 +9,8 @@ import type { SessionSurface } from '../../core/session-surface.ts'; export type MacOsPermissionTarget = 'accessibility' | 'screen-recording' | 'input-monitoring'; +// Keep this shape aligned with macOS helper SnapshotNodeResponse in 
+// macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift. export type MacOsSnapshotNode = { index: number; type?: string; @@ -279,7 +281,7 @@ export async function runMacOsAlertAction( export async function runMacOsSnapshotAction(surface: Exclude): Promise<{ surface: Exclude; nodes: MacOsSnapshotNode[]; - truncated: false; + truncated: boolean; backend: 'macos-helper'; }> { return await runMacOsHelper(['snapshot', '--surface', surface]); From 45557d4bdf8785d971055c5485a331d918d18fc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Wed, 25 Mar 2026 14:51:08 +0100 Subject: [PATCH 4/4] fix: scope frontmost macos snapshots to visible windows --- .../SnapshotTraversal.swift | 117 ++++++++++-------- .../__tests__/snapshot-handler.test.ts | 55 ++++++++ 2 files changed, 122 insertions(+), 50 deletions(-) diff --git a/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift b/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift index 4aa82332..58006de9 100644 --- a/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift +++ b/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift @@ -80,19 +80,12 @@ func captureSnapshotResponse(surface: String) throws -> SnapshotResponse { private func snapshotFrontmostApp() throws -> SnapshotBuildResult { let app = try resolveTargetApplication(bundleId: nil, surface: "frontmost-app") - let appElement = AXUIElementCreateApplication(app.processIdentifier) var state = SnapshotTraversalState() - _ = appendElementSnapshot( - appElement, + _ = appendApplicationSnapshot( + app, depth: 0, parentIndex: nil, - context: SnapshotContext( - surface: "frontmost-app", - pid: Int32(app.processIdentifier), - bundleId: app.bundleIdentifier, - appName: app.localizedName, - windowTitle: nil - ), + surface: "frontmost-app", state: &state ) return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated) @@ -135,54 +128,78 @@ private func snapshotDesktop() 
-> SnapshotBuildResult { break } - let appElement = AXUIElementCreateApplication(app.processIdentifier) - let visibleWindows = windows(of: appElement).filter(isVisibleSnapshotWindow) - if visibleWindows.isEmpty { - continue - } - - guard - let appIndex = appendSyntheticSnapshotNode( - into: &state, - type: "Application", - label: app.localizedName ?? app.bundleIdentifier ?? "Application", - depth: 1, - parentIndex: rootIndex, - surface: "desktop", - identifier: app.bundleIdentifier, - pid: Int32(app.processIdentifier), - bundleId: app.bundleIdentifier, - appName: app.localizedName - ) - else { + let included = appendApplicationSnapshot( + app, + depth: 1, + parentIndex: rootIndex, + surface: "desktop", + state: &state + ) + if state.truncated { break } - - includedApps += 1 - for window in visibleWindows { - if state.truncated { - break - } - let windowTitle = stringAttribute(window, attribute: kAXTitleAttribute as String) - _ = appendElementSnapshot( - window, - depth: 2, - parentIndex: appIndex, - context: SnapshotContext( - surface: "desktop", - pid: Int32(app.processIdentifier), - bundleId: app.bundleIdentifier, - appName: app.localizedName, - windowTitle: windowTitle - ), - state: &state - ) + if included { + includedApps += 1 } } return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated) } +@discardableResult +private func appendApplicationSnapshot( + _ app: NSRunningApplication, + depth: Int, + parentIndex: Int?, + surface: String, + state: inout SnapshotTraversalState +) -> Bool { + let appElement = AXUIElementCreateApplication(app.processIdentifier) + let visibleWindows = windows(of: appElement).filter(isVisibleSnapshotWindow) + if visibleWindows.isEmpty { + return false + } + + guard + let appIndex = appendSyntheticSnapshotNode( + into: &state, + type: "Application", + label: app.localizedName ?? app.bundleIdentifier ?? 
"Application", + depth: depth, + parentIndex: parentIndex, + surface: surface, + identifier: app.bundleIdentifier, + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName + ) + else { + return false + } + + for window in visibleWindows { + if state.truncated { + break + } + let windowTitle = stringAttribute(window, attribute: kAXTitleAttribute as String) + _ = appendElementSnapshot( + window, + depth: depth + 1, + parentIndex: appIndex, + context: SnapshotContext( + surface: surface, + pid: Int32(app.processIdentifier), + bundleId: app.bundleIdentifier, + appName: app.localizedName, + windowTitle: windowTitle + ), + state: &state + ) + } + + return true +} + private func snapshotMenuBar() -> SnapshotBuildResult { var state = SnapshotTraversalState() guard diff --git a/src/daemon/handlers/__tests__/snapshot-handler.test.ts b/src/daemon/handlers/__tests__/snapshot-handler.test.ts index 21344b0b..a53f2cf4 100644 --- a/src/daemon/handlers/__tests__/snapshot-handler.test.ts +++ b/src/daemon/handlers/__tests__/snapshot-handler.test.ts @@ -301,6 +301,61 @@ test('snapshot on macOS menubar surface uses helper-backed surface snapshot', as ); }); +test('snapshot on macOS frontmost-app surface uses helper-backed surface snapshot', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"frontmost-app","nodes":[{"index":0,"depth":0,"type":"Application","label":"TextEdit","surface":"frontmost-app","bundleId":"com.apple.TextEdit","appName":"TextEdit"},{"index":1,"depth":1,"parentIndex":0,"type":"Window","label":"Untitled","surface":"frontmost-app","windowTitle":"Untitled","rect":{"x":32,"y":48,"width":640,"height":480}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-frontmost-app-snapshot'; + const 
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-frontmost-snapshot-')); + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'frontmost-app', + appBundleId: 'com.apple.systempreferences', + appName: 'System Settings', + }); + + try { + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const logged = await fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 'snapshot\n--surface\nfrontmost-app\n'); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + assert.equal(updated?.snapshot?.nodes[0]?.label, 'TextEdit'); + assert.equal(updated?.snapshot?.nodes[1]?.parentIndex, 0); + assert.equal(updated?.snapshot?.nodes[1]?.windowTitle, 'Untitled'); + } finally { + if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, + ); +}); + test('wait text on macOS desktop surface polls helper-backed snapshots instead of runner text search', async () => { await withMockedMacOsHelper( [