diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 00000000..f0083ff5 --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,55 @@ +name: macOS + +on: + pull_request: + push: + branches: + - main + +permissions: + contents: read + +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + integration-macos: + name: Integration Tests + runs-on: macos-26 + timeout-minutes: 80 + continue-on-error: true + env: + AGENT_DEVICE_DAEMON_TIMEOUT_MS: '300000' + AGENT_DEVICE_IOS_APP_LAUNCH_TIMEOUT_MS: '60000' + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Setup toolchain + uses: ./.github/actions/setup-node-pnpm + + - name: Resolve agent-device home + id: macos-agent-home + run: echo "dir=$HOME/.agent-device" >> "$GITHUB_OUTPUT" + + - name: Build macOS XCTest runner + run: pnpm build:xcuitest:macos + + - name: Build macOS helper + run: pnpm build:macos-helper + + - name: Run macOS integration test + run: node --test test/integration/macos.test.ts + + - name: Upload macOS artifacts + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: macos-artifacts + if-no-files-found: ignore + path: | + ${{ steps.macos-agent-home.outputs.dir }}/daemon.log + ${{ steps.macos-agent-home.outputs.dir }}/sessions/** + test/artifacts/** + test/screenshots/** diff --git a/README.md b/README.md index e3afc7d6..f9835ec5 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ For agents: npm install -g agent-device ``` -On macOS, `agent-device` now includes a local `agent-device-macos-helper` source package that is built on demand for desktop permission checks, alert handling, and other host-Mac support paths. Release distribution should use a signed/notarized helper build; source checkouts fall back to a local Swift build. 
+On macOS, `agent-device` includes a local `agent-device-macos-helper` source package that is built on demand for desktop permission checks, alert handling, and helper-backed desktop snapshot surfaces. Release distribution should use a signed/notarized helper build; source checkouts fall back to a local Swift build. ## Contributing diff --git a/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift b/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift new file mode 100644 index 00000000..58006de9 --- /dev/null +++ b/macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift @@ -0,0 +1,543 @@ +import AppKit +import ApplicationServices +import Foundation + +private enum SnapshotTraversalLimits { + static let maxDesktopApps = 24 + static let maxNodes = 1500 + static let maxDepth = 12 +} + +struct RectResponse: Encodable { + let x: Double + let y: Double + let width: Double + let height: Double +} + +struct SnapshotNodeResponse: Encodable { + let index: Int + let type: String? + let role: String? + let subrole: String? + let label: String? + let value: String? + let identifier: String? + let rect: RectResponse? + let enabled: Bool? + let selected: Bool? + let hittable: Bool? + let depth: Int + let parentIndex: Int? + let pid: Int32? + let bundleId: String? + let appName: String? + let windowTitle: String? + let surface: String? +} + +struct SnapshotResponse: Encodable { + let surface: String + let nodes: [SnapshotNodeResponse] + let truncated: Bool + let backend = "macos-helper" +} + +private struct SnapshotBuildResult { + let nodes: [SnapshotNodeResponse] + let truncated: Bool +} + +private struct SnapshotContext { + let surface: String + let pid: Int32? + let bundleId: String? + let appName: String? + let windowTitle: String? 
+}
+
+/// Mutable accumulator shared by every step of a single snapshot traversal.
+private struct SnapshotTraversalState {
+  /// Nodes emitted so far; a node's `index` is its position in this array.
+  var nodes: [SnapshotNodeResponse] = []
+  /// Accessibility elements that were already emitted, so none is walked twice.
+  var visited: [AXUIElement] = []
+  /// Set once a traversal limit was hit and the snapshot is therefore incomplete.
+  var truncated = false
+}
+
+/// Builds the snapshot for one helper surface.
+///
+/// - Parameter surface: `"frontmost-app"`, `"desktop"`, or `"menubar"`. The value is matched
+///   exactly, so callers must normalize case and whitespace beforehand.
+/// - Returns: The collected nodes together with the truncation flag.
+/// - Throws: `HelperError.invalidArgs` for any other surface, plus whatever
+///   `resolveTargetApplication` throws while resolving the frontmost app.
+func captureSnapshotResponse(surface: String) throws -> SnapshotResponse {
+  let result: SnapshotBuildResult
+  switch surface {
+  case "frontmost-app":
+    result = try snapshotFrontmostApp()
+  case "desktop":
+    result = snapshotDesktop()
+  case "menubar":
+    result = snapshotMenuBar()
+  default:
+    // Name the accepted values so the caller can correct the invocation.
+    throw HelperError.invalidArgs("snapshot requires --surface <frontmost-app|desktop|menubar>")
+  }
+
+  return SnapshotResponse(surface: surface, nodes: result.nodes, truncated: result.truncated)
+}
+
+/// Snapshots the visible windows of the app returned by `resolveTargetApplication`.
+///
+/// The Boolean result of `appendApplicationSnapshot` is ignored, so an app without visible
+/// windows yields an empty node list rather than an error.
+private func snapshotFrontmostApp() throws -> SnapshotBuildResult {
+  let app = try resolveTargetApplication(bundleId: nil, surface: "frontmost-app")
+  var state = SnapshotTraversalState()
+  _ = appendApplicationSnapshot(
+    app,
+    depth: 0,
+    parentIndex: nil,
+    surface: "frontmost-app",
+    state: &state
+  )
+  return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated)
+}
+
+/// Snapshots the desktop: a synthetic `DesktopSurface` root with one `Application` subtree for
+/// each running app that has visible windows.
+private func snapshotDesktop() -> SnapshotBuildResult {
+  var state = SnapshotTraversalState()
+  guard
+    let rootIndex = appendSyntheticSnapshotNode(
+      into: &state,
+      type: "DesktopSurface",
+      label: "Desktop",
+      depth: 0,
+      parentIndex: nil,
+      surface: "desktop"
+    )
+  else {
+    return SnapshotBuildResult(nodes: state.nodes, truncated: true)
+  }
+
+  // Skip apps with a prohibited activation policy, terminated apps, and apps that expose
+  // neither a bundle identifier nor a localized name.
+  var runningApps = NSWorkspace.shared.runningApplications.filter { app in
+    app.activationPolicy != .prohibited
+      && !app.isTerminated
+      && (app.bundleIdentifier?.isEmpty == false || app.localizedName?.isEmpty == false)
+  }
+  // Active app first, then ascending by localized name.
+  runningApps.sort { left, right in
+    if left.isActive != right.isActive {
+      return left.isActive && !right.isActive
+    }
+    return (left.localizedName ?? "") < (right.localizedName ?? "")
+  }
+
+  // Only apps that actually contributed nodes count toward `maxDesktopApps`.
+  var includedApps = 0
+  for app in runningApps {
+    if includedApps >= SnapshotTraversalLimits.maxDesktopApps {
+      state.truncated = true
+      break
+    }
+    if state.truncated {
+      break
+    }
+
+    let included = appendApplicationSnapshot(
+      app,
+      depth: 1,
+      parentIndex: rootIndex,
+      surface: "desktop",
+      state: &state
+    )
+    if state.truncated {
+      break
+    }
+    if included {
+      includedApps += 1
+    }
+  }
+
+  return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated)
+}
+
+/// Appends a synthetic `Application` node for `app`, followed by the subtree of each of its
+/// visible windows.
+///
+/// - Parameters:
+///   - app: Running application whose windows are inspected.
+///   - depth: Depth stored on the application node; windows are recorded at `depth + 1`.
+///   - parentIndex: Index of the node the application node hangs under, or `nil` for a root.
+///   - surface: Surface name copied onto every emitted node.
+///   - state: Traversal accumulator that receives the nodes.
+/// - Returns: `false` when the app has no visible windows or the application node could not be
+///   appended; `true` once the application node was appended.
+@discardableResult
+private func appendApplicationSnapshot(
+  _ app: NSRunningApplication,
+  depth: Int,
+  parentIndex: Int?,
+  surface: String,
+  state: inout SnapshotTraversalState
+) -> Bool {
+  let appElement = AXUIElementCreateApplication(app.processIdentifier)
+  let visibleWindows = windows(of: appElement).filter(isVisibleSnapshotWindow)
+  if visibleWindows.isEmpty {
+    return false
+  }
+
+  guard
+    let appIndex = appendSyntheticSnapshotNode(
+      into: &state,
+      type: "Application",
+      label: app.localizedName ?? app.bundleIdentifier ?? "Application",
+      depth: depth,
+      parentIndex: parentIndex,
+      surface: surface,
+      identifier: app.bundleIdentifier,
+      pid: Int32(app.processIdentifier),
+      bundleId: app.bundleIdentifier,
+      appName: app.localizedName
+    )
+  else {
+    return false
+  }
+
+  for window in visibleWindows {
+    if state.truncated {
+      break
+    }
+    let windowTitle = stringAttribute(window, attribute: kAXTitleAttribute as String)
+    _ = appendElementSnapshot(
+      window,
+      depth: depth + 1,
+      parentIndex: appIndex,
+      context: SnapshotContext(
+        surface: surface,
+        pid: Int32(app.processIdentifier),
+        bundleId: app.bundleIdentifier,
+        appName: app.localizedName,
+        windowTitle: windowTitle
+      ),
+      state: &state
+    )
+  }
+
+  return true
+}
+
+/// Snapshots the menu bar: a synthetic `MenuBarSurface` root, then the frontmost app's menu bar,
+/// then the menu bar element exposed by `com.apple.systemuiserver`.
+private func snapshotMenuBar() -> SnapshotBuildResult {
+  var state = SnapshotTraversalState()
+  guard
+    let rootIndex = appendSyntheticSnapshotNode(
+      into: &state,
+      type: "MenuBarSurface",
+      label: "Menu Bar",
+      depth: 0,
+      parentIndex: nil,
+      surface: "menubar"
+    )
+  else {
+    return SnapshotBuildResult(nodes: state.nodes, truncated: true)
+  }
+
+  if let frontmost = NSWorkspace.shared.frontmostApplication {
+    let frontmostElement = AXUIElementCreateApplication(frontmost.processIdentifier)
+    if let menuBar = elementAttribute(frontmostElement, attribute: kAXMenuBarAttribute as String) {
+      _ = appendElementSnapshot(
+        menuBar,
+        depth: 1,
+        parentIndex: rootIndex,
+        context: SnapshotContext(
+          surface: "menubar",
+          pid: Int32(frontmost.processIdentifier),
+          bundleId: frontmost.bundleIdentifier,
+          appName: frontmost.localizedName,
+          // A menu bar has no window, so the app name stands in as the window title.
+          windowTitle: frontmost.localizedName
+        ),
+        state: &state
+      )
+    }
+  }
+
+  // Only add the SystemUIServer menu bar while there is still node budget left.
+  if !state.truncated,
+    let systemUiServer = NSRunningApplication.runningApplications(
+      withBundleIdentifier: "com.apple.systemuiserver"
+    ).first
+  {
+    let systemUiElement = AXUIElementCreateApplication(systemUiServer.processIdentifier)
+    if let menuExtras = elementAttribute(systemUiElement, attribute: kAXMenuBarAttribute as String) {
+      _ = appendElementSnapshot(
+        menuExtras,
+        depth: 1,
+        parentIndex: rootIndex,
+        context: SnapshotContext(
+          surface: "menubar",
+          pid: Int32(systemUiServer.processIdentifier),
+          bundleId: systemUiServer.bundleIdentifier,
+          appName: systemUiServer.localizedName,
+          windowTitle: "System Menu Extras"
+        ),
+        state: &state
+      )
+    }
+  }
+
+  return SnapshotBuildResult(nodes: state.nodes, truncated: state.truncated)
+}
+
+/// Appends a node that does not correspond to an accessibility element (surface roots and
+/// application groups).
+///
+/// The node's `role` mirrors `type`, it has no rect, and it is recorded as enabled but not
+/// hittable. Without an explicit `identifier` it receives `surface:<surface>:<lowercased type>`.
+///
+/// - Returns: The index of the new node, or `nil` when the node budget is exhausted.
+@discardableResult
+private func appendSyntheticSnapshotNode(
+  into state: inout SnapshotTraversalState,
+  type: String,
+  label: String,
+  depth: Int,
+  parentIndex: Int?,
+  surface: String,
+  identifier: String? = nil,
+  pid: Int32? = nil,
+  bundleId: String? = nil,
+  appName: String? = nil,
+  windowTitle: String? = nil
+) -> Int? {
+  guard reserveSnapshotNodeCapacity(&state) else {
+    return nil
+  }
+
+  let index = state.nodes.count
+  state.nodes.append(
+    SnapshotNodeResponse(
+      index: index,
+      type: type,
+      role: type,
+      subrole: nil,
+      label: label,
+      value: nil,
+      identifier: identifier ?? "surface:\(surface):\(type.lowercased())",
+      rect: nil,
+      enabled: true,
+      selected: nil,
+      hittable: false,
+      depth: depth,
+      parentIndex: parentIndex,
+      pid: pid,
+      bundleId: bundleId,
+      appName: appName,
+      windowTitle: windowTitle,
+      surface: surface
+    )
+  )
+  return index
+}
+
+/// Appends `element` and, recursively, its accessibility children to the snapshot.
+///
+/// - Parameters:
+///   - element: Accessibility element to record.
+///   - depth: Depth stored on the emitted node; children are recorded at `depth + 1`.
+///   - parentIndex: Index of the parent node, or `nil` for a root.
+///   - context: App and window metadata copied onto every emitted node.
+///   - state: Traversal accumulator that receives the nodes.
+///   - maxDepth: Depth at which recursion into children stops.
+/// - Returns: The index of the appended node, or `parentIndex` unchanged when the element was
+///   already visited or the node budget is exhausted.
+@discardableResult
+private func appendElementSnapshot(
+  _ element: AXUIElement,
+  depth: Int,
+  parentIndex: Int?,
+  context: SnapshotContext,
+  state: inout SnapshotTraversalState,
+  maxDepth: Int = SnapshotTraversalLimits.maxDepth
+) -> Int?
{
+  // An element reachable through more than one path is emitted only once.
+  if state.visited.contains(where: { CFEqual($0, element) }) {
+    return parentIndex
+  }
+  // Stop (and flag truncation) once the node budget is used up.
+  guard reserveSnapshotNodeCapacity(&state) else {
+    return parentIndex
+  }
+  state.visited.append(element)
+
+  let role = stringAttribute(element, attribute: kAXRoleAttribute as String)
+  let subrole = stringAttribute(element, attribute: kAXSubroleAttribute as String)
+  let title = stringAttribute(element, attribute: kAXTitleAttribute as String)
+  let description = stringAttribute(element, attribute: kAXDescriptionAttribute as String)
+  let value = stringAttribute(element, attribute: kAXValueAttribute as String)
+  let identifier = stringAttribute(element, attribute: "AXIdentifier")
+  let rect = rectAttribute(element)
+  let enabled = boolAttribute(element, attribute: kAXEnabledAttribute as String)
+  let selected = boolAttribute(element, attribute: kAXSelectedAttribute as String)
+  let type = normalizedSnapshotType(role: role, subrole: subrole)
+  // Prefer the title handed down by the caller; otherwise derive one from the element itself.
+  let windowTitle = context.windowTitle ?? inferWindowTitle(for: element)
+
+  let index = state.nodes.count
+  state.nodes.append(
+    SnapshotNodeResponse(
+      index: index,
+      type: type,
+      role: role,
+      subrole: subrole,
+      // Label falls back from title to description to value.
+      label: title ?? description ?? value,
+      value: value,
+      identifier: identifier,
+      rect: rect,
+      enabled: enabled,
+      selected: selected,
+      // Hittable when the element has a frame and is not explicitly disabled.
+      hittable: (enabled ?? true) && rect != nil,
+      depth: depth,
+      parentIndex: parentIndex,
+      pid: context.pid,
+      bundleId: context.bundleId,
+      appName: context.appName,
+      windowTitle: windowTitle,
+      surface: context.surface
+    )
+  )
+
+  // The node itself is kept, but its children are skipped at the depth limit or after truncation.
+  guard depth < maxDepth, !state.truncated else {
+    return index
+  }
+
+  for child in children(of: element) {
+    if state.truncated {
+      break
+    }
+    _ = appendElementSnapshot(
+      child,
+      depth: depth + 1,
+      parentIndex: index,
+      context: SnapshotContext(
+        surface: context.surface,
+        pid: context.pid,
+        bundleId: context.bundleId,
+        appName: context.appName,
+        windowTitle: windowTitle
+      ),
+      state: &state,
+      maxDepth: maxDepth
+    )
+  }
+
+  return index
+}
+
+/// Reports whether one more node may be appended.
+///
+/// - Returns: `false`, after setting `state.truncated`, once `state.nodes` holds
+///   `SnapshotTraversalLimits.maxNodes` entries; `true` otherwise.
+private func reserveSnapshotNodeCapacity(_ state: inout SnapshotTraversalState) -> Bool {
+  if state.nodes.count >= SnapshotTraversalLimits.maxNodes {
+    state.truncated = true
+    return false
+  }
+  return true
+}
+
+/// Maps an accessibility role (and subrole) to the type name stored on a snapshot node.
+///
+/// Known roles map to a fixed name without the `AX` prefix. A window reports its subrole unless
+/// that subrole is `AXStandardWindow` or missing. Any other role reports its non-empty subrole,
+/// falling back to the raw role (which may be `nil`).
+private func normalizedSnapshotType(role: String?, subrole: String?) -> String? {
+  switch role {
+  case "AXApplication":
+    return "Application"
+  case "AXWindow":
+    return subrole == "AXStandardWindow" ? "Window" : (subrole ?? "Window")
+  case "AXSheet":
+    return "Sheet"
+  case "AXDialog":
+    return "Dialog"
+  case "AXButton":
+    return "Button"
+  case "AXStaticText":
+    return "StaticText"
+  case "AXTextField":
+    return "TextField"
+  case "AXTextArea":
+    return "TextArea"
+  case "AXScrollArea":
+    return "ScrollArea"
+  case "AXGroup":
+    return "Group"
+  case "AXMenuBar":
+    return "MenuBar"
+  case "AXMenuBarItem":
+    return "MenuBarItem"
+  case "AXMenu":
+    return "Menu"
+  case "AXMenuItem":
+    return "MenuItem"
+  default:
+    if let subrole, !subrole.isEmpty {
+      return subrole
+    }
+    return role
+  }
+}
+
+/// Returns `true` for a window that has a readable frame with positive width and height and is
+/// not reported as minimized.
+private func isVisibleSnapshotWindow(_ window: AXUIElement) -> Bool {
+  guard let rect = rectAttribute(window) else {
+    return false
+  }
+  if rect.width <= 0 || rect.height <= 0 {
+    return false
+  }
+  if boolAttribute(window, attribute: kAXMinimizedAttribute as String) == true {
+    return false
+  }
+  return true
+}
+
+/// Returns the element's own title, or else the title of the element stored in its
+/// `kAXWindowAttribute`; `nil` when neither is available.
+private func inferWindowTitle(for element: AXUIElement) -> String? {
+  if let title = stringAttribute(element, attribute: kAXTitleAttribute as String) {
+    return title
+  }
+  if let window = elementAttribute(element, attribute: kAXWindowAttribute as String) {
+    return stringAttribute(window, attribute: kAXTitleAttribute as String)
+  }
+  return nil
+}
+
+/// Reads a string-valued accessibility attribute.
+///
+/// - Returns: The value with surrounding whitespace and newlines removed, or `nil` when the
+///   attribute cannot be read, is not a string, or is blank after trimming.
+func stringAttribute(_ element: AXUIElement, attribute: String) -> String? {
+  var value: CFTypeRef?
+  guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success else {
+    return nil
+  }
+  if let text = value as? String {
+    let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
+    return trimmed.isEmpty ? nil : trimmed
+  }
+  return nil
+}
+
+/// Reads a Boolean accessibility attribute.
+///
+/// - Returns: The attribute's Boolean value, or `nil` when the attribute cannot be read or is
+///   not an `NSNumber`.
+func boolAttribute(_ element: AXUIElement, attribute: String) -> Bool? {
+  var value: CFTypeRef?
+  guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success,
+    let number = value as? NSNumber
+  else {
+    return nil
+  }
+  return number.boolValue
+}
+
+/// Reads an accessibility attribute whose value is itself an accessibility element.
+///
+/// - Returns: The referenced element, or `nil` when the attribute cannot be read or holds a
+///   value of a different Core Foundation type.
+func elementAttribute(_ element: AXUIElement, attribute: String) -> AXUIElement?
{
+  var value: CFTypeRef?
+  guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success,
+    let value,
+    CFGetTypeID(value) == AXUIElementGetTypeID()
+  else {
+    return nil
+  }
+  // The type-ID check above guarantees the value is an AXUIElement, so the force cast cannot fail.
+  return (value as! AXUIElement)
+}
+
+/// Reads the element's frame from its position and size attributes.
+///
+/// - Returns: The frame as reported by the accessibility API, or `nil` when either attribute
+///   cannot be read or does not hold an `AXValue` of the expected `cgPoint` / `cgSize` type.
+func rectAttribute(_ element: AXUIElement) -> RectResponse? {
+  var positionValue: CFTypeRef?
+  var sizeValue: CFTypeRef?
+  guard AXUIElementCopyAttributeValue(element, kAXPositionAttribute as CFString, &positionValue) == .success,
+    AXUIElementCopyAttributeValue(element, kAXSizeAttribute as CFString, &sizeValue) == .success,
+    let positionAxValue = accessibilityAxValue(positionValue),
+    let sizeAxValue = accessibilityAxValue(sizeValue)
+  else {
+    return nil
+  }
+
+  var position = CGPoint.zero
+  var size = CGSize.zero
+  // Check the wrapped type before unpacking so a mismatched AXValue yields nil.
+  guard AXValueGetType(positionAxValue) == .cgPoint,
+    AXValueGetValue(positionAxValue, .cgPoint, &position),
+    AXValueGetType(sizeAxValue) == .cgSize,
+    AXValueGetValue(sizeAxValue, .cgSize, &size)
+  else {
+    return nil
+  }
+
+  return RectResponse(
+    x: Double(position.x),
+    y: Double(position.y),
+    width: Double(size.width),
+    height: Double(size.height)
+  )
+}
+
+/// Returns `value` as an `AXValue` when its Core Foundation type ID matches, otherwise `nil`.
+private func accessibilityAxValue(_ value: CFTypeRef?) -> AXValue? {
+  guard let value, CFGetTypeID(value) == AXValueGetTypeID() else {
+    return nil
+  }
+  // Safe: the type ID was verified in the guard above.
+  return (value as! AXValue)
+}
+
+/// Returns the element's `kAXChildrenAttribute` elements, or an empty array when the attribute
+/// cannot be read or is not an array of accessibility elements.
+func children(of element: AXUIElement) -> [AXUIElement] {
+  var value: CFTypeRef?
+  guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success,
+    let children = value as? [AXUIElement]
+  else {
+    return []
+  }
+  return children
+}
+
+/// Returns the application element's `AXWindows` elements, or an empty array when the attribute
+/// cannot be read or is not an array of accessibility elements.
+func windows(of appElement: AXUIElement) -> [AXUIElement] {
+  var value: CFTypeRef?
+  guard AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &value) == .success,
+    let windows = value as?
[AXUIElement] + else { + return [] + } + return windows +} diff --git a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift index 75f599dc..90bd4272 100644 --- a/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift +++ b/macos-helper/Sources/AgentDeviceMacOSHelper/main.swift @@ -53,7 +53,6 @@ struct AlertResponse: Encodable { let bundleId: String? } -@main struct AgentDeviceMacOSHelper { static func main() { do { @@ -93,6 +92,8 @@ struct AgentDeviceMacOSHelper { return try handlePermission(arguments: Array(arguments.dropFirst())) case "alert": return try handleAlert(arguments: Array(arguments.dropFirst())) + case "snapshot": + return try handleSnapshot(arguments: Array(arguments.dropFirst())) default: throw HelperError.invalidArgs("unknown command: \(command)") } @@ -256,7 +257,7 @@ struct AgentDeviceMacOSHelper { } let bundleId = optionValue(arguments: Array(arguments.dropFirst()), name: "--bundle-id") let surface = optionValue(arguments: Array(arguments.dropFirst()), name: "--surface") - let app = try resolveAlertApplication(bundleId: bundleId, surface: surface) + let app = try resolveTargetApplication(bundleId: bundleId, surface: surface) guard let alertElement = findAlertElement(appElement: AXUIElementCreateApplication(app.processIdentifier)) else { throw HelperError.commandFailed( "alert not found", @@ -298,6 +299,25 @@ struct AgentDeviceMacOSHelper { ) ) }
+
+  /// Handles `snapshot --surface <frontmost-app|desktop|menubar>`; the surface is trimmed and
+  /// lowercased first, and a missing or unknown surface throws `HelperError.invalidArgs`.
+  static func handleSnapshot(arguments: [String]) throws -> any Encodable {
+    guard let surface = optionValue(arguments: arguments, name: "--surface")?
+      .trimmingCharacters(in: .whitespacesAndNewlines)
+      .lowercased(),
+      !surface.isEmpty
+    else {
+      throw HelperError.invalidArgs("snapshot requires --surface <frontmost-app|desktop|menubar>")
+    }
+
+    switch surface {
+    case "frontmost-app", "desktop", "menubar":
+      return SuccessEnvelope(data: try captureSnapshotResponse(surface: surface))
+    default:
+      throw HelperError.invalidArgs("snapshot requires --surface <frontmost-app|desktop|menubar>")
+    }
+  }
 } private func optionValue(arguments: [String], name: String) -> String? { @@ -327,7 +347,7 @@ private func writeJSON(_ value: T) throws { FileHandle.standardOutput.write(Data([0x0A])) } -private func resolveAlertApplication(bundleId: String?, surface: String?) throws -> NSRunningApplication { +func resolveTargetApplication(bundleId: String?, surface: String?) throws -> NSRunningApplication { let normalizedSurface = surface?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() if normalizedSurface == "desktop" || normalizedSurface == "menubar" { throw HelperError.commandFailed( @@ -363,49 +383,6 @@ private func validatedBundleId(_ rawBundleId: String) throws -> String { return bundleId } -private func stringAttribute(_ element: AXUIElement, attribute: String) -> String? { - var value: CFTypeRef? - guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success else { - return nil - } - if let text = value as? String { - let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) - return trimmed.isEmpty ? nil : trimmed - } - return nil -} - -private func elementAttribute(_ element: AXUIElement, attribute: String) -> AXUIElement? { - var value: CFTypeRef? - guard AXUIElementCopyAttributeValue(element, attribute as CFString, &value) == .success else { - return nil - } - guard let value else { - return nil - } - return unsafeBitCast(value, to: AXUIElement.self) -} - -private func children(of element: AXUIElement) -> [AXUIElement] { - var value: CFTypeRef? 
- guard AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &value) == .success, - let children = value as? [AXUIElement] - else { - return [] - } - return children -} - -private func windows(of appElement: AXUIElement) -> [AXUIElement] { - var value: CFTypeRef? - guard AXUIElementCopyAttributeValue(appElement, "AXWindows" as CFString, &value) == .success, - let windows = value as? [AXUIElement] - else { - return [] - } - return windows -} - private func findAlertElement(appElement: AXUIElement) -> AXUIElement? { for window in windows(of: appElement) { if let role = stringAttribute(window, attribute: kAXRoleAttribute as String), @@ -489,3 +466,5 @@ private func resolveAlertActionButton(root: AXUIElement, buttons: [AXUIElement], return action == "accept" ? buttons.first : buttons.last } + +AgentDeviceMacOSHelper.main() diff --git a/skills/agent-device/SKILL.md b/skills/agent-device/SKILL.md index 9f4085c8..c45d05bb 100644 --- a/skills/agent-device/SKILL.md +++ b/skills/agent-device/SKILL.md @@ -5,125 +5,84 @@ description: Automates interactions for Apple-platform apps (iOS, tvOS, macOS) a # Apple and Android Automation with agent-device -For exploration, use snapshot refs. For deterministic replay, use selectors. -For structured exploratory QA bug hunts and reporting, use [../dogfood/SKILL.md](../dogfood/SKILL.md). +Use this skill as a router. -## Start Here (Read This First) +Core rule: -Use this skill as a router, not a full manual. +- explore with `snapshot -i` and `@ref` +- stabilize with selectors +- use plain `snapshot` when you need to verify whether text is visible +- re-snapshot after every meaningful UI change -1. Pick one mode: - - Normal interaction flow - - Debug/crash flow - - Replay maintenance flow -2. Run one canonical flow below. -3. Open references only if blocked. +For exploratory QA bug hunts and reporting, use [../dogfood/SKILL.md](../dogfood/SKILL.md). 
-## Decision Map +## Quick route -- No target context yet: `devices` -> pick target -> `open`. -- Normal UI task: `open` -> `snapshot -i` -> `press/click/fill` -> `diff snapshot -i` -> `close` -- Debug/crash (iOS/Android): `open ` -> `logs clear --restart` -> reproduce -> `network dump` -> `logs path` -> targeted `grep` -- Replay drift: `replay -u ` -> verify updated selectors -- Remote multi-tenant run: allocate lease -> point client at remote daemon base URL -> run commands with tenant isolation flags -> heartbeat/release lease -- Device-scope isolation run: set iOS simulator set / Android allowlist -> run selectors within scope only -- macOS desktop task: run the macOS desktop flow, then open [references/macos-desktop.md](references/macos-desktop.md) if context menus, Finder rows, or desktop-specific snapshot behavior matters -- macOS desktop debugging: `open --platform macos` -> `logs clear --restart` -> reproduce -> `network dump` -> `logs path` +- Normal UI task: `open` -> `snapshot -i` -> `click/fill/press` -> `close` +- Debug task: `open` -> `logs clear --restart` -> reproduce -> `network dump` -> `logs path` +- Replay drift: `replay -u ` +- No target context yet: `devices` -> pick target -> `open` -## Target Selection Rules +## Target rules -- iOS local QA: use simulators unless the task explicitly requires a physical device. -- iOS local QA in mixed simulator/device environments: run `ensure-simulator` first and pass `--device`, `--udid`, or `--ios-simulator-device-set` on later commands. -- macOS desktop app automation: use `--platform macos`, or `--platform apple --target desktop` when the caller wants one Apple-family selector path. -- For macOS phase-1 non-default session targeting, use `open --platform macos --surface frontmost-app`. -- Android local QA: use `install` or `reinstall` for `.apk`/`.aab` files, then relaunch by installed package name. -- Android React Native + Metro flows: prefer `open --remote-config --relaunch`. 
-- In mixed-device environments, always pin the exact target with `--serial`, `--device`, `--udid`, or an isolation scope. -- For session-bound automation runs, prefer a pre-bound session/platform instead of repeating selectors on every command: set `AGENT_DEVICE_SESSION`, set `AGENT_DEVICE_PLATFORM`, and the daemon will enforce the shared lock policy across CLI, typed client, and RPC entry points. -- Use `--session-lock reject|strip` (or `AGENT_DEVICE_SESSION_LOCK`) only when you need to override the default reject behavior. Lock mode applies to nested `batch` steps too. +- iOS local QA: prefer simulators +- Android binary flow: `install` or `reinstall` first, then `open --relaunch` +- In mixed-device labs, always pin the target with `--device`, `--udid`, `--serial`, or an isolation scope +- For session-bound automation, prefer `AGENT_DEVICE_SESSION` + `AGENT_DEVICE_PLATFORM` -## Canonical Flows +## macOS rules -### 1) Normal Interaction Flow +- Use `open --platform macos` for normal Mac app automation +- Use `open --platform macos --surface frontmost-app|desktop|menubar` when you need desktop-global inspection first +- Use `app` sessions for `click`, `fill`, `press`, `scroll`, `screenshot`, and `record` +- Use `frontmost-app`, `desktop`, and `menubar` mainly for `snapshot`, `get`, `is`, and `wait` +- If you inspect with `desktop` or `menubar` and then need to act inside one app, open that app in a normal `app` session +- Prefer `@ref` or selectors over raw `x y` on macOS +- Use `click --button secondary` for context menus, then run `snapshot -i` again -```bash -agent-device open Settings --platform ios -agent-device snapshot -i -agent-device press @e3 -agent-device diff snapshot -i -agent-device fill @e5 "test" -agent-device close -``` +## Canonical flows -### 1a) Local iOS Simulator QA Flow +### Normal flow ```bash -agent-device ensure-simulator --platform ios --device "iPhone 16" --boot -agent-device open MyApp --platform ios --device "iPhone 16" --session 
qa-ios --relaunch +agent-device open Settings --platform ios agent-device snapshot -i agent-device press @e3 +agent-device fill @e5 "test" agent-device close ``` -Use this when a physical iPhone is also connected and you want deterministic simulator-only automation. - -### 1b) Android React Native + Metro QA Flow +### macOS app flow ```bash -agent-device reinstall MyApp /path/to/app-debug.apk --platform android --serial emulator-5554 -agent-device open com.example.myapp --remote-config ./agent-device.remote.json --relaunch +agent-device open TextEdit --platform macos agent-device snapshot -i +agent-device fill @e3 "desktop smoke test" +agent-device screenshot /tmp/macos-textedit.png agent-device close ``` -Do not use `open --relaunch` on Android. Install/reinstall binaries first, then relaunch by package. - -### 1c) Session-Bound Automation Flow +### macOS desktop-global inspect flow ```bash -export AGENT_DEVICE_SESSION=qa-ios -export AGENT_DEVICE_PLATFORM=ios -export AGENT_DEVICE_SESSION_LOCK=strip - -agent-device open MyApp --relaunch +agent-device open --platform macos --surface desktop agent-device snapshot -i -agent-device batch --steps-file /tmp/qa-steps.json --json +agent-device get attrs @e4 +agent-device is visible 'role="window" label="Notes"' +agent-device wait text "Notes" agent-device close ``` -Use this for orchestrators that must preserve one bound session/device across many plain CLI calls without a wrapper script. In `strip` mode, conflicting selectors such as `--target`, `--device`, `--udid`, `--serial`, and isolation-scope overrides are ignored instead of retargeting the run. 
- -### 1d) Android Emulator Session-Bound Flow +### Android relaunch flow ```bash -export AGENT_DEVICE_SESSION=qa-android -export AGENT_DEVICE_PLATFORM=android - -agent-device reinstall MyApp /path/to/app-debug.apk --serial emulator-5554 -agent-device --session-lock reject open com.example.myapp --relaunch -agent-device snapshot -i -agent-device close --shutdown -``` - -Use this when an Android emulator session must stay pinned while an agent or test runner issues plain CLI commands over time. - -### 1e) macOS Desktop Flow - -```bash -agent-device open TextEdit --platform macos +agent-device reinstall MyApp /path/to/app-debug.apk --platform android --serial emulator-5554 +agent-device open com.example.myapp --remote-config ./agent-device.remote.json --relaunch agent-device snapshot -i -agent-device fill @e3 "desktop smoke test" -agent-device screenshot /tmp/macos-textedit.png agent-device close ``` -Use this for host Mac desktop apps. Prefer the Apple runner interaction flow (`open`, `snapshot`, `press`, `click`, `fill`, `scroll`, `back`, `record`, `screenshot`). macOS also supports `clipboard read|write`, `trigger-app-event`, `logs`, `network dump`, `alert`, `settings appearance`, and `settings permission `. -Source checkouts build `agent-device-macos-helper` on first use for macOS permission/alert support; release builds should ship a signed helper. -Phase 1 exposes `app` and `frontmost-app` session surfaces. Broader `desktop` and `menubar` surfaces remain future work until the desktop-global backend lands. -Prefer selectors or snapshot refs (`@e...`) over raw x/y commands on macOS because the window origin can move between runs. -Open [references/macos-desktop.md](references/macos-desktop.md) when you need Finder-style list traversal, context-menu flows, or macOS-specific snapshot expectations. 
- -### 2) Debug/Crash Flow +### Debug flow ```bash agent-device open MyApp --platform ios @@ -132,221 +91,27 @@ agent-device network dump 25 agent-device logs path ``` -Logging is off by default. Enable only for debugging windows. -`logs clear --restart` requires an active app session (`open ` first). - -### 3) Replay Maintenance Flow +### Replay maintenance ```bash agent-device replay -u ./session.ad ``` -### 4) Remote Tenant Lease Flow (HTTP JSON-RPC) - -```bash -# Client points directly at the remote daemon HTTP base URL. -export AGENT_DEVICE_DAEMON_BASE_URL=http://mac-host.example:4310 -export AGENT_DEVICE_DAEMON_AUTH_TOKEN= - -# Allocate lease -curl -sS "${AGENT_DEVICE_DAEMON_BASE_URL}/rpc" \ - -H "content-type: application/json" \ - -H "Authorization: Bearer " \ - -d '{"jsonrpc":"2.0","id":"alloc-1","method":"agent_device.lease.allocate","params":{"runId":"run-123","tenantId":"acme","ttlMs":60000}}' - -# Use lease in tenant-isolated command execution -agent-device \ - --tenant acme \ - --session-isolation tenant \ - --run-id run-123 \ - --lease-id \ - session list --json - -# Heartbeat and release -curl -sS "${AGENT_DEVICE_DAEMON_BASE_URL}/rpc" \ - -H "content-type: application/json" \ - -H "Authorization: Bearer " \ - -d '{"jsonrpc":"2.0","id":"hb-1","method":"agent_device.lease.heartbeat","params":{"leaseId":"","ttlMs":60000}}' -curl -sS "${AGENT_DEVICE_DAEMON_BASE_URL}/rpc" \ - -H "content-type: application/json" \ - -H "Authorization: Bearer " \ - -d '{"jsonrpc":"2.0","id":"rel-1","method":"agent_device.lease.release","params":{"leaseId":""}}' -``` - -Notes: +## High-value guardrails -- `AGENT_DEVICE_DAEMON_BASE_URL` makes the CLI skip local daemon discovery/startup and call the remote HTTP daemon directly. -- `AGENT_DEVICE_DAEMON_AUTH_TOKEN` is sent in both the JSON-RPC request token and HTTP auth headers. -- In remote daemon mode, `--debug` does not tail a local `daemon.log`; inspect logs on the remote host instead. 
- -## Command Skeleton (Minimal) - -### Session and navigation - -```bash -agent-device devices -agent-device devices --platform ios --ios-simulator-device-set /tmp/tenant-a/simulators -agent-device devices --platform android --android-device-allowlist emulator-5554,device-1234 -agent-device ensure-simulator --device "iPhone 16" --ios-simulator-device-set /tmp/tenant-a/simulators -agent-device ensure-simulator --device "iPhone 16" --runtime com.apple.CoreSimulator.SimRuntime.iOS-18-4 --ios-simulator-device-set /tmp/tenant-a/simulators --boot -agent-device open [app|url] [url] -agent-device open [app] --relaunch -agent-device close [app] -agent-device install -agent-device install-from-source [--header "name:value"] -agent-device reinstall -agent-device session list -``` - -Use `boot` only as fallback when `open` cannot find/connect to a ready target. -If the workspace repeats the same selectors or device/session flags, prefer a checked-in `agent-device.json` or `--config ` over repeating them inline. -Environment-level defaults follow the same fields via `AGENT_DEVICE_*` names, so persistent host-specific values belong there rather than in committed project config. -That includes bound-session defaults such as `sessionLock` / `AGENT_DEVICE_SESSION_LOCK` when automation should consistently reject or strip conflicting device routing flags. -For Android emulators by AVD name, use `boot --platform android --device `. -For Android emulators without GUI, add `--headless`. -Use `--target mobile|tv` with `--platform` (required) to pick phone/tablet vs TV targets (AndroidTV/tvOS). -For Android React Native + Metro flows, install or reinstall the APK first, then use `open --remote-config --relaunch`; do not use `open --relaunch`. -For local iOS QA in mixed simulator/device environments, use `ensure-simulator` and pass `--device` or `--udid` so automation does not attach to a physical device by accident. 
-For session-bound automation, prefer `AGENT_DEVICE_SESSION` + `AGENT_DEVICE_PLATFORM`; that bound-session default now enables lock mode automatically. - -Isolation scoping quick reference: - -- `--ios-simulator-device-set ` scopes iOS simulator discovery + command execution to one simulator set. -- `--android-device-allowlist ` scopes Android discovery/selection to comma/space separated serials. -- Scope is applied before selectors (`--device`, `--udid`, `--serial`); out-of-scope selectors fail with `DEVICE_NOT_FOUND`. -- With iOS simulator-set scope enabled, iOS physical devices are not enumerated. -- In bound-session `strip` mode, conflicting per-call scope/selectors are ignored and the configured binding is restored for the request. Batch steps still inherit the parent `--platform` when they do not set their own. - -Simulator provisioning quick reference: - -- Use `ensure-simulator` to create or reuse a named iOS simulator inside a device set before starting a session. -- `--device ` is required (e.g. `"iPhone 16 Pro"`). `--runtime ` pins the runtime; omit to use the newest compatible one. -- `--boot` boots it immediately. Returns `udid`, `device`, `runtime`, `ios_simulator_device_set`, `created`, `booted`. -- Idempotent: safe to call repeatedly; reuses an existing matching simulator by default. - -TV quick reference: - -- AndroidTV: `open`/`apps` use TV launcher discovery automatically. -- TV target selection works on emulators/simulators and connected physical devices (AndroidTV + AppleTV). -- tvOS: runner-driven interactions and snapshots are supported (`snapshot`, `wait`, `press`, `fill`, `get`, `scroll`, `back`, `home`, `app-switcher`, `record` and related selector flows). -- tvOS `back`/`home`/`app-switcher` map to Siri Remote actions (`menu`, `home`, double-home) in the runner. -- tvOS follows iOS simulator-only command semantics for helpers like `pinch`, `settings`, and `push`. 
- -### Snapshot and targeting - -```bash -agent-device snapshot -i -agent-device diff snapshot -i -agent-device find "Sign In" click -agent-device press @e1 -agent-device fill @e2 "text" -agent-device is visible 'id="anchor"' -``` - -`press` is canonical tap command; `click` is an alias. -On macOS, use `click --button secondary <@ref|selector>` to open a context menu before the next `snapshot -i`. -For desktop-specific heuristics and Finder guidance, see [references/macos-desktop.md](references/macos-desktop.md). - -### Utilities - -```bash -agent-device appstate -agent-device clipboard read -agent-device clipboard write "token" -agent-device keyboard status -agent-device keyboard dismiss -agent-device perf --json -agent-device network dump [limit] [summary|headers|body|all] -agent-device push -agent-device trigger-app-event screenshot_taken '{"source":"qa"}' -agent-device get text @e1 -agent-device screenshot out.png -agent-device settings permission grant notifications -agent-device settings permission reset camera -agent-device settings permission grant accessibility --platform macos -agent-device settings permission reset screen-recording --platform macos -agent-device trace start -agent-device trace stop ./trace.log -``` - -### Batch (when sequence is already known) - -```bash -agent-device batch --steps-file /tmp/batch-steps.json --json -``` - -### Performance Check - -- Use `agent-device perf --json` (or `metrics --json`) after `open`. -- For detailed metric semantics, caveats, and interpretation guidance, see [references/perf-metrics.md](references/perf-metrics.md). - -## Guardrails (High Value Only) - -- Re-snapshot after UI mutations (navigation/modal/list changes). -- Prefer `snapshot -i`; scope/depth only when needed. -- Use refs for discovery, selectors for replay/assertions. -- `find "" click --json` returns `{ ref, locator, query, x, y }` — all derived from the matched snapshot node. 
Do not rely on these fields from raw `press`/`click` responses for observability; use `find` instead. -- Use `fill` for clear-then-type semantics; use `type` for focused append typing. -- Use `install` for in-place app upgrades (keep app data when platform permits), and `reinstall` for deterministic fresh-state runs. -- App binary format support for `install`/`reinstall`: Android `.apk`/`.aab`, iOS `.app`/`.ipa`. -- Android `.aab` requires `bundletool` in `PATH`, or `AGENT_DEVICE_BUNDLETOOL_JAR=` with `java` in `PATH`. -- Android `.aab` optional: set `AGENT_DEVICE_ANDROID_BUNDLETOOL_MODE=` to control bundletool `build-apks --mode` (default: `universal`). -- iOS `.ipa`: extract/install from `Payload/*.app`; when multiple app bundles are present, `` is used as a bundle id/name hint. -- iOS `appstate` is session-scoped; Android `appstate` is live foreground state. iOS responses include `device_udid` and `ios_simulator_device_set` for isolation verification. -- iOS `open` responses include `device_udid` and `ios_simulator_device_set` to confirm which simulator handled the session. -- Clipboard helpers: `clipboard read` / `clipboard write ` are supported on macOS, Android, and iOS simulators; iOS physical devices are not supported yet. -- Android keyboard helpers: `keyboard status|get|dismiss` report keyboard visibility/type and dismiss via keyevent when visible. -- `network dump` is best-effort and parses HTTP(s) entries from the session app log file. -- Biometric settings: iOS simulator supports `settings faceid|touchid `; Android supports `settings fingerprint ` where runtime tooling is available. -- For AndroidTV/tvOS selection, always pair `--target` with `--platform` (`ios`, `android`, or `apple` alias); target-only selection is invalid. -- `push` simulates notification delivery: - - iOS simulator uses APNs-style payload JSON. - - Android uses broadcast action + typed extras (string/boolean/number). 
-- `trigger-app-event` requires app-defined deep-link hooks and URL template configuration (`AGENT_DEVICE_APP_EVENT_URL_TEMPLATE` or platform-specific variants). -- On macOS, set `AGENT_DEVICE_MACOS_APP_EVENT_URL_TEMPLATE` when the desktop app uses a different deep-link template than iOS/Android. -- `trigger-app-event` requires an active session or explicit selectors (`--platform`, `--device`, `--udid`, `--serial`); on iOS physical devices, custom-scheme triggers require active app context. -- Canonical trigger behavior and caveats are documented in [`website/docs/docs/commands.md`](../../website/docs/docs/commands.md) under **App event triggers**. -- Permission settings are app-scoped on iOS/Android and require an active session app: - `settings permission [full|limited]` -- On macOS, use: - `settings permission ` -- macOS permission helpers check/request access and guide the user to System Settings when manual approval is required. -- iOS simulator permission alerts: use `alert wait` then `alert accept/dismiss` — `accept`/`dismiss` retry internally for up to 2 s so you do not need manual sleeps. See [references/permissions.md](references/permissions.md). -- `full|limited` mode applies only to iOS `photos`; other targets reject mode. -- On Android, non-ASCII `fill/type` may require an ADB keyboard IME on some system images; only install IME APKs from trusted sources and verify checksum/signature. -- If using `--save-script`, prefer explicit path syntax (`--save-script=flow.ad` or `./flow.ad`). -- For tenant-isolated remote runs, always pass `--tenant`, `--session-isolation tenant`, `--run-id`, and `--lease-id` together. -- Use short lease TTLs and heartbeat only while work is active; release leases immediately after run completion/failure. -- Env equivalents for scoped runs: `AGENT_DEVICE_IOS_SIMULATOR_DEVICE_SET` (compat `IOS_SIMULATOR_DEVICE_SET`) and - `AGENT_DEVICE_ANDROID_DEVICE_ALLOWLIST` (compat `ANDROID_DEVICE_ALLOWLIST`). 
-- For explicit remote client mode, prefer `AGENT_DEVICE_DAEMON_BASE_URL` / `--daemon-base-url` instead of relying on local daemon metadata or loopback-only ports. - -## Common Failure Patterns - -- `Failed to access Android app sandbox for /path/app-debug.apk`: Android relaunch/runtime-hint flow received an APK path instead of an installed package name. Use `reinstall` first, then `open --relaunch`. -- `mkdir: Needs 1 argument` while writing `ReactNativeDevPrefs.xml`: likely an older `agent-device` build or stale global install is still using the shell-based Android runtime-hint writer. Verify the exact binary being invoked. -- `Failed to terminate iOS app`: the flow may have selected a physical iPhone or an unavailable iOS target. Re-run with `ensure-simulator`, then pin the simulator with `--device` or `--udid`. - -## Security and Trust Notes - -- Prefer a preinstalled `agent-device` binary over on-demand package execution. -- If install is required, pin an exact version (for example: `npx --yes agent-device@ --help`). -- Signing/provisioning environment variables are optional, sensitive, and only for iOS physical-device setup. -- Logs/artifacts are written under `~/.agent-device`; replay scripts write to explicit paths you provide. -- For remote daemon mode, prefer `AGENT_DEVICE_DAEMON_SERVER_MODE=http|dual` on the host plus client-side `AGENT_DEVICE_DAEMON_BASE_URL`, with `AGENT_DEVICE_HTTP_AUTH_HOOK` and tenant-scoped lease admission where needed. -- Keep logging off unless debugging and use least-privilege/isolated environments for autonomous runs. - -## Common Mistakes - -- Mixing debug flow into normal runs (keep logs off unless debugging). -- Continuing to use stale refs after screen transitions. -- Using URL opens with Android `--activity` (unsupported combination). -- Treating `boot` as default first step instead of fallback. 
+- Prefer `snapshot -i`; use `--raw` only for structure debugging +- Use plain `snapshot` to verify text visibility; use `snapshot -i` mainly for interactive exploration and choosing refs +- Use refs for discovery, selectors for replay/assertions +- `fill` clears then types; `type` only types into the focused field +- `network dump` is best-effort and reads from the session app log +- `logs clear --restart` requires an active app session +- On macOS, helper-backed flows cover permissions, alerts, and desktop-global snapshot surfaces +- On macOS, do not assume `desktop` or `menubar` is the best surface for real interactions yet ## References -- [references/snapshot-refs.md](references/snapshot-refs.md) - [references/macos-desktop.md](references/macos-desktop.md) +- [references/snapshot-refs.md](references/snapshot-refs.md) - [references/logs-and-debug.md](references/logs-and-debug.md) - [references/session-management.md](references/session-management.md) - [references/permissions.md](references/permissions.md) diff --git a/skills/agent-device/references/macos-desktop.md b/skills/agent-device/references/macos-desktop.md index b0cead1d..67f7ab60 100644 --- a/skills/agent-device/references/macos-desktop.md +++ b/skills/agent-device/references/macos-desktop.md @@ -2,14 +2,22 @@ Use this reference for host Mac apps such as Finder, TextEdit, System Settings, Preview, or browser apps running as normal desktop windows. +## Start here + +- Use `open <app> --platform macos` when you need to act inside one app. +- Use `open --platform macos --surface frontmost-app|desktop|menubar` when you need to inspect desktop-global UI first. +- Use `app` sessions for `click`, `fill`, `press`, `scroll`, `screenshot`, and `record`. +- Use `frontmost-app`, `desktop`, and `menubar` mainly for `snapshot`, `get`, `is`, and `wait`. +- Prefer `@ref` or selectors. Avoid raw coordinates unless there is no better target.
+ ## Mental model -- `snapshot -i` should describe UI that is visible to a human in the current front window. +- `snapshot -i` should describe UI visible to a human. - Context menus are not ambient UI. Open them explicitly with `click --button secondary`, then re-snapshot. - Prefer refs for exploration and selectors for deterministic replay/assertions. -- Avoid raw `x y` coordinates unless refs/selectors are impossible. +- If you inspect with `desktop` or `menubar` and then need to act on one app, switch to a normal `app` session. -## Canonical flow +## Canonical app flow ```bash agent-device open Finder --platform macos @@ -19,18 +27,36 @@ agent-device snapshot -i agent-device close ``` +## Canonical desktop-global flow + +```bash +agent-device open --platform macos --surface desktop +agent-device snapshot -i +agent-device get attrs @e4 +agent-device is visible 'role="window" label="Finder"' +agent-device close +``` + Surface variants: ```bash agent-device open --platform macos --surface frontmost-app +agent-device open --platform macos --surface desktop +agent-device open --platform macos --surface menubar ``` -- `frontmost-app` tracks the currently focused app explicitly in the session. -- `desktop` and `menubar` remain planned phase-2 surfaces for broader desktop-global automation work. +- `app`: default session surface; use this for most real interaction work. +- `frontmost-app`: inspect the currently focused app without naming it first. +- `desktop`: inspect visible desktop windows across apps. +- `menubar`: inspect the active app menu bar and system menu extras. + +Use `frontmost-app`, `desktop`, and `menubar` for read/inspect flows first. If the next step is a click/fill/press/scroll in one app, switch back to `app`. ## What to expect from snapshots -- `snapshot -i` prioritizes visible window content over dormant menu infrastructure. +- `app` snapshots should focus on the chosen app window. 
+- `desktop` snapshots can contain multiple windows from multiple apps. +- `menubar` snapshots can contain both app-menu items and system menu extras. - File rows, sidebar items, toolbar controls, search fields, and visible context menus should appear. - Finder and other native apps may expose duplicate-looking structures such as row wrapper nodes, `cell` nodes, and child `text` or `text-field` nodes. - Treat those as distinct AX nodes unless you have a stronger selector anchor. @@ -53,6 +79,8 @@ Expected pattern: Do not expect context-menu items to appear before the menu is opened. +Do not use `longpress` as a substitute for right-click on macOS. + ## Finder-specific guidance - `snapshot -i` should still expose visible folder rows even when nothing is selected. @@ -81,18 +109,21 @@ Good macOS selectors usually anchor on one of: - `label="failed-step.json"` - `role=button label="Search"` - `role=menu-item label="Rename"` +- `role=window label="Notes"` Prefer exact labels when the desktop UI is stable. Use `id=...` when the AX identifier is clearly app-owned and not a framework-generated `_NS:*` value. ## Things not to rely on - Mobile-only helpers like `install`, `reinstall`, and `push` -- Long-press as a substitute for right-click +- Desktop-global click/fill parity from `desktop` or `menubar` sessions - Raw coordinate assumptions across runs; macOS windows can move - Framework-generated `_NS:*` identifiers as stable selectors ## Troubleshooting - If visible window content is missing from `snapshot -i`, re-snapshot once after the UI settles. +- If `desktop` is too broad, retry with `frontmost-app` to narrow the inspect surface. +- If `menubar` is missing the menu you expect, make the target app frontmost first, then re-open the `menubar` surface and snapshot again. - If the wrong menu opened or no menu appeared, retry secondary-clicking the row/cell wrapper instead of the nested text node. 
- If the app has multiple windows, ensure the correct one is frontmost before relying on refs. diff --git a/skills/agent-device/references/snapshot-refs.md b/skills/agent-device/references/snapshot-refs.md index eba0d11e..f1adf1c4 100644 --- a/skills/agent-device/references/snapshot-refs.md +++ b/skills/agent-device/references/snapshot-refs.md @@ -12,6 +12,9 @@ For host Mac desktop apps, pair this reference with [macos-desktop.md](macos-des agent-device snapshot -i ``` +Use plain `snapshot` when you need to verify whether text is visible on screen. +Use `snapshot -i` mainly for interactive exploration and choosing refs. + Output: ``` diff --git a/src/core/session-surface.ts b/src/core/session-surface.ts index 3ebdd98d..0f9429d6 100644 --- a/src/core/session-surface.ts +++ b/src/core/session-surface.ts @@ -9,12 +9,6 @@ export const SESSION_SURFACES: readonly SessionSurface[] = [ 'menubar', ]; -export const PHASE1_MACOS_SESSION_SURFACES: readonly SessionSurface[] = ['app', 'frontmost-app']; - -export function isPhase1MacOsSessionSurface(surface: SessionSurface): boolean { - return PHASE1_MACOS_SESSION_SURFACES.includes(surface); -} - export function parseSessionSurface(value: string | undefined): SessionSurface { const normalized = value?.trim().toLowerCase(); if ( diff --git a/src/daemon/handlers/__tests__/find.test.ts b/src/daemon/handlers/__tests__/find.test.ts index a6b11025..2925fb81 100644 --- a/src/daemon/handlers/__tests__/find.test.ts +++ b/src/daemon/handlers/__tests__/find.test.ts @@ -8,6 +8,7 @@ import { AppError } from '../../../utils/errors.ts'; import { SessionStore } from '../../session-store.ts'; import type { SessionState } from '../../types.ts'; import type { DaemonRequest } from '../../types.ts'; +import { withMockedMacOsHelper } from '../../../platforms/ios/__tests__/macos-helper-test-utils.ts'; function makeSessionStore(): SessionStore { const root = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-find-handler-')); @@ -29,6 +30,22 @@ 
function makeSession(name: string): SessionState { }; } +function makeMacOsSession(name: string): SessionState { + return { + name, + device: { + platform: 'macos', + id: 'macos-host', + name: 'Mac', + kind: 'device', + booted: true, + }, + createdAt: Date.now(), + actions: [], + surface: 'desktop', + }; +} + const INCREMENT_NODE = { type: 'Button', label: 'Increment', @@ -249,3 +266,55 @@ test('handleFindCommands click returns deterministic metadata across locator var }); } }); + +test('handleFindCommands uses helper-backed snapshots for macOS desktop sessions', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"Window","label":"Notes","surface":"desktop","rect":{"x":32,"y":48,"width":640,"height":480}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async ({ tmpDir }) => { + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-find'; + sessionStore.set(sessionName, makeMacOsSession(sessionName)); + let snapshotDispatchCalls = 0; + + try { + const response = await handleFindCommands({ + req: { + token: 't', + session: sessionName, + command: 'find', + positionals: ['label', 'Notes', 'get', 'attrs'], + flags: {}, + }, + sessionName, + logPath: '/tmp/test.log', + sessionStore, + invoke: async () => ({ ok: true }), + dispatch: async (_device, command) => { + if (command === 'snapshot') { + snapshotDispatchCalls += 1; + } + return {}; + }, + }); + + assert.equal(response?.ok, true); + assert.equal(snapshotDispatchCalls, 0); + const logged = await 
fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 'snapshot\n--surface\ndesktop\n'); + } finally { + if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + } + }, + ); +}); diff --git a/src/daemon/handlers/__tests__/interaction.test.ts b/src/daemon/handlers/__tests__/interaction.test.ts index 7b0000e3..29302e72 100644 --- a/src/daemon/handlers/__tests__/interaction.test.ts +++ b/src/daemon/handlers/__tests__/interaction.test.ts @@ -47,6 +47,29 @@ function makeAndroidSession(name: string): SessionState { }; } +function makeMacOsDesktopSession(name: string): SessionState { + return { + name, + device: { + platform: 'macos', + id: 'macos-host', + name: 'Mac', + kind: 'device', + booted: true, + }, + createdAt: Date.now(), + actions: [], + surface: 'desktop', + }; +} + +function makeMacOsMenubarSession(name: string): SessionState { + return { + ...makeMacOsDesktopSession(name), + surface: 'menubar', + }; +} + const contextFromFlags = (flags: CommandFlags | undefined) => ({ count: flags?.count, intervalMs: flags?.intervalMs, @@ -122,6 +145,62 @@ test('press coordinates dispatches press and records as press', async () => { assert.deepEqual(session?.actions[0]?.positionals, ['100', '200']); }); +test('click rejects macOS desktop surface interactions until helper routing exists', async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-click'; + sessionStore.set(sessionName, makeMacOsDesktopSession(sessionName)); + + const response = await handleInteractionCommands({ + req: { + token: 't', + session: sessionName, + command: 'click', + positionals: ['100', '200'], + flags: {}, + }, + sessionName, + sessionStore, + contextFromFlags, + dispatch: async () => { + throw new Error('dispatch should not be called'); + }, + }); + + assert.equal(response?.ok, false); + if (response && !response.ok) { + assert.equal(response.error.code, 
'UNSUPPORTED_OPERATION'); + assert.match(response.error.message, /macOS desktop sessions/); + } +}); + +test('fill rejects macOS menubar surface interactions until helper routing exists', async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-menubar-fill'; + sessionStore.set(sessionName, makeMacOsMenubarSession(sessionName)); + + const response = await handleInteractionCommands({ + req: { + token: 't', + session: sessionName, + command: 'fill', + positionals: ['@e2', 'hello'], + flags: {}, + }, + sessionName, + sessionStore, + contextFromFlags, + dispatch: async () => { + throw new Error('dispatch should not be called'); + }, + }); + + assert.equal(response?.ok, false); + if (response && !response.ok) { + assert.equal(response.error.code, 'UNSUPPORTED_OPERATION'); + assert.match(response.error.message, /macOS menubar sessions/); + } +}); + test('press coordinates appends touch-visualization events while recording', async () => { const sessionStore = makeSessionStore(); const sessionName = 'default'; diff --git a/src/daemon/handlers/__tests__/session.test.ts b/src/daemon/handlers/__tests__/session.test.ts index 50396802..5d083871 100644 --- a/src/daemon/handlers/__tests__/session.test.ts +++ b/src/daemon/handlers/__tests__/session.test.ts @@ -2576,7 +2576,7 @@ test('open on existing macOS frontmost-app session preserves surface without --s ); }); -test('open on macOS rejects desktop surface until desktop-global backend lands', async () => { +test('open on macOS stores desktop surface without app context', async () => { const sessionStore = makeSessionStore(); const sessionName = 'macos-desktop-surface'; const response = await handleSessionCommands({ @@ -2606,11 +2606,53 @@ test('open on macOS rejects desktop surface until desktop-global backend lands', }), }); - assert.equal(response?.ok, false); - if (response && !response.ok) { - assert.equal(response.error.code, 'INVALID_ARGS'); - assert.match(response.error.message, /not supported 
yet/i); - assert.match(response.error.message, /app\|frontmost-app/i); + assert.equal(response?.ok, true); + const session = sessionStore.get(sessionName); + assert.equal(session?.surface, 'desktop'); + assert.equal(session?.appBundleId, undefined); + assert.equal(session?.appName, undefined); + if (response && response.ok) { + assert.equal(response.data?.surface, 'desktop'); + assert.equal(response.data?.appBundleId, undefined); + } +}); + +test('open on macOS stores menubar surface without app context', async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-menubar-surface'; + const response = await handleSessionCommands({ + req: { + token: 't', + session: sessionName, + command: 'open', + positionals: [], + flags: { + platform: 'macos', + surface: 'menubar', + }, + }, + sessionName, + logPath: path.join(os.tmpdir(), 'daemon.log'), + sessionStore, + invoke: noopInvoke, + dispatch: async () => ({}), + ensureReady: async () => {}, + resolveTargetDevice: async () => ({ + platform: 'macos', + id: 'host-macos-local', + name: 'Host Mac', + kind: 'device', + target: 'desktop', + booted: true, + }), + }); + + assert.equal(response?.ok, true); + const session = sessionStore.get(sessionName); + assert.equal(session?.surface, 'menubar'); + assert.equal(session?.appBundleId, undefined); + if (response && response.ok) { + assert.equal(response.data?.surface, 'menubar'); } }); diff --git a/src/daemon/handlers/__tests__/snapshot-handler.test.ts b/src/daemon/handlers/__tests__/snapshot-handler.test.ts index c9ee315b..a53f2cf4 100644 --- a/src/daemon/handlers/__tests__/snapshot-handler.test.ts +++ b/src/daemon/handlers/__tests__/snapshot-handler.test.ts @@ -159,6 +159,246 @@ test('settings on macOS returns helper-backed permission status', async () => { ); }); +test('snapshot on macOS desktop surface uses helper-backed surface snapshot', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > 
"$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"Window","label":"Notes","surface":"desktop","bundleId":"com.apple.Notes","appName":"Notes","windowTitle":"Notes","rect":{"x":32,"y":48,"width":640,"height":480}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-snapshot'; + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-desktop-snapshot-')); + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'desktop', + }); + + try { + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const logged = await fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 'snapshot\n--surface\ndesktop\n'); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + assert.equal(updated?.snapshot?.nodes[0]?.label, 'Desktop'); + assert.equal(updated?.snapshot?.nodes[1]?.windowTitle, 'Notes'); + } finally { + if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, + ); +}); + +test('snapshot on macOS desktop surface applies scope and depth after helper capture', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + "cat 
<<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"Application","label":"Notes","surface":"desktop","bundleId":"com.apple.Notes","appName":"Notes"},{"index":2,"depth":2,"parentIndex":1,"type":"Window","label":"Notes","surface":"desktop","windowTitle":"Notes","rect":{"x":32,"y":48,"width":640,"height":480}},{"index":3,"depth":3,"parentIndex":2,"type":"StaticText","label":"Pinned","surface":"desktop","rect":{"x":40,"y":60,"width":80,"height":24}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-scoped-snapshot'; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'desktop', + }); + + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: { snapshotScope: 'Notes', snapshotDepth: 0 }, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + assert.equal(updated?.snapshot?.nodes.length, 1); + assert.equal(updated?.snapshot?.nodes[0]?.label, 'Notes'); + assert.equal(updated?.snapshot?.nodes[0]?.depth, 0); + assert.equal(updated?.snapshot?.nodes[0]?.parentIndex, undefined); + }, + ); +}); + +test('snapshot on macOS menubar surface uses helper-backed surface snapshot', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"menubar","nodes":[{"index":0,"depth":0,"type":"MenuBarSurface","label":"Menu 
Bar","surface":"menubar"},{"index":1,"depth":1,"parentIndex":0,"type":"MenuBarItem","label":"File","surface":"menubar"}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-menubar-snapshot'; + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-menubar-snapshot-')); + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'menubar', + }); + + try { + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const logged = await fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 'snapshot\n--surface\nmenubar\n'); + assert.equal(sessionStore.get(sessionName)?.snapshot?.nodes[1]?.label, 'File'); + } finally { + if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, + ); +}); + +test('snapshot on macOS frontmost-app surface uses helper-backed surface snapshot', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + 'printf "%s\\n" "$@" > "$AGENT_DEVICE_TEST_ARGS_FILE"', + "cat <<'JSON'", + 
'{"ok":true,"data":{"surface":"frontmost-app","nodes":[{"index":0,"depth":0,"type":"Application","label":"TextEdit","surface":"frontmost-app","bundleId":"com.apple.TextEdit","appName":"TextEdit"},{"index":1,"depth":1,"parentIndex":0,"type":"Window","label":"Untitled","surface":"frontmost-app","windowTitle":"Untitled","rect":{"x":32,"y":48,"width":640,"height":480}}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-frontmost-app-snapshot'; + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-frontmost-snapshot-')); + const argsLogPath = path.join(tmpDir, 'args.log'); + const previousArgsFile = process.env.AGENT_DEVICE_TEST_ARGS_FILE; + process.env.AGENT_DEVICE_TEST_ARGS_FILE = argsLogPath; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'frontmost-app', + appBundleId: 'com.apple.systempreferences', + appName: 'System Settings', + }); + + try { + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'snapshot', + positionals: [], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + }); + + assert.equal(response?.ok, true); + const logged = await fs.promises.readFile(argsLogPath, 'utf8'); + assert.equal(logged, 'snapshot\n--surface\nfrontmost-app\n'); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + assert.equal(updated?.snapshot?.nodes[0]?.label, 'TextEdit'); + assert.equal(updated?.snapshot?.nodes[1]?.parentIndex, 0); + assert.equal(updated?.snapshot?.nodes[1]?.windowTitle, 'Untitled'); + } finally { + if (previousArgsFile === undefined) delete process.env.AGENT_DEVICE_TEST_ARGS_FILE; + else process.env.AGENT_DEVICE_TEST_ARGS_FILE = previousArgsFile; + fs.rmSync(tmpDir, { recursive: true, force: true }); + } + }, + ); +}); + +test('wait text on macOS 
desktop surface polls helper-backed snapshots instead of runner text search', async () => { + await withMockedMacOsHelper( + [ + '#!/bin/sh', + "cat <<'JSON'", + '{"ok":true,"data":{"surface":"desktop","nodes":[{"index":0,"depth":0,"type":"DesktopSurface","label":"Desktop","surface":"desktop"},{"index":1,"depth":1,"parentIndex":0,"type":"StaticText","label":"Accessibility","surface":"desktop"}],"truncated":false,"backend":"macos-helper"}}', + 'JSON', + '', + ].join('\n'), + async () => { + const sessionStore = makeSessionStore(); + const sessionName = 'macos-desktop-wait'; + sessionStore.set(sessionName, { + ...makeSession(sessionName, macOsDevice), + surface: 'desktop', + }); + + let runnerCalls = 0; + const response = await handleSnapshotCommands({ + req: { + token: 't', + session: sessionName, + command: 'wait', + positionals: ['Accessibility', '10'], + flags: {}, + }, + sessionName, + logPath: '/tmp/daemon.log', + sessionStore, + runnerCommand: async () => { + runnerCalls += 1; + return { found: false }; + }, + }); + + assert.equal(response?.ok, true); + assert.equal(runnerCalls, 0); + const updated = sessionStore.get(sessionName); + assert.equal(updated?.snapshot?.backend, 'macos-helper'); + }, + ); +}); + test('diff rejects unsupported kind', async () => { const sessionStore = makeSessionStore(); const response = await handleSnapshotCommands({ diff --git a/src/daemon/handlers/find.ts b/src/daemon/handlers/find.ts index 4c13eab1..7661b474 100644 --- a/src/daemon/handlers/find.ts +++ b/src/daemon/handlers/find.ts @@ -1,22 +1,14 @@ import { dispatchCommand, resolveTargetDevice } from '../../core/dispatch.ts'; import { findBestMatchesByLocator, type FindLocator } from '../../utils/finders.ts'; -import { - attachRefs, - centerOfRect, - type RawSnapshotNode, - type SnapshotState, -} from '../../utils/snapshot.ts'; +import { centerOfRect, type SnapshotState } from '../../utils/snapshot.ts'; import { AppError } from '../../utils/errors.ts'; import type { 
DaemonRequest, DaemonResponse } from '../types.ts'; import { SessionStore } from '../session-store.ts'; import { contextFromFlags } from '../context.ts'; import { ensureDeviceReady } from '../device-ready.ts'; -import { - extractNodeText, - findNearestHittableAncestor, - pruneGroupNodes, -} from '../snapshot-processing.ts'; +import { extractNodeText, findNearestHittableAncestor } from '../snapshot-processing.ts'; import { parseTimeout } from './parse-utils.ts'; +import { captureSnapshot } from './snapshot-capture.ts'; export async function handleFindCommands(params: { req: DaemonRequest; @@ -55,7 +47,6 @@ export async function handleFindCommands(params: { if (!session) { await ensureDeviceReady(device); } - const appBundleId = session?.appBundleId; const scope = shouldScopeFind(locator) ? query : undefined; const requiresRect = action === 'click' || action === 'focus' || action === 'fill' || action === 'type'; @@ -71,38 +62,27 @@ export async function handleFindCommands(params: { if (lastNodes && now - lastSnapshotAt < 750) { return { nodes: lastNodes }; } - const data = (await dispatch(device, 'snapshot', [], req.flags?.out, { - ...contextFromFlags( - logPath, - { - ...req.flags, - snapshotScope: scope, - snapshotInteractiveOnly: interactiveOnly, - snapshotCompact: interactiveOnly, - }, - appBundleId, - session?.trace?.outPath, - ), - })) as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android'; - }; - const rawNodes = data?.nodes ?? []; - const nodes = attachRefs(req.flags?.snapshotRaw ? 
rawNodes : pruneGroupNodes(rawNodes)); + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand: dispatch, + device, + session, + flags: { + ...req.flags, + snapshotInteractiveOnly: interactiveOnly, + snapshotCompact: interactiveOnly, + }, + outPath: req.flags?.out, + logPath, + snapshotScope: scope, + }); + const nodes = snapshot.nodes; lastSnapshotAt = now; lastNodes = nodes; if (session) { - const snapshot: SnapshotState = { - nodes, - truncated: data?.truncated, - createdAt: Date.now(), - backend: data?.backend, - }; session.snapshot = snapshot; sessionStore.set(sessionName, session); } - return { nodes, truncated: data?.truncated, backend: data?.backend }; + return { nodes, truncated: snapshot.truncated, backend: snapshot.backend }; }; if (action === 'wait') { const timeout = timeoutMs ?? 10000; diff --git a/src/daemon/handlers/interaction-snapshot.ts b/src/daemon/handlers/interaction-snapshot.ts index 093aae37..923fb686 100644 --- a/src/daemon/handlers/interaction-snapshot.ts +++ b/src/daemon/handlers/interaction-snapshot.ts @@ -1,10 +1,9 @@ import { dispatchCommand, type CommandFlags } from '../../core/dispatch.ts'; -import { attachRefs, type RawSnapshotNode } from '../../utils/snapshot.ts'; -import { pruneGroupNodes } from '../snapshot-processing.ts'; import type { SessionStore } from '../session-store.ts'; import type { SessionState } from '../types.ts'; import type { SnapshotState } from '../../utils/snapshot.ts'; import type { ContextFromFlags } from './interaction-common.ts'; +import { captureSnapshot } from './snapshot-capture.ts'; export async function captureSnapshotForSession( session: SessionState, @@ -14,29 +13,25 @@ export async function captureSnapshotForSession( options: { interactiveOnly: boolean }, dispatch: typeof dispatchCommand = dispatchCommand, ): Promise { - const data = (await dispatch(session.device, 'snapshot', [], flags?.out, { - ...contextFromFlags( - { - ...(flags ?? 
{}), - snapshotInteractiveOnly: options.interactiveOnly, - snapshotCompact: options.interactiveOnly, - }, - session.appBundleId, - session.trace?.outPath, - ), - })) as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android'; - }; - const rawNodes = data?.nodes ?? []; - const nodes = attachRefs(flags?.snapshotRaw ? rawNodes : pruneGroupNodes(rawNodes)); - session.snapshot = { - nodes, - truncated: data?.truncated, - createdAt: Date.now(), - backend: data?.backend, + const effectiveFlags = { + ...(flags ?? {}), + snapshotInteractiveOnly: options.interactiveOnly, + snapshotCompact: options.interactiveOnly, }; + const dispatchContext = contextFromFlags( + effectiveFlags, + session.appBundleId, + session.trace?.outPath, + ); + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand: dispatch, + device: session.device, + session, + flags: effectiveFlags, + outPath: effectiveFlags.out, + logPath: dispatchContext.logPath ?? '', + }); + session.snapshot = snapshot; sessionStore.set(session.name, session); return session.snapshot; } diff --git a/src/daemon/handlers/interaction-touch.ts b/src/daemon/handlers/interaction-touch.ts index de74c1f9..21b1500d 100644 --- a/src/daemon/handlers/interaction-touch.ts +++ b/src/daemon/handlers/interaction-touch.ts @@ -36,7 +36,7 @@ type CaptureSnapshotForSession = ( nodes: SnapshotNode[]; truncated?: boolean; createdAt: number; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }>; type ResolveRefTarget = @@ -90,6 +90,13 @@ export async function handleTouchInteractionCommands(params: { error: { code: 'SESSION_NOT_FOUND', message: 'No active session. Run open first.' 
}, }; } + const unsupportedSurfaceResponse = unsupportedMacOsDesktopSurfaceInteraction( + session, + commandLabel, + ); + if (unsupportedSurfaceResponse) { + return unsupportedSurfaceResponse; + } if (!isCommandSupportedOnDevice('press', session.device)) { return { ok: false, @@ -300,6 +307,15 @@ export async function handleTouchInteractionCommands(params: { if (command === 'fill') { const session = sessionStore.get(sessionName); + if (session) { + const unsupportedSurfaceResponse = unsupportedMacOsDesktopSurfaceInteraction( + session, + command, + ); + if (unsupportedSurfaceResponse) { + return unsupportedSurfaceResponse; + } + } if (session && !isCommandSupportedOnDevice('fill', session.device)) { return { ok: false, @@ -491,6 +507,25 @@ export async function handleTouchInteractionCommands(params: { return null; } +function unsupportedMacOsDesktopSurfaceInteraction( + session: SessionState, + command: 'click' | 'press' | 'fill', +): DaemonResponse | null { + if (session.device.platform !== 'macos') { + return null; + } + if (session.surface !== 'desktop' && session.surface !== 'menubar') { + return null; + } + return { + ok: false, + error: { + code: 'UNSUPPORTED_OPERATION', + message: `${command} is not supported on macOS ${session.surface} sessions yet. 
Open an app session to act, or use the ${session.surface} surface to inspect.`, + }, + }; +} + function parseCoordinateTarget(positionals: string[]): { x: number; y: number } | null { if (positionals.length < 2) return null; const x = Number(positionals[0]); diff --git a/src/daemon/handlers/session-open.ts b/src/daemon/handlers/session-open.ts index 4f31dccb..4885dba6 100644 --- a/src/daemon/handlers/session-open.ts +++ b/src/daemon/handlers/session-open.ts @@ -1,10 +1,6 @@ import { dispatchCommand, resolveTargetDevice } from '../../core/dispatch.ts'; import { isDeepLinkTarget } from '../../core/open-target.ts'; -import { - isPhase1MacOsSessionSurface, - parseSessionSurface, - type SessionSurface, -} from '../../core/session-surface.ts'; +import { parseSessionSurface, type SessionSurface } from '../../core/session-surface.ts'; import { ensureDeviceReady } from '../device-ready.ts'; import { contextFromFlags } from '../context.ts'; import { resolveFrontmostMacOsApp } from '../../platforms/ios/macos-helper.ts'; @@ -117,12 +113,6 @@ function resolveOpenSurface( return 'app'; } const surface = surfaceFlag ? parseSessionSurface(surfaceFlag) : 'app'; - if (!isPhase1MacOsSessionSurface(surface)) { - throw new AppError( - 'INVALID_ARGS', - `open --surface ${surface} is planned but not supported yet. 
Use app|frontmost-app for now.`, - ); - } if (surface !== 'app' && openTarget) { throw new AppError('INVALID_ARGS', `open --surface ${surface} does not accept an app target`); } @@ -148,12 +138,9 @@ function resolveRequestedOpenSurface(params: { async function resolveMacOsSurfaceAppState( surface: SessionSurface, ): Promise<{ appBundleId?: string; appName?: string }> { - if (surface === 'app') { + if (surface === 'app' || surface === 'desktop' || surface === 'menubar') { return {}; } - if (surface !== 'frontmost-app') { - throw new AppError('INVALID_ARGS', `open --surface ${surface} is not supported in phase 1`); - } const frontmost = await resolveFrontmostMacOsApp(); return { appBundleId: frontmost.bundleId, diff --git a/src/daemon/handlers/session.ts b/src/daemon/handlers/session.ts index a72dc810..2518ea9a 100644 --- a/src/daemon/handlers/session.ts +++ b/src/daemon/handlers/session.ts @@ -1454,7 +1454,7 @@ async function captureSnapshotForReplay( })) as { nodes?: RawSnapshotNode[]; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }; const rawNodes = data?.nodes ?? []; const nodes = attachRefs(action.flags?.snapshotRaw ? 
rawNodes : pruneGroupNodes(rawNodes)); diff --git a/src/daemon/handlers/snapshot-capture.ts b/src/daemon/handlers/snapshot-capture.ts index a25af209..2f7b703d 100644 --- a/src/daemon/handlers/snapshot-capture.ts +++ b/src/daemon/handlers/snapshot-capture.ts @@ -1,4 +1,5 @@ -import { dispatchCommand } from '../../core/dispatch.ts'; +import { dispatchCommand, type CommandFlags } from '../../core/dispatch.ts'; +import { runMacOsSnapshotAction } from '../../platforms/ios/macos-helper.ts'; import { attachRefs, findNodeByRef, @@ -6,43 +7,59 @@ import { type RawSnapshotNode, type SnapshotState, } from '../../utils/snapshot.ts'; -import type { DaemonResponse, DaemonRequest, SessionState } from '../types.ts'; +import type { DaemonResponse, SessionState } from '../types.ts'; import { contextFromFlags } from '../context.ts'; -import { pruneGroupNodes, resolveRefLabel } from '../snapshot-processing.ts'; +import { findNodeByLabel, pruneGroupNodes, resolveRefLabel } from '../snapshot-processing.ts'; type CaptureSnapshotParams = { dispatchSnapshotCommand: typeof dispatchCommand; device: SessionState['device']; session: SessionState | undefined; - req: DaemonRequest; + flags: CommandFlags | undefined; + outPath?: string; logPath: string; snapshotScope?: string; }; +type SnapshotData = { + nodes?: RawSnapshotNode[]; + truncated?: boolean; + backend?: 'xctest' | 'android' | 'macos-helper'; +}; + export async function captureSnapshot( params: CaptureSnapshotParams, ): Promise<{ snapshot: SnapshotState }> { - const { dispatchSnapshotCommand, device, session, req, logPath, snapshotScope } = params; - const data = (await dispatchSnapshotCommand(device, 'snapshot', [], req.flags?.out, { + const data = await captureSnapshotData(params); + return { snapshot: buildSnapshotState(data, params.flags?.snapshotRaw) }; +} + +export async function captureSnapshotData(params: CaptureSnapshotParams): Promise { + const { dispatchSnapshotCommand, device, session, flags, outPath, logPath, snapshotScope 
} = + params; + if (device.platform === 'macos' && session?.surface && session.surface !== 'app') { + const helperSnapshot = await runMacOsSnapshotAction(session.surface); + return shapeMacOsSurfaceSnapshot(helperSnapshot, { + snapshotDepth: flags?.snapshotDepth, + snapshotInteractiveOnly: flags?.snapshotInteractiveOnly, + snapshotScope, + }); + } + return (await dispatchSnapshotCommand(device, 'snapshot', [], outPath, { ...contextFromFlags( logPath, - { ...req.flags, snapshotScope }, + { ...flags, snapshotScope }, session?.appBundleId, session?.trace?.outPath, ), - })) as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android'; - }; - return { snapshot: buildSnapshotState(data, req.flags?.snapshotRaw) }; + })) as SnapshotData; } export function buildSnapshotState( data: { nodes?: RawSnapshotNode[]; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }, snapshotRaw: boolean | undefined, ): SnapshotState { @@ -56,6 +73,108 @@ export function buildSnapshotState( }; } +function shapeMacOsSurfaceSnapshot( + data: SnapshotData, + options: { + snapshotDepth?: number; + snapshotInteractiveOnly?: boolean; + snapshotScope?: string; + }, +): SnapshotData { + let nodes = data.nodes ?? 
[]; + if (options.snapshotScope) { + nodes = scopeSnapshotNodes(nodes, options.snapshotScope); + } + if (options.snapshotInteractiveOnly) { + nodes = filterInteractiveSnapshotNodes(nodes); + } + if (typeof options.snapshotDepth === 'number') { + nodes = filterSnapshotNodesByDepth(nodes, options.snapshotDepth); + } + return { ...data, nodes }; +} + +function scopeSnapshotNodes(nodes: RawSnapshotNode[], scope: string): RawSnapshotNode[] { + const scopedNodes = attachRefs(nodes); + const match = findNodeByLabel(scopedNodes, scope); + if (!match) { + return []; + } + const startIndex = nodes.findIndex((node) => node.index === match.index); + if (startIndex === -1) { + return []; + } + const startDepth = nodes[startIndex]?.depth ?? 0; + const slice: RawSnapshotNode[] = []; + for (let index = startIndex; index < nodes.length; index += 1) { + const node = nodes[index]; + if (!node) continue; + const depth = node.depth ?? 0; + if (index > startIndex && depth <= startDepth) { + break; + } + slice.push(node); + } + return reindexSnapshotNodes(slice, startDepth); +} + +function filterInteractiveSnapshotNodes(nodes: RawSnapshotNode[]): RawSnapshotNode[] { + if (nodes.length === 0) { + return nodes; + } + const byIndex = new Map(); + for (const node of nodes) { + byIndex.set(node.index, node); + } + const keepIndexes = new Set(); + for (const node of nodes) { + if (!isInteractiveSnapshotNode(node)) continue; + let current: RawSnapshotNode | undefined = node; + while (current) { + if (keepIndexes.has(current.index)) break; + keepIndexes.add(current.index); + current = + typeof current.parentIndex === 'number' ? byIndex.get(current.parentIndex) : undefined; + } + } + if (keepIndexes.size === 0) { + return nodes; + } + return reindexSnapshotNodes(nodes.filter((node) => keepIndexes.has(node.index))); +} + +function filterSnapshotNodesByDepth(nodes: RawSnapshotNode[], maxDepth: number): RawSnapshotNode[] { + return reindexSnapshotNodes(nodes.filter((node) => (node.depth ?? 
0) <= maxDepth)); +} + +function reindexSnapshotNodes(nodes: RawSnapshotNode[], depthOffset = 0): RawSnapshotNode[] { + const indexMap = new Map(); + for (const [index, node] of nodes.entries()) { + indexMap.set(node.index, index); + } + return nodes.map((node, index) => ({ + ...node, + index, + depth: Math.max(0, (node.depth ?? 0) - depthOffset), + parentIndex: typeof node.parentIndex === 'number' ? indexMap.get(node.parentIndex) : undefined, + })); +} + +function isInteractiveSnapshotNode(node: RawSnapshotNode): boolean { + if (node.hittable) return true; + if (node.rect) return true; + const role = `${node.type ?? ''} ${node.role ?? ''} ${node.subrole ?? ''}`.toLowerCase(); + return ( + role.includes('button') || + role.includes('menu') || + role.includes('textfield') || + role.includes('searchfield') || + role.includes('checkbox') || + role.includes('radio') || + role.includes('switch') + ); +} + export function resolveSnapshotScope( snapshotScope: string | undefined, session: SessionState | undefined, diff --git a/src/daemon/handlers/snapshot-wait.ts b/src/daemon/handlers/snapshot-wait.ts index 7bfc427c..b21edf4e 100644 --- a/src/daemon/handlers/snapshot-wait.ts +++ b/src/daemon/handlers/snapshot-wait.ts @@ -3,13 +3,7 @@ import { dispatchCommand } from '../../core/dispatch.ts'; import { runIosRunnerCommand } from '../../platforms/ios/runner-client.ts'; import { snapshotAndroid } from '../../platforms/android/index.ts'; import { isApplePlatform } from '../../utils/device.ts'; -import { - attachRefs, - findNodeByRef, - normalizeRef, - type RawSnapshotNode, -} from '../../utils/snapshot.ts'; -import { contextFromFlags } from '../context.ts'; +import { attachRefs, findNodeByRef, normalizeRef } from '../../utils/snapshot.ts'; import { findNodeByLabel, resolveRefLabel } from '../snapshot-processing.ts'; import { SessionStore } from '../session-store.ts'; import { @@ -19,7 +13,7 @@ import { type SelectorChain, } from '../selectors.ts'; import type { DaemonRequest, 
DaemonResponse, SessionState } from '../types.ts'; -import { buildSnapshotState } from './snapshot-capture.ts'; +import { captureSnapshot } from './snapshot-capture.ts'; import { recordIfSession } from './snapshot-session.ts'; import { DEFAULT_TIMEOUT_MS, parseTimeout, POLL_INTERVAL_MS } from './parse-utils.ts'; @@ -118,6 +112,7 @@ export async function handleWaitCommand(params: HandleWaitCommandParams): Promis const textResult = resolveWaitText(parsed, session); if (!textResult.ok) return textResult.response; return await waitForText({ + dispatchSnapshotCommand, device, logPath, req, @@ -152,46 +147,23 @@ async function waitForSelector(params: { const timeout = parsed.timeoutMs ?? DEFAULT_TIMEOUT_MS; const start = Date.now(); while (Date.now() - start < timeout) { - const data = await dispatchSnapshotCommand(device, 'snapshot', [], req.flags?.out, { - ...contextFromFlags( - logPath, - { - ...req.flags, - snapshotInteractiveOnly: false, - snapshotCompact: false, - }, - session?.appBundleId, - session?.trace?.outPath, - ), + const snapshot = await captureWaitSnapshot({ + dispatchSnapshotCommand, + device, + logPath, + req, + session, + sessionName, + sessionStore, }); - const snapshot = buildSnapshotState( - data as { - nodes?: RawSnapshotNode[]; - truncated?: boolean; - backend?: 'xctest' | 'android'; - }, - req.flags?.snapshotRaw, - ); - const nodes = snapshot.nodes; - if (session) { - session.snapshot = snapshot; - sessionStore.set(sessionName, session); - } - const match = findSelectorChainMatch(nodes, parsed.selector, { + const match = findSelectorChainMatch(snapshot.nodes, parsed.selector, { platform: device.platform, }); if (match) { - recordIfSession(sessionStore, session, req, { + return waitSuccess(sessionStore, session, req, { selector: match.selector.raw, waitedMs: Date.now() - start, }); - return { - ok: true, - data: { - selector: match.selector.raw, - waitedMs: Date.now() - start, - }, - }; } await new Promise((resolve) => setTimeout(resolve, 
POLL_INTERVAL_MS)); } @@ -258,6 +230,7 @@ function resolveWaitText( } async function waitForText(params: { + dispatchSnapshotCommand: typeof dispatchCommand; device: SessionState['device']; logPath: string; req: DaemonRequest; @@ -271,7 +244,23 @@ async function waitForText(params: { const timeout = timeoutMs ?? DEFAULT_TIMEOUT_MS; const start = Date.now(); while (Date.now() - start < timeout) { - if (isApplePlatform(device.platform)) { + if (device.platform === 'macos' && session?.surface && session.surface !== 'app') { + const snapshot = await captureWaitSnapshot({ + dispatchSnapshotCommand: params.dispatchSnapshotCommand, + device, + logPath, + req, + session, + sessionName: session?.name ?? req.session ?? 'default', + sessionStore, + }); + if (findNodeByLabel(snapshot.nodes, text)) { + return waitSuccess(sessionStore, session, req, { + text, + waitedMs: Date.now() - start, + }); + } + } else if (isApplePlatform(device.platform)) { const result = (await runnerCommand( device, { command: 'findText', text, appBundleId: session?.appBundleId }, @@ -300,3 +289,43 @@ async function waitForText(params: { error: { code: 'COMMAND_FAILED', message: `wait timed out for text: ${text}` }, }; } + +async function captureWaitSnapshot(params: { + dispatchSnapshotCommand: typeof dispatchCommand; + device: SessionState['device']; + logPath: string; + req: DaemonRequest; + session: SessionState | undefined; + sessionName: string; + sessionStore: SessionStore; +}): Promise { + const { dispatchSnapshotCommand, device, logPath, req, session, sessionName, sessionStore } = + params; + const { snapshot } = await captureSnapshot({ + dispatchSnapshotCommand, + device, + session, + flags: { + ...req.flags, + snapshotInteractiveOnly: false, + snapshotCompact: false, + }, + outPath: req.flags?.out, + logPath, + }); + if (session) { + session.snapshot = snapshot; + sessionStore.set(sessionName, session); + } + return snapshot; +} + +function waitSuccess( + sessionStore: SessionStore, + 
session: SessionState | undefined, + req: DaemonRequest, + data: Record, +): DaemonResponse { + recordIfSession(sessionStore, session, req, data); + return { ok: true, data }; +} diff --git a/src/daemon/handlers/snapshot.ts b/src/daemon/handlers/snapshot.ts index bb0de84d..75dd77a2 100644 --- a/src/daemon/handlers/snapshot.ts +++ b/src/daemon/handlers/snapshot.ts @@ -51,7 +51,8 @@ export async function handleSnapshotCommands(params: { dispatchSnapshotCommand, device, session, - req, + flags: req.flags, + outPath: req.flags?.out, logPath, snapshotScope: resolvedScope.scope, }); @@ -110,7 +111,8 @@ export async function handleSnapshotCommands(params: { dispatchSnapshotCommand, device, session, - req, + flags: req.flags, + outPath: req.flags?.out, logPath, snapshotScope: resolvedScope.scope, }); diff --git a/src/daemon/snapshot-processing.ts b/src/daemon/snapshot-processing.ts index e4aaefa9..a74f062e 100644 --- a/src/daemon/snapshot-processing.ts +++ b/src/daemon/snapshot-processing.ts @@ -79,10 +79,11 @@ export function pruneGroupNodes(nodes: RawSnapshotNode[]): RawSnapshotNode[] { } export function normalizeType(type: string): string { - let value = type - .trim() - .replace(/XCUIElementType/gi, '') - .toLowerCase(); + let value = type.trim().replace(/XCUIElementType/gi, ''); + if (value.startsWith('AX')) { + value = value.slice(2); + } + value = value.toLowerCase(); const lastSeparator = Math.max(value.lastIndexOf('.'), value.lastIndexOf('/')); if (lastSeparator !== -1) { value = value.slice(lastSeparator + 1); diff --git a/src/platforms/ios/macos-helper.ts b/src/platforms/ios/macos-helper.ts index 3eb76715..fa1bf68e 100644 --- a/src/platforms/ios/macos-helper.ts +++ b/src/platforms/ios/macos-helper.ts @@ -9,6 +9,34 @@ import type { SessionSurface } from '../../core/session-surface.ts'; export type MacOsPermissionTarget = 'accessibility' | 'screen-recording' | 'input-monitoring'; +// Keep this shape aligned with macOS helper SnapshotNodeResponse in +// 
macos-helper/Sources/AgentDeviceMacOSHelper/SnapshotTraversal.swift. +export type MacOsSnapshotNode = { + index: number; + type?: string; + role?: string; + subrole?: string; + label?: string; + value?: string; + identifier?: string; + rect?: { + x: number; + y: number; + width: number; + height: number; + }; + enabled?: boolean; + selected?: boolean; + hittable?: boolean; + depth?: number; + parentIndex?: number; + pid?: number; + bundleId?: string; + appName?: string; + windowTitle?: string; + surface?: string; +}; + type HelperSuccess> = { ok: true; data: T; @@ -249,3 +277,12 @@ export async function runMacOsAlertAction( } return await runMacOsHelper(args); } + +export async function runMacOsSnapshotAction(surface: Exclude): Promise<{ + surface: Exclude; + nodes: MacOsSnapshotNode[]; + truncated: boolean; + backend: 'macos-helper'; +}> { + return await runMacOsHelper(['snapshot', '--surface', surface]); +} diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index feb45274..c8278f57 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -739,11 +739,10 @@ test('command usage shows command and global flags separately', () => { assert.match(help, /--platform ios\|macos\|android\|apple/); }); -test('open command usage documents macOS surface flag', () => { +test('open command usage documents macOS desktop surface flags', () => { const help = usageForCommand('open'); if (help === null) throw new Error('Expected command help text'); - assert.match(help, /--surface app\|frontmost-app/); - assert.doesNotMatch(help, /desktop\|menubar/); + assert.match(help, /--surface app\|frontmost-app\|desktop\|menubar/); assert.match(help, /macOS also supports --surface/); }); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index e01e32a4..e65d0503 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -1,5 +1,5 @@ import { SETTINGS_USAGE_OVERRIDE } from 
'../core/settings-contract.ts'; -import { PHASE1_MACOS_SESSION_SURFACES } from '../core/session-surface.ts'; +import { SESSION_SURFACES } from '../core/session-surface.ts'; export type CliFlags = { json: boolean; @@ -70,7 +70,7 @@ export type CliFlags = { saveScript?: boolean | string; shutdown?: boolean; relaunch?: boolean; - surface?: 'app' | 'frontmost-app'; + surface?: 'app' | 'frontmost-app' | 'desktop' | 'menubar'; headless?: boolean; restart?: boolean; noRecord?: boolean; @@ -305,9 +305,9 @@ const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ key: 'surface', names: ['--surface'], type: 'enum', - enumValues: PHASE1_MACOS_SESSION_SURFACES, - usageLabel: '--surface app|frontmost-app', - usageDescription: 'macOS phase-1 session surface for open (defaults to app)', + enumValues: SESSION_SURFACES, + usageLabel: '--surface app|frontmost-app|desktop|menubar', + usageDescription: 'macOS session surface for open (defaults to app)', }, { key: 'headless', @@ -815,7 +815,7 @@ const COMMAND_SCHEMAS: Record = { }, open: { helpDescription: - 'Boot device/simulator; optionally launch app or deep link URL (macOS also supports --surface app|frontmost-app)', + 'Boot device/simulator; optionally launch app or deep link URL (macOS also supports --surface app|frontmost-app|desktop|menubar)', summary: 'Open an app, deep link or URL, save replays', positionalArgs: ['appOrUrl?', 'url?'], allowedFlags: ['activity', 'saveScript', 'relaunch', 'surface'], diff --git a/src/utils/snapshot.ts b/src/utils/snapshot.ts index 9f9e6d04..ae377897 100644 --- a/src/utils/snapshot.ts +++ b/src/utils/snapshot.ts @@ -16,6 +16,8 @@ export type SnapshotOptions = { export type RawSnapshotNode = { index: number; type?: string; + role?: string; + subrole?: string; label?: string; value?: string; identifier?: string; @@ -25,6 +27,11 @@ export type RawSnapshotNode = { hittable?: boolean; depth?: number; parentIndex?: number; + pid?: number; + bundleId?: string; + appName?: string; + windowTitle?: string; + 
surface?: string; }; export type SnapshotNode = RawSnapshotNode & { @@ -35,7 +42,7 @@ export type SnapshotState = { nodes: SnapshotNode[]; createdAt: number; truncated?: boolean; - backend?: 'xctest' | 'android'; + backend?: 'xctest' | 'android' | 'macos-helper'; }; export function attachRefs(nodes: RawSnapshotNode[]): SnapshotNode[] { diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index faac66a3..b9b7d2d1 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -17,6 +17,7 @@ agent-device boot --platform android agent-device boot --platform android --device Pixel_9_Pro_XL --headless agent-device open [app|url] [url] agent-device open --platform macos --surface frontmost-app +agent-device open --platform macos --surface desktop agent-device close [app] agent-device back agent-device home @@ -33,10 +34,10 @@ agent-device app-switcher - `open [app|url] [url]` already boots/activates the selected target when needed. - `open ` deep links are supported on Android and iOS. - `open ` opens a deep link on iOS. -- `open --platform macos --surface app|frontmost-app` selects the macOS phase-1 session surface explicitly. `app` is the default when an app argument is provided. +- `open --platform macos --surface app|frontmost-app|desktop|menubar` selects the macOS session surface explicitly. `app` is the default when an app argument is provided. - On iOS devices, `http(s)://` URLs open in Safari when no app is active. Custom scheme URLs require an active app in the session. - `AGENT_DEVICE_SESSION` and `AGENT_DEVICE_PLATFORM` can pre-bind a default session/platform for CLI automation runs, so normal commands (`open`, `snapshot`, `press`, `fill`, `screenshot`, `devices`, and `batch`) do not need those flags repeated on every call. -- A configured `AGENT_DEVICE_SESSION` now implies bound-session lock mode by default. 
The CLI forwards that policy to the daemon, which enforces the same conflict handling for CLI, typed client, and direct RPC requests. +- A configured `AGENT_DEVICE_SESSION` implies bound-session lock mode by default. The CLI forwards that policy to the daemon, which enforces the same conflict handling for CLI, typed client, and direct RPC requests. - `--session-lock reject|strip` sets the lock policy for a single CLI invocation, including nested batch steps. - `AGENT_DEVICE_SESSION_LOCK=reject|strip` sets the default lock policy for bound-session automation runs. The older `--session-locked`, `--session-lock-conflicts`, `AGENT_DEVICE_SESSION_LOCKED`, and `AGENT_DEVICE_SESSION_LOCK_CONFLICTS` forms remain supported as compatibility aliases. - Direct RPC callers can pass `meta.lockPolicy` and optional `meta.lockPlatform` on `agent_device.command` requests for the same daemon-enforced behavior. @@ -123,18 +124,40 @@ agent-device screenshot apple-tv.png --platform ios --target tv ```bash agent-device devices --platform macos agent-device open TextEdit --platform macos +agent-device open --platform macos --surface desktop agent-device snapshot -i --platform apple --target desktop ``` - `--platform macos` selects the host Mac as a `desktop` target. - `--platform apple --target desktop` selects the same macOS backend through the Apple-family alias. -- macOS uses the same runner-driven interaction/snapshot flow as iOS/tvOS for app-scoped `open`, `appstate`, `snapshot`, `press`, `fill`, `scroll`, `back`, `screenshot`, `record`, and selector-based commands. -- `open --platform macos --surface frontmost-app` stores the currently focused app as the session surface. -- `desktop` and `menubar` remain the planned phase-2 path for broader computer-use support; they are not exposed yet in the phase-1 CLI surface. +- Use `app` sessions for normal app control: `open`, `snapshot`, `click`, `fill`, `press`, `scroll`, `back`, `screenshot`, `record`. 
+- Use `frontmost-app`, `desktop`, and `menubar` when you need to inspect desktop-global UI before choosing one app. +- `open --platform macos --surface frontmost-app` inspects the currently focused app without naming it first. +- `open --platform macos --surface desktop` inspects visible windows across the desktop. +- `open --platform macos --surface menubar` inspects the active app menu bar and system menu extras. +- Use `frontmost-app`, `desktop`, and `menubar` mainly for `snapshot`, `get`, `is`, and `wait`. +- If you inspect with `desktop` or `menubar` and then need to click or fill inside one app, open that app in a normal `app` session. - macOS also supports `clipboard read|write`, `trigger-app-event`, `logs`, `network dump`, `alert`, `settings appearance`, and `settings permission ...`. - Prefer selector or `@ref`-driven interactions on macOS. Window position can shift between runs, so raw x/y point commands are less stable than snapshot-derived targets. +- Use `click --button secondary` for context menus on macOS, then run `snapshot -i` again. - Mobile-only helpers remain unsupported on macOS: `boot`, `home`, `app-switcher`, `install`, `reinstall`, `install-from-source`, and `push`. 
+Recommended loops: + +```bash +# One app, full interaction +agent-device open TextEdit --platform macos +agent-device snapshot -i +agent-device fill @e3 "hello" +agent-device close + +# Desktop-global inspection first +agent-device open --platform macos --surface desktop +agent-device snapshot -i +agent-device is visible 'role="window" label="Notes"' +agent-device close +``` + ## Snapshot and inspect ```bash diff --git a/website/docs/docs/installation.md b/website/docs/docs/installation.md index 25285957..d232df50 100644 --- a/website/docs/docs/installation.md +++ b/website/docs/docs/installation.md @@ -25,7 +25,7 @@ npx agent-device open Settings --platform ios ## macOS desktop notes -- The macOS desktop path now uses a local `agent-device-macos-helper` for permission checks (`settings permission ...`), alert handling, and related host-Mac support. +- The macOS desktop path uses a local `agent-device-macos-helper` for permission checks (`settings permission ...`), alert handling, and helper-backed desktop snapshot surfaces (`frontmost-app`, `desktop`, `menubar`). - Source checkouts build the helper lazily on first use and cache it under `~/.agent-device/macos-helper/current/`. - Release distribution should ship a stable signed/notarized helper build so macOS trust/TCC state is tied to a durable code signature instead of an ad-hoc local binary.