Add screen numbering to cross-os MCP, fix macOS stale foreground

k4cper-g · claude · k4cper-g · commit 4a5a3dd84471 · 2026-02-24T19:43:21.000+01:00
Cross-OS MCP now has full parity with the main MCP server:
- Screen numbering (1, 2, 3...) for each connected machine
- Added snapshot_app, snapshot_desktop, screenshot, full find params
- Every tool accepts screen by number or name

Fixed macOS foreground detection by using CGWindowListCopyWindowInfo
instead of NSWorkspace.frontmostApplication(), which goes stale in
long-running processes without an NSRunLoop (e.g., MCP servers).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/cup/platforms/macos.py b/cup/platforms/macos.py
@@ -395,7 +395,50 @@ def _cg_window_apps() -> dict[int, str]:
 
 
 def _macos_foreground_app() -> tuple[int, str, str | None]:
-    """Return (pid, app_name, bundle_id) of the frontmost application."""
+    """Return (pid, app_name, bundle_id) of the frontmost application.
+
+    Uses CGWindowListCopyWindowInfo to get fresh data from the window server.
+    NSWorkspace.frontmostApplication() goes stale in long-running processes
+    without an active NSRunLoop (e.g., MCP servers), so we only use it as a
+    fallback.
+    """
+    # CGWindowList returns windows in front-to-back order. The first
+    # layer-0 (normal) window that isn't a system daemon is the frontmost app.
+    try:
+        from Quartz import (
+            CGWindowListCopyWindowInfo,
+            kCGNullWindowID,
+            kCGWindowListOptionOnScreenOnly,
+        )
+
+        cg_windows = CGWindowListCopyWindowInfo(
+            kCGWindowListOptionOnScreenOnly, kCGNullWindowID,
+        )
+        if cg_windows:
+            for w in cg_windows:
+                if w.get("kCGWindowLayer", -1) != 0:
+                    continue
+                pid = w.get("kCGWindowOwnerPID")
+                owner = w.get("kCGWindowOwnerName", "")
+                if not pid or not owner:
+                    continue
+                if owner in _SYSTEM_OWNER_NAMES:
+                    continue
+                # Found the frontmost app — look up bundle ID via NSRunningApplication
+                bundle_id = None
+                try:
+                    from AppKit import NSRunningApplication
+                    ns_app = NSRunningApplication.runningApplicationWithProcessIdentifier_(pid)
+                    if ns_app is not None:
+                        owner = ns_app.localizedName() or owner
+                        bundle_id = ns_app.bundleIdentifier()
+                except Exception:
+                    pass
+                return (pid, owner, bundle_id)
+    except Exception:
+        pass
+
+    # Fallback: NSWorkspace (may be stale without NSRunLoop)
     workspace = NSWorkspace.sharedWorkspace()
     app = workspace.frontmostApplication()
     return (
diff --git a/examples/cross-os/README.md b/examples/cross-os/README.md
@@ -9,14 +9,14 @@ This demonstrates CUP's core value: **one protocol, every OS**. Claude sees iden
 ```
 ┌──────────────────────────────────────────────┐
 │           Claude Code / MCP Client           │
-│  "Open Notepad on windows, type hello,       │
-│   then open Notes on mac and paste it"       │
+│  "Open Notepad on screen 1, type hello,      │
+│   then open Notes on screen 2 and paste it"  │
 └──────────────────┬───────────────────────────┘
                    │ MCP (stdio)
 ┌──────────────────▼───────────────────────────┐
 │          mcp_server.py (MCP bridge)          │
-│  Exposes snapshot, action, find tools        │
-│  for each connected machine                  │
+│  Screen 1 = windows, Screen 2 = mac          │
+│  Tools: snapshot, action, find, screenshot   │
 └──────┬──────────────────────────┬────────────┘
        │ WebSocket                │ WebSocket
 ┌──────▼──────┐            ┌─────▼────────────┐
@@ -26,6 +26,8 @@ This demonstrates CUP's core value: **one protocol, every OS**. Claude sees iden
 └─────────────┘            └──────────────────┘
 ```
 
+Each connected machine is a numbered **screen** (1, 2, 3...). Every per-screen tool accepts a `screen` parameter — either the number or the friendly name; `list_screens` and `snapshot_all` operate on all screens at once.
+
 ## Files
 
 | File | Purpose |
@@ -79,23 +81,39 @@ Add to your Claude Code MCP config:
 }
 ```
 
-Replace the paths and IPs for your setup. If you're running Claude Code on your Windows machine, `windows` can point to `localhost`.
+Replace the paths and IPs for your setup. Machines are numbered as screens (1, 2, 3...) in the order listed.
 
 ### 4. Talk to Claude
 
 Now just ask Claude Code naturally:
 
 ```
-"What apps are open on both machines?"
+"What apps are open on all screens?"
+
+"Take a snapshot of screen 1"
 
-"Open Notepad on windows and type 'Hello from Mac', then open TextEdit on mac and type 'Hello from Windows'"
+"Open Notepad on windows and type 'Hello from Mac',
+ then open TextEdit on mac and type 'Hello from Windows'"
 
-"Take a snapshot of the foreground window on mac"
+"Click the Submit button on screen 2"
 
-"Click the Submit button on windows"
+"Take a screenshot of screen 1"
 ```
 
-Claude sees the CUP tools (`snapshot_machine`, `act_on_machine`, etc.) and uses them to interact with both machines.
+## Available Tools
+
+| Tool | Description |
+|------|-------------|
+| `list_screens()` | List all connected screens with number, name, OS |
+| `snapshot(screen)` | Capture foreground window's UI tree |
+| `snapshot_app(screen, app)` | Capture a specific app by title |
+| `snapshot_desktop(screen)` | Capture desktop icons/widgets |
+| `overview(screen)` | List open windows (near-instant) |
+| `action(screen, action, ...)` | Click, type, press keys, scroll, etc. |
+| `find(screen, query/role/name/state)` | Search the last tree for elements |
+| `open_app(screen, app_name)` | Open an app by name (fuzzy match) |
+| `screenshot(screen, region_*)` | Capture a PNG screenshot |
+| `snapshot_all(scope)` | Snapshot all screens in parallel |
 
 ## Standalone Agent (alternative)
 
@@ -117,16 +135,16 @@ python agent.py windows=ws://localhost:9800 mac=ws://192.168.1.30:9800
 
 ```
 # Cross-OS text relay
-"Copy the title of the focused window on Windows and type it into the terminal on Mac"
+"Copy the title of the focused window on screen 1 and type it into the terminal on screen 2"
 
 # Parallel app launch
-"Open a text editor on both machines and type today's date in each"
+"Open a text editor on all screens and type today's date in each"
 
 # Cross-OS comparison
-"Take a snapshot of both machines and tell me what apps are running on each"
+"Snapshot all screens and tell me what apps are running on each"
 
 # Multi-step workflow
-"On Windows, open Chrome and navigate to example.com. On Mac, open Safari and navigate to the same URL."
+"On windows, open Chrome and navigate to example.com. On mac, open Safari and navigate to the same URL."
 ```
 
 ## Using the client library directly
@@ -139,6 +157,7 @@ with RemoteSession("ws://192.168.1.10:9800") as win:
     print(win.snapshot(scope="overview"))
     win.open_app("notepad")
     tree = win.snapshot(scope="foreground")
+    png = win.screenshot()  # full screen PNG bytes
 
 # Multiple machines in parallel
 with MultiSession({
@@ -163,4 +182,4 @@ The cup_server uses a simple JSON-RPC protocol over WebSocket:
 {"id": 1, "result": "# CUP 0.1.0 | windows | 1920x1080\n..."}
 ```
 
-Methods: `snapshot`, `action`, `press`, `find`, `overview`, `open_app`, `batch`, `info`
+Methods: `snapshot`, `snapshot_desktop`, `action`, `press`, `find`, `overview`, `open_app`, `screenshot`, `batch`, `info`
diff --git a/examples/cross-os/cup_remote.py b/examples/cross-os/cup_remote.py
@@ -25,6 +25,7 @@
 
 from __future__ import annotations
 
+import base64
 import json
 import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -144,6 +145,26 @@ def find(
     ) -> list[dict]:
         return self._call("find", query=query, role=role, name=name, state=state, limit=limit)
 
+    def snapshot_desktop(self, *, compact: bool = True) -> str | dict:
+        return self._call("snapshot_desktop", compact=compact)
+
+    def screenshot(
+        self,
+        *,
+        region: dict[str, int] | None = None,
+    ) -> bytes:
+        """Capture a screenshot and return PNG bytes."""
+        params: dict[str, Any] = {}
+        if region is not None:
+            params["region_x"] = region["x"]
+            params["region_y"] = region["y"]
+            params["region_w"] = region["w"]
+            params["region_h"] = region["h"]
+        result = self._call("screenshot", **params)
+        if not result.get("success"):
+            raise RuntimeError(result.get("error", "Screenshot failed"))
+        return base64.b64decode(result["data"])
+
     def open_app(self, name: str) -> ActionResult:
         result = self._call("open_app", name=name)
         return ActionResult(**result)
diff --git a/examples/cross-os/cup_server.py b/examples/cross-os/cup_server.py
@@ -14,13 +14,15 @@
     -> {"id": 2, "method": "action", "params": {"element_id": "e5", "action": "click"}}
     <- {"id": 2, "result": {"success": true, "message": "Clicked"}}
 
-Supported methods: snapshot, action, press, find, overview, open_app, info
+Supported methods: snapshot, snapshot_desktop, action, press, find, overview,
+                   open_app, screenshot, batch, info
 """
 
 from __future__ import annotations
 
 import argparse
 import asyncio
+import base64
 import json
 import platform
 import sys
@@ -104,6 +106,26 @@ def rpc_open_app(self, name: str) -> dict:
         result = self._session.open_app(name)
         return {"success": result.success, "message": result.message, "error": result.error}
 
+    def rpc_snapshot_desktop(self, compact: bool = True) -> str | dict:
+        return self._session.snapshot(scope="desktop", compact=compact)
+
+    def rpc_screenshot(
+        self,
+        region_x: int | None = None,
+        region_y: int | None = None,
+        region_w: int | None = None,
+        region_h: int | None = None,
+    ) -> dict:
+        """Capture screenshot and return as base64-encoded PNG."""
+        region = None
+        if all(v is not None for v in (region_x, region_y, region_w, region_h)):
+            region = {"x": region_x, "y": region_y, "w": region_w, "h": region_h}
+        try:
+            png_bytes = self._session.screenshot(region=region)
+            return {"success": True, "data": base64.b64encode(png_bytes).decode("ascii")}
+        except (ImportError, RuntimeError) as e:
+            return {"success": False, "error": str(e)}
+
     def rpc_batch(self, actions: list[dict]) -> list[dict]:
         results = self._session.batch(actions)
         return [{"success": r.success, "message": r.message, "error": r.error} for r in results]
diff --git a/examples/cross-os/mcp_server.py b/examples/cross-os/mcp_server.py