From 7a2ade0a026443831077c38a12b22814516d748c Mon Sep 17 00:00:00 2001 From: unraid Date: Sun, 5 Apr 2026 15:38:45 +0800 Subject: [PATCH 1/2] chore: add .agents/.codex/.omx to .gitignore --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f9d718ce3..be73e32b8 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,9 @@ coverage .vscode *.suo *.lock -src/utils/vendor/ \ No newline at end of file +src/utils/vendor/ + +# AI tool runtime directories +.agents/ +.codex/ +.omx/ \ No newline at end of file From c17edcb12e5206de7106e1ef548863c99adc78e0 Mon Sep 17 00:00:00 2001 From: unraid Date: Sun, 5 Apr 2026 15:27:50 +0800 Subject: [PATCH 2/2] =?UTF-8?q?feat:=20Computer=20Use=20=E2=80=94=20Window?= =?UTF-8?q?s=20=E8=B7=A8=E5=B9=B3=E5=8F=B0=E6=94=AF=E6=8C=81=20+=20GUI=20?= =?UTF-8?q?=E6=97=A0=E9=9A=9C=E7=A2=8D=E5=A2=9E=E5=BC=BA=20+=20Python=20Br?= =?UTF-8?q?idge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。 - MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现 - 平台抽象层: platforms/{win32,darwin,linux}.ts - 跨平台 executor: executorCrossPlatform.ts - CHICAGO_MCP + VOICE_MODE feature flags 启用 - windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴) - windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪) - uiAutomation.ts: UI Automation 元素树/点击/写值 - accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI - bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS) - window_management: min/max/restore/close/focus (Win32 API) - click_element / type_into_element: 按名称操作 (无需坐标) - 截图自动附带 Accessibility Snapshot - 17 种方法, stdin/stdout JSON 通信 - 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms - 依赖: mss + Pillow + pywinauto --- .gitignore | 14 +- DEV-LOG.md | 18 +- build.ts | 88 +- docs/features/computer-use-architecture-v2.md | 325 +++++ docs/features/computer-use-tools-reference.md | 496 +++++++ docs/features/computer-use.md | 321 +++-- 
packages/@ant/computer-use-input/src/index.ts | 74 +- .../@ant/computer-use-mcp/src/executor.ts | 57 + .../@ant/computer-use-mcp/src/toolCalls.ts | 558 +++++++- packages/@ant/computer-use-mcp/src/tools.ts | 349 ++++- .../computer-use-swift/src/backends/darwin.ts | 37 +- packages/@ant/computer-use-swift/src/index.ts | 72 +- src/utils/computerUse/common.ts | 10 +- src/utils/computerUse/executor.ts | 35 +- .../computerUse/executorCrossPlatform.ts | 1150 +++++++++++++++++ src/utils/computerUse/hostAdapter.ts | 13 +- src/utils/computerUse/platforms/darwin.ts | 152 +++ src/utils/computerUse/platforms/index.ts | 41 + src/utils/computerUse/platforms/linux.ts | 416 ++++++ src/utils/computerUse/platforms/types.ts | 153 +++ src/utils/computerUse/platforms/win32.ts | 979 ++++++++++++++ src/utils/computerUse/swiftLoader.ts | 15 +- .../win32/accessibilitySnapshot.ts | 225 ++++ src/utils/computerUse/win32/appDispatcher.ts | 129 ++ src/utils/computerUse/win32/bridge.py | 525 ++++++++ src/utils/computerUse/win32/bridgeClient.ts | 191 +++ src/utils/computerUse/win32/comExcel.ts | 320 +++++ src/utils/computerUse/win32/comWord.ts | 450 +++++++ src/utils/computerUse/win32/inputIndicator.ts | 254 ++++ src/utils/computerUse/win32/ocr.ts | 11 +- src/utils/computerUse/win32/shared.ts | 127 ++ src/utils/computerUse/win32/uiAutomation.ts | 67 +- src/utils/computerUse/win32/virtualCursor.ts | 268 ++++ src/utils/computerUse/win32/windowBorder.ts | 66 + src/utils/computerUse/win32/windowEnum.ts | 14 +- src/utils/computerUse/win32/windowMessage.ts | 696 ++++++++++ 36 files changed, 8331 insertions(+), 385 deletions(-) create mode 100644 docs/features/computer-use-architecture-v2.md create mode 100644 docs/features/computer-use-tools-reference.md create mode 100644 src/utils/computerUse/executorCrossPlatform.ts create mode 100644 src/utils/computerUse/platforms/darwin.ts create mode 100644 src/utils/computerUse/platforms/index.ts create mode 100644 src/utils/computerUse/platforms/linux.ts create 
mode 100644 src/utils/computerUse/platforms/types.ts create mode 100644 src/utils/computerUse/platforms/win32.ts create mode 100644 src/utils/computerUse/win32/accessibilitySnapshot.ts create mode 100644 src/utils/computerUse/win32/appDispatcher.ts create mode 100644 src/utils/computerUse/win32/bridge.py create mode 100644 src/utils/computerUse/win32/bridgeClient.ts create mode 100644 src/utils/computerUse/win32/comExcel.ts create mode 100644 src/utils/computerUse/win32/comWord.ts create mode 100644 src/utils/computerUse/win32/inputIndicator.ts create mode 100644 src/utils/computerUse/win32/shared.ts create mode 100644 src/utils/computerUse/win32/virtualCursor.ts create mode 100644 src/utils/computerUse/win32/windowBorder.ts create mode 100644 src/utils/computerUse/win32/windowMessage.ts diff --git a/.gitignore b/.gitignore index be73e32b8..8b5e47a0e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,16 @@ src/utils/vendor/ # AI tool runtime directories .agents/ .codex/ -.omx/ \ No newline at end of file +.omx/ + +# Binary / screenshot files (root only) +/*.png +*.bmp + +# Agent / tool state dirs +.swarm/ +.agents/__pycache__/ + +# Python bytecode +__pycache__/ +*.pyc diff --git a/DEV-LOG.md b/DEV-LOG.md index 619dcf8b8..1aab55fe4 100644 --- a/DEV-LOG.md +++ b/DEV-LOG.md @@ -39,6 +39,7 @@ ## Computer Use Windows 增强:窗口绑定截图 + UI Automation + OCR (2026-04-03) + 在三平台基础实现之上,利用 Windows 原生 API 增强 Computer Use 的 Windows 专属能力。 **新增文件:** @@ -118,23 +119,6 @@ packages/@ant/computer-use-{input,swift}/src/ | `vendor/audio-capture/{platform}/audio-capture.node` | 6 个平台的原生音频二进制(cpal,来自参考项目) | | `vendor/audio-capture-src/index.ts` | 原生模块加载器(按 `${arch}-${platform}` 动态 require `.node`) | -**修改文件:** - -| 文件 | 变更 | -|------|------| -| `packages/audio-capture-napi/src/index.ts` | SoX 子进程 stub → 原生 `.node` 加载器(含 `process.cwd()` workspace 路径 fallback) | -| `scripts/dev.ts` | `DEFAULT_FEATURES` 加 `"VOICE_MODE"` | -| `build.ts` | `DEFAULT_BUILD_FEATURES` 加 `"VOICE_MODE"` | -| 
`docs/features/voice-mode.md` | 追加恢复计划章节(第八节) | - -**验证结果:** - -- `isNativeAudioAvailable()` → `true`(Windows x64 原生 `.node` 加载成功) -- `feature('VOICE_MODE')` → `ENABLED` -- `bun run build` → voice 代码编入产物 - -**运行时前置条件:** claude.ai OAuth 登录 + 麦克风权限 - --- ## Enable Claude in Chrome MCP (2026-04-03) diff --git a/build.ts b/build.ts index 203fc23d9..090b31611 100644 --- a/build.ts +++ b/build.ts @@ -2,11 +2,11 @@ import { readdir, readFile, writeFile, cp } from "fs/promises"; import { join } from "path"; import { getMacroDefines } from "./scripts/defines.ts"; -const outdir = "dist"; +const outdir = 'dist' // Step 1: Clean output directory -const { rmSync } = await import("fs"); -rmSync(outdir, { recursive: true, force: true }); +const { rmSync } = await import('fs') +rmSync(outdir, { recursive: true, force: true }) // Default features that match the official CLI build. // Additional features can be enabled via FEATURE_<NAME>=1 env vars. @@ -14,50 +14,50 @@ const DEFAULT_BUILD_FEATURES = ["AGENT_TRIGGERS_REMOTE", "CHICAGO_MCP", "VOICE_M // Collect FEATURE_* env vars → Bun.build features const envFeatures = Object.keys(process.env) - .filter(k => k.startsWith("FEATURE_")) - .map(k => k.replace("FEATURE_", "")); -const features = [...new Set([...DEFAULT_BUILD_FEATURES, ...envFeatures])]; + .filter(k => k.startsWith('FEATURE_')) + .map(k => k.replace('FEATURE_', '')) +const features = [...new Set([...DEFAULT_BUILD_FEATURES, ...envFeatures])] // Step 2: Bundle with splitting const result = await Bun.build({ - entrypoints: ["src/entrypoints/cli.tsx"], - outdir, - target: "bun", - splitting: true, - define: getMacroDefines(), - features, -}); + entrypoints: ['src/entrypoints/cli.tsx'], + outdir, + target: 'bun', + splitting: true, + define: getMacroDefines(), + features, +}) if (!result.success) { - console.error("Build failed:"); - for (const log of result.logs) { - console.error(log); - } - process.exit(1); + console.error('Build failed:') + for (const log of result.logs) { + 
console.error(log) + } + process.exit(1) } // Step 3: Post-process — replace Bun-only `import.meta.require` with Node.js compatible version -const files = await readdir(outdir); -const IMPORT_META_REQUIRE = "var __require = import.meta.require;"; -const COMPAT_REQUIRE = `var __require = typeof import.meta.require === "function" ? import.meta.require : (await import("module")).createRequire(import.meta.url);`; +const files = await readdir(outdir) +const IMPORT_META_REQUIRE = 'var __require = import.meta.require;' +const COMPAT_REQUIRE = `var __require = typeof import.meta.require === "function" ? import.meta.require : (await import("module")).createRequire(import.meta.url);` -let patched = 0; +let patched = 0 for (const file of files) { - if (!file.endsWith(".js")) continue; - const filePath = join(outdir, file); - const content = await readFile(filePath, "utf-8"); - if (content.includes(IMPORT_META_REQUIRE)) { - await writeFile( - filePath, - content.replace(IMPORT_META_REQUIRE, COMPAT_REQUIRE), - ); - patched++; - } + if (!file.endsWith('.js')) continue + const filePath = join(outdir, file) + const content = await readFile(filePath, 'utf-8') + if (content.includes(IMPORT_META_REQUIRE)) { + await writeFile( + filePath, + content.replace(IMPORT_META_REQUIRE, COMPAT_REQUIRE), + ) + patched++ + } } console.log( - `Bundled ${result.outputs.length} files to ${outdir}/ (patched ${patched} for Node.js compat)`, -); + `Bundled ${result.outputs.length} files to ${outdir}/ (patched ${patched} for Node.js compat)`, +) // Step 4: Copy native .node addon files (audio-capture) const vendorDir = join(outdir, "vendor", "audio-capture"); @@ -66,16 +66,16 @@ console.log(`Copied vendor/audio-capture/ → ${vendorDir}/`); // Step 5: Bundle download-ripgrep script as standalone JS for postinstall const rgScript = await Bun.build({ - entrypoints: ["scripts/download-ripgrep.ts"], - outdir, - target: "node", -}); + entrypoints: ['scripts/download-ripgrep.ts'], + outdir, + target: 'node', 
+}) if (!rgScript.success) { - console.error("Failed to bundle download-ripgrep script:"); - for (const log of rgScript.logs) { - console.error(log); - } - // Non-fatal — postinstall fallback to bun run scripts/download-ripgrep.ts + console.error('Failed to bundle download-ripgrep script:') + for (const log of rgScript.logs) { + console.error(log) + } + // Non-fatal — postinstall fallback to bun run scripts/download-ripgrep.ts } else { - console.log(`Bundled download-ripgrep script to ${outdir}/`); + console.log(`Bundled download-ripgrep script to ${outdir}/`) } diff --git a/docs/features/computer-use-architecture-v2.md b/docs/features/computer-use-architecture-v2.md new file mode 100644 index 000000000..8cfac3cb0 --- /dev/null +++ b/docs/features/computer-use-architecture-v2.md @@ -0,0 +1,325 @@ +# Computer Use 架构修正方案 v2 + +更新时间:2026-04-04 + +## 1. 当前架构的问题 + +### 问题 A:平台代码混在错误的包里 + +`@ant/computer-use-swift` 是 macOS Swift 原生模块的包装器,但我们把 Windows(`backends/win32.ts`)和 Linux(`backends/linux.ts`)的截图/应用管理代码塞进了这个包。"swift" 在名字里就意味着 macOS,后期维护者无法区分。 + +`@ant/computer-use-input` 同样——原本是 macOS enigo Rust 模块,我们也往里面塞了 win32/linux 后端。 + +### 问题 B:输入方式不对 + +当前 Windows 后端(`packages/@ant/computer-use-input/src/backends/win32.ts`)使用 `SetCursorPos` + `SendInput` + `keybd_event`——这是**全局输入**: + +- 鼠标真的会移动到屏幕上 +- 键盘真的打到当前前台窗口 +- **会影响用户当前的操作** + +绑定窗口句柄后,应该用 `SendMessage`/`PostMessage` 向目标 HWND 发送消息: + +- `WM_CHAR` — 发送字符,不移动光标 +- `WM_KEYDOWN`/`WM_KEYUP` — 发送按键 +- `WM_LBUTTONDOWN`/`WM_LBUTTONUP` — 发送鼠标点击(窗口客户区相对坐标) +- `PrintWindow` — 截取窗口内容,不需要窗口在前台 +- **不抢焦点、不影响用户当前操作** + +已验证:向记事本 `SendMessage(WM_CHAR)` 成功写入文字,记事本在后台,终端保持前台。 + +### 问题 C:截图是公共能力,不属于 swift + +截图(screenshot)、显示器枚举(display)、应用管理(apps)是所有平台都需要的公共能力,不应该放在 `@ant/computer-use-swift`(macOS 专属包名)里。 + +## 2. 
修正后的架构 + +### 2.1 分层原则 + +``` +packages/@ant/ ← macOS 原生模块包装器(不放其他平台代码) +├── computer-use-input/ ← macOS: enigo .node 键鼠(仅 darwin) +├── computer-use-swift/ ← macOS: Swift .node 截图/应用(仅 darwin) +└── computer-use-mcp/ ← 跨平台: MCP server + 工具定义(不改) + +src/utils/computerUse/ +├── platforms/ ← 新增: 跨平台抽象层 +│ ├── types.ts ← 公共接口: InputPlatform, ScreenshotPlatform, AppsPlatform, DisplayPlatform +│ ├── index.ts ← 平台分发器: 按 process.platform 加载后端 +│ ├── darwin.ts ← macOS: 委托给 @ant/computer-use-{input,swift} +│ ├── win32.ts ← Windows: SendMessage 输入 + PrintWindow 截图 + EnumWindows + UIA + OCR +│ └── linux.ts ← Linux: xdotool + scrot + xrandr + wmctrl +│ +├── win32/ ← Windows 专属增强能力(不在公共接口中) +│ ├── windowCapture.ts ← PrintWindow 窗口绑定截图 +│ ├── windowEnum.ts ← EnumWindows 窗口枚举 +│ ├── windowMessage.ts ← SendMessage/PostMessage 无焦点输入(新增) +│ ├── uiAutomation.ts ← IUIAutomation UI 元素操作 +│ └── ocr.ts ← Windows.Media.Ocr 文字识别 +│ +├── executor.ts ← 改: 通过 platforms/ 获取平台实现,不直接调 @ant 包 +├── swiftLoader.ts ← 改: 仅 darwin 使用 +├── inputLoader.ts ← 改: 仅 darwin 使用 +└── ...其他文件不动 +``` + +### 2.2 公共接口(`platforms/types.ts`) + +```typescript +/** 窗口标识 — 跨平台 */ +export interface WindowHandle { + id: string // macOS: bundleId, Windows: HWND string, Linux: window ID + pid: number + title: string + exePath?: string // Windows/Linux: 进程路径 +} + +/** 输入平台接口 — 两种模式 */ +export interface InputPlatform { + // 模式 A: 全局输入(macOS/Linux 默认,向前台窗口发送) + moveMouse(x: number, y: number): Promise<void> + click(x: number, y: number, button: 'left' | 'right' | 'middle'): Promise<void> + typeText(text: string): Promise<void> + key(name: string, action: 'press' | 'release'): Promise<void> + keys(combo: string[]): Promise<void> + scroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void> + mouseLocation(): Promise<{ x: number; y: number }> + + // 模式 B: 窗口绑定输入(Windows SendMessage,不抢焦点) + sendChar?(hwnd: string, char: string): Promise<void> + sendKey?(hwnd: string, vk: number, action: 'down' | 'up'): Promise<void> + sendClick?(hwnd: string, x: number, y: 
number, button: 'left' | 'right'): Promise<void> + sendText?(hwnd: string, text: string): Promise<void> +} + +/** 截图平台接口 */ +export interface ScreenshotPlatform { + // 全屏截图 + captureScreen(displayId?: number): Promise<ScreenshotResult> + // 区域截图 + captureRegion(x: number, y: number, w: number, h: number): Promise<ScreenshotResult> + // 窗口截图(Windows: PrintWindow,macOS: SCContentFilter,Linux: xdotool+import) + captureWindow?(hwnd: string): Promise<ScreenshotResult> +} + +/** 显示器平台接口 */ +export interface DisplayPlatform { + listAll(): DisplayInfo[] + getSize(displayId?: number): DisplayInfo +} + +/** 应用管理平台接口 */ +export interface AppsPlatform { + listRunning(): WindowHandle[] + listInstalled(): Promise<InstalledApp[]> + open(name: string): Promise<void> + getFrontmostApp(): FrontmostAppInfo | null + findWindowByTitle(title: string): WindowHandle | null +} + +export interface ScreenshotResult { + base64: string + width: number + height: number +} + +export interface DisplayInfo { + width: number + height: number + scaleFactor: number + displayId: number +} + +export interface InstalledApp { + id: string // macOS: bundleId, Windows: exe path, Linux: .desktop name + displayName: string + path: string +} + +export interface FrontmostAppInfo { + id: string + appName: string +} +``` + +### 2.3 平台分发器(`platforms/index.ts`) + +```typescript +import type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform } from './types.js' + +export interface Platform { + input: InputPlatform + screenshot: ScreenshotPlatform + display: DisplayPlatform + apps: AppsPlatform +} + +export function loadPlatform(): Platform { + switch (process.platform) { + case 'darwin': + return require('./darwin.js').platform + case 'win32': + return require('./win32.js').platform + case 'linux': + return require('./linux.js').platform + default: + throw new Error(`Computer Use not supported on ${process.platform}`) + } +} +``` + +### 2.4 各平台实现 + +**`platforms/darwin.ts`** — 委托给 @ant 包(保持兼容): +```typescript +// macOS: 通过 @ant/computer-use-input 和 @ant/computer-use-swift +// 这两个包的 
darwin 后端保留不动 +import { requireComputerUseInput } from '../inputLoader.js' +import { requireComputerUseSwift } from '../swiftLoader.js' + +export const platform = { + input: { /* 委托给 requireComputerUseInput() */ }, + screenshot: { /* 委托给 requireComputerUseSwift().screenshot */ }, + display: { /* 委托给 requireComputerUseSwift().display */ }, + apps: { /* 委托给 requireComputerUseSwift().apps */ }, +} +``` + +**`platforms/win32.ts`** — 使用 `src/utils/computerUse/win32/` 模块: +```typescript +// Windows: SendMessage 输入 + PrintWindow 截图 + EnumWindows 应用 +import { sendChar, sendKey, sendClick, sendText } from '../win32/windowMessage.js' +import { captureWindow } from '../win32/windowCapture.js' +import { listWindows } from '../win32/windowEnum.js' +// ... PowerShell P/Invoke 全局输入作为 fallback + +export const platform = { + input: { + // 全局模式: PowerShell SetCursorPos/SendInput(fallback) + // 窗口模式: SendMessage(首选) + sendChar, sendKey, sendClick, sendText, // 窗口绑定 + moveMouse, click, typeText, ... // 全局 fallback + }, + screenshot: { + captureScreen, // CopyFromScreen + captureRegion, // CopyFromScreen(rect) + captureWindow, // PrintWindow(不抢焦点) + }, + display: { /* Screen.AllScreens */ }, + apps: { /* EnumWindows */ }, +} +``` + +**`platforms/linux.ts`** — 使用 xdotool/scrot: +```typescript +// Linux: xdotool + scrot + xrandr + wmctrl +export const platform = { + input: { /* xdotool mousemove/click/key/type */ }, + screenshot: { /* scrot */ }, + display: { /* xrandr */ }, + apps: { /* wmctrl + ps */ }, +} +``` + +### 2.5 executor.ts 改造 + +```typescript +// 之前: 直接调 requireComputerUseSwift() 和 requireComputerUseInput() +// 之后: 通过 platforms/ 统一获取 + +import { loadPlatform } from './platforms/index.js' + +const platform = loadPlatform() + +// 截图 +platform.screenshot.captureScreen() +platform.screenshot.captureWindow(hwnd) // 窗口绑定 + +// 输入(窗口绑定模式,不抢焦点) +platform.input.sendText?.(hwnd, 'Hello') +platform.input.sendClick?.(hwnd, 100, 200, 'left') + +// 输入(全局模式,fallback) 
+platform.input.moveMouse(500, 500) +platform.input.click(500, 500, 'left') +``` + +## 3. Windows 输入模式对比 + +| 方式 | API | 抢焦点 | 移鼠标 | 窗口可最小化 | 适用场景 | +|------|-----|--------|--------|-------------|---------| +| **全局输入** | `SetCursorPos` + `SendInput` | ✅ 抢 | ✅ 动 | ❌ 不行 | 需要坐标点击(fallback) | +| **窗口消息** | `SendMessage(WM_CHAR/WM_KEYDOWN)` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 打字、按键(首选) | +| **窗口消息** | `SendMessage(WM_LBUTTONDOWN)` | ❌ 不抢 | ❌ 不动 | ⚠️ 部分 | 窗口内点击 | +| **窗口截图** | `PrintWindow(hwnd, PW_RENDERFULLCONTENT)` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 窗口截图 | +| **UI 操作** | `UIAutomation InvokePattern` | ❌ 不抢 | ❌ 不动 | ✅ 可以 | 按钮点击、文本写入 | + +**策略**:优先用窗口消息 + UIAutomation(不干扰用户),全局输入作为 fallback。 + +## 4. 需要新增的文件 + +| 文件 | 说明 | +|------|------| +| `src/utils/computerUse/platforms/types.ts` | 公共接口定义 | +| `src/utils/computerUse/platforms/index.ts` | 平台分发器 | +| `src/utils/computerUse/platforms/darwin.ts` | macOS: 委托给 @ant 包 | +| `src/utils/computerUse/platforms/win32.ts` | Windows: 组合 win32/ 下各模块 | +| `src/utils/computerUse/platforms/linux.ts` | Linux: xdotool/scrot | +| `src/utils/computerUse/win32/windowMessage.ts` | **新增**: SendMessage 无焦点输入 | + +## 5. 需要移除/清理的文件 + +| 文件 | 操作 | 原因 | +|------|------|------| +| `packages/@ant/computer-use-input/src/backends/win32.ts` | 删除 | Windows 代码不应在 macOS 包里 | +| `packages/@ant/computer-use-input/src/backends/linux.ts` | 删除 | Linux 代码不应在 macOS 包里 | +| `packages/@ant/computer-use-swift/src/backends/win32.ts` | 删除 | 同上 | +| `packages/@ant/computer-use-swift/src/backends/linux.ts` | 删除 | 同上 | +| `packages/@ant/computer-use-input/src/types.ts` | 删除 | 移到 platforms/types.ts | +| `packages/@ant/computer-use-swift/src/types.ts` | 删除 | 移到 platforms/types.ts | + +## 6. 
需要修改的文件 + +| 文件 | 改动 | +|------|------| +| `packages/@ant/computer-use-input/src/index.ts` | 恢复为仅 darwin dispatcher(去掉 win32/linux case) | +| `packages/@ant/computer-use-swift/src/index.ts` | 恢复为仅 darwin dispatcher(去掉 win32/linux case) | +| `src/utils/computerUse/executor.ts` | 通过 `platforms/` 获取平台实现,不直接调 @ant 包 | +| `src/utils/computerUse/swiftLoader.ts` | 仅 darwin 加载 | +| `src/utils/computerUse/inputLoader.ts` | 仅 darwin 加载 | + +## 7. @ant 包的定位(修正后) + +| 包 | 职责 | 平台 | +|---|------|------| +| `@ant/computer-use-input` | macOS enigo 键鼠原生模块包装 | **仅 darwin** | +| `@ant/computer-use-swift` | macOS Swift 截图/应用原生模块包装 | **仅 darwin** | +| `@ant/computer-use-mcp` | MCP Server + 工具定义 + 调用路由 | **跨平台**(不含平台代码) | + +Windows/Linux 的平台实现全部在 `src/utils/computerUse/platforms/` 和 `src/utils/computerUse/win32/` 中。 + +## 8. 执行顺序 + +``` +Phase 1: 创建 platforms/ 抽象层 + ├── platforms/types.ts(公共接口) + ├── platforms/index.ts(分发器) + └── platforms/darwin.ts(委托 @ant 包) + +Phase 2: 创建 Windows 平台实现 + ├── win32/windowMessage.ts(SendMessage 无焦点输入) + └── platforms/win32.ts(组合 win32/ 各模块) + +Phase 3: 创建 Linux 平台实现 + └── platforms/linux.ts(xdotool/scrot) + +Phase 4: 改造 executor.ts + └── 通过 platforms/ 获取实现,不直接调 @ant + +Phase 5: 清理 @ant 包 + ├── 删除 @ant/computer-use-input/src/backends/{win32,linux}.ts + ├── 删除 @ant/computer-use-swift/src/backends/{win32,linux}.ts + └── 恢复 index.ts 为 darwin-only + +Phase 6: 验证 + PR +``` diff --git a/docs/features/computer-use-tools-reference.md b/docs/features/computer-use-tools-reference.md new file mode 100644 index 000000000..6eaca522e --- /dev/null +++ b/docs/features/computer-use-tools-reference.md @@ -0,0 +1,496 @@ +# Computer Use 工具参考文档 + +## 概览 + +Computer Use 提供 37 个工具,分为三类: + +| 分类 | 平台 | 工具数 | 说明 | +|------|------|--------|------| +| 通用工具 | 全平台 | 24 | 官方 Computer Use 标准能力 | +| Windows 专属工具 | Win32 | 10 | 绑定窗口模式下的增强能力 | +| 教学工具 | 全平台 | 3 | 分步引导模式(需 teachMode 开启) | + +--- + +## 一、通用工具(24 个) + +全平台可用。未绑定窗口时,操作对象是整个屏幕。 + +### 权限与会话 + +| 工具 | 参数 | 说明 | 
+|------|------|------| +| `request_access` | `apps[]`, `reason`, `clipboardRead?`, `clipboardWrite?`, `systemKeyCombos?` | 请求操作应用的权限。所有其他工具的前置条件 | +| `list_granted_applications` | — | 列出当前会话已授权的应用 | + +### 截图与显示 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `screenshot` | `save_to_disk?` | 截取当前屏幕。绑定窗口时截取绑定窗口(PrintWindow)。返回图片 + GUI 元素列表(Windows) | +| `zoom` | `region: [x1,y1,x2,y2]` | 截取指定区域的高分辨率图片。坐标基于最近一次全屏截图 | +| `switch_display` | `display` | 切换截图的目标显示器 | + +### 鼠标操作 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `left_click` | `coordinate: [x,y]`, `text?` (修饰键) | 左键点击。`text` 可传 "shift"/"ctrl"/"alt" 实现组合点击 | +| `double_click` | `coordinate`, `text?` | 双击 | +| `triple_click` | `coordinate`, `text?` | 三击(选整行) | +| `right_click` | `coordinate`, `text?` | 右键点击 | +| `middle_click` | `coordinate`, `text?` | 中键点击 | +| `mouse_move` | `coordinate` | 移动鼠标(不点击) | +| `left_click_drag` | `coordinate` (终点), `start_coordinate?` (起点) | 拖拽 | +| `left_mouse_down` | — | 按下左键不松 | +| `left_mouse_up` | — | 松开左键 | +| `cursor_position` | — | 获取当前鼠标位置 | + +### 键盘操作 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `type` | `text` | 输入文字 | +| `key` | `text` (如 "ctrl+s"), `repeat?` | 按键/组合键 | +| `hold_key` | `text`, `duration` (秒) | 按住键指定时长 | + +### 滚动 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `scroll` | `coordinate`, `scroll_direction`, `scroll_amount` | 滚动。方向: up/down/left/right | + +### 应用管理 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `open_application` | `app` | 打开应用。Windows 上自动绑定窗口 | + +### 剪贴板 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `read_clipboard` | — | 读取剪贴板文字 | +| `write_clipboard` | `text` | 写入剪贴板 | + +### 其他 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `wait` | `duration` (秒) | 等待 | +| `computer_batch` | `actions[]` | 批量执行多个动作(减少 API 往返) | + +--- + +## 二、Windows 专属工具(10 个) + +仅 Windows 平台可见。核心能力:**绑定窗口后的独立操作——不抢占用户鼠标键盘**。 + +### 工作模式 + +``` +┌──────────────────────────────────────────────────┐ +│ 未绑定模式 │ +│ 使用通用工具 (left_click/type/key/scroll) │ +│ 操作对象:整个屏幕 │ 
+│ 输入方式:全局 SendInput(会移动真实鼠标) │ +└──────────────────────────────────────────────────┘ + │ + bind_window / open_application + ▼ +┌──────────────────────────────────────────────────┐ +│ 绑定窗口模式 │ +│ 使用 Win32 工具 (virtual_mouse/virtual_keyboard) │ +│ 操作对象:绑定的窗口 │ +│ 输入方式:SendMessageW(不动真实鼠标/键盘) │ +│ 可视化:DWM 绿色边框 + 虚拟光标 + 状态指示器 │ +└──────────────────────────────────────────────────┘ +``` + +### 窗口绑定 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `bind_window` | `action`: list/bind/unbind/status | 窗口绑定管理 | + +**动作详情:** + +| action | 参数 | 说明 | +|--------|------|------| +| `list` | — | 列出所有可见窗口(hwnd、pid、title) | +| `bind` | `title?`, `hwnd?`, `pid?` | 绑定到指定窗口。设置 DWM 绿色边框 + 启动虚拟光标 + 启动状态指示器 + 短暂激活窗口确保可接收输入 | +| `unbind` | — | 解除绑定,恢复全屏模式 | +| `status` | — | 查看当前绑定状态(hwnd、title、pid、窗口矩形) | + +### 窗口管理 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `window_management` | `action`, `x?`, `y?`, `width?`, `height?` | 窗口操作(Win32 API,不走全局快捷键) | + +**动作详情:** + +| action | 说明 | +|--------|------| +| `minimize` | ShowWindow(SW_MINIMIZE) | +| `maximize` | ShowWindow(SW_MAXIMIZE) | +| `restore` | ShowWindow(SW_RESTORE) — 恢复最小化/最大化 | +| `close` | SendMessage(WM_CLOSE) — 优雅关闭 | +| `focus` | SetForegroundWindow + BringWindowToTop — 激活窗口 | +| `move_offscreen` | SetWindowPos(-32000,-32000) — 移到屏幕外(仍可 SendMessage/PrintWindow) | +| `move_resize` | SetWindowPos — 移动/缩放到指定位置和大小 | +| `get_rect` | GetWindowRect — 获取当前位置和大小 | + +### 虚拟鼠标 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `virtual_mouse` | `action`, `coordinate: [x,y]`, `start_coordinate?` | 在绑定窗口内操作虚拟鼠标 | + +**动作详情:** + +| action | 说明 | +|--------|------| +| `click` | 左键点击。虚拟光标移动到坐标 + 闪烁动画 | +| `double_click` | 双击 | +| `right_click` | 右键点击 | +| `move` | 移动虚拟光标(不点击) | +| `drag` | 按住 → 移动 → 松开。需 `start_coordinate` 指定起点 | +| `down` | 按下左键不松 | +| `up` | 松开左键 | + +**与通用鼠标工具的区别:** + +| | 通用 (`left_click` 等) | `virtual_mouse` | +|---|---|---| +| 输入方式 | SendInput(全局) | SendMessageW(窗口级) | +| 真实鼠标 | 会移动 | **不动** | +| 用户干扰 | 有 | **无** | +| 适用场景 | 
未绑定时 | **绑定后** | + +### 虚拟键盘 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `virtual_keyboard` | `action`, `text`, `duration?`, `repeat?` | 在绑定窗口内操作虚拟键盘 | + +**动作详情:** + +| action | text 含义 | 说明 | +|--------|----------|------| +| `type` | 要输入的文字 | SendMessageW(WM_CHAR),支持 Unicode 中文/emoji | +| `combo` | 组合键 (如 "ctrl+s") | WM_KEYDOWN/UP 序列 | +| `press` | 单个键名 | 按下不松(配合 release 使用) | +| `release` | 单个键名 | 松开按键 | +| `hold` | 键名或组合 | 按住指定秒数后松开 | + +**与通用键盘工具的区别:** + +| | 通用 (`type`/`key`) | `virtual_keyboard` | +|---|---|---| +| 输入方式 | SendInput(全局) | SendMessageW(窗口级) | +| 物理键盘 | 会冲突 | **不冲突** | +| 适用场景 | 未绑定时 | **绑定后** | + +**注意:** SendMessageW 对 Windows Terminal (ConPTY) 等现代应用无效。这些应用需要使用通用工具 + 窗口激活方式操作。 + +### 鼠标滚轮 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `mouse_wheel` | `coordinate: [x,y]`, `delta`, `direction?` | WM_MOUSEWHEEL 鼠标中键滚轮 | + +**参数说明:** +- `delta`: 正值=向上,负值=向下。每 1 单位 ≈ 3 行 +- `direction`: "vertical"(默认)或 "horizontal" +- `coordinate`: 滚轮作用点——决定哪个面板/区域接收滚动 + +**与通用 `scroll` 的区别:** + +| | `scroll` | `mouse_wheel` | +|---|---|---| +| 原理 | WM_VSCROLL/WM_HSCROLL | **WM_MOUSEWHEEL** | +| Excel | ❌ | ✅ | +| 浏览器 | ❌ | ✅ | +| 代码编辑器 | ❌ | ✅ | + +### 元素级操作 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `click_element` | `name?`, `role?`, `automationId?` | 按无障碍名称/角色点击 GUI 元素 | +| `type_into_element` | `name?`, `role?`, `automationId?`, `text` | 按名称向元素输入文字 | + +**工作原理:** +1. 通过 UI Automation 在绑定窗口中查找匹配元素 +2. `click_element`: 先尝试 InvokePattern(按钮/菜单),失败则 SendMessage 点击 BoundingRect 中心 +3. `type_into_element`: 先尝试 ValuePattern 直接设值,失败则点击聚焦 + WM_CHAR 输入 + +**适用场景:** +- 截图中看到元素名称但坐标不精确时 +- Accessibility Snapshot 列出了元素的 name/automationId 时 +- 比坐标点击更可靠(不受窗口缩放/DPI 影响) + +### 终端交互 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `prompt_respond` | `response_type`, `arrow_direction?`, `arrow_count?`, `text?` | 处理终端 Yes/No/选择提示 | + +**response_type 详情:** + +| response_type | 操作 | 场景 | +|---------------|------|------| +| `yes` | 发送 'y' + Enter | npm "Continue? 
(y/n)" | +| `no` | 发送 'n' + Enter | 拒绝确认 | +| `enter` | 发送 Enter | 接受默认选项 | +| `escape` | 发送 Escape | 取消操作 | +| `select` | ↑/↓ 箭头 × N + Enter | inquirer 选择菜单 | +| `type` | 输入文字 + Enter | 文本输入提示 | + +### 状态指示器 + +| 工具 | 参数 | 说明 | +|------|------|------| +| `status_indicator` | `action`: show/hide/status, `message?` | 控制绑定窗口底部的浮动状态标签 | + +--- + +## 三、教学工具(3 个) + +需要 `teachMode` 开启。 + +| 工具 | 说明 | +|------|------| +| `request_teach_access` | 请求教学引导模式权限 | +| `teach_step` | 显示一步引导提示,等用户点 Next | +| `teach_batch` | 批量排队多步引导 | + +--- + +## 操作流程 + +### 流程 1:全屏操作(未绑定) + +``` +request_access(apps=["Notepad"]) +open_application(app="Notepad") ← 自动绑定窗口 +screenshot ← PrintWindow 截图 + GUI 元素列表 +left_click(coordinate=[500, 300]) ← 全局 SendInput +type(text="hello world") ← 全局 SendInput +key(text="ctrl+s") ← 全局 SendInput +``` + +### 流程 2:绑定窗口操作(推荐,不干扰用户) + +``` +request_access(apps=["Notepad"]) +bind_window(action="list") ← 列出所有窗口 +bind_window(action="bind", title="记事本") ← 绑定 + 绿色边框 + 虚拟光标 +screenshot ← PrintWindow 截取绑定窗口 +virtual_mouse(action="click", coordinate=[500, 300]) ← SendMessageW,不动真实鼠标 +virtual_keyboard(action="type", text="hello world") ← SendMessageW,不动物理键盘 +virtual_keyboard(action="combo", text="ctrl+s") ← 保存 +mouse_wheel(coordinate=[500, 400], delta=-5) ← 向下滚动 +bind_window(action="unbind") ← 解除绑定 +``` + +### 流程 3:按元素名称操作 + +``` +bind_window(action="bind", title="记事本") +screenshot ← 返回截图 + GUI elements 列表 +click_element(name="保存", role="Button") ← UI Automation 查找并点击 +type_into_element(role="Edit", text="new content") +``` + +### 流程 4:终端交互 + +``` +bind_window(action="bind", title="PowerShell") +screenshot +prompt_respond(response_type="yes") ← 回答 y + Enter +prompt_respond(response_type="select", arrow_direction="down", arrow_count=2) ← 选第3项 +``` + +### 流程 5:Excel/浏览器滚动 + +``` +bind_window(action="bind", title="Excel") +screenshot +mouse_wheel(coordinate=[600, 400], delta=-10) ← 向下滚动 10 格 +mouse_wheel(coordinate=[600, 400], delta=5, direction="horizontal") ← 向右滚动 +``` + 
+--- + +## 应用兼容性 + +| 应用类型 | SendMessageW (virtual_*) | 元素操作 (click_element) | 注意 | +|---------|--------------------------|------------------------|------| +| 传统 Win32 (记事本/写字板) | ✅ | ✅ | 完美支持 | +| Office (Excel/Word) | ✅ (COM 自动化) | ✅ | 通过 COM API | +| WPF 应用 | ✅ | ✅ | 标准 UIA 支持 | +| Electron/Chrome | ⚠️ 部分 | ⚠️ 部分 | 内部渲染不走 Win32 消息 | +| UWP/WinUI (Windows Terminal) | ❌ | ❌ | ConPTY 不接受 SendMessageW | +| 浏览器网页内容 | ❌ | ❌ | 需要全局 SendInput | + +**对于不支持 SendMessageW 的应用**,使用通用工具 (`left_click`/`type`/`key`) + `window_management(action="focus")` 先激活窗口。 + +--- + +## 绑定窗口时的可视化 + +绑定窗口后自动启动三层可视化: + +1. **DWM 绿色边框** — 窗口自身的边框颜色变绿,零偏移 +2. **虚拟鼠标光标** — 红色箭头图标,跟随 virtual_mouse 操作移动,点击时闪烁 +3. **状态指示器** — 窗口底部浮动标签,显示当前操作(通过 status_indicator 控制) + +--- + +## Accessibility Snapshot + +每次 `screenshot` 时,如果窗口已绑定,会自动附带 GUI 元素列表: + +``` +GUI elements in this window: +[Button] "Save" (120,50 80x30) enabled +[Edit] "" (200,80 400x25) enabled value="hello" id=textBox1 +[MenuItem] "File" (10,0 40x25) enabled +[MenuItem] "Edit" (50,0 40x25) enabled +[CheckBox] "Auto-save" (300,50 100x20) enabled id=chkAutoSave +``` + +模型同时收到 **截图图片 + 结构化元素列表**,可以选择: +- 用坐标操作:`virtual_mouse(action="click", coordinate=[120, 50])` +- 用名称操作:`click_element(name="Save")` + +--- + +## UI Automation Control Patterns 参考 + +`click_element` / `type_into_element` 底层使用 UI Automation Control Patterns。当前已实现的和可扩展的: + +| Pattern | 用途 | 当前状态 | 可用于 | +|---------|------|---------|--------| +| `InvokePattern` | 触发点击 | ✅ 已实现 (`click_element`) | 按钮、菜单项、链接 | +| `ValuePattern` | 读写文本值 | ✅ 已实现 (`type_into_element`) | 文本框、组合框 | +| `TogglePattern` | 切换状态 | ❌ 未实现 | 复选框、开关 | +| `SelectionPattern` | 选择项目 | ❌ 未实现 | 下拉菜单、列表 | +| `ScrollPattern` | 编程滚动 | ❌ 未实现(用 `mouse_wheel` 替代) | 列表、树、面板 | +| `ExpandCollapsePattern` | 展开/折叠 | ❌ 未实现 | 树节点、折叠面板 | +| `WindowPattern` | 窗口操作 | ❌ 未实现(用 `window_management` 替代) | 窗口最大化/关闭 | +| `TextPattern` | 读取文档文本 | ❌ 未实现 | 文档、富文本 | +| `GridPattern` | 表格操作 | ❌ 未实现 | Excel 单元格、数据网格 | +| `TablePattern` | 表格结构 | 
❌ 未实现 | 表头、行列关系 | +| `RangeValuePattern` | 范围值操作 | ❌ 未实现 | 滑块、进度条 | +| `TransformPattern` | 移动/缩放 | ❌ 未实现 | 可拖拽元素 | + +**扩展路线:** 优先实现 `TogglePattern`(复选框)和 `SelectionPattern`(下拉菜单),这两个在表单自动化中最常用。 + +--- + +## 屏幕截取技术方案对比 + +当前使用 Python Bridge (mss) 进行截图,底层是 GDI BitBlt。三种方案对比: + +| 方案 | API | 当前状态 | 性能 | 优势 | 限制 | +|------|-----|---------|------|------|------| +| **GDI BitBlt** | `BitBlt` / `PrintWindow` | ✅ 当前使用 (mss/bridge.py) | ~300ms | 简单稳定,支持后台窗口 (PrintWindow) | 不支持硬件加速内容、DPI 处理复杂 | +| **DXGI Desktop Duplication** | `IDXGIOutputDuplication` | ❌ 未实现 | ~16ms (60fps) | 硬件加速,支持 HDR,GPU 直接读取 | 不支持单窗口截取,需 D3D11 | +| **Windows.Graphics.Capture** | `GraphicsCaptureItem` | ❌ 未实现 | ~16ms | 最新 API,支持单窗口/单显示器,系统级权限管理 | Win10 1903+,首次需用户确认 | + +### 推荐升级路径 + +``` +当前: GDI BitBlt (mss) ─── 全屏 ~300ms, 窗口 ~300ms (PrintWindow) + │ + ├─ 近期: DXGI Desktop Duplication ─── 全屏 ~16ms, 但不支持单窗口 + │ + └─ 远期: Windows.Graphics.Capture ─── 全屏 + 单窗口都 ~16ms +``` + +### DXGI Desktop Duplication 实现要点 + +```python +# bridge.py 中可添加 DXGI 截图(通过 d3dshot 或 dxcam 库) +import dxcam # pip install dxcam + +camera = dxcam.create() +frame = camera.grab() # numpy array, ~5ms +# 转为 JPEG base64 发送 +``` + +### Windows.Graphics.Capture 实现要点 + +```python +# 需要 WinRT Python 绑定 +# pip install winrt-Windows.Graphics.Capture winrt-Windows.Graphics.DirectX +# 限制:首次调用需要用户在系统弹窗中确认权限 +``` + +--- + +## 输入方式技术矩阵 + +不同应用类型需要不同的输入方式: + +| 输入方式 | API | 优势 | 限制 | 适用应用 | +|---------|-----|------|------|---------| +| **SendMessageW** | `WM_CHAR` / `WM_KEYDOWN` | 不抢焦点,不动真实键鼠 | 现代应用不支持 | Win32 传统应用 (记事本/Office/WPF) | +| **SendInput** | `INPUT` 结构体 | 所有应用都支持 | **必须前台焦点**,会干扰用户 | 所有应用(通用后备) | +| **WriteConsoleInput** | 控制台 API | 直接写入控制台缓冲区 | 需要 AttachConsole(可能被拒绝) | cmd/PowerShell(非 Windows Terminal) | +| **UI Automation** | `InvokePattern` / `ValuePattern` | 语义级操作,最可靠 | 部分应用不暴露 UIA 接口 | 支持 UIA 的应用 | +| **COM Automation** | Excel/Word COM | 完全编程控制 | 仅 Office 应用 | Excel / Word | +| **剪贴板 + 粘贴** | `SetClipboardData` + `Ctrl+V` | 
绕过输入限制 | 会覆盖用户剪贴板 | 通用后备 | + +### 按应用类型的推荐输入策略 + +| 应用类型 | 首选 | 后备 | 说明 | +|---------|------|------|------| +| 传统 Win32 (记事本/写字板) | SendMessageW | UIA ValuePattern | 虚拟输入完美工作 | +| Office (Excel/Word) | COM Automation | SendMessageW | COM 提供结构化操作 | +| WPF 应用 | SendMessageW | UIA | 标准 Win32 消息循环 | +| Electron/Chrome 应用 | UIA | 剪贴板粘贴 | 内部渲染不走 Win32 | +| Windows Terminal (ConPTY) | SendInput (需前台) | 剪贴板粘贴 | ConPTY 不接受外部消息 | +| UWP/WinUI 应用 | SendInput (需前台) | UIA | XAML 渲染不走 Win32 消息 | + +--- + +## 已知限制与待解决 + +| 限制 | 影响 | 计划 | +|------|------|------| +| Windows Terminal 不接受 SendMessageW | 虚拟键盘/鼠标对终端无效 | 自动检测应用类型,终端类切换到 SendInput + 短暂激活 | +| PrintWindow 截不到 alternate screen buffer | Ink REPL 画面截不到 | 切换到 Windows.Graphics.Capture | +| Accessibility Snapshot 对大应用慢 (>30s) | Excel 等复杂应用超时 | 限制遍历深度 + 超时保护 | +| DWM 边框对自定义标题栏应用可能无效 | 某些 Electron 应用看不到边框 | 检测并回退到叠加窗口方案 | +| 虚拟光标是 PowerShell WinForms 进程 | 启动慢 (~1s),资源占用 | 考虑用 Win32 原生窗口替代 | + +--- + +## 技术路线图 + +### Phase 1(当前)— 基础功能 +- ✅ SendMessageW 虚拟输入 +- ✅ PrintWindow/mss 截图 +- ✅ UI Automation (InvokePattern + ValuePattern) +- ✅ Accessibility Snapshot +- ✅ DWM 边框指示 +- ✅ Python Bridge + +### Phase 2(近期)— 兼容性增强 +- ⬜ 应用类型自动检测(Win32 vs Terminal vs UWP) +- ⬜ 终端类应用自动切换 SendInput + 短暂激活 +- ⬜ TogglePattern / SelectionPattern 支持 +- ⬜ DXGI Desktop Duplication 高速截图 +- ⬜ Accessibility Snapshot 超时保护 + +### Phase 3(远期)— 高级能力 +- ⬜ Windows.Graphics.Capture(单窗口实时截图) +- ⬜ 截图元素标注(在截图上标记 ID 数字) +- ⬜ 浏览器 DOM 提取(绑定浏览器时提取网页结构) +- ⬜ GridPattern / TablePattern(Excel 单元格级操作) +- ⬜ TextPattern(文档内容读取) +- ⬜ 多窗口协同操作 diff --git a/docs/features/computer-use.md b/docs/features/computer-use.md index 2cfd54d8c..2ae3cc77f 100644 --- a/docs/features/computer-use.md +++ b/docs/features/computer-use.md @@ -1,136 +1,197 @@ -# Computer Use 用户指南 +# Computer Use — macOS / Windows / Linux 跨平台实施计划 -Computer Use 让 Claude 直接操控你的电脑——移动鼠标、点击、输入文字、截图,就像一个远程助手坐在你面前操作一样。 +更新时间:2026-04-03 +参考项目:`E:\源码\claude-code-source-main\claude-code-source-main` -## 支持平台 +## 1. 
现状 -| 平台 | 状态 | 额外配置 | -|------|------|---------| -| macOS | 可用 | 需授予辅助功能 + 屏幕录制权限 | -| Windows | 可用 | 无需额外配置 | -| Linux | 不可用 | 后端待开发 | +参考项目的 Computer Use **仅支持 macOS**——从入口到底层全部写死 darwin。我们的项目在 Phase 1-3 中已经完成了: + +- ✅ `@ant/computer-use-mcp` stub 替换为完整实现(12 文件) +- ✅ `@ant/computer-use-input` 拆为 dispatcher + backends(darwin + win32) +- ✅ `@ant/computer-use-swift` 拆为 dispatcher + backends(darwin + win32) +- ✅ `CHICAGO_MCP` 编译开关已开 +- ❌ `src/` 层有 6 处 macOS 硬编码阻塞 + +## 2. 阻塞点全景 + +### 2.1 入口层 + +| # | 文件:行号 | 阻塞代码 | 影响 | +|---|----------|---------|------| +| 1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` | 整个 CU 初始化被跳过 | + +### 2.2 加载层 + +| # | 文件:行号 | 阻塞代码 | 影响 | +|---|----------|---------|------| +| 2 | `src/utils/computerUse/swiftLoader.ts:16` | `process.platform !== 'darwin'` → throw | 截图、应用管理全部不可用 | +| 3 | `src/utils/computerUse/executor.ts:263` | `process.platform !== 'darwin'` → throw | 整个 executor 工厂函数不可用 | + +### 2.3 macOS 特有依赖 + +| # | 文件:行号 | 依赖 | macOS 实现 | 需要替代方案 | +|---|----------|------|-----------|------------| +| 4 | `executor.ts:70-88` | 剪贴板 | `pbcopy`/`pbpaste` | Win: PowerShell `Get/Set-Clipboard`;Linux: `xclip`/`wl-copy` | +| 5 | `drainRunLoop.ts:21` | CFRunLoop pump | `cu._drainMainRunLoop()` | 非 darwin:直接执行 fn(),不需要 pump | +| 6 | `escHotkey.ts:28` | ESC 热键 | CGEventTap | 非 darwin:返回 false(已有 Ctrl+C fallback) | +| 7 | `hostAdapter.ts:48-54` | 系统权限 | TCC accessibility + screenRecording | Win:直接 granted;Linux:检查 xdotool | +| 8 | `common.ts:56` | 平台标识 | `platform: 'darwin'` 硬编码 | 动态获取 | +| 9 | `executor.ts:180` | 粘贴快捷键 | `command+v` | Win/Linux:`ctrl+v` | + +### 2.4 缺失的 Linux 后端 + +| 包 | macOS | Windows | Linux | +|---|-------|---------|-------| +| `computer-use-input/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts | +| `computer-use-swift/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts | + +## 3. 
每个平台的能力依赖 + +### 3.1 computer-use-input(键鼠) + +| 功能 | macOS | Windows | Linux | +|------|-------|---------|-------| +| 鼠标移动 | CGEvent JXA | SetCursorPos P/Invoke | xdotool mousemove | +| 鼠标点击 | CGEvent JXA | SendInput P/Invoke | xdotool click | +| 鼠标滚轮 | CGEvent JXA | SendInput MOUSEEVENTF_WHEEL | xdotool click 4/5 | +| 键盘按键 | System Events osascript | keybd_event P/Invoke | xdotool key | +| 组合键 | System Events osascript | keybd_event 组合 | xdotool key combo | +| 文本输入 | System Events keystroke | SendKeys.SendWait | xdotool type | +| 前台应用 | System Events osascript | GetForegroundWindow P/Invoke | xdotool getactivewindow + /proc | +| 工具依赖 | osascript(内置) | powershell(内置) | xdotool(需安装) | + +### 3.2 computer-use-swift(截图 + 应用管理) + +| 功能 | macOS | Windows | Linux | +|------|-------|---------|-------| +| 全屏截图 | screencapture | CopyFromScreen | gnome-screenshot / scrot / grim | +| 区域截图 | screencapture -R | CopyFromScreen(rect) | gnome-screenshot -a / scrot -a / grim -g | +| 显示器列表 | CGGetActiveDisplayList JXA | Screen.AllScreens | xrandr --query | +| 运行中应用 | System Events JXA | Get-Process | wmctrl -l / ps | +| 打开应用 | osascript activate | Start-Process | xdg-open / gtk-launch | +| 隐藏/显示 | System Events visibility | ShowWindow/SetForegroundWindow | xdotool windowminimize / windowactivate | +| 工具依赖 | screencapture + osascript | powershell | xdotool + scrot/grim + wmctrl | + +### 3.3 executor 层 + +| 功能 | macOS | Windows | Linux | +|------|-------|---------|-------| +| drainRunLoop | CFRunLoop pump | 不需要 | 不需要 | +| ESC 热键 | CGEventTap | 跳过(Ctrl+C fallback) | 跳过(Ctrl+C fallback) | +| 剪贴板读 | pbpaste | `powershell Get-Clipboard` | xclip -o / wl-paste | +| 剪贴板写 | pbcopy | `powershell Set-Clipboard` | xclip / wl-copy | +| 粘贴快捷键 | command+v | ctrl+v | ctrl+v | +| 终端检测 | __CFBundleIdentifier | WT_SESSION / TERM_PROGRAM | TERM_PROGRAM | +| 系统权限 | TCC check | 直接 granted | 检查 xdotool 安装 | + +## 4. 
执行步骤 + +### Phase 1:已完成 ✅ + +- [x] `@ant/computer-use-mcp` stub → 完整实现 +- [x] `@ant/computer-use-input` dispatcher + darwin/win32 backends +- [x] `@ant/computer-use-swift` dispatcher + darwin/win32 backends +- [x] `CHICAGO_MCP` 编译开关 + +### Phase 2:移除 6 处 macOS 硬编码(解锁 macOS + Windows) + +**改动原则:macOS 代码路径不变,只在每处 darwin 守卫后加 win32/linux 分支。** + +| 步骤 | 文件 | 改动 | +|------|------|------| +| 2.1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` → 去掉平台限制,或改为 `!== 'unknown'` | +| 2.2 | `src/utils/computerUse/swiftLoader.ts:16-18` | 移除 `process.platform !== 'darwin'` throw。`@ant/computer-use-swift/index.ts` 已有跨平台 dispatch | +| 2.3 | `src/utils/computerUse/executor.ts:263-267` | 移除 `process.platform !== 'darwin'` throw。改为检查 input/swift isSupported | +| 2.4 | `src/utils/computerUse/executor.ts:70-88` | 剪贴板函数按平台分发:darwin→pbcopy/pbpaste,win32→PowerShell Get/Set-Clipboard,linux→xclip | +| 2.5 | `src/utils/computerUse/executor.ts:180` | `typeViaClipboard` 中 `command+v` → 非 darwin 时用 `ctrl+v` | +| 2.6 | `src/utils/computerUse/executor.ts:273` | `const cu = requireComputerUseSwift()` → 改为 `new ComputerUseAPI()`(从 package 直接实例化,不走 swiftLoader throw) | +| 2.7 | `src/utils/computerUse/drainRunLoop.ts` | 开头加 `if (process.platform !== 'darwin') return fn()` | +| 2.8 | `src/utils/computerUse/escHotkey.ts` | `registerEscHotkey` 非 darwin 返回 false(已有 Ctrl+C fallback) | +| 2.9 | `src/utils/computerUse/hostAdapter.ts:48-54` | `ensureOsPermissions` 非 darwin 返回 `{ granted: true }` | +| 2.10 | `src/utils/computerUse/common.ts:56` | `platform: 'darwin'` → `platform: process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 
'linux' : 'darwin'` | +| 2.11 | `src/utils/computerUse/common.ts:55` | `screenshotFiltering: 'native'` → 非 darwin 时 `'none'`(Windows/Linux 截图不支持 per-app 过滤) | +| 2.12 | `src/utils/computerUse/gates.ts:13` | `enabled: false` → `enabled: true`(无 GrowthBook 时默认可用) | +| 2.13 | `src/utils/computerUse/gates.ts:39-43` | `hasRequiredSubscription()` → 直接返回 `true` | + +### Phase 3:新增 Linux 后端 + +| 步骤 | 文件 | 内容 | +|------|------|------| +| 3.1 | `packages/@ant/computer-use-input/src/backends/linux.ts` | xdotool 键鼠(mousemove/click/key/type/getactivewindow) | +| 3.2 | `packages/@ant/computer-use-swift/src/backends/linux.ts` | scrot/grim 截图 + xrandr 显示器 + wmctrl 窗口管理 | +| 3.3 | `packages/@ant/computer-use-input/src/index.ts` | dispatcher 加 `case 'linux'` | +| 3.4 | `packages/@ant/computer-use-swift/src/index.ts` | dispatcher 加 `case 'linux'` | + +### Phase 4:验证 + +| 测试项 | macOS | Windows | Linux | +|--------|-------|---------|-------| +| build 成功 | ✅ | 验证 | 验证 | +| MCP 工具列表非空 | 验证 | 验证 | 验证 | +| 鼠标移动 | 验证 | ✅ 已通过 | 验证 | +| 截图 | 验证 | ✅ 已通过 | 验证 | +| 键盘输入 | 验证 | 验证 | 验证 | +| 前台窗口 | 验证 | ✅ 已通过 | 验证 | +| 剪贴板 | 验证 | 验证 | 验证 | + +## 5. 文件改动总览 + +### 不动的文件(14 个) + +`cleanup.ts`、`computerUseLock.ts`、`wrapper.tsx`、`toolRendering.tsx`、`mcpServer.ts`、`setup.ts`、`appNames.ts`、`inputLoader.ts`、`src/services/mcp/client.ts`、`@ant/computer-use-mcp/src/*`(Phase 1 已完成)、`backends/darwin.ts`(两个包都不动) + +### 改 src/ 的文件(8 个) + +| 文件 | 改动量 | 风险 | +|------|--------|------| +| `main.tsx` | 1 行 | 低 | +| `swiftLoader.ts` | 2 行 | 低 | +| `executor.ts` | ~40 行(剪贴板分发 + 平台守卫 + paste 快捷键) | **中** | +| `drainRunLoop.ts` | 1 行 | 低 | +| `escHotkey.ts` | 3 行 | 低 | +| `hostAdapter.ts` | 5 行 | 低 | +| `common.ts` | 3 行 | 低 | +| `gates.ts` | 3 行 | 低 | + +### 新增文件(2 个) + +| 文件 | 行数估算 | +|------|---------| +| `packages/@ant/computer-use-input/src/backends/linux.ts` | ~150 行 | +| `packages/@ant/computer-use-swift/src/backends/linux.ts` | ~200 行 | + +## 6. 
Linux 依赖工具 + +| 工具 | 用途 | 安装命令(Ubuntu) | +|------|------|-------------------| +| `xdotool` | 键鼠模拟 + 窗口管理 | `sudo apt install xdotool` | +| `scrot` 或 `gnome-screenshot` | 截图 | `sudo apt install scrot` | +| `xrandr` | 显示器信息 | 通常已预装 | +| `xclip` | 剪贴板 | `sudo apt install xclip` | +| `wmctrl` | 窗口列表/切换 | `sudo apt install wmctrl` | + +Wayland 环境需要替代工具:`ydotool`(替代 xdotool)、`grim`(替代 scrot)、`wl-clipboard`(替代 xclip)。初期可先只支持 X11,Wayland 标记为 todo。 + +## 7. 执行顺序建议 -## 快速开始 - -1. 启动 Claude Code: - - ```bash - bun run dev - ``` - - Computer Use 默认已开启,无需额外参数。 - -2. 在对话中告诉 Claude 你想做什么,例如: - - "帮我打开系统设置" - - "截个屏看看当前桌面" - - "在 Finder 里点击那个文件" - -3. 首次操控某个应用时,会弹出权限对话框让你确认。 - -4. 操作过程中随时按 **Esc**(macOS)或 **Ctrl+C**(Windows)中止。 - -## 权限说明 - -Computer Use 采用分级权限模型,保护你的安全: - -| 级别 | 能力 | 适用场景 | -|------|------|---------| -| **full** | 所有操作:鼠标点击(左/右/中键)、拖拽、键盘输入、组合键 | 系统设置、Finder 等系统应用 | -| **click** | 仅左键点击和滚轮滚动 | IDE(VS Code、Cursor)、终端 | -| 未授权 | 所有操作被拒绝 | 需要通过 `request_access` 申请 | - -IDE 类应用默认只有 click 权限,这是安全设计——防止 AI 在你的终端或编辑器中执行危险操作。如需完整控制,可以在权限对话框中手动提升。 - -## 可用操作 - -### 鼠标 - -| 操作 | 说明 | -|------|------| -| 移动鼠标 | 移动到指定坐标 | -| 左键点击 | 单击、双击、三击 | -| 右键点击 | 需要 full 权限 | -| 中键点击 | 需要 full 权限 | -| 拖拽 | 从 A 点拖到 B 点,需要 full 权限 | -| 滚轮 | 向上或向下滚动 | - -### 键盘 - -| 操作 | 说明 | -|------|------| -| 按键 | 单个按键或组合键(如 Ctrl+C) | -| 输入文字 | 逐字符输入文本,需要 full 权限 | -| 长按 | 按住某个键一段时间,需要 full 权限 | - -### 屏幕 - -| 操作 | 说明 | -|------|------| -| 截图 | 截取当前屏幕 | -| 切换显示器 | 多显示器环境下切换目标屏幕 | -| 缩放 | 放大屏幕某个区域 | - -### 其他 - -| 操作 | 说明 | -|------|------| -| 获取鼠标位置 | 查询当前鼠标坐标 | -| 批量操作 | 一次执行多个操作,减少等待 | -| 等待 | 暂停指定秒数(最长 100 秒) | - -## macOS 权限配置 - -首次使用前,需要授予两项系统权限。缺少任一项都会导致功能异常(见下方说明)。 - -### 辅助功能(Accessibility) - -允许 Claude 控制鼠标和键盘。 - -1. 打开 **系统设置 → 隐私与安全性 → 辅助功能** -2. 点击左下角锁图标解锁(需要管理员密码) -3. 将运行 Claude Code 的应用添加到允许列表: - - Terminal → `Terminal.app` - - iTerm → `iTerm.app` - - Cursor → `Cursor.app` - - VS Code 终端 → `Electron` 或 `Visual Studio Code.app` -4. 
确保应用旁边的开关已打开 - -**未授予时的现象**:鼠标移动、点击、键盘输入均无反应,工具执行成功但屏幕没有任何变化。 - -### 屏幕录制(Screen Recording) - -允许 Claude 截取屏幕内容。 - -1. 打开 **系统设置 → 隐私与安全性 → 屏幕录制** -2. 将同一个应用添加到允许列表并开启开关 -3. **需要重启该应用**才能生效(系统会提示 "xxx 需要重新打开") - -**未授予时的现象**:截图工具执行成功但返回空白图片,Claude 无法看到你的屏幕,所有点击操作变成"盲点"。 - -### 验证权限 - -授予两项权限后,重启 Claude Code,在对话中让 Claude 截一张图即可验证是否配置成功。如果截图内容正常显示,说明权限配置完成。 - -## Linux 依赖(暂不可用) - -Linux 后端尚未开发。完成后需要安装以下工具: - -```bash -sudo apt install xdotool scrot xclip wmctrl +``` +Phase 2(解锁 macOS + Windows) + ├── 2.1-2.3 移除 3 处硬编码 throw/skip + ├── 2.4-2.5 剪贴板 + 粘贴快捷键平台分发 + ├── 2.6 swiftLoader → 直接实例化 + ├── 2.7-2.9 drainRunLoop / escHotkey / permissions 平台分支 + ├── 2.10-2.11 common.ts 平台标识动态化 + ├── 2.12-2.13 gates.ts 默认值 + └── 验证 Windows + +Phase 3(Linux 后端) + ├── 3.1 input/backends/linux.ts + ├── 3.2 swift/backends/linux.ts + ├── 3.3-3.4 dispatcher 加 linux case + └── 验证 Linux + +Phase 4(集成验证 + PR) ``` -仅支持 X11,Wayland 不支持。 - -## 常见问题 - -### 截图成功但看不到图片 - -检查 **系统设置 → 隐私与安全性 → 屏幕录制** 是否已授权。未授权时截图工具会执行成功但返回空白内容。 - -### IDE 中无法输入文字或右键 - -这是正常行为。IDE 类应用只有 click 权限,无法执行键盘输入、右键、拖拽等操作。如需完整控制,请在系统应用(如 Finder)中操作。 - -### 操作中途想停止 - -按 **Esc**(macOS)或 **Ctrl+C** 即可立即中止。 +每个 Phase 可独立验证、独立提交。Phase 2 完成后 macOS + Windows 可用,Phase 3 完成后三平台全部可用。 diff --git a/packages/@ant/computer-use-input/src/index.ts b/packages/@ant/computer-use-input/src/index.ts index dc3f1063c..171ef6005 100644 --- a/packages/@ant/computer-use-input/src/index.ts +++ b/packages/@ant/computer-use-input/src/index.ts @@ -1,33 +1,30 @@ /** - * @ant/computer-use-input — cross-platform keyboard & mouse simulation + * @ant/computer-use-input — macOS keyboard & mouse simulation (enigo) * - * Platform backends: - * - darwin: AppleScript/JXA via CoreGraphics events - * - win32: PowerShell via Win32 P/Invoke (SetCursorPos, SendInput, keybd_event) - * - * Add new platforms by creating backends/.ts implementing InputBackend. + * This package wraps the macOS-only native enigo .node module. 
+ * For Windows/Linux, use src/utils/computerUse/platforms/ instead. */ -import type { FrontmostAppInfo, InputBackend } from './types.js' - -export type { FrontmostAppInfo, InputBackend } from './types.js' +export interface FrontmostAppInfo { + bundleId: string + appName: string +} -// --------------------------------------------------------------------------- -// Platform dispatch -// --------------------------------------------------------------------------- +export interface InputBackend { + moveMouse(x: number, y: number, animated: boolean): Promise + key(key: string, action: 'press' | 'release'): Promise + keys(parts: string[]): Promise + mouseLocation(): Promise<{ x: number; y: number }> + mouseButton(button: 'left' | 'right' | 'middle', action: 'click' | 'press' | 'release', count?: number): Promise + mouseScroll(amount: number, direction: 'vertical' | 'horizontal'): Promise + typeText(text: string): Promise + getFrontmostAppInfo(): FrontmostAppInfo | null +} function loadBackend(): InputBackend | null { + if (process.platform !== 'darwin') return null try { - switch (process.platform) { - case 'darwin': - return require('./backends/darwin.js') as InputBackend - case 'win32': - return require('./backends/win32.js') as InputBackend - case 'linux': - return require('./backends/linux.js') as InputBackend - default: - return null - } + return require('./backends/darwin.js') as InputBackend } catch { return null } @@ -35,30 +32,16 @@ function loadBackend(): InputBackend | null { const backend = loadBackend() -// --------------------------------------------------------------------------- -// Unsupported stub (throws on call — guards via isSupported check) -// --------------------------------------------------------------------------- - -function unsupported(): never { - throw new Error(`computer-use-input is not supported on ${process.platform}`) -} - -// --------------------------------------------------------------------------- -// Public API — matches the 
original export surface -// --------------------------------------------------------------------------- - export const isSupported = backend !== null - -export const moveMouse = backend?.moveMouse ?? unsupported -export const key = backend?.key ?? unsupported -export const keys = backend?.keys ?? unsupported -export const mouseLocation = backend?.mouseLocation ?? unsupported -export const mouseButton = backend?.mouseButton ?? unsupported -export const mouseScroll = backend?.mouseScroll ?? unsupported -export const typeText = backend?.typeText ?? unsupported +export const moveMouse = backend?.moveMouse +export const key = backend?.key +export const keys = backend?.keys +export const mouseLocation = backend?.mouseLocation +export const mouseButton = backend?.mouseButton +export const mouseScroll = backend?.mouseScroll +export const typeText = backend?.typeText export const getFrontmostAppInfo = backend?.getFrontmostAppInfo ?? (() => null) -// Legacy class type — used by inputLoader.ts for type narrowing export class ComputerUseInputAPI { declare moveMouse: InputBackend['moveMouse'] declare key: InputBackend['key'] @@ -71,8 +54,5 @@ export class ComputerUseInputAPI { declare isSupported: true } -interface ComputerUseInputUnsupported { - isSupported: false -} - +interface ComputerUseInputUnsupported { isSupported: false } export type ComputerUseInput = ComputerUseInputAPI | ComputerUseInputUnsupported diff --git a/packages/@ant/computer-use-mcp/src/executor.ts b/packages/@ant/computer-use-mcp/src/executor.ts index 8092c68e9..a454631c3 100644 --- a/packages/@ant/computer-use-mcp/src/executor.ts +++ b/packages/@ant/computer-use-mcp/src/executor.ts @@ -16,6 +16,8 @@ export interface ScreenshotResult { originX: number originY: number displayId?: number + /** Accessibility snapshot — structured GUI element tree as model-friendly text. Windows only. 
*/ + accessibilityText?: string } export interface FrontmostApp { @@ -108,4 +110,59 @@ export interface ComputerExecutor { getAppIcon(path: string): Promise listRunningApps(): Promise openApp(bundleId: string): Promise + + // ── Window management (Windows only, optional) ────────────────────────── + /** Perform a window management action on the bound window. Win32 API only — no global shortcuts. */ + manageWindow?(action: string, opts?: { x?: number; y?: number; width?: number; height?: number }): Promise + /** Get the current window rect of the bound window */ + getWindowRect?(): Promise<{ x: number; y: number; width: number; height: number } | null> + + // ── Element-targeted actions (Windows UIA, optional) ──────────────────── + /** Open terminal and launch an agent CLI */ + openTerminal?(opts: { + agent: 'claude' | 'codex' | 'gemini' | 'custom' + command?: string + terminal?: 'wt' | 'powershell' | 'cmd' + workingDirectory?: string + }): Promise<{ hwnd: string; title: string; launched: boolean } | null> + /** Bind to a window by hwnd/title/pid. Returns bound window info or null. */ + bindToWindow?(query: { hwnd?: string; title?: string; pid?: number }): Promise<{ hwnd: string; title: string; pid: number } | null> + /** Unbind from the current window */ + unbindFromWindow?(): Promise + /** Cheap binding-state check for window-targeted routing decisions. 
*/ + hasBoundWindow?(): Promise + /** Get current binding status */ + getBindingStatus?(): Promise<{ bound: boolean; hwnd?: string; title?: string; pid?: number; rect?: { x: number; y: number; width: number; height: number } } | null> + /** List all visible windows */ + listVisibleWindows?(): Promise> + /** Control the status indicator overlay */ + statusIndicator?(action: 'show' | 'hide' | 'status', message?: string): Promise<{ active: boolean; message?: string }> + /** Virtual keyboard — send keys/text/combos to bound window only */ + virtualKeyboard?(opts: { + action: 'type' | 'combo' | 'press' | 'release' | 'hold' + text: string + duration?: number + repeat?: number + }): Promise + /** Virtual mouse — click/move/drag on bound window only */ + virtualMouse?(opts: { + action: 'click' | 'double_click' | 'right_click' | 'move' | 'drag' | 'down' | 'up' + x: number; y: number + startX?: number; startY?: number + }): Promise + /** Mouse wheel scroll at client coordinates (works on Excel, browsers, modern UI) */ + mouseWheel?(x: number, y: number, delta: number, horizontal?: boolean): Promise + /** Activate the bound window (foreground + click to focus) */ + activateWindow?(clickX?: number, clickY?: number): Promise + /** Handle a terminal prompt (yes/no/select/type + enter) */ + respondToPrompt?(opts: { + responseType: 'yes' | 'no' | 'enter' | 'escape' | 'select' | 'type' + arrowDirection?: 'up' | 'down' + arrowCount?: number + text?: string + }): Promise + /** Click an element by name/role/automationId via UI Automation */ + clickElement?(query: { name?: string; role?: string; automationId?: string }): Promise + /** Type text into an element by name/role/automationId via UI Automation ValuePattern */ + typeIntoElement?(query: { name?: string; role?: string; automationId?: string }, text: string): Promise } diff --git a/packages/@ant/computer-use-mcp/src/toolCalls.ts b/packages/@ant/computer-use-mcp/src/toolCalls.ts index 5fcb2b591..f40302fcf 100644 --- 
a/packages/@ant/computer-use-mcp/src/toolCalls.ts +++ b/packages/@ant/computer-use-mcp/src/toolCalls.ts @@ -434,6 +434,15 @@ async function runInputActionGates( } } + // Windows/Linux: operations go through SendMessage (HWND-bound) or platform + // abstraction, not global input to the foreground. The frontmost gate is a + // macOS safety net for global CGEvent input — on other platforms, skip it + // when the platform's screenshotFiltering is 'none' (no per-app filtering, + // meaning no hide/defocus, meaning frontmost is meaningless). + if (adapter.executor.capabilities.screenshotFiltering === 'none') { + return null; // pass — non-macOS platform, frontmost irrelevant + } + // Frontmost gate. Check FRESH on every call. const frontmost = await adapter.executor.getFrontmostApp(); @@ -561,6 +570,13 @@ async function runHitTestGate( y: number, actionKind: CuActionKind, ): Promise { + // Non-macOS: HWND-bound mode — clicks go to the bound window via + // SendMessage with window-relative coordinates. Hit-test against the + // real screen is meaningless. + if (adapter.executor.capabilities.screenshotFiltering === 'none') { + return null; + } + const target = await adapter.executor.appUnderPoint(x, y); if (!target) return null; // desktop / nothing under point / platform no-op @@ -796,12 +812,12 @@ function resolveRequestedApps( if (!resolved) { resolved = byLowerDisplayName.get(requested.toLowerCase()); } - // Fuzzy fallback: match requested name as substring of display name - // e.g. 
"Chrome" matches "Google Chrome", "Code" matches "Visual Studio Code" + // Windows fuzzy matching: strip .exe suffix, try substring match if (!resolved) { - const lower = requested.toLowerCase(); - for (const app of installed) { - if (app.displayName.toLowerCase().includes(lower)) { + const clean = requested.toLowerCase().replace(/\.exe$/, '').trim(); + // Try: "chrome" matches "Google Chrome", "notepad" matches "Notepad" + for (const [name, app] of byLowerDisplayName) { + if (name.includes(clean) || clean.includes(name)) { resolved = app; break; } @@ -2137,6 +2153,8 @@ async function handleScreenshot( content: [ ...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []), ...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []), + // Accessibility snapshot: structured GUI element tree (Windows bound-window mode) + ...(shot.accessibilityText ? [{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []), { type: "image", data: shot.base64, @@ -2204,6 +2222,8 @@ async function handleScreenshot( content: [ ...(monitorNote ? [{ type: "text" as const, text: monitorNote }] : []), ...(hiddenNote ? [{ type: "text" as const, text: hiddenNote }] : []), + // Accessibility snapshot: structured GUI element tree (Windows bound-window mode) + ...(shot.accessibilityText ? 
[{ type: "text" as const, text: `GUI elements in this window:\n${shot.accessibilityText}` }] : []), { type: "image", data: shot.base64, @@ -2812,6 +2832,443 @@ async function handleOpenApplication( return okText(`Opened "${app}".`); } +async function handleVirtualMouse( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.virtualMouse) { + return errorResult("virtual_mouse is only available on Windows with a bound window.", "feature_unavailable"); + } + const action = requireString(args, "action"); + if (action instanceof Error) return errorResult(action.message, "bad_args"); + const coord = args.coordinate; + if (!Array.isArray(coord) || coord.length < 2) { + return errorResult("coordinate [x, y] is required.", "bad_args"); + } + const validActions = new Set(["click", "double_click", "right_click", "move", "drag", "down", "up"]); + if (!validActions.has(action)) { + return errorResult(`Invalid action "${action}". Valid: ${[...validActions].join(", ")}`, "bad_args"); + } + const startCoord = Array.isArray(args.start_coordinate) ? args.start_coordinate : undefined; + const ok = await adapter.executor.virtualMouse({ + action: action as any, + x: coord[0], y: coord[1], + startX: startCoord?.[0], startY: startCoord?.[1], + }); + if (!ok) { + return errorResult("No window is currently bound.", "bad_args"); + } + const desc: Record = { + click: `Click at (${coord[0]},${coord[1]})`, + double_click: `Double-click at (${coord[0]},${coord[1]})`, + right_click: `Right-click at (${coord[0]},${coord[1]})`, + move: `Moved to (${coord[0]},${coord[1]})`, + drag: `Dragged ${startCoord ? `(${startCoord[0]},${startCoord[1]})` : "current"} → (${coord[0]},${coord[1]})`, + down: `Button down at (${coord[0]},${coord[1]})`, + up: `Button up at (${coord[0]},${coord[1]})`, + }; + return okText(desc[action] ?? 
action); +} + +async function handleVirtualKeyboard( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.virtualKeyboard) { + return errorResult("virtual_keyboard is only available on Windows with a bound window.", "feature_unavailable"); + } + const action = requireString(args, "action"); + if (action instanceof Error) return errorResult(action.message, "bad_args"); + const text = requireString(args, "text"); + if (text instanceof Error) return errorResult(text.message, "bad_args"); + + const validActions = new Set(["type", "combo", "press", "release", "hold"]); + if (!validActions.has(action)) { + return errorResult(`Invalid action "${action}". Valid: ${[...validActions].join(", ")}`, "bad_args"); + } + + const duration = typeof args.duration === "number" ? args.duration : undefined; + const repeat = typeof args.repeat === "number" ? args.repeat : undefined; + + const ok = await adapter.executor.virtualKeyboard({ + action: action as any, + text, + duration, + repeat, + }); + + if (!ok) { + return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args"); + } + + const desc: Record = { + type: `Typed "${text.length > 40 ? text.slice(0, 40) + "..." : text}"`, + combo: `Sent ${text}`, + press: `Pressed ${text} (holding)`, + release: `Released ${text}`, + hold: `Held ${text} for ${duration ?? 1}s`, + }; + + return okText(`${desc[action]}${repeat && repeat > 1 ? ` ×${repeat}` : ""}`); +} + +async function handleStatusIndicator( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.statusIndicator) { + return errorResult("status_indicator is only available on Windows.", "feature_unavailable"); + } + const action = requireString(args, "action"); + if (action instanceof Error) return errorResult(action.message, "bad_args"); + if (!["show", "hide", "status"].includes(action)) { + return errorResult(`Invalid action "${action}". 
Valid: show, hide, status.`, "bad_args"); + } + const message = typeof args.message === "string" ? args.message : undefined; + if (action === "show" && !message) { + return errorResult("'show' requires a message parameter.", "bad_args"); + } + const result = await adapter.executor.statusIndicator(action as any, message); + if (action === "status") { + return okText(result.active ? "Indicator is active on the bound window." : "Indicator is not active (no window bound)."); + } + if (action === "show") { + return okText(`Indicator showing: "${message}"`); + } + return okText("Indicator hidden."); +} + +async function handleMouseWheel( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.mouseWheel) { + return errorResult("mouse_wheel is only available on Windows with a bound window.", "feature_unavailable"); + } + const coord = args.coordinate; + if (!Array.isArray(coord) || coord.length < 2) { + return errorResult("coordinate must be [x, y] array.", "bad_args"); + } + const delta = typeof args.delta === "number" ? args.delta : undefined; + if (delta === undefined) { + return errorResult("delta is required (positive=up, negative=down).", "bad_args"); + } + const horizontal = args.direction === "horizontal"; + const ok = await adapter.executor.mouseWheel(coord[0], coord[1], delta, horizontal); + if (!ok) { + return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args"); + } + return okText( + `Mouse wheel: ${horizontal ? "horizontal" : "vertical"} scroll ${delta > 0 ? "up" : "down"} ${Math.abs(delta)} click(s) at (${coord[0]},${coord[1]}).`, + ); +} + +async function handleActivateWindow( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.activateWindow) { + return errorResult("activate_window is only available on Windows with a bound window.", "feature_unavailable"); + } + const clickX = typeof args.click_x === "number" ? 
args.click_x : undefined; + const clickY = typeof args.click_y === "number" ? args.click_y : undefined; + const ok = await adapter.executor.activateWindow(clickX, clickY); + if (!ok) { + return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args"); + } + return okText("Window activated and focused. Ready for input."); +} + +async function handlePromptRespond( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.respondToPrompt) { + return errorResult("prompt_respond is only available on Windows with a bound window.", "feature_unavailable"); + } + const responseType = requireString(args, "response_type"); + if (responseType instanceof Error) return errorResult(responseType.message, "bad_args"); + + const validTypes = new Set(["yes", "no", "enter", "escape", "select", "type"]); + if (!validTypes.has(responseType)) { + return errorResult(`Invalid response_type "${responseType}". Valid: ${[...validTypes].join(", ")}`, "bad_args"); + } + + if (responseType === "select" && typeof args.arrow_count !== "number") { + return errorResult("'select' requires arrow_count parameter.", "bad_args"); + } + if (responseType === "type" && typeof args.text !== "string") { + return errorResult("'type' requires text parameter.", "bad_args"); + } + + const ok = await adapter.executor.respondToPrompt({ + responseType: responseType as any, + arrowDirection: typeof args.arrow_direction === "string" ? args.arrow_direction as any : undefined, + arrowCount: typeof args.arrow_count === "number" ? args.arrow_count : undefined, + text: typeof args.text === "string" ? args.text : undefined, + }); + + if (!ok) { + return errorResult("No window is currently bound. Use open_application or bind_window first.", "bad_args"); + } + + const descriptions: Record = { + yes: "Sent 'y' + Enter.", + no: "Sent 'n' + Enter.", + enter: "Sent Enter.", + escape: "Sent Escape.", + select: `Navigated ${args.arrow_direction ?? 
"down"} ${args.arrow_count ?? 1} time(s) + Enter.`, + type: `Typed "${args.text}" + Enter.`, + }; + + return okText(`Prompt responded: ${descriptions[responseType] ?? responseType}. Take a screenshot to verify.`); +} + +async function handleOpenTerminal( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.openTerminal) { + return errorResult("open_terminal is only available on Windows.", "feature_unavailable"); + } + const agent = requireString(args, "agent"); + if (agent instanceof Error) return errorResult(agent.message, "bad_args"); + + const validAgents = new Set(["claude", "codex", "gemini", "custom"]); + if (!validAgents.has(agent)) { + return errorResult(`Invalid agent "${agent}". Valid: claude, codex, gemini, custom.`, "bad_args"); + } + if (agent === "custom" && typeof args.command !== "string") { + return errorResult("agent='custom' requires 'command' parameter.", "bad_args"); + } + + const result = await adapter.executor.openTerminal({ + agent: agent as any, + command: typeof args.command === "string" ? args.command : undefined, + terminal: typeof args.terminal === "string" ? args.terminal as any : undefined, + workingDirectory: typeof args.working_directory === "string" ? args.working_directory : undefined, + }); + + if (!result) { + return errorResult( + "Failed to open terminal. Windows Terminal (wt.exe) may not be installed.", + "launch_failed", + ); + } + + if (!result.launched) { + return okText( + `Terminal opened (hwnd=${result.hwnd}, "${result.title}") but no command was sent. Window is now bound.`, + ); + } + + const agentNames: Record = { + claude: "Claude Code", codex: "Codex", gemini: "Gemini", + custom: args.command as string, + }; + + return okText( + `Terminal opened and ${agentNames[agent] ?? agent} launched.\n` + + `Window: hwnd=${result.hwnd} "${result.title}"\n` + + `Command: '${agent === "custom" ? args.command : agent}' + Enter\n` + + `Status: bound to this terminal. 
Take a screenshot to verify the agent started.`, + ); +} + +async function handleBindWindow( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + const action = requireString(args, "action"); + if (action instanceof Error) return errorResult(action.message, "bad_args"); + + switch (action) { + case "list": { + if (!adapter.executor.listVisibleWindows) { + return errorResult("bind_window is only available on Windows.", "feature_unavailable"); + } + const windows = await adapter.executor.listVisibleWindows(); + if (windows.length === 0) return okText("No visible windows found."); + const lines = windows.map( + (w) => `hwnd=${w.hwnd} pid=${w.pid} "${w.title}"`, + ); + return okText(`Visible windows (${windows.length}):\n${lines.join("\n")}`); + } + case "status": { + if (!adapter.executor.getBindingStatus) { + return errorResult("bind_window is only available on Windows.", "feature_unavailable"); + } + const status = await adapter.executor.getBindingStatus(); + if (!status || !status.bound) { + return okText("No window is currently bound. Use bind_window(action='list') to see available windows, then bind_window(action='bind', title='...') to bind."); + } + let text = `Bound to: hwnd=${status.hwnd}`; + if (status.title) text += ` "${status.title}"`; + if (status.pid) text += ` pid=${status.pid}`; + if (status.rect) text += ` rect=(${status.rect.x},${status.rect.y} ${status.rect.width}x${status.rect.height})`; + return okText(text); + } + case "bind": { + if (!adapter.executor.bindToWindow) { + return errorResult("bind_window is only available on Windows.", "feature_unavailable"); + } + const title = typeof args.title === "string" ? args.title : undefined; + const hwnd = typeof args.hwnd === "string" ? args.hwnd : undefined; + const pid = typeof args.pid === "number" ? 
args.pid : undefined; + if (!title && !hwnd && !pid) { + return errorResult("Specify at least one of: title, hwnd, or pid.", "bad_args"); + } + const result = await adapter.executor.bindToWindow({ hwnd, title, pid }); + if (!result) { + return errorResult( + `No window found matching: ${[title && `title="${title}"`, hwnd && `hwnd=${hwnd}`, pid && `pid=${pid}`].filter(Boolean).join(", ")}. Use bind_window(action='list') to see available windows.`, + "element_not_found", + ); + } + return okText(`Bound to window: hwnd=${result.hwnd} pid=${result.pid} "${result.title}". All subsequent screenshot/click/type operations target this window.`); + } + case "unbind": { + if (!adapter.executor.unbindFromWindow) { + return errorResult("bind_window is only available on Windows.", "feature_unavailable"); + } + await adapter.executor.unbindFromWindow(); + return okText("Window binding released. Operations now target the full screen."); + } + default: + return errorResult(`Unknown bind_window action "${action}". Valid: list, bind, unbind, status.`, "bad_args"); + } +} + +async function handleClickElement( + adapter: ComputerUseHostAdapter, + args: Record, +): Promise { + if (!adapter.executor.clickElement) { + return errorResult( + "click_element is only available on Windows with a bound window.", + "feature_unavailable", + ); + } + const name = typeof args.name === "string" ? args.name : undefined; + const role = typeof args.role === "string" ? args.role : undefined; + const automationId = typeof args.automationId === "string" ? args.automationId : undefined; + if (!name && !role && !automationId) { + return errorResult("At least one of name, role, or automationId is required.", "bad_args"); + } + const ok = await adapter.executor.clickElement({ name, role, automationId }); + if (!ok) { + return errorResult( + `Element not found: ${[name && `name="${name}"`, role && `role=${role}`, automationId && `id=${automationId}`].filter(Boolean).join(", ")}. 
Take a screenshot to see current GUI elements.`,
+      "element_not_found",
+    );
+  }
+  return okText(`Clicked element: ${[name && `"${name}"`, role, automationId].filter(Boolean).join(" ")}`);
+}
+
+/**
+ * Handle the `type_into_element` tool.
+ *
+ * Requires a string `text` plus at least one selector (`name`, `role`, or
+ * `automationId`) and forwards them to `adapter.executor.typeIntoElement`.
+ * Selector values that are not strings are treated as absent.
+ *
+ * Returns `feature_unavailable` when the executor has no `typeIntoElement`,
+ * `bad_args` when `text` or every selector is missing, and
+ * `element_not_found` when the executor reports failure.
+ */
+async function handleTypeIntoElement(
+  adapter: ComputerUseHostAdapter,
+  args: Record,
+): Promise {
+  if (!adapter.executor.typeIntoElement) {
+    return errorResult(
+      "type_into_element is only available on Windows with a bound window.",
+      "feature_unavailable",
+    );
+  }
+  const text = requireString(args, "text");
+  if (text instanceof Error) return errorResult(text.message, "bad_args");
+  const name = typeof args.name === "string" ? args.name : undefined;
+  const role = typeof args.role === "string" ? args.role : undefined;
+  const automationId = typeof args.automationId === "string" ? args.automationId : undefined;
+  // Same guard as handleClickElement: without any selector the executor would
+  // receive an empty query, and a failure would be reported with an empty
+  // element description.
+  if (!name && !role && !automationId) {
+    return errorResult("At least one of name, role, or automationId is required.", "bad_args");
+  }
+  const ok = await adapter.executor.typeIntoElement({ name, role, automationId }, text);
+  if (!ok) {
+    return errorResult(
+      `Could not type into element: ${[name && `name="${name}"`, role && `role=${role}`, automationId && `id=${automationId}`].filter(Boolean).join(", ")}. The element was not found or doesn't support text input.`,
+      "element_not_found",
+    );
+  }
+  return okText(`Typed ${text.length} chars into: ${[name && `"${name}"`, role, automationId].filter(Boolean).join(" ")}`);
+}
+
+async function handleWindowManagement(
+  adapter: ComputerUseHostAdapter,
+  args: Record,
+): Promise {
+  const action = requireString(args, "action");
+  if (action instanceof Error) return errorResult(action.message, "bad_args");
+
+  const VALID_ACTIONS = new Set([
+    "minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect",
+  ]);
+  if (!VALID_ACTIONS.has(action)) {
+    return errorResult(
+      `Unknown window_management action "${action}".
Valid: ${[...VALID_ACTIONS].join(", ")}`, + "bad_args", + ); + } + + if (!adapter.executor.manageWindow) { + return errorResult( + "window_management is only available on Windows with a bound window.", + "feature_unavailable", + ); + } + + // get_rect: just return the current window position and size + if (action === "get_rect") { + if (!adapter.executor.getWindowRect) { + return errorResult("getWindowRect not available.", "feature_unavailable"); + } + const rect = await adapter.executor.getWindowRect(); + if (!rect) { + return errorResult("No window is currently bound. Call open_application first.", "bad_args"); + } + return okText( + `Window rect: x=${rect.x}, y=${rect.y}, width=${rect.width}, height=${rect.height}`, + ); + } + + // move_resize: requires x, y (width/height optional) + if (action === "move_resize") { + const x = typeof args.x === "number" ? args.x : undefined; + const y = typeof args.y === "number" ? args.y : undefined; + if (x === undefined || y === undefined) { + return errorResult("move_resize requires x and y parameters.", "bad_args"); + } + const width = typeof args.width === "number" ? args.width : undefined; + const height = typeof args.height === "number" ? args.height : undefined; + const ok = await adapter.executor.manageWindow(action, { x, y, width, height }); + if (!ok) { + return errorResult("No window is currently bound. Call open_application first.", "bad_args"); + } + return okText( + width && height + ? `Moved window to (${x}, ${y}) and resized to ${width}×${height}.` + : `Moved window to (${x}, ${y}).`, + ); + } + + // All other actions: minimize, maximize, restore, close, focus, move_offscreen + const ok = await adapter.executor.manageWindow(action); + if (!ok) { + return errorResult( + "No window is currently bound. 
Call open_application first.", + "bad_args", + ); + } + + const descriptions: Record = { + minimize: "Window minimized (ShowWindow SW_MINIMIZE).", + maximize: "Window maximized (ShowWindow SW_MAXIMIZE).", + restore: "Window restored (ShowWindow SW_RESTORE).", + close: "Window closed (SendMessage WM_CLOSE). The window binding has been released.", + focus: "Window brought to front (SetForegroundWindow).", + move_offscreen: "Window moved offscreen (-32000,-32000). Still usable via SendMessage/PrintWindow.", + }; + + return okText(descriptions[action] ?? `Action "${action}" completed.`); +} + async function handleSwitchDisplay( adapter: ComputerUseHostAdapter, args: Record, @@ -3383,6 +3840,64 @@ async function dispatchAction( overrides: ComputerUseOverrides, subGates: CuSubGates, ): Promise { + // ── Bound-window auto-routing ────────────────────────────────────── + // When a window is bound (Win32), route generic input tools to + // virtual_mouse / virtual_keyboard automatically. The model doesn't + // need to know which tools to use — binding handles it. + const hasBoundWindow = + (await adapter.executor.hasBoundWindow?.()) === true && + adapter.executor.virtualMouse && + adapter.executor.virtualKeyboard; + if (hasBoundWindow) { + const coord = Array.isArray(a.coordinate) ? 
a.coordinate as number[] : undefined;
+    // Map each generic input tool onto its bound-window equivalent. A case that
+    // lacks the argument it needs (coordinate / text) breaks out of this switch
+    // instead of returning.
+    switch (name) {
+      case "left_click":
+        if (coord) return handleVirtualMouse(adapter, { action: "click", coordinate: coord });
+        break;
+      case "double_click":
+        if (coord) return handleVirtualMouse(adapter, { action: "double_click", coordinate: coord });
+        break;
+      case "right_click":
+        if (coord) return handleVirtualMouse(adapter, { action: "right_click", coordinate: coord });
+        break;
+      case "mouse_move":
+        if (coord) return handleVirtualMouse(adapter, { action: "move", coordinate: coord });
+        break;
+      case "left_click_drag":
+        if (coord) return handleVirtualMouse(adapter, {
+          action: "drag", coordinate: coord,
+          start_coordinate: Array.isArray(a.start_coordinate) ? a.start_coordinate : undefined,
+        });
+        break;
+      case "left_mouse_down":
+        if (coord) return handleVirtualMouse(adapter, { action: "down", coordinate: coord });
+        break;
+      case "left_mouse_up":
+        if (coord) return handleVirtualMouse(adapter, { action: "up", coordinate: coord });
+        break;
+      case "type":
+        if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "type", text: a.text });
+        break;
+      case "key":
+        if (typeof a.text === "string") return handleVirtualKeyboard(adapter, { action: "combo", text: a.text, repeat: a.repeat });
+        break;
+      case "hold_key":
+        if (typeof a.text === "string") return handleVirtualKeyboard(adapter, {
+          action: "hold", text: a.text,
+          duration: typeof a.duration === "number" ? a.duration : 1,
+        });
+        break;
+      case "scroll":
+        // Sign convention: "up" and "right" are positive, "down" and "left" are
+        // negative, so opposite directions on the same axis never collapse to
+        // the same delta. A missing or non-numeric scroll_amount falls back to 3.
+        // NOTE(review): assumes the executor forwards the sign unchanged to
+        // WM_MOUSEWHEEL / WM_MOUSEHWHEEL (positive = up / right) — confirm.
+        if (coord) return handleMouseWheel(adapter, {
+          coordinate: coord,
+          delta:
+            (a.scroll_direction === "up" || a.scroll_direction === "right" ? 1 : -1) *
+            (typeof a.scroll_amount === "number" ? a.scroll_amount : 3),
+          direction: (a.scroll_direction === "left" || a.scroll_direction === "right") ?
"horizontal" : "vertical", + }); + break; + // screenshot, zoom, wait, cursor_position — not rerouted, pass through + } + } + // ── Standard dispatch (unbound or tools not rerouted above) ──────── switch (name) { case "screenshot": return handleScreenshot(adapter, overrides, subGates); @@ -3434,6 +3949,39 @@ async function dispatchAction( case "open_application": return handleOpenApplication(adapter, a, overrides); + case "window_management": + return handleWindowManagement(adapter, a); + + case "click_element": + return handleClickElement(adapter, a); + + case "type_into_element": + return handleTypeIntoElement(adapter, a); + + case "open_terminal": + return handleOpenTerminal(adapter, a); + + case "bind_window": + return handleBindWindow(adapter, a); + + case "virtual_mouse": + return handleVirtualMouse(adapter, a); + + case "virtual_keyboard": + return handleVirtualKeyboard(adapter, a); + + case "status_indicator": + return handleStatusIndicator(adapter, a); + + case "mouse_wheel": + return handleMouseWheel(adapter, a); + + case "activate_window": + return handleActivateWindow(adapter, a); + + case "prompt_respond": + return handlePromptRespond(adapter, a); + case "switch_display": return handleSwitchDisplay(adapter, a, overrides); diff --git a/packages/@ant/computer-use-mcp/src/tools.ts b/packages/@ant/computer-use-mcp/src/tools.ts index c744a2329..1904700d8 100644 --- a/packages/@ant/computer-use-mcp/src/tools.ts +++ b/packages/@ant/computer-use-mcp/src/tools.ts @@ -118,7 +118,7 @@ const BATCH_ACTION_ITEM_SCHEMA = { export function buildComputerUseTools( caps: { screenshotFiltering: "native" | "none"; - platform: "darwin" | "win32"; + platform: "darwin" | "win32" | "linux"; /** Include request_teach_access + teach_step. Read once at server construction. */ teachMode?: boolean; }, @@ -414,6 +414,353 @@ export function buildComputerUseTools( }, }, + // Window management — Win32 API targeted at bound HWND, no global shortcuts. 
+ // Only available on Windows when a window is bound via open_application. + ...(caps.platform === 'win32' ? [{ + name: "window_management", + description: + "Manage the bound application window via Win32 API calls (ShowWindow, SetWindowPos, SendMessage). " + + "All operations target the bound HWND directly — NO global shortcuts (Win+Down, Alt+F4, etc.). " + + "The window must have been opened via open_application first. " + + "Actions: minimize (hide to taskbar), maximize (fill screen), restore (undo min/max), " + + "close (graceful WM_CLOSE), focus (bring to front), move_offscreen (move to -32000,-32000 for background operation). " + + "Use move_resize to reposition or resize the window to specific coordinates.", + inputSchema: { + type: "object" as const, + properties: { + action: { + type: "string", + enum: ["minimize", "maximize", "restore", "close", "focus", "move_offscreen", "move_resize", "get_rect"], + description: + "minimize: ShowWindow(SW_MINIMIZE). " + + "maximize: ShowWindow(SW_MAXIMIZE). " + + "restore: ShowWindow(SW_RESTORE) — undo minimize or maximize. " + + "close: SendMessage(WM_CLOSE) — graceful close. " + + "focus: SetForegroundWindow + BringWindowToTop. " + + "move_offscreen: SetWindowPos(-32000,-32000) — keeps window usable by SendMessage/PrintWindow but invisible. " + + "move_resize: SetWindowPos to specific x,y,width,height. " + + "get_rect: GetWindowRect — returns current position and size.", + }, + x: { type: "integer", description: "X position for move_resize." }, + y: { type: "integer", description: "Y position for move_resize." }, + width: { type: "integer", description: "Width for move_resize." }, + height: { type: "integer", description: "Height for move_resize." }, + }, + required: ["action"], + }, + } as Tool, + { + name: "click_element", + description: + "Click a GUI element by its accessible name, role, or automationId — no pixel coordinates needed. 
" + + "Uses Windows UI Automation to find the element and InvokePattern to click it. " + + "Prefer this over left_click when the element name is visible in the accessibility snapshot. " + + "Falls back to BoundingRect center-click if InvokePattern is not supported.", + inputSchema: { + type: "object" as const, + properties: { + name: { + type: "string", + description: "Accessible name of the element (e.g. \"Save\", \"File\", \"Search...\"). Case-insensitive partial match.", + }, + role: { + type: "string", + description: "Control type (e.g. \"Button\", \"MenuItem\", \"Edit\", \"Link\"). Optional — narrows the search.", + }, + automationId: { + type: "string", + description: "Exact automationId from the accessibility snapshot. Most precise selector.", + }, + }, + required: [], + }, + } as Tool, + { + name: "type_into_element", + description: + "Type text into a named GUI element using Windows UI Automation ValuePattern. " + + "Finds the element by name/role/automationId, then sets its value directly — " + + "no need to click first or use pixel coordinates. Works on Edit, ComboBox, and other value-holding controls.", + inputSchema: { + type: "object" as const, + properties: { + name: { type: "string", description: "Accessible name of the target element." }, + role: { type: "string", description: "Control type (optional, e.g. \"Edit\")." }, + automationId: { type: "string", description: "Exact automationId." }, + text: { type: "string", description: "Text to type/set into the element." }, + }, + required: ["text"], + }, + } as Tool, + { + name: "open_terminal", + description: + "Open a new terminal window and launch an AI agent CLI. " + + "This is a workflow tool that automates: open terminal → type startup command → press Enter → wait → verify. " + + "Supported agents: claude (runs 'claude'), codex (runs 'codex'), gemini (runs 'gemini'), " + + "or any custom command. 
After launching, the tool binds to the new terminal window " + + "and takes a screenshot to verify the agent started successfully. " + + "Use this when the user says: 'open Claude Code', 'start a Codex terminal', 'launch Gemini', etc.", + inputSchema: { + type: "object" as const, + properties: { + agent: { + type: "string", + enum: ["claude", "codex", "gemini", "custom"], + description: + "Which agent to launch. " + + "claude: runs 'claude' command. " + + "codex: runs 'codex' command. " + + "gemini: runs 'gemini' command. " + + "custom: runs the command specified in 'command' parameter.", + }, + command: { + type: "string", + description: "Custom command to run in the terminal. Only used when agent='custom'. Example: 'python app.py'", + }, + terminal: { + type: "string", + enum: ["wt", "powershell", "cmd"], + description: "Which terminal to open. Default: 'wt' (Windows Terminal). 'powershell' for PowerShell window, 'cmd' for Command Prompt.", + }, + working_directory: { + type: "string", + description: "Working directory for the terminal. If omitted, uses current directory.", + }, + }, + required: ["agent"], + }, + } as Tool, + { + name: "bind_window", + description: + "Bind to a specific window for all subsequent operations (screenshot, click, type, etc.). " + + "Once bound, screenshots capture only that window via PrintWindow, and all input goes through SendMessageW — " + + "no cursor movement, no focus steal, no interference with the user's desktop. " + + "Actions: bind (by title, hwnd, or pid), unbind (release binding), status (show current binding), list (show all visible windows). " + + "Use 'list' first to see available windows, then 'bind' with a title or hwnd. 
" + + "open_application auto-binds the launched window, but use this tool to bind to already-running windows or switch between windows.", + inputSchema: { + type: "object" as const, + properties: { + action: { + type: "string", + enum: ["bind", "unbind", "status", "list"], + description: + "bind: Bind to a window (specify title, hwnd, or pid). " + + "unbind: Release the current binding, return to full-screen mode. " + + "status: Show the currently bound window (hwnd, title, rect). " + + "list: List all visible windows with hwnd, pid, and title.", + }, + title: { + type: "string", + description: "Window title to search for (partial match, case-insensitive). For 'bind' action.", + }, + hwnd: { + type: "string", + description: "Exact window handle from 'list' output. For 'bind' action.", + }, + pid: { + type: "integer", + description: "Process ID to find window for. For 'bind' action.", + }, + }, + required: ["action"], + }, + } as Tool, + { + name: "activate_window", + description: + "Activate the bound window: bring it to foreground, click to ensure keyboard focus, " + + "and optionally send an initial key sequence. Use this before any input operations to guarantee " + + "the window is ready to receive keyboard/mouse events. " + + "Combines SetForegroundWindow + BringWindowToTop + SendMessage(WM_LBUTTONDOWN) in one call.", + inputSchema: { + type: "object" as const, + properties: { + click_x: { type: "integer", description: "X coordinate to click after activation (client-area). If omitted, clicks center of window." }, + click_y: { type: "integer", description: "Y coordinate to click after activation (client-area). If omitted, clicks center of window." }, + }, + required: [], + }, + } as Tool, + { + name: "prompt_respond", + description: + "Handle interactive CLI/terminal prompts (Yes/No, selection menus, confirmations). " + + "Sends a sequence of key events to the bound window to navigate and confirm a prompt. 
" + + "This is a convenience wrapper around bound-window keyboard input for common prompt flows. " + + "Typical flows: " + + "1) Yes/No prompt → send 'y' or 'n' + Enter. " + + "2) Arrow-key selection menu → send arrow_down/arrow_up N times + Enter. " + + "3) Text input prompt → type the response + Enter. " + + "After responding, take a screenshot to verify the result.", + inputSchema: { + type: "object" as const, + properties: { + response_type: { + type: "string", + enum: ["yes", "no", "enter", "escape", "select", "type"], + description: + "yes: send 'y' + Enter. " + + "no: send 'n' + Enter. " + + "enter: send Enter only. " + + "escape: send Escape (cancel). " + + "select: use arrow keys to navigate to an option, then Enter. Requires 'arrow_count'. " + + "type: type custom text then Enter. Requires 'text'.", + }, + arrow_direction: { + type: "string", + enum: ["up", "down"], + description: "Arrow key direction for 'select' type. Default: 'down'.", + }, + arrow_count: { + type: "integer", + description: "Number of arrow key presses for 'select' type. Default: 1.", + minimum: 0, + maximum: 50, + }, + text: { + type: "string", + description: "Text to type for 'type' response_type.", + }, + }, + required: ["response_type"], + }, + } as Tool, + { + name: "status_indicator", + description: + "Control the visual status indicator overlay on the bound window. " + + "The indicator is a small floating label at the bottom of the window that shows what Computer Use is doing. " + + "It auto-shows during click/type/key/scroll operations, but you can also send custom messages. " + + "Actions: show (display a custom message), hide (dismiss), status (check if active).", + inputSchema: { + type: "object" as const, + properties: { + action: { + type: "string", + enum: ["show", "hide", "status"], + description: "show: display a custom message on the indicator. hide: dismiss the indicator. 
status: check if indicator is active.", + }, + message: { + type: "string", + description: "Custom message to display (for 'show' action). Supports emoji. Auto-fades after 2 seconds.", + }, + }, + required: ["action"], + }, + } as Tool, + { + name: "virtual_keyboard", + description: + "Send keyboard input directly to the bound window via SendMessageW — independent of the physical keyboard. " + + "The user can keep typing on their own keyboard without interference. " + + "Supports: single keys, key combinations (Ctrl+S, Alt+F4), text input, and hold-key operations. " + + "All input targets the bound HWND only — no global keyboard events.", + inputSchema: { + type: "object" as const, + properties: { + action: { + type: "string", + enum: ["type", "combo", "press", "release", "hold"], + description: + "type: Send text string via WM_CHAR (Unicode, supports Chinese/emoji). " + + "combo: Send a key combination like ctrl+s, alt+f4, ctrl+shift+a (press all, release in reverse). " + + "press: Press a key down and hold it (pair with 'release'). " + + "release: Release a previously pressed key. " + + "hold: Press key(s) for a duration then release.", + }, + text: { + type: "string", + description: "For 'type': the text to input. For 'combo': key combination string (e.g. 'ctrl+s', 'alt+tab', 'ctrl+shift+a'). For 'press'/'release': single key name (e.g. 'shift', 'ctrl', 'a').", + }, + duration: { + type: "number", + description: "For 'hold': seconds to hold the key(s) before releasing. Default: 1.", + }, + repeat: { + type: "integer", + description: "Number of times to repeat the action. Default: 1.", + minimum: 1, + maximum: 100, + }, + }, + required: ["action", "text"], + }, + } as Tool, + { + name: "virtual_mouse", + description: + "Control a virtual mouse on the bound window via SendMessageW — independent of the physical mouse. " + + "The user's real cursor stays free. 
All operations target the bound HWND only.", + inputSchema: { + type: "object" as const, + properties: { + action: { + type: "string", + enum: ["click", "double_click", "right_click", "move", "drag", "down", "up"], + description: + "click: left-click at coordinate. " + + "double_click: double left-click. " + + "right_click: right-click. " + + "move: move virtual cursor (visual only, no click). " + + "drag: press at start, move to end, release. Requires coordinate (end) and start_coordinate. " + + "down: press left button at coordinate (hold). " + + "up: release left button at coordinate.", + }, + coordinate: { + type: "array", + items: { type: "number" }, + minItems: 2, + maxItems: 2, + description: "(x, y) client-area coordinate on the bound window.", + }, + start_coordinate: { + type: "array", + items: { type: "number" }, + minItems: 2, + maxItems: 2, + description: "(x, y) start point for drag. If omitted, drags from current virtual cursor position.", + }, + }, + required: ["action", "coordinate"], + }, + } as Tool, + { + name: "mouse_wheel", + description: + "Scroll inside the bound window using mouse wheel (WM_MOUSEWHEEL / WM_MOUSEHWHEEL). " + + "Unlike the generic 'scroll' tool which uses WM_VSCROLL (only works on scrollbar controls), " + + "mouse_wheel simulates the physical mouse wheel and works on Excel spreadsheets, web pages, " + + "code editors, PDF viewers, and any modern UI. " + + "Specify the click point within the window where the scroll should occur — " + + "this determines which panel/pane/element receives the scroll.", + inputSchema: { + type: "object" as const, + properties: { + coordinate: { + type: "array", + items: { type: "number" }, + minItems: 2, + maxItems: 2, + description: "(x, y) client-area coordinate where the scroll should occur. Determines which element receives the scroll.", + }, + delta: { + type: "integer", + description: "Scroll amount in 'clicks'. Positive = scroll up, negative = scroll down. Each click = 3 lines typically. 
Use -3 to -5 for page-like scrolling.", + }, + direction: { + type: "string", + enum: ["vertical", "horizontal"], + description: "Scroll direction. Default: 'vertical'. Use 'horizontal' for side-scrolling (e.g. wide Excel sheets, timeline views).", + }, + }, + required: ["coordinate", "delta"], + }, + } as Tool, + ] : []), + { name: "switch_display", description: diff --git a/packages/@ant/computer-use-swift/src/backends/darwin.ts b/packages/@ant/computer-use-swift/src/backends/darwin.ts index c11e381af..620f162a9 100644 --- a/packages/@ant/computer-use-swift/src/backends/darwin.ts +++ b/packages/@ant/computer-use-swift/src/backends/darwin.ts @@ -159,28 +159,23 @@ export const apps: AppsAPI = { async listInstalled() { try { - // Use Spotlight (mdfind) to enumerate .app bundles and mdls to get real bundle IDs. - // Searches /Applications, /System/Applications, and /System/Applications/Utilities - // so that system apps (Terminal, Chess, etc.) and core services (Finder) are found. - const proc = Bun.spawn([ - 'bash', '-c', - `for dir in /Applications /System/Applications /System/Applications/Utilities /System/Library/CoreServices; do -mdfind 'kMDItemContentType == "com.apple.application-bundle"' -onlyin "$dir" 2>/dev/null -done | sort -u | while read -r appPath; do -bundleId=$(mdls -raw -name kMDItemCFBundleIdentifier "$appPath" 2>/dev/null) -if [ -n "$bundleId" ] && [ "$bundleId" != "(null)" ]; then - displayName=$(basename "$appPath" .app) - echo "$bundleId|$displayName|$appPath" -fi -done`, - ], { stdout: 'pipe', stderr: 'pipe' }) - const text = await new Response(proc.stdout).text() - await proc.exited - return text.split('\n').filter(Boolean).map(line => { - const [bundleId, displayName, path] = line.split('|', 3) + const result = await osascript(` + tell application "System Events" + set appList to "" + repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app") + set appPath to POSIX path of (appFile as alias) + set 
appName to name of appFile + set appList to appList & appPath & "|" & appName & "\\n" + end repeat + return appList + end tell + `) + return result.split('\n').filter(Boolean).map(line => { + const [path, name] = line.split('|', 2) + const displayName = (name ?? '').replace(/\.app$/, '') return { - bundleId: bundleId ?? '', - displayName: displayName ?? '', + bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`, + displayName, path: path ?? '', } }) diff --git a/packages/@ant/computer-use-swift/src/index.ts b/packages/@ant/computer-use-swift/src/index.ts index e3daf6e40..b179966f0 100644 --- a/packages/@ant/computer-use-swift/src/index.ts +++ b/packages/@ant/computer-use-swift/src/index.ts @@ -1,14 +1,10 @@ /** - * @ant/computer-use-swift — cross-platform display, apps, and screenshot API + * @ant/computer-use-swift — macOS display, apps, and screenshot (Swift native) * - * Platform backends: - * - darwin: AppleScript/JXA + screencapture - * - win32: PowerShell + System.Drawing + Win32 P/Invoke - * - * Add new platforms by creating backends/.ts implementing SwiftBackend. + * This package wraps the macOS-only Swift .node native module. + * For Windows/Linux, use src/utils/computerUse/platforms/ instead. 
*/ -// Re-export all types export type { DisplayGeometry, PrepareDisplayResult, @@ -18,72 +14,42 @@ export type { ScreenshotResult, ResolvePrepareCaptureResult, WindowDisplayInfo, - DisplayAPI, - AppsAPI, - ScreenshotAPI, - SwiftBackend, -} from './types.js' - -import type { ResolvePrepareCaptureResult, SwiftBackend } from './types.js' +} from './backends/darwin.js' -// --------------------------------------------------------------------------- -// Platform dispatch -// --------------------------------------------------------------------------- +import type { ResolvePrepareCaptureResult } from './backends/darwin.js' -function loadBackend(): SwiftBackend | null { +function loadDarwin() { + if (process.platform !== 'darwin') return null try { - switch (process.platform) { - case 'darwin': - return require('./backends/darwin.js') as SwiftBackend - case 'win32': - return require('./backends/win32.js') as SwiftBackend - case 'linux': - return require('./backends/linux.js') as SwiftBackend - default: - return null - } + return require('./backends/darwin.js') } catch { return null } } -const backend = loadBackend() - -// --------------------------------------------------------------------------- -// ComputerUseAPI — Main export (preserves original class interface) -// --------------------------------------------------------------------------- +const darwin = loadDarwin() export class ComputerUseAPI { - // When no backend is loaded (unsupported platform), all APIs are no-op stubs. - // These stubs should never be reached in practice — callers check isSupported - // or the feature gate before invoking. - - apps = backend?.apps ?? { + apps = darwin?.apps ?? 
{ async prepareDisplay() { return { activated: '', hidden: [] } }, async previewHideSet() { return [] }, - async findWindowDisplays(ids: string[]) { return ids.map(b => ({ bundleId: b, displayIds: [] as number[] })) }, + async findWindowDisplays(ids: string[]) { return ids.map((b: string) => ({ bundleId: b, displayIds: [] as number[] })) }, async appUnderPoint() { return null }, async listInstalled() { return [] }, iconDataUrl() { return null }, listRunning() { return [] }, - async open() { throw new Error('computer-use-swift: no backend for this platform') }, + async open() { throw new Error('@ant/computer-use-swift: macOS only') }, async unhide() {}, } - display = backend?.display ?? { - getSize() { throw new Error('computer-use-swift: no backend for this platform') }, - listAll() { throw new Error('computer-use-swift: no backend for this platform') }, - } - - screenshot = backend?.screenshot ?? { - async captureExcluding() { throw new Error('computer-use-swift: no backend for this platform') }, - async captureRegion() { throw new Error('computer-use-swift: no backend for this platform') }, + display = darwin?.display ?? { + getSize() { throw new Error('@ant/computer-use-swift: macOS only') }, + listAll() { throw new Error('@ant/computer-use-swift: macOS only') }, } - hotkey = (backend as any)?.hotkey ?? { - registerEscape(_cb: () => void): boolean { return false }, - unregister() {}, - notifyExpectedEscape() {}, + screenshot = darwin?.screenshot ?? 
{ + async captureExcluding() { throw new Error('@ant/computer-use-swift: macOS only') }, + async captureRegion() { throw new Error('@ant/computer-use-swift: macOS only') }, } async resolvePrepareCapture( @@ -93,8 +59,6 @@ export class ComputerUseAPI { targetW: number, targetH: number, displayId?: number, - _autoResolve?: boolean, - _doHide?: boolean, ): Promise { return this.screenshot.captureExcluding(allowedBundleIds, quality, targetW, targetH, displayId) } diff --git a/src/utils/computerUse/common.ts b/src/utils/computerUse/common.ts index 37e6701e1..c70eb1a34 100644 --- a/src/utils/computerUse/common.ts +++ b/src/utils/computerUse/common.ts @@ -52,8 +52,14 @@ export function getTerminalBundleId(): string | null { * takes this shape (no `hostBundleId`, no `teachMode`). */ export const CLI_CU_CAPABILITIES = { - screenshotFiltering: (process.platform === 'darwin' ? 'native' : 'none') as any, - platform: (process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin') as any, + screenshotFiltering: (process.platform === 'darwin' + ? 'native' + : 'none') as any, + platform: (process.platform === 'win32' + ? 'win32' + : process.platform === 'linux' + ? 'linux' + : 'darwin') as any, } export function isComputerUseMCPServer(name: string): boolean { diff --git a/src/utils/computerUse/executor.ts b/src/utils/computerUse/executor.ts index 2b6c9ade1..346ac7d50 100644 --- a/src/utils/computerUse/executor.ts +++ b/src/utils/computerUse/executor.ts @@ -297,16 +297,17 @@ export function createCliExecutor(opts: { getMouseAnimationEnabled: () => boolean getHideBeforeActionEnabled: () => boolean }): ComputerExecutor { - if (process.platform !== 'darwin' && process.platform !== 'win32' && process.platform !== 'linux') { - throw new Error( - `createCliExecutor called on ${process.platform}. Computer control requires macOS, Windows, or Linux.`, - ) + // Non-macOS: delegate entirely to the cross-platform executor. 
+ // No macOS code paths, no drainRunLoop, no @ant packages. + if (process.platform !== 'darwin') { + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { createCrossPlatformExecutor } = require('./executorCrossPlatform.js') as typeof import('./executorCrossPlatform.js') + return createCrossPlatformExecutor(opts) } - // Swift loaded once at factory time — every executor method needs it. - // Input loaded lazily via requireComputerUseInput() on first mouse/keyboard - // call — it caches internally, so screenshot-only flows never pull the - // enigo .node. + // ── macOS: native @ant packages ───────────────────────────────────── + // Everything below is macOS-only. No platform checks needed. + const cu = requireComputerUseSwift() const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts @@ -500,18 +501,12 @@ export function createCliExecutor(opts: { async key(keySequence: string, repeat?: number): Promise { const input = requireComputerUseInput() const parts = keySequence.split('+').filter(p => p.length > 0) - // Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape - // etc. pass through without aborting. const isEsc = isBareEscape(parts) const n = repeat ?? 1 await drainRunLoop(async () => { for (let i = 0; i < n; i++) { - if (i > 0) { - await sleep(8) - } - if (isEsc) { - notifyExpectedEscape() - } + if (i > 0) await sleep(8) + if (isEsc) notifyExpectedEscape() await input.keys(parts) } }) @@ -554,12 +549,9 @@ export function createCliExecutor(opts: { async type(text: string, opts: { viaClipboard: boolean }): Promise { const input = requireComputerUseInput() if (opts.viaClipboard) { - // keys(['command','v']) inside needs the pump. await drainRunLoop(() => typeViaClipboard(input, text)) return } - // `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this - // once per grapheme. typeText doesn't dispatch to the main queue. 
await input.typeText(text) }, @@ -656,6 +648,10 @@ export function createCliExecutor(opts: { // ── App management ─────────────────────────────────────────────────── async getFrontmostApp(): Promise { + // When HWND is bound on Windows, operations go through SendMessage + // and don't touch the real foreground. Return the first allowed app + // so the frontmost gate in toolCalls.ts passes — the real foreground + // is irrelevant since we never touch it. const info = requireComputerUseInput().getFrontmostAppInfo() if (!info || !info.bundleId) return null return { bundleId: info.bundleId, displayName: info.appName } @@ -698,6 +694,7 @@ export async function unhideComputerUseApps( bundleIds: readonly string[], ): Promise { if (bundleIds.length === 0) return + if (process.platform !== 'darwin') return // non-macOS: no-op const cu = requireComputerUseSwift() await cu.apps.unhide([...bundleIds]) } diff --git a/src/utils/computerUse/executorCrossPlatform.ts b/src/utils/computerUse/executorCrossPlatform.ts new file mode 100644 index 000000000..e911d1afd --- /dev/null +++ b/src/utils/computerUse/executorCrossPlatform.ts @@ -0,0 +1,1150 @@ +/** + * Cross-platform (Windows/Linux) ComputerExecutor implementation. + * + * Unlike the macOS executor which uses @ant native modules + drainRunLoop + + * CGEventTap, this executor delegates everything to src/utils/computerUse/platforms/. + * + * All operations go through the platform abstraction: + * - Input: SendMessage (HWND-bound, no focus steal) + * - Screenshot: PrintWindow (per-window JPEG) + * - Display: platform-native enumeration + * - Apps: platform-native listing/launching + * + * No drainRunLoop, no CGEventTap, no pbcopy/pbpaste, no @ant packages. + * + * ── Coordinate model (bound-window mode) ───────────────────────────────── + * + * When an HWND is bound, screenshots come from PrintWindow (the bound window), + * NOT from the display. 
This means: + * - Image pixel coords ARE window coords (1:1 after scaleCoord) + * - displayWidth/displayHeight are set to the IMAGE dimensions so scaleCoord + * returns raw image coords unchanged + * - originX/originY are 0 (not the display origin) + * - For clicks, we subtract the non-client area offset (title bar + border) + * so WM_LBUTTONDOWN receives client-relative coords + */ + +import type { + ComputerExecutor, + DisplayGeometry, + FrontmostApp, + InstalledApp, + ResolvePrepareCaptureResult, + RunningApp, + ScreenshotResult, +} from '@ant/computer-use-mcp' + +import { logForDebugging } from '../debug.js' +import { sleep } from '../sleep.js' +import { CLI_CU_CAPABILITIES, CLI_HOST_BUNDLE_ID } from './common.js' +import { validateHwnd } from './win32/shared.js' +import { loadPlatform } from './platforms/index.js' +import type { Platform } from './platforms/index.js' + +// --------------------------------------------------------------------------- +// Helpers for HWND-bound mode +// --------------------------------------------------------------------------- + +/** Get the bound HWND string, or null */ +function getBoundHwndStr(): string | null { + if (process.platform !== 'win32') return null + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + return getBoundHwnd() + } catch { + return null + } +} + +/** Check if we're in HWND-bound mode (Windows only) */ +function isBound(): boolean { + return getBoundHwndStr() !== null +} + +/** + * Get the non-client area offset (title bar height, left border width). + * Returns {dx, dy} to subtract from window coords → client coords. + * For WinUI apps with custom title bars, this may be (0, 0). 
+ */ +function getNonClientOffset(): { dx: number; dy: number } { + if (process.platform !== 'win32') return { dx: 0, dy: 0 } + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = getBoundHwnd() + if (!hwnd) return { dx: 0, dy: 0 } + + validateHwnd(hwnd) + + const result = Bun.spawnSync({ + cmd: [ + 'powershell', + '-NoProfile', + '-NonInteractive', + '-Command', + ` +Add-Type @' +using System; +using System.Runtime.InteropServices; +public class NcCalc { + [DllImport("user32.dll")] public static extern bool GetWindowRect(IntPtr h, out RECT r); + [DllImport("user32.dll")] public static extern bool GetClientRect(IntPtr h, out RECT r); + [DllImport("user32.dll")] public static extern bool ClientToScreen(IntPtr h, ref POINT p); + [StructLayout(LayoutKind.Sequential)] public struct RECT { public int L, T, R, B; } + [StructLayout(LayoutKind.Sequential)] public struct POINT { public int X, Y; } +} +'@ +$h = [IntPtr]::new([long]${hwnd}) +$wr = New-Object NcCalc+RECT +$cr = New-Object NcCalc+RECT +[NcCalc]::GetWindowRect($h, [ref]$wr) | Out-Null +[NcCalc]::GetClientRect($h, [ref]$cr) | Out-Null +$pt = New-Object NcCalc+POINT +$pt.X = 0; $pt.Y = 0 +[NcCalc]::ClientToScreen($h, [ref]$pt) | Out-Null +"$($pt.X - $wr.L),$($pt.Y - $wr.T)" +`, + ], + stdout: 'pipe', + stderr: 'pipe', + }) + const out = new TextDecoder().decode(result.stdout).trim() + const [dxStr, dyStr] = out.split(',') + const dx = Number(dxStr) || 0 + const dy = Number(dyStr) || 0 + return { dx, dy } + } catch { + return { dx: 0, dy: 0 } + } +} + +// Cache non-client offset (recalculate on bind change) +let _ncOffset: { dx: number; dy: number } | null = null +let _ncOffsetHwnd: string | null = null + +function getCachedNcOffset(): { dx: number; dy: number } { + if (process.platform !== 'win32') return { dx: 0, dy: 0 } + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = 
getBoundHwnd() + if (!hwnd) return { dx: 0, dy: 0 } + if (_ncOffset && _ncOffsetHwnd === hwnd) return _ncOffset + _ncOffset = getNonClientOffset() + _ncOffsetHwnd = hwnd + return _ncOffset + } catch { + return { dx: 0, dy: 0 } + } +} + +/** + * Capture the accessibility tree for the bound window (Windows only). + * Returns compact text for the model, or undefined if not available. + */ +function getAccessibilityText(): string | undefined { + if (process.platform !== 'win32' || !isBound()) return undefined + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = getBoundHwnd() + if (!hwnd) return undefined + const { captureAccessibilitySnapshot } = + require('./win32/accessibilitySnapshot.js') as typeof import('./win32/accessibilitySnapshot.js') + const snap = captureAccessibilitySnapshot(hwnd) + if (!snap || !snap.text) return undefined + return snap.text + } catch { + return undefined + } +} + +/** + * Augment a raw screenshot result with the metadata scaleCoord needs. + * When HWND-bound: set displayWidth = imageWidth so coords map 1:1. + * Also captures accessibility snapshot on Windows for GUI element awareness. 
+ */ +function augmentScreenshot( + raw: { base64: string; width: number; height: number }, + display: { width: number; height: number; displayId?: number }, +): ScreenshotResult { + if (isBound()) { + const accessibilityText = getAccessibilityText() + return { + base64: raw.base64, + width: raw.width, + height: raw.height, + displayWidth: raw.width, + displayHeight: raw.height, + originX: 0, + originY: 0, + accessibilityText, + } + } + return { + base64: raw.base64, + width: raw.width, + height: raw.height, + displayWidth: display.width, + displayHeight: display.height, + originX: 0, + originY: 0, + } +} + +// --------------------------------------------------------------------------- +// Executor +// --------------------------------------------------------------------------- + +export function createCrossPlatformExecutor(opts: { + getMouseAnimationEnabled: () => boolean + getHideBeforeActionEnabled: () => boolean +}): ComputerExecutor { + const platform = loadPlatform() + + logForDebugging( + `[computer-use] cross-platform executor for ${process.platform}`, + ) + + return { + capabilities: { + ...CLI_CU_CAPABILITIES, + hostBundleId: CLI_HOST_BUNDLE_ID, + }, + + // ── Pre-action (no-op on non-macOS) ────────────────────────────────── + + async prepareForAction(): Promise { + return [] + }, + + async previewHideSet(): Promise< + Array<{ bundleId: string; displayName: string }> + > { + return [] + }, + + // ── Display ────────────────────────────────────────────────────────── + + async getDisplaySize(displayId?: number): Promise { + const d = platform.display.getSize(displayId) + return { + ...d, + scaleFactor: d.scaleFactor ?? 1, + displayId: d.displayId ?? 
0, + originX: 0, + originY: 0, + } + }, + + async listDisplays(): Promise { + return platform.display + .listAll() + .map(d => ({ ...d, originX: 0, originY: 0 })) + }, + + async findWindowDisplays( + bundleIds: string[], + ): Promise> { + return bundleIds.map(b => ({ bundleId: b, displayIds: [0] })) + }, + + // ── Screenshot ─────────────────────────────────────────────────────── + + async resolvePrepareCapture(opts: { + allowedBundleIds: string[] + preferredDisplayId?: number + autoResolve: boolean + doHide?: boolean + }): Promise { + const d = platform.display.getSize(opts.preferredDisplayId) + const raw = await platform.screenshot.captureScreen( + opts.preferredDisplayId, + ) + const shot = augmentScreenshot(raw, d) + return { + ...shot, + hidden: [], + displayId: opts.preferredDisplayId ?? d.displayId ?? 0, + } + }, + + async screenshot(opts: { + allowedBundleIds: string[] + displayId?: number + }): Promise { + const d = platform.display.getSize(opts.displayId) + const raw = await platform.screenshot.captureScreen(opts.displayId) + return augmentScreenshot(raw, d) + }, + + async zoom( + regionLogical: { x: number; y: number; w: number; h: number }, + _allowedBundleIds: string[], + _displayId?: number, + ): Promise<{ base64: string; width: number; height: number }> { + return platform.screenshot.captureRegion( + regionLogical.x, + regionLogical.y, + regionLogical.w, + regionLogical.h, + ) + }, + + // ── Keyboard ───────────────────────────────────────────────────────── + + async key(keySequence: string, repeat?: number): Promise { + const parts = keySequence.split('+').filter(p => p.length > 0) + const n = repeat ?? 
1
+      for (let i = 0; i < n; i++) {
+        if (i > 0) await sleep(8)
+        await platform.input.keys(parts)
+      }
+    },
+
+    // Press every key in `keyNames`, wait `durationMs`, then release them in
+    // reverse order. The release runs in `finally` so a failed press or an
+    // interrupted sleep cannot leave keys stuck down.
+    async holdKey(keyNames: string[], durationMs: number): Promise<void> {
+      const pressed: string[] = []
+      try {
+        for (const k of keyNames) {
+          // Record before pressing: if the press throws part-way through we
+          // still send a release for that key.
+          pressed.push(k)
+          await platform.input.key(k, 'press')
+        }
+        await sleep(durationMs)
+      } finally {
+        for (const k of pressed.reverse()) {
+          await platform.input.key(k, 'release')
+        }
+      }
+    },
+
+    // Type `text` through the platform input backend. `_opts.viaClipboard` is
+    // accepted for interface compatibility but is not consulted here.
+    async type(text: string, _opts: { viaClipboard: boolean }): Promise<void> {
+      await platform.input.typeText(text)
+    },
+
+    // Read the clipboard as text: `Get-Clipboard` on Windows, `xclip -o`
+    // otherwise. The output is trimmed, so leading/trailing whitespace of the
+    // clipboard content is not preserved.
+    async readClipboard(): Promise<string> {
+      if (process.platform === 'win32') {
+        const result = Bun.spawnSync({
+          cmd: ['powershell', '-NoProfile', '-Command', 'Get-Clipboard'],
+          stdout: 'pipe',
+        })
+        return new TextDecoder().decode(result.stdout).trim()
+      }
+      // Linux
+      const result = Bun.spawnSync({
+        cmd: ['xclip', '-selection', 'clipboard', '-o'],
+        stdout: 'pipe',
+      })
+      return new TextDecoder().decode(result.stdout).trim()
+    },
+
+    // Replace the clipboard content with `text`.
+    async writeClipboard(text: string): Promise<void> {
+      if (process.platform === 'win32') {
+        // Never splice `text` into the PowerShell command: PowerShell accepts
+        // U+2018..U+201B as single-quote delimiters in addition to U+0027, so
+        // escaping only "'" lets such text terminate the literal and run
+        // arbitrary commands. Base64 uses a quote-free alphabet, so the
+        // payload is decoded inside PowerShell and never parsed as script.
+        const payload = Buffer.from(text, 'utf8').toString('base64')
+        Bun.spawnSync({
+          cmd: [
+            'powershell',
+            '-NoProfile',
+            '-Command',
+            `Set-Clipboard -Value ([Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('${payload}')))`,
+          ],
+        })
+        return
+      }
+      // Linux
+      const proc = Bun.spawn(['xclip', '-selection', 'clipboard'], {
+        stdin: 'pipe',
+      })
+      proc.stdin.write(text)
+      proc.stdin.end()
+      await proc.exited
+    },
+
+    // ── Mouse ────────────────────────────────────────────────────────────
+
+    // Move the pointer via the platform input backend.
+    async moveMouse(x: number, y: number): Promise<void> {
+      await platform.input.moveMouse(x, y)
+    },
+
+    // Click `count` times at (x, y). `_modifiers` is accepted for interface
+    // compatibility but is not applied here.
+    async click(
+      x: number,
+      y: number,
+      button: 'left' | 'right' | 'middle',
+      count: 1 | 2 | 3,
+      _modifiers?: string[],
+    ): Promise<void> {
+      let clickX = Math.round(x)
+      let clickY = Math.round(y)
+
+      // When HWND-bound: scaleCoord gives us window coords (1:1 from image).
+      // Subtract non-client offset to get client-area coords for WM_LBUTTONDOWN.
+      if (isBound()) {
+        const nc = getCachedNcOffset()
+        clickX -= nc.dx
+        clickY -= nc.dy
+        logForDebugging(
+          `[computer-use] click(${Math.round(x)},${Math.round(y)}) → client(${clickX},${clickY}) [nc offset: ${nc.dx},${nc.dy}]`,
+        )
+      }
+
+      for (let i = 0; i < count; i++) {
+        await platform.input.click(clickX, clickY, button)
+      }
+    },
+
+    // Press the left mouse button.
+    // Bound (Windows): send the button-down message to the bound HWND at the
+    // position reported by platform.input.mouseLocation().
+    // Unbound (Windows): inject MOUSEEVENTF_LEFTDOWN through SendInput.
+    // On other platforms this method falls through and does nothing.
+    async mouseDown(): Promise<void> {
+      if (isBound() && process.platform === 'win32') {
+        const { getBoundHwnd } =
+          require('./platforms/win32.js') as typeof import('./platforms/win32.js')
+        const hwnd = getBoundHwnd()
+        if (hwnd) {
+          const { sendMouseDown } =
+            require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js')
+          const pos = await platform.input.mouseLocation()
+          sendMouseDown(hwnd, pos.x, pos.y)
+          return
+        }
+      }
+      // Unbound: SendInput with MOUSEEVENTF_LEFTDOWN
+      // INPUT uses sequential layout on purpose: the union member must sit at
+      // offset 8 on 64-bit Windows (MOUSEINPUT holds a pointer-sized field)
+      // and at offset 4 on 32-bit. A hard-coded FieldOffset(4) shifts every
+      // member on x64, so SendInput would read dwFlags as 0 and do nothing.
+      if (process.platform === 'win32') {
+        Bun.spawnSync({
+          cmd: [
+            'powershell',
+            '-NoProfile',
+            '-NonInteractive',
+            '-Command',
+            `Add-Type -Language CSharp @'
+using System; using System.Runtime.InteropServices;
+public class MDown { [StructLayout(LayoutKind.Sequential)] public struct MOUSEINPUT { public int dx; public int dy; public int mouseData; public uint dwFlags; public uint time; public IntPtr dwExtraInfo; }
+[StructLayout(LayoutKind.Sequential)] public struct INPUT { public uint type; public MOUSEINPUT mi; }
+[DllImport("user32.dll",SetLastError=true)] public static extern uint SendInput(uint n, INPUT[] i, int cb); }
+'@
+$i = New-Object MDown+INPUT; $i.type=0; $i.mi.dwFlags=0x0002; [MDown]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
+          ],
+        })
+        return
+      }
+    },
+
+    // Release the left mouse button. Mirrors mouseDown(): a window message in
+    // bound mode, SendInput with MOUSEEVENTF_LEFTUP otherwise (Windows only).
+    async mouseUp(): Promise<void> {
+      if (isBound() && process.platform === 'win32') {
+        const { getBoundHwnd } =
+          require('./platforms/win32.js') as typeof import('./platforms/win32.js')
+        const hwnd = getBoundHwnd()
+        if (hwnd) {
+          const { sendMouseUp } =
+            require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js')
+          const pos = await platform.input.mouseLocation()
+          sendMouseUp(hwnd, pos.x, pos.y)
+          return
+        }
+      }
+      // Unbound: SendInput with MOUSEEVENTF_LEFTUP
+      // Sequential INPUT layout for the same reason as in mouseDown(): the
+      // union offset is 8 on x64 and 4 on x86, so it must not be hard-coded.
+      if (process.platform === 'win32') {
+        Bun.spawnSync({
+          cmd: [
+            'powershell',
+            '-NoProfile',
+            '-NonInteractive',
+            '-Command',
+            `Add-Type -Language CSharp @'
+using System; using System.Runtime.InteropServices;
+public class MUp { [StructLayout(LayoutKind.Sequential)] public struct MOUSEINPUT { public int dx; public int dy; public int mouseData; public uint dwFlags; public uint time; public IntPtr dwExtraInfo; }
+[StructLayout(LayoutKind.Sequential)] public struct INPUT { public uint type; public MOUSEINPUT mi; }
+[DllImport("user32.dll",SetLastError=true)] public static extern uint SendInput(uint n, INPUT[] i, int cb); }
+'@
+$i = New-Object MUp+INPUT; $i.type=0; $i.mi.dwFlags=0x0004; [MUp]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
+          ],
+        })
+        return
+      }
+    },
+
+    // Report the pointer position from the platform input backend.
+    async getCursorPosition(): Promise<{ x: number; y: number }> {
+      return platform.input.mouseLocation()
+    },
+
+    // Drag from `from` to `to`. When `from` is undefined the drag starts at
+    // the current pointer position.
+    async drag(
+      from: { x: number; y: number } | undefined,
+      to: { x: number; y: number },
+    ): Promise<void> {
+      if (isBound() && process.platform === 'win32') {
+        const { getBoundHwnd } =
+          require('./platforms/win32.js') as typeof import('./platforms/win32.js')
+        const hwnd = getBoundHwnd()
+        if (hwnd) {
+          const { sendMouseDown, sendMouseMove, sendMouseUp } =
+            require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js')
+          const nc = getCachedNcOffset()
+          if (from) {
+            const fx = Math.round(from.x) - nc.dx
+            const fy = Math.round(from.y) - nc.dy
+            sendMouseDown(hwnd, fx, fy)
+          } else {
+            // No explicit start point: still press the button, at the same
+            // position mouseDown() uses in bound mode. Without this the
+            // window would receive a move and a button-up with no matching
+            // button-down, and no drag would happen.
+            const pos = await platform.input.mouseLocation()
+            sendMouseDown(hwnd, pos.x, pos.y)
+          }
+          await sleep(50)
+          const tx = Math.round(to.x) - nc.dx
+          const ty = Math.round(to.y) - nc.dy
+          sendMouseMove(hwnd, tx, ty)
+          await sleep(16)
+          sendMouseUp(hwnd, tx, ty)
+          return
+        }
+      }
+      // Unbound: press at from, move to to, release
+      if (from) 
{ + await platform.input.moveMouse(from.x, from.y) + await sleep(16) + } + // mouseDown + await (this as any).mouseDown() + await sleep(50) + await platform.input.moveMouse(to.x, to.y) + await sleep(16) + // mouseUp + await (this as any).mouseUp() + }, + + async scroll(x: number, y: number, dx: number, dy: number): Promise { + if (dy !== 0) await platform.input.scroll(dy, 'vertical') + if (dx !== 0) await platform.input.scroll(dx, 'horizontal') + }, + + // ── App management ─────────────────────────────────────────────────── + + async getFrontmostApp(): Promise { + // When HWND is bound, return a synthetic identity + // so the frontmost gate passes (operations target bound window, not foreground) + if (isBound()) { + return { bundleId: 'cu-bound-window', displayName: 'Bound Window' } + } + const info = platform.apps.getFrontmostApp() + if (!info) return null + return { bundleId: info.id, displayName: info.appName } + }, + + async appUnderPoint( + _x: number, + _y: number, + ): Promise<{ bundleId: string; displayName: string } | null> { + return null + }, + + async listInstalledApps(): Promise { + return (await platform.apps.listInstalled()).map(a => ({ + bundleId: a.id, + displayName: a.displayName, + path: a.path, + })) + }, + + async getAppIcon(_path: string): Promise { + return undefined + }, + + async listRunningApps(): Promise { + return platform.apps.listRunning().map(w => ({ + bundleId: w.id, + displayName: w.title, + })) + }, + + async openApp(bundleId: string): Promise { + await platform.apps.open(bundleId) + // Invalidate NC offset cache on new bind + _ncOffset = null + _ncOffsetHwnd = null + }, + + // ── Window management (Windows only) ────────────────────────────── + async manageWindow(action: string, opts?): Promise { + if (!platform.windowManagement) return false + const result = platform.windowManagement.manageWindow(action as any, opts) + // Invalidate NC offset cache on window state change + _ncOffset = null + _ncOffsetHwnd = null + return result 
+ }, + + async getWindowRect(): Promise<{ + x: number + y: number + width: number + height: number + } | null> { + if (!platform.windowManagement) return null + return platform.windowManagement.getWindowRect() + }, + + // ── Open terminal + launch agent ───────────────────────────────── + async openTerminal(opts: { + agent: 'claude' | 'codex' | 'gemini' | 'custom' + command?: string + terminal?: 'wt' | 'powershell' | 'cmd' + workingDirectory?: string + }): Promise<{ hwnd: string; title: string; launched: boolean } | null> { + if (process.platform !== 'win32') return null + try { + const { listWindows: enumWindows } = + require('./win32/windowEnum.js') as typeof import('./win32/windowEnum.js') + const { bindWindow } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + + const agentCmd: Record = { + claude: 'claude', + codex: 'codex', + gemini: 'gemini', + custom: opts.command ?? '', + } + const cmd = agentCmd[opts.agent] + if (!cmd) return null + + // Snapshot current windows + const beforeHwnds = new Set(enumWindows().map(w => w.hwnd)) + const cwd = opts.workingDirectory ?? 
process.cwd() + const escapedCwd = cwd.replace(/'/g, "''") + + // Start-Process powershell opens a NEW visible PowerShell window + Bun.spawnSync({ + cmd: [ + 'powershell', + '-Command', + `Start-Process powershell -ArgumentList '-NoExit','-Command','cd ''${escapedCwd}''; ${cmd}'`, + ], + stdout: 'ignore', + stderr: 'ignore', + }) + + // Poll for new terminal window (up to 5s) + let newHwnd: string | null = null + let newTitle = '' + for (let i = 0; i < 25; i++) { + await sleep(200) + for (const w of enumWindows()) { + if (!beforeHwnds.has(w.hwnd)) { + const t = w.title.toLowerCase() + if ( + t.includes('terminal') || + t.includes('powershell') || + t.includes('cmd') || + t.includes(cmd.toLowerCase()) + ) { + newHwnd = w.hwnd + newTitle = w.title + break + } + } + } + if (newHwnd) break + } + if (!newHwnd) return null + + // Bind to new terminal + const win = enumWindows().find(w => w.hwnd === newHwnd) + bindWindow(newHwnd, win?.pid) + _ncOffset = null + _ncOffsetHwnd = null + + // Wait for agent to initialize + await sleep(2000) + return { hwnd: newHwnd, title: newTitle, launched: true } + } catch { + return null + } + }, + + // ── Window binding (Windows only) ─────────────────────────────── + async bindToWindow(query: { hwnd?: string; title?: string; pid?: number }) { + if (process.platform !== 'win32') return null + const { bindWindow } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const { listWindows: enumWindows } = + require('./win32/windowEnum.js') as typeof import('./win32/windowEnum.js') + const windows = enumWindows() + + let target: { hwnd: string; pid: number; title: string } | undefined + if (query.hwnd) { + target = windows.find(w => w.hwnd === query.hwnd) + } else if (query.title) { + const lower = query.title.toLowerCase() + target = windows.find(w => w.title.toLowerCase().includes(lower)) + } else if (query.pid) { + target = windows.find(w => w.pid === query.pid) + } + + if (!target) return null + 
bindWindow(target.hwnd, target.pid) + _ncOffset = null + _ncOffsetHwnd = null + return target + }, + + async unbindFromWindow() { + if (process.platform !== 'win32') return + const { unbindWindow } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + unbindWindow() + _ncOffset = null + _ncOffsetHwnd = null + }, + + async hasBoundWindow() { + return isBound() + }, + + async getBindingStatus() { + if (process.platform !== 'win32') return null + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = getBoundHwnd() + if (!hwnd) return { bound: false } + const { listWindows: enumWindows } = + require('./win32/windowEnum.js') as typeof import('./win32/windowEnum.js') + const windows = enumWindows() + const win = windows.find(w => w.hwnd === hwnd) + const rect = platform.windowManagement?.getWindowRect() ?? undefined + return { + bound: true, + hwnd, + title: win?.title, + pid: win?.pid, + rect: rect ?? undefined, + } + }, + + async listVisibleWindows() { + if (process.platform !== 'win32') return [] + const { listWindows: enumWindows } = + require('./win32/windowEnum.js') as typeof import('./win32/windowEnum.js') + return enumWindows() + }, + + // ── Status indicator ────────────────────────────────────────────── + async statusIndicator( + action: 'show' | 'hide' | 'status', + message?: string, + ) { + if (process.platform !== 'win32') return { active: false } + try { + const ind = + require('./win32/inputIndicator.js') as typeof import('./win32/inputIndicator.js') + if (action === 'show' && message) { + ind.updateIndicator(message) + return { active: true, message } + } + if (action === 'hide') { + ind.hideIndicator() + return { active: false } + } + // status + return { active: isBound() } + } catch { + return { active: false } + } + }, + + // ── Virtual keyboard (PostMessage to bound window only) ──────────── + // Key events use PostMessage + correct lParam (scan code via 
MapVirtualKeyW). + // This is required for Windows Terminal / ConPTY to correctly translate + // WM_KEYDOWN/WM_KEYUP into console input events. + async virtualKeyboard(opts: { + action: 'type' | 'combo' | 'press' | 'release' | 'hold' + text: string + duration?: number + repeat?: number + }): Promise { + if (process.platform !== 'win32' || !isBound()) return false + try { + const hwnd = getBoundHwndStr() + if (!hwnd) return false + const wm = + require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js') + const { VK_MAP } = + require('./win32/shared.js') as typeof import('./win32/shared.js') + const repeat = opts.repeat ?? 1 + + for (let r = 0; r < repeat; r++) { + if (r > 0) await sleep(30) + + switch (opts.action) { + case 'type': + wm.sendText(hwnd, opts.text) + break + + case 'combo': { + const parts = opts.text + .split('+') + .map(k => k.trim()) + .filter(Boolean) + wm.sendKeys(hwnd, parts) + break + } + + case 'press': { + const lower = opts.text.toLowerCase() + const vk = + VK_MAP[lower] ?? + (opts.text.length === 1 ? opts.text.charCodeAt(0) : 0) + if (vk) wm.sendKey(hwnd, vk, 'down') + break + } + + case 'release': { + const lower = opts.text.toLowerCase() + const vk = + VK_MAP[lower] ?? + (opts.text.length === 1 ? opts.text.charCodeAt(0) : 0) + if (vk) wm.sendKey(hwnd, vk, 'up') + break + } + + case 'hold': { + const parts = opts.text + .split('+') + .map(k => k.trim()) + .filter(Boolean) + // Press all keys + for (const k of parts) { + const lower = k.toLowerCase() + const vk = + VK_MAP[lower] ?? (k.length === 1 ? k.charCodeAt(0) : 0) + if (vk) wm.sendKey(hwnd, vk, 'down') + } + // Hold + await sleep((opts.duration ?? 1) * 1000) + // Release in reverse + for (const k of [...parts].reverse()) { + const lower = k.toLowerCase() + const vk = + VK_MAP[lower] ?? (k.length === 1 ? 
k.charCodeAt(0) : 0) + if (vk) wm.sendKey(hwnd, vk, 'up') + } + break + } + } + } + return true + } catch { + return false + } + }, + + // ── Virtual mouse (SendMessageW to bound window only) ───────────── + async virtualMouse(opts: { + action: + | 'click' + | 'double_click' + | 'right_click' + | 'move' + | 'drag' + | 'down' + | 'up' + x: number + y: number + startX?: number + startY?: number + }): Promise { + if (process.platform !== 'win32' || !isBound()) return false + try { + const hwnd = getBoundHwndStr() + if (!hwnd) return false + const wm = + require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js') + const vc = + require('./win32/virtualCursor.js') as typeof import('./win32/virtualCursor.js') + const x = Math.round(opts.x) + const y = Math.round(opts.y) + + switch (opts.action) { + case 'click': + vc.moveVirtualCursor(x, y, true) + wm.sendClick(hwnd, x, y, 'left') + break + case 'double_click': + vc.moveVirtualCursor(x, y, true) + wm.sendClick(hwnd, x, y, 'left') + await sleep(50) + wm.sendClick(hwnd, x, y, 'left') + break + case 'right_click': + vc.moveVirtualCursor(x, y, true) + wm.sendClick(hwnd, x, y, 'right') + break + case 'move': + vc.moveVirtualCursor(x, y) + wm.sendMouseMove(hwnd, x, y) + break + case 'drag': { + const sx = Math.round(opts.startX ?? x) + const sy = Math.round(opts.startY ?? 
y) + vc.moveVirtualCursor(sx, sy, true) + wm.sendMouseDown(hwnd, sx, sy) + await sleep(16) + wm.sendMouseMove(hwnd, x, y) + vc.moveVirtualCursor(x, y) + await sleep(16) + wm.sendMouseUp(hwnd, x, y) + break + } + case 'down': + vc.moveVirtualCursor(x, y, true) + wm.sendMouseDown(hwnd, x, y) + break + case 'up': + vc.moveVirtualCursor(x, y) + wm.sendMouseUp(hwnd, x, y) + break + } + return true + } catch { + return false + } + }, + + // ── Mouse wheel scroll (WM_MOUSEWHEEL, works on Excel/browsers) ── + async mouseWheel( + x: number, + y: number, + delta: number, + horizontal?: boolean, + ): Promise { + if (process.platform !== 'win32' || !isBound()) return false + try { + const hwnd = getBoundHwndStr() + if (!hwnd) return false + // Try Python Bridge first (via bridgeClient directly) + try { + const bridge = + require('./win32/bridgeClient.js') as typeof import('./win32/bridgeClient.js') + const result = bridge.callSync('send_mouse_wheel', { + hwnd, + x: Math.round(x), + y: Math.round(y), + delta, + horizontal: horizontal ?? false, + }) + if (result !== null) return true + } catch {} + // Fallback: windowMessage.ts (PowerShell) + const { sendMouseWheel } = + require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js') + return sendMouseWheel( + hwnd, + Math.round(x), + Math.round(y), + delta, + horizontal ?? 
false, + ) + } catch { + return false + } + }, + + // ── Window activation + prompt interaction ──────────────────────── + async activateWindow(clickX?: number, clickY?: number): Promise { + if (process.platform !== 'win32' || !isBound()) return false + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = getBoundHwnd() + if (!hwnd) return false + // Focus: restore if minimized, bring to foreground + if (platform.windowManagement) { + platform.windowManagement.manageWindow('focus') + } + // Click to ensure keyboard focus inside the window + const { sendClick } = + require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js') + if (clickX !== undefined && clickY !== undefined) { + sendClick(hwnd, clickX, clickY, 'left') + } else { + // Click center of client area + const rect = platform.windowManagement?.getWindowRect() + if (rect) { + const nc = getCachedNcOffset() + const cx = Math.round(rect.width / 2) - nc.dx + const cy = Math.round(rect.height / 2) - nc.dy + sendClick(hwnd, cx, cy, 'left') + } + } + return true + } catch { + return false + } + }, + + async respondToPrompt(opts: { + responseType: 'yes' | 'no' | 'enter' | 'escape' | 'select' | 'type' + arrowDirection?: 'up' | 'down' + arrowCount?: number + text?: string + }): Promise { + if (process.platform !== 'win32' || !isBound()) return false + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = getBoundHwnd() + if (!hwnd) return false + const wm = + require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js') + + const VK_RETURN = 0x0d + const VK_ESCAPE = 0x1b + const VK_UP = 0x26 + const VK_DOWN = 0x28 + + switch (opts.responseType) { + case 'yes': + wm.sendChar(hwnd, 'y') + await sleep(50) + wm.sendKey(hwnd, VK_RETURN, 'down') + wm.sendKey(hwnd, VK_RETURN, 'up') + break + case 'no': + wm.sendChar(hwnd, 'n') + await sleep(50) + 
wm.sendKey(hwnd, VK_RETURN, 'down') + wm.sendKey(hwnd, VK_RETURN, 'up') + break + case 'enter': + wm.sendKey(hwnd, VK_RETURN, 'down') + wm.sendKey(hwnd, VK_RETURN, 'up') + break + case 'escape': + wm.sendKey(hwnd, VK_ESCAPE, 'down') + wm.sendKey(hwnd, VK_ESCAPE, 'up') + break + case 'select': { + const vk = + (opts.arrowDirection ?? 'down') === 'down' ? VK_DOWN : VK_UP + const count = opts.arrowCount ?? 1 + for (let i = 0; i < count; i++) { + wm.sendKey(hwnd, vk, 'down') + wm.sendKey(hwnd, vk, 'up') + await sleep(30) + } + await sleep(50) + wm.sendKey(hwnd, VK_RETURN, 'down') + wm.sendKey(hwnd, VK_RETURN, 'up') + break + } + case 'type': + if (opts.text) { + wm.sendText(hwnd, opts.text) + await sleep(50) + } + wm.sendKey(hwnd, VK_RETURN, 'down') + wm.sendKey(hwnd, VK_RETURN, 'up') + break + } + return true + } catch { + return false + } + }, + + // ── Element-targeted actions (Windows UIA) ────────────────────────── + async clickElement(query: { + name?: string + role?: string + automationId?: string + }): Promise { + if (process.platform !== 'win32' || !isBound()) return false + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = getBoundHwnd() + if (!hwnd) return false + const { captureAccessibilitySnapshot, findNodeInSnapshot } = + require('./win32/accessibilitySnapshot.js') as typeof import('./win32/accessibilitySnapshot.js') + const snap = captureAccessibilitySnapshot(hwnd) + if (!snap) return false + const node = findNodeInSnapshot(snap.nodes, query) + if (!node) return false + + // Try InvokePattern first (Button, MenuItem, Link) + const { clickElement: uiaClick } = + require('./win32/uiAutomation.js') as typeof import('./win32/uiAutomation.js') + // Get window title for UIA lookup + const windows = require('./win32/windowEnum.js').listWindows() as { + hwnd: string + title: string + }[] + const win = windows.find((w: any) => w.hwnd === hwnd) + if (win && node.automationId) { + if 
(uiaClick(win.title, node.automationId)) return true + } + + // Fallback: click center of bounding rect via SendMessage + const cx = node.bounds.x + Math.round(node.bounds.w / 2) + const cy = node.bounds.y + Math.round(node.bounds.h / 2) + // Convert screen coords to client coords + const nc = getCachedNcOffset() + const { sendClick } = + require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js') + const editHwnd = require('./win32/windowMessage.js').findEditChild(hwnd) + sendClick(editHwnd ?? hwnd, cx - nc.dx, cy - nc.dy, 'left') + return true + } catch { + return false + } + }, + + async typeIntoElement( + query: { name?: string; role?: string; automationId?: string }, + text: string, + ): Promise { + if (process.platform !== 'win32' || !isBound()) return false + try { + const { getBoundHwnd } = + require('./platforms/win32.js') as typeof import('./platforms/win32.js') + const hwnd = getBoundHwnd() + if (!hwnd) return false + + // Try UIA ValuePattern directly + const windows = require('./win32/windowEnum.js').listWindows() as { + hwnd: string + title: string + }[] + const win = windows.find((w: any) => w.hwnd === hwnd) + if (win) { + const { setValue, findElement } = + require('./win32/uiAutomation.js') as typeof import('./win32/uiAutomation.js') + // Try by automationId first, then by name+role + if (query.automationId) { + if (setValue(win.title, query.automationId, text)) return true + } + if (query.name) { + const el = findElement(win.title, query) + if (el && el.automationId) { + if (setValue(win.title, el.automationId, text)) return true + } + } + } + + // Fallback: find the element, click it, then sendText + const { captureAccessibilitySnapshot, findNodeInSnapshot } = + require('./win32/accessibilitySnapshot.js') as typeof import('./win32/accessibilitySnapshot.js') + const snap = captureAccessibilitySnapshot(hwnd) + if (!snap) return false + const node = findNodeInSnapshot(snap.nodes, query) + if (!node) return false + + // Click to 
focus, then type + const nc = getCachedNcOffset() + const cx = node.bounds.x + Math.round(node.bounds.w / 2) - nc.dx + const cy = node.bounds.y + Math.round(node.bounds.h / 2) - nc.dy + const { sendClick, sendText } = + require('./win32/windowMessage.js') as typeof import('./win32/windowMessage.js') + sendClick(hwnd, cx, cy, 'left') + await sleep(50) + return sendText(hwnd, text) + } catch { + return false + } + }, + } +} + +/** + * Module-level unhide — no-op on non-macOS (we don't hide apps). + */ +export async function unhideComputerUseAppsCrossPlatform( + _bundleIds: readonly string[], +): Promise { + // No-op: Windows/Linux don't use hide/unhide +} diff --git a/src/utils/computerUse/hostAdapter.ts b/src/utils/computerUse/hostAdapter.ts index 6b494ec64..acefbaa3d 100644 --- a/src/utils/computerUse/hostAdapter.ts +++ b/src/utils/computerUse/hostAdapter.ts @@ -46,16 +46,12 @@ export function getComputerUseHostAdapter(): ComputerUseHostAdapter { }), ensureOsPermissions: async () => { if (process.platform !== 'darwin') return { granted: true } - const cu = requireComputerUseSwift() as any - // Native .node module exposes tcc; cross-platform JS backend does not. - // When tcc is absent (JS backend on macOS), we cannot programmatically - // check TCC status — returning granted:false would create a deadlock - // (recheck also fails, user can never pass). The JS backend uses - // osascript/screencapture which trigger OS-level permission prompts - // themselves, so the OS provides the safety net instead. - if (!cu.tcc) return { granted: true } - const accessibility = cu.tcc.checkAccessibility() - const screenRecording = cu.tcc.checkScreenRecording() + const cu = requireComputerUseSwift() + // The cross-platform JS backend exposes no tcc; the OS prompts for + // permissions itself there, so report granted instead of throwing. + if (!(cu as any).tcc) return { granted: true } + const accessibility = (cu as any).tcc.checkAccessibility() + const screenRecording = (cu as any).tcc.checkScreenRecording() return accessibility && screenRecording ? 
{ granted: true } : { granted: false, accessibility, screenRecording } diff --git a/src/utils/computerUse/platforms/darwin.ts b/src/utils/computerUse/platforms/darwin.ts new file mode 100644 index 000000000..4547c252f --- /dev/null +++ b/src/utils/computerUse/platforms/darwin.ts @@ -0,0 +1,152 @@ +/** + * macOS platform backend for Computer Use. + * + * Delegates to @ant/computer-use-input (enigo keyboard/mouse) and + * @ant/computer-use-swift (screenshots, display, apps). + * + * No window-bound input (sendChar/sendKey/sendClick/sendText) — macOS + * uses global input via CoreGraphics events. + */ + +import type { Platform } from './index.js' +import type { + InputPlatform, + ScreenshotPlatform, + DisplayPlatform, + AppsPlatform, + WindowHandle, + FrontmostAppInfo, +} from './types.js' +import { requireComputerUseInput } from '../inputLoader.js' +import { requireComputerUseSwift } from '../swiftLoader.js' + +// --------------------------------------------------------------------------- +// Input — delegate to @ant/computer-use-input darwin backend +// --------------------------------------------------------------------------- + +const input: InputPlatform = { + async moveMouse(x, y) { + const api = requireComputerUseInput() + await api.moveMouse(x, y) + }, + + async click(x, y, button) { + const api = requireComputerUseInput() + await api.moveMouse(x, y) + await api.mouseButton(button, 'click', 1) + }, + + async typeText(text) { + const api = requireComputerUseInput() + await api.typeText(text) + }, + + async key(name, action) { + const api = requireComputerUseInput() + await api.key(name, action) + }, + + async keys(combo) { + const api = requireComputerUseInput() + await api.keys(combo) + }, + + async scroll(amount, direction) { + const api = requireComputerUseInput() + await api.mouseScroll(amount, direction) + }, + + async mouseLocation() { + const api = requireComputerUseInput() + return api.mouseLocation() + }, + + // No window-bound methods on macOS +} + 
+// --------------------------------------------------------------------------- +// Screenshot — delegate to @ant/computer-use-swift +// --------------------------------------------------------------------------- + +const screenshot: ScreenshotPlatform = { + async captureScreen(displayId) { + const swift = requireComputerUseSwift() + return swift.screenshot.captureExcluding([], undefined, undefined, undefined, displayId) + }, + + async captureRegion(x, y, w, h) { + const swift = requireComputerUseSwift() + return swift.screenshot.captureRegion([], x, y, w, h) + }, + + // macOS could use SCContentFilter for window capture but we don't expose + // it through this interface yet — the swift module's captureExcluding + // handles most use cases. +} + +// --------------------------------------------------------------------------- +// Display — delegate to @ant/computer-use-swift +// --------------------------------------------------------------------------- + +const display: DisplayPlatform = { + listAll() { + const swift = requireComputerUseSwift() + return swift.display.listAll() + }, + + getSize(displayId) { + const swift = requireComputerUseSwift() + return swift.display.getSize(displayId) + }, +} + +// --------------------------------------------------------------------------- +// Apps — delegate to @ant/computer-use-swift +// --------------------------------------------------------------------------- + +const apps: AppsPlatform = { + listRunning(): WindowHandle[] { + const swift = requireComputerUseSwift() + const running = swift.apps.listRunning() + return running.map((app: any) => ({ + id: app.bundleId ?? '', + pid: 0, // macOS listRunning doesn't expose PID through this API + title: app.displayName ?? '', + })) + }, + + async listInstalled() { + const swift = requireComputerUseSwift() + const installed = await swift.apps.listInstalled() + return installed.map((app: any) => ({ + id: app.bundleId ?? '', + displayName: app.displayName ?? '', + path: app.path ?? 
'', + })) + }, + + async open(name) { + const swift = requireComputerUseSwift() + await swift.apps.open(name) + }, + + getFrontmostApp(): FrontmostAppInfo | null { + const api = requireComputerUseInput() + const info = api.getFrontmostAppInfo() + if (!info) return null + return { id: info.bundleId, appName: info.appName } + }, + + findWindowByTitle(_title): WindowHandle | null { + // macOS: not directly supported through the current swift API. + // Use apps.listRunning() and filter by title instead. + const all = this.listRunning() + return all.find(w => w.title.includes(_title)) ?? null + }, +} + +// --------------------------------------------------------------------------- +// Export +// --------------------------------------------------------------------------- + +export const platform: Platform = { input, screenshot, display, apps } diff --git a/src/utils/computerUse/platforms/index.ts b/src/utils/computerUse/platforms/index.ts new file mode 100644 index 000000000..5a332a18d --- /dev/null +++ b/src/utils/computerUse/platforms/index.ts @@ -0,0 +1,41 @@ +/** + * Platform dispatcher for Computer Use. + * + * Loads the correct platform backend based on `process.platform`. + * Each backend implements the same unified interface. 
+ */ + +import type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js' + +export interface Platform { + input: InputPlatform + screenshot: ScreenshotPlatform + display: DisplayPlatform + apps: AppsPlatform + windowManagement?: WindowManagementPlatform +} + +let cached: Platform | undefined + +export function loadPlatform(): Platform { + if (cached) return cached + + switch (process.platform) { + case 'darwin': + cached = require('./darwin.js').platform + break + case 'win32': + cached = require('./win32.js').platform + break + case 'linux': + cached = require('./linux.js').platform + break + default: + throw new Error(`Computer Use not supported on ${process.platform}`) + } + + return cached! +} + +export type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js' +export type { WindowHandle, ScreenshotResult, DisplayInfo, InstalledApp, FrontmostAppInfo, WindowAction } from './types.js' diff --git a/src/utils/computerUse/platforms/linux.ts b/src/utils/computerUse/platforms/linux.ts new file mode 100644 index 000000000..b58f6c72d --- /dev/null +++ b/src/utils/computerUse/platforms/linux.ts @@ -0,0 +1,416 @@ +/** + * Linux platform backend for Computer Use. + * + * Uses: + * - xdotool for mouse/keyboard input + * - scrot for screenshots (converted to JPEG) + * - xrandr for display enumeration + * - wmctrl for window management + * + * CRITICAL: All screenshots output JPEG. scrot outputs PNG by default, + * so we pipe through ImageMagick `convert` to produce JPEG. 
+ */ + +import type { Platform } from './index.js' +import type { + InputPlatform, + ScreenshotPlatform, + DisplayPlatform, + AppsPlatform, + WindowHandle, + ScreenshotResult, + DisplayInfo, + InstalledApp, + FrontmostAppInfo, +} from './types.js' + +// --------------------------------------------------------------------------- +// Shell helpers +// --------------------------------------------------------------------------- + +function run(cmd: string[]): string { + const result = Bun.spawnSync({ cmd, stdout: 'pipe', stderr: 'pipe' }) + return new TextDecoder().decode(result.stdout).trim() +} + +async function runAsync(cmd: string[]): Promise { + const proc = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' }) + const out = await new Response(proc.stdout).text() + await proc.exited + return out.trim() +} + +function commandExists(name: string): boolean { + const result = Bun.spawnSync({ cmd: ['which', name], stdout: 'pipe', stderr: 'pipe' }) + return result.exitCode === 0 +} + +// --------------------------------------------------------------------------- +// xdotool key name mapping +// --------------------------------------------------------------------------- + +const KEY_MAP: Record = { + return: 'Return', enter: 'Return', tab: 'Tab', space: 'space', + backspace: 'BackSpace', delete: 'Delete', escape: 'Escape', esc: 'Escape', + left: 'Left', up: 'Up', right: 'Right', down: 'Down', + home: 'Home', end: 'End', pageup: 'Prior', pagedown: 'Next', + f1: 'F1', f2: 'F2', f3: 'F3', f4: 'F4', f5: 'F5', f6: 'F6', + f7: 'F7', f8: 'F8', f9: 'F9', f10: 'F10', f11: 'F11', f12: 'F12', + shift: 'shift', lshift: 'shift', rshift: 'shift', + control: 'ctrl', ctrl: 'ctrl', lcontrol: 'ctrl', rcontrol: 'ctrl', + alt: 'alt', option: 'alt', lalt: 'alt', ralt: 'alt', + win: 'super', meta: 'super', command: 'super', cmd: 'super', super: 'super', + insert: 'Insert', printscreen: 'Print', pause: 'Pause', + numlock: 'Num_Lock', capslock: 'Caps_Lock', scrolllock: 'Scroll_Lock', +} + +const 
MODIFIER_KEYS = new Set([ + 'shift', 'lshift', 'rshift', 'control', 'ctrl', 'lcontrol', 'rcontrol', + 'alt', 'option', 'lalt', 'ralt', 'win', 'meta', 'command', 'cmd', 'super', +]) + +function mapKey(name: string): string { + return KEY_MAP[name.toLowerCase()] ?? name +} + +function mouseButtonNum(button: 'left' | 'right' | 'middle'): string { + return button === 'left' ? '1' : button === 'right' ? '3' : '2' +} + +// --------------------------------------------------------------------------- +// Input — xdotool +// --------------------------------------------------------------------------- + +const input: InputPlatform = { + async moveMouse(x, y) { + run(['xdotool', 'mousemove', '--sync', String(Math.round(x)), String(Math.round(y))]) + }, + + async click(x, y, button) { + run(['xdotool', 'mousemove', '--sync', String(Math.round(x)), String(Math.round(y))]) + run(['xdotool', 'click', mouseButtonNum(button)]) + }, + + async typeText(text) { + run(['xdotool', 'type', '--delay', '12', text]) + }, + + async key(name, action) { + const mapped = mapKey(name) + if (action === 'press') { + run(['xdotool', 'keydown', mapped]) + } else { + run(['xdotool', 'keyup', mapped]) + } + }, + + async keys(parts) { + const modifiers: string[] = [] + let finalKey: string | null = null + + for (const part of parts) { + if (MODIFIER_KEYS.has(part.toLowerCase())) { + modifiers.push(mapKey(part)) + } else { + finalKey = part + } + } + if (!finalKey) return + + const combo = [...modifiers, mapKey(finalKey)].join('+') + run(['xdotool', 'key', combo]) + }, + + async scroll(amount, direction) { + if (direction === 'vertical') { + const btn = amount >= 0 ? '5' : '4' + const repeats = Math.abs(Math.round(amount)) + if (repeats > 0) run(['xdotool', 'click', '--repeat', String(repeats), btn]) + } else { + const btn = amount >= 0 ? 
'7' : '6' + const repeats = Math.abs(Math.round(amount)) + if (repeats > 0) run(['xdotool', 'click', '--repeat', String(repeats), btn]) + } + }, + + async mouseLocation() { + const out = run(['xdotool', 'getmouselocation']) + const xMatch = out.match(/x:(\d+)/) + const yMatch = out.match(/y:(\d+)/) + return { + x: xMatch ? Number(xMatch[1]) : 0, + y: yMatch ? Number(yMatch[1]) : 0, + } + }, + + // No window-bound input on Linux +} + +// --------------------------------------------------------------------------- +// Screenshot — scrot → JPEG conversion +// --------------------------------------------------------------------------- + +const SCREENSHOT_TMP = '/tmp/cu-screenshot-tmp.png' +const SCREENSHOT_JPG = '/tmp/cu-screenshot.jpg' + +async function pngToJpegBase64(pngPath: string, width: number, height: number): Promise { + // Try ImageMagick convert first + if (commandExists('convert')) { + await runAsync(['convert', pngPath, '-quality', '75', SCREENSHOT_JPG]) + const file = Bun.file(SCREENSHOT_JPG) + const buffer = await file.arrayBuffer() + return { base64: Buffer.from(buffer).toString('base64'), width, height } + } + + // Fallback: ffmpeg + if (commandExists('ffmpeg')) { + await runAsync(['ffmpeg', '-y', '-i', pngPath, '-q:v', '5', SCREENSHOT_JPG]) + const file = Bun.file(SCREENSHOT_JPG) + const buffer = await file.arrayBuffer() + return { base64: Buffer.from(buffer).toString('base64'), width, height } + } + + // Last resort: return PNG base64 (caller should be aware) + const file = Bun.file(pngPath) + const buffer = await file.arrayBuffer() + return { base64: Buffer.from(buffer).toString('base64'), width, height } +} + +const screenshot: ScreenshotPlatform = { + async captureScreen(displayId) { + try { + await runAsync(['scrot', '-o', SCREENSHOT_TMP]) + const size = display.getSize(displayId) + return pngToJpegBase64(SCREENSHOT_TMP, size.width, size.height) + } catch { + return { base64: '', width: 0, height: 0 } + } + }, + + async captureRegion(x, y, w, h) { 
+ try { + await runAsync(['scrot', '-a', `${x},${y},${w},${h}`, '-o', SCREENSHOT_TMP]) + return pngToJpegBase64(SCREENSHOT_TMP, w, h) + } catch { + return { base64: '', width: w, height: h } + } + }, + + async captureWindow(hwnd) { + try { + // Use xdotool to get window geometry, then import (ImageMagick) to capture + if (commandExists('import')) { + const jpgPath = '/tmp/cu-window-capture.jpg' + await runAsync(['import', '-window', hwnd, '-quality', '75', jpgPath]) + + // Get dimensions from xdotool + const geom = run(['xdotool', 'getwindowgeometry', '--shell', hwnd]) + const wMatch = geom.match(/WIDTH=(\d+)/) + const hMatch = geom.match(/HEIGHT=(\d+)/) + const width = wMatch ? Number(wMatch[1]) : 0 + const height = hMatch ? Number(hMatch[1]) : 0 + + const file = Bun.file(jpgPath) + const buffer = await file.arrayBuffer() + return { base64: Buffer.from(buffer).toString('base64'), width, height } + } + return null + } catch { + return null + } + }, +} + +// --------------------------------------------------------------------------- +// Display — xrandr +// --------------------------------------------------------------------------- + +const display: DisplayPlatform = { + listAll(): DisplayInfo[] { + try { + const raw = run(['xrandr', '--query']) + const displays: DisplayInfo[] = [] + let idx = 0 + + const regex = /^\S+\s+connected\s+(?:primary\s+)?(\d+)x(\d+)\+\d+\+\d+/gm + let match: RegExpExecArray | null + while ((match = regex.exec(raw)) !== null) { + displays.push({ + width: Number(match[1]), + height: Number(match[2]), + scaleFactor: 1, + displayId: idx++, + }) + } + + if (displays.length === 0) { + return [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }] + } + return displays + } catch { + return [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }] + } + }, + + getSize(displayId): DisplayInfo { + const all = this.listAll() + if (displayId !== undefined) { + const found = all.find(d => d.displayId === displayId) + if (found) return found + 
} + return all[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 } + }, +} + +// --------------------------------------------------------------------------- +// Apps — wmctrl + ps + .desktop files +// --------------------------------------------------------------------------- + +const apps: AppsPlatform = { + listRunning(): WindowHandle[] { + try { + if (commandExists('wmctrl')) { + const raw = run(['wmctrl', '-l', '-p']) + const handles: WindowHandle[] = [] + for (const line of raw.split('\n').filter(Boolean)) { + const parts = line.split(/\s+/) + const windowId = parts[0] + const pid = Number(parts[2]) + if (!pid) continue + + // Title is everything after the 4th field (hostname) + const title = parts.slice(4).join(' ') + + let exePath = '' + try { exePath = run(['readlink', '-f', `/proc/${pid}/exe`]) } catch {} + + handles.push({ + id: windowId ?? '', + pid, + title, + exePath: exePath || undefined, + }) + } + + // Deduplicate by id + const seen = new Set() + return handles.filter(h => { + if (seen.has(h.id)) return false + seen.add(h.id) + return true + }).slice(0, 50) + } + + // Fallback: xdotool search + const raw = run(['xdotool', 'search', '--name', '']) + const handles: WindowHandle[] = [] + for (const windowId of raw.split('\n').filter(Boolean).slice(0, 50)) { + const title = run(['xdotool', 'getwindowname', windowId]) + let pid = 0 + try { pid = Number(run(['xdotool', 'getwindowpid', windowId])) } catch {} + if (title) { + handles.push({ id: windowId, pid, title }) + } + } + return handles + } catch { + return [] + } + }, + + async listInstalled(): Promise { + try { + const dirs = [ + '/usr/share/applications', + '/usr/local/share/applications', + `${process.env.HOME}/.local/share/applications`, + ] + const result: InstalledApp[] = [] + + for (const dir of dirs) { + let files: string + try { + files = run(['find', dir, '-name', '*.desktop', '-maxdepth', '1']) + } catch { continue } + + for (const filepath of 
files.split('\n').filter(Boolean)) { + try { + const content = run(['cat', filepath]) + const nameMatch = content.match(/^Name=(.+)$/m) + const execMatch = content.match(/^Exec=(.+)$/m) + const noDisplay = content.match(/^NoDisplay=true$/m) + if (noDisplay) continue + + const name = nameMatch?.[1] ?? '' + const exec = execMatch?.[1] ?? '' + if (!name) continue + + result.push({ + id: filepath.split('/').pop()?.replace('.desktop', '') ?? '', + displayName: name, + path: exec.split(/\s+/)[0] ?? '', + }) + } catch { /* skip unreadable */ } + } + } + + return result.slice(0, 200) + } catch { + return [] + } + }, + + async open(name) { + try { + const desktopName = name.endsWith('.desktop') ? name : `${name}.desktop` + if (commandExists('gtk-launch')) { + await runAsync(['gtk-launch', desktopName]) + return + } + } catch { /* fall through */ } + await runAsync(['xdg-open', name]) + }, + + getFrontmostApp(): FrontmostAppInfo | null { + try { + const windowId = run(['xdotool', 'getactivewindow']) + if (!windowId) return null + + const pidStr = run(['xdotool', 'getwindowpid', windowId]) + if (!pidStr) return null + + let exePath = '' + try { exePath = run(['readlink', '-f', `/proc/${pidStr}/exe`]) } catch {} + let appName = '' + try { appName = run(['cat', `/proc/${pidStr}/comm`]) } catch {} + + if (!exePath && !appName) return null + return { id: exePath || `/proc/${pidStr}/exe`, appName: appName || 'unknown' } + } catch { + return null + } + }, + + findWindowByTitle(title): WindowHandle | null { + try { + // xdotool search by name + const raw = run(['xdotool', 'search', '--name', title]) + const windowId = raw.split('\n')[0] + if (!windowId) return null + + const windowTitle = run(['xdotool', 'getwindowname', windowId]) + let pid = 0 + try { pid = Number(run(['xdotool', 'getwindowpid', windowId])) } catch {} + + return { id: windowId, pid, title: windowTitle } + } catch { + return null + } + }, +} + +// 
--------------------------------------------------------------------------- +// Export +// --------------------------------------------------------------------------- + +export const platform: Platform = { input, screenshot, display, apps } diff --git a/src/utils/computerUse/platforms/types.ts b/src/utils/computerUse/platforms/types.ts new file mode 100644 index 000000000..2f6345105 --- /dev/null +++ b/src/utils/computerUse/platforms/types.ts @@ -0,0 +1,153 @@ +/** + * Cross-platform abstraction types for Computer Use. + * + * These interfaces define a unified API surface for input, screenshots, + * display info, and app management across macOS, Windows, and Linux. + */ + +// --------------------------------------------------------------------------- +// Window / App types +// --------------------------------------------------------------------------- + +/** Cross-platform window identifier */ +export interface WindowHandle { + id: string // macOS: bundleId, Windows: HWND string, Linux: window ID + pid: number + title: string + exePath?: string // Windows/Linux: process executable path +} + +export interface ScreenshotResult { + base64: string + width: number + height: number +} + +export interface DisplayInfo { + width: number + height: number + scaleFactor: number + displayId: number +} + +export interface InstalledApp { + id: string // macOS: bundleId, Windows: exe path or package family, Linux: .desktop name + displayName: string + path: string +} + +export interface FrontmostAppInfo { + id: string + appName: string +} + +// --------------------------------------------------------------------------- +// InputPlatform +// --------------------------------------------------------------------------- + +/** + * Input platform interface — two modes: + * + * Mode A (Global): moveMouse, click, typeText, key, keys, scroll, mouseLocation + * Works on all platforms. Sends input to the foreground window; moves the + * real cursor and steals focus. 
+ * + * Mode B (Window-bound, optional): sendChar, sendKey, sendClick, sendText + * Windows-only via SendMessage/PostMessage. Does NOT steal focus or move + * the cursor. Preferred when a target HWND is known. + */ +export interface InputPlatform { + // --- Mode A: Global input (all platforms) --- + moveMouse(x: number, y: number): Promise + click( + x: number, + y: number, + button: 'left' | 'right' | 'middle', + ): Promise + typeText(text: string): Promise + key(name: string, action: 'press' | 'release'): Promise + keys(combo: string[]): Promise + scroll(amount: number, direction: 'vertical' | 'horizontal'): Promise + mouseLocation(): Promise<{ x: number; y: number }> + + // --- Mode B: Window-bound input (Windows only, optional) --- + sendChar?(hwnd: string, char: string): Promise + sendKey?(hwnd: string, vk: number, action: 'down' | 'up'): Promise + sendClick?( + hwnd: string, + x: number, + y: number, + button: 'left' | 'right', + ): Promise + sendText?(hwnd: string, text: string): Promise +} + +// --------------------------------------------------------------------------- +// ScreenshotPlatform +// --------------------------------------------------------------------------- + +export interface ScreenshotPlatform { + /** Full-screen capture. Returns JPEG base64. */ + captureScreen(displayId?: number): Promise + /** Region capture. Returns JPEG base64. */ + captureRegion( + x: number, + y: number, + w: number, + h: number, + ): Promise + /** Window capture (Windows: PrintWindow, macOS: SCContentFilter, Linux: xdotool+import). 
*/ + captureWindow?(hwnd: string): Promise +} + +// --------------------------------------------------------------------------- +// DisplayPlatform +// --------------------------------------------------------------------------- + +export interface DisplayPlatform { + listAll(): DisplayInfo[] + getSize(displayId?: number): DisplayInfo +} + +// --------------------------------------------------------------------------- +// AppsPlatform +// --------------------------------------------------------------------------- + +export interface AppsPlatform { + listRunning(): WindowHandle[] + listInstalled(): Promise + open(name: string): Promise + getFrontmostApp(): FrontmostAppInfo | null + findWindowByTitle(title: string): WindowHandle | null +} + +// --------------------------------------------------------------------------- +// WindowManagementPlatform (Windows HWND-targeted, no global APIs) +// --------------------------------------------------------------------------- + +export type WindowAction = + | 'minimize' + | 'maximize' + | 'restore' + | 'close' + | 'focus' + | 'move_offscreen' + | 'move_resize' + | 'get_rect' + +export interface WindowManagementPlatform { + /** Perform a window management action on the bound HWND. All via Win32 API, no global shortcuts. */ + manageWindow( + action: WindowAction, + opts?: { x?: number; y?: number; width?: number; height?: number }, + ): boolean + /** Move window to specific position and/or resize */ + moveResize(x: number, y: number, width?: number, height?: number): boolean + /** Get current window rect */ + getWindowRect(): { + x: number + y: number + width: number + height: number + } | null +} diff --git a/src/utils/computerUse/platforms/win32.ts b/src/utils/computerUse/platforms/win32.ts new file mode 100644 index 000000000..fc7d6b9cb --- /dev/null +++ b/src/utils/computerUse/platforms/win32.ts @@ -0,0 +1,979 @@ +/** + * Windows platform backend for Computer Use. 
+ * + * Combines: + * - PowerShell SetCursorPos/SendInput for global input (fallback) + * - win32/windowMessage.ts for window-bound SendMessage input (preferred) + * - Python Bridge (bridge.py) for screenshots (mss + ctypes PrintWindow) + * - win32/windowEnum.ts for EnumWindows app listing + * - No PowerShell for screenshots (Python Bridge only, no PS fallback) + * - PowerShell Screen.AllScreens for display enumeration + * + * CRITICAL: All screenshots output JPEG (ImageFormat::Jpeg), not PNG. + */ + +import type { Platform } from './index.js' +import type { + InputPlatform, + ScreenshotPlatform, + DisplayPlatform, + AppsPlatform, + WindowHandle, + ScreenshotResult, + DisplayInfo, + InstalledApp, + FrontmostAppInfo, +} from './types.js' +import { listWindows } from '../win32/windowEnum.js' +import { detectAppType, openWithController } from '../win32/appDispatcher.js' +import { + markBound, + unmarkBound, + cleanupAllBorders, +} from '../win32/windowBorder.js' +import { + showVirtualCursor, + hideVirtualCursor, + moveVirtualCursor, +} from '../win32/virtualCursor.js' +import { showIndicator, hideIndicator } from '../win32/inputIndicator.js' +import { + ps, + psAsync, + validateHwnd, + VK_MAP, + MODIFIER_KEYS, +} from '../win32/shared.js' +import { logForDebugging } from '../../debug.js' + +// --------------------------------------------------------------------------- +// Python Bridge (lazy-loaded, preferred over PowerShell for screenshots) +// --------------------------------------------------------------------------- + +let _bridge: typeof import('../win32/bridgeClient.js') | undefined +function getBridge() { + if (!_bridge) { + try { + _bridge = + require('../win32/bridgeClient.js') as typeof import('../win32/bridgeClient.js') + } catch {} + } + return _bridge +} + +/** Try a bridge call, return null on failure (caller falls back to PS) */ +function bridgeCallSync( + method: string, + params: Record = {}, +): T | null { + try { + const b = getBridge() + if (!b) 
return null + return b.callSync(method, params) + } catch { + return null + } +} + +// validateHwnd, ps, psAsync, VK_MAP, MODIFIER_KEYS imported from '../win32/shared.js' + +// --------------------------------------------------------------------------- +// Win32 P/Invoke types (compiled once per PS session) +// --------------------------------------------------------------------------- + +const WIN32_TYPES = ` +Add-Type -Language CSharp @' +using System; +using System.Runtime.InteropServices; +using System.Text; +using System.Diagnostics; + +public class CuWin32 { + // --- Cursor --- + [DllImport("user32.dll")] public static extern bool SetCursorPos(int X, int Y); + [DllImport("user32.dll")] public static extern bool GetCursorPos(out POINT p); + [StructLayout(LayoutKind.Sequential)] public struct POINT { public int X; public int Y; } + + // --- SendInput (sequential layout: payload lands at offset 4 on x86, 8 on x64) --- + [StructLayout(LayoutKind.Sequential)] public struct MOUSEINPUT { + public int dx; public int dy; public int mouseData; public uint dwFlags; public uint time; public IntPtr dwExtraInfo; + } + [StructLayout(LayoutKind.Sequential)] public struct INPUT { + public uint type; + public MOUSEINPUT mi; + } + [StructLayout(LayoutKind.Sequential)] public struct KEYBDINPUT { + public ushort wVk; public ushort wScan; public uint dwFlags; public uint time; public IntPtr dwExtraInfo; + } + [StructLayout(LayoutKind.Sequential)] public struct KINPUT { + public uint type; + public KEYBDINPUT ki; public uint pad0; public uint pad1; // pad to native sizeof(INPUT): 28 on x86, 40 on x64 + } + [DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, INPUT[] i, int cb); + [DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, KINPUT[] i, int cb); + + // --- Keyboard --- + [DllImport("user32.dll")] public static extern void keybd_event(byte bVk, byte bScan, uint dwFlags, UIntPtr dwExtraInfo); + [DllImport("user32.dll")] public static extern short VkKeyScan(char ch); + + // --- Window --- + 
[DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow(); + [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint pid); + [DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr hWnd, StringBuilder sb, int max); + + // Constants + public const uint INPUT_MOUSE = 0, INPUT_KEYBOARD = 1; + public const uint MOUSEEVENTF_LEFTDOWN = 0x0002, MOUSEEVENTF_LEFTUP = 0x0004; + public const uint MOUSEEVENTF_RIGHTDOWN = 0x0008, MOUSEEVENTF_RIGHTUP = 0x0010; + public const uint MOUSEEVENTF_MIDDLEDOWN = 0x0020, MOUSEEVENTF_MIDDLEUP = 0x0040; + public const uint MOUSEEVENTF_WHEEL = 0x0800, MOUSEEVENTF_HWHEEL = 0x1000; + public const uint KEYEVENTF_KEYUP = 0x0002; +} +'@ +` + +// VK_MAP and MODIFIER_KEYS imported from '../win32/shared.js' + +// --------------------------------------------------------------------------- +// Session-level HWND binding — all operations target this handle +// --------------------------------------------------------------------------- + +let boundHwnd: string | null = null +let boundPid: number | null = null +let boundAppType: import('../win32/appDispatcher.js').AppType | null = null +let boundFilePath: string | null = null + +/** Get the bound HWND, or null if not bound */ +export function getBoundHwnd(): string | null { + return boundHwnd +} + +/** Get the bound app type */ +export function getBoundAppType(): string | null { + return boundAppType +} + +/** Bind to a window HWND — all subsequent input/screenshot operations target this handle */ +export function bindWindow(hwnd: string, pid?: number): void { + hwnd = validateHwnd(hwnd) + // Clean up previous binding + if (boundHwnd) { + unmarkBound(boundHwnd) + hideVirtualCursor() + hideIndicator() + } + boundHwnd = hwnd + boundPid = pid ?? null + boundAppType = 'generic' + boundFilePath = null + + // 1. Brief activation: set the window to accept input, then restore user's focus. 
+ // Some apps (UWP/Electron) don't process SendMessage when never-activated. + // Save current foreground → activate target → restore original foreground. + const activateScript = ` +Add-Type @' +using System; +using System.Runtime.InteropServices; +public class CuActivate { + [DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow(); + [DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr h); + [DllImport("user32.dll")] public static extern bool IsIconic(IntPtr h); + [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd); +} +'@ +$prev = [CuActivate]::GetForegroundWindow() +$target = [IntPtr]::new([long]${hwnd}) +if ([CuActivate]::IsIconic($target)) { [CuActivate]::ShowWindow($target, 9) | Out-Null } +[CuActivate]::SetForegroundWindow($target) | Out-Null +Start-Sleep -Milliseconds 100 +if ($prev -ne [IntPtr]::Zero -and $prev -ne $target) { + [CuActivate]::SetForegroundWindow($prev) | Out-Null +} +` + ps(activateScript) + + // 2. 
Visual indicators + markBound(hwnd) + showVirtualCursor(hwnd) + showIndicator(hwnd) +} + +/** Bind to a COM-controlled file (Excel/Word — no window needed) */ +export function bindFile( + filePath: string, + appType: import('../win32/appDispatcher.js').AppType, +): void { + boundHwnd = null + boundPid = null + boundAppType = appType + boundFilePath = filePath +} + +/** Unbind — revert to global mode, remove overlays */ +export function unbindWindow(): void { + if (boundHwnd) unmarkBound(boundHwnd) + hideVirtualCursor() + hideIndicator() + // Clear cached edit-child / InputSite mappings + getWm().clearEditChildCache() + boundHwnd = null + boundPid = null + boundAppType = null + boundFilePath = null +} + +// --------------------------------------------------------------------------- +// Window Message module (lazy loaded) +// --------------------------------------------------------------------------- + +let _wm: typeof import('../win32/windowMessage.js') | undefined +function getWm() { + // eslint-disable-next-line @typescript-eslint/no-require-imports + return (_wm ??= + require('../win32/windowMessage.js') as typeof import('../win32/windowMessage.js')) +} + +// --------------------------------------------------------------------------- +// Input — ALL text/key input goes through SendMessage when HWND is bound. +// Global SendInput/keybd_event is DISABLED to avoid interfering with user. +// --------------------------------------------------------------------------- + +// --------------------------------------------------------------------------- +// Input — When HWND is bound, ALL operations go through SendMessage. +// NO global API (SetCursorPos/SendInput/keybd_event/SendKeys) is used. +// This ensures the user's desktop is never disturbed. 
+// ---------------------------------------------------------------------------
+
+/**
+ * Input backend.
+ *
+ * Bound mode (a window HWND is bound): everything is delivered with window
+ * messages via win32/windowMessage.ts; the real cursor and focus are untouched.
+ * Unbound mode: mouse move/click/scroll fall back to global
+ * SetCursorPos/SendInput; typeText/key/keys throw because they need a target.
+ */
+const input: InputPlatform = {
+  /** Move the virtual cursor (bound) or the real cursor (unbound) to (x, y). */
+  async moveMouse(x, y) {
+    if (boundHwnd) {
+      // Bound mode: move virtual cursor (visual only), no real cursor movement
+      moveVirtualCursor(Math.round(x), Math.round(y))
+      return
+    }
+    ps(
+      `${WIN32_TYPES}; [CuWin32]::SetCursorPos(${Math.round(x)}, ${Math.round(y)}) | Out-Null`,
+    )
+  },
+
+  /** Click `button` at (x, y): window message when bound, SendInput otherwise. */
+  async click(x, y, button) {
+    if (boundHwnd) {
+      moveVirtualCursor(Math.round(x), Math.round(y), true)
+      // Prefer the edit child reported by findEditChild (if any) as the click
+      // target; fall back to the top-level window when that send fails.
+      // NOTE(review): the same (x, y) is sent to both targets — confirm that
+      // sendClick treats them as coordinates valid for either window.
+      const editHwnd = getWm().findEditChild(boundHwnd)
+      const targetHwnd = editHwnd ?? boundHwnd
+      const ok = getWm().sendClick(
+        targetHwnd,
+        Math.round(x),
+        Math.round(y),
+        button,
+      )
+      // Only retry when the first attempt went to a different window.
+      if (!ok && targetHwnd !== boundHwnd) {
+        getWm().sendClick(boundHwnd, Math.round(x), Math.round(y), button)
+      }
+      return
+    }
+    const downFlag =
+      button === 'left'
+        ? 'MOUSEEVENTF_LEFTDOWN'
+        : button === 'right'
+          ? 'MOUSEEVENTF_RIGHTDOWN'
+          : 'MOUSEEVENTF_MIDDLEDOWN'
+    const upFlag =
+      button === 'left'
+        ? 'MOUSEEVENTF_LEFTUP'
+        : button === 'right'
+          ? 'MOUSEEVENTF_RIGHTUP'
+          : 'MOUSEEVENTF_MIDDLEUP'
+    ps(
+      `${WIN32_TYPES}; [CuWin32]::SetCursorPos(${Math.round(x)}, ${Math.round(y)}) | Out-Null; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi.dwFlags=[CuWin32]::${downFlag}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null; $i.mi.dwFlags=[CuWin32]::${upFlag}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
+    )
+  },
+
+  /**
+   * Type text into the bound target.
+   * @throws Error when nothing is bound or the window rejects the text.
+   */
+  async typeText(text) {
+    // COM-controlled apps: write directly via COM API
+    if (boundAppType === 'word' && boundFilePath) {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      const { appendText } =
+        require('../win32/comWord.js') as typeof import('../win32/comWord.js')
+      appendText(boundFilePath, text)
+      return
+    }
+    // HWND-bound apps: SendMessageW(WM_CHAR) or clipboard paste
+    if (boundHwnd) {
+      const ok = getWm().sendText(boundHwnd, text)
+      if (!ok) {
+        throw new Error(
+          `typeText failed: SendMessage to HWND ${boundHwnd} returned false. ` +
+            `The edit control may not have been found (findEditChild returned null).`,
+        )
+      }
+      return
+    }
+    throw new Error(
+      'typeText requires a bound window or file. Call open() first.',
+    )
+  },
+
+  /**
+   * Press or release a single key on the bound window.
+   * Names not present in VK_MAP and longer than one character are ignored.
+   * @throws Error when no window is bound.
+   */
+  async key(name, action) {
+    if (boundHwnd) {
+      const lower = name.toLowerCase()
+      // Virtual-key codes for letters are the UPPERCASE ASCII values
+      // ('a' is 0x61, which is VK_NUMPAD1), so normalise letters first.
+      const charVk =
+        name.length === 1
+          ? (/^[a-z]$/.test(name) ? name.toUpperCase() : name).charCodeAt(0)
+          : 0
+      const vk = VK_MAP[lower] ?? charVk
+      if (vk)
+        getWm().sendKey(boundHwnd, vk, action === 'release' ? 'up' : 'down')
+      return
+    }
+    throw new Error('key requires a bound window HWND. Call open() first.')
+  },
+
+  /**
+   * Send a key combination (e.g. ['ctrl', 's']) to the bound window.
+   * @throws Error when no window is bound or the send fails.
+   */
+  async keys(parts) {
+    if (boundHwnd) {
+      const ok = getWm().sendKeys(boundHwnd, parts)
+      if (!ok) {
+        throw new Error(`keys [${parts.join('+')}] failed on HWND ${boundHwnd}`)
+      }
+      return
+    }
+    throw new Error('keys requires a bound window HWND. Call open() first.')
+  },
+
+  /**
+   * Scroll by `amount` lines (bound) or wheel notches (unbound).
+   * NOTE(review): in bound mode a positive vertical amount sends SB_LINEDOWN,
+   * while in unbound mode a positive MOUSEEVENTF_WHEEL value scrolls up —
+   * confirm which sign convention callers expect.
+   */
+  async scroll(amount, direction) {
+    if (boundHwnd) {
+      // WM_VSCROLL / WM_HSCROLL for window-bound scrolling
+      const msg = direction === 'vertical' ? '0x0115' : '0x0114' // WM_VSCROLL / WM_HSCROLL
+      const wParam = amount > 0 ? '1' : '0' // SB_LINEDOWN=1 (positive=down) / SB_LINEUP=0 (negative=up)
+      const n = Math.abs(Math.round(amount))
+      // Nothing to send: skip spawning PowerShell just to compile the type.
+      if (n === 0) return
+      let script = `
+Add-Type @'
+using System;
+using System.Runtime.InteropServices;
+public class WScroll {
+    [DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
+    public static extern IntPtr SendMessage(IntPtr h, uint m, IntPtr w, IntPtr l);
+}
+'@
+`
+      for (let i = 0; i < n; i++) {
+        script += `[WScroll]::SendMessage([IntPtr]::new([long]${boundHwnd}), ${msg}, [IntPtr]${wParam}, [IntPtr]::Zero) | Out-Null; `
+      }
+      ps(script)
+      return
+    }
+    const flag =
+      direction === 'vertical' ? 'MOUSEEVENTF_WHEEL' : 'MOUSEEVENTF_HWHEEL'
+    // mouseData is an int field (120 per notch); round so a fractional
+    // amount never reaches the script as a non-integer literal.
+    ps(
+      `${WIN32_TYPES}; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi.dwFlags=[CuWin32]::${flag}; $i.mi.mouseData=${Math.round(amount * 120)}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
+    )
+  },
+
+  /** Real cursor position in screen coordinates (read-only, even when bound). */
+  async mouseLocation() {
+    // Always returns real cursor position (informational, doesn't move it)
+    const out = ps(
+      `${WIN32_TYPES}; $p = New-Object CuWin32+POINT; [CuWin32]::GetCursorPos([ref]$p) | Out-Null; "$($p.X),$($p.Y)"`,
+    )
+    const [xStr, yStr] = out.split(',')
+    return { x: Number(xStr), y: Number(yStr) }
+  },
+
+  // Explicit-HWND variants: thin pass-throughs to win32/windowMessage.ts.
+  async sendChar(hwnd, char) {
+    getWm().sendChar(String(hwnd), char)
+  },
+  async sendKey(hwnd, vk, action) {
+    getWm().sendKey(String(hwnd), vk, action)
+  },
+  async sendClick(hwnd, x, y, button) {
+    getWm().sendClick(String(hwnd), x, y, button)
+  },
+  async sendText(hwnd, text) {
+    getWm().sendText(String(hwnd), text)
+  },
+}
+
+// ---------------------------------------------------------------------------
+// Screenshot — JPEG output only
+// 
---------------------------------------------------------------------------
+
+/** Screenshot backend: every capture goes through the Python bridge. */
+const screenshot: ScreenshotPlatform = {
+  /**
+   * Capture the bound window when one is bound, otherwise the given display.
+   * @throws Error when the bridge returns no image data.
+   */
+  async captureScreen(displayId) {
+    // If HWND is bound, capture that specific window
+    if (boundHwnd) {
+      const result = this.captureWindow?.(String(boundHwnd))
+      if (result) return result
+    }
+
+    // Python Bridge (mss + Pillow, ~300ms)
+    const bridgeResult = bridgeCallSync<ScreenshotResult>('screenshot', {
+      display_id: displayId ?? 0,
+    })
+    if (bridgeResult && bridgeResult.base64) {
+      return bridgeResult
+    }
+
+    throw new Error(
+      '[computer-use] Screenshot failed: Python bridge returned no data. ' +
+        'Ensure python3 + mss + Pillow are installed (pip install mss Pillow).',
+    )
+  },
+
+  /**
+   * Region capture. The x/y/w/h arguments are currently not applied: the
+   * bound window (or the whole screen when unbound) is returned instead.
+   */
+  async captureRegion(x, y, w, h) {
+    // When HWND is bound, the window IS the region (matches macOS behavior)
+    if (boundHwnd) {
+      const result = this.captureWindow?.(String(boundHwnd))
+      if (result) return result
+    }
+    return this.captureScreen()
+  },
+
+  /**
+   * Capture one window by HWND.
+   * @throws Error when the bridge returns no image data.
+   */
+  captureWindow(hwnd) {
+    // Python Bridge (ctypes PrintWindow + GDI → Pillow JPEG, ~300ms)
+    const bridgeResult = bridgeCallSync<ScreenshotResult>('screenshot_window', {
+      hwnd: String(hwnd),
+    })
+    if (bridgeResult && bridgeResult.base64) {
+      return bridgeResult
+    }
+
+    throw new Error(
+      `[computer-use] Window screenshot failed for HWND ${hwnd}: Python bridge returned no data.`,
+    )
+  },
+}
+
+// ---------------------------------------------------------------------------
+// Display — Screen.AllScreens
+// ---------------------------------------------------------------------------
+
+const display: DisplayPlatform = {
+  /**
+   * Enumerate displays via System.Windows.Forms.Screen.
+   * scaleFactor is always reported as 1; on any failure a single
+   * 1920x1080 display is returned.
+   */
+  listAll(): DisplayInfo[] {
+    try {
+      const raw = ps(`
+Add-Type -AssemblyName System.Windows.Forms
+$result = @()
+$idx = 0
+foreach ($s in [System.Windows.Forms.Screen]::AllScreens) {
+  $result += "$($s.Bounds.Width),$($s.Bounds.Height),$idx,$($s.Primary)"
+  $idx++
+}
+$result -join "|"
+`)
+      return raw
+        .split('|')
+        .filter(Boolean)
+        .map(entry => {
+          const [w, h, id] = entry.split(',')
+          return {
+            width: Number(w),
+            height: Number(h),
+            scaleFactor: 1,
+            displayId: Number(id),
+          }
+        })
+    } catch {
+      return [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
+    }
+  },
+
+  /** Size of the requested display, else the first display, else 1920x1080. */
+  getSize(displayId): DisplayInfo {
+    const all = this.listAll()
+    if (displayId !== undefined) {
+      const found = all.find(d => d.displayId === displayId)
+      if (found) return found
+    }
+    return all[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
+  },
+}
+
+// ---------------------------------------------------------------------------
+// Find existing window by title (avoid launching new instance)
+// ---------------------------------------------------------------------------
+
+/**
+ * Return the first enumerated window whose title contains `hint`
+ * (case-insensitive), or null when none matches.
+ */
+function findExistingWindow(
+  hint: string,
+): { hwnd: string; pid: number } | null {
+  const windows = listWindows()
+  const lower = hint.toLowerCase()
+  // Match by window title containing the hint
+  for (const w of windows) {
+    const titleLower = (w.title ?? '').toLowerCase()
+    if (titleLower.includes(lower)) {
+      return { hwnd: w.hwnd, pid: w.pid }
+    }
+  }
+  return null
+}
+
+// ---------------------------------------------------------------------------
+// Apps — EnumWindows + registry + AppxPackage
+// ---------------------------------------------------------------------------
+
+const apps: AppsPlatform = {
+  /** All top-level windows reported by listWindows(), as WindowHandle records. */
+  listRunning(): WindowHandle[] {
+    const windows = listWindows()
+    return windows.map(w => ({
+      id: String(w.hwnd),
+      pid: w.pid,
+      title: w.title,
+    }))
+  },
+
+  /**
+   * Installed apps from the Uninstall registry keys plus Store packages
+   * (first 300 unique entries). Returns [] when the query fails.
+   */
+  async listInstalled(): Promise<InstalledApp[]> {
+    try {
+      const raw = await psAsync(`
+$apps = @()
+
+# Traditional Win32 apps from registry
+$paths = @(
+  'HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
+  'HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
+  'HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*'
+)
+foreach ($p in $paths) {
+  Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object { $_.DisplayName } | ForEach-Object {
+    $apps += "$($_.DisplayName)|$($_.InstallLocation)|$($_.PSChildName)"
+  }
+}
+
+# UWP/MSIX apps (Windows 10/11 Store apps)
+Get-AppxPackage -ErrorAction SilentlyContinue | Where-Object { $_.IsFramework -eq $false -and $_.SignatureKind -eq 'Store' } | ForEach-Object {
+  $cleanName = $_.Name -replace '^Microsoft\\.Windows', '' -replace '^Microsoft\\.', ''
+  $apps += "$cleanName|$($_.InstallLocation)|$($_.PackageFamilyName)"
+}
+
+$apps | Select-Object -Unique | Select-Object -First 300
+`)
+      return raw
+        .split('\n')
+        .filter(Boolean)
+        .map(line => {
+          const [name, path, id] = line.trim().split('|', 3)
+          return {
+            id: (id ?? name ?? '').trim(),
+            displayName: (name ?? '').trim(),
+            path: (path ?? '').trim(),
+          }
+        })
+    } catch {
+      return []
+    }
+  },
+
+  /**
+   * Open an app or document and bind the session to it.
+   *
+   * Excel/Word targets are bound as COM files; everything else is bound to an
+   * existing window whose title matches, or launched as an exe, moved
+   * offscreen and bound by HWND.
+   *
+   * @throws Error when the launcher script reports a failure.
+   */
+  async open(name) {
+    // Detect app type and route to appropriate controller
+    const appType = detectAppType(name)
+
+    // Excel/Word → COM automation (no window, no HWND)
+    if (appType === 'excel' || appType === 'word') {
+      const result = await openWithController(name)
+      if (result.filePath) {
+        bindFile(result.filePath, result.type)
+      }
+      return
+    }
+
+    // Text/Browser/Generic → exe launch + HWND bind (offscreen)
+    // If name is a UWP PackageFamilyName (e.g. Microsoft.WindowsNotepad_8wekyb3d8bbwe),
+    // extract the app name and try as exe. This avoids launching through UWP shell.
+    let launchName = name
+    // A family name is "<Name>_<13-char publisher id>". Matching the whole
+    // shape keeps ordinary paths that merely contain '_' and '.'
+    // (C:\my_dir\app.exe) from being truncated.
+    const family = /^([A-Za-z0-9.-]+)_[A-Za-z0-9]{13}$/.exec(name)
+    if (family) {
+      // Microsoft.WindowsNotepad_xxx → Notepad
+      // Microsoft.WindowsCalculator_xxx → Calculator
+      // Microsoft.WindowsTerminal_xxx → Terminal
+      const parts = family[1]!.split('.')
+      const appPart = parts[parts.length - 1] || name
+      // Strip "Windows" prefix: WindowsNotepad → Notepad
+      launchName = appPart.replace(/^Windows/, '') || appPart
+    }
+
+    // --- Try to find an EXISTING window first (by title) ---
+    // If found, auto-bind to it. Use bind_window tool to switch later.
+    const existingHwnd = findExistingWindow(launchName)
+    if (existingHwnd) {
+      bindWindow(existingHwnd.hwnd, existingHwnd.pid)
+      return
+    }
+    const escaped = launchName.replace(/'/g, "''")
+    const result = await psAsync(`
+${WIN32_TYPES}
+Add-Type @'
+using System;
+using System.Runtime.InteropServices;
+using System.Text;
+public class CuLaunch {
+    public delegate bool EnumProc(IntPtr h, IntPtr lp);
+    [DllImport("user32.dll")] public static extern bool EnumWindows(EnumProc cb, IntPtr lp);
+    [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr h);
+    [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr h, out uint pid);
+    [DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr h, StringBuilder sb, int n);
+    [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
+    public const int SW_SHOWMINNOACTIVE = 7;
+    // Get all visible window HWNDs as array
+    public static long[] GetAllVisibleHwnds() {
+        var list = new System.Collections.Generic.List<long>();
+        EnumWindows((h, _) => {
+            if (IsWindowVisible(h)) list.Add(h.ToInt64());
+            return true;
+        }, IntPtr.Zero);
+        return list.ToArray();
+    }
+    // Get PID for a single HWND
+    public static uint GetPidForHwnd(long hwnd) {
+        uint pid; GetWindowThreadProcessId((IntPtr)hwnd, out pid);
+        return pid;
+    }
+    // Get title for a single HWND
+    public static string GetTitle(long hwnd) {
+        var sb = new StringBuilder(256);
+        GetWindowText((IntPtr)hwnd, sb, 256);
+        return sb.ToString();
+    }
+}
+'@
+# Launch strategy: all exe-based, no GUI dialogs.
+# 1) exact path 2) exe in PATH 3) registry install dir 4) raw name
+$target = '${escaped}'
+$proc = $null
+
+# Snapshot ALL visible window HWNDs BEFORE launching. Taking it after
+# Start-Process races with fast-starting apps: their window would already be
+# in the snapshot and never be recognised as new.
+$beforeHwnds = [CuLaunch]::GetAllVisibleHwnds()
+
+# 1. Exact file path
+if (Test-Path $target) {
+  $proc = Start-Process $target -PassThru -ErrorAction SilentlyContinue
+}
+
+# 2. exe name in PATH (notepad.exe, code.exe, chrome.exe, etc.)
+if (-not $proc) {
+  # Try with .exe suffix if not already
+  $tryExe = if ($target -notmatch '[.]exe$') { "$target.exe" } else { $target }
+  $found = Get-Command $tryExe -ErrorAction SilentlyContinue | Select-Object -First 1
+  if (-not $found) { $found = Get-Command $target -ErrorAction SilentlyContinue | Select-Object -First 1 }
+  if ($found) { $proc = Start-Process $found.Source -PassThru -ErrorAction SilentlyContinue }
+}
+
+# 3. Search registry for install location by display name → find .exe
+if (-not $proc) {
+  $regPaths = @('HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*')
+  foreach ($p in $regPaths) {
+    $app = Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object {
+      $_.DisplayName -and $_.DisplayName -match [regex]::Escape($target)
+    } | Select-Object -First 1
+    if ($app) {
+      # Try DisplayIcon (often the exe path), then InstallLocation
+      $exePath = $null
+      if ($app.DisplayIcon -and $app.DisplayIcon -match '[.]exe') {
+        $exePath = ($app.DisplayIcon -split ',')[0].Trim('"')
+      }
+      if (-not $exePath -and $app.InstallLocation) {
+        $exeFile = Get-ChildItem $app.InstallLocation -Filter '*.exe' -ErrorAction SilentlyContinue | Select-Object -First 1
+        if ($exeFile) { $exePath = $exeFile.FullName }
+      }
+      if ($exePath -and (Test-Path $exePath)) {
+        $proc = Start-Process $exePath -PassThru -ErrorAction SilentlyContinue
+        break
+      }
+    }
+  }
+}
+
+# 4. Last resort: direct Start-Process (Windows may resolve it)
+if (-not $proc) { $proc = Start-Process -FilePath $target -PassThru -ErrorAction SilentlyContinue }
+
+if (-not $proc) { Write-Host "LAUNCH_FAILED"; exit }
+
+# Wait for a NEW window from our process PID
+$hwnd = 0
+for ($i = 0; $i -lt 50; $i++) {
+  Start-Sleep -Milliseconds 200
+  $afterHwnds = [CuLaunch]::GetAllVisibleHwnds()
+  # Find new windows (in after but not in before)
+  foreach ($h in $afterHwnds) {
+    if ($beforeHwnds -contains $h) { continue }
+    # New window found — check PID
+    $wPid = [CuLaunch]::GetPidForHwnd($h)
+    if ($wPid -eq [uint32]$proc.Id) {
+      $hwnd = $h; break # exact PID match
+    }
+  }
+  if ($hwnd -ne 0) { break }
+  # PID didn't match (process redirect) — accept new window matching title hint
+  if ($i -gt 10) {
+    $hint = '${escaped}'.Split('\\')[-1].Replace('.exe','')
+    foreach ($h in $afterHwnds) {
+      if ($beforeHwnds -contains $h) { continue }
+      $title = [CuLaunch]::GetTitle($h)
+      if ($title -and $title.IndexOf($hint, [StringComparison]::OrdinalIgnoreCase) -ge 0) {
+        $hwnd = $h; break
+      }
+    }
+    if ($hwnd -ne 0) { break }
+  }
+}
+if ($hwnd -eq 0) { Write-Host "HWND_NOT_FOUND|$($proc.Id)"; exit }
+# Move offscreen instead of minimizing — keeps window restored so
+# PrintWindow and SendMessage work without needing restore/re-minimize.
+# User cannot see the window at -32000,-32000.
+Add-Type @'
+using System;
+using System.Runtime.InteropServices;
+public class CuPos {
+    [DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f);
+    public const uint SWP_NOSIZE = 0x0001;
+    public const uint SWP_NOZORDER = 0x0004;
+    public const uint SWP_NOACTIVATE = 0x0010;
+}
+'@
+[CuPos]::SetWindowPos([IntPtr]::new([long]$hwnd), [IntPtr]::Zero, -32000, -32000, 0, 0, [CuPos]::SWP_NOSIZE -bor [CuPos]::SWP_NOZORDER -bor [CuPos]::SWP_NOACTIVATE) | Out-Null
+Write-Host "$hwnd|$($proc.Id)"
+`)
+    if (!result) {
+      throw new Error(
+        `open(): failed to launch '${name}' — no output from launcher script`,
+      )
+    }
+    if (result.startsWith('LAUNCH_FAILED')) {
+      throw new Error(
+        `open(): failed to launch '${name}' — process did not start (${result})`,
+      )
+    }
+    if (result.startsWith('HWND_NOT_FOUND')) {
+      throw new Error(
+        `open(): launched '${name}' but could not find its window HWND (${result})`,
+      )
+    }
+    const parts = result.trim().split('|')
+    const hwnd = parts[0]!.trim()
+    const pid = Number(parts[1])
+    if (hwnd && hwnd !== '0') {
+      // Bind to the launched window — all subsequent operations target this HWND
+      bindWindow(hwnd, pid)
+    }
+  },
+
+  /** Executable path and process name of the foreground window, or null. */
+  getFrontmostApp(): FrontmostAppInfo | null {
+    try {
+      const out = ps(`${WIN32_TYPES}
+$hwnd = [CuWin32]::GetForegroundWindow()
+$procId = [uint32]0
+[CuWin32]::GetWindowThreadProcessId($hwnd, [ref]$procId) | Out-Null
+$proc = Get-Process -Id $procId -ErrorAction SilentlyContinue
+"$($proc.MainModule.FileName)|$($proc.ProcessName)"`)
+      if (!out || !out.includes('|')) return null
+      const [exePath, appName] = out.split('|', 2)
+      return { id: exePath!, appName: appName! }
+    } catch {
+      return null
+    }
+  },
+
+  /** First window whose title contains `title` (case-sensitive), or null. */
+  findWindowByTitle(title): WindowHandle | null {
+    const windows = listWindows()
+    // Titles may be missing (findExistingWindow guards the same way).
+    const found = windows.find(w => (w.title ?? '').includes(title))
+    if (!found) return null
+    return { id: String(found.hwnd), pid: found.pid, title: found.title }
+  },
+}
+
+// ---------------------------------------------------------------------------
+// Window Management — Win32 API calls targeted at bound HWND.
+// NO global shortcuts (Win+Down, Alt+F4, etc.)
+// Uses ShowWindow, SetWindowPos, SendMessage(WM_CLOSE) directly.
+// ---------------------------------------------------------------------------
+
+const WINDOW_MGMT_TYPES = `
+Add-Type @'
+using System;
+using System.Runtime.InteropServices;
+
+public class CuWinMgmt {
+    [DllImport("user32.dll")]
+    public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow);
+
+    [DllImport("user32.dll")]
+    public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint uFlags);
+
+    [DllImport("user32.dll")]
+    public static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect);
+
+    [DllImport("user32.dll")]
+    public static extern bool SetForegroundWindow(IntPtr hWnd);
+
+    [DllImport("user32.dll")]
+    public static extern bool BringWindowToTop(IntPtr hWnd);
+
+    [DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
+    public static extern IntPtr SendMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam);
+
+    [DllImport("user32.dll")]
+    public static extern bool IsIconic(IntPtr hWnd);
+
+    [DllImport("user32.dll")]
+    public static extern bool IsZoomed(IntPtr hWnd);
+
+    [StructLayout(LayoutKind.Sequential)]
+    public struct RECT {
+        public int Left; public int Top; public int Right; public int Bottom;
+    }
+
+    // ShowWindow constants
+    public const int SW_MINIMIZE = 6;
+    public const int SW_MAXIMIZE = 3;
+    public const int SW_RESTORE = 9;
+    public const int SW_SHOWNOACTIVATE = 4;
+    public const int SW_SHOWMINNOACTIVE = 7;
+
+    // SetWindowPos flags
+    public const uint SWP_NOSIZE = 0x0001;
+    public const uint SWP_NOMOVE = 0x0002;
+    public const uint SWP_NOZORDER = 0x0004;
+    public const uint SWP_NOACTIVATE = 0x0010;
+    public const uint SWP_SHOWWINDOW = 0x0040;
+
+    // WM_CLOSE
+    public const uint WM_CLOSE = 0x0010;
+    // WM_SYSCOMMAND
+    public const uint WM_SYSCOMMAND = 0x0112;
+    public const int SC_MINIMIZE = 0xF020;
+    public const int SC_MAXIMIZE = 0xF030;
+    public const int SC_RESTORE = 0xF120;
+    public const int SC_CLOSE = 0xF060;
+}
+'@
+`
+
+import type { WindowManagementPlatform, WindowAction } from './types.js'
+
+const windowManagement: WindowManagementPlatform = {
+  /**
+   * Apply a window action to the bound HWND.
+   * @returns false when nothing is bound, the action is unknown, or a
+   *          move_resize request carries no position; true otherwise
+   *          (minimize/maximize/restore report whether PowerShell printed output).
+   */
+  manageWindow(action: WindowAction, opts?): boolean {
+    if (!boundHwnd) return false
+    const hwnd = boundHwnd
+
+    switch (action) {
+      case 'minimize': {
+        // ShowWindow(SW_SHOWMINNOACTIVE) — minimizes this HWND without
+        // activating it; targeted at the HWND, not global
+        const r = ps(
+          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_SHOWMINNOACTIVE)`,
+        )
+        return r !== ''
+      }
+      case 'maximize': {
+        const r = ps(
+          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_MAXIMIZE)`,
+        )
+        return r !== ''
+      }
+      case 'restore': {
+        const r = ps(
+          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_RESTORE)`,
+        )
+        return r !== ''
+      }
+      case 'close': {
+        // SendMessage(WM_CLOSE) — graceful close targeted at HWND
+        // Remove the border overlay first, then drop the binding afterwards
+        unmarkBound(hwnd)
+        ps(
+          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SendMessage([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::WM_CLOSE, [IntPtr]::Zero, [IntPtr]::Zero)`,
+        )
+        unbindWindow()
+        return true
+      }
+      case 'focus': {
+        // Restore if minimized, then bring to front
+        ps(`${WINDOW_MGMT_TYPES}
+$h = [IntPtr]::new([long]${hwnd})
+if ([CuWinMgmt]::IsIconic($h)) {
+  [CuWinMgmt]::ShowWindow($h, [CuWinMgmt]::SW_RESTORE) | Out-Null
+}
+[CuWinMgmt]::SetForegroundWindow($h) | Out-Null
+[CuWinMgmt]::BringWindowToTop($h) | Out-Null
+`)
+        return true
+      }
+      case 'move_offscreen': {
+        // Move to -32000,-32000 — keeps window in restored state for SendMessage/PrintWindow
+        ps(
+          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, -32000, -32000, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
+        )
+        return true
+      }
+      case 'move_resize': {
+        // Without a target position nothing is moved, so report failure
+        // instead of claiming success.
+        if (opts?.x === undefined || opts?.y === undefined) return false
+        return this.moveResize(opts.x, opts.y, opts.width, opts.height)
+      }
+      case 'get_rect': {
+        // get_rect is handled separately by getWindowRect(), not through manageWindow
+        // Return true to indicate the action is recognized
+        return true
+      }
+      default:
+        return false
+    }
+  },
+
+  /**
+   * Move the bound window to (x, y); resize as well only when BOTH width and
+   * height are given. Returns false when nothing is bound.
+   */
+  moveResize(x: number, y: number, width?: number, height?: number): boolean {
+    if (!boundHwnd) return false
+    const hwnd = boundHwnd
+    if (width !== undefined && height !== undefined) {
+      ps(
+        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${x}, ${y}, ${width}, ${height}, [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
+      )
+    } else {
+      ps(
+        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${x}, ${y}, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
+      )
+    }
+    return true
+  },
+
+  /** Screen rectangle of the bound window, or null when unbound or the query fails. */
+  getWindowRect(): {
+    x: number
+    y: number
+    width: number
+    height: number
+  } | null {
+    if (!boundHwnd) return null
+    const out = ps(`${WINDOW_MGMT_TYPES}
+$rect = New-Object CuWinMgmt+RECT
+if ([CuWinMgmt]::GetWindowRect([IntPtr]::new([long]${boundHwnd}), [ref]$rect)) {
+  "$($rect.Left),$($rect.Top),$($rect.Right),$($rect.Bottom)"
+} else { "FAIL" }
+`)
+    if (!out || out === 'FAIL') return null
+    const [l, t, r, b] = out.split(',').map(Number)
+    return { x: l, y: t, width: r - l, height: b - t }
+  },
+}
+
+// ---------------------------------------------------------------------------
+// Export
+// 
--------------------------------------------------------------------------- + +// Clean up all overlays on process exit +function cleanupAll() { + cleanupAllBorders() + hideVirtualCursor() + hideIndicator() + // Stop the Python bridge subprocess if it was started + try { + getBridge()?.stopBridge() + } catch {} +} +process.on('exit', cleanupAll) +process.on('SIGINT', () => { + cleanupAll() + process.exit() +}) +process.on('SIGTERM', () => { + cleanupAll() + process.exit() +}) + +export const platform: Platform = { + input, + screenshot, + display, + apps, + windowManagement, +} diff --git a/src/utils/computerUse/swiftLoader.ts b/src/utils/computerUse/swiftLoader.ts index 0fcc23fea..95e5ead7e 100644 --- a/src/utils/computerUse/swiftLoader.ts +++ b/src/utils/computerUse/swiftLoader.ts @@ -3,21 +3,16 @@ import type { ComputerUseAPI } from '@ant/computer-use-swift' let cached: ComputerUseAPI | undefined /** - * Package's js/index.js reads COMPUTER_USE_SWIFT_NODE_PATH (baked by - * build-with-plugins.ts on darwin targets, unset otherwise — falls through to - * the node_modules prebuilds/ path). We cache the loaded native module. - * - * The four @MainActor methods (captureExcluding, captureRegion, - * apps.listInstalled, resolvePrepareCapture) dispatch to DispatchQueue.main - * and will hang under libuv unless CFRunLoop is pumped — call sites wrap - * these in drainRunLoop(). + * macOS-only loader for @ant/computer-use-swift. + * Non-darwin platforms should use src/utils/computerUse/platforms/ instead. */ export function requireComputerUseSwift(): ComputerUseAPI { + if (process.platform !== 'darwin') { + throw new Error('@ant/computer-use-swift is macOS-only. Use platforms/ for cross-platform.') + } if (cached) return cached // eslint-disable-next-line @typescript-eslint/no-require-imports const mod = require('@ant/computer-use-swift') - // macOS native .node exports a plain object with apps/display/screenshot directly. 
- // Our cross-platform package exports { ComputerUseAPI } class — needs instantiation. if (mod.ComputerUseAPI && typeof mod.ComputerUseAPI === 'function') { cached = new mod.ComputerUseAPI() as ComputerUseAPI } else { diff --git a/src/utils/computerUse/win32/accessibilitySnapshot.ts b/src/utils/computerUse/win32/accessibilitySnapshot.ts new file mode 100644 index 000000000..ba0110d78 --- /dev/null +++ b/src/utils/computerUse/win32/accessibilitySnapshot.ts @@ -0,0 +1,225 @@ +/** + * Accessibility Snapshot — captures the UI Automation tree of a window + * and formats it as compact, model-friendly text. + * + * Sent alongside screenshots so the model has BOTH visual + structural + * understanding of the GUI. This enables: + * - Knowing exact element names, types, and positions + * - Using click_element/type_into_element by name instead of pixel coords + * - Understanding disabled/enabled state, current values + * + * Only includes interactive elements (buttons, edits, menus, links, etc.) + * to keep token count low (~200-500 tokens for typical windows). + */ + +import { validateHwnd, ps } from './shared.js' + +export interface AccessibilityNode { + role: string // Button, Edit, MenuItem, Link, Text, CheckBox, etc. + name: string // Visible text / accessible name + automationId: string + bounds: { x: number; y: number; w: number; h: number } + enabled: boolean + value?: string // Current text value (for Edit/ComboBox) + children?: AccessibilityNode[] +} + +export interface AccessibilitySnapshot { + /** Compact text representation for the model */ + text: string + /** Structured tree (for element-targeted actions) */ + nodes: AccessibilityNode[] + /** Capture timestamp */ + timestamp: number +} + +/** + * Capture the accessibility tree of a window, returning only interactive + * and visible elements. Uses Windows UI Automation (crosses process boundaries). 
+ *
+ * @param hwnd - Window handle as string
+ * @param maxDepth - Maximum tree depth (default 4)
+ * @param interactiveOnly - Only include interactive elements (default true)
+ * @returns The snapshot, or null when the script prints nothing or `[]`, or
+ *   when running the script / parsing its output throws. validateHwnd is
+ *   called outside the try block, so anything it throws propagates.
+ */
+export function captureAccessibilitySnapshot(
+  hwnd: string,
+  maxDepth: number = 4,
+  interactiveOnly: boolean = true,
+): AccessibilitySnapshot | null {
+  // NOTE(review): validateHwnd comes from ./shared.js and is not visible here;
+  // presumably it rejects non-numeric handles, which matters because hwnd is
+  // interpolated into the script text below — confirm.
+  hwnd = validateHwnd(hwnd)
+  // PowerShell fragment that defines Is-Interactive: either a whitelist of
+  // control types or a pass-through that accepts every element.
+  const filterClause = interactiveOnly
+    ? `
+    # Interactive control types only
+    $interactiveTypes = @(
+      'Button','Edit','ComboBox','CheckBox','RadioButton',
+      'MenuItem','Menu','MenuBar','Link','Slider','Spinner',
+      'Tab','TabItem','List','ListItem','Tree','TreeItem',
+      'DataGrid','DataItem','Document','ScrollBar','ToolBar',
+      'SplitButton','ToggleButton','Hyperlink'
+    )
+    function Is-Interactive($ct) {
+      $typeName = $ct -replace 'ControlType\\.', ''
+      return $interactiveTypes -contains $typeName
+    }`
+    : `
+    function Is-Interactive($ct) { return $true }`
+
+  // The script walks the UI Automation tree depth-first and prints JSON using
+  // short keys (role/name/id/x/y/w/h/on/v/c) that parseNode expands again.
+  const script = `
+Add-Type -AssemblyName UIAutomationClient
+Add-Type -AssemblyName UIAutomationTypes
+Add-Type -AssemblyName WindowsBase
+${filterClause}
+
+function Get-Tree($el, $depth, $maxDepth) {
+  if ($depth -ge $maxDepth) { return @() }
+  $result = @()
+  $children = $el.FindAll(
+    [System.Windows.Automation.TreeScope]::Children,
+    [System.Windows.Automation.Condition]::TrueCondition)
+  foreach ($child in $children) {
+    $ct = $child.Current.ControlType.ProgrammaticName
+    $typeName = $ct -replace 'ControlType\\.', ''
+    $name = [string]$child.Current.Name
+    $autoId = [string]$child.Current.AutomationId
+    $rect = $child.Current.BoundingRectangle
+    $enabled = $child.Current.IsEnabled
+
+    # Skip invisible/offscreen elements
+    if ($rect.Width -le 0 -or $rect.Height -le 0) { continue }
+    if ($rect.X -lt -10000) { continue }
+
+    $val = $null
+    try {
+      $vp = $child.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
+      if ($vp -ne $null) { $val = $vp.Current.Value }
+    } catch {}
+
+    $isInteractive = Is-Interactive $ct
+    $sub = Get-Tree $child ($depth + 1) $maxDepth
+
+    if ($isInteractive -or $sub.Count -gt 0) {
+      $node = @{
+        role = $typeName
+        name = $name
+        id = $autoId
+        x = [int]$rect.X; y = [int]$rect.Y
+        w = [int]$rect.Width; h = [int]$rect.Height
+        on = $enabled
+      }
+      if ($val -ne $null -and $val -ne '') { $node['v'] = $val }
+      if ($sub.Count -gt 0) { $node['c'] = $sub }
+      $result += $node
+    }
+  }
+  return $result
+}
+
+try {
+  $root = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${hwnd}))
+  if ($root -eq $null) { Write-Output '[]'; exit }
+  $tree = Get-Tree $root 0 ${maxDepth}
+  if ($tree -eq $null -or $tree.Count -eq 0) {
+    Write-Output '[]'
+  } else {
+    $tree | ConvertTo-Json -Depth 20 -Compress
+  }
+} catch {
+  Write-Output '[]'
+}
+`
+
+  try {
+    const raw = ps(script)
+    // The script prints '[]' for every "nothing to report" outcome.
+    if (!raw || raw === '[]') return null
+
+    const parsed = JSON.parse(raw)
+    // The top level may be a bare object instead of an array (presumably when
+    // the tree has exactly one root node), so both shapes are accepted.
+    const nodes: AccessibilityNode[] = Array.isArray(parsed)
+      ? parsed.map(parseNode)
+      : [parseNode(parsed)]
+    const text = formatForModel(nodes)
+
+    return { text, nodes, timestamp: Date.now() }
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Expand one short-keyed node emitted by the capture script into an
+ * AccessibilityNode, recursing into `c` (children).
+ *
+ * Missing fields fall back to '' / 0; `enabled` is true unless `on` is
+ * exactly false; a non-array `c` is treated as a single child.
+ */
+function parseNode(raw: any): AccessibilityNode {
+  return {
+    role: raw.role || '',
+    name: raw.name || '',
+    automationId: raw.id || '',
+    bounds: { x: raw.x || 0, y: raw.y || 0, w: raw.w || 0, h: raw.h || 0 },
+    enabled: raw.on !== false,
+    value: raw.v,
+    children: raw.c
+      ? Array.isArray(raw.c)
+        ? raw.c.map(parseNode)
+        : [parseNode(raw.c)]
+      : undefined,
+  }
+}
+
+/**
+ * Format the accessibility tree as compact text for the model.
+ * Example output:
+ *   [Button] "Save" (120,50 80x30)
+ *   [Edit] (200,80 400x25) value="hello world" id=textBox1
+ *   [MenuItem] "File" (10,0 40x25) DISABLED
+ *
+ * The name is omitted when empty, DISABLED is printed only for disabled
+ * elements, and children follow their parent one indent level deeper.
+ *
+ * @param nodes - Sibling nodes to render, in order
+ * @param indent - Nesting depth of `nodes`; controls the leading padding
+ * @returns One line per node, joined with newlines
+ */
+function formatForModel(
+  nodes: AccessibilityNode[],
+  indent: number = 0,
+): string {
+  const lines: string[] = []
+  const pad = ' '.repeat(indent)
+
+  for (const node of nodes) {
+    let line = `${pad}[${node.role}]`
+    if (node.name) line += ` "${truncate(node.name, 40)}"`
+    line += ` (${node.bounds.x},${node.bounds.y} ${node.bounds.w}x${node.bounds.h})`
+    if (!node.enabled) line += ' DISABLED'
+    if (node.value) line += ` value="${truncate(node.value, 30)}"`
+    if (node.automationId) line += ` id=${node.automationId}`
+    lines.push(line)
+
+    // Recurse only into non-empty child lists: an empty array would push ''
+    // and leave a stray blank line in the output.
+    if (node.children && node.children.length > 0) {
+      lines.push(formatForModel(node.children, indent + 1))
+    }
+  }
+
+  return lines.join('\n')
+}
+
+/**
+ * Shorten `s` to at most `max` characters, replacing the tail with '…'.
+ * Strings that already fit are returned unchanged.
+ */
+function truncate(s: string, max: number): string {
+  return s.length > max ? s.slice(0, max - 1) + '…' : s
+}
+
+/**
+ * Find an element in the accessibility tree by name, role, or automationId.
+ * Returns the first match.
+ *
+ * The search is depth-first in document order. Name matching is a
+ * case-insensitive substring test, role matching is case-insensitive
+ * equality, automationId matching is exact. Every criterion supplied must
+ * hold; a query with no criteria never matches.
+ */
+export function findNodeInSnapshot(
+  nodes: AccessibilityNode[],
+  query: { name?: string; role?: string; automationId?: string },
+): AccessibilityNode | null {
+  // Empty strings count as "criterion not supplied".
+  const wantedName = query.name ? query.name.toLowerCase() : undefined
+  const wantedRole = query.role ? query.role.toLowerCase() : undefined
+  const wantedId = query.automationId ? query.automationId : undefined
+  const hasCriteria = Boolean(wantedName || wantedRole || wantedId)
+
+  for (const candidate of nodes) {
+    const nameOk =
+      wantedName === undefined ||
+      candidate.name.toLowerCase().includes(wantedName)
+    const roleOk =
+      wantedRole === undefined || candidate.role.toLowerCase() === wantedRole
+    const idOk = wantedId === undefined || candidate.automationId === wantedId
+    if (hasCriteria && nameOk && roleOk && idOk) return candidate
+
+    // No hit on this node: descend before moving on to the next sibling.
+    const nested = candidate.children
+      ? findNodeInSnapshot(candidate.children, query)
+      : null
+    if (nested) return nested
+  }
+  return null
+}
diff --git a/src/utils/computerUse/win32/appDispatcher.ts b/src/utils/computerUse/win32/appDispatcher.ts
new file mode 100644
index 000000000..f3f34324c
--- /dev/null
+++ b/src/utils/computerUse/win32/appDispatcher.ts
@@ -0,0 +1,129 @@
+/**
+ * Application type dispatcher for Windows Computer Use.
+ *
+ * Routes operations to the appropriate controller based on file type:
+ * - .xlsx/.xls/.csv → Excel COM (headless, no window)
+ * - .docx/.doc → Word COM (headless, no window)
+ * - .txt/.log/.md → notepad + SendMessage + HWND bind (offscreen)
+ * - Others → generic exe + HWND bind (offscreen)
+ */
+
+import { extname } from 'path'
+
+export type AppType = 'excel' | 'word' | 'text' | 'browser' | 'generic'
+
+const EXCEL_EXTS = new Set(['.xlsx', '.xls', '.csv', '.xlsm', '.xlsb'])
+const WORD_EXTS = new Set(['.docx', '.doc', '.rtf'])
+const TEXT_EXTS = new Set([
+  '.txt',
+  '.log',
+  '.md',
+  '.json',
+  '.xml',
+  '.yaml',
+  '.yml',
+  '.ini',
+  '.cfg',
+  '.conf',
+])
+const BROWSER_NAMES = new Set(['chrome', 'msedge', 'firefox', 'brave', 'opera'])
+
+/**
+ * Detect application type from file path or app name.
+ *
+ * A recognised file extension wins. Otherwise the base name (directory and
+ * trailing ".exe" removed, lower-cased) is compared against known programs.
+ *
+ * @param nameOrPath - File path, executable path, or bare application name
+ * @returns The controller family to use; 'generic' when nothing matches
+ */
+export function detectAppType(nameOrPath: string): AppType {
+  const lower = nameOrPath.toLowerCase()
+
+  // Check by extension
+  const ext = extname(lower)
+  if (ext) {
+    if (EXCEL_EXTS.has(ext)) return 'excel'
+    if (WORD_EXTS.has(ext)) return 'word'
+    if (TEXT_EXTS.has(ext)) return 'text'
+  }
+
+  // Check by app name
+  const baseName =
+    lower
+      .replace(/\.exe$/, '')
+      .split(/[/\\]/)
+      .pop() ?? ''
+  // Match the product name only as a standalone word. A plain substring test
+  // also captured unrelated programs ("wordpad", "password-tool") and sent
+  // them to the Office COM controllers.
+  const mentions = (product: string): boolean =>
+    new RegExp(`(^|[^a-z])${product}($|[^a-z])`).test(baseName)
+  if (mentions('excel')) return 'excel'
+  if (baseName === 'winword' || mentions('word')) return 'word'
+  if (baseName === 'notepad' || baseName === 'notepad++' || baseName === 'code')
+    return 'text'
+  if (BROWSER_NAMES.has(baseName)) return 'browser'
+
+  return 'generic'
+}
+
+export interface OpenResult {
+  type: AppType
+  /** HWND for text/browser/generic apps (SendMessage target) */
+  hwnd?: string
+  /** File path for COM-controlled apps (Excel/Word) */
+  filePath?: string
+}
+
+/**
+ * Open a file or app with the appropriate controller.
+ *
+ * - Excel/Word: COM automation (no window, no HWND needed)
+ * - Text/Browser/Generic: exe launch + offscreen HWND bind
+ *
+ * Returns the app type and either HWND or file path for subsequent operations.
+ *
+ * For an existing Excel/Word document the file is opened once through COM as
+ * a readability probe; a failure of that probe is deliberately ignored and
+ * the path is returned either way. When only the application name is given,
+ * a new empty document is created under %TEMP%.
+ *
+ * @param nameOrPath - File path or application name
+ * @returns `{ type, filePath }` for Excel/Word, `{ type }` otherwise (the
+ *   HWND is bound by the caller)
+ */
+export async function openWithController(
+  nameOrPath: string,
+): Promise<OpenResult> {
+  const type = detectAppType(nameOrPath)
+
+  switch (type) {
+    case 'excel': {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      const { createExcel, openExcel } =
+        require('./comExcel.js') as typeof import('./comExcel.js')
+      const isExisting = nameOrPath.match(/\.(xlsx|xls|csv|xlsm|xlsb)$/i)
+      if (isExisting) {
+        // Open existing file — just verify it's readable
+        try {
+          openExcel(nameOrPath)
+        } catch {
+          // Best-effort probe: later operations report their own errors.
+        }
+        return { type: 'excel', filePath: nameOrPath }
+      }
+      // "excel" or "excel.exe" without a file — create new
+      const tmpPath = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.xlsx`
+      createExcel(tmpPath)
+      return { type: 'excel', filePath: tmpPath }
+    }
+
+    case 'word': {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      const { createWord, openWord } =
+        require('./comWord.js') as typeof import('./comWord.js')
+      const isExisting = nameOrPath.match(/\.(docx|doc|rtf)$/i)
+      if (isExisting) {
+        try {
+          openWord(nameOrPath)
+        } catch {
+          // Best-effort probe, same as the Excel branch.
+        }
+        return { type: 'word', filePath: nameOrPath }
+      }
+      const tmpPath = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.docx`
+      createWord(tmpPath)
+      return { type: 'word', filePath: tmpPath }
+    }
+
+    default:
+      // text/browser/generic — HWND bind handled by caller (platforms/win32.ts open())
+      return { type }
+  }
+}
diff --git a/src/utils/computerUse/win32/bridge.py b/src/utils/computerUse/win32/bridge.py
new file mode 100644
index 000000000..5bba36978
--- /dev/null
+++ b/src/utils/computerUse/win32/bridge.py
@@ -0,0 +1,525 @@
+"""
+Python Bridge for Windows Computer Use.
+
+Long-lived subprocess communicating via stdin/stdout JSON lines.
+Replaces per-call PowerShell spawning with a persistent process.
+
+Capabilities:
+  - screenshot: full-screen or per-window (mss + PrintWindow)
+  - input: mouse click/move/drag, keyboard type/key (ctypes user32)
+  - windows: enumerate, find, get rect, manage (show/min/max/close)
+  - accessibility: UI Automation tree snapshot (comtypes + UIAutomation)
+
+Protocol: one JSON object per line on stdin → one JSON object per line on stdout.
+  Request:  {"id": 1, "method": "screenshot", "params": {...}}
+  Response: {"id": 1, "result": {...}} or {"id": 1, "error": "message"}
+"""
+
+import sys
+import json
+import base64
+import io
+import ctypes
+import ctypes.wintypes
+import time
+import os
+
+# Force UTF-8 output
+sys.stdout.reconfigure(encoding='utf-8')
+sys.stdin.reconfigure(encoding='utf-8')
+
+# DLL handles shared by every function in this module.
+user32 = ctypes.windll.user32
+gdi32 = ctypes.windll.gdi32
+kernel32 = ctypes.windll.kernel32
+
+# ---------------------------------------------------------------------------
+# Win32 constants & types
+# ---------------------------------------------------------------------------
+# Window message identifiers passed to SendMessageW.
+WM_CHAR = 0x0102
+WM_KEYDOWN = 0x0100
+WM_KEYUP = 0x0101
+WM_CLOSE = 0x0010
+WM_LBUTTONDOWN = 0x0201
+WM_LBUTTONUP = 0x0202
+WM_RBUTTONDOWN = 0x0204
+WM_RBUTTONUP = 0x0205
+WM_MOUSEMOVE = 0x0200
+
+# Show-state commands passed to ShowWindow.
+SW_MINIMIZE = 6
+SW_MAXIMIZE = 3
+SW_RESTORE = 9
+SW_SHOWMINNOACTIVE = 7
+
+# Flags passed to SetWindowPos.
+SWP_NOSIZE = 0x0001
+SWP_NOMOVE = 0x0002
+SWP_NOZORDER = 0x0004
+SWP_NOACTIVATE = 0x0010
+
+# Callback prototype used with EnumWindows: (hwnd, lparam) -> bool.
+WNDENUMPROC = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)
+
+class RECT(ctypes.Structure):
+    """Rectangle given by its four edges, each a C long."""
+    _fields_ = [("left", ctypes.c_long), ("top", ctypes.c_long),
+                ("right", ctypes.c_long), ("bottom", ctypes.c_long)]
+
+class POINT(ctypes.Structure):
+    """Point with C long x/y coordinates."""
+    _fields_ = [("x", ctypes.c_long), ("y", ctypes.c_long)]
+
+# SendMessageW
+# Explicit signature: handle, wParam, lParam and the result are all declared
+# pointer-sized (c_void_p) rather than left at the ctypes default.
+SendMessageW = user32.SendMessageW
+SendMessageW.argtypes = [ctypes.c_void_p, ctypes.c_uint, ctypes.c_void_p, ctypes.c_void_p]
+SendMessageW.restype = ctypes.c_void_p
+
+# ---------------------------------------------------------------------------
+# Screenshot
+#
---------------------------------------------------------------------------
+def screenshot_full(display_id=0):
+    """Full-screen screenshot via mss, returns JPEG base64.
+
+    display_id is a zero-based index mapped onto sct.monitors[display_id + 1];
+    an index past the end falls back to sct.monitors[1].
+    Returns a dict with 'base64', 'width' and 'height'.
+    """
+    import mss
+    from PIL import Image
+    with mss.mss() as sct:
+        monitor = sct.monitors[display_id + 1] if display_id < len(sct.monitors) - 1 else sct.monitors[1]
+        shot = sct.grab(monitor)
+        img = Image.frombytes('RGB', shot.size, shot.bgra, 'raw', 'BGRX')
+        buf = io.BytesIO()
+        img.save(buf, format='JPEG', quality=75)
+        return {
+            'base64': base64.b64encode(buf.getvalue()).decode(),
+            'width': shot.width,
+            'height': shot.height,
+        }
+
+def screenshot_window(hwnd_str):
+    """Window screenshot via PrintWindow, returns JPEG base64.
+
+    Returns None when the handle is not a window or its rectangle is empty;
+    otherwise a dict with 'base64', 'width' and 'height'. A minimized window
+    is shown without activation for the capture and minimized again afterwards.
+    """
+    from PIL import Image
+    hwnd = int(hwnd_str)
+    if not user32.IsWindow(hwnd):
+        return None
+
+    # Get window rect
+    rect = RECT()
+    user32.GetWindowRect(hwnd, ctypes.byref(rect))
+    w = rect.right - rect.left
+    h = rect.bottom - rect.top
+    if w <= 0 or h <= 0:
+        return None
+
+    # Handle minimized windows
+    was_minimized = user32.IsIconic(hwnd)
+    if was_minimized:
+        user32.ShowWindow(hwnd, 4)  # SW_SHOWNOACTIVATE
+        time.sleep(0.1)
+        # Re-read the rectangle: the size is taken again after restoring.
+        user32.GetWindowRect(hwnd, ctypes.byref(rect))
+        w = rect.right - rect.left
+        h = rect.bottom - rect.top
+
+    # Create DC and bitmap
+    # NOTE(review): none of the GDI objects below are released if an exception
+    # is raised before the cleanup section — consider try/finally.
+    hdc_window = user32.GetDC(hwnd)
+    hdc_mem = gdi32.CreateCompatibleDC(hdc_window)
+    hbm = gdi32.CreateCompatibleBitmap(hdc_window, w, h)
+    gdi32.SelectObject(hdc_mem, hbm)
+
+    # PrintWindow with PW_RENDERFULLCONTENT
+    result = ctypes.windll.user32.PrintWindow(hwnd, hdc_mem, 2)
+
+    if not result:
+        # Fallback to BitBlt
+        gdi32.BitBlt(hdc_mem, 0, 0, w, h, hdc_window, 0, 0, 0x00CC0020)  # SRCCOPY
+
+    # Extract bitmap bits
+    class BITMAPINFOHEADER(ctypes.Structure):
+        _fields_ = [
+            ('biSize', ctypes.c_uint32), ('biWidth', ctypes.c_int32),
+            ('biHeight', ctypes.c_int32), ('biPlanes', ctypes.c_uint16),
+            ('biBitCount', ctypes.c_uint16), ('biCompression', ctypes.c_uint32),
+            ('biSizeImage', ctypes.c_uint32), ('biXPelsPerMeter', ctypes.c_int32),
+            ('biYPelsPerMeter', ctypes.c_int32), ('biClrUsed', ctypes.c_uint32),
+            ('biClrImportant', ctypes.c_uint32),
+        ]
+
+    bmi = BITMAPINFOHEADER()
+    bmi.biSize = ctypes.sizeof(BITMAPINFOHEADER)
+    bmi.biWidth = w
+    bmi.biHeight = -h  # top-down
+    bmi.biPlanes = 1
+    bmi.biBitCount = 32
+    bmi.biCompression = 0  # BI_RGB
+
+    # 32 bits per pixel, so four bytes for each of the w*h pixels.
+    buf_size = w * h * 4
+    pixel_buf = ctypes.create_string_buffer(buf_size)
+    gdi32.GetDIBits(hdc_mem, hbm, 0, h, pixel_buf, ctypes.byref(bmi), 0)
+
+    # Cleanup GDI
+    gdi32.DeleteObject(hbm)
+    gdi32.DeleteDC(hdc_mem)
+    user32.ReleaseDC(hwnd, hdc_window)
+
+    if was_minimized:
+        user32.ShowWindow(hwnd, SW_SHOWMINNOACTIVE)
+
+    # Convert to JPEG
+    img = Image.frombuffer('RGBA', (w, h), pixel_buf, 'raw', 'BGRA', 0, 1)
+    img = img.convert('RGB')
+    out = io.BytesIO()
+    img.save(out, format='JPEG', quality=75)
+
+    return {
+        'base64': base64.b64encode(out.getvalue()).decode(),
+        'width': w,
+        'height': h,
+    }
+
+# ---------------------------------------------------------------------------
+# Window management
+# ---------------------------------------------------------------------------
+def list_windows():
+    """Enumerate all visible windows with title.
+
+    Returns a list of {'hwnd': str, 'pid': int, 'title': str}; windows whose
+    title length is zero are left out.
+    """
+    windows = []
+    def cb(hwnd, _):
+        if user32.IsWindowVisible(hwnd):
+            length = user32.GetWindowTextLengthW(hwnd)
+            if length > 0:
+                buf = ctypes.create_unicode_buffer(length + 1)
+                user32.GetWindowTextW(hwnd, buf, length + 1)
+                pid = ctypes.c_uint32()
+                user32.GetWindowThreadProcessId(hwnd, ctypes.byref(pid))
+                windows.append({'hwnd': str(hwnd), 'pid': pid.value, 'title': buf.value})
+        # Returning True keeps the enumeration going.
+        return True
+    user32.EnumWindows(WNDENUMPROC(cb), 0)
+    return windows
+
+def get_window_rect(hwnd_str):
+    """Return the window rectangle as {'x', 'y', 'width', 'height'}, or None
+    when GetWindowRect reports failure."""
+    hwnd = int(hwnd_str)
+    rect = RECT()
+    if user32.GetWindowRect(hwnd, ctypes.byref(rect)):
+        return {'x': rect.left, 'y': rect.top,
+                'width': rect.right - rect.left, 'height': rect.bottom - rect.top}
+    return None
+
+def get_client_offset(hwnd_str):
+    """Get non-client area offset (title
bar height, border width)."""
+    hwnd = int(hwnd_str)
+    wr = RECT()
+    user32.GetWindowRect(hwnd, ctypes.byref(wr))
+    # Screen position of the client-area origin minus the window's top-left.
+    pt = POINT(0, 0)
+    user32.ClientToScreen(hwnd, ctypes.byref(pt))
+    return {'dx': pt.x - wr.left, 'dy': pt.y - wr.top}
+
+def manage_window(hwnd_str, action):
+    """Apply a window-management action to the window.
+
+    action is one of 'minimize', 'maximize', 'restore', 'close', 'focus' or
+    'move_offscreen'. Returns the ShowWindow result for the first three, True
+    for the others, and False for an unknown action.
+    """
+    hwnd = int(hwnd_str)
+    if action == 'minimize':
+        return user32.ShowWindow(hwnd, SW_SHOWMINNOACTIVE)
+    elif action == 'maximize':
+        return user32.ShowWindow(hwnd, SW_MAXIMIZE)
+    elif action == 'restore':
+        return user32.ShowWindow(hwnd, SW_RESTORE)
+    elif action == 'close':
+        # Post instead of send: SendMessage only returns once the target has
+        # finished handling WM_CLOSE, so a modal "save changes?" prompt would
+        # block this single-threaded bridge and stall every later request.
+        user32.PostMessageW(ctypes.c_void_p(hwnd), WM_CLOSE,
+                            ctypes.c_void_p(0), ctypes.c_void_p(0))
+        return True
+    elif action == 'focus':
+        if user32.IsIconic(hwnd):
+            user32.ShowWindow(hwnd, SW_RESTORE)
+        user32.SetForegroundWindow(hwnd)
+        return True
+    elif action == 'move_offscreen':
+        user32.SetWindowPos(hwnd, 0, -32000, -32000, 0, 0,
+                            SWP_NOSIZE | SWP_NOZORDER | SWP_NOACTIVATE)
+        return True
+    return False
+
+# ---------------------------------------------------------------------------
+# Input — all via SendMessageW (window-targeted, no global)
+# ---------------------------------------------------------------------------
+def make_lparam(x, y):
+    """Pack two coordinates into one 32-bit value: x in the low word, y in the
+    high word. Both are masked to 16 bits so negative or oversized values
+    cannot spill outside their word."""
+    return ((y & 0xFFFF) << 16) | (x & 0xFFFF)
+
+def send_click(hwnd_str, x, y, button='left'):
+    """Send a button-down/button-up pair at (x, y) to the window.
+
+    Only 'left' and 'right' are handled; any other button sends nothing.
+    Always returns True.
+    """
+    hwnd = int(hwnd_str)
+    lp = make_lparam(x, y)
+    if button == 'left':
+        SendMessageW(hwnd, WM_LBUTTONDOWN, 0, lp)
+        SendMessageW(hwnd, WM_LBUTTONUP, 0, lp)
+    elif button == 'right':
+        SendMessageW(hwnd, WM_RBUTTONDOWN, 0, lp)
+        SendMessageW(hwnd, WM_RBUTTONUP, 0, lp)
+    return True
+
+def send_text(hwnd_str, text):
+    """Send text via WM_CHAR (Unicode).
Handles surrogate pairs."""
+    target = int(hwnd_str)
+    for char in text:
+        codepoint = ord(char)
+        if codepoint > 0xFFFF:
+            # Outside the BMP: deliver as a high/low surrogate pair.
+            offset = codepoint - 0x10000
+            units = (0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
+        else:
+            units = (codepoint,)
+        for unit in units:
+            SendMessageW(target, WM_CHAR, unit, 0)
+    return True
+
+def send_key(hwnd_str, vk, action='down'):
+    """Send one key message for virtual-key code vk: WM_KEYDOWN when action
+    is 'down', WM_KEYUP for anything else."""
+    target = int(hwnd_str)
+    message = WM_KEYUP if action != 'down' else WM_KEYDOWN
+    SendMessageW(target, message, vk, 0)
+    return True
+
+def send_keys_combo(hwnd_str, keys):
+    """Send a key combination like ['ctrl', 's'].
+
+    Modifiers are pressed in the order given, the main key is pressed and
+    released, then the modifiers are released in reverse order. When several
+    non-modifier keys are listed the last one wins. Returns False when no
+    main key could be resolved.
+    """
+    VK = {
+        'ctrl': 0x11, 'control': 0x11, 'shift': 0x10, 'alt': 0x12,
+        'enter': 0x0D, 'return': 0x0D, 'tab': 0x09, 'escape': 0x1B,
+        'backspace': 0x08, 'delete': 0x2E, 'space': 0x20,
+        'left': 0x25, 'up': 0x26, 'right': 0x27, 'down': 0x28,
+        'home': 0x24, 'end': 0x23, 'pageup': 0x21, 'pagedown': 0x22,
+        'f1': 0x70, 'f2': 0x71, 'f3': 0x72, 'f4': 0x73, 'f5': 0x74,
+        'f6': 0x75, 'f7': 0x76, 'f8': 0x77, 'f9': 0x78, 'f10': 0x79,
+        'f11': 0x7A, 'f12': 0x7B,
+    }
+    MODIFIERS = {'ctrl', 'control', 'shift', 'alt'}
+    target = int(hwnd_str)
+    modifier_codes = []
+    primary = None
+    for raw in keys:
+        token = raw.lower()
+        if token in MODIFIERS:
+            modifier_codes.append(VK.get(token, 0))
+            continue
+        if token in VK:
+            primary = VK[token]
+        elif len(token) == 1:
+            # Single characters map to the code of their upper-case form.
+            primary = ord(token.upper())
+    if primary is None:
+        return False
+    # Build the whole message sequence first, then deliver it in order.
+    sequence = [(WM_KEYDOWN, code) for code in modifier_codes]
+    sequence += [(WM_KEYDOWN, primary), (WM_KEYUP, primary)]
+    sequence += [(WM_KEYUP, code) for code in reversed(modifier_codes)]
+    for message, code in sequence:
+        SendMessageW(target, message, code, 0)
+    return True
+
+def send_mouse_down(hwnd_str, x, y):
+    """Send a left-button-down message at (x, y)."""
+    SendMessageW(int(hwnd_str), WM_LBUTTONDOWN, 0, make_lparam(x, y))
+    return True
+
+def send_mouse_up(hwnd_str, x, y):
+    """Send a left-button-up message at (x, y)."""
+    SendMessageW(int(hwnd_str), WM_LBUTTONUP, 0, make_lparam(x, y))
+    return True
+
+def send_mouse_move(hwnd_str, x, y):
+    """Send a mouse-move message at (x, y)."""
+    SendMessageW(int(hwnd_str), WM_MOUSEMOVE, 0, make_lparam(x, y))
+    return True
+
+#
---------------------------------------------------------------------------
+# Accessibility snapshot (UI Automation via comtypes)
+# ---------------------------------------------------------------------------
+_uia_client = None
+
+def _get_uia():
+    """Lazily create and cache the comtypes CUIAutomation client.
+
+    Returns None when comtypes or the UIAutomationCore type library cannot be
+    loaded; the failure is swallowed on purpose.
+    """
+    global _uia_client
+    if _uia_client is None:
+        try:
+            import comtypes.client
+            comtypes.client.GetModule('UIAutomationCore.dll')
+            from comtypes.gen.UIAutomationClient import CUIAutomation
+            _uia_client = comtypes.client.CreateObject(CUIAutomation)
+        except Exception:
+            # Fallback: use pywinauto
+            pass
+    return _uia_client
+
+def accessibility_snapshot(hwnd_str, max_depth=4):
+    """Get the accessibility tree using pywinauto (more reliable than raw comtypes).
+
+    Returns a list of node dicts with short keys (role, name, id, x, y, w, h,
+    on, optional v = value capped at 100 characters, optional c = children),
+    or None when the window is not found, the tree is empty, or anything
+    raises. A node is kept when its control type is in INTERACTIVE or it has
+    kept descendants.
+    """
+    try:
+        from pywinauto import Desktop
+        from pywinauto.controls.uiawrapper import UIAWrapper
+
+        hwnd = int(hwnd_str)
+        app = Desktop(backend='uia')
+        # Find window by handle
+        win = None
+        for w in app.windows():
+            if w.handle == hwnd:
+                win = w
+                break
+        if win is None:
+            return None
+
+        INTERACTIVE = {'Button', 'Edit', 'ComboBox', 'CheckBox', 'RadioButton',
+                       'MenuItem', 'Menu', 'MenuBar', 'Hyperlink', 'Slider',
+                       'Tab', 'TabItem', 'List', 'ListItem', 'Document',
+                       'TreeItem', 'DataItem', 'ToolBar', 'SplitButton'}
+
+        def walk(element, depth):
+            if depth >= max_depth:
+                return []
+            nodes = []
+            try:
+                children = element.children()
+            except Exception:
+                return []
+            for child in children:
+                try:
+                    ct = child.element_info.control_type or ''
+                    name = child.element_info.name or ''
+                    auto_id = child.element_info.automation_id or ''
+                    rect = child.rectangle()
+                    w = rect.right - rect.left
+                    h = rect.bottom - rect.top
+                    # Skip zero-size and far-offscreen elements.
+                    if w <= 0 or h <= 0 or rect.left < -10000:
+                        continue
+                    enabled = child.is_enabled()
+                    value = None
+                    try:
+                        value = child.get_value()
+                    except Exception:
+                        pass
+                    sub = walk(child, depth + 1)
+                    if ct in INTERACTIVE or sub:
+                        node = {
+                            'role': ct, 'name': name, 'id': auto_id,
+                            'x': rect.left, 'y': rect.top, 'w': w, 'h': h,
+                            'on': enabled,
+                        }
+                        if value:
+                            node['v'] = str(value)[:100]
+                        if sub:
+                            node['c'] = sub
+                        nodes.append(node)
+                except Exception:
+                    continue
+            return nodes
+
+        tree = walk(win, 0)
+        return tree if tree else None
+    except Exception as e:
+        return None
+
+# ---------------------------------------------------------------------------
+# Find edit child (for text input targeting)
+# ---------------------------------------------------------------------------
+def find_edit_child(hwnd_str):
+    """Find the best edit control child using UI Automation.
+
+    Returns the handle (as a string) of the first descendant whose control
+    type is 'Edit' or 'Document', or None when there is none, when that
+    descendant has no handle, or when anything raises.
+    """
+    try:
+        from pywinauto import Desktop
+        hwnd = int(hwnd_str)
+        app = Desktop(backend='uia')
+        for w in app.windows():
+            if w.handle == hwnd:
+                # Find first Edit or Document control
+                for child in w.descendants():
+                    try:
+                        ct = child.element_info.control_type
+                        if ct in ('Edit', 'Document'):
+                            return str(child.handle) if child.handle else None
+                    except Exception:
+                        continue
+                break
+    except Exception:
+        pass
+    return None
+
+# ---------------------------------------------------------------------------
+# Clipboard paste (for large text)
+# ---------------------------------------------------------------------------
+def paste_text(hwnd_str, text):
+    """Set clipboard + send Ctrl+V via SendMessage.
+
+    Returns True once the clipboard holds the text and the key combination
+    has been sent, False when the clipboard cannot be opened or the global
+    memory block cannot be allocated, locked or handed over.
+    """
+    CF_UNICODETEXT = 13
+    GMEM_MOVEABLE = 0x0002
+
+    # ctypes assumes a C int for undeclared arguments and results, which
+    # truncates handles and pointers on 64-bit Python, so the signatures
+    # are declared explicitly.
+    kernel32.GlobalAlloc.argtypes = [ctypes.c_uint, ctypes.c_size_t]
+    kernel32.GlobalAlloc.restype = ctypes.c_void_p
+    kernel32.GlobalLock.argtypes = [ctypes.c_void_p]
+    kernel32.GlobalLock.restype = ctypes.c_void_p
+    kernel32.GlobalUnlock.argtypes = [ctypes.c_void_p]
+    kernel32.GlobalFree.argtypes = [ctypes.c_void_p]
+    kernel32.GlobalFree.restype = ctypes.c_void_p
+    user32.OpenClipboard.argtypes = [ctypes.c_void_p]
+    user32.SetClipboardData.argtypes = [ctypes.c_uint, ctypes.c_void_p]
+    user32.SetClipboardData.restype = ctypes.c_void_p
+
+    # UTF-16LE payload with a terminating NUL code unit.
+    data = text.encode('utf-16-le') + b'\x00\x00'
+
+    if not user32.OpenClipboard(None):
+        return False
+    try:
+        user32.EmptyClipboard()
+        h = kernel32.GlobalAlloc(GMEM_MOVEABLE, len(data))
+        if not h:
+            return False
+        ptr = kernel32.GlobalLock(h)
+        if not ptr:
+            kernel32.GlobalFree(h)
+            return False
+        ctypes.memmove(ptr, data, len(data))
+        kernel32.GlobalUnlock(h)
+        # On success the clipboard owns the block; free it only on failure.
+        if not user32.SetClipboardData(CF_UNICODETEXT, h):
+            kernel32.GlobalFree(h)
+            return False
+    finally:
+        # Always release the clipboard so other programs are not locked out.
+        user32.CloseClipboard()
+    # Send Ctrl+V
+    send_keys_combo(hwnd_str, ['ctrl', 'v'])
+    return True
+
+# ---------------------------------------------------------------------------
+# Mouse wheel scroll (WM_MOUSEWHEEL / WM_MOUSEHWHEEL)
+# ---------------------------------------------------------------------------
+WM_MOUSEWHEEL = 0x020A
+WM_MOUSEHWHEEL = 0x020E
+
+# ClientToScreen for screen coords in
lParam
+user32.ClientToScreen.argtypes = [ctypes.c_void_p, ctypes.POINTER(POINT)]
+user32.ClientToScreen.restype = ctypes.c_bool
+
+def send_mouse_wheel(hwnd_str, x, y, delta, horizontal=False):
+    """Send mouse wheel scroll at client coordinates (x, y).
+    delta: positive = up/right, negative = down/left. In "clicks" (1 click = 120 units).
+    """
+    hwnd = int(hwnd_str)
+    msg = WM_MOUSEHWHEEL if horizontal else WM_MOUSEWHEEL
+    wheel_delta = int(delta) * 120
+    # Convert client coords to screen coords for lParam
+    pt = POINT(int(x), int(y))
+    user32.ClientToScreen(hwnd, ctypes.byref(pt))
+    # wParam: high word = delta (signed short), low word = modifier keys (0)
+    wparam = ctypes.c_void_p(wheel_delta << 16)
+    # lParam: screen coords
+    lparam = ctypes.c_void_p((pt.y << 16) | (pt.x & 0xFFFF))
+    SendMessageW(hwnd, msg, wparam, lparam)
+    return True
+
+# ---------------------------------------------------------------------------
+# Dispatch
+# ---------------------------------------------------------------------------
+# Method name -> handler taking the request's params dict.
+METHODS = {
+    'screenshot': lambda p: screenshot_full(p.get('display_id', 0)),
+    'screenshot_window': lambda p: screenshot_window(p['hwnd']),
+    'list_windows': lambda p: list_windows(),
+    'get_window_rect': lambda p: get_window_rect(p['hwnd']),
+    'get_client_offset': lambda p: get_client_offset(p['hwnd']),
+    'manage_window': lambda p: manage_window(p['hwnd'], p['action']),
+    'send_click': lambda p: send_click(p['hwnd'], p['x'], p['y'], p.get('button', 'left')),
+    'send_text': lambda p: send_text(p['hwnd'], p['text']),
+    'send_key': lambda p: send_key(p['hwnd'], p['vk'], p.get('action', 'down')),
+    'send_keys': lambda p: send_keys_combo(p['hwnd'], p['keys']),
+    'send_mouse_down': lambda p: send_mouse_down(p['hwnd'], p['x'], p['y']),
+    'send_mouse_up': lambda p: send_mouse_up(p['hwnd'], p['x'], p['y']),
+    'send_mouse_move': lambda p: send_mouse_move(p['hwnd'], p['x'], p['y']),
+    'paste_text': lambda p: paste_text(p['hwnd'], p['text']),
+    'send_mouse_wheel': lambda p: send_mouse_wheel(p['hwnd'], p['x'], p['y'], p['delta'], p.get('horizontal', False)),
+    'find_edit_child': lambda p: find_edit_child(p['hwnd']),
+    'accessibility_snapshot': lambda p: accessibility_snapshot(p['hwnd'], p.get('max_depth', 4)),
+    'ping': lambda p: {'ok': True, 'pid': os.getpid()},
+}
+
+def main():
+    """Main loop: read JSON lines from stdin, dispatch, write JSON lines to stdout.
+
+    Every non-blank input line produces exactly one response line. A bad
+    request (invalid JSON, a JSON value that is not an object, an unknown
+    method, a handler exception, or a result that cannot be serialised) is
+    answered with an 'error' response instead of ending the loop.
+    """
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        req_id = 0
+        try:
+            req = json.loads(line)
+            # Valid JSON that is not an object (e.g. [] or 42) has no .get();
+            # reject it here rather than crash the bridge.
+            if not isinstance(req, dict):
+                raise TypeError('request must be a JSON object')
+            req_id = req.get('id', 0)
+            method = req.get('method', '')
+            params = req.get('params', {})
+
+            if method not in METHODS:
+                resp = {'id': req_id, 'error': f'unknown method: {method}'}
+            else:
+                try:
+                    result = METHODS[method](params)
+                    resp = {'id': req_id, 'result': result}
+                except Exception as e:
+                    resp = {'id': req_id, 'error': str(e)}
+
+            payload = json.dumps(resp, ensure_ascii=False)
+        except json.JSONDecodeError as e:
+            payload = json.dumps({'id': 0, 'error': f'invalid JSON: {e}'})
+        except Exception as e:
+            # Malformed request shape or an unserialisable handler result.
+            payload = json.dumps({'id': req_id, 'error': str(e)})
+        sys.stdout.write(payload + '\n')
+        sys.stdout.flush()
+
+if __name__ == '__main__':
+    main()
diff --git a/src/utils/computerUse/win32/bridgeClient.ts b/src/utils/computerUse/win32/bridgeClient.ts
new file mode 100644
index 000000000..b3f8a3749
--- /dev/null
+++ b/src/utils/computerUse/win32/bridgeClient.ts
@@ -0,0 +1,191 @@
+/**
+ * Python Bridge Client — manages a long-lived Python subprocess for Windows
+ * Computer Use operations.
+ *
+ * Replaces per-call PowerShell spawning with a persistent Python process
+ * that communicates via JSON lines over stdin/stdout.
+ *
+ * Performance: ~1-5ms per call vs ~200-500ms per PowerShell spawn.
+ */
+
+import * as path from 'path'
+
+interface BridgeRequest {
+  id: number
+  method: string
+  params: Record<string, unknown>
+}
+
+interface BridgeResponse {
+  id: number
+  result?: unknown
+  error?: string
+}
+
+// Module-wide bridge state: the child process, the id counter, the callbacks
+// of requests still awaiting a response, and stdout text not yet terminated
+// by a newline.
+let bridgeProc: ReturnType<typeof Bun.spawn> | null = null
+let requestId = 0
+const pendingRequests = new Map<
+  number,
+  {
+    resolve: (value: unknown) => void
+    reject: (error: Error) => void
+  }
+>()
+let outputBuffer = ''
+
+/**
+ * Start the Python bridge process if not already running.
+ *
+ * Spawns `python -u bridge.py` and starts a background loop that splits its
+ * stdout into lines and settles the matching pending request for each JSON
+ * response. When stdout closes, the bridge is marked as gone so the next
+ * call spawns a new one, and every request still pending is rejected.
+ *
+ * @returns true when a bridge process is available, false when spawning threw
+ */
+export function ensureBridge(): boolean {
+  if (bridgeProc) return true
+  try {
+    const scriptPath = path.join(__dirname, 'bridge.py')
+    const proc = Bun.spawn(['python', '-u', scriptPath], {
+      stdin: 'pipe',
+      stdout: 'pipe',
+      stderr: 'ignore',
+      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUNBUFFERED: '1' },
+    })
+    bridgeProc = proc
+
+    // Read stdout lines asynchronously
+    const reader = proc.stdout.getReader()
+    // One streaming decoder for the whole pipe: a multi-byte UTF-8 character
+    // split across two chunks must not be decoded as two broken halves.
+    const decoder = new TextDecoder()
+    const readLoop = async () => {
+      try {
+        while (true) {
+          const { done, value } = await reader.read()
+          if (done) break
+          outputBuffer += decoder.decode(value, { stream: true })
+          // Process complete lines
+          let newlineIdx: number
+          while ((newlineIdx = outputBuffer.indexOf('\n')) !== -1) {
+            const line = outputBuffer.slice(0, newlineIdx).trim()
+            outputBuffer = outputBuffer.slice(newlineIdx + 1)
+            if (!line) continue
+            try {
+              const resp: BridgeResponse = JSON.parse(line)
+              const pending = pendingRequests.get(resp.id)
+              if (pending) {
+                pendingRequests.delete(resp.id)
+                if (resp.error) {
+                  pending.reject(new Error(resp.error))
+                } else {
+                  pending.resolve(resp.result)
+                }
+              }
+            } catch {
+              // Not a JSON response line: ignore it and keep reading.
+            }
+          }
+        }
+      } catch {
+        // A read failure is treated like end-of-stream below.
+      } finally {
+        // Only tear down state that still belongs to this process; a newer
+        // bridge may already have replaced it.
+        if (bridgeProc === proc) {
+          bridgeProc = null
+          outputBuffer = ''
+          const orphaned = [...pendingRequests.values()]
+          pendingRequests.clear()
+          for (const waiter of orphaned) {
+            waiter.reject(new Error('Python bridge exited'))
+          }
+        }
+      }
+    }
+    readLoop()
+
+    return true
+  } catch {
+    bridgeProc = null
+    return false
+  }
+}
+
+/**
+ * Send a request to the Python bridge and wait for the response.
+ *
+ * @param method - Bridge method name (a key of METHODS in bridge.py)
+ * @param params - JSON-serialisable arguments for the method
+ * @param timeoutMs - How long to wait for the response (default 10000)
+ * @returns The `result` field of the bridge response
+ * @throws Error when the bridge cannot be started, the write fails, the
+ *   bridge answers with an `error`, or no answer arrives within `timeoutMs`
+ */
+export async function call<T = unknown>(
+  method: string,
+  params: Record<string, unknown> = {},
+  timeoutMs: number = 10000,
+): Promise<T> {
+  if (!ensureBridge()) {
+    throw new Error('Python bridge not available')
+  }
+
+  const id = ++requestId
+  const req: BridgeRequest = { id, method, params }
+
+  return new Promise<T>((resolve, reject) => {
+    // Timeout
+    const timer = setTimeout(() => {
+      pendingRequests.delete(id)
+      reject(new Error(`Bridge call ${method} timed out after ${timeoutMs}ms`))
+    }, timeoutMs)
+
+    // Register once, with callbacks that also cancel the timeout.
+    pendingRequests.set(id, {
+      resolve: v => {
+        clearTimeout(timer)
+        resolve(v as T)
+      },
+      reject: e => {
+        clearTimeout(timer)
+        reject(e)
+      },
+    })
+
+    try {
+      bridgeProc!.stdin.write(JSON.stringify(req) + '\n')
+      bridgeProc!.stdin.flush()
+    } catch (err) {
+      clearTimeout(timer)
+      pendingRequests.delete(id)
+      reject(new Error(`Bridge write failed: ${err}`))
+    }
+  })
+}
+
+/**
+ * Synchronous call — blocks the event loop. Use sparingly.
+ *
+ * Does not use the long-lived bridge: it spawns a one-shot `python bridge.py`
+ * process, feeds it a single request on stdin and parses the single response.
+ *
+ * @returns The `result` field, or null on any failure (spawn error, empty or
+ *   unparsable output, or an `error` response)
+ */
+export function callSync<T = unknown>(
+  method: string,
+  params: Record<string, unknown> = {},
+  timeoutMs: number = 10000,
+): T | null {
+  // For sync calls, spawn a one-shot Python process.
+  // SECURITY: JSON is passed via stdin (not embedded in -c) to prevent code injection.
+  try {
+    const scriptPath = path.join(__dirname, 'bridge.py')
+    const req = JSON.stringify({ id: 1, method, params })
+    const result = Bun.spawnSync({
+      cmd: ['python', '-u', scriptPath],
+      stdin: Buffer.from(req + '\n'),
+      stdout: 'pipe',
+      stderr: 'pipe',
+      env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
+      timeout: timeoutMs,
+    })
+    const out = new TextDecoder().decode(result.stdout).trim()
+    if (!out) return null
+    const resp: BridgeResponse = JSON.parse(out)
+    if (resp.error) throw new Error(resp.error)
+    return resp.result as T
+  } catch {
+    return null
+  }
+}
+
+/**
+ * Kill the bridge process.
+ *
+ * Requests still waiting for a response are rejected immediately instead of
+ * being left to run into their individual timeouts.
+ */
+export function stopBridge(): void {
+  if (bridgeProc) {
+    try {
+      bridgeProc.stdin.end()
+      bridgeProc.kill()
+    } catch {}
+    bridgeProc = null
+  }
+  const abandoned = [...pendingRequests.values()]
+  pendingRequests.clear()
+  outputBuffer = ''
+  for (const waiter of abandoned) {
+    waiter.reject(new Error('Python bridge stopped'))
+  }
+}
+
+// NOTE: No process exit handlers here — the platform-level win32.ts
+// already registers exit/SIGINT/SIGTERM handlers that call cleanupAll(),
+// which includes stopBridge(). Adding handlers here would cause double
+// cleanup and duplicate process.exit() calls.
diff --git a/src/utils/computerUse/win32/comExcel.ts b/src/utils/computerUse/win32/comExcel.ts
new file mode 100644
index 000000000..26dde56a7
--- /dev/null
+++ b/src/utils/computerUse/win32/comExcel.ts
@@ -0,0 +1,320 @@
+/**
+ * Excel COM automation via PowerShell.
+ * Completely headless — Visible=false, no window, no user impact.
+ * Each operation opens and closes Excel to avoid orphaned processes.
+ */
+
+export interface CellInfo {
+  row: number
+  col: number
+  value: string | number | null
+  formula?: string
+}
+
+export interface SheetInfo {
+  name: string
+  usedRange: { rows: number; cols: number }
+  cells: CellInfo[]
+}
+
+export interface ExcelInfo {
+  sheets: SheetInfo[]
+  sheetNames: string[]
+}
+
+/**
+ * Run a PowerShell script synchronously and return its trimmed stdout.
+ * Throws only when the exit code is non-zero AND stderr is non-empty.
+ */
+function ps(script: string): string {
+  const { exitCode, stdout, stderr } = Bun.spawnSync({
+    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
+    stdout: 'pipe',
+    stderr: 'pipe',
+  })
+  const errText = new TextDecoder().decode(stderr).trim()
+  if (errText && exitCode !== 0) {
+    throw new Error(`PowerShell error: ${errText}`)
+  }
+  return new TextDecoder().decode(stdout).trim()
+}
+
+/** Double every single quote so the path can sit inside a '...' literal. */
+function escPath(p: string): string {
+  return p.split("'").join("''")
+}
+
+/**
+ * Build the PowerShell statement assigning a worksheet to `$<varName>`:
+ * by index for a number, by quoted (and quote-escaped) name for a string.
+ */
+function resolveSheet(varName: string, sheet: string | number): string {
+  const selector =
+    typeof sheet === 'number' ? `${sheet}` : `'${sheet.replace(/'/g, "''")}'`
+  return `$${varName} = $wb.Sheets.Item(${selector})`
+}
+
+// Script prologue: start Excel hidden and with alerts suppressed.
+const EXCEL_INIT = [
+  '$excel = New-Object -ComObject Excel.Application',
+  '$excel.Visible = $false',
+  '$excel.DisplayAlerts = $false',
+].join('\n')
+
+/**
+ * Build the script epilogue: optionally close the workbook without saving,
+ * then quit Excel and release its COM object.
+ */
+function excelCleanup(hasWorkbook = true): string {
+  const closeWorkbook = hasWorkbook ? ['if ($wb) { $wb.Close($false) }'] : []
+  return [
+    ...closeWorkbook,
+    '$excel.Quit()',
+    '[System.Runtime.InteropServices.Marshal]::ReleaseComObject($excel) | Out-Null',
+  ].join('\n ')
+}
+
+/**
+ * Open and read an Excel workbook.
+ * Limits to first 1000 non-empty cells per sheet.
+ */ +export function openExcel(filePath: string): ExcelInfo { + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Open('${escPath(filePath)}') + $result = @{ sheets = @(); sheetNames = @() } + foreach ($sheet in $wb.Sheets) { + $result.sheetNames += $sheet.Name + $ur = $sheet.UsedRange + $rows = $ur.Rows.Count + $cols = $ur.Columns.Count + $cells = @() + $count = 0 + for ($r = 1; $r -le $rows -and $count -lt 1000; $r++) { + for ($c = 1; $c -le $cols -and $count -lt 1000; $c++) { + $cell = $sheet.Cells.Item($r, $c) + $val = $cell.Value2 + if ($null -ne $val) { + $f = $null + if ($cell.HasFormula) { $f = $cell.Formula } + $entry = @{ row = $r; col = $c; value = $val } + if ($f) { $entry.formula = $f } + $cells += $entry + $count++ + } + } + } + $result.sheets += @{ + name = $sheet.Name + usedRange = @{ rows = $rows; cols = $cols } + cells = $cells + } + } + $result | ConvertTo-Json -Depth 5 -Compress +} finally { + ${excelCleanup()} +} +` + const raw = ps(script) + if (!raw) throw new Error('No output from openExcel') + const parsed = JSON.parse(raw) + + // Normalize: PowerShell single-element arrays become objects + const sheets: SheetInfo[] = Array.isArray(parsed.sheets) ? parsed.sheets : [parsed.sheets] + const sheetNames: string[] = Array.isArray(parsed.sheetNames) ? parsed.sheetNames : [parsed.sheetNames] + + return { + sheets: sheets.map((s: any) => ({ + name: s.name, + usedRange: s.usedRange, + cells: Array.isArray(s.cells) ? s.cells : s.cells ? [s.cells] : [], + })), + sheetNames, + } +} + +/** + * Read a single cell value. 
+ */ +export function readCell( + filePath: string, + sheet: string | number, + row: number, + col: number, +): string | number | null { + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Open('${escPath(filePath)}') + ${resolveSheet('sheet', sheet)} + $val = $sheet.Cells.Item(${row}, ${col}).Value2 + if ($null -eq $val) { Write-Output 'null' } else { Write-Output ($val | ConvertTo-Json -Compress) } +} finally { + ${excelCleanup()} +} +` + const raw = ps(script) + if (raw === 'null' || raw === '') return null + return JSON.parse(raw) +} + +/** + * Read a rectangular range of cells as a 2D array. + */ +export function readRange( + filePath: string, + sheet: string | number, + startRow: number, + startCol: number, + endRow: number, + endCol: number, +): (string | number | null)[][] { + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Open('${escPath(filePath)}') + ${resolveSheet('sheet', sheet)} + $rows = @() + for ($r = ${startRow}; $r -le ${endRow}; $r++) { + $row = @() + for ($c = ${startCol}; $c -le ${endCol}; $c++) { + $val = $sheet.Cells.Item($r, $c).Value2 + $row += if ($null -eq $val) { '__NULL__' } else { $val } + } + $rows += ,@($row) + } + $rows | ConvertTo-Json -Depth 3 -Compress +} finally { + ${excelCleanup()} +} +` + const raw = ps(script) + if (!raw) return [] + const parsed = JSON.parse(raw) + // Normalize single-row case + const rows: any[] = Array.isArray(parsed[0]) ? parsed : [parsed] + return rows.map((row: any[]) => + row.map((v: any) => (v === '__NULL__' ? null : v)), + ) +} + +/** + * Write a single cell value. 
+ */ +export function writeCell( + filePath: string, + sheet: string | number, + row: number, + col: number, + value: string | number, +): boolean { + const jsonVal = JSON.stringify(value) + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Open('${escPath(filePath)}') + ${resolveSheet('sheet', sheet)} + $sheet.Cells.Item(${row}, ${col}).Value2 = (ConvertFrom-Json '${jsonVal.replace(/'/g, "''")}') + $wb.Save() + Write-Output 'true' +} finally { + ${excelCleanup()} +} +` + return ps(script) === 'true' +} + +/** + * Write a 2D array of values starting at (startRow, startCol). + */ +export function writeRange( + filePath: string, + sheet: string | number, + startRow: number, + startCol: number, + data: (string | number | null)[][], +): boolean { + const jsonData = JSON.stringify(data).replace(/'/g, "''") + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Open('${escPath(filePath)}') + ${resolveSheet('sheet', sheet)} + $data = ConvertFrom-Json '${jsonData}' + for ($r = 0; $r -lt $data.Count; $r++) { + $row = $data[$r] + for ($c = 0; $c -lt $row.Count; $c++) { + $val = $row[$c] + if ($null -ne $val) { + if ($val -is [int] -or $val -is [long] -or $val -is [double] -or $val -is [decimal]) { + $sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [double]$val + } else { + $sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [string]$val + } + } + } + } + $wb.Save() + Write-Output 'true' +} finally { + ${excelCleanup()} +} +` + return ps(script) === 'true' +} + +/** + * Set a formula on a cell. 
+ */ +export function setFormula( + filePath: string, + sheet: string | number, + row: number, + col: number, + formula: string, +): boolean { + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Open('${escPath(filePath)}') + ${resolveSheet('sheet', sheet)} + $sheet.Cells.Item(${row}, ${col}).Formula = '${formula.replace(/'/g, "''")}' + $wb.Save() + Write-Output 'true' +} finally { + ${excelCleanup()} +} +` + return ps(script) === 'true' +} + +/** + * Save workbook. If savePath is given, SaveAs to that path; otherwise Save in place. + */ +export function saveExcel(filePath: string, savePath?: string): boolean { + const saveCmd = savePath + ? `$wb.SaveAs('${escPath(savePath)}')` + : '$wb.Save()' + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Open('${escPath(filePath)}') + ${saveCmd} + Write-Output 'true' +} finally { + ${excelCleanup()} +} +` + return ps(script) === 'true' +} + +/** + * Create a new empty workbook and save it to the given path. + */ +export function createExcel(savePath: string): boolean { + const script = ` +${EXCEL_INIT} +try { + $wb = $excel.Workbooks.Add() + $wb.SaveAs('${escPath(savePath)}') + Write-Output 'true' +} finally { + ${excelCleanup()} +} +` + return ps(script) === 'true' +} + +/** + * closeExcel is a no-op since each operation opens and closes its own COM instance. + */ +export function closeExcel(_filePath: string): void { + // No-op: each function manages its own Excel lifecycle +} diff --git a/src/utils/computerUse/win32/comWord.ts b/src/utils/computerUse/win32/comWord.ts new file mode 100644 index 000000000..3cae5e2b0 --- /dev/null +++ b/src/utils/computerUse/win32/comWord.ts @@ -0,0 +1,450 @@ +/** + * Word COM automation module for Windows. + * Uses PowerShell to drive Word.Application COM object — fully headless (Visible=false). + * Each function builds a PowerShell script, runs it via Bun.spawnSync, and parses JSON output. 
+ */ + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface WordParagraph { + text: string + bold?: boolean + italic?: boolean + fontSize?: number +} + +export interface WordTable { + rows: number + cols: number + data: string[][] +} + +export interface WordDocInfo { + text: string + paragraphs: WordParagraph[] + tables: WordTable[] + wordCount: number + pageCount: number +} + +export interface AppendTextOptions { + bold?: boolean + italic?: boolean + fontSize?: number + fontName?: string +} + +// --------------------------------------------------------------------------- +// PowerShell runner +// --------------------------------------------------------------------------- + +function runPs(script: string): string { + const result = Bun.spawnSync({ + cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script], + stdout: 'pipe', + stderr: 'pipe', + }) + return new TextDecoder().decode(result.stdout).trim() +} + +function parseJsonOutput(raw: string, fallback: T): T { + if (!raw) return fallback + try { + return JSON.parse(raw) as T + } catch { + return fallback + } +} + +/** Escape a string for safe embedding inside a PowerShell single-quoted string. */ +function psEscape(s: string): string { + return s.replace(/'/g, "''") +} + +// --------------------------------------------------------------------------- +// Word COM wrapper template +// --------------------------------------------------------------------------- + +/** + * Wraps a Word COM script body with standard open/cleanup boilerplate. + * The body receives $word and $doc variables. + * If `openPath` is provided the document is opened; otherwise a new doc is created. + */ +function wrapWordScript(body: string, openPath?: string): string { + const openCmd = openPath + ? 
`$doc = $word.Documents.Open('${psEscape(openPath)}')` + : '$doc = $word.Documents.Add()' + + return ` +$word = New-Object -ComObject Word.Application +$word.Visible = $false +$word.DisplayAlerts = 0 +try { + ${openCmd} + ${body} +} finally { + if ($doc -ne $null) { $doc.Close($false); } + if ($word -ne $null) { $word.Quit(); } + if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null } +} +` +} + +/** + * Same as wrapWordScript but the body is responsible for saving before close. + * After body runs, $doc.Save() is called automatically. + */ +function wrapWordScriptWithSave(body: string, openPath: string): string { + return ` +$word = New-Object -ComObject Word.Application +$word.Visible = $false +$word.DisplayAlerts = 0 +try { + $doc = $word.Documents.Open('${psEscape(openPath)}') + ${body} + $doc.Save() + Write-Output '{"ok":true}' +} catch { + Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}') +} finally { + if ($doc -ne $null) { $doc.Close($false); } + if ($word -ne $null) { $word.Quit(); } + if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null } +} +` +} + +// --------------------------------------------------------------------------- +// 1. 
openWord +// --------------------------------------------------------------------------- + +export async function openWord(filePath: string): Promise { + const script = wrapWordScript( + ` + # Paragraphs (limit 500) + $paras = @() + $paraCount = $doc.Paragraphs.Count + $limit = [Math]::Min($paraCount, 500) + for ($i = 1; $i -le $limit; $i++) { + $p = $doc.Paragraphs.Item($i) + $r = $p.Range + $paras += @{ + text = $r.Text -replace '\\r$','' + bold = [bool]($r.Font.Bold -eq -1) + italic = [bool]($r.Font.Italic -eq -1) + fontSize = $r.Font.Size + } + } + + # Tables + $tables = @() + foreach ($table in $doc.Tables) { + $rows = $table.Rows.Count + $cols = $table.Columns.Count + $data = @() + for ($r = 1; $r -le $rows; $r++) { + $row = @() + for ($c = 1; $c -le $cols; $c++) { + try { + $cellText = $table.Cell($r, $c).Range.Text + # Trim trailing \\r\\a that Word adds to cell text + $cellText = $cellText -replace '[\\r\\n\\a]+$','' + $row += $cellText + } catch { + $row += '' + } + } + $data += ,@($row) + } + $tables += @{ rows = $rows; cols = $cols; data = $data } + } + + # Counts: wdStatisticWords=0, wdStatisticPages=2 + $wordCount = $doc.ComputeStatistics(0) + $pageCount = $doc.ComputeStatistics(2) + + $result = @{ + text = $doc.Content.Text + paragraphs = $paras + tables = $tables + wordCount = $wordCount + pageCount = $pageCount + } + Write-Output (ConvertTo-Json $result -Depth 5 -Compress) +`, + filePath, + ) + + const raw = runPs(script) + return parseJsonOutput(raw, { + text: '', + paragraphs: [], + tables: [], + wordCount: 0, + pageCount: 0, + }) +} + +// --------------------------------------------------------------------------- +// 2. 
readText +// --------------------------------------------------------------------------- + +export async function readText(filePath: string): Promise { + const script = wrapWordScript( + `Write-Output $doc.Content.Text`, + filePath, + ) + return runPs(script) +} + +// --------------------------------------------------------------------------- +// 3. appendText +// --------------------------------------------------------------------------- + +export async function appendText( + filePath: string, + text: string, + opts?: AppendTextOptions, +): Promise { + const fontSetup = opts + ? [ + opts.bold !== undefined ? `$sel.Font.Bold = ${opts.bold ? '-1' : '0'}` : '', + opts.italic !== undefined ? `$sel.Font.Italic = ${opts.italic ? '-1' : '0'}` : '', + opts.fontSize !== undefined ? `$sel.Font.Size = ${opts.fontSize}` : '', + opts.fontName ? `$sel.Font.Name = '${psEscape(opts.fontName)}'` : '', + ] + .filter(Boolean) + .join('\n ') + : '' + + const body = ` + $sel = $word.Selection + $sel.EndKey(6) | Out-Null + ${fontSetup} + $sel.TypeText('${psEscape(text)}') +` + + const script = wrapWordScriptWithSave(body, filePath) + const raw = runPs(script) + return parseJsonOutput<{ ok: boolean }>(raw, { ok: false }).ok +} + +// --------------------------------------------------------------------------- +// 4. insertText +// --------------------------------------------------------------------------- + +export async function insertText( + filePath: string, + paraIndex: number, + text: string, +): Promise { + const body = ` + $doc.Paragraphs.Item(${paraIndex}).Range.InsertBefore('${psEscape(text)}') +` + const script = wrapWordScriptWithSave(body, filePath) + const raw = runPs(script) + return parseJsonOutput<{ ok: boolean }>(raw, { ok: false }).ok +} + +// --------------------------------------------------------------------------- +// 5. 
findReplace +// --------------------------------------------------------------------------- + +export async function findReplace( + filePath: string, + find: string, + replace: string, + replaceAll?: boolean, +): Promise { + // wdReplaceAll=2, wdReplaceOne=1 + const replaceConst = replaceAll !== false ? 2 : 1 + + const body = ` + $content = $doc.Content + $findObj = $content.Find + $findObj.ClearFormatting() + $findObj.Replacement.ClearFormatting() + + # Count replacements by iterating + $count = 0 + $findObj.Text = '${psEscape(find)}' + $findObj.Replacement.Text = '${psEscape(replace)}' + $findObj.Forward = $true + $findObj.Wrap = 0 + $findObj.Format = $false + $findObj.MatchCase = $false + $findObj.MatchWholeWord = $false + $findObj.MatchWildcards = $false + + if (${replaceConst} -eq 2) { + # Count occurrences first using a clone of content + $range2 = $doc.Content.Duplicate + while ($range2.Find.Execute('${psEscape(find)}')) { $count++ } + # Now do the actual replace + $findObj.Execute('${psEscape(find)}', $false, $false, $false, $false, $false, $true, 0, $false, '${psEscape(replace)}', 2) + } else { + $found = $findObj.Execute('${psEscape(find)}', $false, $false, $false, $false, $false, $true, 0, $false, '${psEscape(replace)}', 1) + if ($found) { $count = 1 } + } +` + + const script = ` +$word = New-Object -ComObject Word.Application +$word.Visible = $false +$word.DisplayAlerts = 0 +try { + $doc = $word.Documents.Open('${psEscape(filePath)}') + ${body} + $doc.Save() + Write-Output ('{"count":' + $count + '}') +} catch { + Write-Output '{"count":0}' +} finally { + if ($doc -ne $null) { $doc.Close($false); } + if ($word -ne $null) { $word.Quit(); } + if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null } +} +` + + const raw = runPs(script) + return parseJsonOutput<{ count: number }>(raw, { count: 0 }).count +} + +// --------------------------------------------------------------------------- +// 6. 
insertTable +// --------------------------------------------------------------------------- + +export async function insertTable( + filePath: string, + rows: number, + cols: number, + data: string[][], +): Promise { + // Build PowerShell array literal for the data + const psData = data + .map( + (row) => + ',@(' + row.map((cell) => `'${psEscape(cell)}'`).join(',') + ')', + ) + .join('\n ') + + const body = ` + $sel = $word.Selection + $sel.EndKey(6) | Out-Null + $table = $doc.Tables.Add($sel.Range, ${rows}, ${cols}) + $data = @(${psData}) + for ($r = 0; $r -lt $data.Count; $r++) { + for ($c = 0; $c -lt $data[$r].Count; $c++) { + $table.Cell($r + 1, $c + 1).Range.Text = $data[$r][$c] + } + } +` + + const script = wrapWordScriptWithSave(body, filePath) + const raw = runPs(script) + return parseJsonOutput<{ ok: boolean }>(raw, { ok: false }).ok +} + +// --------------------------------------------------------------------------- +// 7. saveWord +// --------------------------------------------------------------------------- + +export async function saveWord( + filePath: string, + savePath?: string, +): Promise { + if (!savePath || savePath === filePath) { + const script = wrapWordScriptWithSave('', filePath) + const raw = runPs(script) + return parseJsonOutput<{ ok: boolean }>(raw, { ok: false }).ok + } + + const body = `$doc.SaveAs('${psEscape(savePath)}')` + const script = ` +$word = New-Object -ComObject Word.Application +$word.Visible = $false +$word.DisplayAlerts = 0 +try { + $doc = $word.Documents.Open('${psEscape(filePath)}') + ${body} + Write-Output '{"ok":true}' +} catch { + Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}') +} finally { + if ($doc -ne $null) { $doc.Close($false); } + if ($word -ne $null) { $word.Quit(); } + if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null } +} +` + const raw = runPs(script) + return parseJsonOutput<{ ok: boolean }>(raw, { ok: false }).ok 
+} + +// --------------------------------------------------------------------------- +// 8. saveAsPdf +// --------------------------------------------------------------------------- + +export async function saveAsPdf( + filePath: string, + pdfPath: string, +): Promise { + // wdFormatPDF = 17 + const body = `$doc.SaveAs2('${psEscape(pdfPath)}', 17)` + + const script = ` +$word = New-Object -ComObject Word.Application +$word.Visible = $false +$word.DisplayAlerts = 0 +try { + $doc = $word.Documents.Open('${psEscape(filePath)}') + ${body} + Write-Output '{"ok":true}' +} catch { + Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}') +} finally { + if ($doc -ne $null) { $doc.Close($false); } + if ($word -ne $null) { $word.Quit(); } + if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null } +} +` + const raw = runPs(script) + return parseJsonOutput<{ ok: boolean }>(raw, { ok: false }).ok +} + +// --------------------------------------------------------------------------- +// 9. createWord +// --------------------------------------------------------------------------- + +export async function createWord(savePath: string): Promise { + const script = ` +$word = New-Object -ComObject Word.Application +$word.Visible = $false +$word.DisplayAlerts = 0 +try { + $doc = $word.Documents.Add() + $doc.SaveAs('${psEscape(savePath)}') + Write-Output '{"ok":true}' +} catch { + Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}') +} finally { + if ($doc -ne $null) { $doc.Close($false); } + if ($word -ne $null) { $word.Quit(); } + if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null } +} +` + const raw = runPs(script) + return parseJsonOutput<{ ok: boolean }>(raw, { ok: false }).ok +} + +// --------------------------------------------------------------------------- +// 10. 
closeWord (no-op) +// --------------------------------------------------------------------------- + +/** + * closeWord is a no-op since each operation opens and closes its own COM instance. + */ +export function closeWord(_filePath: string): void { + // No-op: each function manages its own Word lifecycle +} diff --git a/src/utils/computerUse/win32/inputIndicator.ts b/src/utils/computerUse/win32/inputIndicator.ts new file mode 100644 index 000000000..0a6f6da0c --- /dev/null +++ b/src/utils/computerUse/win32/inputIndicator.ts @@ -0,0 +1,254 @@ +/** + * Input Indicator — floating label showing what Computer Use is doing + * on the bound window. + * + * Displays a small overlay near the bottom of the bound window: + * ⌨ Typing "hello world..." + * 🖱 Click (120, 50) + * ⌨ Ctrl+S + * 📜 Scroll ↓ 3 + * ✅ Done + * + * Auto-fades after 2 seconds of inactivity. + * Click-through, TOPMOST, no taskbar icon. + */ + +import * as fs from 'fs' +import * as path from 'path' +import { validateHwnd, getTmpDir } from './shared.js' + +const INDICATOR_WIDTH = 350 +const INDICATOR_HEIGHT = 28 +const FADE_AFTER_MS = 2000 +const BG_COLOR = '30, 30, 30' // dark background +const TEXT_COLOR = '220, 220, 220' // light text +const ACCENT_COLOR = '80, 200, 80' // green accent for active + +let indicatorProc: ReturnType | null = null +let stopFile: string | null = null +let scriptFile: string | null = null +let msgFile: string | null = null + +function buildIndicatorScript(hwnd: string, sf: string): string { + const sfEsc = sf.replace(/\\/g, '\\\\') + return ` +Add-Type -AssemblyName System.Windows.Forms +Add-Type -AssemblyName System.Drawing +Add-Type @' +using System; +using System.Runtime.InteropServices; +public class Indicator { + [DllImport("user32.dll")] public static extern bool IsWindow(IntPtr h); + [DllImport("user32.dll",SetLastError=true)] public static extern int SetWindowLong(IntPtr h, int i, int v); + [DllImport("user32.dll",SetLastError=true)] public static extern int 
GetWindowLong(IntPtr h, int i); + [DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f); + [DllImport("user32.dll")] public static extern bool GetWindowRect(IntPtr h, out RECT r); + [StructLayout(LayoutKind.Sequential)] public struct RECT { public int L,T,R,B; } + public const int GWL_EXSTYLE = -20; + public const int WS_EX_LAYERED = 0x80000; + public const int WS_EX_TRANSPARENT = 0x20; + public const int WS_EX_TOOLWINDOW = 0x80; + public const int WS_EX_NOACTIVATE = 0x08000000; + public static readonly IntPtr HWND_TOPMOST = new IntPtr(-1); + public const uint SWP_NOACTIVATE = 0x0010; + public const uint SWP_SHOWWINDOW = 0x0040; + public static void MakeOverlay(IntPtr h) { + int ex = GetWindowLong(h, GWL_EXSTYLE); + ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE; + SetWindowLong(h, GWL_EXSTYLE, ex); + } +} +'@ + +$targetHwnd = [IntPtr]::new([long]${hwnd}) +$stopFile = '${sfEsc}' +$msgFile = $stopFile + '.msg' + +$form = New-Object System.Windows.Forms.Form +$form.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None +$form.ShowInTaskbar = $false +$form.TopMost = $true +$form.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual +$form.Size = New-Object System.Drawing.Size(${INDICATOR_WIDTH}, ${INDICATOR_HEIGHT}) +$form.Location = New-Object System.Drawing.Point(-32000, -32000) +$form.BackColor = [System.Drawing.Color]::FromArgb(240, ${BG_COLOR}) +$form.Opacity = 0.92 + +$label = New-Object System.Windows.Forms.Label +$label.Dock = [System.Windows.Forms.DockStyle]::Fill +$label.ForeColor = [System.Drawing.Color]::FromArgb(${TEXT_COLOR}) +$label.Font = New-Object System.Drawing.Font("Segoe UI", 10, [System.Drawing.FontStyle]::Regular) +$label.TextAlign = [System.Drawing.ContentAlignment]::MiddleLeft +$label.Padding = New-Object System.Windows.Forms.Padding(8, 0, 8, 0) +$label.Text = "" +$form.Controls.Add($label) + +$form.Show() 
+[Indicator]::MakeOverlay($form.Handle) + +$script:lastMsg = "" +$script:lastMsgTime = [DateTime]::MinValue +$script:visible = $false + +$timer = New-Object System.Windows.Forms.Timer +$timer.Interval = 50 # 20fps + +$timer.Add_Tick({ + if (-not [Indicator]::IsWindow($targetHwnd)) { + $timer.Stop(); $form.Close() + [System.Windows.Forms.Application]::ExitThread() + return + } + if (Test-Path $stopFile) { + $timer.Stop(); $form.Close() + try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {} + try { Remove-Item $msgFile -ErrorAction SilentlyContinue } catch {} + [System.Windows.Forms.Application]::ExitThread() + return + } + + # Read new message + if (Test-Path $msgFile) { + try { + $msg = Get-Content $msgFile -Raw -Encoding UTF8 -ErrorAction SilentlyContinue + if ($msg) { + $script:lastMsg = $msg.Trim() + $script:lastMsgTime = [DateTime]::Now + Remove-Item $msgFile -ErrorAction SilentlyContinue + } + } catch {} + } + + # Fade logic: hide after ${FADE_AFTER_MS}ms of no updates + $elapsed = ([DateTime]::Now - $script:lastMsgTime).TotalMilliseconds + if ($elapsed -gt ${FADE_AFTER_MS} -and $script:visible) { + $form.Visible = $false + $script:visible = $false + return + } + if ($elapsed -le ${FADE_AFTER_MS} -and $script:lastMsg -ne "") { + # Position at bottom-center of the bound window + $wr = New-Object Indicator+RECT + [Indicator]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null + $ww = $wr.R - $wr.L + $fx = $wr.L + [int](($ww - ${INDICATOR_WIDTH}) / 2) + $fy = $wr.B - ${INDICATOR_HEIGHT} - 8 + $label.Text = $script:lastMsg + [Indicator]::SetWindowPos($form.Handle, [Indicator]::HWND_TOPMOST, + $fx, $fy, 0, 0, + 0x0001 -bor [Indicator]::SWP_NOACTIVATE -bor [Indicator]::SWP_SHOWWINDOW) | Out-Null + $form.Visible = $true + $script:visible = $true + # Fade opacity near end + if ($elapsed -gt ${FADE_AFTER_MS * 0.7}) { + $form.Opacity = [Math]::Max(0.3, 0.92 * (1.0 - ($elapsed - ${FADE_AFTER_MS * 0.7}) / ${FADE_AFTER_MS * 0.3})) + } else { + $form.Opacity = 
0.92 + } + } +}) + +$timer.Start() +[System.Windows.Forms.Application]::Run() +` +} + +/** Start the input indicator for a bound window */ +export function showIndicator(hwnd: string): boolean { + hwnd = validateHwnd(hwnd) + hideIndicator() + try { + const tmpDir = getTmpDir() + const ts = Date.now() + stopFile = path.join(tmpDir, `cu_indicator_stop_${ts}`) + scriptFile = path.join(tmpDir, `cu_indicator_${ts}.ps1`) + msgFile = stopFile + '.msg' + fs.writeFileSync(scriptFile, buildIndicatorScript(hwnd, stopFile), 'utf-8') + indicatorProc = Bun.spawn( + [ + 'powershell', + '-NoProfile', + '-ExecutionPolicy', + 'Bypass', + '-File', + scriptFile, + ], + { stdout: 'ignore', stderr: 'ignore' }, + ) + return true + } catch { + return false + } +} + +/** Update the indicator message */ +export function updateIndicator(message: string): void { + if (!msgFile) return + try { + fs.writeFileSync(msgFile, message, 'utf-8') + } catch {} +} + +/** Hide and destroy the indicator */ +export function hideIndicator(): void { + if (stopFile) { + try { + fs.writeFileSync(stopFile, 'STOP', 'utf-8') + } catch {} + setTimeout(() => { + try { + indicatorProc?.kill() + } catch {} + try { + if (scriptFile) fs.unlinkSync(scriptFile) + } catch {} + try { + if (stopFile) fs.unlinkSync(stopFile) + } catch {} + try { + if (msgFile) fs.unlinkSync(msgFile) + } catch {} + }, 2000) + } + indicatorProc = null + stopFile = null + scriptFile = null + msgFile = null +} + +// ── Convenience methods for common actions ── + +export function indicateTyping(text: string): void { + const preview = text.length > 30 ? text.slice(0, 30) + '...' : text + updateIndicator(`\u2328 Typing "${preview}"`) +} + +export function indicateKey(combo: string): void { + updateIndicator(`\u2328 ${combo}`) +} + +export function indicateClick( + x: number, + y: number, + button: string = 'left', +): void { + updateIndicator( + `\uD83D\uDDB1 ${button === 'right' ? 
'Right-click' : 'Click'} (${x}, ${y})`, + ) +} + +export function indicateScroll(direction: string, amount: number): void { + const arrow = + direction === 'up' + ? '\u2191' + : direction === 'down' + ? '\u2193' + : direction === 'left' + ? '\u2190' + : '\u2192' + updateIndicator(`\uD83D\uDCDC Scroll ${arrow} ${amount}`) +} + +export function indicateDone(): void { + updateIndicator('\u2705 Done') +} diff --git a/src/utils/computerUse/win32/ocr.ts b/src/utils/computerUse/win32/ocr.ts index 69ca3a6e1..02536d919 100644 --- a/src/utils/computerUse/win32/ocr.ts +++ b/src/utils/computerUse/win32/ocr.ts @@ -3,6 +3,8 @@ * Captures a screen region or window, then runs WinRT OCR to extract text. */ +import { ps as runPs } from './shared.js' + export interface OcrLine { text: string bounds: { x: number; y: number; w: number; h: number } @@ -18,15 +20,6 @@ function emptyResult(language: string): OcrResult { return { text: '', lines: [], language } } -function runPs(script: string): string { - const result = Bun.spawnSync({ - cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script], - stdout: 'pipe', - stderr: 'pipe', - }) - return new TextDecoder().decode(result.stdout).trim() -} - /** * PowerShell script that: * 1. Screenshots a screen region using CopyFromScreen diff --git a/src/utils/computerUse/win32/shared.ts b/src/utils/computerUse/win32/shared.ts new file mode 100644 index 000000000..fbbd1ac4a --- /dev/null +++ b/src/utils/computerUse/win32/shared.ts @@ -0,0 +1,127 @@ +/** + * Shared utilities for win32 Computer Use modules. + * Single source of truth — no more duplication across files. + */ + +/** Validate HWND is a pure numeric string — prevents PowerShell/Python injection. */ +export function validateHwnd(hwnd: string): string { + if (!/^\d+$/.test(hwnd)) { + throw new Error(`Invalid HWND: "${hwnd}" — must be numeric`) + } + return hwnd +} + +/** Run a PowerShell script synchronously, return stdout trimmed. 
*/ +export function ps(script: string): string { + const result = Bun.spawnSync({ + cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script], + stdout: 'pipe', + stderr: 'pipe', + }) + return new TextDecoder().decode(result.stdout).trim() +} + +/** Run a PowerShell script synchronously, return null on failure. */ +export function runPs(script: string): string | null { + try { + const result = Bun.spawnSync({ + cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script], + stdout: 'pipe', + stderr: 'pipe', + }) + if (result.exitCode !== 0) return null + return new TextDecoder().decode(result.stdout).trim() + } catch { + return null + } +} + +/** Run a PowerShell script asynchronously. */ +export async function psAsync(script: string): Promise { + const proc = Bun.spawn( + ['powershell', '-NoProfile', '-NonInteractive', '-Command', script], + { stdout: 'pipe', stderr: 'pipe' }, + ) + const out = await new Response(proc.stdout).text() + await proc.exited + return out.trim() +} + +/** Get the system temp directory. */ +export function getTmpDir(): string { + return process.env.TEMP || process.env.TMP || '/tmp' +} + +/** Virtual key code mapping — canonical, complete. 
*/ +export const VK_MAP: Record = { + backspace: 0x08, + tab: 0x09, + enter: 0x0d, + return: 0x0d, + shift: 0x10, + lshift: 0xa0, + rshift: 0xa1, + ctrl: 0x11, + control: 0x11, + lcontrol: 0xa2, + rcontrol: 0xa3, + alt: 0x12, + option: 0x12, + menu: 0x12, + lalt: 0xa4, + ralt: 0xa5, + pause: 0x13, + capslock: 0x14, + escape: 0x1b, + esc: 0x1b, + space: 0x20, + pageup: 0x21, + pagedown: 0x22, + end: 0x23, + home: 0x24, + left: 0x25, + up: 0x26, + right: 0x27, + down: 0x28, + insert: 0x2d, + delete: 0x2e, + win: 0x5b, + meta: 0x5b, + command: 0x5b, + cmd: 0x5b, + super: 0x5b, + numlock: 0x90, + scrolllock: 0x91, + printscreen: 0x2c, + f1: 0x70, + f2: 0x71, + f3: 0x72, + f4: 0x73, + f5: 0x74, + f6: 0x75, + f7: 0x76, + f8: 0x77, + f9: 0x78, + f10: 0x79, + f11: 0x7a, + f12: 0x7b, +} + +export const MODIFIER_KEYS = new Set([ + 'shift', + 'lshift', + 'rshift', + 'control', + 'ctrl', + 'lcontrol', + 'rcontrol', + 'alt', + 'option', + 'lalt', + 'ralt', + 'win', + 'meta', + 'command', + 'cmd', + 'super', +]) diff --git a/src/utils/computerUse/win32/uiAutomation.ts b/src/utils/computerUse/win32/uiAutomation.ts index 292d7e646..51a6f961e 100644 --- a/src/utils/computerUse/win32/uiAutomation.ts +++ b/src/utils/computerUse/win32/uiAutomation.ts @@ -5,6 +5,8 @@ * value setting, and hit-testing via PowerShell + System.Windows.Automation. */ +import { ps } from './shared.js' + export interface UIElement { name: string controlType: string // Button, Edit, Text, List, Window, etc. 
@@ -15,6 +17,48 @@ export interface UIElement { children?: UIElement[] } +const VALID_CONTROL_TYPES = new Set([ + 'Button', + 'Calendar', + 'CheckBox', + 'ComboBox', + 'Custom', + 'DataGrid', + 'DataItem', + 'Document', + 'Edit', + 'Group', + 'Header', + 'HeaderItem', + 'Hyperlink', + 'Image', + 'List', + 'ListItem', + 'Menu', + 'MenuBar', + 'MenuItem', + 'Pane', + 'ProgressBar', + 'RadioButton', + 'ScrollBar', + 'Separator', + 'Slider', + 'Spinner', + 'SplitButton', + 'StatusBar', + 'Tab', + 'TabItem', + 'Table', + 'Text', + 'Thumb', + 'TitleBar', + 'ToolBar', + 'ToolTip', + 'Tree', + 'TreeItem', + 'Window', +]) + // --------------------------------------------------------------------------- // Helper // --------------------------------------------------------------------------- @@ -25,15 +69,6 @@ Add-Type -AssemblyName UIAutomationTypes Add-Type -AssemblyName WindowsBase ` -function ps(script: string): string { - const result = Bun.spawnSync({ - cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script], - stdout: 'pipe', - stderr: 'pipe', - }) - return new TextDecoder().decode(result.stdout).trim() -} - function parseJsonSafe(raw: string, fallback: T): T { try { if (!raw) return fallback @@ -143,6 +178,9 @@ export function findElement( ) } if (query.controlType) { + if (!VALID_CONTROL_TYPES.has(query.controlType)) { + return null // Invalid control type + } const v = query.controlType.replace(/'/g, "''") conditions.push( `[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::ControlTypeProperty, [System.Windows.Automation.ControlType]::${v})`, @@ -204,7 +242,10 @@ $obj | ConvertTo-Json -Compress /** * Click an element by its automationId using InvokePattern. 
*/ -export function clickElement(windowTitle: string, automationId: string): boolean { +export function clickElement( + windowTitle: string, + automationId: string, +): boolean { const escapedTitle = windowTitle.replace(/'/g, "''") const escapedId = automationId.replace(/'/g, "''") @@ -237,7 +278,11 @@ try { /** * Set the value of an element by its automationId using ValuePattern. */ -export function setValue(windowTitle: string, automationId: string, value: string): boolean { +export function setValue( + windowTitle: string, + automationId: string, + value: string, +): boolean { const escapedTitle = windowTitle.replace(/'/g, "''") const escapedId = automationId.replace(/'/g, "''") const escapedValue = value.replace(/'/g, "''") diff --git a/src/utils/computerUse/win32/virtualCursor.ts b/src/utils/computerUse/win32/virtualCursor.ts new file mode 100644 index 000000000..52b14ff2e --- /dev/null +++ b/src/utils/computerUse/win32/virtualCursor.ts @@ -0,0 +1,268 @@ +/** + * Virtual Cursor — visible overlay cursor for the bound window. + * + * Shows a small colored cursor icon on top of the bound window, + * independent of the real mouse cursor. The user's real mouse + * stays free for their own use. 
+ * + * The virtual cursor: + * - Moves when Computer Use calls click/moveMouse + * - Shows click animations (brief color flash) + * - Is click-through (WS_EX_TRANSPARENT) — doesn't intercept real mouse + * - Tracks the bound window position via the border tracker + * - Disappears when the window is unbound + */ + +import * as fs from 'fs' +import * as path from 'path' +import { validateHwnd, getTmpDir } from './shared.js' + +const CURSOR_SIZE = 20 +const CURSOR_COLOR_R = 255 +const CURSOR_COLOR_G = 50 +const CURSOR_COLOR_B = 50 +const CURSOR_OPACITY = 0.9 + +let cursorProc: ReturnType | null = null +let cursorStopFile: string | null = null +let cursorScriptFile: string | null = null + +function buildCursorScript(hwnd: string, stopFile: string): string { + const stopFileEscaped = stopFile.replace(/\\/g, '\\\\') + return ` +Add-Type -AssemblyName System.Windows.Forms +Add-Type -AssemblyName System.Drawing +Add-Type @' +using System; +using System.Runtime.InteropServices; +using System.Drawing; +using System.Drawing.Drawing2D; + +public class VCursor { + [DllImport("user32.dll")] + public static extern bool IsWindow(IntPtr hWnd); + + [DllImport("user32.dll", SetLastError = true)] + public static extern int SetWindowLong(IntPtr hWnd, int nIndex, int dwNewLong); + + [DllImport("user32.dll", SetLastError = true)] + public static extern int GetWindowLong(IntPtr hWnd, int nIndex); + + [DllImport("user32.dll")] + public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint f); + + [DllImport("user32.dll")] + public static extern bool GetWindowRect(IntPtr h, out RECT r); + + [StructLayout(LayoutKind.Sequential)] + public struct RECT { public int L, T, R, B; } + + public const int GWL_EXSTYLE = -20; + public const int WS_EX_LAYERED = 0x80000; + public const int WS_EX_TRANSPARENT = 0x20; + public const int WS_EX_TOOLWINDOW = 0x80; + public const int WS_EX_NOACTIVATE = 0x08000000; + public static readonly IntPtr HWND_TOPMOST = new 
IntPtr(-1); + public const uint SWP_NOACTIVATE = 0x0010; + public const uint SWP_SHOWWINDOW = 0x0040; + public const uint SWP_NOSIZE = 0x0001; + + public static void MakeOverlay(IntPtr h) { + int ex = GetWindowLong(h, GWL_EXSTYLE); + ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE; + SetWindowLong(h, GWL_EXSTYLE, ex); + } +} +'@ + +$targetHwnd = [IntPtr]::new([long]${hwnd}) +$stopFile = '${stopFileEscaped}' +$cursorSize = ${CURSOR_SIZE} + +# Create cursor form with arrow shape +$cursor = New-Object System.Windows.Forms.Form +$cursor.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None +$cursor.ShowInTaskbar = $false +$cursor.TopMost = $true +$cursor.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual +$cursor.Size = New-Object System.Drawing.Size($cursorSize, $cursorSize) +$cursor.Location = New-Object System.Drawing.Point(-32000, -32000) +$cursor.Opacity = ${CURSOR_OPACITY} +$cursor.BackColor = [System.Drawing.Color]::Magenta +$cursor.TransparencyKey = [System.Drawing.Color]::Magenta + +# Draw arrow cursor shape +$bmp = New-Object System.Drawing.Bitmap($cursorSize, $cursorSize) +$g = [System.Drawing.Graphics]::FromImage($bmp) +$g.SmoothingMode = [System.Drawing.Drawing2D.SmoothingMode]::AntiAlias +# Arrow polygon (pointing top-left) +$points = @( + (New-Object System.Drawing.Point(1, 1)), + (New-Object System.Drawing.Point(1, 16)), + (New-Object System.Drawing.Point(5, 12)), + (New-Object System.Drawing.Point(9, 18)), + (New-Object System.Drawing.Point(12, 16)), + (New-Object System.Drawing.Point(8, 10)), + (New-Object System.Drawing.Point(13, 10)), + (New-Object System.Drawing.Point(1, 1)) +) +$brush = New-Object System.Drawing.SolidBrush([System.Drawing.Color]::FromArgb(${CURSOR_COLOR_R}, ${CURSOR_COLOR_G}, ${CURSOR_COLOR_B})) +$g.FillPolygon($brush, $points) +$pen = New-Object System.Drawing.Pen([System.Drawing.Color]::White, 1) +$g.DrawPolygon($pen, $points) +$g.Dispose() +$cursor.BackgroundImage = $bmp + 
+$cursor.Show() +[VCursor]::MakeOverlay($cursor.Handle) + +# Position file: the TS side writes "x,y" or "x,y,click" to this file +$posFile = $stopFile + '.pos' + +$script:lastCX = -32000 +$script:lastCY = -32000 +$script:clickFlash = 0 + +$timer = New-Object System.Windows.Forms.Timer +$timer.Interval = 16 # ~60fps + +$timer.Add_Tick({ + if (-not [VCursor]::IsWindow($targetHwnd)) { + $timer.Stop(); $cursor.Close() + [System.Windows.Forms.Application]::ExitThread() + return + } + # Check stop + if (Test-Path $stopFile) { + $timer.Stop(); $cursor.Close() + try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {} + try { Remove-Item $posFile -ErrorAction SilentlyContinue } catch {} + [System.Windows.Forms.Application]::ExitThread() + return + } + # Read position updates + if (Test-Path $posFile) { + try { + $data = Get-Content $posFile -Raw -ErrorAction SilentlyContinue + if ($data) { + $parts = $data.Trim().Split(',') + if ($parts.Length -ge 2) { + $script:lastCX = [int]$parts[0] + $script:lastCY = [int]$parts[1] + if ($parts.Length -ge 3 -and $parts[2] -eq 'click') { + $script:clickFlash = 6 # flash for 6 frames (~100ms) + } + } + Remove-Item $posFile -ErrorAction SilentlyContinue + } + } catch {} + } + + # Get window position to convert client coords to screen coords + $wr = New-Object VCursor+RECT + [VCursor]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null + $screenX = $wr.L + $script:lastCX + $screenY = $wr.T + $script:lastCY + + # Click flash: briefly change color + if ($script:clickFlash -gt 0) { + $cursor.Opacity = 1.0 + $script:clickFlash-- + if ($script:clickFlash -eq 0) { + $cursor.Opacity = ${CURSOR_OPACITY} + } + } + + [VCursor]::SetWindowPos($cursor.Handle, [VCursor]::HWND_TOPMOST, + $screenX, $screenY, 0, 0, + [VCursor]::SWP_NOSIZE -bor [VCursor]::SWP_NOACTIVATE -bor [VCursor]::SWP_SHOWWINDOW) | Out-Null + $cursor.Visible = $true +}) + +$timer.Start() +[System.Windows.Forms.Application]::Run() +` +} + +/** + * Start the virtual cursor 
overlay for a bound window.
+ */
+export function showVirtualCursor(hwnd: string): boolean {
+  hwnd = validateHwnd(hwnd)
+  hideVirtualCursor()
+  try {
+    const ts = Date.now()
+    const stopFile = path.join(getTmpDir(), `cu_vcursor_stop_${ts}`)
+    const scriptFile = path.join(getTmpDir(), `cu_vcursor_${ts}.ps1`)
+    const script = buildCursorScript(hwnd, stopFile)
+    // Leading BOM: Windows PowerShell decodes BOM-less .ps1 files as ANSI, garbling non-ASCII temp paths.
+    fs.writeFileSync(scriptFile, '\ufeff' + script, 'utf-8')
+
+    cursorProc = Bun.spawn(
+      [
+        'powershell',
+        '-NoProfile',
+        '-ExecutionPolicy',
+        'Bypass',
+        '-File',
+        scriptFile,
+      ],
+      { stdout: 'ignore', stderr: 'ignore' },
+    )
+    cursorStopFile = stopFile
+    cursorScriptFile = scriptFile
+    return true
+  } catch {
+    return false
+  }
+}
+
+/**
+ * Move the virtual cursor to client-area coordinates.
+ */
+export function moveVirtualCursor(
+  x: number,
+  y: number,
+  isClick: boolean = false,
+): void {
+  if (!cursorStopFile) return
+  const posFile = cursorStopFile + '.pos'
+  try {
+    const data = isClick
+      ? `${Math.round(x)},${Math.round(y)},click`
+      : `${Math.round(x)},${Math.round(y)}`
+    fs.writeFileSync(posFile, data, 'utf-8')
+  } catch {}
+}
+
+/**
+ * Signal the overlay script to stop, then force-kill it and delete its temp files after 2s.
+ */
+export function hideVirtualCursor(): void {
+  // Snapshot + reset first so the delayed cleanup never touches a newer session.
+  const proc = cursorProc
+  const stopFile = cursorStopFile
+  const scriptFile = cursorScriptFile
+  cursorProc = cursorStopFile = cursorScriptFile = null
+  if (!stopFile) return
+  try {
+    fs.writeFileSync(stopFile, 'STOP', 'utf-8')
+  } catch {}
+  setTimeout(() => {
+    try {
+      proc?.kill()
+    } catch {}
+    for (const file of [scriptFile, stopFile]) {
+      try {
+        if (file) fs.unlinkSync(file)
+      } catch {}
+    }
+  }, 2000)
+}
+
+/**
+ * Check if virtual cursor is active.
+ */ +export function isVirtualCursorActive(): boolean { + return cursorProc !== null +} diff --git a/src/utils/computerUse/win32/windowBorder.ts b/src/utils/computerUse/win32/windowBorder.ts new file mode 100644 index 000000000..a11a29095 --- /dev/null +++ b/src/utils/computerUse/win32/windowBorder.ts @@ -0,0 +1,66 @@ +/** + * Visual indicator for bound windows — DWM native border color. + * + * Uses DwmSetWindowAttribute(DWMWA_BORDER_COLOR) to set a green border + * on the bound window. The border: + * - Is the window's OWN border, not an overlay — zero offset, zero shadow issues + * - Follows window movement/resize/rounded corners automatically (OS-level) + * - Persists across repaints, zero performance overhead + * - Works on Win11 22000+ (Build 22000 = Windows 11 GA) + * + * No overlays, no polling, no separate processes, no z-order issues. + */ + +import { validateHwnd, ps } from './shared.js' + +/** + * Set green border on bound window via DWM. + */ +export function markBound(hwnd: string): boolean { + hwnd = validateHwnd(hwnd) + // DWMWA_BORDER_COLOR = 34, COLORREF = 0x00BBGGRR + // Green: R=0, G=200, B=0 → 0x0000C800 + const hr = ps( + `Add-Type @' +using System; +using System.Runtime.InteropServices; +public class CuDwm { + [DllImport("dwmapi.dll")] + public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size); +} +'@ +$color = [uint32]0x0000C800 +[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${hwnd}), 34, [ref]$color, 4)`, + ) + return hr === '0' +} + +/** + * Remove border, restore default. 
+ */
+export function unmarkBound(hwnd: string): boolean {
+  hwnd = validateHwnd(hwnd)
+  // DWMWA_COLOR_DEFAULT = 0xFFFFFFFF. Spelled [uint32]::MaxValue below: PowerShell parses 0xFFFFFFFF as Int32 -1, which [uint32] rejects.
+  const hr = ps(
+    `Add-Type @'
+using System;
+using System.Runtime.InteropServices;
+public class CuDwm {
+    [DllImport("dwmapi.dll")]
+    public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size);
+}
+'@
+$color = [uint32]::MaxValue
+[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${hwnd}), 34, [ref]$color, 4)`,
+  )
+  return hr === '0'
+}
+
+/**
+ * Kill all borders — just reset all bound windows.
+ * With DWM approach, no processes to kill.
+ */
+export function cleanupAllBorders(): void {
+  // Intentionally a no-op: there is no helper process or overlay to tear down.
+  // NOTE(review): the colour is presumably stored on the target window until unmarkBound() or window close — confirm.
+}
diff --git a/src/utils/computerUse/win32/windowEnum.ts b/src/utils/computerUse/win32/windowEnum.ts
index 03bdbbebb..116507784 100644
--- a/src/utils/computerUse/win32/windowEnum.ts
+++ b/src/utils/computerUse/win32/windowEnum.ts
@@ -4,7 +4,7 @@
  */
 
 export interface WindowInfo {
-  hwnd: number
+  hwnd: string
   pid: number
   title: string
 }
@@ -59,7 +59,13 @@ public class WinEnum {
  */
 export function listWindows(): WindowInfo[] {
   const result = Bun.spawnSync({
-    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', ENUM_WINDOWS_PS],
+    cmd: [
+      'powershell',
+      '-NoProfile',
+      '-NonInteractive',
+      '-Command',
+      ENUM_WINDOWS_PS,
+    ],
     stdout: 'pipe',
     stderr: 'pipe',
   })
@@ -75,11 +81,11 @@ export function listWindows(): WindowInfo[] {
       const secondPipe = trimmed.indexOf('|', firstPipe + 1)
       if (firstPipe === -1 || secondPipe === -1) return null
 
-      const hwnd = Number(trimmed.slice(0, firstPipe))
+      const hwnd = trimmed.slice(0, firstPipe)
       const pid = Number(trimmed.slice(firstPipe + 1, secondPipe))
       const title = trimmed.slice(secondPipe + 1)
 
-      if (isNaN(hwnd) || isNaN(pid) || !title) return null
+      if (!hwnd || isNaN(pid) || !title) return null
       return { hwnd, pid, title }
     })
.filter((item): item is WindowInfo => item !== null)
diff --git a/src/utils/computerUse/win32/windowMessage.ts b/src/utils/computerUse/win32/windowMessage.ts
new file mode 100644
index 000000000..5b904e1ba
--- /dev/null
+++ b/src/utils/computerUse/win32/windowMessage.ts
@@ -0,0 +1,696 @@
+/**
+ * SendMessage-based input for Win32 windows.
+ *
+ * ALL text/keyboard operations target a specific HWND via SendMessageW.
+ * No SendInput / keybd_event / SendKeys — those are global and conflict with the user.
+ *
+ * Text input strategy:
+ *   1. Short text (≤ CLIPBOARD_THRESHOLD chars): SendMessageW(WM_CHAR) per codepoint
+ *   2. Long text (> threshold): Clipboard.SetText() + SendMessageW(Ctrl+V) paste
+ * Both paths support full Unicode (Chinese, emoji, etc.) without IME involvement.
+ */
+
+import { validateHwnd, runPs, VK_MAP, MODIFIER_KEYS } from './shared.js'
+
+/** Character count above which we switch to clipboard paste */
+const CLIPBOARD_THRESHOLD = 32
+
+/** Cache findEditChild results (parent HWND -> child HWND, or null when none was found) — window structure doesn't change while bound */
+const editChildCache = new Map<string, string | null>()
+
+/** Clear cached edit-child mappings (one HWND, or all when omitted). Call on unbind. */
+export function clearEditChildCache(hwnd?: string): void {
+  if (hwnd) {
+    editChildCache.delete(hwnd)
+  } else {
+    editChildCache.clear()
+  }
+}
+
+/**
+ * Resolve the HWND that should actually receive input messages.
+ * For WinUI 3 apps, returns the InputSite child window.
+ * For traditional Win32 apps, returns the edit control or the original HWND.
+ */
+export function resolveInputHwnd(hwnd: string): string {
+  hwnd = validateHwnd(hwnd)
+  return findEditChild(hwnd) ?? hwnd
+}
+
+const WINMSG_TYPE = `
+Add-Type @'
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+using System.Text;
+
+public class WinMsg {
+  public delegate bool EnumChildProc(IntPtr hWnd, IntPtr lParam);
+
+  [DllImport("user32.dll")]
+  public static extern bool EnumChildWindows(IntPtr parent, EnumChildProc proc, IntPtr lParam);
+
+  [DllImport("user32.dll", CharSet=CharSet.Unicode)]
+  public static extern int GetClassName(IntPtr h, StringBuilder sb, int max);
+
+  // CRITICAL: CharSet.Unicode → resolves to SendMessageW
+  // SendMessageW sends Unicode WM_CHAR (full UTF-16 codepoints including CJK)
+  [DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
+  public static extern IntPtr SendMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
+
+  [DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="PostMessageW")]
+  public static extern bool PostMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
+
+  [DllImport("user32.dll")]
+  public static extern uint MapVirtualKeyW(uint uCode, uint uMapType);
+
+  public static IntPtr MakeLParam(int lo, int hi) {
+    return (IntPtr)((hi << 16) | (lo & 0xFFFF));
+  }
+
+  // Build lParam for WM_KEYDOWN / WM_KEYUP with correct scan code
+  // lParam bits: 0-15 repeat count, 16-23 scan code, 24 extended, 30 prev state, 31 transition
+  public static IntPtr KeyDownLParam(uint vk) {
+    uint scanCode = MapVirtualKeyW(vk, 0); // MAPVK_VK_TO_VSC = 0
+    return (IntPtr)(1 | (scanCode << 16)); // repeat=1, scanCode in bits 16-23
+  }
+  public static IntPtr KeyUpLParam(uint vk) {
+    uint scanCode = MapVirtualKeyW(vk, 0);
+    return (IntPtr)(1 | (scanCode << 16) | (1 << 30) | (1u << 31)); // prev=1, transition=1
+  }
+
+  public const uint WM_CHAR = 0x0102;
+  public const uint WM_KEYDOWN = 0x0100;
+  public const uint WM_KEYUP = 0x0101;
+  public const uint WM_LBUTTONDOWN = 0x0201;
+  public const uint WM_LBUTTONUP = 0x0202;
+  public const uint WM_RBUTTONDOWN = 0x0204;
+  public const uint WM_RBUTTONUP = 0x0205;
+
+  public static List<string> childResults = new List<string>();
+
+  public static void FindChildren(IntPtr parent) {
+    childResults.Clear();
+    EnumChildWindows(parent, delegate(IntPtr hWnd, IntPtr lParam) {
+      StringBuilder sb = new StringBuilder(256);
+      GetClassName(hWnd, sb, sb.Capacity);
+      childResults.Add(hWnd.ToInt64() + "|" + sb.ToString());
+      return true;
+    }, IntPtr.Zero);
+  }
+}
+'@
+`
+
+// Edit class names in priority order
+const EDIT_CLASSES = [
+  'Windows.UI.Input.InputSite.WindowClass', // WinUI 3 input bridge (Windows Terminal, etc.)
+  'RichEditD2DPT', // Win11 Notepad (WinUI 3)
+  'RichEdit20W', // WordPad
+  'Edit', // Classic edit controls
+  'Scintilla', // Scintilla-based editors (Notepad++, etc.)
+  'Chrome_RenderWidgetHostHWND', // Chrome/Electron
+  'TextBox', // WPF TextBox
+  'RichTextBox', // WPF RichTextBox
+  'Windows.UI.Core.CoreWindow', // UWP CoreWindow (input target for some UWP apps)
+]
+
+/**
+ * Find the first edit-capable child window of a parent HWND.
+ *
+ * Strategy:
+ * 1. EnumChildWindows — search for known edit control class names
+ * 2. UI Automation fallback — find the first Edit/Document element and get its native HWND
+ *
+ * EnumChildWindows is recursive and enumerates all descendant windows,
+ * but for UWP apps the edit control may be in a different process (hosted
+ * inside ApplicationFrameHost). UI Automation crosses process boundaries.
+ */
+export function findEditChild(parentHwnd: string): string | null {
+  parentHwnd = validateHwnd(parentHwnd)
+
+  // Cache hit
+  if (editChildCache.has(parentHwnd)) {
+    return editChildCache.get(parentHwnd)!
+ } + + // Strategy 1: EnumChildWindows (fast, works for Win32 apps) + const script = `${WINMSG_TYPE} +[WinMsg]::FindChildren([IntPtr]::new([long]${parentHwnd})) +[WinMsg]::childResults | ForEach-Object { $_ } +` + const raw = runPs(script) + if (raw) { + const children = raw + .split('\n') + .filter(Boolean) + .map(line => { + const trimmed = line.trim() + const pipe = trimmed.indexOf('|') + if (pipe === -1) return null + return { + hwnd: trimmed.slice(0, pipe), + className: trimmed.slice(pipe + 1), + } + }) + .filter( + (item): item is { hwnd: string; className: string } => item !== null, + ) + + // Search in priority order + for (const editClass of EDIT_CLASSES) { + const match = children.find(c => c.className === editClass) + if (match) { + editChildCache.set(parentHwnd, match.hwnd) + return match.hwnd + } + } + } + + // Strategy 2: UI Automation (crosses process boundaries, finds UWP edit controls) + const uiaScript = ` +Add-Type -AssemblyName UIAutomationClient +Add-Type -AssemblyName UIAutomationTypes +Add-Type @' +using System; +using System.Runtime.InteropServices; +public class UiaHelper { + [DllImport("user32.dll")] + public static extern bool IsWindow(IntPtr hWnd); +} +'@ +try { + $el = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${parentHwnd})) + if ($el -eq $null) { Write-Output 'NONE'; exit } + + # Search for Edit or Document control types (covers text editors) + $editCond = [System.Windows.Automation.PropertyCondition]::new( + [System.Windows.Automation.AutomationElement]::ControlTypeProperty, + [System.Windows.Automation.ControlType]::Edit) + $docCond = [System.Windows.Automation.PropertyCondition]::new( + [System.Windows.Automation.AutomationElement]::ControlTypeProperty, + [System.Windows.Automation.ControlType]::Document) + $orCond = [System.Windows.Automation.OrCondition]::new($editCond, $docCond) + + $found = $el.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $orCond) + if ($found -eq $null) { 
Write-Output 'NONE'; exit } + + $nativeHwnd = $found.Current.NativeWindowHandle + if ($nativeHwnd -ne 0) { + Write-Output $nativeHwnd + } else { + Write-Output 'NONE' + } +} catch { + Write-Output 'NONE' +} +` + const uiaResult = runPs(uiaScript) + if (uiaResult && uiaResult !== 'NONE') { + const hwnd = uiaResult.trim() + if (hwnd && hwnd !== '0') { + editChildCache.set(parentHwnd, hwnd) + return hwnd + } + } + + editChildCache.set(parentHwnd, null) + return null +} + +/** + * Send a single Unicode character to a window via SendMessageW(WM_CHAR). + * Handles surrogate pairs for characters outside BMP (emoji, rare CJK, etc.). + */ +export function sendChar(hwnd: string, char: string): boolean { + hwnd = validateHwnd(hwnd) + const codePoint = char.codePointAt(0) + if (codePoint === undefined) return false + + const hwndExpr = `[IntPtr]::new([long]${hwnd})` + + // BMP character (U+0000 to U+FFFF): single WM_CHAR + if (codePoint <= 0xffff) { + const script = `${WINMSG_TYPE} +[WinMsg]::SendMessage(${hwndExpr}, [WinMsg]::WM_CHAR, [IntPtr]${codePoint}, [IntPtr]0) +` + return runPs(script) !== null + } + + // Supplementary character (U+10000+): send as UTF-16 surrogate pair + // Windows processes surrogate pairs as two sequential WM_CHAR messages + const hi = Math.floor((codePoint - 0x10000) / 0x400) + 0xd800 + const lo = ((codePoint - 0x10000) % 0x400) + 0xdc00 + const script = `${WINMSG_TYPE} +[WinMsg]::SendMessage(${hwndExpr}, [WinMsg]::WM_CHAR, [IntPtr]${hi}, [IntPtr]0) +[WinMsg]::SendMessage(${hwndExpr}, [WinMsg]::WM_CHAR, [IntPtr]${lo}, [IntPtr]0) +` + return runPs(script) !== null +} + +/** + * Build PowerShell lines that send each codepoint via WM_CHAR. + * Handles surrogate pairs for supplementary characters. + */ +function buildWmCharLines(hwnd: string, text: string): string[] { + const hwndExpr = `[IntPtr]::new([long]${hwnd})` + const lines: string[] = [] + for (const ch of text) { + const cp = ch.codePointAt(0)! 
+ if (cp <= 0xffff) { + lines.push( + `[WinMsg]::SendMessage(${hwndExpr}, [WinMsg]::WM_CHAR, [IntPtr]${cp}, [IntPtr]0)`, + ) + } else { + const hi = Math.floor((cp - 0x10000) / 0x400) + 0xd800 + const lo = ((cp - 0x10000) % 0x400) + 0xdc00 + lines.push( + `[WinMsg]::SendMessage(${hwndExpr}, [WinMsg]::WM_CHAR, [IntPtr]${hi}, [IntPtr]0)`, + ) + lines.push( + `[WinMsg]::SendMessage(${hwndExpr}, [WinMsg]::WM_CHAR, [IntPtr]${lo}, [IntPtr]0)`, + ) + } + } + return lines +} + +/** + * Paste text via clipboard into the target window. + * Uses Clipboard.SetText() + SendMessageW(Ctrl+V). + * NO global APIs (SendInput/keybd_event/SendKeys) — only window-targeted messages. + */ +function pasteViaClipboard(hwnd: string, text: string): boolean { + // Escape single quotes for PowerShell string literal + const escaped = text.replace(/'/g, "''") + const hwndExpr = `[IntPtr]::new([long]${hwnd})` + const script = `${WINMSG_TYPE} +Add-Type -AssemblyName System.Windows.Forms + +# Save current clipboard +$saved = $null +try { $saved = [System.Windows.Forms.Clipboard]::GetText() } catch {} + +# Set our text +[System.Windows.Forms.Clipboard]::SetText('${escaped}') + +# Ctrl+V via PostMessage to the target window (NOT global keybd_event) +# Must use PostMessage + correct lParam (scan code) for Windows Terminal / ConPTY +[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x11, [WinMsg]::KeyDownLParam(0x11)) # Ctrl down +[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x56, [WinMsg]::KeyDownLParam(0x56)) # V down +[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x56, [WinMsg]::KeyUpLParam(0x56)) # V up +[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x11, [WinMsg]::KeyUpLParam(0x11)) # Ctrl up + +# Brief wait for paste to complete +Start-Sleep -Milliseconds 50 + +# Restore clipboard +if ($saved -ne $null -and $saved -ne '') { + try { [System.Windows.Forms.Clipboard]::SetText($saved) } catch {} +} else { + try { 
[System.Windows.Forms.Clipboard]::Clear() } catch {} +} +Write-Output 'OK' +` + return runPs(script) === 'OK' +} + +/** + * Send text to a window via WM_CHAR per Unicode codepoint. + * Always uses the WM_CHAR path — reliable across all window types including + * Windows Terminal / ConPTY where clipboard-based Ctrl+V doesn't work. + * Window-targeted, no global input APIs. + */ +export function sendText(hwnd: string, text: string): boolean { + const targetHwnd = resolveInputHwnd(hwnd) + const charLines = buildWmCharLines(targetHwnd, text) + const script = `${WINMSG_TYPE} +${charLines.join('\n')} +` + return runPs(script) !== null +} + +/** + * Send a key down or key up event via PostMessageW(WM_KEYDOWN / WM_KEYUP). + * Uses PostMessage (async) instead of SendMessage — required for Windows Terminal + * and ConPTY-based console windows to correctly process key events. + * lParam includes the correct scan code via MapVirtualKeyW. + */ +export function sendKey( + hwnd: string, + vk: number, + action: 'down' | 'up', +): boolean { + hwnd = validateHwnd(hwnd) + const msg = action === 'down' ? '0x0100' : '0x0101' + const lParamFn = action === 'down' ? 'KeyDownLParam' : 'KeyUpLParam' + const script = `${WINMSG_TYPE} +[WinMsg]::PostMessage([IntPtr]::new([long]${hwnd}), ${msg}, [IntPtr]${vk}, [WinMsg]::${lParamFn}(${vk})) +` + return runPs(script) !== null +} + +/** + * Send a key combination (e.g. ['ctrl', 'a']). + * Holds modifiers via WM_KEYDOWN, presses the key, then releases in reverse. + * All via SendMessageW — no global APIs. 
+ */ +export function sendKeys(hwnd: string, combo: string[]): boolean { + hwnd = resolveInputHwnd(hwnd) + if (combo.length === 0) return false + + const modifiers: number[] = [] + let mainKey: number | undefined + + for (const key of combo) { + const lower = key.toLowerCase() + const vk = VK_MAP[lower] + if (vk !== undefined) { + if (MODIFIER_KEYS.has(lower)) { + modifiers.push(vk) + } else { + mainKey = vk + } + } else if (lower.length === 1) { + // Single character — use its uppercase VK code + mainKey = lower.toUpperCase().charCodeAt(0) + } else { + return false + } + } + + if (mainKey === undefined) return false + + // Build script: modifiers down, key down, key up, modifiers up (reverse) + // Uses PostMessage (async) + correct lParam (scan code) — required for + // Windows Terminal / ConPTY to correctly translate key events. + const hwndExpr = `[IntPtr]::new([long]${hwnd})` + const lines: string[] = [] + for (const mod of modifiers) { + lines.push( + `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mod}, [WinMsg]::KeyDownLParam(${mod}))`, + ) + } + lines.push( + `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mainKey}, [WinMsg]::KeyDownLParam(${mainKey}))`, + ) + lines.push( + `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mainKey}, [WinMsg]::KeyUpLParam(${mainKey}))`, + ) + for (const mod of [...modifiers].reverse()) { + lines.push( + `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mod}, [WinMsg]::KeyUpLParam(${mod}))`, + ) + } + + const script = `${WINMSG_TYPE} +${lines.join('\n')} +` + return runPs(script) !== null +} + +// ── Console Input Buffer (WriteConsoleInput) ───────────────────────── +// For terminal/console windows, SendMessageW doesn't reliably inject +// key events into the Console Input Buffer that raw-mode stdin reads. +// This function uses AttachConsole + WriteConsoleInput to inject directly. 
+ +const CONSOLE_INPUT_TYPE = ` +Add-Type @' +using System; +using System.Runtime.InteropServices; + +public class ConsoleInput { + [DllImport("kernel32.dll", SetLastError=true)] + public static extern bool AttachConsole(uint dwProcessId); + + [DllImport("kernel32.dll", SetLastError=true)] + public static extern bool FreeConsole(); + + [DllImport("kernel32.dll", SetLastError=true)] + public static extern IntPtr GetStdHandle(int nStdHandle); + + [DllImport("kernel32.dll", CharSet=CharSet.Unicode, SetLastError=true)] + public static extern bool WriteConsoleInput( + IntPtr hConsoleInput, + INPUT_RECORD[] lpBuffer, + uint nLength, + out uint lpNumberOfEventsWritten); + + [DllImport("kernel32.dll")] + public static extern uint MapVirtualKeyW(uint uCode, uint uMapType); + + [DllImport("user32.dll")] + public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint lpdwProcessId); + + public const int STD_INPUT_HANDLE = -10; + + [StructLayout(LayoutKind.Explicit)] + public struct INPUT_RECORD { + [FieldOffset(0)] public ushort EventType; + [FieldOffset(4)] public KEY_EVENT_RECORD KeyEvent; + } + + [StructLayout(LayoutKind.Explicit, CharSet=CharSet.Unicode)] + public struct KEY_EVENT_RECORD { + [FieldOffset(0)] public bool bKeyDown; + [FieldOffset(4)] public ushort wRepeatCount; + [FieldOffset(6)] public ushort wVirtualKeyCode; + [FieldOffset(8)] public ushort wVirtualScanCode; + [FieldOffset(10)] public char UnicodeChar; + [FieldOffset(12)] public uint dwControlKeyState; + } + + public static bool SendKeyToConsole(IntPtr hwnd, ushort vk, char ch) { + uint pid; + GetWindowThreadProcessId(hwnd, out pid); + if (pid == 0) return false; + + FreeConsole(); + if (!AttachConsole(pid)) return false; + + try { + IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE); + if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false; + + ushort scanCode = (ushort)MapVirtualKeyW(vk, 0); + INPUT_RECORD[] records = new INPUT_RECORD[2]; + + // Key down + records[0].EventType = 1; // 
KEY_EVENT + records[0].KeyEvent.bKeyDown = true; + records[0].KeyEvent.wRepeatCount = 1; + records[0].KeyEvent.wVirtualKeyCode = vk; + records[0].KeyEvent.wVirtualScanCode = scanCode; + records[0].KeyEvent.UnicodeChar = ch; + records[0].KeyEvent.dwControlKeyState = 0; + + // Key up + records[1].EventType = 1; + records[1].KeyEvent.bKeyDown = false; + records[1].KeyEvent.wRepeatCount = 1; + records[1].KeyEvent.wVirtualKeyCode = vk; + records[1].KeyEvent.wVirtualScanCode = scanCode; + records[1].KeyEvent.UnicodeChar = ch; + records[1].KeyEvent.dwControlKeyState = 0; + + uint written; + return WriteConsoleInput(hInput, records, 2, out written); + } finally { + FreeConsole(); + } + } + + public static bool SendTextToConsole(IntPtr hwnd, string text) { + uint pid; + GetWindowThreadProcessId(hwnd, out pid); + if (pid == 0) return false; + + FreeConsole(); + if (!AttachConsole(pid)) return false; + + try { + IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE); + if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false; + + INPUT_RECORD[] records = new INPUT_RECORD[text.Length * 2]; + for (int i = 0; i < text.Length; i++) { + char c = text[i]; + ushort vk = 0; + ushort sc = 0; + + // Key down + records[i * 2].EventType = 1; + records[i * 2].KeyEvent.bKeyDown = true; + records[i * 2].KeyEvent.wRepeatCount = 1; + records[i * 2].KeyEvent.wVirtualKeyCode = vk; + records[i * 2].KeyEvent.wVirtualScanCode = sc; + records[i * 2].KeyEvent.UnicodeChar = c; + records[i * 2].KeyEvent.dwControlKeyState = 0; + + // Key up + records[i * 2 + 1].EventType = 1; + records[i * 2 + 1].KeyEvent.bKeyDown = false; + records[i * 2 + 1].KeyEvent.wRepeatCount = 1; + records[i * 2 + 1].KeyEvent.wVirtualKeyCode = vk; + records[i * 2 + 1].KeyEvent.wVirtualScanCode = sc; + records[i * 2 + 1].KeyEvent.UnicodeChar = c; + records[i * 2 + 1].KeyEvent.dwControlKeyState = 0; + } + + uint written; + return WriteConsoleInput(hInput, records, (uint)records.Length, out written); + } finally { + FreeConsole(); 
+        }
+    }
+}
+'@
+`
+
+/**
+ * Send one key press to a console window through the ConsoleInput helper
+ * (WriteConsoleInput on the target's Console Input Buffer).
+ * This is required for terminal apps like Claude Code REPL that read stdin in raw mode.
+ *
+ * @param hwnd Window handle of the console window; passed through validateHwnd.
+ * @param vk Virtual-key code forwarded to SendKeyToConsole.
+ * @param ch Character forwarded with the key. Only its first UTF-16 code unit
+ *   is used. Defaults to NUL, and an empty string is treated as NUL.
+ * @returns true when runPs returns a non-null result.
+ *   NOTE(review): the True/False value printed by SendKeyToConsole is not
+ *   inspected here, so a failed AttachConsole may still report true —
+ *   confirm against runPs before relying on the return value.
+ */
+export function consoleKey(
+  hwnd: string,
+  vk: number,
+  ch: string = '\0',
+): boolean {
+  hwnd = validateHwnd(hwnd)
+  // ''.charCodeAt(0) is NaN, which would render as "[char]NaN" and break the
+  // script; fall back to NUL, the same value the default argument produces.
+  const charCode = ch.length > 0 ? ch.charCodeAt(0) : 0
+  const script = `${CONSOLE_INPUT_TYPE}
+[ConsoleInput]::SendKeyToConsole([IntPtr]::new([long]${hwnd}), ${vk}, [char]${charCode})
+`
+  return runPs(script) !== null
+}
+
+/**
+ * Send text to a console window via WriteConsoleInput.
+ * Directly injects into the Console Input Buffer — works for raw-mode stdin.
+ *
+ * The text is written exactly as given: no Enter is appended, so callers that
+ * want the line submitted must include the line terminator themselves.
+ *
+ * @param hwnd Window handle of the console window; passed through validateHwnd.
+ * @param text Text to inject, one key-down/key-up pair per UTF-16 code unit.
+ * @returns true when runPs returns a non-null result.
+ *   NOTE(review): the True/False value printed by SendTextToConsole is not
+ *   inspected here — confirm against runPs.
+ */
+export function consoleText(hwnd: string, text: string): boolean {
+  hwnd = validateHwnd(hwnd)
+  // PowerShell accepts the typographic quotes U+2018, U+2019, U+201A and U+201B
+  // as single-quote delimiters in addition to ASCII ', so every one of them
+  // must be doubled. Escaping only ' lets a "smart" apostrophe in the text
+  // terminate the literal and run the remainder as PowerShell code.
+  const escaped = text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
+  const script = `${CONSOLE_INPUT_TYPE}
+[ConsoleInput]::SendTextToConsole([IntPtr]::new([long]${hwnd}), '${escaped}')
+`
+  return runPs(script) !== null
+}
+
+/**
+ * Send a mouse click at client-area coordinates (x, y) relative to the window.
+ * Via SendMessageW — window-targeted, no cursor movement.
+ *
+ * A button-down message is sent, immediately followed by the matching
+ * button-up message at the same point.
+ *
+ * @param hwnd Window handle; passed through resolveInputHwnd.
+ * @param x Client-area X coordinate.
+ * @param y Client-area Y coordinate.
+ * @param button 'left' sends WM_LBUTTONDOWN/UP, anything else WM_RBUTTONDOWN/UP.
+ * @returns true when runPs returns a non-null result.
+ */
+export function sendClick(
+  hwnd: string,
+  x: number,
+  y: number,
+  button: 'left' | 'right',
+): boolean {
+  hwnd = resolveInputHwnd(hwnd)
+  const isLeft = button === 'left'
+  // WM_LBUTTONDOWN = 0x0201, WM_LBUTTONUP = 0x0202,
+  // WM_RBUTTONDOWN = 0x0204, WM_RBUTTONUP = 0x0205
+  const downMsg = isLeft ? '0x0201' : '0x0204'
+  const upMsg = isLeft ? '0x0202' : '0x0205'
+  // wParam carries the MK_* button state. Report the pressed button on the
+  // down message (MK_LBUTTON = 1, MK_RBUTTON = 2), as sendMouseDown does, and
+  // no buttons on the up message.
+  const downKeys = isLeft ? 1 : 2
+  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
+
+  const script = `${WINMSG_TYPE}
+$lp = [WinMsg]::MakeLParam(${x}, ${y})
+[WinMsg]::SendMessage(${hwndExpr}, ${downMsg}, [IntPtr]${downKeys}, $lp)
+[WinMsg]::SendMessage(${hwndExpr}, ${upMsg}, [IntPtr]0, $lp)
+`
+  return runPs(script) !== null
+}
+
+/**
+ * Send a mouse-button-down at client-area coordinates (x, y).
+ * Via SendMessageW(WM_LBUTTONDOWN) — window-targeted, no cursor movement.
+ */
+export function sendMouseDown(hwnd: string, x: number, y: number): boolean {
+  const target = resolveInputHwnd(hwnd)
+  // One PowerShell statement per entry; the empty last entry reproduces the
+  // newline that terminates the script.
+  const psLines = [
+    `${WINMSG_TYPE}`,
+    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
+    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONDOWN, [IntPtr]1, $lp)`,
+    '',
+  ]
+  const outcome = runPs(psLines.join('\n'))
+  return outcome !== null
+}
+
+/**
+ * Release the left mouse button at client-area point (x, y) of a window.
+ * Sent as WM_LBUTTONUP with wParam 0 through SendMessageW, so the message is
+ * window-targeted and the cursor is not moved.
+ *
+ * Returns true when runPs yields a non-null result.
+ */
+export function sendMouseUp(hwnd: string, x: number, y: number): boolean {
+  const target = resolveInputHwnd(hwnd)
+  const psLines = [
+    `${WINMSG_TYPE}`,
+    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
+    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONUP, [IntPtr]0, $lp)`,
+    '',
+  ]
+  const outcome = runPs(psLines.join('\n'))
+  return outcome !== null
+}
+
+/**
+ * Report a pointer move to client-area point (x, y) of a window.
+ * Sent as WM_MOUSEMOVE (0x0200) with wParam 1 (the MK_LBUTTON value) through
+ * SendMessageW; intended for the intermediate steps of a drag.
+ *
+ * Returns true when runPs yields a non-null result.
+ */
+export function sendMouseMove(hwnd: string, x: number, y: number): boolean {
+  const target = resolveInputHwnd(hwnd)
+  const psLines = [
+    `${WINMSG_TYPE}`,
+    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
+    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), 0x0200, [IntPtr]1, $lp)`,
+    '',
+  ]
+  const outcome = runPs(psLines.join('\n'))
+  return outcome !== null
+}
+
+/**
+ * Send mouse wheel scroll at client-area coordinates (x, y).
+ * Via SendMessageW(WM_MOUSEWHEEL / WM_MOUSEHWHEEL).
+ *
+ * WM_MOUSEWHEEL: vertical scroll (positive delta = scroll up)
+ * WM_MOUSEHWHEEL: horizontal scroll (positive delta = scroll right)
+ *
+ * delta is a number of wheel notches: it is rounded to an integer and
+ * multiplied by WHEEL_DELTA (120). One "click" = 120.
+ * (x, y) are converted with ClientToScreen because lParam carries screen
+ * coordinates (not client); the high word of wParam carries the wheel delta.
+ *
+ * Works on Excel, browsers, modern UI — unlike WM_VSCROLL/WM_HSCROLL
+ * which only work on traditional scrollbar controls.
+ */
+export function sendMouseWheel(
+  hwnd: string,
+  x: number,
+  y: number,
+  delta: number,
+  horizontal: boolean = false,
+): boolean {
+  hwnd = resolveInputHwnd(hwnd)
+  // A NaN delta would be interpolated verbatim into the PowerShell script and
+  // make it unparsable, so report failure instead of running a broken script.
+  if (Number.isNaN(delta)) return false
+  // WM_MOUSEWHEEL = 0x020A, WM_MOUSEHWHEEL = 0x020E
+  const msg = horizontal ? '0x020E' : '0x020A'
+  // wParam: high word = wheel delta (signed short), low word = modifier keys (0)
+  // delta is in units of WHEEL_DELTA (120). Positive = up/right, negative = down/left.
+  //
+  // The high word is only 16 bits wide, so the largest whole number of notches
+  // that fits is floor(32767 / 120) = 273. Anything larger overflows
+  // `delta << 16` in the helper below and flips the scroll direction, so the
+  // notch count is clamped to that range (this also tames +/-Infinity).
+  const maxNotches = 273
+  const notches = Math.max(-maxNotches, Math.min(maxNotches, Math.round(delta)))
+  const wheelDelta = notches * 120
+  // Pack delta into high word of wParam: (delta << 16) as signed
+  // lParam: screen coordinates packed as MAKELPARAM(screenX, screenY)
+  const script = `${WINMSG_TYPE}
+# WM_MOUSEWHEEL/WM_MOUSEHWHEEL require screen coords in lParam
+# and wheel delta in high word of wParam
+Add-Type @'
+using System;
+using System.Runtime.InteropServices;
+public class WheelHelper {
+    [DllImport("user32.dll")] public static extern bool ClientToScreen(IntPtr hWnd, ref POINT p);
+    [StructLayout(LayoutKind.Sequential)] public struct POINT { public int X, Y; }
+
+    [DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
+    public static extern IntPtr SendMsg(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
+
+    public static void Scroll(IntPtr hWnd, int clientX, int clientY, int delta, uint msg) {
+        POINT pt; pt.X = clientX; pt.Y = clientY;
+        ClientToScreen(hWnd, ref pt);
+        IntPtr wParam = (IntPtr)(delta << 16);
+        IntPtr lParam = (IntPtr)((pt.Y << 16) | (pt.X & 0xFFFF));
+        SendMsg(hWnd, msg, wParam, lParam);
+    }
+}
+'@
+[WheelHelper]::Scroll([IntPtr]::new([long]${hwnd}), ${x}, ${y}, ${wheelDelta}, ${msg})
+`
+  return runPs(script) !== null
+}