diff --git a/CLAUDE.md b/CLAUDE.md index 484de1f..027ef05 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,46 +4,77 @@ This file provides context for AI assistants working on the OpenType codebase. ## Project Overview -OpenType is an Electron desktop app for intelligent voice dictation. It captures microphone audio, transcribes it via STT APIs, then polishes the text using LLM post-processing (removing fillers, fixing repetitions, detecting self-corrections, adding punctuation). +OpenType is an Electron desktop app for intelligent voice dictation. It captures microphone audio, transcribes it via STT APIs, then polishes the text using LLM post-processing (removing fillers, fixing repetitions, detecting self-corrections, adding punctuation). It also captures rich context (active window, focused field, clipboard, screen OCR) to improve transcription quality. ## Tech Stack -- **Frontend**: React 18 + TypeScript + Vite + Tailwind CSS + Zustand +- **Frontend**: React 18 + TypeScript + Vite + Tailwind CSS 3 + Zustand 4 - **Desktop**: Electron 32 (CommonJS output in `dist-electron/`) -- **Build**: electron-builder for cross-platform packaging +- **Build**: electron-builder 25 for cross-platform packaging - **CI/CD**: GitHub Actions (`ci.yml` for checks, `release.yml` for packaging on `v*` tags) +- **i18n**: Custom lightweight React Context-based system (no external i18n library) ## Repository Layout ``` -electron/ → Electron main process (CommonJS, compiled to dist-electron/) - main.ts → App lifecycle, windows, tray, shortcuts, IPC handlers - preload.ts → contextBridge exposing electronAPI - config-store.ts → JSON file persistence (~/.opentype/config.json) - stt-service.ts → Server-side STT API calls - llm-service.ts → Server-side LLM API calls + system prompt builder - -src/ → React renderer (ESM, bundled by Vite to dist/) - types/config.ts → Central type definitions, AppConfig interface, DEFAULT_CONFIG - stores/ → Zustand store (configStore.ts) - services/ → audioRecorder, 
sttService, llmService, pipeline - hooks/ → useRecorder (recording state machine) - components/ → ui/ (Button, Input, Select, Toggle, Slider, Badge) - layout/ (TitleBar, Sidebar, PageHeader) - recording/ (RecordButton, ResultPanel) - pages/ → DashboardPage, HistoryPage, DictionaryPage, OverlayPage, FeedbackPage - settings/ → SettingsLayout + 9 sub-panels (Provider, General, Hotkey, Audio, - Personalization, ToneRules, Language, Privacy, Advanced) - -scripts/ → Validation scripts (test-api.ts, test-stt.ts, test-pipeline.ts) +electron/ → Electron main process (CommonJS, compiled to dist-electron/) + main.ts → App lifecycle, windows, tray, shortcuts, IPC handlers, context capture + preload.ts → contextBridge exposing electronAPI to renderer + config-store.ts → JSON file persistence (~/.opentype/config.json) + stt-service.ts → Server-side STT API calls + llm-service.ts → Server-side LLM API calls, prompt builder, smart truncation + +src/ → React renderer (ESM, bundled by Vite to dist/) + types/config.ts → Central type definitions: AppConfig, HistoryItem, HistoryContext, DEFAULT_CONFIG + types/electron.d.ts → Type declarations for window.electronAPI (must match preload.ts) + stores/configStore.ts→ Zustand store: load, set, update, history CRUD, dictionary CRUD + services/ → Dual-mode services (Electron IPC or direct fetch) + audioRecorder.ts → WebAudio recording, webm → WAV conversion + sttService.ts → Speech-to-Text API calls + llmService.ts → LLM post-processing API calls + pipeline.ts → Full pipeline orchestrator (STT + LLM) + hooks/useRecorder.ts → Recording state machine (idle → recording → processing → idle) + i18n/ → Internationalization + index.ts → I18nProvider, useTranslation hook, detectLocale + locales/en.json → English strings + locales/zh.json → Chinese strings (must mirror en.json structure exactly) + components/ + ui/ → Primitives: Button, Input, PasswordInput, Select, Toggle, Slider, Badge, HotkeyCapture + layout/ → TitleBar, Sidebar, PageHeader + 
recording/ → RecordButton, ResultPanel + pages/ → DashboardPage, HistoryPage, DictionaryPage, OverlayPage, FeedbackPage + settings/ → SettingsLayout + sub-panels: Provider, General, Hotkey, Audio, + Personalization, ToneRules, Language, Privacy, Context, Advanced + +scripts/ → Validation scripts (test-api.ts, test-stt.ts, test-pipeline.ts) +build/ → entitlements.mac.plist (microphone + accessibility permissions) ``` -## Two TypeScript Configs +## Two TypeScript Configs (CRITICAL) -- `tsconfig.json` — Frontend (ESNext, bundler moduleResolution, noEmit, jsx: react-jsx) -- `tsconfig.electron.json` — Electron (CommonJS, node moduleResolution, output to dist-electron/) +| Config | Module | Resolution | Output | Purpose | +|--------|--------|------------|--------|---------| +| `tsconfig.json` | ESNext | bundler | noEmit (Vite handles) | Frontend React code | +| `tsconfig.electron.json` | CommonJS | node | `dist-electron/` | Electron main process | -Always run both when type-checking: `npm run typecheck` +**Always run both when type-checking**: `npm run typecheck` (runs `tsc --noEmit && tsc -p tsconfig.electron.json --noEmit`) + +Running `tsc --noEmit` alone only checks frontend — Electron errors will be missed. + +## Common Commands + +```bash +npm run dev # Vite dev server (frontend only, http://localhost:5173) +npm run electron:dev # Full Electron dev mode (Vite + Electron) +npm run typecheck # Check BOTH frontend + electron TypeScript +npm run build # Build frontend (vite build) + compile electron (tsc) +npm run electron:build # Full package (build + electron-builder, auto-detects platform) + +# API tests (require env vars) +SILICONFLOW_KEY=sk-xxx npm run test:api +SILICONFLOW_KEY=sk-xxx npm run test:stt +SILICONFLOW_KEY=sk-xxx OPENROUTER_KEY=sk-or-xxx npm run test:pipeline +``` ## Key Patterns @@ -51,40 +82,151 @@ Always run both when type-checking: `npm run typecheck` All frontend services check `window.electronAPI` first. 
If present (running in Electron), they delegate to IPC. Otherwise, they make direct fetch calls. This allows developing the UI in a browser without Electron. ### Config Flow -`src/types/config.ts` defines `AppConfig` with all settings and `DEFAULT_CONFIG`. The Zustand store (`configStore.ts`) loads from Electron IPC or localStorage, and persists changes back on every `set()` call. +`src/types/config.ts` defines `AppConfig` with all settings and `DEFAULT_CONFIG`. The Zustand store (`configStore.ts`) loads from Electron IPC or localStorage, and persists changes back on every `set()` call. The Electron-side `config-store.ts` also has a `DEFAULT_CONFIG` for fallback. ### Provider System -Three providers: SiliconFlow (STT+LLM), OpenRouter (LLM only), OpenAI (STT+LLM). Each has configurable API key, base URL, and model. Provider metadata is in `PROVIDERS` array in `config.ts`. +Three providers: SiliconFlow (STT+LLM), OpenRouter (LLM only), OpenAI (STT+LLM). Each has configurable API key, base URL, and model. Provider field naming convention: `{provider}ApiKey`, `{provider}BaseUrl`, `{provider}SttModel`, `{provider}LlmModel`. -### LLM Prompt Construction -The system prompt for post-processing is dynamically built based on: +### LLM Prompt Construction (`electron/llm-service.ts`) +The system prompt is dynamically built from: - Toggle states (filler removal, repetition, self-correction, auto-formatting) - Output language preference - Personal dictionary terms - Active app → tone rule matching -- Personalization sliders (formality, verbosity) +- Context: selected text, field content, clipboard, recent transcriptions, screen OCR +- Smart truncation (`smartTruncate()`) applied to all context fields to cap prompt length -## Common Commands +### Context Capture Flow +Context is captured at **hotkey press time** (in `toggleRecording()`) — BEFORE the overlay steals focus. This preserves the correct active window info. 
OCR runs in the background while the user speaks, and its result is awaited when the pipeline retrieves context via `getLastContext`. -```bash -npm run dev # Vite dev server (frontend only) -npm run electron:dev # Full Electron dev mode -npm run typecheck # Check both frontend + electron TS -npm run build # Build frontend + compile electron -npm run electron:build # Full package (auto-detects platform) +### Recording Pipeline +1. User presses hotkey → `toggleRecording()` captures context + starts OCR +2. Overlay appears, renderer starts audio recording +3. User stops → renderer calls `stopRecording()` +4. Audio buffer sent to `processPipeline` IPC (main process) +5. Main process: STT transcribe → LLM post-process → returns result with metadata +6. Renderer: outputs text (type at cursor or clipboard), saves to history -# API tests (require env vars) -SILICONFLOW_KEY=sk-xxx npm run test:api -SILICONFLOW_KEY=sk-xxx npm run test:stt -SILICONFLOW_KEY=sk-xxx OPENROUTER_KEY=sk-or-xxx npm run test:pipeline +--- + +## Synchronization Checklist (MUST follow when making changes) + +### Adding a New IPC Channel + +Update these **three files** in lockstep: + +1. **`electron/main.ts`** — Register handler: `ipcMain.handle('namespace:action', ...)` +2. **`electron/preload.ts`** — Expose method: `action: (...) => ipcRenderer.invoke('namespace:action', ...)` +3. **`src/types/electron.d.ts`** — Add type: `action: (...) => Promise` + +IPC channel naming convention: `namespace:action` (e.g., `config:get`, `stt:transcribe`, `pipeline:process`). + +### Adding a New Config Field + +Update these files: + +1. **`src/types/config.ts`** — Add to `AppConfig` interface + set default in `DEFAULT_CONFIG` +2. **`electron/config-store.ts`** — Add to electron-side `DEFAULT_CONFIG` (keep in sync) +3. **Settings UI** — Add control in appropriate settings sub-panel +4. **Any service** that reads the field (e.g., `llm-service.ts`, `main.ts`) + +### Adding a New Context Field + +Update these files: + +1. 
**`electron/main.ts`** — Add to `CapturedContext` interface + populate in `captureFullContext()` +2. **`src/types/config.ts`** — Add to `HistoryContext` interface +3. **`src/types/electron.d.ts`** — Add to `getLastContext` return type +4. **`src/hooks/useRecorder.ts`** — Save new field in history item's `context` object +5. **`electron/llm-service.ts`** — Include in prompt construction (with truncation) +6. **`src/pages/HistoryPage.tsx`** — Display in `DetailView` component + +### Adding UI Text / Translations + +Update **both** locale files with identical key structure: + +1. **`src/i18n/locales/en.json`** — English strings +2. **`src/i18n/locales/zh.json`** — Chinese strings + +Missing keys fall back to English; missing from both shows the raw key string. + +i18n key convention: `section.subsection.key` (e.g., `settings.providers.apiKey`). + +Usage: `const { t } = useTranslation(); t('history.clipboard', { count: 5 })`. + +--- + +## Code Style & Conventions + +### Naming + +| What | Convention | Example | +|------|-----------|---------| +| React components | PascalCase files + named export | `Button.tsx` → `export const Button` | +| Hooks | camelCase with `use` prefix | `useRecorder.ts` | +| Services | camelCase files | `sttService.ts`, `pipeline.ts` | +| Electron files | kebab-case | `config-store.ts`, `llm-service.ts` | +| IPC channels | `namespace:action` | `config:get`, `stt:transcribe` | +| Config fields | camelCase | `siliconflowApiKey`, `contextL0Enabled` | +| CSS variables | kebab-case | `--slider-track` | +| i18n keys | dot-notation | `settings.providers.apiKey` | + +### Component Patterns + +- All UI components use `forwardRef` + set `displayName` +- Named exports only (no default exports) +- Variant-based styling via `Record` objects +- Select uses `createPortal` for dropdown (avoids z-index stacking) +- Event listeners from preload must return cleanup functions + +### Tailwind / Styling + +- **Custom colors only**: Use `brand-*` (blue, primary) and 
`surface-*` (warm gray) — NOT default Tailwind colors +- **Dark mode**: class-based (`dark:` prefix), toggled via the `dark` class on the root element (e.g. `<html class="dark">`) +- **All components must have both light and dark styles**: `text-surface-800 dark:text-surface-200` +- **No responsive breakpoints** (desktop-only app) +- **Z-index**: portals/dropdowns use `z-[9999]`, modals use `z-50` +- **Drag region**: `.drag-region` for draggable areas, `.no-drag` for interactive elements inside + +### Color Reference + +``` +brand-500 → #3b82f6 (primary blue — buttons, focus rings, active states) +surface-50 → #faf8f5 (light background) +surface-850→ #201e1c (dark component backgrounds) +surface-900→ #181715 (dark page backgrounds) ``` +### Electron-specific + +- `electron/` uses CommonJS (`module: "CommonJS"`) — **no `import.meta`**, no top-level `export default` +- Preload event listeners (`on*` methods) must return cleanup functions to prevent memory leaks +- Context is captured at hotkey time, not pipeline execution time +- Overlay window must be reset to pill size (280x56) when hidden +- `typeAtCursor` saves/restores clipboard around paste simulation + +--- + +## Common Pitfalls + +1. **Forgetting one of the 3 IPC files** — TypeScript may pass, but runtime will crash +2. **Only running `tsc --noEmit`** — Misses electron/ errors. Always use `npm run typecheck` +3. **Adding config field only to frontend** — Electron-side `config-store.ts` DEFAULT_CONFIG also needs it +4. **Using `import.meta` in electron/** — Will fail; electron uses CommonJS +5. **Not updating both locale files** — Chinese users see English fallback or raw key strings +6. **Capturing context too late** — Must happen before overlay shows, or you get overlay's window info +7. **Missing `displayName` on forwardRef components** — React DevTools show "ForwardRef" instead of name +8. **Using default Tailwind colors** — Must use `brand-*` and `surface-*` custom scales +9. 
**History item ID collisions** — Must include randomness: `Date.now().toString(36) + Math.random().toString(36).slice(2, 6)` +10. **Stale closures in useEffect** — Event listener callbacks must track correct dependencies + ## Important Notes -- The `electron/` directory uses CommonJS (`module: "CommonJS"` in tsconfig.electron.json). Do not use ESM imports like `import.meta`. - `electron-store` is in dependencies (not devDependencies) because it's needed at runtime. -- The `build/entitlements.mac.plist` grants microphone access (`com.apple.security.device.audio-input`) for macOS. -- Tailwind uses custom color scales: `brand-*` (indigo) and `surface-*` (zinc/gray). Use these instead of default Tailwind colors for consistency. -- The frameless window uses `-webkit-app-region: drag` via `.drag-region` CSS class. Interactive elements need `.no-drag`. +- `build/entitlements.mac.plist` grants microphone access (`com.apple.security.device.audio-input`) for macOS. - Audio recording converts webm to WAV (PCM 16-bit, 16kHz) in the browser before sending to STT APIs. - The overlay window is transparent, always-on-top, and unfocusable. It's shown/hidden alongside recording state. +- The frameless window uses `-webkit-app-region: drag` via `.drag-region` CSS class. Interactive elements need `.no-drag`. +- Config persistence is immediate (no debouncing) — each `set()` writes to disk. +- History is capped at 500 items in the Zustand store. +- Smart truncation in LLM prompts: `selectedText` 500 chars, `fieldText` 1500, `clipboardText` 500, `screenContext` 400, `recentTranscription` 200 each (max 3). 
/** Marker spliced in wherever text has been cut out of a context snippet. */
const TRUNCATION_MARKER = '\n... [truncated] ...\n';

/**
 * Smart truncation: keeps the beginning and the end of a long text and puts a
 * "[truncated]" marker in the middle.
 *
 * The result never exceeds `maxLen`. When `maxLen` is too small to fit the
 * marker plus at least one character on each side, the text is hard-cut to
 * its first `maxLen` characters instead.
 *
 * @param text   Text to shorten; empty/falsy input is returned unchanged.
 * @param maxLen Maximum length of the returned string, in characters.
 * @returns `text` itself when it already fits, otherwise the shortened form.
 */
function smartTruncate(text: string, maxLen: number): string {
  if (!text || text.length <= maxLen) return text;

  // Budget the real marker length so head + marker + tail stays within maxLen.
  const keepEach = Math.floor((maxLen - TRUNCATION_MARKER.length) / 2);

  // keepEach === 0 must not reach `slice(-0)`, which would return the whole text.
  if (keepEach <= 0) return text.slice(0, Math.max(0, maxLen));

  return text.slice(0, keepEach) + TRUNCATION_MARKER + text.slice(text.length - keepEach);
}

/** Truncation limits for each context field (in characters) */
const CONTEXT_LIMITS = {
  selectedText: 500,
  fieldText: 1500,
  fieldTextWithMarker: 2000, // higher to account for cursor/selection markers
  clipboardText: 500,
  screenContext: 400,
  recentTranscription: 200, // per item
  recentTotal: 3, // max items
};

/**
 * Truncate text to a window centered on the cursor, keeping context on both
 * sides and marking every cut with the truncation marker.
 *
 * @param text      Full field text.
 * @param cursorPos Cursor offset into `text`.
 * @param maxLen    Maximum length of the returned text, markers included.
 * @returns `text`: the (possibly shortened) text; `adjustedPos`: the cursor
 *          offset translated into the returned text; `contentEnd`: offset in
 *          the returned text where kept original content ends (i.e. where a
 *          trailing marker, if any, begins).
 */
function cursorCenteredTruncate(
  text: string,
  cursorPos: number,
  maxLen: number,
): { text: string; adjustedPos: number; contentEnd: number } {
  if (text.length <= maxLen) {
    return { text, adjustedPos: cursorPos, contentEnd: text.length };
  }

  const markerLen = TRUNCATION_MARKER.length;
  const halfWindow = Math.max(0, Math.floor((maxLen - markerLen * 2) / 2));
  let start = Math.max(0, cursorPos - halfWindow);
  let end = Math.min(text.length, cursorPos + halfWindow);

  // If one side is shorter, give the unused budget to the other side.
  if (start === 0) end = Math.min(text.length, maxLen - markerLen);
  if (end === text.length) start = Math.max(0, text.length - maxLen + markerLen);

  let result: string;
  let adjustedPos = cursorPos;

  if (start > 0) {
    // A leading marker shifts every kept character to the right.
    adjustedPos = cursorPos - start + markerLen;
    result = TRUNCATION_MARKER + text.slice(start, end);
  } else {
    result = text.slice(0, end);
  }

  const contentEnd = result.length;
  if (end < text.length) {
    result += TRUNCATION_MARKER;
  }

  return { text: result, adjustedPos, contentEnd };
}

/** The subset of captured context that `buildFieldContext` reads. */
interface FieldContextInput {
  fieldText?: string;
  selectionRange?: { location: number; length: number };
  fieldLabel?: string;
  fieldRoleDescription?: string;
  fieldRole?: string;
}

/**
 * Build the field-context paragraph for the LLM prompt, with a marker for the
 * cursor (`|`) or the current selection (`[SELECTED: ...]`).
 *
 * @param context Captured context; may be absent because the caller's
 *                `context` argument is optional.
 * @returns The prompt paragraph, or `null` when there is no field text.
 */
function buildFieldContext(context: FieldContextInput | null | undefined): string | null {
  const fieldText = context?.fieldText;
  if (!context || !fieldText) return null;

  const range = context.selectionRange;
  const label = context.fieldLabel;
  const roleDesc = context.fieldRoleDescription || context.fieldRole || 'input field';

  // Build descriptor: ("Message body", text area)
  const labelPart = label ? `"${label}", ` : '';
  const descriptor = `(${labelPart}${roleDesc})`;

  if (range && typeof range.location === 'number' && typeof range.length === 'number') {
    const loc = range.location;
    const len = range.length;

    if (len > 0 && loc >= 0 && loc + len <= fieldText.length) {
      // User has selected text — show [SELECTED: ...] marker
      const { text: truncated, adjustedPos, contentEnd } = cursorCenteredTruncate(
        fieldText,
        loc,
        CONTEXT_LIMITS.fieldTextWithMarker - 30,
      );
      // A selection longer than the kept window must stop at the kept content,
      // otherwise the trailing "[truncated]" marker lands inside [SELECTED: ...].
      const selectionEnd = Math.min(adjustedPos + len, contentEnd);
      const before = truncated.slice(0, adjustedPos);
      const selectedText = truncated.slice(adjustedPos, selectionEnd);
      const after = truncated.slice(selectionEnd);
      const markedText = before + '[SELECTED: ' + selectedText + ']' + after;

      return `The user selected text to replace with dictation in the ${descriptor}:\n"""\n${markedText}\n"""\nThe dictated text should replace the [SELECTED: ...] portion.`;
    } else if (len === 0 && loc >= 0 && loc <= fieldText.length) {
      // Cursor position — show | marker
      const { text: truncated, adjustedPos } = cursorCenteredTruncate(
        fieldText,
        loc,
        CONTEXT_LIMITS.fieldTextWithMarker - 10,
      );
      const before = truncated.slice(0, adjustedPos);
      const after = truncated.slice(adjustedPos);
      const markedText = before + '|' + after;

      return `Existing text in the ${descriptor}:\n"""\n${markedText}\n"""\n(The "|" marks the cursor position where the dictated text will be inserted.)`;
    }
  }

  // Fallback: no usable range info, show raw field text
  const snippet = smartTruncate(fieldText, CONTEXT_LIMITS.fieldText);
  return `Existing text in the ${descriptor}:\n"""\n${snippet}\n"""\nThe dictated text should flow naturally with this existing content.`;
}
"${context.windowTitle}"`); } + if (context.url) { + parts.push(`URL: ${context.url}`); + } } - if (context?.selectedText) { - const snippet = context.selectedText.slice(0, 500); - parts.push(`\nThe user was editing/viewing this text:\n"""\n${snippet}\n"""\nEnsure the dictation output is consistent and coherent with this context.`); + // L1: Field context with cursor/selection position markers + const fieldCtx = buildFieldContext(context); + if (fieldCtx) { + parts.push(`\n${fieldCtx}`); + } else if (context?.selectedText) { + // Standalone selected text (no field text available) + const snippet = smartTruncate(context.selectedText, CONTEXT_LIMITS.selectedText); + parts.push(`\nThe user had selected this text:\n"""\n${snippet}\n"""\nEnsure the dictation output is consistent and coherent with this context.`); + } + + // Field placeholder hint + if (context?.fieldPlaceholder) { + parts.push(`The input field's placeholder reads: "${context.fieldPlaceholder}"`); + } + + // Clipboard content + if (context?.clipboardText) { + const snippet = smartTruncate(context.clipboardText, CONTEXT_LIMITS.clipboardText); + parts.push(`\nClipboard content:\n"""\n${snippet}\n"""\nThis may provide additional context for the dictation.`); + } + + // Recent transcriptions for continuity + if (context?.recentTranscriptions?.length > 0) { + const recents = context.recentTranscriptions + .slice(0, CONTEXT_LIMITS.recentTotal) + .map((t: string) => smartTruncate(t, CONTEXT_LIMITS.recentTranscription)); + parts.push(`\nRecent transcriptions (for continuity):\n${recents.map((r: string, i: number) => `${i + 1}. 
${r}`).join('\n')}`); } if (context?.screenContext) { - parts.push(`\nScreen context (from OCR): ${context.screenContext}`); + const snippet = smartTruncate(context.screenContext, CONTEXT_LIMITS.screenContext); + parts.push(`\nScreen context (from OCR): ${snippet}`); } - return this.call({ + const systemPrompt = parts.join('\n'); + const text = await this.call({ ...opts, messages: [ - { role: 'system', content: parts.join('\n') }, + { role: 'system', content: systemPrompt }, { role: 'user', content: rawText }, ], }); + + return { text, systemPrompt }; } async rewrite(selectedText: string, instruction: string, config: Record): Promise { @@ -140,6 +263,22 @@ export class LLMService { const model = config.contextOcrModel || 'Qwen/Qwen2-VL-7B-Instruct'; const opts = this.getOpts(config); + const prompt = [ + 'Analyze this screenshot to help with voice dictation context. Extract:', + '1. APP: Which application is open', + '2. TASK: What the user is working on (1 sentence)', + '3. KEY TERMS: List any proper nouns, brand names, technical terms, project names, or specialized vocabulary visible on screen (comma-separated)', + '4. TEXT CONTEXT: If there is a text input area, summarize what has been written so far (1 sentence)', + '', + 'Format your response exactly as:', + 'APP: ...', + 'TASK: ...', + 'KEY TERMS: ...', + 'TEXT CONTEXT: ...', + '', + 'Keep each line brief. If a field is not applicable, write "none".', + ].join('\n'); + const res = await fetch(`${opts.baseUrl}/chat/completions`, { method: 'POST', headers: { @@ -153,11 +292,11 @@ export class LLMService { role: 'user', content: [ { type: 'image_url', image_url: { url: dataUrl } }, - { type: 'text', text: 'Briefly describe the content visible on screen in 1-2 sentences. Focus on what app is open and what the user is working on. Be concise.' 
// ─── Recording State & Auto-Mute ────────────────────────────────────────────

let isRecording = false;
let wasMutedBeforeRecording = false;

/** Upper bound for each blocking volume command, in milliseconds. */
const MUTE_COMMAND_TIMEOUT_MS = 1000;

/**
 * Mute or restore the system audio output around a recording.
 *
 * When muting, the current mute state is read first and remembered in
 * `wasMutedBeforeRecording`; output is only muted if it was not already.
 * When un-muting, output is only restored if this function was the one that
 * muted it. macOS uses `osascript`, Linux uses `pactl`; other platforms are
 * a no-op. Failures are logged (macOS) or ignored (Linux, best-effort) and
 * never thrown to the caller.
 *
 * @param mute `true` to mute before recording, `false` to restore afterwards.
 */
function setSystemMute(mute: boolean): void {
  // Every call is synchronous, so each one is time-boxed to avoid stalling the caller.
  const opts = { timeout: MUTE_COMMAND_TIMEOUT_MS };
  try {
    if (isMac) {
      if (mute) {
        const isMuted = execSync("osascript -e 'output muted of (get volume settings)'", opts)
          .toString().trim();
        wasMutedBeforeRecording = isMuted === 'true';
        if (!wasMutedBeforeRecording) {
          execSync("osascript -e 'set volume with output muted'", opts);
        }
      } else {
        if (!wasMutedBeforeRecording) {
          execSync("osascript -e 'set volume without output muted'", opts);
        }
      }
    } else if (process.platform === 'linux') {
      if (mute) {
        try {
          const state = execSync('pactl get-sink-mute @DEFAULT_SINK@', opts).toString().trim();
          wasMutedBeforeRecording = state.includes('yes');
          if (!wasMutedBeforeRecording) execSync('pactl set-sink-mute @DEFAULT_SINK@ 1', opts);
        } catch {
          // Best-effort: a failing pactl call is deliberately ignored.
        }
      } else {
        if (!wasMutedBeforeRecording) {
          try {
            execSync('pactl set-sink-mute @DEFAULT_SINK@ 0', opts);
          } catch {
            // Best-effort, see above.
          }
        }
      }
    }
    // Windows: no reliable silent mute command, skip for now
  } catch (e: unknown) {
    console.error('[AutoMute] error:', e instanceof Error ? e.message : e);
  }
}

// ─── Auto-Dictionary: Extract Proper Nouns ──────────────────────────────────

/** Frequent short English words that are never proposed as dictionary terms. */
const COMMON_CAPITALIZED_WORDS = new Set<string>([
  'the', 'and', 'but', 'for', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one',
  'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now',
  'old', 'see', 'way', 'who', 'did', 'let', 'say', 'she', 'too', 'use',
]);

/**
 * Extract candidate dictionary terms (acronyms, camelCase/PascalCase words and
 * capitalized words that do not start a sentence) from a transcription.
 *
 * @param text         Text to scan.
 * @param existingDict Terms already in the dictionary (compared case-insensitively).
 * @returns Up to 5 new terms, in order of first appearance.
 */
function extractDictionaryTerms(text: string, existingDict: string[]): string[] {
  const terms = new Set<string>();
  const existing = new Set(existingDict.map((w) => w.toLowerCase()));

  // Group 1 is a token; group 2 is the delimiter run after it. The delimiters
  // are kept so that "?", "!" and "。" still count as sentence terminators.
  const tokenRe = /([^\s,;:!?。,;:!?]+)([\s,;:!?。,;:!?]*)/g;
  const closesSentence = /\.["'\u201D\u2019)\])]*$/;
  const terminator = /[!?。!?]/;

  let atSentenceStart = true; // the first real token always starts a sentence
  let match: RegExpExecArray | null;
  while ((match = tokenRe.exec(text)) !== null) {
    const raw = match[1];
    const isSentenceStart = atSentenceStart;
    atSentenceStart = closesSentence.test(raw) || terminator.test(match[2]);

    // Strip wrapping quotes/brackets (straight and curly) and trailing periods.
    const word = raw.replace(
      /^["'\u201C\u201D\u2018\u2019()()[\]]+|["'\u201C\u201D\u2018\u2019()()[\].]+$/g,
      '',
    );
    if (word.length < 2) continue;

    const lower = word.toLowerCase();

    // Acronyms (ALL CAPS, 2-6 chars)
    if (/^[A-Z]{2,6}$/.test(word) && !existing.has(lower)) {
      terms.add(word);
      continue;
    }
    // CamelCase / PascalCase (TypeScript, iPhone, OpenType)
    if (/^[A-Z][a-z]+[A-Z]/.test(word) || /^[a-z]+[A-Z]/.test(word)) {
      if (!existing.has(lower)) terms.add(word);
      continue;
    }
    // Capitalized words not at sentence start (proper nouns)
    if (!isSentenceStart && /^[A-Z][a-z]{1,}/.test(word)) {
      if (!COMMON_CAPITALIZED_WORDS.has(lower) && !existing.has(lower)) terms.add(word);
    }
  }
  return Array.from(terms).slice(0, 5);
}
/** Everything captured about the user's environment when dictation starts. */
interface CapturedContext {
  appName?: string;
  windowTitle?: string;
  bundleId?: string;
  url?: string;
  selectedText?: string;
  fieldText?: string;
  fieldRole?: string;
  // Enhanced L1 accessibility attributes
  fieldRoleDescription?: string; // AXRoleDescription: "text field", "search field"
  fieldLabel?: string; // AXDescription/AXTitle: "Message body", "Subject"
  fieldPlaceholder?: string; // AXPlaceholderValue: "Type a message..."
  cursorPosition?: number; // from AXSelectedTextRange when selection length=0
  selectionRange?: { location: number; length: number }; // AXSelectedTextRange
  numberOfCharacters?: number; // AXNumberOfCharacters
  insertionLineNumber?: number; // AXInsertionPointLineNumber
  // Other context
  clipboardText?: string;
  recentTranscriptions?: string[];
  screenContext?: string;
  screenshotDataUrl?: string;
}

let lastCapturedContext: CapturedContext = {};
let ocrPromise: Promise<{ text: string; screenshot?: string } | null> | null = null;

/** Maximum number of field-text characters kept in a captured context. */
const MAX_FIELD_TEXT_CHARS = 3000;

/** Build the AppleScript that prints all L0 (+ optional L1) values joined by `sep`. */
function buildMacContextScript(sep: string, enableL1: boolean): string {
  const l1Section = enableL1 ? `
  try
    set focusEl to focused UI element of fp
    try
      set elRole to value of attribute "AXRole" of focusEl
    end try
    try
      set selText to value of attribute "AXSelectedText" of focusEl
    end try
    try
      set elRoleDesc to value of attribute "AXRoleDescription" of focusEl
    end try
    try
      set elLabel to value of attribute "AXDescription" of focusEl
    end try
    if elLabel is "" then
      try
        set elLabel to value of attribute "AXTitle" of focusEl
      end try
    end if
    try
      set elPlaceholder to value of attribute "AXPlaceholderValue" of focusEl
    end try
    try
      set rng to value of attribute "AXSelectedTextRange" of focusEl
      set selRange to ((item 1 of rng) as text) & "," & ((item 2 of rng) as text)
    end try
    try
      set charCount to (value of attribute "AXNumberOfCharacters" of focusEl) as text
    end try
    try
      set lineNum to (value of attribute "AXInsertionPointLineNumber" of focusEl) as text
    end try
    try
      set fieldVal to value of attribute "AXValue" of focusEl
      if (count of fieldVal) > 3000 then
        set fieldVal to text 1 thru 3000 of fieldVal
      end if
    end try
  end try` : '';

  return `
set d to "${sep}"
set output to ""
tell application "System Events"
  set fp to first process whose frontmost is true
  set appName to name of fp
  set output to appName

  set bid to ""
  try
    set bid to bundle identifier of fp
  end try
  set output to output & d & bid

  set winTitle to ""
  try
    set winTitle to name of first window of fp
  end try
  set output to output & d & winTitle

  set elRole to ""
  set selText to ""
  set elRoleDesc to ""
  set elLabel to ""
  set elPlaceholder to ""
  set selRange to ""
  set charCount to ""
  set lineNum to ""
  set fieldVal to ""
  ${l1Section}

  set output to output & d & elRole & d & selText & d & elRoleDesc & d & elLabel & d & elPlaceholder & d & selRange & d & charCount & d & lineNum & d & fieldVal
end tell
return output`;
}

/**
 * Parse the separator-joined script output into context fields.
 * Field order: appName, bundleId, windowTitle, role, selectedText,
 * roleDescription, label, placeholder, "location,length", character count,
 * insertion line number, field text (everything that remains).
 */
function parseMacContextOutput(raw: string, sep: string): CapturedContext {
  const parsed: CapturedContext = {};
  const parts = raw.split(sep);

  parsed.appName = parts[0] || undefined;
  parsed.bundleId = parts[1] || undefined;
  parsed.windowTitle = parts[2] || undefined;
  parsed.fieldRole = parts[3] || undefined;
  parsed.selectedText = parts[4] || undefined;
  parsed.fieldRoleDescription = parts[5] || undefined;
  parsed.fieldLabel = parts[6] || undefined;
  parsed.fieldPlaceholder = parts[7] || undefined;

  // Parse AXSelectedTextRange from "location,length" format
  const rangeMatch = parts[8] ? parts[8].match(/(\d+)\D+(\d+)/) : null;
  if (rangeMatch) {
    const location = parseInt(rangeMatch[1], 10);
    const length = parseInt(rangeMatch[2], 10);
    parsed.selectionRange = { location, length };
    // A zero-length selection is a plain cursor position.
    if (length === 0) parsed.cursorPosition = location;
  }

  // Parse AXNumberOfCharacters
  if (parts[9]) {
    const n = parseInt(parts[9], 10);
    if (!isNaN(n)) parsed.numberOfCharacters = n;
  }

  // Parse AXInsertionPointLineNumber
  if (parts[10]) {
    const n = parseInt(parts[10], 10);
    if (!isNaN(n)) parsed.insertionLineNumber = n;
  }

  // fieldText is always last — may contain separators, so join remainder
  parsed.fieldText = parts.slice(11).join(sep) || undefined;

  // Truncate fieldText for storage
  if (parsed.fieldText && parsed.fieldText.length > MAX_FIELD_TEXT_CHARS) {
    parsed.fieldText = parsed.fieldText.slice(0, MAX_FIELD_TEXT_CHARS);
  }

  return parsed;
}

/**
 * Capture all context on macOS using a single AppleScript call.
 *
 * Runs `osascript` synchronously (2 s timeout) and, if the frontmost app is a
 * known browser, a second call to read the active tab's URL. Errors are logged
 * and result in a partially filled or empty context; nothing is thrown.
 *
 * @param enableL1 When `true`, also query the focused UI element's
 *                 accessibility attributes (role, label, selection, value…).
 *                 NOTE(review): presumably requires the Accessibility
 *                 permission — confirm with the caller.
 * @returns The captured context; every field is optional.
 */
function captureContextMac(enableL1: boolean): CapturedContext {
  const ctx: CapturedContext = {};
  const SEP = '‖‖‖'; // unlikely separator

  try {
    // Single AppleScript to get L0 + L1 data in one call
    const script = buildMacContextScript(SEP, enableL1);
    const raw = execSync('osascript -', { input: script, timeout: 2000 }).toString().trim();
    Object.assign(ctx, parseMacContextOutput(raw, SEP));
  } catch (e) {
    console.error('[Context] macOS capture error:', e);
  }

  // Get browser URL if the frontmost app is a known browser
  if (ctx.appName) {
    ctx.url = captureBrowserUrl(ctx.appName) || undefined;
  }

  return ctx;
}

/** Chromium-based browsers; they all expose `active tab of first window`. */
const CHROMIUM_BROWSERS = [
  'Google Chrome',
  'Microsoft Edge',
  'Arc',
  'Brave Browser',
  'Chromium',
  'Opera',
  'Vivaldi',
];

/** AppleScript one-liner per supported browser, keyed by process name. */
const BROWSER_URL_SCRIPTS: ReadonlyMap<string, string> = new Map<string, string>([
  ['Safari', 'tell application "Safari" to get URL of current tab of first window'],
  ...CHROMIUM_BROWSERS.map((name): [string, string] => [
    name,
    `tell application "${name}" to get URL of active tab of first window`,
  ]),
]);

/**
 * Get the URL of the active tab from a known browser via AppleScript.
 *
 * @param appName Name of the frontmost process.
 * @returns The URL, or `null` when not on macOS, when `appName` is not a
 *          supported browser, when the script fails, or when it prints nothing.
 */
function captureBrowserUrl(appName: string): string | null {
  if (!isMac) return null;

  // Map lookup: arbitrary process names can never hit inherited object keys.
  const script = BROWSER_URL_SCRIPTS.get(appName);
  if (!script) return null;

  try {
    return execSync(`osascript -e '${script}'`, { timeout: 1000 }).toString().trim() || null;
  } catch {
    return null;
  }
}
captureContextWin(): CapturedContext { + const ctx: CapturedContext = {}; + try { + const ps = ` +$ErrorActionPreference = 'SilentlyContinue' +Add-Type -AssemblyName UIAutomationClient +Add-Type -MemberDefinition '[DllImport("user32.dll")]public static extern IntPtr GetForegroundWindow();' -Name W -Namespace N -PassThru | Out-Null +$hwnd = [N.W]::GetForegroundWindow() +$proc = Get-Process | Where-Object {$_.MainWindowHandle -eq $hwnd} | Select-Object -First 1 +$title = $proc.MainWindowTitle +$name = $proc.ProcessName +$focused = [System.Windows.Automation.AutomationElement]::FocusedElement +$role = "" +$val = "" +$sel = "" +$label = "" +$placeholder = "" +$selStart = "" +$selLen = "" +try { $role = $focused.Current.ControlType.ProgrammaticName } catch {} +try { $label = $focused.Current.Name } catch {} +try { $placeholder = $focused.Current.HelpText } catch {} +try { + $vp = $focused.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern) + $val = $vp.Current.Value + if ($val.Length -gt 3000) { $val = $val.Substring(0, 3000) } +} catch {} +try { + $tp = $focused.GetCurrentPattern([System.Windows.Automation.TextPattern]::Pattern) + $ranges = $tp.GetSelection() + if ($ranges.Length -gt 0) { + $sel = $ranges[0].GetText(-1) + $docRange = $tp.DocumentRange + $before = $docRange.Clone() + $before.MoveEndpointByRange([System.Windows.Automation.Text.TextPatternRangeEndpoint]::End, $ranges[0], [System.Windows.Automation.Text.TextPatternRangeEndpoint]::Start) + $selStart = $before.GetText(-1).Length + $selLen = $sel.Length + } +} catch {} +Write-Output "$name|||$title|||$role|||$sel|||$label|||$placeholder|||$selStart|||$selLen|||$val"`; + const raw = execSync(`powershell -Command "${ps.replace(/"/g, '\\"')}"`, { timeout: 3000 }).toString().trim(); + const parts = raw.split('|||'); + ctx.appName = parts[0] || undefined; + ctx.windowTitle = parts[1] || undefined; + ctx.fieldRole = parts[2] || undefined; + ctx.selectedText = parts[3] || undefined; + ctx.fieldLabel 
= parts[4] || undefined; + ctx.fieldPlaceholder = parts[5] || undefined; + // Parse selection range + if (parts[6] && parts[7]) { + const loc = parseInt(parts[6], 10); + const len = parseInt(parts[7], 10); + if (!isNaN(loc) && !isNaN(len)) { + ctx.selectionRange = { location: loc, length: len }; + if (len === 0) ctx.cursorPosition = loc; + } + } + // fieldText is last (may contain separator) + ctx.fieldText = parts.slice(8).join('|||') || undefined; + } catch (e) { + console.error('[Context] Windows capture error:', e); + // Fallback: just get window title + try { + const ps = `(Get-Process | Where-Object {$_.MainWindowHandle -eq (Add-Type -MemberDefinition '[DllImport("user32.dll")]public static extern IntPtr GetForegroundWindow();' -Name W -Namespace N -PassThru)::GetForegroundWindow()}).MainWindowTitle`; + const title = execSync(`powershell -Command "${ps}"`, { timeout: 2000 }).toString().trim(); + ctx.appName = title.split(' - ').pop() || title; + ctx.windowTitle = title; + } catch {} + } + return ctx; +} + +/** Capture context on Linux */ +function captureContextLinux(): CapturedContext { + const ctx: CapturedContext = {}; + try { + const title = execSync('xdotool getactivewindow getwindowname 2>/dev/null', { timeout: 1000 }).toString().trim(); + ctx.appName = title; + ctx.windowTitle = title; + // Try to get selected text via xclip + try { + const sel = execSync('xclip -selection primary -o 2>/dev/null', { timeout: 500 }).toString(); + if (sel && sel.length < 5000) ctx.selectedText = sel; + } catch {} + } catch (e) { + console.error('[Context] Linux capture error:', e); + } + return ctx; +} + +/** Capture full context based on platform and config */ +function captureFullContext(config: Record): CapturedContext { + const l0Enabled = config.contextL0Enabled !== false; + const l1Enabled = !!config.contextL1Enabled; + + let ctx: CapturedContext = {}; + + if (l0Enabled) { + if (isMac) { + const hasAccessibility = l1Enabled && 
systemPreferences.isTrustedAccessibilityClient(false); + ctx = captureContextMac(hasAccessibility); + } else if (process.platform === 'win32') { + ctx = captureContextWin(); + } else { + ctx = captureContextLinux(); + } + } + + // Clipboard content (always capture — lightweight and useful) + try { + const clip = clipboard.readText(); + if (clip && clip.trim().length > 0 && clip.trim().length < 5000) { + ctx.clipboardText = clip.trim(); + } + } catch {} + + // Recent transcriptions for continuity context (last 3 successful ones) + try { + const history: any[] = config.history || []; + const recent = history + .filter((h: any) => h.processedText && !h.error) + .slice(0, 3) + .map((h: any) => h.processedText); + if (recent.length > 0) { + ctx.recentTranscriptions = recent; + } + } catch {} + + return ctx; +} + +/** Start screenshot + OCR in background (returns promise) */ +async function captureScreenAndOcr(config: Record): Promise<{ text: string; screenshot?: string } | null> { + try { + const sources = await desktopCapturer.getSources({ + types: ['screen'], + thumbnailSize: { width: 1280, height: 720 }, + }); + if (!sources.length) return null; + + const thumbnail = sources[0].thumbnail; + const jpegBuffer = thumbnail.toJPEG(80); + const base64 = jpegBuffer.toString('base64'); + const dataUrl = `data:image/jpeg;base64,${base64}`; + + // Send to VLM for analysis + const ocrResult = await llmService.analyzeScreenshot(dataUrl, config); + return { text: ocrResult, screenshot: dataUrl }; + } catch (e: any) { + console.error('[Context OCR] error:', e.message); + return null; + } +} + // ─── Window Creation ──────────────────────────────────────────────────────── function createMainWindow() { @@ -218,18 +572,33 @@ function registerShortcuts() { } function toggleRecording() { - // Capture context BEFORE overlay steals focus const cfg = configStore.getAll(); - if (cfg.contextL0Enabled !== false) { - const l0 = captureL0(); - lastCapturedContext = { ...l0 }; - - if 
(cfg.contextL1Enabled && l0.appName) { - const sel = captureL1(l0.appName); - if (sel) lastCapturedContext.selectedText = sel; + isRecording = !isRecording; + + if (isRecording) { + // Starting recording: capture context BEFORE overlay steals focus + lastCapturedContext = captureFullContext(cfg); + console.log('[Context] captured:', JSON.stringify({ + app: lastCapturedContext.appName, + window: lastCapturedContext.windowTitle, + role: lastCapturedContext.fieldRole, + hasSelected: !!lastCapturedContext.selectedText, + hasField: !!lastCapturedContext.fieldText, + url: lastCapturedContext.url, + })); + + // Start OCR in background if enabled (runs while user speaks) + if (cfg.contextOcrEnabled) { + ocrPromise = captureScreenAndOcr(cfg); + } else { + ocrPromise = null; } + + // Auto-mute system audio + if (cfg.autoMuteOnRecord) setSystemMute(true); } else { - lastCapturedContext = {}; + // Stopping recording: unmute + if (cfg.autoMuteOnRecord) setSystemMute(false); } mainWindow?.webContents.send('toggle-recording'); @@ -287,7 +656,7 @@ function setupIPC() { ipcMain.handle('llm:process', async (_e, text: string, ctx: any) => { try { const result = await llmService.process(text, configStore.getAll(), ctx); - return { success: true, text: result }; + return { success: true, text: result.text }; } catch (e: any) { return { success: false, error: e.message }; } @@ -296,20 +665,61 @@ function setupIPC() { // Pipeline ipcMain.handle('pipeline:process', async (_e, buf: ArrayBuffer, ctx: any) => { const cfg = configStore.getAll(); - try { - console.log('[Pipeline] STT...'); - const raw = await sttService.transcribe(Buffer.from(buf), cfg, ctx); - console.log('[Pipeline] raw:', raw.slice(0, 100)); + const sttProvider = cfg.sttProvider || 'siliconflow'; + const llmProvider = cfg.llmProvider || 'siliconflow'; + const sttModel = sttProvider === 'siliconflow' ? cfg.siliconflowSttModel : cfg.openaiSttModel; + const llmModel = llmProvider === 'siliconflow' ? 
cfg.siliconflowLlmModel + : llmProvider === 'openrouter' ? cfg.openrouterLlmModel : cfg.openaiLlmModel; - if (!raw.trim()) return { success: true, rawText: '', processedText: '', skipped: true }; + let sttDurationMs = 0; + let llmDurationMs = 0; - console.log('[Pipeline] LLM...'); - const processed = await llmService.process(raw, cfg, ctx); - console.log('[Pipeline] final:', processed.slice(0, 100)); + try { + // Stage 1: STT with timing + console.log('[Pipeline] STT via', sttProvider, sttModel); + const sttStart = Date.now(); + const raw = await sttService.transcribe(Buffer.from(buf), cfg, ctx); + sttDurationMs = Date.now() - sttStart; + console.log('[Pipeline] STT done in', sttDurationMs, 'ms:', raw.slice(0, 100)); + + if (!raw.trim()) return { success: true, rawText: '', processedText: '', skipped: true, sttDurationMs }; + + // Stage 2: LLM with timing + console.log('[Pipeline] LLM via', llmProvider, llmModel); + const llmStart = Date.now(); + const llmResult = await llmService.process(raw, cfg, ctx); + llmDurationMs = Date.now() - llmStart; + console.log('[Pipeline] LLM done in', llmDurationMs, 'ms:', llmResult.text.slice(0, 100)); + + // Auto-dictionary: extract proper nouns + let autoLearnedTerms: string[] = []; + if (cfg.autoLearnDictionary !== false) { + const dict: string[] = cfg.personalDictionary || []; + autoLearnedTerms = extractDictionaryTerms(llmResult.text, dict); + if (autoLearnedTerms.length > 0) { + const updated = [...dict, ...autoLearnedTerms]; + configStore.set('personalDictionary', updated); + console.log('[AutoDict] learned:', autoLearnedTerms); + mainWindow?.webContents.send('dictionary:auto-added', autoLearnedTerms); + } + } + + // Unmute after pipeline completes (in case recording stopped during pipeline) + isRecording = false; + if (cfg.autoMuteOnRecord) setSystemMute(false); - return { success: true, rawText: raw, processedText: processed }; + return { + success: true, + rawText: raw, + processedText: llmResult.text, + systemPrompt: 
llmResult.systemPrompt, + sttProvider, llmProvider, sttModel, llmModel, + sttDurationMs, llmDurationMs, autoLearnedTerms, + }; } catch (e: any) { - return { success: false, error: e.message }; + isRecording = false; + if (cfg.autoMuteOnRecord) setSystemMute(false); + return { success: false, error: e.message, sttProvider, llmProvider, sttModel, llmModel, sttDurationMs, llmDurationMs }; } }); @@ -326,13 +736,74 @@ function setupIPC() { // Clipboard ipcMain.handle('clipboard:write', (_e, text: string) => { clipboard.writeText(text); return true; }); + // Type text at cursor (clipboard + paste simulation) + ipcMain.handle('text:typeAtCursor', async (_e, text: string) => { + try { + // Save current clipboard content + const prevClipboard = clipboard.readText(); + + // Write new text to clipboard + clipboard.writeText(text); + + // Small delay to ensure clipboard is updated + await new Promise((r) => setTimeout(r, 50)); + + // Simulate paste keystroke + if (isMac) { + execSync(`osascript -e 'tell application "System Events" to keystroke "v" using command down'`); + } else if (process.platform === 'win32') { + execSync(`powershell -Command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('^v')"`); + } else { + // Linux + try { + execSync('xdotool key ctrl+v'); + } catch { + // xdotool not available, try xclip approach + execSync('xsel --clipboard --output | xargs -0 xdotool type --'); + } + } + + // Restore previous clipboard content after a delay + setTimeout(() => { + try { clipboard.writeText(prevClipboard); } catch {} + }, 500); + + return { success: true }; + } catch (e: any) { + console.error('[TypeText] error:', e.message); + return { success: false, error: e.message }; + } + }); + // Window controls ipcMain.handle('window:minimize', () => mainWindow?.minimize()); ipcMain.handle('window:maximize', () => { mainWindow?.isMaximized() ? 
mainWindow.unmaximize() : mainWindow?.maximize(); }); ipcMain.handle('window:close', () => mainWindow?.hide()); - ipcMain.handle('window:hideOverlay', () => overlayWindow?.hide()); + ipcMain.handle('window:hideOverlay', () => { + if (!overlayWindow) return; + overlayWindow.hide(); + // Reset overlay size back to pill + const display = screen.getPrimaryDisplay(); + const { width: screenW, height: screenH } = display.workAreaSize; + const pillW = 280, pillH = 56; + overlayWindow.setBounds({ + width: pillW, height: pillH, + x: Math.round((screenW - pillW) / 2), + y: screenH - pillH - 16, + }); + }); + ipcMain.handle('window:resizeOverlay', (_e, w: number, h: number) => { + if (!overlayWindow) return; + const display = screen.getPrimaryDisplay(); + const { width: screenW, height: screenH } = display.workAreaSize; + overlayWindow.setBounds({ + width: w, height: h, + x: Math.round((screenW - w) / 2), + y: screenH - h - 16, + }); + }); // Auto updater ipcMain.handle('updater:check', () => autoUpdater.checkForUpdates().catch(() => null)); @@ -343,8 +814,22 @@ function setupIPC() { }); ipcMain.handle('updater:getVersion', () => app.getVersion()); - // Context awareness - ipcMain.handle('context:getLastContext', () => lastCapturedContext); + // Context awareness — await OCR if it was started during toggleRecording + ipcMain.handle('context:getLastContext', async () => { + if (ocrPromise) { + try { + const ocrResult = await ocrPromise; + if (ocrResult) { + lastCapturedContext.screenContext = ocrResult.text; + lastCapturedContext.screenshotDataUrl = ocrResult.screenshot; + } + } catch (e: any) { + console.error('[Context] OCR await error:', e.message); + } + ocrPromise = null; + } + return lastCapturedContext; + }); ipcMain.handle('context:checkAccessibility', () => { if (!isMac) return 'granted'; @@ -365,16 +850,8 @@ function setupIPC() { const cfg = configStore.getAll(); if (!cfg.contextOcrEnabled) return null; try { - const sources = await desktopCapturer.getSources({ - 
types: ['screen'], - thumbnailSize: { width: 1280, height: 720 }, - }); - if (!sources.length) return null; - const thumbnail = sources[0].thumbnail; - const base64 = thumbnail.toJPEG(80).toString('base64'); - const dataUrl = `data:image/jpeg;base64,${base64}`; - const ocrResult = await llmService.analyzeScreenshot(dataUrl, cfg); - return ocrResult; + const result = await captureScreenAndOcr(cfg); + return result?.text || null; } catch (e: any) { console.error('[Context OCR] error:', e.message); return null; diff --git a/electron/preload.ts b/electron/preload.ts index e559801..e3e09e9 100644 --- a/electron/preload.ts +++ b/electron/preload.ts @@ -31,11 +31,15 @@ contextBridge.exposeInMainWorld('electronAPI', { // Clipboard writeClipboard: (text: string) => ipcRenderer.invoke('clipboard:write', text), + // Type at cursor + typeAtCursor: (text: string) => ipcRenderer.invoke('text:typeAtCursor', text), + // Window controls minimize: () => ipcRenderer.invoke('window:minimize'), maximize: () => ipcRenderer.invoke('window:maximize'), close: () => ipcRenderer.invoke('window:close'), hideOverlay: () => ipcRenderer.invoke('window:hideOverlay'), + resizeOverlay: (w: number, h: number) => ipcRenderer.invoke('window:resizeOverlay', w, h), // API testing testAPI: (provider: string) => ipcRenderer.invoke('api:test', provider), @@ -91,4 +95,8 @@ contextBridge.exposeInMainWorld('electronAPI', { ipcRenderer.on('navigate', (_e, page) => cb(page)); return () => { ipcRenderer.removeAllListeners('navigate'); }; }, + onDictionaryAutoAdded: (cb: (words: string[]) => void) => { + ipcRenderer.on('dictionary:auto-added', (_e, words) => cb(words)); + return () => { ipcRenderer.removeAllListeners('dictionary:auto-added'); }; + }, }); diff --git a/src/components/ui/Slider.tsx b/src/components/ui/Slider.tsx index cb576d1..0e5017c 100644 --- a/src/components/ui/Slider.tsx +++ b/src/components/ui/Slider.tsx @@ -36,7 +36,7 @@ export function Slider({ onChange={(e) => 
onChange(Number(e.target.value))} className="w-full h-1.5 rounded-full appearance-none cursor-pointer" style={{ - background: `linear-gradient(to right, #6366f1 0%, #6366f1 ${pct}%, var(--slider-track) ${pct}%, var(--slider-track) 100%)`, + background: `linear-gradient(to right, #3b82f6 0%, #3b82f6 ${pct}%, var(--slider-track) ${pct}%, var(--slider-track) 100%)`, }} /> diff --git a/src/hooks/useRecorder.ts b/src/hooks/useRecorder.ts index 52514b0..3c1480e 100644 --- a/src/hooks/useRecorder.ts +++ b/src/hooks/useRecorder.ts @@ -3,6 +3,7 @@ import { AudioRecorder } from '../services/audioRecorder'; import { runPipeline } from '../services/pipeline'; import { useConfigStore } from '../stores/configStore'; import { HistoryItem } from '../types/config'; +import { countWords } from '../utils/wordCount'; export interface RecorderState { status: 'idle' | 'recording' | 'processing'; @@ -11,6 +12,7 @@ export interface RecorderState { rawText: string; processedText: string; error: string | null; + outputFailed: boolean; } export function useRecorder() { @@ -24,6 +26,7 @@ export function useRecorder() { rawText: '', processedText: '', error: null, + outputFailed: false, }); const recorderRef = useRef(null); @@ -32,7 +35,7 @@ export function useRecorder() { const startRecording = useCallback(async () => { try { - setState((s) => ({ ...s, error: null, rawText: '', processedText: '', status: 'recording', duration: 0 })); + setState((s) => ({ ...s, error: null, rawText: '', processedText: '', outputFailed: false, status: 'recording', duration: 0 })); // Pre-flight: check microphone permission in Electron if (window.electronAPI) { @@ -89,15 +92,37 @@ export function useRecorder() { const result = await runPipeline(audioBuffer, config, context); if (result.success && !result.skipped) { + const text = result.processedText; + + // Output text based on outputMode + let outputSuccess = false; + if (window.electronAPI) { + if (config.outputMode === 'cursor') { + try { + const r = await 
window.electronAPI.typeAtCursor(text); + outputSuccess = r.success; + } catch { + outputSuccess = false; + } + } + // Clipboard mode, or cursor mode failed -> copy to clipboard + if (!outputSuccess) { + await window.electronAPI.writeClipboard(text); + } + } else { + try { await navigator.clipboard.writeText(text); } catch {} + } + setState((s) => ({ ...s, status: 'idle', rawText: result.rawText, processedText: result.processedText, + outputFailed: config.outputMode === 'cursor' && !outputSuccess, })); - // Save to history with context - const wordCount = result.processedText.split(/\s+/).filter(Boolean).length; + // Save to history with full context + const wordCount = countWords(result.processedText); const item: HistoryItem = { id: Date.now().toString(36) + Math.random().toString(36).slice(2, 6), timestamp: Date.now(), @@ -107,12 +132,81 @@ export function useRecorder() { wordCount, sourceApp: context.appName, windowTitle: context.windowTitle, + context: { + // L0 + appName: context.appName, + windowTitle: context.windowTitle, + bundleId: context.bundleId, + url: context.url, + // L1 + selectedText: context.selectedText, + fieldText: context.fieldText, + fieldRole: context.fieldRole, + fieldRoleDescription: context.fieldRoleDescription, + fieldLabel: context.fieldLabel, + fieldPlaceholder: context.fieldPlaceholder, + cursorPosition: context.cursorPosition, + selectionRange: context.selectionRange, + numberOfCharacters: context.numberOfCharacters, + insertionLineNumber: context.insertionLineNumber, + // Clipboard & recent + clipboardText: context.clipboardText, + recentTranscriptions: context.recentTranscriptions, + // OCR + screenContext: context.screenContext, + // Don't save screenshot to history (too large for JSON config) + // Feature flags + contextL0Enabled: config.contextL0Enabled, + contextL1Enabled: config.contextL1Enabled, + contextOcrEnabled: config.contextOcrEnabled, + // Pipeline metadata + systemPrompt: result.systemPrompt, + sttProvider: 
result.sttProvider, + llmProvider: result.llmProvider, + sttModel: result.sttModel, + llmModel: result.llmModel, + // Timing + sttDurationMs: result.sttDurationMs, + llmDurationMs: result.llmDurationMs, + // Auto-learned terms + autoLearnedTerms: result.autoLearnedTerms, + }, }; addHistoryItem(item); } else if (result.skipped) { setState((s) => ({ ...s, status: 'idle', error: 'No speech detected' })); } else { - setState((s) => ({ ...s, status: 'idle', error: result.error || 'Processing failed' })); + const errorMsg = result.error || 'Processing failed'; + setState((s) => ({ ...s, status: 'idle', error: errorMsg })); + + // Save failed transcription to history + const failItem: HistoryItem = { + id: Date.now().toString(36) + Math.random().toString(36).slice(2, 6), + timestamp: Date.now(), + rawText: result.rawText || '', + processedText: '', + durationMs, + wordCount: 0, + error: errorMsg, + sourceApp: context.appName, + windowTitle: context.windowTitle, + context: { + appName: context.appName, + windowTitle: context.windowTitle, + bundleId: context.bundleId, + url: context.url, + contextL0Enabled: config.contextL0Enabled, + contextL1Enabled: config.contextL1Enabled, + contextOcrEnabled: config.contextOcrEnabled, + sttProvider: result.sttProvider, + llmProvider: result.llmProvider, + sttModel: result.sttModel, + llmModel: result.llmModel, + sttDurationMs: result.sttDurationMs, + llmDurationMs: result.llmDurationMs, + }, + }; + addHistoryItem(failItem); } } catch (e: any) { setState((s) => ({ ...s, status: 'idle', error: e.message })); diff --git a/src/i18n/locales/en.json b/src/i18n/locales/en.json index fa13aa7..82e6ab8 100644 --- a/src/i18n/locales/en.json +++ b/src/i18n/locales/en.json @@ -55,11 +55,38 @@ "finalOutput": "Final Output", "rawTranscription": "Raw Transcription", "error": "Error", - "contextPipeline": "Context Pipeline", + "contextCapture": "Context Capture", + "sttStage": "Speech-to-Text", + "llmStage": "LLM Post-processing", "systemPrompt": 
"System Prompt", "audio": "Audio Recording", "audioAvailable": "Audio file available for download", - "noOutput": "No output" + "noOutput": "No output", + "activeWindow": "Active Window", + "bundleId": "Bundle ID", + "browserUrl": "URL", + "focusedField": "Focused Input Field", + "selectedText": "Selected Text", + "fieldContent": "Field Content", + "screenOcr": "Screen Analysis (OCR)", + "disabled": "Disabled", + "noDataCaptured": "Enabled but no data captured", + "contextNotSaved": "No context data saved", + "showMore": "Show more", + "showLess": "Show less", + "provider": "Provider", + "model": "Model", + "clipboard": "Clipboard", + "clipboardContent": "Clipboard content at capture time", + "recentTranscriptions": "Recent Transcriptions", + "recentTranscriptionsDesc": "Previous transcriptions for continuity context", + "autoLearnedTerms": "Auto-learned terms", + "sttDuration": "STT Duration", + "llmDuration": "LLM Duration", + "cursorAt": "Cursor at position", + "line": "line", + "chars": "chars", + "selectedRange": "Selection" }, "dictionary": { "title": "Personal Dictionary", @@ -85,7 +112,8 @@ "listening": "Listening...", "processing": "Processing...", "transcribing": "Transcribing & polishing", - "pressHotkey": "Press hotkey to start" + "pressHotkey": "Press hotkey to start", + "copiedToClipboard": "Copied to clipboard" }, "feedback": { "title": "Feedback & Help", @@ -185,7 +213,10 @@ "whisperMode": "Whisper Mode", "enableWhisper": "Enable Whisper Mode", "whisperDesc": "Increased sensitivity for quiet environments — speak softly and still be heard", - "whisperSensitivity": "Whisper Sensitivity" + "whisperSensitivity": "Whisper Sensitivity", + "autoMute": "Auto-Mute", + "autoMuteToggle": "Mute system audio during recording", + "autoMuteDesc": "Automatically mute other audio while recording to reduce background noise interference" }, "personalization": { "enable": "Enable Personalization", @@ -283,7 +314,10 @@ "permissionGranted": "Permission granted", 
"grantPermission": "Grant Permission", "accessibilityHelp": "Open System Preferences → Privacy & Security → Accessibility, then add OpenType", - "screenHelp": "Open System Preferences → Privacy & Security → Screen Recording, then add OpenType" + "screenHelp": "Open System Preferences → Privacy & Security → Screen Recording, then add OpenType", + "autoLearn": "Auto-Learn Dictionary", + "autoLearnToggle": "Auto-learn terms from transcriptions", + "autoLearnDesc": "Automatically detect proper nouns, brand names, and technical terms from your transcriptions and add them to your personal dictionary" }, "update": { "title": "Software Update", diff --git a/src/i18n/locales/zh.json b/src/i18n/locales/zh.json index d80b6fd..39b460e 100644 --- a/src/i18n/locales/zh.json +++ b/src/i18n/locales/zh.json @@ -53,12 +53,40 @@ "downloadAudio": "下载音频", "detailTitle": "转录详情", "finalOutput": "最终输出", + "rawTranscription": "原始转录", "error": "错误", - "contextPipeline": "上下文流水线", + "contextCapture": "上下文捕获", + "sttStage": "语音转文本", + "llmStage": "LLM 后处理", "systemPrompt": "系统提示词", "audio": "音频录制", "audioAvailable": "可下载音频文件", - "noOutput": "无输出" + "noOutput": "无输出", + "activeWindow": "活动窗口", + "bundleId": "Bundle ID", + "browserUrl": "网页地址", + "focusedField": "焦点输入框", + "selectedText": "选中文本", + "fieldContent": "输入框内容", + "screenOcr": "屏幕分析 (OCR)", + "disabled": "未启用", + "noDataCaptured": "已启用但未捕获到数据", + "contextNotSaved": "无上下文数据", + "showMore": "展开", + "showLess": "收起", + "provider": "服务商", + "model": "模型", + "clipboard": "剪贴板", + "clipboardContent": "捕获时的剪贴板内容", + "recentTranscriptions": "最近转录", + "recentTranscriptionsDesc": "用于上下文连续性的近期转录记录", + "autoLearnedTerms": "自动学习词条", + "sttDuration": "STT 耗时", + "llmDuration": "LLM 耗时", + "cursorAt": "光标位于位置", + "line": "行", + "chars": "字符", + "selectedRange": "选区" }, "dictionary": { "title": "个人词典", @@ -84,7 +112,8 @@ "listening": "正在收听...", "processing": "处理中...", "transcribing": "正在转录和润色", - "pressHotkey": "按快捷键开始" + "pressHotkey": "按快捷键开始", + 
"copiedToClipboard": "已复制到剪贴板" }, "feedback": { "title": "反馈与帮助", @@ -185,7 +214,10 @@ "whisperMode": "耳语模式", "enableWhisper": "启用耳语模式", "whisperDesc": "提高安静环境下的灵敏度——轻声说话也能被识别", - "whisperSensitivity": "耳语灵敏度" + "whisperSensitivity": "耳语灵敏度", + "autoMute": "自动静音", + "autoMuteToggle": "录音时静音系统音频", + "autoMuteDesc": "录音时自动静音其他音频,减少背景噪音干扰" }, "personalization": { "enable": "启用个性化", @@ -283,7 +315,10 @@ "permissionGranted": "已授权", "grantPermission": "授予权限", "accessibilityHelp": "打开系统偏好设置 → 隐私与安全性 → 辅助功能,然后添加 OpenType", - "screenHelp": "打开系统偏好设置 → 隐私与安全性 → 屏幕录制,然后添加 OpenType" + "screenHelp": "打开系统偏好设置 → 隐私与安全性 → 屏幕录制,然后添加 OpenType", + "autoLearn": "自动学习词典", + "autoLearnToggle": "自动从转录中学习词条", + "autoLearnDesc": "自动检测转录中的专有名词、品牌名称和技术术语,并将其添加到个人词典中" }, "update": { "title": "软件更新", diff --git a/src/index.css b/src/index.css index d75c3bb..63bf567 100644 --- a/src/index.css +++ b/src/index.css @@ -10,11 +10,11 @@ text-rendering: optimizeLegibility; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; - --slider-track: #d4d4d8; + --slider-track: #d4d1cd; } .dark { - --slider-track: #3f3f46; + --slider-track: #3d3a36; } *, *::before, *::after { @@ -31,13 +31,13 @@ html, body, #root { /* Light mode base */ body { background: #ffffff; - color: #09090b; + color: #0c0b0a; } /* Dark mode base */ .dark body { - background: #09090b; - color: #f4f4f5; + background: #0c0b0a; + color: #f5f3f0; } /* Scrollbar — dark */ @@ -53,12 +53,12 @@ body { ::-webkit-scrollbar-thumb:hover { background: rgba(0,0,0,0.2); } /* Selection */ -::selection { background: rgba(99,102,241,0.2); } -.dark ::selection { background: rgba(99,102,241,0.3); } +::selection { background: rgba(59,130,246,0.2); } +.dark ::selection { background: rgba(59,130,246,0.3); } /* Focus */ :focus:not(:focus-visible) { outline: none; } -:focus-visible { outline: 2px solid rgba(99,102,241,0.5); outline-offset: 2px; border-radius: 4px; } +:focus-visible { outline: 2px solid rgba(59,130,246,0.5); outline-offset: 
2px; border-radius: 4px; } /* Electron drag region */ .drag-region { -webkit-app-region: drag; } diff --git a/src/pages/HistoryPage.tsx b/src/pages/HistoryPage.tsx index 2a65485..e159c82 100644 --- a/src/pages/HistoryPage.tsx +++ b/src/pages/HistoryPage.tsx @@ -85,13 +85,13 @@ export function HistoryPage() { return new Date(ts).toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' }); }; - // Detail panel - if (selectedItem) { - return setSelectedItem(null)} t={t} />; - } - return (
+ {/* Detail modal overlay */} + {selectedItem && ( + setSelectedItem(null)} t={t} /> + )} + void; t: (key: string) => string }) { +function DetailModal({ item, onClose, t }: { item: HistoryItem; onClose: () => void; t: (key: string) => string }) { const ctx = item.context; + const isError = !!item.error; + const [expandedPrompt, setExpandedPrompt] = useState(false); + const [expandedField, setExpandedField] = useState(false); - return ( -
-
- -
-

{t('history.detailTitle')}

-

- {new Date(item.timestamp).toLocaleString()} {item.sourceApp && `· ${item.sourceApp}`} -

-
-
+ const hasAnyContext = ctx && (ctx.appName || ctx.selectedText || ctx.fieldText || ctx.screenContext || ctx.url || ctx.clipboardText || ctx.recentTranscriptions?.length); + const contextStatus = ctx ? (hasAnyContext ? 'success' : 'partial') : 'skipped'; -
- {/* Final output */} -
-

- {item.processedText || {t('history.noOutput')}} -

-
- - {/* Raw transcription */} -
-

- {item.rawText || {t('history.noOutput')}} -

-
- - {/* Error if any */} - {item.error && ( -
-

{item.error}

-
- )} + const formatDuration = (ms?: number) => { + if (!ms) return null; + return ms < 1000 ? `${ms}ms` : `${(ms / 1000).toFixed(1)}s`; + }; - {/* Context pipeline */} - {ctx && ( -
-
- - - - {ctx.screenshotDataUrl && ( -
-

Screenshot

- Screen capture -
+ return ( +
+ {/* Backdrop */} +
+ + {/* Modal */} +
+ {/* Header */} +
+
+
+

{t('history.detailTitle')}

+ {isError && ( + + {t('history.error')} + )}
-
- )} +

+ {new Date(item.timestamp).toLocaleString()} + {item.sourceApp && ` · ${item.sourceApp}`} + {item.windowTitle && ` · ${item.windowTitle}`} +

+
+ {/* Stats badges */} +
+ + {item.wordCount} {t('dashboard.wordsUnit')} + + + {(item.durationMs / 1000).toFixed(1)}s + +
+ +
- {/* System prompt sent to LLM */} - {ctx?.systemPrompt && ( -
-
-              {ctx.systemPrompt}
-            
-
- )} + {/* Body */} +
+ {/* ═══ Step 1: Context Capture ═══ */} + + {ctx ? ( +
+ + {ctx.appName && ( +
+
+ {ctx.appName} + {ctx.bundleId && ({ctx.bundleId})} +
+ {ctx.windowTitle &&

{ctx.windowTitle}

} + {ctx.url &&

{ctx.url}

} +
+ )} +
- {/* Audio */} - {item.audioBase64 && ( -
-

{t('history.audioAvailable')}

-
- )} + + {/* Field metadata badges */} +
+ {(ctx.fieldRoleDescription || ctx.fieldRole) && ( + {ctx.fieldRoleDescription || ctx.fieldRole} + )} + {ctx.fieldLabel && ( + {ctx.fieldLabel} + )} + {ctx.fieldPlaceholder && ( + “{ctx.fieldPlaceholder}” + )} +
+ {/* Cursor / selection position info */} + {ctx.cursorPosition !== undefined && ( +

+ {t('history.cursorAt')} {ctx.cursorPosition} + {ctx.insertionLineNumber !== undefined && ` (${t('history.line')} ${ctx.insertionLineNumber})`} + {ctx.numberOfCharacters !== undefined && ` / ${ctx.numberOfCharacters} ${t('history.chars')}`} +

+ )} + {ctx.selectionRange && ctx.selectionRange.length > 0 && ( +

+ {t('history.selectedRange')}: {ctx.selectionRange.location}..{ctx.selectionRange.location + ctx.selectionRange.length} ({ctx.selectionRange.length} {t('history.chars')}) +

+ )} + {ctx.selectedText && ( +
+

{t('history.selectedText')}

+

{ctx.selectedText}

+
+ )} + {ctx.fieldText && ctx.fieldText !== ctx.selectedText && ( +
+

{t('history.fieldContent')}

+
+ {ctx.fieldText.length > 300 && !expandedField + ? <>{ctx.fieldText.slice(0, 300)}... + : ctx.fieldText} + {expandedField && ctx.fieldText.length > 300 && ( + + )} +
+
+ )} +
+ + + {ctx.screenContext && ( +

{ctx.screenContext}

+ )} +
+ + + {ctx.clipboardText && ( +

{ctx.clipboardText}

+ )} +
- {/* Stats */} -
- {item.wordCount} {t('dashboard.wordsUnit')} - {(item.durationMs / 1000).toFixed(1)}s - {item.language && {item.language}} + + {ctx.recentTranscriptions && ctx.recentTranscriptions.length > 0 && ( +
+ {ctx.recentTranscriptions.map((text, i) => ( +

{text}

+ ))} +
+ )} +
+
+ ) : ( +

{t('history.contextNotSaved')}

+ )} + + + {/* ═══ Step 2: STT ═══ */} + + {item.rawText ? ( +

{item.rawText}

+ ) : ( +

{isError ? item.error : t('history.noOutput')}

+ )} +
+ + {/* ═══ Step 3: LLM Post-processing ═══ */} + + {item.processedText ? ( +

{item.processedText}

+ ) : isError ? ( +
+ +

{item.error}

+
+ ) : ( +

{t('history.noOutput')}

+ )} + + {/* Auto-learned terms */} + {ctx?.autoLearnedTerms && ctx.autoLearnedTerms.length > 0 && ( +
+ {t('history.autoLearnedTerms')}: + {ctx.autoLearnedTerms.map((term, i) => ( + {term} + ))} +
+ )} + + {/* System prompt (collapsible) */} + {ctx?.systemPrompt && ( +
+ + {expandedPrompt && ( +
{ctx.systemPrompt}
+ )} +
+ )} +
); } -function Section({ title, icon, children }: { title: string; icon: string; children: React.ReactNode }) { - const iconMap: Record = { - output: , - mic: , - error: , - context: , - prompt: , - audio: , +function PipelineStep({ number, title, status, isLast, meta, duration, children }: { + number: number; + title: string; + status: 'success' | 'error' | 'partial' | 'skipped'; + isLast: boolean; + meta?: string; + duration?: string | null; + children: React.ReactNode; +}) { + const statusColors = { + success: 'bg-green-500 border-green-500', + error: 'bg-red-500 border-red-500', + partial: 'bg-amber-500 border-amber-500', + skipped: 'bg-surface-300 dark:bg-surface-700 border-surface-300 dark:border-surface-700', }; return ( -
-
- {iconMap[icon]} -

{title}

+
+ {/* Timeline */} +
+
+ {status === 'success' ? ( + + ) : status === 'error' ? ( + + ) : ( + {number} + )} +
+ {!isLast && ( +
+ )} +
+ + {/* Content */} +
+
+

+ {title} +

+ {meta && ( + {meta} + )} + {duration && ( + {duration} + )} +
+ {children}
-
{children}
); } -function ContextRow({ label, enabled, value }: { label: string; enabled?: boolean; value?: string }) { +/** Renders a context data section with enable/disable status indicator */ +function ContextSection({ title, enabled, hasData, t, children }: { + title: string; + enabled?: boolean; + hasData: boolean; + t: (key: string) => string; + children: React.ReactNode; +}) { + const isDisabled = enabled === false || enabled === undefined; + return ( -
- {enabled === false || enabled === undefined ? ( - - - - ) : value ? ( - - - - ) : ( - - +
+
+ {isDisabled ? ( + + ) : hasData ? ( + + ) : ( + + )} + + {title} - )} -
-

{label}

- {value && ( -

{value}

+ {isDisabled && ( + {t('history.disabled')} )} - {enabled && !value && ( -

No data captured

+ {!isDisabled && !hasData && ( + {t('history.noDataCaptured')} )}
+ {!isDisabled && hasData &&
{children}
}
); } diff --git a/src/pages/OverlayPage.tsx b/src/pages/OverlayPage.tsx index 95b6c38..1c444fe 100644 --- a/src/pages/OverlayPage.tsx +++ b/src/pages/OverlayPage.tsx @@ -1,10 +1,11 @@ -import { useEffect } from 'react'; +import { useEffect, useState } from 'react'; import { useRecorder } from '../hooks/useRecorder'; import { useTranslation } from '../i18n'; export function OverlayPage() { const rec = useRecorder(); const { t } = useTranslation(); + const [copied, setCopied] = useState(false); useEffect(() => { if (!window.electronAPI) return; @@ -21,95 +22,147 @@ export function OverlayPage() { } }; - // Auto-hide after processing completes + const handleCopy = async () => { + if (!rec.processedText) return; + try { + if (window.electronAPI) await window.electronAPI.writeClipboard(rec.processedText); + else await navigator.clipboard.writeText(rec.processedText); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + } catch {} + }; + + const handleDismiss = () => { + window.electronAPI?.hideOverlay(); + }; + + // Auto-hide after successful output (no fallback needed) useEffect(() => { - if (rec.status === 'idle' && (rec.processedText || rec.error)) { - const timer = setTimeout(() => window.electronAPI?.hideOverlay(), 1500); + if (rec.status === 'idle' && rec.processedText && !rec.outputFailed) { + const timer = setTimeout(() => window.electronAPI?.hideOverlay(), 1200); return () => clearTimeout(timer); } - }, [rec.status, rec.processedText, rec.error]); + if (rec.status === 'idle' && rec.error) { + const timer = setTimeout(() => window.electronAPI?.hideOverlay(), 3000); + return () => clearTimeout(timer); + } + }, [rec.status, rec.processedText, rec.error, rec.outputFailed]); + + // Reset copied state on new recording + useEffect(() => { + if (rec.status === 'recording') setCopied(false); + }, [rec.status]); const level = rec.audioLevel; - return ( -
-
+ // Show expanded fallback when output failed (text + copy button) + const showFallback = rec.status === 'idle' && rec.processedText && rec.outputFailed; - {/* Left: Cancel button */} - + +
+
+
+ ) : ( + /* ── Normal pill: recording / processing / result ── */ +
- - - - + {/* Left: Cancel button */} + - {/* Center: Waveform + status */} -
- {rec.status === 'recording' ? ( - <> + {/* Center: Waveform / status */} +
+ {rec.status === 'recording' ? (
{Array.from({ length: 7 }).map((_, i) => { const center = 3; const dist = Math.abs(i - center); const base = Math.max(0.15, 1 - dist * 0.2); - const h = Math.max(3, level * 20 * base); + const h = Math.max(4, level * 20 * base); return (
); })}
-

- {t('overlay.listening')} -

- - ) : rec.status === 'processing' ? ( - <> -
-

- {t('overlay.processing')} -

- - ) : rec.processedText ? ( -

- ✓ {t('common.success')} -

- ) : rec.error ? ( -

- {rec.error} -

- ) : ( -

- {t('overlay.pressHotkey')} -

- )} -
+ ) : rec.status === 'processing' ? ( +
+
+ {t('overlay.processing')} +
+ ) : rec.processedText ? ( + + ✓ + + ) : rec.error ? ( + + {rec.error} + + ) : null} +
- {/* Right: Confirm button */} - -
+ {/* Right: Confirm button */} + +
+ )}
); } diff --git a/src/pages/settings/AudioSettings.tsx b/src/pages/settings/AudioSettings.tsx index e811c86..0b70e2c 100644 --- a/src/pages/settings/AudioSettings.tsx +++ b/src/pages/settings/AudioSettings.tsx @@ -82,6 +82,19 @@ export function AudioSettings() { description={t('settings.audio.endSoundDesc')} />
+ + {/* Auto-mute */} +
+

+ {t('settings.audio.autoMute')} +

+ set('autoMuteOnRecord', v)} + label={t('settings.audio.autoMuteToggle')} + description={t('settings.audio.autoMuteDesc')} + /> +
); } diff --git a/src/pages/settings/ContextSettings.tsx b/src/pages/settings/ContextSettings.tsx index 45c0c3f..366c5a9 100644 --- a/src/pages/settings/ContextSettings.tsx +++ b/src/pages/settings/ContextSettings.tsx @@ -68,6 +68,20 @@ export function ContextSettings() { )}
+ {/* Auto-Learn Dictionary */} +
+
+ +

{t('context.autoLearn')}

+
+ set('autoLearnDictionary', v)} + label={t('context.autoLearnToggle')} + description={t('context.autoLearnDesc')} + /> +
+ {/* Screen OCR */}
diff --git a/src/pages/settings/ProviderSettings.tsx b/src/pages/settings/ProviderSettings.tsx index 54e4642..bb2449a 100644 --- a/src/pages/settings/ProviderSettings.tsx +++ b/src/pages/settings/ProviderSettings.tsx @@ -208,7 +208,7 @@ function CategorySection({ icon, title, tooltip, badge, children }: { onMouseLeave={() => setShowTooltip(false)} className="text-surface-400 hover:text-surface-600 dark:hover:text-surface-300 transition-colors" > - + {showTooltip && (
diff --git a/src/services/pipeline.ts b/src/services/pipeline.ts index ff6a745..3de3fa8 100644 --- a/src/services/pipeline.ts +++ b/src/services/pipeline.ts @@ -13,12 +13,21 @@ export interface PipelineResult { processedText: string; skipped?: boolean; error?: string; + // Pipeline metadata for history + systemPrompt?: string; + sttProvider?: string; + llmProvider?: string; + sttModel?: string; + llmModel?: string; + sttDurationMs?: number; + llmDurationMs?: number; + autoLearnedTerms?: string[]; } export async function runPipeline( audioBuffer: ArrayBuffer, config: AppConfig, - context?: { appName?: string }, + context?: Record, ): Promise { // If Electron is available, delegate the whole pipeline to main process if (window.electronAPI) { @@ -29,34 +38,43 @@ export async function runPipeline( processedText: r.processedText ?? '', skipped: r.skipped, error: r.error, + systemPrompt: r.systemPrompt, + sttProvider: r.sttProvider, + llmProvider: r.llmProvider, + sttModel: r.sttModel, + llmModel: r.llmModel, + sttDurationMs: r.sttDurationMs, + llmDurationMs: r.llmDurationMs, + autoLearnedTerms: r.autoLearnedTerms, }; } - // Browser-mode pipeline + // Browser-mode pipeline (no timing / auto-learn in browser) console.log('[Pipeline] Stage 1: STT...'); + const sttStart = Date.now(); const stt = await transcribeAudio(audioBuffer, config, { language: config.inputLanguage, }); + const sttDurationMs = Date.now() - sttStart; if (!stt.success) { - return { success: false, rawText: '', processedText: '', error: stt.error }; + return { success: false, rawText: '', processedText: '', error: stt.error, sttDurationMs }; } const rawText = stt.text ?? 
''; if (!rawText.trim()) { - return { success: true, rawText: '', processedText: '', skipped: true }; + return { success: true, rawText: '', processedText: '', skipped: true, sttDurationMs }; } - console.log('[Pipeline] Stage 1 result:', rawText); console.log('[Pipeline] Stage 2: LLM post-processing...'); - + const llmStart = Date.now(); const llm = await processText(rawText, config, context); + const llmDurationMs = Date.now() - llmStart; + if (!llm.success) { - return { success: false, rawText, processedText: '', error: llm.error }; + return { success: false, rawText, processedText: '', error: llm.error, sttDurationMs, llmDurationMs }; } const processedText = llm.text ?? rawText; - console.log('[Pipeline] Stage 2 result:', processedText); - - return { success: true, rawText, processedText }; + return { success: true, rawText, processedText, sttDurationMs, llmDurationMs }; } diff --git a/src/types/config.ts b/src/types/config.ts index e5f7af4..dd74e00 100644 --- a/src/types/config.ts +++ b/src/types/config.ts @@ -79,15 +79,52 @@ export interface ToneRule { export type HistoryRetention = 'forever' | '30d' | '7d' | '24h' | '1h'; export interface HistoryContext { + // L0: Basic window info (no special permissions) appName?: string; windowTitle?: string; - selectedText?: string; - screenContext?: string; - screenshotDataUrl?: string; + bundleId?: string; // macOS bundle identifier + url?: string; // browser URL if applicable + + // L1: Accessibility data (requires accessibility permission) + selectedText?: string; // AXSelectedText + fieldText?: string; // AXValue — full content of focused input field + fieldRole?: string; // AXRole — TextField, TextArea, WebArea, etc. + fieldRoleDescription?: string; // AXRoleDescription — "text field", "search field", "text area" + fieldLabel?: string; // AXDescription or AXTitle — field's accessible label + fieldPlaceholder?: string; // AXPlaceholderValue — "Type a message...", "Search..." 
+ cursorPosition?: number; // cursor position (from AXSelectedTextRange when length=0) + selectionRange?: { location: number; length: number }; // AXSelectedTextRange + numberOfCharacters?: number; // AXNumberOfCharacters — total chars in field + insertionLineNumber?: number; // AXInsertionPointLineNumber — cursor line number + + // Clipboard + clipboardText?: string; // clipboard content at capture time + + // Recent transcriptions (last few for continuity context) + recentTranscriptions?: string[]; + + // OCR: Screen analysis + screenContext?: string; // VLM description of screen content + screenshotDataUrl?: string; // screenshot thumbnail + + // Feature flags at capture time contextL0Enabled?: boolean; contextL1Enabled?: boolean; contextOcrEnabled?: boolean; - systemPrompt?: string; + + // LLM pipeline + systemPrompt?: string; // the system prompt sent to LLM + sttProvider?: string; // which STT provider was used + llmProvider?: string; // which LLM provider was used + sttModel?: string; // STT model name + llmModel?: string; // LLM model name + + // Pipeline timing + sttDurationMs?: number; // how long STT took + llmDurationMs?: number; // how long LLM post-processing took + + // Auto-learned dictionary terms + autoLearnedTerms?: string[]; // terms auto-added in this transcription } export interface HistoryItem { @@ -174,6 +211,12 @@ export interface AppConfig { contextOcrEnabled: boolean; // Screen OCR via VLM contextOcrModel: string; // VLM model for OCR + // Audio behavior + autoMuteOnRecord: boolean; // mute system audio during recording + + // Auto-learning + autoLearnDictionary: boolean; // auto-add corrected terms to dictionary + // Personal dictionary personalDictionary: string[]; @@ -251,6 +294,9 @@ export const DEFAULT_CONFIG: AppConfig = { contextOcrEnabled: false, contextOcrModel: 'Qwen/Qwen2-VL-7B-Instruct', + autoMuteOnRecord: false, + autoLearnDictionary: true, + personalDictionary: [], history: [], diff --git a/src/types/electron.d.ts 
b/src/types/electron.d.ts index 858b449..408309e 100644 --- a/src/types/electron.d.ts +++ b/src/types/electron.d.ts @@ -9,6 +9,14 @@ export interface PipelineResult { processedText?: string; skipped?: boolean; error?: string; + systemPrompt?: string; + sttProvider?: string; + llmProvider?: string; + sttModel?: string; + llmModel?: string; + sttDurationMs?: number; + llmDurationMs?: number; + autoLearnedTerms?: string[]; } export interface APITestResult { @@ -60,11 +68,15 @@ export interface ElectronAPI { // ─── Clipboard ──────────────────────────────────────── writeClipboard: (text: string) => Promise; + // ─── Type at cursor ────────────────────────────────── + typeAtCursor: (text: string) => Promise<{ success: boolean; error?: string }>; + // ─── Window controls ───────────────────────────────── minimize: () => Promise; maximize: () => Promise; close: () => Promise; hideOverlay: () => Promise; + resizeOverlay: (w: number, h: number) => Promise; // ─── API testing ────────────────────────────────────── testAPI: (provider: string) => Promise; @@ -88,8 +100,22 @@ export interface ElectronAPI { getLastContext: () => Promise<{ appName?: string; windowTitle?: string; + bundleId?: string; + url?: string; selectedText?: string; + fieldText?: string; + fieldRole?: string; + fieldRoleDescription?: string; + fieldLabel?: string; + fieldPlaceholder?: string; + cursorPosition?: number; + selectionRange?: { location: number; length: number }; + numberOfCharacters?: number; + insertionLineNumber?: number; + clipboardText?: string; + recentTranscriptions?: string[]; screenContext?: string; + screenshotDataUrl?: string; }>; checkAccessibility: () => Promise<'granted' | 'not-determined'>; requestAccessibility: () => Promise; @@ -100,6 +126,7 @@ export interface ElectronAPI { onToggleRecording: (callback: () => void) => () => void; onRecordingState: (callback: (state: string) => void) => () => void; onNavigate: (callback: (page: string) => void) => () => void; + 
onDictionaryAutoAdded: (callback: (words: string[]) => void) => () => void; } declare global { diff --git a/src/utils/wordCount.ts b/src/utils/wordCount.ts new file mode 100644 index 0000000..967a1b5 --- /dev/null +++ b/src/utils/wordCount.ts @@ -0,0 +1,45 @@ +/** + * Universal word/character counting for mixed CJK + Latin text. + * + * CJK characters (Chinese, Japanese Kanji, Korean Hanja) each count as 1 "word" + * since they are ideographic and each character carries meaning. + * Latin-based words are split by whitespace as usual. + * Mixed content sums both counts. + */ + +// CJK Unicode ranges +const CJK_REGEX = /[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af\u3000-\u303f\uff00-\uffef]/g; + +/** + * Count words in mixed-language text. + * CJK characters = 1 word each, Latin words = split by whitespace. + */ +export function countWords(text: string): number { + if (!text?.trim()) return 0; + + // Count CJK characters + const cjkMatches = text.match(CJK_REGEX); + const cjkCount = cjkMatches ? cjkMatches.length : 0; + + // Remove CJK chars + CJK punctuation, count remaining Latin-like words + const latinOnly = text.replace(CJK_REGEX, ' '); + const latinWords = latinOnly.split(/\s+/).filter((w) => w.length > 0 && /[a-zA-Z0-9]/.test(w)); + + return cjkCount + latinWords.length; +} + +/** + * Format word count with appropriate unit for display. + * Returns e.g. "123 字" for Chinese, "45 words" for English, "20 字 + 5 words" for mixed. + */ +export function formatWordCount(text: string): { count: number; cjk: number; latin: number } { + if (!text?.trim()) return { count: 0, cjk: 0, latin: 0 }; + + const cjkMatches = text.match(CJK_REGEX); + const cjk = cjkMatches ? 
cjkMatches.length : 0; + + const latinOnly = text.replace(CJK_REGEX, ' '); + const latin = latinOnly.split(/\s+/).filter((w) => w.length > 0 && /[a-zA-Z0-9]/.test(w)).length; + + return { count: cjk + latin, cjk, latin }; +} diff --git a/tailwind.config.cjs b/tailwind.config.cjs index ef168cc..635a6a2 100644 --- a/tailwind.config.cjs +++ b/tailwind.config.cjs @@ -6,17 +6,17 @@ module.exports = { extend: { colors: { brand: { - 50: '#eef2ff', - 100: '#e0e7ff', - 200: '#c7d2fe', - 300: '#a5b4fc', - 400: '#818cf8', - 500: '#6366f1', - 600: '#4f46e5', - 700: '#4338ca', - 800: '#3730a3', - 900: '#312e81', - 950: '#1e1b4b', + 50: '#eff6ff', + 100: '#dbeafe', + 200: '#bfdbfe', + 300: '#93c5fd', + 400: '#60a5fa', + 500: '#3b82f6', + 600: '#2563eb', + 700: '#1d4ed8', + 800: '#1e40af', + 900: '#1e3a8a', + 950: '#172554', }, surface: { 0: '#ffffff', @@ -24,14 +24,14 @@ module.exports = { 100: '#f5f3f0', 200: '#e8e5e1', 300: '#d4d1cd', - 400: '#a1a1aa', - 500: '#71717a', - 600: '#52525b', - 700: '#3f3f46', - 800: '#27272a', - 850: '#1f1f23', - 900: '#18181b', - 950: '#09090b', + 400: '#a8a5a0', + 500: '#78756f', + 600: '#57544e', + 700: '#3d3a36', + 800: '#282624', + 850: '#201e1c', + 900: '#181715', + 950: '#0c0b0a', }, }, animation: {