diff --git a/ExecuWhisper/.gitignore b/ExecuWhisper/.gitignore new file mode 100644 index 0000000000..3313a77610 --- /dev/null +++ b/ExecuWhisper/.gitignore @@ -0,0 +1,6 @@ +ExecuWhisper.xcodeproj/ +docs/superpowers/ + +# Local-only dictation samples and prompt-quality corpus. +test_audio/ +evaluation/ diff --git a/ExecuWhisper/CHANGELOG.md b/ExecuWhisper/CHANGELOG.md new file mode 100644 index 0000000000..05e01e9aeb --- /dev/null +++ b/ExecuWhisper/CHANGELOG.md @@ -0,0 +1,7 @@ +# ExecuWhisper Changelog + +## Internal DMG - 2026-04-30 + +- Added production-readiness hardening for audio capture, smart formatter validation, release signing, and internal DMG verification. +- Lightweight DMG SHA256: + `e01401325119cd76df3d32c172a78b610bd814244cb07dc42431bd64af862dfc` diff --git a/ExecuWhisper/ExecuWhisper/ExecuWhisper.entitlements b/ExecuWhisper/ExecuWhisper/ExecuWhisper.entitlements new file mode 100644 index 0000000000..0c8b3b6f3c --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/ExecuWhisper.entitlements @@ -0,0 +1,12 @@ + + + + + com.apple.security.cs.disable-library-validation + + com.apple.security.device.audio-input + + com.apple.security.network.client + + + diff --git a/ExecuWhisper/ExecuWhisper/ExecuWhisperApp.swift b/ExecuWhisper/ExecuWhisper/ExecuWhisperApp.swift new file mode 100644 index 0000000000..6e4f3660ad --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/ExecuWhisperApp.swift @@ -0,0 +1,159 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+import AppKit
+import SwiftUI
+
+/// Application entry point for ExecuWhisper.
+///
+/// `init()` builds one instance of each shared model object and stores it in
+/// `@State`. `body` injects those instances into the main window and the
+/// Settings scene and declares the "Transcription" and "Dictation" menus.
+@main
+struct ExecuWhisperApp: App {
+    // Declared without default values on purpose: `init()` assigns every
+    // wrapper, so a default such as `= Preferences()` would only construct a
+    // second instance that is discarded as soon as `init()` runs.
+    @State private var preferences: Preferences
+    @State private var downloader: ModelDownloader
+    @State private var replacementStore: ReplacementStore
+    @State private var store: TranscriptStore
+    @State private var dictationManager: DictationManager
+
+    /// Wires the object graph: preferences, downloader and replacement store
+    /// feed the text pipeline and the transcript store, which feed the
+    /// dictation manager.
+    init() {
+        let prefs = Preferences()
+        let downloader = ModelDownloader()
+        let replacementStore = ReplacementStore()
+        let formatterBridge = FormatterBridge()
+        // The trailing closure reads the formatter paths from `prefs` at the
+        // time it is invoked.
+        let textPipeline = TextPipeline(
+            replacementStore: replacementStore,
+            formatterBridge: formatterBridge
+        ) {
+            TextPipeline.FormatterPaths(
+                runnerPath: prefs.formatterRunnerPath,
+                modelPath: prefs.formatterModelPath,
+                tokenizerPath: prefs.formatterTokenizerPath,
+                tokenizerConfigPath: prefs.formatterTokenizerConfigPath
+            )
+        }
+        let store = TranscriptStore(
+            preferences: prefs,
+            downloader: downloader,
+            textPipeline: textPipeline
+        )
+        let dictationManager = DictationManager(store: store, preferences: prefs)
+        _preferences = State(initialValue: prefs)
+        _downloader = State(initialValue: downloader)
+        _replacementStore = State(initialValue: replacementStore)
+        _store = State(initialValue: store)
+        _dictationManager = State(initialValue: dictationManager)
+    }
+
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+                .environment(store)
+                .environment(preferences)
+                .environment(downloader)
+                .environment(replacementStore)
+                .environment(dictationManager)
+                .frame(minWidth: 700, minHeight: 460)
+                // Re-run the health check every time the app becomes active.
+                .onReceive(NotificationCenter.default.publisher(for: NSApplication.didBecomeActiveNotification)) { _ in
+                    Task { await store.runHealthCheck() }
+                }
+        }
+        .defaultSize(width: 960, height: 640)
+        .windowToolbarStyle(.unified)
+        .commands {
+            // Replace the default "New" item group with nothing.
+            CommandGroup(replacing: .newItem) {}
+
+            CommandMenu("Transcription") {
+                // One menu slot follows the session state; start and stop
+                // share the same Shift-Command-R shortcut.
+                switch store.sessionState {
+                case .idle:
+                    Button("Start Recording") {
+                        Task { await store.startRecording() }
+                    }
+                    .keyboardShortcut("R", modifiers: [.command, .shift])
+                    .disabled(!store.isModelReady)
+
+                case .recording:
+                    Button("Stop and Transcribe") {
+                        Task { await store.stopRecordingAndTranscribe() }
+                    }
+                    .keyboardShortcut("R", modifiers: [.command, .shift])
+
+                case .transcribing:
+                    Button("Transcribing...") {}
+                        .disabled(true)
+                }
+
+                Button("Import Audio...") {
+                    store.importAudioFileWithPanel()
+                }
+                .disabled(store.hasActiveSession || downloader.isDownloading)
+
+                if store.healthResult?.shouldOfferModelDownload == true && !downloader.isDownloading {
+                    Divider()
+                    Button("Download Model") {
+                        Task { await store.downloadModel() }
+                    }
+                }
+
+                // Helper residency controls are shown only when resources are
+                // ready and no session is active.
+                if store.resourcesReady && !store.hasActiveSession {
+                    Divider()
+                    switch store.helperState {
+                    case .unloaded:
+                        Button("Preload Model") {
+                            Task { await store.preloadModel() }
+                        }
+                        .keyboardShortcut("L", modifiers: [.command, .shift])
+
+                    case .loading:
+                        Button("Warming Model...") {}
+                            .disabled(true)
+
+                    case .warm:
+                        Button("Unload Model") {
+                            Task { await store.unloadModel() }
+                        }
+                        .keyboardShortcut("U", modifiers: [.command, .shift])
+
+                    case .failed:
+                        Button("Retry Preload") {
+                            Task { await store.preloadModel() }
+                        }
+                    }
+                }
+
+                Divider()
+
+                Button("Copy Transcript") {
+                    let text = currentTranscript
+                    guard !text.isEmpty else { return }
+                    NSPasteboard.general.clearContents()
+                    NSPasteboard.general.setString(text, forType: .string)
+                }
+                .keyboardShortcut("C", modifiers: [.command, .shift])
+                .disabled(currentTranscript.isEmpty)
+            }
+
+            CommandMenu("Dictation") {
+                Button(dictationManager.isListening ? "Stop Dictation" : "Start Dictation") {
+                    Task { await dictationManager.toggle() }
+                }
+                .disabled(store.isTranscribing)
+            }
+        }
+
+        Settings {
+            SettingsView(usesFixedWindowSize: true)
+                .environment(preferences)
+                .environment(dictationManager)
+        }
+    }
+
+    /// The text "Copy Transcript" acts on: the live transcript while a session
+    /// is active, otherwise the transcript of the selected session, or an
+    /// empty string when no session is selected or the selection is not found.
+    private var currentTranscript: String {
+        if store.hasActiveSession {
+            return store.liveTranscript
+        }
+        guard let id = store.selectedSessionID else { return "" }
+        return store.sessions.first(where: { $0.id == id })?.transcript ?? ""
+    }
+}
diff --git a/ExecuWhisper/ExecuWhisper/Info.plist b/ExecuWhisper/ExecuWhisper/Info.plist
new file mode 100644
index 0000000000..eb385a63d1
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Info.plist
@@ -0,0 +1,8 @@
+
+
+
+
+ NSMicrophoneUsageDescription
+ ExecuWhisper needs microphone access to record audio for on-device transcription.
+
+
diff --git a/ExecuWhisper/ExecuWhisper/Models/DictationShortcut.swift b/ExecuWhisper/ExecuWhisper/Models/DictationShortcut.swift
new file mode 100644
index 0000000000..7cf301bc54
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Models/DictationShortcut.swift
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import AppKit
+import Carbon.HIToolbox
+import Foundation
+
+/// A dictation hotkey: a virtual key code, a Carbon modifier bit mask, and a
+/// human-readable label for the key.
+struct DictationShortcut: Codable, Equatable, Sendable {
+    var keyCode: UInt32
+    var carbonModifiers: UInt32
+    var keyDisplay: String
+
+    /// Control + Space.
+    static let controlSpace = DictationShortcut(
+        keyCode: UInt32(kVK_Space),
+        carbonModifiers: UInt32(controlKey),
+        keyDisplay: "Space"
+    )
+
+    init(keyCode: UInt32, carbonModifiers: UInt32, keyDisplay: String) {
+        self.keyCode = keyCode
+        self.carbonModifiers = carbonModifiers
+        self.keyDisplay = keyDisplay
+    }
+
+    /// Builds a shortcut from a key event.
+    ///
+    /// Returns `nil` when the event carries none of Control, Option, Shift or
+    /// Command, or when no display label can be derived for the key.
+    init?(event: NSEvent) {
+        let modifierBits = Self.carbonModifiers(from: event.modifierFlags)
+        if modifierBits == 0 { return nil }
+        guard let label = Self.keyDisplay(for: event) else { return nil }
+        self.init(
+            keyCode: UInt32(event.keyCode),
+            carbonModifiers: modifierBits,
+            keyDisplay: label
+        )
+    }
+
+    /// Modifier glyphs in the fixed order Control, Option, Shift, Command,
+    /// followed by `keyDisplay`.
+    var displayString: String {
+        let glyphTable: [(mask: UInt32, glyph: String)] = [
+            (UInt32(controlKey), "⌃"),
+            (UInt32(optionKey), "⌥"),
+            (UInt32(shiftKey), "⇧"),
+            (UInt32(cmdKey), "⌘"),
+        ]
+        let glyphs = glyphTable
+            .filter { carbonModifiers & $0.mask != 0 }
+            .map { $0.glyph }
+            .joined()
+        return glyphs + keyDisplay
+    }
+
+    /// Converts AppKit modifier flags to the equivalent Carbon modifier mask.
+    /// Only Control, Option, Shift and Command are translated.
+    static func carbonModifiers(from flags: NSEvent.ModifierFlags) -> UInt32 {
+        let relevant = flags.intersection(.deviceIndependentFlagsMask)
+        let translation: [(flag: NSEvent.ModifierFlags, bits: UInt32)] = [
+            (.control, UInt32(controlKey)),
+            (.option, UInt32(optionKey)),
+            (.shift, UInt32(shiftKey)),
+            (.command, UInt32(cmdKey)),
+        ]
+        return translation.reduce(UInt32(0)) { mask, entry in
+            relevant.contains(entry.flag) ? mask | entry.bits : mask
+        }
+    }
+
+    /// Label for the event's key: a fixed name for the special keys listed
+    /// below, otherwise the upper-cased, whitespace-trimmed characters of the
+    /// event ignoring modifiers. Returns `nil` when that text is missing or empty.
+    private static func keyDisplay(for event: NSEvent) -> String? {
+        let namedKeys: [Int: String] = [
+            kVK_Space: "Space",
+            kVK_Return: "Return",
+            kVK_Tab: "Tab",
+            kVK_Delete: "Delete",
+            kVK_ForwardDelete: "Fn-Delete",
+            kVK_Escape: "Esc",
+            kVK_LeftArrow: "Left",
+            kVK_RightArrow: "Right",
+            kVK_UpArrow: "Up",
+            kVK_DownArrow: "Down",
+        ]
+        if let name = namedKeys[Int(event.keyCode)] {
+            return name
+        }
+
+        let typed = event.charactersIgnoringModifiers?
+            .trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
+        return typed.isEmpty ? nil : typed.uppercased()
+    }
+}
diff --git a/ExecuWhisper/ExecuWhisper/Models/Preferences.swift b/ExecuWhisper/ExecuWhisper/Models/Preferences.swift
new file mode 100644
index 0000000000..2e1c020379
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Models/Preferences.swift
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import Foundation
+import Observation
+
+/// User preferences backed by `UserDefaults`.
+///
+/// Every mutable stored property writes itself back to `defaults` in its
+/// `didSet`. `init(defaults:)` loads saved values and resolves the helper
+/// binaries and model directories from saved, bundled and fallback locations.
+@MainActor @Observable
+final class Preferences {
+    @ObservationIgnored private let defaults: UserDefaults
+
+    /// Files that must all be present for a speech-model directory to be usable.
+    private static let speechModelFileNames = ["model.pte", "tokenizer.model"]
+
+    /// Files that must all be present for a formatter-model directory to be usable.
+    private static let formatterModelFileNames = [
+        "lfm2_5_350m_mlx_4w.pte",
+        "tokenizer.json",
+        "tokenizer_config.json",
+    ]
+
+    var enableGlobalHotkey: Bool = true {
+        didSet { defaults.set(enableGlobalHotkey, forKey: "enableGlobalHotkey") }
+    }
+
+    var dictationShortcut: DictationShortcut = .controlSpace {
+        didSet { Self.persist(dictationShortcut: dictationShortcut, in: defaults) }
+    }
+
+    var selectedMicrophoneID: String = "" {
+        didSet { defaults.set(selectedMicrophoneID, forKey: "selectedMicrophoneID") }
+    }
+
+    var silenceThreshold: Double = 0.02 {
+        didSet { defaults.set(silenceThreshold, forKey: "silenceThreshold") }
+    }
+
+    var silenceTimeout: Double = 1.5 {
+        didSet { defaults.set(silenceTimeout, forKey: "silenceTimeout") }
+    }
+
+    var runnerPath: String = "" {
+        didSet { defaults.set(runnerPath, forKey: "runnerPath") }
+    }
+
+    var formatterRunnerPath: String = "" {
+        didSet { defaults.set(formatterRunnerPath, forKey: "formatterRunnerPath") }
+    }
+
+    var enableSmartFormatting: Bool = true {
+        didSet { defaults.set(enableSmartFormatting, forKey: "enableSmartFormatting") }
+    }
+
+    var modelDirectory: String = "" {
+        didSet {
+            defaults.set(modelDirectory, forKey: "modelDirectory")
+            // Best effort: a failure to create the directory is ignored here.
+            try? FileManager.default.createDirectory(
+                at: modelDirectoryURL,
+                withIntermediateDirectories: true
+            )
+        }
+    }
+
+    var formatterModelDirectory: String = "" {
+        didSet {
+            defaults.set(formatterModelDirectory, forKey: "formatterModelDirectory")
+            // Best effort: a failure to create the directory is ignored here.
+            try? FileManager.default.createDirectory(
+                at: formatterModelDirectoryURL,
+                withIntermediateDirectories: true
+            )
+        }
+    }
+
+    var modelPath: String { modelDirectoryURL.appendingPathComponent("model.pte").path(percentEncoded: false) }
+    var tokenizerPath: String { modelDirectoryURL.appendingPathComponent("tokenizer.model").path(percentEncoded: false) }
+    var formatterModelPath: String { formatterModelDirectoryURL.appendingPathComponent("lfm2_5_350m_mlx_4w.pte").path(percentEncoded: false) }
+    var formatterTokenizerPath: String { formatterModelDirectoryURL.appendingPathComponent("tokenizer.json").path(percentEncoded: false) }
+    var formatterTokenizerConfigPath: String { formatterModelDirectoryURL.appendingPathComponent("tokenizer_config.json").path(percentEncoded: false) }
+
+    var modelDirectoryURL: URL { URL(fileURLWithPath: modelDirectory, isDirectory: true) }
+    var formatterModelDirectoryURL: URL { URL(fileURLWithPath: formatterModelDirectory, isDirectory: true) }
+
+    var bundledRunnerPath: String { Self.bundledResourcePath(named: "parakeet_helper") }
+
+    var bundledFormatterRunnerPath: String { Self.bundledResourcePath(named: "lfm25_formatter_helper") }
+
+    var bundledLibompPath: String { Self.bundledResourcePath(named: "libomp.dylib") }
+
+    /// The bundle's resource directory when it contains every speech-model
+    /// file, otherwise `nil`.
+    var bundledModelDirectoryURL: URL? {
+        Self.bundledResourceDirectory(containingAll: Self.speechModelFileNames)
+    }
+
+    /// The bundle's resource directory when it contains every formatter-model
+    /// file, otherwise `nil`.
+    var bundledFormatterModelDirectoryURL: URL? {
+        Self.bundledResourceDirectory(containingAll: Self.formatterModelFileNames)
+    }
+
+    var downloadedModelDirectoryURL: URL {
+        PersistencePaths.modelsDirectoryURL
+    }
+
+    var downloadedFormatterModelDirectoryURL: URL {
+        PersistencePaths.modelsDirectoryURL.appendingPathComponent("formatter", isDirectory: true)
+    }
+
+    /// Picks the helper binary path.
+    ///
+    /// Order of preference: the saved path when it exists; the bundled path
+    /// when it exists; the saved path even though it does not exist (as long
+    /// as it is non-empty); finally `buildRunnerPath`.
+    static func resolveRunnerPath(
+        savedRunnerPath: String?,
+        savedRunnerExists: Bool,
+        bundledRunnerPath: String,
+        bundledRunnerExists: Bool,
+        buildRunnerPath: String
+    ) -> String {
+        if let savedRunnerPath, !savedRunnerPath.isEmpty, savedRunnerExists {
+            return savedRunnerPath
+        }
+        if bundledRunnerExists {
+            return bundledRunnerPath
+        }
+        if let savedRunnerPath, !savedRunnerPath.isEmpty {
+            return savedRunnerPath
+        }
+        return buildRunnerPath
+    }
+
+    /// Saved, bundled and downloaded directories in that order, with `nil`,
+    /// empty and duplicate entries removed.
+    static func modelDirectoryCandidates(
+        savedModelDirectory: String?,
+        bundledModelDirectory: String?,
+        downloadedModelDirectory: String
+    ) -> [String] {
+        var candidates: [String] = []
+        for candidate in [savedModelDirectory, bundledModelDirectory, downloadedModelDirectory] {
+            guard let candidate, !candidate.isEmpty, !candidates.contains(candidate) else { continue }
+            candidates.append(candidate)
+        }
+        return candidates
+    }
+
+    /// Returns the first candidate directory for which `hasUsableModelFiles`
+    /// is true; when none qualifies, the first candidate; when there are no
+    /// candidates at all, `downloadedModelDirectory`.
+    static func resolveModelDirectory(
+        savedModelDirectory: String?,
+        bundledModelDirectory: String?,
+        downloadedModelDirectory: String,
+        hasUsableModelFiles: (String) -> Bool
+    ) -> String {
+        let candidates = modelDirectoryCandidates(
+            savedModelDirectory: savedModelDirectory,
+            bundledModelDirectory: bundledModelDirectory,
+            downloadedModelDirectory: downloadedModelDirectory
+        )
+
+        if let resolved = candidates.first(where: hasUsableModelFiles) {
+            return resolved
+        }
+
+        return candidates.first ?? downloadedModelDirectory
+    }
+
+    init(defaults: UserDefaults = .standard) {
+        self.defaults = defaults
+        let home = FileManager.default.homeDirectoryForCurrentUser.path(percentEncoded: false)
+        let buildRunner = "\(home)/executorch/cmake-out/examples/models/parakeet/parakeet_helper"
+        let buildFormatterRunner = "\(home)/executorch/cmake-out/examples/models/llama/lfm25_formatter_helper"
+        let savedRunnerPath = defaults.string(forKey: "runnerPath")
+        let migratedSavedRunnerPath = Self.migrateHelperPath(savedRunnerPath)
+        let savedFormatterRunnerPath = defaults.string(forKey: "formatterRunnerPath")
+
+        enableGlobalHotkey = defaults.object(forKey: "enableGlobalHotkey") as? Bool ?? true
+        dictationShortcut = Self.loadDictationShortcut(from: defaults)
+        selectedMicrophoneID = defaults.string(forKey: "selectedMicrophoneID") ?? ""
+        silenceThreshold = defaults.object(forKey: "silenceThreshold") as? Double ?? 0.02
+        silenceTimeout = defaults.object(forKey: "silenceTimeout") as? Double ?? 1.5
+        enableSmartFormatting = defaults.object(forKey: "enableSmartFormatting") as? Bool ?? true
+        // These two keys are not read anywhere in this class; presumably they
+        // are left over from an earlier version - confirm before relying on it.
+        defaults.removeObject(forKey: "formattingMode")
+        defaults.removeObject(forKey: "customFormattingPrompt")
+
+        let bundledRunner = bundledRunnerPath
+        runnerPath = Self.resolveRunnerPath(
+            savedRunnerPath: migratedSavedRunnerPath,
+            savedRunnerExists: migratedSavedRunnerPath.map {
+                FileManager.default.isExecutableFile(atPath: $0)
+            } ?? false,
+            bundledRunnerPath: bundledRunner,
+            bundledRunnerExists: FileManager.default.isExecutableFile(atPath: bundledRunner),
+            buildRunnerPath: buildRunner
+        )
+
+        let bundledFormatterRunner = bundledFormatterRunnerPath
+        formatterRunnerPath = Self.resolveRunnerPath(
+            savedRunnerPath: savedFormatterRunnerPath,
+            savedRunnerExists: savedFormatterRunnerPath.map {
+                FileManager.default.isExecutableFile(atPath: $0)
+            } ?? false,
+            bundledRunnerPath: bundledFormatterRunner,
+            bundledRunnerExists: FileManager.default.isExecutableFile(atPath: bundledFormatterRunner),
+            buildRunnerPath: buildFormatterRunner
+        )
+
+        let preferredModelDir = Self.resolveModelDirectory(
+            savedModelDirectory: defaults.string(forKey: "modelDirectory"),
+            bundledModelDirectory: bundledModelDirectoryURL?.path(percentEncoded: false),
+            downloadedModelDirectory: downloadedModelDirectoryURL.path(percentEncoded: false)
+        ) { candidate in
+            Self.directory(
+                URL(fileURLWithPath: candidate, isDirectory: true),
+                containsAll: Self.speechModelFileNames
+            )
+        }
+        modelDirectory = preferredModelDir
+
+        let preferredFormatterModelDir = Self.resolveModelDirectory(
+            savedModelDirectory: defaults.string(forKey: "formatterModelDirectory"),
+            bundledModelDirectory: bundledFormatterModelDirectoryURL?.path(percentEncoded: false),
+            downloadedModelDirectory: downloadedFormatterModelDirectoryURL.path(percentEncoded: false)
+        ) { candidate in
+            Self.directory(
+                URL(fileURLWithPath: candidate, isDirectory: true),
+                containsAll: Self.formatterModelFileNames
+            )
+        }
+        formatterModelDirectory = preferredFormatterModelDir
+
+        try? FileManager.default.createDirectory(
+            at: downloadedModelDirectoryURL,
+            withIntermediateDirectories: true
+        )
+        try? FileManager.default.createDirectory(
+            at: downloadedFormatterModelDirectoryURL,
+            withIntermediateDirectories: true
+        )
+    }
+
+    /// Path of the file called `name` inside the main bundle's resource
+    /// directory. An empty base path is used when the bundle reports no
+    /// resource path.
+    private static func bundledResourcePath(named name: String) -> String {
+        let resources = Bundle.main.resourcePath ?? ""
+        return URL(fileURLWithPath: resources).appendingPathComponent(name).path(percentEncoded: false)
+    }
+
+    /// The main bundle's resource directory when every file in `fileNames`
+    /// exists in it, otherwise `nil`.
+    private static func bundledResourceDirectory(containingAll fileNames: [String]) -> URL? {
+        guard let resources = Bundle.main.resourcePath else { return nil }
+        let directoryURL = URL(fileURLWithPath: resources, isDirectory: true)
+        return directory(directoryURL, containsAll: fileNames) ? directoryURL : nil
+    }
+
+    /// `true` when every file in `fileNames` exists directly inside
+    /// `directoryURL`. Stops at the first missing file.
+    private static func directory(_ directoryURL: URL, containsAll fileNames: [String]) -> Bool {
+        fileNames.allSatisfy { fileName in
+            FileManager.default.fileExists(
+                atPath: directoryURL.appendingPathComponent(fileName).path(percentEncoded: false)
+            )
+        }
+    }
+
+    /// Rewrites a saved path ending in `parakeet_runner` to the sibling
+    /// `parakeet_helper` when that sibling is executable; any other input is
+    /// returned unchanged.
+    private static func migrateHelperPath(_ savedRunnerPath: String?) -> String? {
+        guard let savedRunnerPath, !savedRunnerPath.isEmpty else { return savedRunnerPath }
+        let savedURL = URL(fileURLWithPath: savedRunnerPath)
+        guard savedURL.lastPathComponent == "parakeet_runner" else { return savedRunnerPath }
+
+        let siblingHelperPath = savedURL
+            .deletingLastPathComponent()
+            .appendingPathComponent("parakeet_helper")
+            .path(percentEncoded: false)
+        if FileManager.default.isExecutableFile(atPath: siblingHelperPath) {
+            return siblingHelperPath
+        }
+
+        return savedRunnerPath
+    }
+
+    /// Decodes the saved shortcut, falling back to Control + Space when
+    /// nothing is stored or decoding fails.
+    private static func loadDictationShortcut(from defaults: UserDefaults) -> DictationShortcut {
+        guard let data = defaults.data(forKey: "dictationShortcut"),
+              let shortcut = try? JSONDecoder().decode(DictationShortcut.self, from: data)
+        else {
+            return .controlSpace
+        }
+        return shortcut
+    }
+
+    /// Stores the shortcut as JSON; nothing is written when encoding fails.
+    private static func persist(dictationShortcut: DictationShortcut, in defaults: UserDefaults) {
+        guard let data = try? JSONEncoder().encode(dictationShortcut) else { return }
+        defaults.set(data, forKey: "dictationShortcut")
+    }
+}
diff --git a/ExecuWhisper/ExecuWhisper/Models/ReplacementEntry.swift b/ExecuWhisper/ExecuWhisper/Models/ReplacementEntry.swift
new file mode 100644
index 0000000000..545c26a7eb
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Models/ReplacementEntry.swift
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import Foundation
+
+/// One trigger-to-replacement rule with its matching options.
+struct ReplacementEntry: Identifiable, Codable, Sendable, Hashable {
+    let id: UUID
+    var trigger: String
+    var replacement: String
+    var isEnabled: Bool
+    var isCaseSensitive: Bool
+    var requiresWordBoundary: Bool
+    var notes: String
+
+    init(
+        id: UUID = UUID(),
+        trigger: String = "",
+        replacement: String = "",
+        isEnabled: Bool = true,
+        isCaseSensitive: Bool = false,
+        requiresWordBoundary: Bool = true,
+        notes: String = ""
+    ) {
+        self.id = id
+        self.trigger = trigger
+        self.replacement = replacement
+        self.isEnabled = isEnabled
+        self.isCaseSensitive = isCaseSensitive
+        self.requiresWordBoundary = requiresWordBoundary
+        self.notes = notes
+    }
+}
diff --git a/ExecuWhisper/ExecuWhisper/Models/Session.swift b/ExecuWhisper/ExecuWhisper/Models/Session.swift
new file mode 100644
index 0000000000..575e534854
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Models/Session.swift
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import Foundation
+
+/// A stored transcription session.
+///
+/// Decoding is lenient: every key is optional and falls back to the same
+/// default the memberwise initializer uses.
+struct Session: Identifiable, Codable, Sendable, Hashable {
+    let id: UUID
+    let date: Date
+    var title: String
+    var transcript: String
+    var duration: TimeInterval
+    var rawTranscript: String?
+    var tags: [String]
+    var pinned: Bool
+    var usedSnippetIDs: [UUID]
+
+    /// Shared formatter for `displayTitle`. Built once because creating a
+    /// `DateFormatter` on every access is comparatively expensive.
+    private static let titleDateFormatter: DateFormatter = {
+        let formatter = DateFormatter()
+        formatter.dateStyle = .medium
+        formatter.timeStyle = .short
+        return formatter
+    }()
+
+    init(
+        id: UUID = UUID(),
+        date: Date = .now,
+        title: String = "",
+        transcript: String = "",
+        duration: TimeInterval = 0,
+        rawTranscript: String? = nil,
+        tags: [String] = [],
+        pinned: Bool = false,
+        usedSnippetIDs: [UUID] = []
+    ) {
+        self.id = id
+        self.date = date
+        self.title = title
+        self.transcript = transcript
+        self.duration = duration
+        self.rawTranscript = rawTranscript
+        self.tags = tags
+        self.pinned = pinned
+        self.usedSnippetIDs = usedSnippetIDs
+    }
+
+    enum CodingKeys: String, CodingKey {
+        case id
+        case date
+        case title
+        case transcript
+        case duration
+        case rawTranscript
+        case tags
+        case pinned
+        case usedSnippetIDs
+    }
+
+    init(from decoder: Decoder) throws {
+        let container = try decoder.container(keyedBy: CodingKeys.self)
+        id = try container.decodeIfPresent(UUID.self, forKey: .id) ?? UUID()
+        date = try container.decodeIfPresent(Date.self, forKey: .date) ?? .now
+        title = try container.decodeIfPresent(String.self, forKey: .title) ?? ""
+        transcript = try container.decodeIfPresent(String.self, forKey: .transcript) ?? ""
+        duration = try container.decodeIfPresent(TimeInterval.self, forKey: .duration) ?? 0
+        rawTranscript = try container.decodeIfPresent(String.self, forKey: .rawTranscript)
+        tags = try container.decodeIfPresent([String].self, forKey: .tags) ?? []
+        pinned = try container.decodeIfPresent(Bool.self, forKey: .pinned) ?? false
+        usedSnippetIDs = try container.decodeIfPresent([UUID].self, forKey: .usedSnippetIDs) ?? []
+    }
+
+    /// The title when one is set, otherwise the session date in medium date /
+    /// short time style.
+    var displayTitle: String {
+        if !title.isEmpty { return title }
+        return Self.titleDateFormatter.string(from: date)
+    }
+
+    /// The transcript, or the raw transcript (or an empty string) when the
+    /// transcript is empty.
+    var previewText: String {
+        transcript.isEmpty ? (rawTranscript ?? "") : transcript
+    }
+}
diff --git a/ExecuWhisper/ExecuWhisper/Models/TranscriptStore.swift b/ExecuWhisper/ExecuWhisper/Models/TranscriptStore.swift
new file mode 100644
index 0000000000..20fcd34f96
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Models/TranscriptStore.swift
@@ -0,0 +1,843 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+import AppKit
+import Foundation
+import os
+
+private let storeLog = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "TranscriptStore")
+
+@MainActor @Observable
+final class TranscriptStore {
+    enum SessionState: Equatable {
+        case idle
+        case recording
+        case transcribing
+    }
+
+    enum ModelState: Equatable {
+        case checking
+        case missing
+        case downloading
+        case ready
+    }
+
+    var sessions: [Session] = []
+    var selectedSessionID: UUID?
+    var selectedHistorySessionIDs: Set = []
+    var liveTranscript = ""
+    var sessionState: SessionState = .idle
+    var modelState: ModelState = .checking
+    var currentError: RunnerError?
+    var healthResult: HealthCheck.Result?
+    var audioLevel: Float = 0
+    var statusMessage = ""
+    var helperState: RunnerBridge.ResidencyState = .unloaded
+    var helperStatusMessage = ""
+
+    var hasActiveSession: Bool { sessionState != .idle }
+    var isRecording: Bool { sessionState == .recording }
+    var isTranscribing: Bool { sessionState == .transcribing }
+    var isModelReady: Bool { modelState == .ready }
+    var resourcesReady: Bool { healthResult?.resourcesReady == true }
+    var isHelperWarm: Bool { helperState == .warm }
+    var isHelperLoading: Bool { helperState == .loading }
+
+    private let recorder: AudioRecorder
+    private let runner: any RunnerBridgeClient
+    private let preferences: Preferences
+    private let downloader: ModelDownloader
+    private let sessionsURL: URL
+    private let textPipeline: TextPipeline?
+ private let audioDecoder: any ImportedAudioDecoding + private let maxRecordingDuration: TimeInterval + private var recordingStartDate: Date? + private var initialized = false + private var warmupTask: Task? + private var recordingLimitTask: Task? + private var explicitlyUnloaded = false + + init( + preferences: Preferences, + downloader: ModelDownloader, + sessionsURL: URL = PersistencePaths.sessionsURL, + textPipeline: TextPipeline? = nil, + audioDecoder: any ImportedAudioDecoding = ImportedAudioDecoder(), + recorder: AudioRecorder = AudioRecorder(), + runner: any RunnerBridgeClient = RunnerBridge(), + maxRecordingDuration: TimeInterval = 30 * 60 + ) { + self.recorder = recorder + self.runner = runner + self.preferences = preferences + self.downloader = downloader + self.sessionsURL = sessionsURL + self.textPipeline = textPipeline + self.audioDecoder = audioDecoder + self.maxRecordingDuration = maxRecordingDuration + loadSessions() + } + + func initialize() async { + guard !initialized else { return } + initialized = true + await runHealthCheck() + + if healthResult?.modelAssetsMissing == true { + await downloadModelIfNeeded() + } + + if preferences.enableSmartFormatting && !formatterAssetsReady { + await downloadFormatterModelIfNeeded() + } + + await autoPreloadModelIfReady() + } + + func runHealthCheck() async { + var result = await HealthCheck.run( + runnerPath: preferences.runnerPath, + modelPath: preferences.modelPath, + tokenizerPath: preferences.tokenizerPath + ) + + if result.runnerAvailable && !result.resourcesReady { + let bundledPath = preferences.bundledModelDirectoryURL?.path(percentEncoded: false) + let candidates = Preferences.modelDirectoryCandidates( + savedModelDirectory: preferences.modelDirectory, + bundledModelDirectory: bundledPath, + downloadedModelDirectory: preferences.downloadedModelDirectoryURL.path(percentEncoded: false) + ) + + for candidate in candidates where candidate != preferences.modelDirectory { + let candidateURL = 
URL(fileURLWithPath: candidate, isDirectory: true) + let candidateResult = await HealthCheck.run( + runnerPath: preferences.runnerPath, + modelPath: candidateURL.appendingPathComponent("model.pte").path(percentEncoded: false), + tokenizerPath: candidateURL.appendingPathComponent("tokenizer.model").path(percentEncoded: false) + ) + if candidateResult.resourcesReady { + preferences.modelDirectory = candidate + result = candidateResult + break + } + } + } + + healthResult = result + + if downloader.isDownloading { + modelState = .downloading + if statusMessage.isEmpty || statusMessage == "Ready" { + statusMessage = "Downloading model..." + } + } else if result.resourcesReady { + modelState = .ready + if !hasActiveSession { + statusMessage = "Ready" + } + } else { + modelState = .missing + if !hasActiveSession { + statusMessage = result.setupStatusMessage + } + } + + if result.resourcesReady { + await syncHelperState() + if !hasActiveSession { + await autoPreloadModelIfReady() + } + } else { + helperState = .unloaded + helperStatusMessage = "" + warmupTask?.cancel() + warmupTask = nil + } + } + + func downloadModelIfNeeded(force: Bool = false) async { + if !force && healthResult?.resourcesReady == true { + modelState = .ready + return + } + if !force && healthResult?.shouldOfferModelDownload != true { + return + } + await downloadModel() + } + + func downloadFormatterModelIfNeeded(force: Bool = false) async { + guard force || !formatterAssetsReady else { return } + await downloadFormatterModel() + } + + func downloadModel() async { + guard !downloader.isDownloading else { + modelState = .downloading + return + } + + preferences.modelDirectory = preferences.downloadedModelDirectoryURL.path(percentEncoded: false) + modelState = .downloading + statusMessage = "Downloading model..." 
+ currentError = nil + + do { + try await downloader.downloadModels( + destinationDirectory: preferences.downloadedModelDirectoryURL + ) + if preferences.enableSmartFormatting && !formatterAssetsReady { + preferences.formatterModelDirectory = preferences.downloadedFormatterModelDirectoryURL.path(percentEncoded: false) + try await downloader.downloadFormatterModels( + destinationDirectory: preferences.downloadedFormatterModelDirectoryURL + ) + } + await runHealthCheck() + await autoPreloadModelIfReady() + } catch let error as RunnerError { + currentError = error + await runHealthCheck() + } catch { + currentError = .downloadFailed(file: "Parakeet model", description: error.localizedDescription) + await runHealthCheck() + } + } + + func downloadFormatterModel() async { + guard !downloader.isDownloading else { + modelState = .downloading + return + } + + preferences.formatterModelDirectory = preferences.downloadedFormatterModelDirectoryURL.path(percentEncoded: false) + modelState = .downloading + statusMessage = "Downloading formatter..." + currentError = nil + + do { + try await downloader.downloadFormatterModels( + destinationDirectory: preferences.downloadedFormatterModelDirectoryURL + ) + await runHealthCheck() + } catch let error as RunnerError { + currentError = error + await runHealthCheck() + } catch { + currentError = .downloadFailed(file: "LFM2.5 formatter", description: error.localizedDescription) + await runHealthCheck() + } + } + + func preloadModel() async { + explicitlyUnloaded = false + await performHelperWarmupIfNeeded(updateStatusMessage: true) + } + + func unloadModel() async { + explicitlyUnloaded = true + warmupTask?.cancel() + warmupTask = nil + await runner.shutdown() + helperState = .unloaded + helperStatusMessage = resourcesReady ? "Helper unloaded" : "" + if !hasActiveSession { + statusMessage = resourcesReady ? "Ready" : (healthResult?.setupStatusMessage ?? 
"Ready") + } + } + + func startRecording() async { + guard sessionState == .idle else { return } + + await runHealthCheck() + if healthResult?.shouldOfferModelDownload == true { + await downloadModelIfNeeded() + await runHealthCheck() + } + guard resourcesReady else { + if healthResult?.runnerAvailable == false { + currentError = .binaryNotFound(path: preferences.runnerPath) + } + return + } + + let micPermission = await HealthCheck.liveMicPermission() + if micPermission == .notDetermined { + let granted = await HealthCheck.requestMicrophoneAccess() + if !granted { + currentError = .microphonePermissionDenied + return + } + } else if micPermission == .denied { + currentError = .microphonePermissionDenied + return + } + + selectedSessionID = nil + selectedHistorySessionIDs = [] + liveTranscript = "" + audioLevel = 0 + statusMessage = "Recording..." + currentError = nil + sessionState = .recording + recordingStartDate = .now + scheduleRecordingLimit { + await self.stopRecordingAndTranscribe() + } + + do { + try await recorder.startRecording(selectedMicrophoneID: preferences.selectedMicrophoneID) { [weak self] level in + Task { @MainActor in + self?.audioLevel = level + } + } + startBackgroundWarmupIfNeeded() + } catch let error as RunnerError { + cancelRecordingLimit() + currentError = error + sessionState = .idle + } catch { + cancelRecordingLimit() + currentError = .launchFailed(description: error.localizedDescription) + sessionState = .idle + } + } + + func startDictationCapture() async -> Bool { + guard sessionState == .idle else { return false } + storeLog.info("Dictation capture requested") + + await runHealthCheck() + if healthResult?.shouldOfferModelDownload == true { + await downloadModelIfNeeded() + await runHealthCheck() + } + guard resourcesReady else { + if healthResult?.runnerAvailable == false { + currentError = .binaryNotFound(path: preferences.runnerPath) + } + return false + } + + let micPermission = await HealthCheck.liveMicPermission() + if 
micPermission == .notDetermined { + let granted = await HealthCheck.requestMicrophoneAccess() + if !granted { + currentError = .microphonePermissionDenied + return false + } + } else if micPermission == .denied { + currentError = .microphonePermissionDenied + return false + } + + selectedSessionID = nil + selectedHistorySessionIDs = [] + liveTranscript = "" + audioLevel = 0 + statusMessage = "Listening..." + currentError = nil + sessionState = .recording + recordingStartDate = .now + storeLog.info("Dictation capture starting with runnerPath=\(self.preferences.runnerPath, privacy: .public) modelPath=\(self.preferences.modelPath, privacy: .public)") + + do { + try await recorder.startRecording(selectedMicrophoneID: preferences.selectedMicrophoneID) { [weak self] level in + Task { @MainActor in + self?.audioLevel = level + } + } + startBackgroundWarmupIfNeeded() + storeLog.info("Dictation capture started") + return true + } catch let error as RunnerError { + cancelRecordingLimit() + storeLog.error("Dictation capture failed to start: \(error.localizedDescription, privacy: .public)") + currentError = error + resetLiveState(status: "Ready") + return false + } catch { + cancelRecordingLimit() + storeLog.error("Dictation capture failed with unexpected error: \(error.localizedDescription, privacy: .public)") + currentError = .launchFailed(description: error.localizedDescription) + resetLiveState(status: "Ready") + return false + } + } + + func finishDictationCapture() async throws -> TextProcessingResult { + guard sessionState == .recording else { + throw RunnerError.dictationNotActive + } + + let duration = recordingStartDate.map { Date.now.timeIntervalSince($0) } ?? 0 + sessionState = .transcribing + statusMessage = "Transcribing..." 
+ audioLevel = 0 + cancelRecordingLimit() + storeLog.info("Dictation capture stopping after duration=\(duration, format: .fixed(precision: 3))s") + + do { + let pcmData = try await recorder.stopRecording() + storeLog.info("Dictation captured pcmBytes=\(pcmData.count)") + let finalResult = try await transcribeCapturedAudio(pcmData) + liveTranscript = finalResult.text + storeLog.info("Dictation transcription completed textLength=\(finalResult.text.count)") + return await storeDictationTranscription(rawText: finalResult.text, duration: duration) + } catch { + storeLog.error("Dictation transcription failed: \(error.localizedDescription, privacy: .public)") + resetLiveState(status: "Ready") + throw error + } + } + + func stopRecordingAndTranscribe() async { + guard sessionState == .recording else { return } + + let duration = recordingStartDate.map { Date.now.timeIntervalSince($0) } ?? 0 + sessionState = .transcribing + statusMessage = "Finalizing recording..." + audioLevel = 0 + cancelRecordingLimit() + + do { + let pcmData = try await recorder.stopRecording() + storeLog.info("Recording captured pcmBytes=\(pcmData.count)") + let finalResult = try await transcribeCapturedAudio(pcmData) + liveTranscript = finalResult.text + await storeCompletedTranscription(rawText: finalResult.text, duration: duration) + } catch let error as RunnerError { + currentError = error + resetLiveState() + } catch { + currentError = .transcriptionFailed(description: error.localizedDescription) + resetLiveState() + } + } + + @discardableResult + func importAudioFile(_ url: URL) async -> Bool { + guard sessionState == .idle else { + currentError = .transcriptionFailed(description: "Wait for the current transcription to finish before importing another audio file.") + return false + } + + await runHealthCheck() + if healthResult?.shouldOfferModelDownload == true { + await downloadModelIfNeeded() + await runHealthCheck() + } + guard resourcesReady else { + if healthResult?.runnerAvailable == false { 
+ currentError = .binaryNotFound(path: preferences.runnerPath) + } + return false + } + + let previousSelectedSessionID = selectedSessionID + let previousHistorySelection = selectedHistorySessionIDs + selectedSessionID = nil + selectedHistorySessionIDs = [] + liveTranscript = "" + audioLevel = 0 + statusMessage = "Preparing audio file..." + currentError = nil + sessionState = .transcribing + recordingStartDate = .now + + do { + let decoded = try audioDecoder.decodeAudioFile(at: url) + let finalResult = try await transcribeCapturedAudio(decoded.pcmData) + liveTranscript = finalResult.text + await storeImportedTranscription( + rawText: finalResult.text, + duration: decoded.duration, + title: importedSessionTitle(for: url) + ) + return true + } catch let error as RunnerError { + selectedSessionID = previousSelectedSessionID + selectedHistorySessionIDs = previousHistorySelection + currentError = error + resetLiveState() + return false + } catch { + selectedSessionID = previousSelectedSessionID + selectedHistorySessionIDs = previousHistorySelection + currentError = .transcriptionFailed(description: error.localizedDescription) + resetLiveState() + return false + } + } + + func deleteSession(_ session: Session) { + sessions.removeAll { $0.id == session.id } + selectedHistorySessionIDs.remove(session.id) + if selectedSessionID == session.id { + selectedSessionID = sessions.first?.id + } + saveSessions() + } + + func deleteSessions(ids: Set) { + guard !ids.isEmpty else { return } + sessions.removeAll { ids.contains($0.id) } + selectedHistorySessionIDs.subtract(ids) + if let selectedSessionID, ids.contains(selectedSessionID) { + self.selectedSessionID = nil + } + saveSessions() + } + + func renameSession(_ session: Session, to newTitle: String) { + guard let idx = sessions.firstIndex(where: { $0.id == session.id }) else { return } + sessions[idx].title = newTitle + saveSessions() + } + + func togglePinned(_ session: Session) { + guard let idx = sessions.firstIndex(where: { 
$0.id == session.id }) else { return } + sessions[idx].pinned.toggle() + saveSessions() + } + + func clearError() { + currentError = nil + } + + func exportSession(_ session: Session, format: SessionExportFormat) { + let panel = NSSavePanel() + panel.allowedContentTypes = [format.contentType] + panel.canCreateDirectories = true + panel.nameFieldStringValue = suggestedExportFileName(for: session, format: format) + + guard panel.runModal() == .OK, let url = panel.url else { return } + + do { + try writeSessionExport(session, format: format, to: url) + } catch { + currentError = .exportFailed(description: error.localizedDescription) + } + } + + func importAudioFileWithPanel() { + let panel = NSOpenPanel() + panel.allowedContentTypes = ImportedAudioDecoder.allowedContentTypes + panel.canChooseFiles = true + panel.canChooseDirectories = false + panel.allowsMultipleSelection = false + + guard panel.runModal() == .OK, let url = panel.url else { return } + Task { @MainActor in + await importAudioFile(url) + } + } + + func writeSessionExport(_ session: Session, format: SessionExportFormat, to url: URL) throws { + let rendered = format.render(session) + try rendered.write(to: url, atomically: true, encoding: .utf8) + } + + func storeCompletedTranscription(rawText: String, duration: TimeInterval) async { + _ = await processCompletedTranscription( + rawText: rawText, + duration: duration, + context: .standard, + persistSession: true, + titleOverride: nil + ) + } + + @discardableResult + func storeDictationTranscription(rawText: String, duration: TimeInterval) async -> TextProcessingResult { + await processCompletedTranscription( + rawText: rawText, + duration: duration, + context: .dictation, + persistSession: false, + titleOverride: nil + ) + } + + func storeImportedTranscription(rawText: String, duration: TimeInterval, title: String) async { + _ = await processCompletedTranscription( + rawText: rawText, + duration: duration, + context: .standard, + persistSession: true, + 
titleOverride: title + ) + } + + private func finishTranscription( + rawText: String, + transcript: String, + tags: [String], + duration: TimeInterval, + persistSession: Bool, + titleOverride: String? + ) { + if persistSession && !transcript.isEmpty { + let session = Session( + date: recordingStartDate ?? .now, + title: titleOverride ?? "", + transcript: transcript, + duration: duration, + rawTranscript: rawText, + tags: tags + ) + sessions.insert(session, at: 0) + selectedSessionID = session.id + selectedHistorySessionIDs = [session.id] + saveSessions() + } + + liveTranscript = transcript + resetLiveState(status: transcript.isEmpty ? "No speech detected" : "Ready") + } + + private func suggestedExportFileName(for session: Session, format: SessionExportFormat) -> String { + let base = session.displayTitle + .replacingOccurrences(of: "/", with: "-") + .replacingOccurrences(of: ":", with: "-") + return "\(base).\(format.fileExtension)" + } + + private func transcribe(audioURL: URL) async throws -> RunnerBridge.TranscriptionResult { + storeLog.info("Beginning batch transcription for audioPath=\(audioURL.path(percentEncoded: false), privacy: .public)") + let events = await runner.transcribe( + runnerPath: preferences.runnerPath, + modelPath: preferences.modelPath, + tokenizerPath: preferences.tokenizerPath, + audioPath: audioURL.path(percentEncoded: false), + options: .fromEnvironment(ProcessInfo.processInfo.environment) + ) + return try await collectFinalResult(from: events) + } + + func transcribeCapturedAudio(_ pcmData: Data) async throws -> RunnerBridge.TranscriptionResult { + storeLog.info("Beginning batch transcription for capturedAudioBytes=\(pcmData.count)") + if helperState != .warm { + helperState = .loading + helperStatusMessage = "Warming model..." 
+ } + let events = await runner.transcribePCM( + runnerPath: preferences.runnerPath, + modelPath: preferences.modelPath, + tokenizerPath: preferences.tokenizerPath, + pcmData: pcmData, + options: .fromEnvironment(ProcessInfo.processInfo.environment) + ) + let result = try await collectFinalResult(from: events) + await syncHelperState() + return result + } + + private func collectFinalResult( + from events: AsyncThrowingStream + ) async throws -> RunnerBridge.TranscriptionResult { + var finalResult: RunnerBridge.TranscriptionResult? + for try await event in events { + switch event { + case .status(let status): + statusMessage = status + storeLog.info("Runner status event: \(status, privacy: .public)") + case .completed(let result): + finalResult = result + storeLog.info("Runner completed event textLength=\(result.text.count) stdoutLength=\(result.stdout.count) stderrLength=\(result.stderr.count)") + if DiagnosticLogging.shouldLogTranscriptsPublicly { + storeLog.info("Parakeet transcript: \(result.text, privacy: .public)") + } else { + storeLog.info("Parakeet transcript: \(result.text, privacy: .private)") + } + if let runtimeProfile = result.runtimeProfile { + storeLog.info("Runner runtime profile: \(runtimeProfile, privacy: .public)") + } + } + } + + guard let finalResult else { + storeLog.error("Runner stream finished without a completed event") + throw RunnerError.invalidRunnerOutput(stdout: "") + } + return finalResult + } + + private func startBackgroundWarmupIfNeeded() { + guard resourcesReady else { return } + guard helperState == .unloaded || helperState == .failed else { return } + guard warmupTask == nil else { return } + + warmupTask = Task { @MainActor [weak self] in + await self?.performHelperWarmupIfNeeded(updateStatusMessage: false) + } + } + + private func autoPreloadModelIfReady() async { + guard !explicitlyUnloaded else { return } + guard resourcesReady else { return } + guard helperState == .unloaded || helperState == .failed else { return } + 
await performHelperWarmupIfNeeded(updateStatusMessage: false) + } + + private func performHelperWarmupIfNeeded(updateStatusMessage: Bool) async { + guard resourcesReady else { return } + + if helperState == .warm { + helperStatusMessage = "Model preloaded" + return + } + + if helperState == .loading, let warmupTask { + await warmupTask.value + return + } + + helperState = .loading + helperStatusMessage = "Warming model..." + if updateStatusMessage && !hasActiveSession { + statusMessage = "Warming model..." + } + + do { + try await runner.prepare( + runnerPath: preferences.runnerPath, + modelPath: preferences.modelPath, + tokenizerPath: preferences.tokenizerPath + ) + helperState = .warm + helperStatusMessage = "Model preloaded" + logResidentMemory(context: "Parakeet helper preloaded") + if updateStatusMessage && !hasActiveSession { + statusMessage = "Ready" + } + } catch let error as RunnerError { + helperState = .failed + helperStatusMessage = "Warmup failed" + currentError = error + if updateStatusMessage && !hasActiveSession { + statusMessage = healthResult?.setupStatusMessage ?? "Ready" + } + } catch { + helperState = .failed + helperStatusMessage = "Warmup failed" + currentError = .launchFailed(description: error.localizedDescription) + if updateStatusMessage && !hasActiveSession { + statusMessage = healthResult?.setupStatusMessage ?? "Ready" + } + } + + warmupTask = nil + } + + private func syncHelperState() async { + let snapshot = await runner.runtimeSnapshot() + helperState = snapshot.state + switch snapshot.state { + case .unloaded: + helperStatusMessage = resourcesReady ? "Helper unloaded" : "" + case .loading: + helperStatusMessage = "Warming model..." 
+ case .warm: + helperStatusMessage = "Model preloaded" + case .failed: + helperStatusMessage = "Warmup failed" + } + } + + private func logResidentMemory(context: String) { + guard UserDefaults.standard.bool(forKey: DiagnosticLogging.transcriptDebugKey), + let bytes = DiagnosticLogging.residentMemoryBytes() + else { + return + } + storeLog.info("\(context, privacy: .public) residentMemoryBytes=\(bytes)") + } + + @discardableResult + private func processCompletedTranscription( + rawText: String, + duration: TimeInterval, + context: TextPipeline.Context, + persistSession: Bool, + titleOverride: String? + ) async -> TextProcessingResult { + if preferences.enableSmartFormatting && !rawText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + statusMessage = "Formatting..." + liveTranscript = "Formatting..." + } + + let processed = await textPipeline?.process( + rawText, + context: context, + smartFormattingEnabled: preferences.enableSmartFormatting + ) + ?? TextProcessingResult(rawText: rawText, outputText: rawText, tags: []) + finishTranscription( + rawText: processed.rawText, + transcript: processed.outputText, + tags: processed.tags, + duration: duration, + persistSession: persistSession, + titleOverride: titleOverride + ) + return processed + } + + private func importedSessionTitle(for url: URL) -> String { + let title = url.deletingPathExtension().lastPathComponent.trimmingCharacters(in: .whitespacesAndNewlines) + return title.isEmpty ? "Imported Audio" : title + } + + private func resetLiveState(status: String = "Ready") { + cancelRecordingLimit() + audioLevel = 0 + sessionState = .idle + recordingStartDate = nil + statusMessage = status + } + + private func scheduleRecordingLimit(onLimit: @escaping @MainActor () async -> Void) { + cancelRecordingLimit() + guard maxRecordingDuration > 0 else { return } + recordingLimitTask = Task { @MainActor [weak self] in + try? 
await Task.sleep(for: .seconds(maxRecordingDuration)) + guard let self, self.sessionState == .recording else { return } + self.statusMessage = "Maximum recording duration reached" + self.currentError = .transcriptionFailed(description: "Maximum recording duration reached.") + await onLimit() + } + } + + private func cancelRecordingLimit() { + recordingLimitTask?.cancel() + recordingLimitTask = nil + } + + private var formatterAssetsReady: Bool { + let fm = FileManager.default + return fm.fileExists(atPath: preferences.formatterModelPath) + && fm.fileExists(atPath: preferences.formatterTokenizerPath) + && fm.fileExists(atPath: preferences.formatterTokenizerConfigPath) + } + + private func saveSessions() { + guard let data = try? JSONEncoder().encode(sessions) else { return } + try? data.write(to: sessionsURL, options: .atomic) + } + + private func loadSessions() { + guard let data = try? Data(contentsOf: sessionsURL), + let decoded = try? JSONDecoder().decode([Session].self, from: data) + else { + return + } + sessions = decoded.sorted { $0.date > $1.date } + selectedSessionID = nil + } +} diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AccentColor.colorset/Contents.json b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 0000000000..667b84d5cd --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,20 @@ +{ + "colors" : [ + { + "color" : { + "color-space" : "srgb", + "components" : { + "alpha" : "1.000", + "blue" : "0.800", + "green" : "0.345", + "red" : "0.259" + } + }, + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000000..4ab5e2f324 --- /dev/null +++ 
b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,68 @@ +{ + "images" : [ + { + "filename" : "icon_16.png", + "idiom" : "mac", + "scale" : "1x", + "size" : "16x16" + }, + { + "filename" : "icon_16@2x.png", + "idiom" : "mac", + "scale" : "2x", + "size" : "16x16" + }, + { + "filename" : "icon_32.png", + "idiom" : "mac", + "scale" : "1x", + "size" : "32x32" + }, + { + "filename" : "icon_32@2x.png", + "idiom" : "mac", + "scale" : "2x", + "size" : "32x32" + }, + { + "filename" : "icon_128.png", + "idiom" : "mac", + "scale" : "1x", + "size" : "128x128" + }, + { + "filename" : "icon_128@2x.png", + "idiom" : "mac", + "scale" : "2x", + "size" : "128x128" + }, + { + "filename" : "icon_256.png", + "idiom" : "mac", + "scale" : "1x", + "size" : "256x256" + }, + { + "filename" : "icon_256@2x.png", + "idiom" : "mac", + "scale" : "2x", + "size" : "256x256" + }, + { + "filename" : "icon_512.png", + "idiom" : "mac", + "scale" : "1x", + "size" : "512x512" + }, + { + "filename" : "icon_512@2x.png", + "idiom" : "mac", + "scale" : "2x", + "size" : "512x512" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_128.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_128.png new file mode 100644 index 0000000000..cf71558efd Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_128.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_128@2x.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_128@2x.png new file mode 100644 index 0000000000..49e194e758 Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_128@2x.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_16.png 
b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_16.png new file mode 100644 index 0000000000..a97bd5e84a Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_16.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_16@2x.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_16@2x.png new file mode 100644 index 0000000000..726d41e148 Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_16@2x.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_256.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_256.png new file mode 100644 index 0000000000..49e194e758 Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_256.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_256@2x.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_256@2x.png new file mode 100644 index 0000000000..5035e10a96 Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_256@2x.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_32.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_32.png new file mode 100644 index 0000000000..726d41e148 Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_32.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_32@2x.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_32@2x.png new file mode 100644 index 0000000000..164c19a5a2 Binary files /dev/null and 
b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_32@2x.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_512.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_512.png new file mode 100644 index 0000000000..5035e10a96 Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_512.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_512@2x.png b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_512@2x.png new file mode 100644 index 0000000000..1597ff5c3c Binary files /dev/null and b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/AppIcon.appiconset/icon_512@2x.png differ diff --git a/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/Contents.json b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/Contents.json new file mode 100644 index 0000000000..73c00596a7 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Resources/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/ExecuWhisper/ExecuWhisper/Resources/model_manifest.json b/ExecuWhisper/ExecuWhisper/Resources/model_manifest.json new file mode 100644 index 0000000000..4686fba3c3 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Resources/model_manifest.json @@ -0,0 +1,29 @@ +{ + "version": 1, + "repositories": { + "asr": "younghan-meta/Parakeet-TDT-ExecuTorch-Metal", + "formatter": "younghan-meta/LFM2.5-ExecuTorch-MLX" + }, + "assets": { + "model.pte": { + "bytes": 822365760, + "sha256": "406c7625094faefd019932cceba317954c1fc60de068b0be372570bfe5509ef2" + }, + "tokenizer.model": { + "bytes": 360916, + "sha256": "eacec2b0a77f336d4a2ca4a25a7047575d3c2b74de47e997f4c205126ed3135e" + }, + "lfm2_5_350m_mlx_4w.pte": { + "bytes": 322767616, + "sha256": "a22641ff8364b813bc5808428bf13ca7525df9995b1721b58b414a35da8312c9" + }, + 
"tokenizer.json": { + "bytes": 4733383, + "sha256": "2221c71b5dce048a8abae62843b92bd7deec13cc153f95fa6e3327a47b79a7da" + }, + "tokenizer_config.json": { + "bytes": 595, + "sha256": "3701f370c70034d06e28947ff51c1063983393319f429e3ce58f06a45fae67cc" + } + } +} diff --git a/ExecuWhisper/ExecuWhisper/Services/AudioRecorder.swift b/ExecuWhisper/ExecuWhisper/Services/AudioRecorder.swift new file mode 100644 index 0000000000..620f0487b9 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Services/AudioRecorder.swift @@ -0,0 +1,432 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import AVFoundation +import Accelerate +import CoreAudio +import Foundation +import os + +private let log = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "AudioRecorder") + +private final class NativeCaptureWriter: @unchecked Sendable { + private let lock = NSLock() + private var audioFile: AVAudioFile? + private var captureURL: URL? + + func append(_ buffer: AVAudioPCMBuffer) throws { + lock.lock() + defer { lock.unlock() } + + if audioFile == nil { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("execuwhisper_capture_\(UUID().uuidString).wav") + let file = try AVAudioFile( + forWriting: url, + settings: buffer.format.settings, + commonFormat: buffer.format.commonFormat, + interleaved: buffer.format.isInterleaved + ) + audioFile = file + captureURL = url + } + + try audioFile?.write(from: buffer) + } + + func finish() -> URL? 
{ + lock.lock() + defer { lock.unlock() } + audioFile = nil + return captureURL + } +} + +actor AudioRecorder { + struct CoreAudioDeviceRecord: Equatable, Sendable { + let id: AudioDeviceID + let uid: String + let name: String + let inputChannelCount: UInt32 + } + + struct InputDevice: Identifiable, Equatable, Sendable { + let id: String + let name: String + let isDefault: Bool + + var displayName: String { + isDefault ? "\(name) (System Default)" : name + } + } + + struct ResolvedInputDevice: Equatable, Sendable { + let device: InputDevice + let usedFallback: Bool + } + + private let modelSampleRate: Double = 16_000 + private static let postStopTailTrimDurationMs: Double = 256 + private var engine: AVAudioEngine? + private var writer = NativeCaptureWriter() + private var selectedDeviceUID: String? + private var levelHandler: (@Sendable (Float) -> Void)? + private var configurationObserver: NSObjectProtocol? + private var isRecoveringConfiguration = false + + func startRecording( + selectedMicrophoneID: String? 
= nil, + levelHandler: @Sendable @escaping (Float) -> Void + ) throws { + if engine != nil { + stopCaptureOnly() + } + + writer = NativeCaptureWriter() + + let availableDevices = Self.availableInputDevices() + guard let resolvedDevice = Self.resolvePreferredMicrophone( + selectedMicrophoneID: selectedMicrophoneID, + availableDevices: availableDevices + ) else { + throw RunnerError.microphoneNotAvailable + } + + selectedDeviceUID = resolvedDevice.device.id + self.levelHandler = levelHandler + + let audioEngine = AVAudioEngine() + let inputNode = audioEngine.inputNode + + guard let deviceID = Self.audioDeviceID(forUID: resolvedDevice.device.id) else { + log.error("Could not resolve Core Audio input device for uid=\(resolvedDevice.device.id, privacy: .public)") + throw RunnerError.microphoneNotAvailable + } + try Self.bindEngineInput(inputNode: inputNode, to: deviceID) + + // Pass nil format so AVAudioEngine uses the bus's actual hardware format after + // we bound the AU to the chosen device. Caching outputFormat(forBus:) here used + // to capture a stale 48 kHz / 2-channel format from whatever device the engine + // momentarily latched onto before our bind, which then made `installTap` fail + // with "Format mismatch" + "config change pending!" on Macs whose mic runs at a + // different rate (e.g. 24 kHz). The buffer delivered to the tap carries its + // native format; ImportedAudioDecoder downstream resamples to 16 kHz mono. 
+ installTap(on: inputNode, levelHandler: levelHandler) + + do { + try audioEngine.start() + } catch { + inputNode.removeTap(onBus: 0) + throw error + } + self.engine = audioEngine + observeConfigurationChanges(for: audioEngine) + + let runtimeFormat = inputNode.inputFormat(forBus: 0) + log.info("Audio recording engine bound: device=\(resolvedDevice.device.name, privacy: .public) sampleRate=\(runtimeFormat.sampleRate) channelCount=\(runtimeFormat.channelCount)") + + if resolvedDevice.usedFallback { + log.info("Selected microphone unavailable; falling back to system default '\(resolvedDevice.device.name, privacy: .public)'") + } + log.info("Audio recording started with microphone '\(resolvedDevice.device.name, privacy: .public)'") + } + + func stopRecording() throws -> Data { + stopCaptureOnly() + + guard let captureURL = writer.finish() else { + throw RunnerError.transcriptionFailed(description: "No audio was captured.") + } + defer { try? FileManager.default.removeItem(at: captureURL) } + + let decoded = try ImportedAudioDecoder().decodeAudioFile(at: captureURL) + guard !decoded.pcmData.isEmpty else { + throw RunnerError.transcriptionFailed(description: "No audio was captured.") + } + + let trimmedPCM = Self.trimTrailingPCM( + decoded.pcmData, + sampleRate: modelSampleRate, + trimDurationMs: Self.postStopTailTrimDurationMs + ) + log.info("Captured \(trimmedPCM.count) bytes of 16kHz float32 PCM") + return trimmedPCM + } + + func cancelRecording() { + stopCaptureOnly() + if let url = writer.finish() { + try? 
FileManager.default.removeItem(at: url) + } + } + + // MARK: - Device enumeration + + static func availableInputDevices() -> [InputDevice] { + let defaultDeviceID = AVCaptureDevice.default(for: .audio)?.uniqueID + var seenIDs: Set = [] + + return discoveredAudioCaptureDevices() + .compactMap { device in + guard seenIDs.insert(device.uniqueID).inserted else { return nil } + return InputDevice( + id: device.uniqueID, + name: device.localizedName, + isDefault: device.uniqueID == defaultDeviceID + ) + } + .sorted { lhs, rhs in + if lhs.isDefault != rhs.isDefault { + return lhs.isDefault && !rhs.isDefault + } + return lhs.name.localizedCaseInsensitiveCompare(rhs.name) == .orderedAscending + } + } + + static func resolvePreferredMicrophone( + selectedMicrophoneID: String?, + availableDevices: [InputDevice] + ) -> ResolvedInputDevice? { + guard !availableDevices.isEmpty else { return nil } + + let normalizedSelection = selectedMicrophoneID? + .trimmingCharacters(in: .whitespacesAndNewlines) + let hasExplicitSelection = normalizedSelection.map { !$0.isEmpty } ?? false + + if let normalizedSelection, !normalizedSelection.isEmpty, + let exactMatch = availableDevices.first(where: { $0.id == normalizedSelection }) { + return ResolvedInputDevice(device: exactMatch, usedFallback: false) + } + + let fallbackDevice = availableDevices.first(where: \.isDefault) ?? 
availableDevices[0]
        return ResolvedInputDevice(device: fallbackDevice, usedFallback: hasExplicitSelection)
    }

    // MARK: - Utilities

    /// Returns `pcmData` with the last `trimDurationMs` milliseconds removed.
    ///
    /// The input is returned unchanged when the trim duration is not positive,
    /// when the computed trim is less than one sample, or when trimming would
    /// leave one sample or less.
    ///
    /// - Parameters:
    ///   - pcmData: Raw PCM bytes.
    ///   - sampleRate: Samples per second of `pcmData`.
    ///   - trimDurationMs: Amount of trailing audio to drop, in milliseconds.
    /// - Returns: The trimmed data, or `pcmData` itself when no trim applies.
    static func trimTrailingPCM(
        _ pcmData: Data,
        sampleRate: Double,
        trimDurationMs: Double
    ) -> Data {
        guard trimDurationMs > 0 else { return pcmData }

        // NOTE(review): the generic argument of `MemoryLayout` was lost when this
        // text was extracted. `Float` (32-bit samples) is assumed here — confirm
        // against the sample format the capture writer actually produces.
        let bytesPerSample = MemoryLayout<Float>.size

        // `Int(_:)` traps on NaN, infinity and out-of-range values, so validate the
        // floating-point sample count first. A trim of `pcmData.count` samples or
        // more can never leave data behind, so rejecting it here is equivalent to
        // the byte-count check below and also rules out integer overflow.
        let trimSamples = (sampleRate * trimDurationMs) / 1000.0
        guard trimSamples.isFinite, trimSamples >= 1, trimSamples < Double(pcmData.count) else {
            return pcmData
        }

        let trimByteCount = Int(trimSamples) * bytesPerSample
        guard pcmData.count > trimByteCount + bytesPerSample else {
            return pcmData
        }

        return Data(pcmData.prefix(pcmData.count - trimByteCount))
    }

    // MARK: - Private

    /// Lists the microphone-type audio capture devices reported by AVFoundation.
    private static func discoveredAudioCaptureDevices() -> [AVCaptureDevice] {
        AVCaptureDevice.DiscoverySession(
            deviceTypes: [.microphone],
            mediaType: .audio,
            position: .unspecified
        ).devices
    }

    /// Tears down the capture engine and clears the per-recording state.
    private func stopCaptureOnly() {
        if let configurationObserver {
            NotificationCenter.default.removeObserver(configurationObserver)
            self.configurationObserver = nil
        }
        if let engine {
            engine.inputNode.removeTap(onBus: 0)
            if engine.isRunning {
                engine.stop()
            }
            engine.reset()
        }
        engine = nil
        selectedDeviceUID = nil
        levelHandler = nil
        isRecoveringConfiguration = false
        log.info("Audio recording stopped")
    }

    /// Points the engine's input audio unit at a specific Core Audio device.
    ///
    /// - Throws: `RunnerError.microphoneNotAvailable` when the input node has no
    ///   audio unit or the property cannot be set.
    private static func bindEngineInput(inputNode: AVAudioInputNode, to deviceID: AudioDeviceID) throws {
        guard let audioUnit = inputNode.audioUnit else {
            log.error("Input node has no audio unit; cannot bind to device id=\(deviceID)")
            throw RunnerError.microphoneNotAvailable
        }
        var mutableID = deviceID
        let status = AudioUnitSetProperty(
            audioUnit,
            kAudioOutputUnitProperty_CurrentDevice,
            kAudioUnitScope_Global,
            0,
            &mutableID,
            UInt32(MemoryLayout<AudioDeviceID>.size)
        )
        if status != noErr {
            log.error("Failed to bind input AU to device id=\(deviceID) status=\(status)")
            throw RunnerError.microphoneNotAvailable
        }
    }

    /// Starts listening for `AVAudioEngineConfigurationChange` on `engine`.
    private func observeConfigurationChanges(for engine: AVAudioEngine) {
        // Drop any observer from a previous call; overwriting the token without
        // removing it would leave the old block registered.
        if let configurationObserver {
            NotificationCenter.default.removeObserver(configurationObserver)
        }
        configurationObserver = NotificationCenter.default.addObserver(
            forName: .AVAudioEngineConfigurationChange,
            object: engine,
            queue: nil
        ) { [weak self] _ in
            Task {
                await self?.recoverFromConfigurationChange()
            }
        }
    }

    /// Rebinds the selected device, reinstalls the tap and restarts the engine
    /// after a configuration change. Does nothing when a recovery is already in
    /// progress or no recording is active.
    private func recoverFromConfigurationChange() async {
        guard !isRecoveringConfiguration,
              let engine,
              let selectedDeviceUID,
              let levelHandler
        else {
            return
        }

        isRecoveringConfiguration = true
        defer { isRecoveringConfiguration = false }

        do {
            let inputNode = engine.inputNode
            inputNode.removeTap(onBus: 0)
            if engine.isRunning {
                engine.stop()
            }
            guard let deviceID = Self.audioDeviceID(forUID: selectedDeviceUID) else {
                log.error("Audio config change recovery failed: device uid unavailable uid=\(selectedDeviceUID, privacy: .public)")
                return
            }
            try Self.bindEngineInput(inputNode: inputNode, to: deviceID)
            installTap(on: inputNode, levelHandler: levelHandler)
            try engine.start()
            let runtimeFormat = inputNode.inputFormat(forBus: 0)
            log.info("Audio recording config change recovered: uid=\(selectedDeviceUID, privacy: .public) sampleRate=\(runtimeFormat.sampleRate) channelCount=\(runtimeFormat.channelCount)")
        } catch {
            log.error("Audio config change recovery failed: \(error.localizedDescription, privacy: .public)")
        }
    }

    /// Installs the capture tap: reports the RMS level of channel 0 to
    /// `levelHandler` and appends every non-empty buffer to the writer.
    private func installTap(
        on inputNode: AVAudioInputNode,
        levelHandler: @Sendable @escaping (Float) -> Void
    ) {
        let captureWriter = writer
        inputNode.installTap(onBus: 0, bufferSize: 4096, format: nil) { buffer, _ in
            guard buffer.frameLength > 0 else { return }

            if let channelData = buffer.floatChannelData {
                var rms: Float = 0
                vDSP_rmsqv(channelData[0], 1, &rms, vDSP_Length(buffer.frameLength))
                levelHandler(rms)
            }

            do {
                try captureWriter.append(buffer)
            } catch {
                log.error("Failed to write capture buffer: \(error.localizedDescription, privacy: .public)")
            }
        }
    }

    /// Returns the ID of the first record matching `uid` that has at least one
    /// input channel, or `nil` when there is none.
    static func selectInputDeviceID(forUID uid: String, from records: [CoreAudioDeviceRecord]) -> AudioDeviceID? {
        records.first { $0.uid == uid && $0.inputChannelCount > 0 }?.id
    }

    /// Looks up the input-capable Core Audio device with the given UID.
    static func audioDeviceID(forUID uid: String) -> AudioDeviceID? {
        selectInputDeviceID(forUID: uid, from: coreAudioDeviceRecords())
    }

    /// Enumerates the devices reported by the system audio object.
    ///
    /// Devices whose UID cannot be read are skipped. Returns an empty array when
    /// either property query fails.
    static func coreAudioDeviceRecords() -> [CoreAudioDeviceRecord] {
        var size: UInt32 = 0
        var address = AudioObjectPropertyAddress(
            mSelector: kAudioHardwarePropertyDevices,
            mScope: kAudioObjectPropertyScopeGlobal,
            mElement: kAudioObjectPropertyElementMain
        )
        guard AudioObjectGetPropertyDataSize(
            AudioObjectID(kAudioObjectSystemObject),
            &address,
            0, nil,
            &size
        ) == noErr, size > 0 else { return [] }

        let count = Int(size) / MemoryLayout<AudioDeviceID>.size
        var deviceIDs = [AudioDeviceID](repeating: 0, count: count)
        guard AudioObjectGetPropertyData(
            AudioObjectID(kAudioObjectSystemObject),
            &address,
            0, nil,
            &size,
            &deviceIDs
        ) == noErr else { return [] }

        // `size` now holds the number of bytes actually written. If a device
        // disappeared between the two calls the tail of `deviceIDs` is still the
        // zero placeholder, so only look at the IDs that were really returned.
        let returnedCount = min(count, Int(size) / MemoryLayout<AudioDeviceID>.size)

        return deviceIDs.prefix(returnedCount).compactMap { deviceID in
            guard let uid = stringProperty(
                deviceID: deviceID,
                selector: kAudioDevicePropertyDeviceUID
            ) else {
                return nil
            }
            let name = stringProperty(
                deviceID: deviceID,
                selector: kAudioObjectPropertyName
            ) ?? uid
            let inputChannels = inputChannelCount(for: deviceID)
            return CoreAudioDeviceRecord(
                id: deviceID,
                uid: uid,
                name: name,
                inputChannelCount: inputChannels
            )
        }
    }

    /// Reads a global-scope `CFString` property from `deviceID`, or `nil` when the
    /// query fails.
    private static func stringProperty(deviceID: AudioDeviceID, selector: AudioObjectPropertySelector) -> String? {
        var address = AudioObjectPropertyAddress(
            mSelector: selector,
            mScope: kAudioObjectPropertyScopeGlobal,
            mElement: kAudioObjectPropertyElementMain
        )
        var value: CFString = "" as CFString
        var size = UInt32(MemoryLayout<CFString>.size)
        guard AudioObjectGetPropertyData(deviceID, &address, 0, nil, &size, &value) == noErr else {
            return nil
        }
        return value as String
    }

    /// Sums the channel counts of every input-scope buffer of `deviceID`.
    /// Returns 0 when the stream configuration cannot be read.
    private static func inputChannelCount(for deviceID: AudioDeviceID) -> UInt32 {
        var streamAddress = AudioObjectPropertyAddress(
            mSelector: kAudioDevicePropertyStreamConfiguration,
            mScope: kAudioObjectPropertyScopeInput,
            mElement: kAudioObjectPropertyElementMain
        )
        var streamSize: UInt32 = 0
        guard AudioObjectGetPropertyDataSize(deviceID, &streamAddress, 0, nil, &streamSize) == noErr,
              streamSize > 0
        else {
            return 0
        }

        // `streamSize` is a byte count for a variable-length AudioBufferList, so
        // allocate raw bytes rather than `streamSize` whole AudioBufferList values.
        let rawBufferList = UnsafeMutableRawPointer.allocate(
            byteCount: Int(streamSize),
            alignment: MemoryLayout<AudioBufferList>.alignment
        )
        defer { rawBufferList.deallocate() }
        guard AudioObjectGetPropertyData(deviceID, &streamAddress, 0, nil, &streamSize, rawBufferList) == noErr else {
            return 0
        }
        let bufferList = UnsafeMutableAudioBufferListPointer(
            rawBufferList.assumingMemoryBound(to: AudioBufferList.self)
        )
        return bufferList.reduce(UInt32(0)) { $0 + $1.mNumberChannels }
    }
}
diff --git a/ExecuWhisper/ExecuWhisper/Services/DictationManager.swift b/ExecuWhisper/ExecuWhisper/Services/DictationManager.swift
new file mode 100644
index 0000000000..b5c1944792
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Services/DictationManager.swift
@@ -0,0 +1,323 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
*/

import AppKit
import ApplicationServices
import Carbon.HIToolbox
import os
import SwiftUI

private let dictationLog = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "DictationManager")
private let maxDictationDuration: TimeInterval = 30 * 60

/// Drives the global-hotkey dictation flow: start capture, watch for silence,
/// stop, copy the transcript to the pasteboard and paste it into the app that
/// was frontmost when dictation started.
@MainActor @Observable
final class DictationManager {
    enum State: Equatable {
        case idle
        case listening
        case transcribing
    }

    private(set) var state: State = .idle
    var hotKeyRegistrationError: String?

    var isListening: Bool { state == .listening }

    /// Text for the overlay; only the idle state shows a message.
    var overlayStatusText: String {
        switch state {
        case .idle:
            return hotKeyRegistrationError ?? "Ready"
        case .listening:
            return ""
        case .transcribing:
            return ""
        }
    }

    var hotKeyStatusText: String {
        let display = self.hotKeyDisplayText
        if hotKeyRegistrationError != nil {
            return "\(display) unavailable"
        }
        return isHotKeyEnabled ? "\(display) ready" : "\(display) disabled"
    }

    private var isHotKeyEnabled: Bool {
        preferences?.enableGlobalHotkey ?? true
    }

    var hotKeyDisplayText: String {
        preferences?.dictationShortcut.displayString ?? DictationShortcut.controlSpace.displayString
    }

    private let store: TranscriptStore?
    private let preferences: Preferences?
    private let hotKeyManager: GlobalHotKeyManager
    private let stopRequestHandler: (@MainActor () async -> Void)?
    private var panel: DictationPanel?
    // The monitor task neither returns a value nor throws.
    private var silenceTimer: Task<Void, Never>?
    private var targetApp: NSRunningApplication?
    private var lastVoiceTime: Date = .now
    private var dictationStartTime: Date?
    private var sawVoiceActivity = false

    init(
        store: TranscriptStore? = nil,
        preferences: Preferences? = nil,
        hotKeyManager: GlobalHotKeyManager = GlobalHotKeyManager(),
        stopRequestHandler: (@MainActor () async -> Void)? = nil
    ) {
        self.store = store
        self.preferences = preferences
        self.hotKeyManager = hotKeyManager
        self.stopRequestHandler = stopRequestHandler
    }

    /// Builds a manager with no store or preferences, for previews and tests.
    static func preview(
        stopRequestHandler: (@MainActor () async -> Void)? = nil
    ) -> DictationManager {
        DictationManager(stopRequestHandler: stopRequestHandler)
    }

    func beginPreviewDictation() async {
        state = .listening
    }

    func beginPreviewTranscription() async {
        state = .transcribing
    }

    func finishPreviewDictation() async {
        state = .idle
    }

    func triggerSilenceTimeoutForTesting() {
        requestStopAndPaste(trigger: "test silence timeout")
    }

    /// Registers (or, when the hotkey is disabled, unregisters) the global
    /// shortcut. A failure is recorded in `hotKeyRegistrationError` and on the
    /// store.
    func registerHotKey() {
        guard isHotKeyEnabled else {
            hotKeyManager.unregister()
            hotKeyRegistrationError = nil
            return
        }
        let shortcut = preferences?.dictationShortcut ?? .controlSpace
        switch hotKeyManager.register(shortcut: shortcut, { [weak self] in
            Task { @MainActor in
                await self?.toggle()
            }
        }) {
        case .success:
            hotKeyRegistrationError = nil
        case .failure(let error):
            hotKeyRegistrationError = error.localizedDescription
            store?.currentError = error
        }
    }

    func refreshHotKeyRegistration() {
        registerHotKey()
    }

    func unregisterHotKey() {
        hotKeyManager.unregister()
    }

    /// Starts dictation when idle, stops and pastes when listening, and is a
    /// no-op while a transcription is in flight.
    func toggle() async {
        dictationLog.info("Toggle requested in state=\(String(describing: self.state), privacy: .public) taskCancelled=\(Task.isCancelled)")
        switch state {
        case .idle:
            await startListening()
        case .listening:
            await stopAndPaste(trigger: "toggle")
        case .transcribing:
            break
        }
    }

    func promptForAccessibilityAccess() {
        _ = Self.checkAccessibility(prompt: true)
    }

    private func startListening() async {
        guard let store else { return }
        dictationLog.info("Starting dictation listening flow")

        let started = await store.startDictationCapture()
        guard started else {
            dictationLog.error("Dictation listening start failed before overlay display")
            return
        }

        targetApp = NSWorkspace.shared.frontmostApplication
        dictationLog.info("Captured target app=\(self.targetApp?.localizedName ?? "none", privacy: .public)")
        state = .listening
        lastVoiceTime = .now
        dictationStartTime = .now
        sawVoiceActivity = false
        showPanel()
        startSilenceMonitor()
        dictationLog.info("Dictation overlay shown")
    }

    /// Schedules a stop from a non-async context. When a `stopRequestHandler`
    /// was injected it is called instead of the real stop path.
    private func requestStopAndPaste(trigger: String) {
        dictationLog.info("Scheduling stopAndPaste trigger=\(trigger, privacy: .public)")
        let stopRequestHandler = self.stopRequestHandler
        Task { @MainActor [weak self] in
            if let stopRequestHandler {
                await stopRequestHandler()
                return
            }
            await self?.stopAndPaste(trigger: trigger)
        }
    }

    /// Hides the overlay and returns the manager to `.idle`.
    private func resetToIdle() {
        dismissPanel()
        state = .idle
        dictationStartTime = nil
    }

    private func stopAndPaste(trigger: String) async {
        guard state == .listening, let store else { return }
        dictationLog.info("Stopping dictation listening flow trigger=\(trigger, privacy: .public) taskCancelled=\(Task.isCancelled)")
        state = .transcribing
        silenceTimer?.cancel()
        silenceTimer = nil

        do {
            let result = try await store.finishDictationCapture()
            dictationLog.info("Dictation produced outputLength=\(result.outputText.count) rawLength=\(result.rawText.count) tags=\(result.tags, privacy: .public)")
            if DiagnosticLogging.shouldLogTranscriptsPublicly {
                dictationLog.info("Dictation raw: \(result.rawText, privacy: .public)")
                dictationLog.info("Dictation final: \(result.outputText, privacy: .public)")
            } else {
                dictationLog.info("Dictation raw: \(result.rawText, privacy: .private)")
                dictationLog.info("Dictation final: \(result.outputText, privacy: .private)")
            }
            // Runs on every exit below, including the early return for empty text.
            defer { resetToIdle() }

            guard !result.outputText.isEmpty else { return }

            NSPasteboard.general.clearContents()
            NSPasteboard.general.setString(result.outputText, forType: .string)
            dictationLog.info("Copied dictated text to pasteboard")

            if let app = targetApp {
                app.activate()
                dictationLog.info("Reactivated target app=\(app.localizedName ?? "unknown", privacy: .public)")
            }
            try? await Task.sleep(for: .milliseconds(300))

            switch PasteController.paste(targetPID: targetApp?.processIdentifier) {
            case .pastedWithAppPermission:
                dictationLog.info("Posted synthetic Cmd+V with app Accessibility permission")
            case .pastedWithStableHelper:
                dictationLog.info("Posted synthetic Cmd+V with stable paste helper")
            case .accessibilityRequired:
                dictationLog.error("Accessibility permission missing for app and stable paste helper")
                store.currentError = .accessibilityPermissionDenied
            case .failed(let message):
                dictationLog.error("Auto-paste failed: \(message, privacy: .public)")
                store.currentError = .accessibilityPermissionDenied
            }
        } catch RunnerError.dictationNotActive {
            dictationLog.info("Dictation stop ignored: no active recording (likely a duplicate trigger).")
            resetToIdle()
        } catch let error as RunnerError {
            dictationLog.error("Dictation failed with RunnerError: \(error.localizedDescription, privacy: .public)")
            store.currentError = error
            resetToIdle()
        } catch {
            dictationLog.error("Dictation failed with unexpected error: \(error.localizedDescription, privacy: .public)")
            store.currentError = .transcriptionFailed(description: error.localizedDescription)
            resetToIdle()
        }
    }

    /// Polls the audio level every 250 ms and stops dictation automatically when
    /// the maximum duration is reached, or when the level has stayed at or below
    /// the silence threshold for `silenceTimeout` after voice was first detected.
    private func startSilenceMonitor() {
        guard let store, let preferences else { return }
        silenceTimer?.cancel()
        silenceTimer = Task { @MainActor [weak self] in
            let pollIntervalMs = 250
            while !Task.isCancelled {
                try? await Task.sleep(for: .milliseconds(pollIntervalMs))
                guard let self, self.state == .listening else { break }

                if let dictationStartTime,
                   Date.now.timeIntervalSince(dictationStartTime) >= maxDictationDuration {
                    dictationLog.info("Maximum dictation duration reached; stopping automatically")
                    self.requestStopAndPaste(trigger: "maximum duration")
                    break
                }

                if store.audioLevel > Float(preferences.silenceThreshold) {
                    self.lastVoiceTime = .now
                    self.sawVoiceActivity = true
                    continue
                }

                if self.sawVoiceActivity,
                   Date.now.timeIntervalSince(self.lastVoiceTime) >= preferences.silenceTimeout {
                    dictationLog.info("Silence timeout reached; stopping dictation automatically")
                    self.requestStopAndPaste(trigger: "silence timeout")
                    break
                }
            }
        }
    }

    private func showPanel() {
        guard let store else { return }
        let overlay = DictationOverlayView()
            .environment(store)
            .environment(self)
        panel = DictationPanel(contentView: overlay)
        panel?.showCentered(on: screenForTargetApp())
        dictationLog.info("Overlay panel created and presented")
    }

    private func dismissPanel() {
        panel?.dismiss()
        panel = nil
        dictationLog.info("Overlay panel dismissed")
    }

    /// Reports whether Accessibility access is granted, optionally showing the
    /// system prompt first.
    static func checkAccessibility(prompt: Bool = false) -> Bool {
        if prompt {
            PasteController.promptForAccessibilityAccess()
        }
        return PasteController.checkAccessibility(prompt: false)
    }

    /// Finds the screen showing the target app's first layer-0 on-screen window.
    private func screenForTargetApp() -> NSScreen? {
        guard let targetApp else { return nil }
        let options: CGWindowListOption = [.optionOnScreenOnly, .excludeDesktopElements]
        guard let windows = CGWindowListCopyWindowInfo(options, kCGNullWindowID) as? [[String: Any]] else {
            return nil
        }
        guard let bounds = windows.first(where: { info in
            (info[kCGWindowOwnerPID as String] as? pid_t) == targetApp.processIdentifier
                && (info[kCGWindowLayer as String] as? Int) == 0
        })?[kCGWindowBounds as String] as? [String: CGFloat],
            let x = bounds["X"],
            let y = bounds["Y"],
            let width = bounds["Width"],
            let height = bounds["Height"],
            let primaryScreen = NSScreen.screens.first
        else {
            return nil
        }
        // kCGWindowBounds uses Quartz display coordinates (origin at the top-left
        // of the primary display, y growing downward), while NSScreen.frame uses
        // AppKit coordinates (origin at the bottom-left of the primary display, y
        // growing upward). Flip the rect before intersecting, otherwise windows on
        // displays arranged above or below the primary one match the wrong screen.
        let windowRect = CGRect(
            x: x,
            y: primaryScreen.frame.height - (y + height),
            width: width,
            height: height
        )
        return NSScreen.screens.first { $0.frame.intersects(windowRect) }
    }
}
diff --git a/ExecuWhisper/ExecuWhisper/Services/FormatterBridge.swift b/ExecuWhisper/ExecuWhisper/Services/FormatterBridge.swift
new file mode 100644
index 0000000000..7972edcbe6
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Services/FormatterBridge.swift
@@ -0,0 +1,495 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

import Foundation
import os

private let formatterLog = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "FormatterBridge")

/// Lock-protected byte buffer used to collect the helper's stderr.
private final class FormatterDataAccumulator: @unchecked Sendable {
    private let lock = NSLock()
    private var data = Data()

    func append(_ chunk: Data) {
        lock.lock()
        data.append(chunk)
        lock.unlock()
    }

    func stringValue() -> String {
        lock.lock()
        defer { lock.unlock() }
        return String(decoding: data, as: UTF8.self)
    }
}

/// Splits a byte stream into newline-terminated lines and hands each non-empty,
/// trimmed line to `lineHandler`. The handler is called outside the lock.
private final class FormatterLineAccumulator: @unchecked Sendable {
    private let lock = NSLock()
    private var buffer = Data()
    private let lineHandler: @Sendable (String) -> Void

    init(lineHandler: @escaping @Sendable (String) -> Void) {
        self.lineHandler = lineHandler
    }

    func append(_ chunk: Data) {
        lock.lock()
        buffer.append(chunk)
        var readyLines: [Data] = []
        // 10 is the byte value of "\n".
        while let newlineIndex = buffer.firstIndex(of: 10) {
            let line = buffer.prefix(upTo: newlineIndex)
            readyLines.append(Data(line))
            buffer.removeSubrange(...newlineIndex)
        }
        lock.unlock()

        for lineData in readyLines {
            let trimmed = String(decoding: lineData, as: UTF8.self)
                .trimmingCharacters(in: .whitespacesAndNewlines)
            if !trimmed.isEmpty {
                lineHandler(trimmed)
            }
        }
    }

    /// Emits whatever is left in the buffer as a final line.
    func flush() {
        lock.lock()
        let remainderData = buffer
        buffer.removeAll()
        lock.unlock()

        let remainder = String(decoding: remainderData, as: UTF8.self)
            .trimmingCharacters(in: .whitespacesAndNewlines)
        if !remainder.isEmpty {
            lineHandler(remainder)
        }
    }
}

private struct FormatterLaunchConfiguration: Equatable, Sendable {
    let runnerPath: String
    let modelPath: String
    let tokenizerPath: String
    let tokenizerConfigPath: String
}

protocol FormatterBridgeClient: Sendable {
    func runtimeSnapshot() async -> FormatterBridge.RuntimeSnapshot

    func prepare(
        runnerPath: String,
        modelPath: String,
        tokenizerPath: String,
        tokenizerConfigPath: String
    ) async throws

    func shutdown() async

    func format(
        runnerPath: String,
        modelPath: String,
        tokenizerPath: String,
        tokenizerConfigPath: String,
        prompt: String,
        maxNewTokens: Int,
        temperature: Double
    ) async throws -> FormatterBridge.FormatResult
}

/// Owns the long-lived formatter helper process and exchanges newline-delimited
/// JSON messages with it over stdin/stdout.
actor FormatterBridge: FormatterBridgeClient {
    enum ResidencyState: Sendable, Equatable {
        case unloaded
        case loading
        case warm
        case failed
    }

    struct RuntimeSnapshot: Sendable, Equatable {
        let state: ResidencyState
        let runnerPath: String?
        let modelPath: String?
        let tokenizerPath: String?
        let tokenizerConfigPath: String?
        let statusMessage: String
    }

    struct FormatResult: Sendable, Equatable {
        let text: String
        let stdout: String
        let stderr: String
        let tokensPerSecond: Double?
    }

    private struct PendingRequest {
        let requestID: String
        // Resumed with a `FormatResult` or an error by the finishPendingRequest
        // overloads.
        let continuation: CheckedContinuation<FormatResult, Error>
    }

    private var process: Process?
    private var stdinHandle: FileHandle?
    private var stderrAccumulator = FormatterDataAccumulator()
    private var activeConfiguration: FormatterLaunchConfiguration?
    private var activeTraceID: String?
// MARK: - Mutable runtime state (FormatterBridge stored properties, continued)

    private var runtimeState: ResidencyState = .unloaded
    private var lastError: RunnerError?
    private var pendingRequest: PendingRequest?
    // The timeout task neither returns a value nor throws.
    private var pendingTimeoutTask: Task<Void, Never>?
    private var statusMessage = ""

    /// Returns a copy of the current residency state and active paths.
    func runtimeSnapshot() -> RuntimeSnapshot {
        RuntimeSnapshot(
            state: runtimeState,
            runnerPath: activeConfiguration?.runnerPath,
            modelPath: activeConfiguration?.modelPath,
            tokenizerPath: activeConfiguration?.tokenizerPath,
            tokenizerConfigPath: activeConfiguration?.tokenizerConfigPath,
            statusMessage: statusMessage
        )
    }

    /// Ensures a warm helper is running for the given paths, launching or
    /// relaunching it as needed.
    ///
    /// - Throws: `RunnerError` when a path is missing, the helper cannot be
    ///   launched, or it does not become ready in time.
    func prepare(
        runnerPath: String,
        modelPath: String,
        tokenizerPath: String,
        tokenizerConfigPath: String
    ) async throws {
        let configuration = FormatterLaunchConfiguration(
            runnerPath: runnerPath,
            modelPath: modelPath,
            tokenizerPath: tokenizerPath,
            tokenizerConfigPath: tokenizerConfigPath
        )

        // A helper that reported a fatal error but is still running would
        // otherwise never be relaunched: the process is alive and the
        // configuration matches, so every later call would just rethrow the
        // stored error. Tear it down so this call gets a fresh helper.
        if activeConfiguration != configuration || runtimeState == .failed {
            await shutdown()
        }

        if process?.isRunning == true, runtimeState == .warm, activeConfiguration == configuration {
            return
        }

        try validate(configuration)
        if process?.isRunning != true || activeConfiguration != configuration {
            try launchHelper(configuration)
        }

        try await waitForWarmRuntime(expected: configuration)
    }

    /// Fails any pending request, asks the helper to shut down, terminates it
    /// and resets all runtime state.
    func shutdown() async {
        finishPendingRequest(
            throwing: RunnerError.transcriptionFailed(description: "Formatting was cancelled.")
        )

        if let stdinHandle {
            // Best effort: the helper may already have exited.
            if let payload = try? Self.encodeJSONLine(FormatterHelperProtocol.ShutdownRequest()) {
                try? stdinHandle.write(contentsOf: payload)
            }
            try? stdinHandle.close()
        }

        if let process, process.isRunning {
            process.terminate()
        }

        process = nil
        stdinHandle = nil
        stderrAccumulator = FormatterDataAccumulator()
        activeConfiguration = nil
        // Clearing the trace ID makes late output/termination callbacks from the
        // old helper no-ops.
        activeTraceID = nil
        runtimeState = .unloaded
        statusMessage = ""
        lastError = nil
        pendingTimeoutTask?.cancel()
        pendingTimeoutTask = nil
    }

    /// Prepares the helper for the given paths and runs one format request.
    func format(
        runnerPath: String,
        modelPath: String,
        tokenizerPath: String,
        tokenizerConfigPath: String,
        prompt: String,
        maxNewTokens: Int,
        temperature: Double
    ) async throws -> FormatResult {
        let configuration = FormatterLaunchConfiguration(
            runnerPath: runnerPath,
            modelPath: modelPath,
            tokenizerPath: tokenizerPath,
            tokenizerConfigPath: tokenizerConfigPath
        )
        try await prepare(
            runnerPath: runnerPath,
            modelPath: modelPath,
            tokenizerPath: tokenizerPath,
            tokenizerConfigPath: tokenizerConfigPath
        )
        return try await sendFormatRequest(
            configuration: configuration,
            prompt: prompt,
            maxNewTokens: maxNewTokens,
            temperature: temperature
        )
    }

    /// Checks that the runner is executable and that every model file exists.
    private func validate(_ configuration: FormatterLaunchConfiguration) throws {
        let fileManager = FileManager.default
        guard fileManager.isExecutableFile(atPath: configuration.runnerPath) else {
            throw RunnerError.binaryNotFound(path: configuration.runnerPath)
        }
        for path in [configuration.modelPath, configuration.tokenizerPath, configuration.tokenizerConfigPath] {
            guard fileManager.fileExists(atPath: path) else {
                throw RunnerError.modelMissing(file: URL(fileURLWithPath: path).lastPathComponent)
            }
        }
    }

    /// Starts the helper process and the background readers for its stdout and
    /// stderr. Leaves the bridge in `.loading` on success and `.failed` when the
    /// process cannot be started.
    private func launchHelper(_ configuration: FormatterLaunchConfiguration) throws {
        let traceID = String(UUID().uuidString.prefix(8))
        let process = Process()
        let stdinPipe = Pipe()
        let stdoutPipe = Pipe()
        let stderrPipe = Pipe()
        stderrAccumulator = FormatterDataAccumulator()
        activeConfiguration = configuration
        activeTraceID = traceID
        runtimeState = .loading
        statusMessage = "Warming formatter..."
        lastError = nil

        process.executableURL = URL(fileURLWithPath: configuration.runnerPath)
        process.arguments = [
            "--model_path", configuration.modelPath,
            "--tokenizer_path", configuration.tokenizerPath,
            "--tokenizer_config_path", configuration.tokenizerConfigPath,
        ]
        process.currentDirectoryURL = URL(fileURLWithPath: configuration.modelPath).deletingLastPathComponent()
        process.environment = formatterEnvironment(modelPath: configuration.modelPath)
        process.standardInput = stdinPipe
        process.standardOutput = stdoutPipe
        process.standardError = stderrPipe

        let stdoutLines = FormatterLineAccumulator { line in
            Task { await self.handleHelperLine(line, traceID: traceID) }
        }

        process.terminationHandler = { process in
            Task {
                await self.handleTermination(
                    exitCode: process.terminationStatus,
                    traceID: traceID
                )
            }
        }

        do {
            try process.run()
            formatterLog.info(
                "FormatterBridge[\(traceID, privacy: .public)] launched helper runnerPath=\(configuration.runnerPath, privacy: .public) modelPath=\(configuration.modelPath, privacy: .public) pid=\(process.processIdentifier)"
            )
        } catch {
            let launchError = RunnerError.launchFailed(description: error.localizedDescription)
            runtimeState = .failed
            statusMessage = "Formatter launch failed"
            lastError = launchError
            throw launchError
        }

        self.process = process
        self.stdinHandle = stdinPipe.fileHandleForWriting

        // Blocking reads run on global queues so they never occupy the actor.
        DispatchQueue.global(qos: .userInitiated).async {
            let handle = stdoutPipe.fileHandleForReading
            while true {
                let data = handle.availableData
                if data.isEmpty {
                    break
                }
                stdoutLines.append(data)
            }
            stdoutLines.flush()
        }

        let stderrAccumulator = self.stderrAccumulator
        DispatchQueue.global(qos: .utility).async {
            let handle = stderrPipe.fileHandleForReading
            while true {
                let data = handle.availableData
                if data.isEmpty {
                    break
                }
                stderrAccumulator.append(data)
            }
        }
    }

    /// Copies the current environment and puts the app bundle's resource
    /// directory at the front of `DYLD_LIBRARY_PATH`, de-duplicating entries.
    /// `modelPath` is currently unused.
    private func formatterEnvironment(modelPath: String) -> [String: String] {
        var environment = ProcessInfo.processInfo.environment
        let bundleResources = Bundle.main.resourcePath ?? ""
        var dyldEntries: [String] = []
        if !bundleResources.isEmpty {
            dyldEntries.append(bundleResources)
        }
        if let existing = environment["DYLD_LIBRARY_PATH"], !existing.isEmpty {
            dyldEntries.append(contentsOf: existing.components(separatedBy: ":"))
        }
        let uniqueEntries = Array(NSOrderedSet(array: dyldEntries)).compactMap { $0 as? String }
        environment["DYLD_LIBRARY_PATH"] = uniqueEntries.joined(separator: ":")
        return environment
    }

    /// Polls every 50 ms, for up to 30 s, until the helper reports ready.
    private func waitForWarmRuntime(expected configuration: FormatterLaunchConfiguration) async throws {
        let deadline = Date().addingTimeInterval(30)
        while Date() < deadline {
            if activeConfiguration != configuration {
                // handleTermination also clears activeConfiguration. When the
                // helper died during warm-up, report the recorded crash (with its
                // stderr) rather than a misleading "configuration changed".
                if runtimeState == .unloaded, let lastError {
                    throw lastError
                }
                throw RunnerError.launchFailed(description: "Formatter helper configuration changed during warmup.")
            }

            switch runtimeState {
            case .warm:
                return
            case .failed:
                throw lastError ?? RunnerError.launchFailed(description: "Formatter helper failed to warm.")
            case .unloaded:
                throw lastError ?? RunnerError.launchFailed(description: "Formatter helper exited before becoming ready.")
            case .loading:
                try await Task.sleep(for: .milliseconds(50))
            }
        }
        throw RunnerError.launchFailed(description: "Timed out waiting for the formatter helper to become ready.")
    }

    /// Writes one format request to the helper and suspends until a matching
    /// result, an error, a timeout or helper termination resumes the caller.
    private func sendFormatRequest(
        configuration: FormatterLaunchConfiguration,
        prompt: String,
        maxNewTokens: Int,
        temperature: Double
    ) async throws -> FormatResult {
        guard activeConfiguration == configuration, runtimeState == .warm else {
            throw RunnerError.launchFailed(description: "Formatter helper is not warm.")
        }
        guard pendingRequest == nil else {
            throw RunnerError.transcriptionFailed(description: "Formatter helper is already processing another request.")
        }
        guard let stdinHandle else {
            throw RunnerError.launchFailed(description: "Formatter helper stdin is unavailable.")
        }

        let requestID = UUID().uuidString
        return try await withCheckedThrowingContinuation { continuation in
            pendingRequest = PendingRequest(requestID: requestID, continuation: continuation)
            // At least 30 s, and longer for large token budgets.
            let timeoutSeconds = max(30, Int(ceil(Double(maxNewTokens) / 4.0)))
            let request = FormatterHelperProtocol.FormatRequest(
                requestID: requestID,
                prompt: prompt,
                maxNewTokens: maxNewTokens,
                temperature: temperature
            )

            do {
                statusMessage = "Formatting..."
                let requestData = try Self.encodeJSONLine(request)
                try stdinHandle.write(contentsOf: requestData)
                pendingTimeoutTask?.cancel()
                pendingTimeoutTask = Task {
                    try? await Task.sleep(for: .seconds(timeoutSeconds))
                    await self.timeoutPendingRequest(requestID: requestID)
                }
            } catch {
                finishPendingRequest(throwing: RunnerError.launchFailed(description: error.localizedDescription))
            }
        }
    }

    /// Fails the pending request if it is still the one identified by `requestID`.
    private func timeoutPendingRequest(requestID: String) {
        guard pendingRequest?.requestID == requestID else { return }
        finishPendingRequest(
            throwing: RunnerError.transcriptionFailed(description: "Timed out waiting for formatter output.")
        )
    }

    /// Decodes one stdout line from the helper and applies it to the bridge
    /// state. Lines from a helper that is no longer the active one are ignored.
    private func handleHelperLine(_ line: String, traceID: String) async {
        guard traceID == activeTraceID else { return }
        guard let data = line.data(using: .utf8) else {
            return
        }

        do {
            let message = try JSONDecoder().decode(FormatterHelperProtocol.HelperMessage.self, from: data)
            switch message {
            case .ready:
                runtimeState = .warm
                statusMessage = "Formatter ready"
                lastError = nil
            case .status(let status):
                if status.requestID == pendingRequest?.requestID {
                    statusMessage = status.message
                }
            case .result(let result):
                guard result.requestID == pendingRequest?.requestID else { return }
                finishPendingRequest(returning: FormatResult(
                    text: result.text,
                    stdout: result.stdout,
                    stderr: result.stderr,
                    tokensPerSecond: result.tokensPerSecond
                ))
                statusMessage = "Formatter ready"
            case .error(let errorMessage):
                let description = errorMessage.details ?? errorMessage.message
                if let errorRequestID = errorMessage.requestID {
                    // Request-scoped error: it only concerns the request it names.
                    // An error for a request that already timed out must not mark
                    // the whole runtime as failed.
                    guard errorRequestID == pendingRequest?.requestID else {
                        formatterLog.error(
                            "FormatterBridge[\(traceID, privacy: .public)] ignoring error for inactive request: \(description, privacy: .public)"
                        )
                        return
                    }
                    finishPendingRequest(throwing: RunnerError.transcriptionFailed(description: description))
                } else {
                    // Runtime-level error (no request ID). Comparing the two
                    // optionals directly would treat "no ID" and "no pending
                    // request" as a match and silently drop warm-up failures.
                    runtimeState = .failed
                    statusMessage = "Formatter failed"
                    lastError = .launchFailed(description: description)
                }
            }
        } catch {
            formatterLog.error(
                "FormatterBridge[\(traceID, privacy: .public)] failed to parse helper message: \(error.localizedDescription, privacy: .public)\n\(line, privacy: .public)"
            )
        }
    }

    /// Handles an exit of the active helper: fails the pending request and
    /// resets the bridge to `.unloaded`.
    private func handleTermination(exitCode: Int32, traceID: String) async {
        guard traceID == activeTraceID else { return }
        let stderr = stderrAccumulator.stringValue()
        let crash = RunnerError.runnerCrashed(
            exitCode: exitCode,
            stderr: stderr.isEmpty ? "Formatter helper terminated unexpectedly." : stderr
        )
        if pendingRequest != nil {
            finishPendingRequest(throwing: crash)
        }
        if runtimeState == .loading {
            // Keep the reason so a caller waiting for warm-up can report it.
            lastError = crash
        }
        runtimeState = .unloaded
        statusMessage = ""
        activeConfiguration = nil
        activeTraceID = nil
        process = nil
        stdinHandle = nil
    }

    /// Resumes the pending request, if any, with `result`.
    private func finishPendingRequest(returning result: FormatResult) {
        guard let pendingRequest else { return }
        // Clear state before resuming so the continuation is resumed exactly once.
        self.pendingRequest = nil
        pendingTimeoutTask?.cancel()
        pendingTimeoutTask = nil
        pendingRequest.continuation.resume(returning: result)
    }

    /// Resumes the pending request, if any, by throwing `error`.
    private func finishPendingRequest(throwing error: Error) {
        guard let pendingRequest else { return }
        self.pendingRequest = nil
        pendingTimeoutTask?.cancel()
        pendingTimeoutTask = nil
        pendingRequest.continuation.resume(throwing: error)
    }

    /// Encodes `value` as JSON followed by a single newline.
    private static func encodeJSONLine<T: Encodable>(_ value: T) throws -> Data {
        try JSONEncoder().encode(value) + Data("\n".utf8)
    }
}
diff --git a/ExecuWhisper/ExecuWhisper/Services/FormatterHelperProtocol.swift b/ExecuWhisper/ExecuWhisper/Services/FormatterHelperProtocol.swift
new file mode 100644
index 0000000000..ad520aff6f
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Services/FormatterHelperProtocol.swift
@@ -0,0 +1,144 @@
/*
 * Copyright (c) Meta Platforms, Inc.
and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

import Foundation

/// Message types exchanged with the formatter helper as JSON.
///
/// Every message carries a `type` discriminator and a protocol `version`.
/// Multi-word fields use snake_case keys on the wire.
enum FormatterHelperProtocol {
    /// Protocol version stamped on every outgoing request.
    static let version = 1

    // MARK: - Requests (app -> helper)

    /// Asks the helper to format one prompt.
    struct FormatRequest: Codable, Sendable, Equatable {
        enum CodingKeys: String, CodingKey {
            case type
            case version
            case requestID = "request_id"
            case prompt
            case maxNewTokens = "max_new_tokens"
            case temperature
        }

        let type: String
        let version: Int
        let requestID: String
        let prompt: String
        let maxNewTokens: Int
        let temperature: Double

        /// Creates a request with `type` fixed to `"format"` and the current
        /// protocol version.
        init(
            requestID: String,
            prompt: String,
            maxNewTokens: Int,
            temperature: Double
        ) {
            type = "format"
            version = FormatterHelperProtocol.version
            self.requestID = requestID
            self.prompt = prompt
            self.maxNewTokens = maxNewTokens
            self.temperature = temperature
        }
    }

    /// Asks the helper to exit.
    struct ShutdownRequest: Codable, Sendable, Equatable {
        let type: String
        let version: Int

        /// Creates a request with `type` fixed to `"shutdown"` and the current
        /// protocol version.
        init() {
            type = "shutdown"
            version = FormatterHelperProtocol.version
        }
    }

    // MARK: - Messages (helper -> app)

    /// Payload of a `"ready"` message.
    struct ReadyMessage: Codable, Sendable, Equatable {
        let type: String
        let version: Int
    }

    /// Payload of a `"status"` message; `requestID` may be absent.
    struct StatusMessage: Codable, Sendable, Equatable {
        enum CodingKeys: String, CodingKey {
            case type
            case version
            case requestID = "request_id"
            case phase
            case message
        }

        let type: String
        let version: Int
        let requestID: String?
        let phase: String
        let message: String
    }

    /// Payload of a `"result"` message for one format request.
    struct ResultMessage: Codable, Sendable, Equatable {
        enum CodingKeys: String, CodingKey {
            case type
            case version
            case requestID = "request_id"
            case text
            case stdout
            case stderr
            case tokensPerSecond = "tokens_per_second"
        }

        let type: String
        let version: Int
        let requestID: String
        let text: String
        let stdout: String
        let stderr: String
        let tokensPerSecond: Double?
    }

    /// Payload of an `"error"` message; `requestID` and `details` may be absent.
    struct ErrorMessage: Codable, Sendable, Equatable {
        enum CodingKeys: String, CodingKey {
            case type
            case version
            case requestID = "request_id"
            case message
            case details
        }

        let type: String
        let version: Int
        let requestID: String?
        let message: String
        let details: String?
    }

    /// Any message the helper can send, selected by its `type` field.
    enum HelperMessage: Decodable, Sendable, Equatable {
        case ready(ReadyMessage)
        case status(StatusMessage)
        case result(ResultMessage)
        case error(ErrorMessage)

        private enum CodingKeys: String, CodingKey {
            case type
        }

        /// Reads the `type` discriminator, then decodes the whole object as the
        /// matching payload. Throws a data-corrupted error for unknown types.
        init(from decoder: Decoder) throws {
            let envelope = try decoder.container(keyedBy: CodingKeys.self)
            let discriminator = try envelope.decode(String.self, forKey: .type)

            if discriminator == "ready" {
                self = .ready(try ReadyMessage(from: decoder))
            } else if discriminator == "status" {
                self = .status(try StatusMessage(from: decoder))
            } else if discriminator == "result" {
                self = .result(try ResultMessage(from: decoder))
            } else if discriminator == "error" {
                self = .error(try ErrorMessage(from: decoder))
            } else {
                throw DecodingError.dataCorruptedError(
                    forKey: .type,
                    in: envelope,
                    debugDescription: "Unknown formatter helper message type: \(discriminator)"
                )
            }
        }
    }
}
diff --git a/ExecuWhisper/ExecuWhisper/Services/FormatterPromptBuilder.swift b/ExecuWhisper/ExecuWhisper/Services/FormatterPromptBuilder.swift
new file mode 100644
index 0000000000..49fcd2d113
--- /dev/null
+++ b/ExecuWhisper/ExecuWhisper/Services/FormatterPromptBuilder.swift
@@ -0,0 +1,51 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
import Foundation

/// Builds the chat-template prompt and generation limits used to turn a raw
/// dictation transcript into cleaned-up text.
enum FormatterPromptBuilder {
    /// Sampling temperature paired with prompts built here.
    static let temperature = 0.0

    // Instruction text; the trailing backslashes join the source lines into
    // one paragraph with no embedded newlines.
    private static let smartInstruction = """
    You rewrite spoken dictation into clean final text. You are not a chat assistant. \
    Never answer or respond to the dictation, even if it is a question. Treat the dictation strictly as text to rewrite. \
    Fix casing, punctuation, filler, and speech disfluencies. Preserve meaning and detail. Use bullets only when it clearly reads as a list. \
    Do not summarize or invent information. Output only the rewritten dictation.
    """

    // Few-shot examples placed between the instruction and the real dictation.
    private static let exampleBlock = """
    Examples:
    Dictation: um does it feel like real time processing
    Output: Does it feel like real-time processing?

    Dictation: what is the next step
    Output: What is the next step?

    Dictation: okay so the plan is finish the build then deploy
    Output: Okay, so the plan is finish the build, then deploy.
    """

    /// Renders the full prompt for `transcript`.
    ///
    /// The result is a single user turn (instruction, examples, the dictation
    /// itself, an empty `Output:` line) followed by an opened assistant turn,
    /// and always ends with a newline.
    static func prompt(transcript: String) -> String {
        let lines = [
            "<|startoftext|><|im_start|>user",
            smartInstruction,
            "",
            exampleBlock,
            "",
            "Dictation: \(transcript)",
            "Output:",
            "<|im_end|>",
            "<|im_start|>assistant",
            "",  // yields the trailing newline after the assistant tag
        ]
        return lines.joined(separator: "\n")
    }

    /// Token budget for formatting `transcript`: twice its whitespace-separated
    /// word count (an empty transcript counts as one word), clamped to 96...512.
    static func maxNewTokens(for transcript: String) -> Int {
        let wordCount = transcript.split(whereSeparator: \.isWhitespace).count
        let doubled = max(1, wordCount) * 2
        let atLeastFloor = max(96, doubled)
        return min(512, atLeastFloor)
    }
}
import AVFoundation
import Carbon.HIToolbox
import Foundation

/// Registers one Carbon hot key on the application event target and forwards
/// each press to a main-actor callback.
final class GlobalHotKeyManager {
    private var hotKeyRef: EventHotKeyRef?
    private var eventHandlerRef: EventHandlerRef?
    private var callback: (@MainActor () -> Void)?

    // The installed Carbon handler is given an *unretained* pointer to `self`
    // (see `installEventHandler`). Removing the handler here keeps that pointer
    // from outliving the manager.
    deinit {
        unregister()
    }

    /// Replaces any existing registration with `shortcut`.
    ///
    /// - Parameters:
    ///   - shortcut: Key code plus Carbon modifier mask; the mask must be non-zero.
    ///   - callback: Invoked on the main actor each time the hot key is pressed.
    /// - Returns: `.success` once both the event handler and the hot key are
    ///   installed; otherwise `.failure(.hotKeyRegistrationFailed)` with a
    ///   human-readable description. On failure nothing stays registered.
    ///
    // NOTE(review): the generic arguments of `Result` below and of `Unmanaged`
    // in `installEventHandler` were missing from the reviewed text. `Void` is
    // fixed by `.success(())`; `RunnerError` is inferred from the
    // `.hotKeyRegistrationFailed` case and the error type used by sibling
    // services -- confirm against the original declaration.
    func register(
        shortcut: DictationShortcut,
        _ callback: @escaping @MainActor () -> Void
    ) -> Result<Void, RunnerError> {
        unregister()
        self.callback = callback

        guard shortcut.carbonModifiers != 0 else {
            unregister()
            return .failure(.hotKeyRegistrationFailed(description: "Shortcuts must include at least one modifier key."))
        }

        // Without a handler the hot key would register but never fire, so a
        // failed install is reported instead of being ignored.
        let handlerStatus = installEventHandler()
        guard handlerStatus == noErr else {
            unregister()
            return .failure(.hotKeyRegistrationFailed(
                description: "macOS could not install the hotkey event handler (OSStatus \(handlerStatus))."
            ))
        }

        let hotKeyID = EventHotKeyID(signature: OSType(0x4557_4853), id: 1)
        var ref: EventHotKeyRef?
        let status = RegisterEventHotKey(
            shortcut.keyCode,
            shortcut.carbonModifiers,
            hotKeyID,
            GetApplicationEventTarget(),
            0,
            &ref
        )

        guard status == noErr else {
            unregister()
            return .failure(.hotKeyRegistrationFailed(description: Self.errorMessage(for: status, shortcut: shortcut)))
        }

        hotKeyRef = ref
        return .success(())
    }

    /// Removes the hot key and the event handler (if present) and drops the callback.
    func unregister() {
        if let ref = hotKeyRef {
            UnregisterEventHotKey(ref)
            hotKeyRef = nil
        }
        if let handler = eventHandlerRef {
            RemoveEventHandler(handler)
            eventHandlerRef = nil
        }
        callback = nil
    }

    /// Installs the hot-key-pressed handler and returns the Carbon status code.
    private func installEventHandler() -> OSStatus {
        var eventType = EventTypeSpec(
            eventClass: OSType(kEventClassKeyboard),
            eventKind: UInt32(kEventHotKeyPressed)
        )
        // Unretained on purpose: the handler must not keep the manager alive.
        let selfPtr = Unmanaged.passUnretained(self).toOpaque()

        return InstallEventHandler(
            GetApplicationEventTarget(),
            { _, _, userData in
                guard let userData else { return OSStatus(eventNotHandledErr) }
                let manager = Unmanaged<GlobalHotKeyManager>.fromOpaque(userData).takeUnretainedValue()
                manager.handleHotKey()
                return noErr
            },
            1,
            &eventType,
            selfPtr,
            &eventHandlerRef
        )
    }

    /// Hops to the main actor before running the stored callback.
    private func handleHotKey() {
        guard let callback else { return }
        Task { @MainActor in
            callback()
        }
    }

    /// Maps a `RegisterEventHotKey` failure status to a user-facing message.
    private static func errorMessage(for status: OSStatus, shortcut: DictationShortcut) -> String {
        let display = shortcut.displayString
        switch Int(status) {
        case eventHotKeyExistsErr:
            return "\(display) is already registered. Check macOS keyboard or input-source shortcuts."
        case eventHotKeyInvalidErr:
            return "macOS rejected the \(display) hotkey registration."
        default:
            return "macOS returned OSStatus \(status). \(display) may already be reserved by the system."
        }
    }
}

/// Checks whether the runner binary, model files and microphone permission
/// needed for transcription are in place.
struct HealthCheck: Sendable {
    /// Outcome of one health check.
    struct Result: Sendable {
        var runnerAvailable: Bool
        var modelAvailable: Bool
        var tokenizerAvailable: Bool
        var micPermission: MicPermission

        /// True when the runner, model and tokenizer are all available.
        var resourcesReady: Bool {
            runnerAvailable && modelAvailable && tokenizerAvailable
        }

        /// True when all resources are ready and the microphone is authorized.
        var allGood: Bool {
            resourcesReady && micPermission == .authorized
        }

        /// Display names of the missing resources, in runner/model/tokenizer order.
        var missingFiles: [String] {
            var missing: [String] = []
            if !runnerAvailable { missing.append("parakeet_helper") }
            if !modelAvailable { missing.append("model.pte") }
            if !tokenizerAvailable { missing.append("tokenizer.model") }
            return missing
        }

        /// True when the model or the tokenizer is missing.
        var modelAssetsMissing: Bool {
            !modelAvailable || !tokenizerAvailable
        }

        /// A download is only offered when the runner exists but model assets do not.
        var shouldOfferModelDownload: Bool {
            runnerAvailable && modelAssetsMissing
        }

        /// Short setup status; a missing runner takes precedence over missing model assets.
        var setupStatusMessage: String {
            if !runnerAvailable { return "Helper setup required" }
            if modelAssetsMissing { return "Model download required" }
            return "Ready"
        }
    }

    /// Microphone authorization collapsed to three states.
    enum MicPermission: Sendable {
        case authorized
        case denied
        case notDetermined
    }

    /// Runs the check: the runner must be an executable file, the model and
    /// tokenizer must exist with non-zero size, and the current microphone
    /// authorization is recorded.
    static func run(
        runnerPath: String,
        modelPath: String,
        tokenizerPath: String
    ) async -> Result {
        let fm = FileManager.default
        let micPerm = await microphonePermission()

        return Result(
            runnerAvailable: fm.isExecutableFile(atPath: runnerPath),
            modelAvailable: fileExistsAndHasData(atPath: modelPath),
            tokenizerAvailable: fileExistsAndHasData(atPath: tokenizerPath),
            micPermission: micPerm
        )
    }

    /// Requests audio capture access and returns whether it was granted.
    static func requestMicrophoneAccess() async -> Bool {
        await AVCaptureDevice.requestAccess(for: .audio)
    }

    /// Current microphone authorization, read at call time.
    static func liveMicPermission() async -> MicPermission {
        await microphonePermission()
    }

    /// Maps the AVFoundation status to `MicPermission`; `.restricted` counts as
    /// denied and unknown future cases as not determined.
    private static func microphonePermission() async -> MicPermission {
        switch AVCaptureDevice.authorizationStatus(for: .audio) {
        case .authorized:
            return .authorized
        case .denied, .restricted:
            return .denied
        case .notDetermined:
            return .notDetermined
        @unknown default:
            return .notDetermined
        }
    }

    /// True when `path` exists and its reported size is greater than zero.
    private static func fileExistsAndHasData(atPath path: String) -> Bool {
        guard FileManager.default.fileExists(atPath: path) else { return false }
        let attributes = try? FileManager.default.attributesOfItem(atPath: path)
        let size = attributes?[.size] as? NSNumber
        return size?.int64Value ?? 0 > 0
    }
}
+ */ + +import AVFoundation +import Foundation +import UniformTypeIdentifiers + +struct DecodedImportedAudioFile: Sendable, Equatable { + let pcmData: Data + let duration: TimeInterval +} + +protocol ImportedAudioDecoding: Sendable { + func decodeAudioFile(at url: URL) throws -> DecodedImportedAudioFile +} + +struct ImportedAudioDecoder: ImportedAudioDecoding { + private static let supportedExtensions: Set = ["wav", "mp3"] + private static let maxImportedDuration: TimeInterval = 30 * 60 + private static let maxEstimatedImportMemoryBytes = 256 * 1024 * 1024 + private let outputSampleRate: Double = 16_000 + + static var allowedContentTypes: [UTType] { + [UTType(filenameExtension: "wav"), UTType(filenameExtension: "mp3")].compactMap { $0 } + } + + static func supportsAudioFile(_ url: URL) -> Bool { + supportedExtensions.contains(url.pathExtension.lowercased()) + } + + static func importableAudioFile(from urls: [URL]) -> URL? { + guard urls.count == 1, let url = urls.first, supportsAudioFile(url) else { return nil } + return url + } + + func decodeAudioFile(at url: URL) throws -> DecodedImportedAudioFile { + guard Self.supportsAudioFile(url) else { + throw RunnerError.transcriptionFailed( + description: "Unsupported audio file type. Import a .wav or .mp3 file." + ) + } + + let audioFile: AVAudioFile + do { + audioFile = try AVAudioFile(forReading: url) + } catch { + throw RunnerError.transcriptionFailed( + description: "Could not open audio file '\(url.lastPathComponent)'." 
+ ) + } + + let sourceFormat = audioFile.processingFormat + guard sourceFormat.channelCount > 0, sourceFormat.sampleRate > 0 else { + throw RunnerError.transcriptionFailed(description: "Audio file is missing a readable audio stream.") + } + + let frameCount = audioFile.length + guard frameCount > 0, frameCount <= Int64(UInt32.max) else { + throw RunnerError.transcriptionFailed(description: "Audio file is empty or too large to import.") + } + let duration = Double(frameCount) / sourceFormat.sampleRate + guard duration <= Self.maxImportedDuration else { + throw RunnerError.transcriptionFailed(description: "Audio file is too long to import. Please use a file shorter than 30 minutes.") + } + let estimatedSourcePCMBytes = Double(frameCount) * Double(max(sourceFormat.channelCount, 1)) * Double(MemoryLayout.size) + let estimatedNormalizedBytes = duration * outputSampleRate * Double(MemoryLayout.size) + let estimatedPeakBytes = estimatedSourcePCMBytes + (estimatedNormalizedBytes * 2) + guard estimatedPeakBytes <= Double(Self.maxEstimatedImportMemoryBytes) else { + throw RunnerError.transcriptionFailed( + description: "Audio file is too large to import reliably. Please use a shorter recording." 
+ ) + } + + guard let inputBuffer = AVAudioPCMBuffer( + pcmFormat: sourceFormat, + frameCapacity: AVAudioFrameCount(frameCount) + ) else { + throw RunnerError.transcriptionFailed(description: "Could not allocate an audio decode buffer.") + } + + do { + try audioFile.read(into: inputBuffer) + } catch { + throw RunnerError.transcriptionFailed(description: "Could not decode audio frames from the file.") + } + + guard inputBuffer.frameLength > 0 else { + throw RunnerError.transcriptionFailed(description: "Audio file did not contain any decodable samples.") + } + + let outputChannelCount: AVAudioChannelCount = 1 + guard let outputFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: outputSampleRate, + channels: outputChannelCount, + interleaved: false + ) else { + throw RunnerError.transcriptionFailed(description: "Could not create the imported audio output format.") + } + + guard let converter = AVAudioConverter(from: sourceFormat, to: outputFormat) else { + throw RunnerError.transcriptionFailed( + description: "Could not convert audio from \(Int(sourceFormat.sampleRate)) Hz to 16 kHz mono." + ) + } + + let convertedCapacity = AVAudioFrameCount( + ceil(Double(inputBuffer.frameLength) * outputSampleRate / sourceFormat.sampleRate) + ) + 1 + guard let outputBuffer = AVAudioPCMBuffer( + pcmFormat: outputFormat, + frameCapacity: max(convertedCapacity, 1) + ) else { + throw RunnerError.transcriptionFailed(description: "Could not allocate a normalized audio buffer.") + } + + var didConsumeInput = false + var conversionError: NSError? 
+ let status = converter.convert(to: outputBuffer, error: &conversionError) { _, outStatus in + if didConsumeInput { + outStatus.pointee = .endOfStream + return nil + } + didConsumeInput = true + outStatus.pointee = .haveData + return inputBuffer + } + + if let conversionError { + throw RunnerError.transcriptionFailed(description: conversionError.localizedDescription) + } + guard status != .error, outputBuffer.frameLength > 0, let channelData = outputBuffer.floatChannelData else { + throw RunnerError.transcriptionFailed(description: "Could not normalize the imported audio samples.") + } + let byteCount = Int(outputBuffer.frameLength) * MemoryLayout.size + let pcmData = Data(bytes: channelData[0], count: byteCount) + return DecodedImportedAudioFile(pcmData: pcmData, duration: duration) + } +} diff --git a/ExecuWhisper/ExecuWhisper/Services/ModelDownloader.swift b/ExecuWhisper/ExecuWhisper/Services/ModelDownloader.swift new file mode 100644 index 0000000000..4c8fd01c6e --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Services/ModelDownloader.swift @@ -0,0 +1,439 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import CryptoKit +import os + +private let downloadLog = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "ModelDownloader") + +private struct DownloadAsset: Sendable { + let fileName: String + let url: URL + let minimumBytes: Int64 +} + +private struct ModelManifest: Decodable, Sendable { + struct Asset: Decodable, Sendable { + let bytes: Int64 + let sha256: String + } + + let assets: [String: Asset] + + static func load() -> Self? { + guard let url = Bundle.main.url(forResource: "model_manifest", withExtension: "json"), + let data = try? Data(contentsOf: url) + else { + return nil + } + return try? 
JSONDecoder().decode(Self.self, from: data) + } +} + +private final class AssetDownloadDelegate: NSObject, URLSessionDownloadDelegate { + private let destinationURL: URL + private let progressHandler: @Sendable (Int64, Int64) -> Void + private let completionHandler: @Sendable (Result) -> Void + private let lock = NSLock() + private var finishedURL: URL? + private var delivered = false + private var responseError: Error? + + init( + destinationURL: URL, + progressHandler: @escaping @Sendable (Int64, Int64) -> Void, + completionHandler: @escaping @Sendable (Result) -> Void + ) { + self.destinationURL = destinationURL + self.progressHandler = progressHandler + self.completionHandler = completionHandler + } + + func urlSession( + _ session: URLSession, + downloadTask: URLSessionDownloadTask, + didWriteData bytesWritten: Int64, + totalBytesWritten: Int64, + totalBytesExpectedToWrite: Int64 + ) { + progressHandler(totalBytesWritten, totalBytesExpectedToWrite) + } + + func urlSession( + _ session: URLSession, + task: URLSessionTask, + didReceive response: URLResponse, + completionHandler: @escaping (URLSession.ResponseDisposition) -> Void + ) { + if let httpResponse = response as? HTTPURLResponse, + !(200...299).contains(httpResponse.statusCode) { + lock.lock() + responseError = RunnerError.downloadFailed( + file: destinationURL.lastPathComponent, + description: "Server returned HTTP \(httpResponse.statusCode)." 
+ ) + lock.unlock() + completionHandler(.cancel) + return + } + completionHandler(.allow) + } + + func urlSession( + _ session: URLSession, + downloadTask: URLSessionDownloadTask, + didFinishDownloadingTo location: URL + ) { + do { + let fm = FileManager.default + if fm.fileExists(atPath: destinationURL.path(percentEncoded: false)) { + try fm.removeItem(at: destinationURL) + } + try fm.createDirectory(at: destinationURL.deletingLastPathComponent(), withIntermediateDirectories: true) + try fm.moveItem(at: location, to: destinationURL) + lock.lock() + finishedURL = destinationURL + lock.unlock() + } catch { + completeOnce(.failure(error)) + } + } + + func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) { + lock.lock() + let responseError = self.responseError + lock.unlock() + + if let responseError { + completeOnce(.failure(responseError)) + return + } + + if let error { + completeOnce(.failure(error)) + return + } + + lock.lock() + let url = finishedURL + lock.unlock() + + if let url { + completeOnce(.success(url)) + } else { + completeOnce(.failure(RunnerError.downloadFailed(file: destinationURL.lastPathComponent, description: "Download completed without a file."))) + } + } + + private func completeOnce(_ result: Result) { + lock.lock() + defer { lock.unlock() } + guard !delivered else { return } + delivered = true + completionHandler(result) + } +} + +@MainActor @Observable +final class ModelDownloader { + enum State: Equatable { + case idle + case downloading + case finished + case failed + } + + private struct ActiveDownload { + let session: URLSession + let delegate: AssetDownloadDelegate + } + + private let repoBaseURL = URL(string: "https://huggingface.co/younghan-meta/Parakeet-TDT-ExecuTorch-Metal/resolve/main/")! + private let formatterRepoBaseURL = URL(string: "https://huggingface.co/younghan-meta/LFM2.5-ExecuTorch-MLX/resolve/main/")! 
+ private let manifest = ModelManifest.load() + private var activeDownload: ActiveDownload? + + var state: State = .idle + var statusMessage = "" + var currentFileName = "" + var currentFileProgress = 0.0 + var overallProgress = 0.0 + var completedFiles = 0 + var lastError: String? + + var isDownloading: Bool { state == .downloading } + + func downloadModels(destinationDirectory: URL) async throws { + guard !isDownloading else { return } + + let assets = [ + DownloadAsset( + fileName: "model.pte", + url: repoBaseURL.appendingPathComponent("model.pte"), + minimumBytes: 1_000_000 + ), + DownloadAsset( + fileName: "tokenizer.model", + url: repoBaseURL.appendingPathComponent("tokenizer.model"), + minimumBytes: 1_000 + ), + ] + let stagingDirectory = destinationDirectory + .deletingLastPathComponent() + .appendingPathComponent(".ExecuWhisperDownload-\(UUID().uuidString)", isDirectory: true) + + try FileManager.default.createDirectory(at: destinationDirectory.deletingLastPathComponent(), withIntermediateDirectories: true) + try FileManager.default.createDirectory(at: stagingDirectory, withIntermediateDirectories: true) + + state = .downloading + statusMessage = "Preparing download..." + currentFileName = "" + currentFileProgress = 0 + overallProgress = 0 + completedFiles = 0 + lastError = nil + + do { + for (index, asset) in assets.enumerated() { + currentFileName = asset.fileName + statusMessage = "Downloading \(asset.fileName)..." + currentFileProgress = 0 + let stagedURL = stagingDirectory.appendingPathComponent(asset.fileName) + _ = try await downloadAsset(asset, to: stagedURL, completedIndex: index, totalCount: assets.count) + completedFiles = index + 1 + currentFileProgress = 1 + overallProgress = Double(completedFiles) / Double(assets.count) + } + + statusMessage = "Validating downloads..." + try validateStagedAssets(in: stagingDirectory, assets: assets) + + statusMessage = "Finalizing model files..." 
+ try activateDownloads(from: stagingDirectory, to: destinationDirectory, assets: assets) + + currentFileName = "" + currentFileProgress = 1 + overallProgress = 1 + statusMessage = "Model download complete" + state = .finished + } catch { + lastError = error.localizedDescription + statusMessage = "Download failed" + state = .failed + try? FileManager.default.removeItem(at: stagingDirectory) + throw error + } + + try? FileManager.default.removeItem(at: stagingDirectory) + } + + func downloadFormatterModels(destinationDirectory: URL) async throws { + guard !isDownloading else { return } + + let assets = [ + DownloadAsset( + fileName: "lfm2_5_350m_mlx_4w.pte", + url: formatterRepoBaseURL.appendingPathComponent("lfm2_5_350m_mlx_4w.pte"), + minimumBytes: 1_000_000 + ), + DownloadAsset( + fileName: "tokenizer.json", + url: formatterRepoBaseURL.appendingPathComponent("tokenizer.json"), + minimumBytes: 1_000 + ), + DownloadAsset( + fileName: "tokenizer_config.json", + url: formatterRepoBaseURL.appendingPathComponent("tokenizer_config.json"), + minimumBytes: 100 + ), + ] + let stagingDirectory = destinationDirectory + .deletingLastPathComponent() + .appendingPathComponent(".ExecuWhisperFormatterDownload-\(UUID().uuidString)", isDirectory: true) + + try FileManager.default.createDirectory(at: destinationDirectory.deletingLastPathComponent(), withIntermediateDirectories: true) + try FileManager.default.createDirectory(at: stagingDirectory, withIntermediateDirectories: true) + + state = .downloading + statusMessage = "Preparing formatter download..." + currentFileName = "" + currentFileProgress = 0 + overallProgress = 0 + completedFiles = 0 + lastError = nil + + do { + for (index, asset) in assets.enumerated() { + currentFileName = asset.fileName + statusMessage = "Downloading \(asset.fileName)..." 
+ currentFileProgress = 0 + let stagedURL = stagingDirectory.appendingPathComponent(asset.fileName) + _ = try await downloadAsset(asset, to: stagedURL, completedIndex: index, totalCount: assets.count) + completedFiles = index + 1 + currentFileProgress = 1 + overallProgress = Double(completedFiles) / Double(assets.count) + } + + statusMessage = "Validating formatter downloads..." + try validateStagedAssets(in: stagingDirectory, assets: assets) + + statusMessage = "Finalizing formatter files..." + try activateDownloads(from: stagingDirectory, to: destinationDirectory, assets: assets) + + currentFileName = "" + currentFileProgress = 1 + overallProgress = 1 + statusMessage = "Formatter download complete" + state = .finished + } catch { + lastError = error.localizedDescription + statusMessage = "Formatter download failed" + state = .failed + try? FileManager.default.removeItem(at: stagingDirectory) + throw error + } + + try? FileManager.default.removeItem(at: stagingDirectory) + } + + func reset() { + activeDownload?.session.invalidateAndCancel() + activeDownload = nil + state = .idle + statusMessage = "" + currentFileName = "" + currentFileProgress = 0 + overallProgress = 0 + completedFiles = 0 + lastError = nil + } + + private func downloadAsset( + _ asset: DownloadAsset, + to destinationURL: URL, + completedIndex: Int, + totalCount: Int + ) async throws -> URL { + try await withCheckedThrowingContinuation { continuation in + let delegate = AssetDownloadDelegate( + destinationURL: destinationURL, + progressHandler: { [weak self] written, expected in + guard let self else { return } + Task { @MainActor in + let progress: Double + if expected > 0 { + progress = Double(written) / Double(expected) + } else { + progress = 0 + } + self.currentFileProgress = progress + self.overallProgress = (Double(completedIndex) + progress) / Double(totalCount) + } + }, + completionHandler: { [weak self] result in + guard let self else { return } + Task { @MainActor in + 
self.activeDownload?.session.finishTasksAndInvalidate() + self.activeDownload = nil + continuation.resume(with: result) + } + } + ) + + let config = URLSessionConfiguration.default + config.timeoutIntervalForRequest = 300 + config.timeoutIntervalForResource = 60 * 60 + let session = URLSession(configuration: config, delegate: delegate, delegateQueue: nil) + activeDownload = ActiveDownload(session: session, delegate: delegate) + + downloadLog.info("Downloading \(asset.fileName) to \(destinationURL.path(percentEncoded: false))") + let task = session.downloadTask(with: asset.url) + task.resume() + } + } + + private func activateDownloads(from stagingDirectory: URL, to destinationDirectory: URL, assets: [DownloadAsset]) throws { + try FileManager.default.createDirectory(at: destinationDirectory, withIntermediateDirectories: true) + + for asset in assets { + let stagedURL = stagingDirectory.appendingPathComponent(asset.fileName) + let finalURL = destinationDirectory.appendingPathComponent(asset.fileName) + + if FileManager.default.fileExists(atPath: finalURL.path(percentEncoded: false)) { + _ = try FileManager.default.replaceItemAt( + finalURL, + withItemAt: stagedURL, + backupItemName: nil, + options: .usingNewMetadataOnly + ) + } else { + try FileManager.default.moveItem(at: stagedURL, to: finalURL) + } + } + } + + private func validateStagedAssets(in stagingDirectory: URL, assets: [DownloadAsset]) throws { + for asset in assets { + let stagedURL = stagingDirectory.appendingPathComponent(asset.fileName) + let attributes = try FileManager.default.attributesOfItem(atPath: stagedURL.path(percentEncoded: false)) + let size = (attributes[.size] as? NSNumber)?.int64Value ?? 0 + + guard size >= asset.minimumBytes else { + throw RunnerError.downloadFailed( + file: asset.fileName, + description: "Downloaded file is unexpectedly small (\(size) bytes)." 
+ ) + } + if let manifestAsset = manifest?.assets[asset.fileName] { + guard size == manifestAsset.bytes else { + throw RunnerError.downloadFailed( + file: asset.fileName, + description: "Downloaded file size \(size) does not match expected size \(manifestAsset.bytes)." + ) + } + let digest = try sha256Hex(for: stagedURL) + guard digest == manifestAsset.sha256.lowercased() else { + throw RunnerError.downloadFailed( + file: asset.fileName, + description: "Downloaded file checksum does not match expected SHA256." + ) + } + } + + let handle = try FileHandle(forReadingFrom: stagedURL) + let headerData = try handle.read(upToCount: 256) ?? Data() + try? handle.close() + + if let headerText = String(data: headerData, encoding: .utf8)? + .trimmingCharacters(in: .whitespacesAndNewlines) + .lowercased(), + headerText.contains(" String { + let handle = try FileHandle(forReadingFrom: url) + defer { try? handle.close() } + + var hasher = SHA256() + while autoreleasepool(invoking: { + let data = handle.readData(ofLength: 8 * 1024 * 1024) + guard !data.isEmpty else { return false } + hasher.update(data: data) + return true + }) {} + + return hasher.finalize().map { String(format: "%02x", $0) }.joined() + } +} diff --git a/ExecuWhisper/ExecuWhisper/Services/ParakeetHelperProtocol.swift b/ExecuWhisper/ExecuWhisper/Services/ParakeetHelperProtocol.swift new file mode 100644 index 0000000000..4aae4f0d72 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Services/ParakeetHelperProtocol.swift @@ -0,0 +1,153 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
import Foundation

/// Message types for the Parakeet helper's JSON protocol.
///
/// Every message carries a `type` discriminator string and a protocol
/// `version`. Swift property names are mapped to snake_case JSON keys through
/// each type's `CodingKeys`.
enum ParakeetHelperProtocol {
    /// Protocol revision written into every message built by this side.
    static let version = 1

    /// Sample encodings that can be named in an `AudioDescriptor`.
    enum AudioEncoding: String, Codable, Sendable, Equatable {
        case float32LittleEndian = "f32le"
    }

    /// Describes the audio payload that accompanies a transcribe request.
    struct AudioDescriptor: Codable, Sendable, Equatable {
        let encoding: AudioEncoding
        let sampleRate: Int
        let channelCount: Int
        let payloadByteCount: Int

        enum CodingKeys: String, CodingKey {
            case encoding
            case sampleRate = "sample_rate"
            case channelCount = "channel_count"
            case payloadByteCount = "payload_byte_count"
        }
    }

    /// Request to transcribe the described audio; `type` is always `"transcribe"`.
    struct TranscribeRequest: Codable, Sendable, Equatable {
        let type: String
        let version: Int
        let requestID: String
        let audio: AudioDescriptor
        let enableRuntimeProfile: Bool

        enum CodingKeys: String, CodingKey {
            case type, version, audio
            case requestID = "request_id"
            case enableRuntimeProfile = "enable_runtime_profile"
        }

        /// Builds a request, filling in the fixed discriminator and the
        /// current protocol version.
        init(requestID: String, audio: AudioDescriptor, enableRuntimeProfile: Bool) {
            type = "transcribe"
            version = ParakeetHelperProtocol.version
            self.requestID = requestID
            self.audio = audio
            self.enableRuntimeProfile = enableRuntimeProfile
        }
    }

    /// Payload of a `"ready"` message; the initializer stamps the current version.
    struct ReadyMessage: Codable, Sendable, Equatable {
        let type: String
        let version: Int

        init() {
            type = "ready"
            version = ParakeetHelperProtocol.version
        }
    }

    /// Payload of a `"status"` message; `requestID` may be absent.
    struct StatusMessage: Codable, Sendable, Equatable {
        let type: String
        let version: Int
        let requestID: String?
        let phase: String
        let message: String

        enum CodingKeys: String, CodingKey {
            case type, version, phase, message
            case requestID = "request_id"
        }
    }

    /// Payload of a `"result"` message; `runtimeProfile` is optional.
    struct ResultMessage: Codable, Sendable, Equatable {
        let type: String
        let version: Int
        let requestID: String
        let text: String
        let stdout: String
        let stderr: String
        let runtimeProfile: String?

        enum CodingKeys: String, CodingKey {
            case type, version, text, stdout, stderr
            case requestID = "request_id"
            case runtimeProfile = "runtime_profile"
        }
    }

    /// Payload of an `"error"` message; `requestID` and `details` are optional.
    struct ErrorMessage: Codable, Sendable, Equatable {
        let type: String
        let version: Int
        let requestID: String?
        let message: String
        let details: String?

        enum CodingKeys: String, CodingKey {
            case type, version, message, details
            case requestID = "request_id"
        }
    }

    /// Any message the helper can send, selected by the JSON `type` field.
    enum HelperMessage: Decodable, Sendable, Equatable {
        case ready(ReadyMessage)
        case status(StatusMessage)
        case result(ResultMessage)
        case error(ErrorMessage)

        /// Only the discriminator is read here; the payload types decode
        /// their own fields from the same decoder.
        private enum DiscriminatorKey: String, CodingKey {
            case type
        }

        /// Reads `type`, then decodes the whole object as the matching payload.
        /// - Throws: `DecodingError.dataCorrupted` for an unrecognized `type`,
        ///   or whatever the payload decoder throws.
        init(from decoder: Decoder) throws {
            let envelope = try decoder.container(keyedBy: DiscriminatorKey.self)
            let kind = try envelope.decode(String.self, forKey: .type)

            if kind == "ready" {
                self = .ready(try ReadyMessage(from: decoder))
            } else if kind == "status" {
                self = .status(try StatusMessage(from: decoder))
            } else if kind == "result" {
                self = .result(try ResultMessage(from: decoder))
            } else if kind == "error" {
                self = .error(try ErrorMessage(from: decoder))
            } else {
                throw DecodingError.dataCorruptedError(
                    forKey: .type,
                    in: envelope,
                    debugDescription: "Unknown helper message type: \(kind)"
                )
            }
        }
    }
}
import ApplicationServices
import AppKit
import Carbon.HIToolbox
import Foundation
import os

private let pasteLog = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "PasteController")

/// Posts a Cmd+V keystroke to a target process, either directly (when this app
/// is Accessibility-trusted) or through a helper app bundle that is copied out
/// of the main bundle into Application Support and launched as a subprocess.
enum PasteController {
    /// Outcome of a `paste(targetPID:)` attempt.
    enum PasteResult: Equatable {
        /// The keystroke was posted by this process.
        case pastedWithAppPermission
        /// The keystroke was posted by the installed helper (helper exit status 0).
        case pastedWithStableHelper
        /// The helper exited with `accessibilityRequiredExitCode`.
        case accessibilityRequired
        /// Anything else; the payload is a human-readable reason.
        case failed(String)
    }

    private static let helperName = "execuwhisper_paste_helper"
    private static let helperAppName = "ExecuWhisper Paste Helper.app"
    /// Compared against the installed `.version` file; a mismatch triggers a reinstall.
    private static let helperVersion = "3"
    /// Helper exit status that is mapped to `.accessibilityRequired`.
    private static let accessibilityRequiredExitCode: Int32 = 2
    static let helperIdentifier = "org.pytorch.executorch.ExecuWhisper.PasteHelper"

    /// Location of the installed helper bundle inside `<app support>/PasteHelper`.
    ///
    /// Reading this property also creates the `PasteHelper` directory
    /// (best effort; a creation failure is ignored here).
    static var stableHelperBundleURL: URL {
        let directory = PersistencePaths.appSupportDirectory
            .appendingPathComponent("PasteHelper", isDirectory: true)
        try? FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
        return directory.appendingPathComponent(helperAppName, isDirectory: true)
    }

    /// Path of the helper executable inside the installed bundle (`Contents/MacOS/<helperName>`).
    static var stableHelperExecutableURL: URL {
        stableHelperBundleURL
            .appendingPathComponent("Contents", isDirectory: true)
            .appendingPathComponent("MacOS", isDirectory: true)
            .appendingPathComponent(helperName)
    }

    /// Returns `true` when either this app or the installed helper reports
    /// Accessibility trust.
    ///
    /// - Parameter prompt: Forwarded to the app-level trust check and, if that
    ///   fails, to the helper as `--prompt`.
    static func checkAccessibility(prompt: Bool = false) -> Bool {
        if appAccessibilityTrusted(prompt: prompt) {
            return true
        }
        return stableHelperIsTrusted(prompt: prompt)
    }

    /// Starts the Accessibility permission flow when the app itself is not trusted.
    ///
    /// Installs the helper (if needed), launches it with `--request-access`,
    /// and then opens the Accessibility settings pane. An install failure is
    /// logged and the settings pane is still opened.
    static func promptForAccessibilityAccess() {
        if appAccessibilityTrusted(prompt: false) {
            return
        }
        do {
            let helperURLs = try installStableHelperIfNeeded()
            launchAccessRequestHelper(helperURLs.executableURL)
        } catch {
            pasteLog.error("Could not install paste helper for Accessibility request: \(error.localizedDescription, privacy: .public)")
        }
        openAccessibilitySettings()
    }

    /// Sends Cmd+V to `targetPID`, or to the session event tap when `targetPID` is `nil`.
    ///
    /// Uses this process when it is Accessibility-trusted; otherwise delegates
    /// to the helper with `--paste --prompt`.
    static func paste(targetPID: pid_t?) -> PasteResult {
        if appAccessibilityTrusted(prompt: false) {
            return postPasteShortcut(targetPID: targetPID)
                ? .pastedWithAppPermission
                : .failed("Could not create paste keyboard event.")
        }

        switch runStableHelper(arguments: pasteArguments(targetPID: targetPID, prompt: true)) {
        case .success:
            return .pastedWithStableHelper
        case .accessibilityRequired:
            return .accessibilityRequired
        case .failed(let message):
            return .failed(message)
        }
    }

    /// Wraps `AXIsProcessTrustedWithOptions` for the current process.
    private static func appAccessibilityTrusted(prompt: Bool) -> Bool {
        let options = [kAXTrustedCheckOptionPrompt.takeUnretainedValue(): prompt] as CFDictionary
        return AXIsProcessTrustedWithOptions(options)
    }

    /// Runs the helper with `--check` (plus `--prompt` when requested) and
    /// treats only exit status 0 as trusted.
    private static func stableHelperIsTrusted(prompt: Bool) -> Bool {
        switch runStableHelper(arguments: prompt ? ["--check", "--prompt"] : ["--check"]) {
        case .success:
            return true
        case .accessibilityRequired, .failed:
            return false
        }
    }

    /// Creates and posts a Cmd+V key-down/key-up pair.
    ///
    /// - Returns: `false` only when the events could not be created.
    private static func postPasteShortcut(targetPID: pid_t?) -> Bool {
        let source = CGEventSource(stateID: .hidSystemState)
        guard let keyDown = CGEvent(
            keyboardEventSource: source,
            virtualKey: CGKeyCode(kVK_ANSI_V),
            keyDown: true
        ), let keyUp = CGEvent(
            keyboardEventSource: source,
            virtualKey: CGKeyCode(kVK_ANSI_V),
            keyDown: false
        ) else {
            pasteLog.error("Failed to create Cmd+V events")
            return false
        }

        keyDown.flags = .maskCommand
        keyUp.flags = .maskCommand
        // 50 ms pause between key-down and key-up in both delivery paths.
        if let targetPID {
            keyDown.postToPid(targetPID)
            usleep(50_000)
            keyUp.postToPid(targetPID)
        } else {
            keyDown.post(tap: .cgSessionEventTap)
            usleep(50_000)
            keyUp.post(tap: .cgSessionEventTap)
        }
        return true
    }

    /// Builds the helper command line: `--paste [--prompt] [--pid <pid>]`.
    private static func pasteArguments(targetPID: pid_t?, prompt: Bool) -> [String] {
        var arguments = ["--paste"]
        if prompt {
            arguments.append("--prompt")
        }
        if let targetPID {
            arguments.append(contentsOf: ["--pid", String(targetPID)])
        }
        return arguments
    }

    /// Outcome of one helper invocation, derived from its exit status.
    private enum HelperResult {
        case success
        case accessibilityRequired
        case failed(String)
    }

    /// Installs the helper if needed, runs it synchronously with `arguments`,
    /// and maps its exit status to a `HelperResult`.
    ///
    /// Install and launch errors are returned as `.failed` with the error's
    /// localized description rather than thrown.
    private static func runStableHelper(arguments: [String]) -> HelperResult {
        do {
            let helperURLs = try installStableHelperIfNeeded()
            let process = Process()
            let stderrPipe = Pipe()
            process.executableURL = helperURLs.executableURL
            process.arguments = arguments
            process.standardError = stderrPipe
            try process.run()

            // Drain stderr to EOF *before* waiting for exit. Waiting first can
            // deadlock: once the pipe buffer fills, the helper blocks on write
            // and never exits, while this side never starts reading.
            let stderrData = stderrPipe.fileHandleForReading.readDataToEndOfFile()
            process.waitUntilExit()

            if process.terminationStatus == 0 {
                return .success
            }
            if process.terminationStatus == accessibilityRequiredExitCode {
                return .accessibilityRequired
            }
            let stderr = String(data: stderrData, encoding: .utf8) ?? ""
            return .failed(stderr.isEmpty ? "Paste helper exited with code \(process.terminationStatus)." : stderr)
        } catch {
            return .failed(error.localizedDescription)
        }
    }

    /// Resolved locations of the installed helper bundle and its executable.
    struct StableHelperURLs: Equatable {
        let bundleURL: URL
        let executableURL: URL
    }

    /// Ensures the helper bundle is installed at `stableHelperBundleURL`.
    ///
    /// The install is skipped when the `.version` file next to the bundle
    /// matches `helperVersion` and the executable is present and executable.
    /// Otherwise any existing bundle is removed, the copy shipped in the main
    /// bundle is copied into place, the executable is set to mode 0o755, and
    /// the version file is rewritten.
    ///
    /// - Throws: `CocoaError(.fileNoSuchFile)` when the helper is missing from
    ///   the main bundle, or any error from the file operations.
    static func installStableHelperIfNeeded() throws -> StableHelperURLs {
        let bundleURL = stableHelperBundleURL
        let executableURL = stableHelperExecutableURL
        let versionURL = bundleURL.deletingLastPathComponent().appendingPathComponent(".version")
        let installedVersion = try? String(contentsOf: versionURL, encoding: .utf8)
            .trimmingCharacters(in: .whitespacesAndNewlines)
        if installedVersion == helperVersion && FileManager.default.isExecutableFile(atPath: executableURL.path) {
            return StableHelperURLs(bundleURL: bundleURL, executableURL: executableURL)
        }

        guard let bundledURL = Bundle.main.url(forResource: helperAppName, withExtension: nil) else {
            throw CocoaError(.fileNoSuchFile)
        }

        if FileManager.default.fileExists(atPath: bundleURL.path) {
            try FileManager.default.removeItem(at: bundleURL)
        }
        try FileManager.default.copyItem(at: bundledURL, to: bundleURL)
        try FileManager.default.setAttributes([.posixPermissions: 0o755], ofItemAtPath: executableURL.path)
        try helperVersion.write(to: versionURL, atomically: true, encoding: .utf8)
        pasteLog.info("Installed stable paste helper at \(bundleURL.path, privacy: .public)")
        return StableHelperURLs(bundleURL: bundleURL, executableURL: executableURL)
    }

    /// Launches the helper with `--request-access` without waiting for it to exit.
    private static func launchAccessRequestHelper(_ helperURL: URL) {
        let process = Process()
        process.executableURL = helperURL
        process.arguments = ["--request-access"]
        do {
            try process.run()
            pasteLog.info("Launched paste helper Accessibility request pid=\(process.processIdentifier)")
        } catch {
            pasteLog.error("Could not launch paste helper Accessibility request: \(error.localizedDescription, privacy: .public)")
        }
    }

    /// Opens the Accessibility privacy pane, trying each known settings URL in
    /// order and stopping at the first one `NSWorkspace` opens successfully.
    private static func openAccessibilitySettings() {
        let urls = [
            "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility",
            "x-apple.systempreferences:com.apple.settings.PrivacySecurity.extension?Privacy_Accessibility",
        ]
        for value in urls {
            guard let url = URL(string: value), NSWorkspace.shared.open(url) else { continue }
            return
        }
    }
}
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

import Foundation

/// Observable, main-actor-bound list of text replacement rules that is
/// persisted as a JSON array at `fileURL`. Every mutation is written back
/// to disk immediately.
@MainActor @Observable
final class ReplacementStore {
    /// Current rules; new entries are inserted at the front.
    var entries: [ReplacementEntry] = []

    private let fileURL: URL

    /// Creates the store and loads its entries from `fileURL`.
    init(fileURL: URL = PersistencePaths.replacementsURL) {
        self.fileURL = fileURL
        load()
    }

    /// Inserts `entry` at the front of the list and persists.
    func add(_ entry: ReplacementEntry) {
        entries.insert(entry, at: 0)
        save()
    }

    /// Replaces the entry with the same `id` and persists; does nothing when
    /// no such entry exists.
    func update(_ entry: ReplacementEntry) {
        guard let index = entries.firstIndex(where: { $0.id == entry.id }) else { return }
        entries[index] = entry
        save()
    }

    /// Removes every entry whose `id` matches `entry.id` and persists.
    func delete(_ entry: ReplacementEntry) {
        entries.removeAll { $0.id == entry.id }
        save()
    }

    /// Flips `isEnabled` on the entry with the given `id` and persists; does
    /// nothing when no such entry exists.
    func toggleEnabled(for id: UUID) {
        guard let index = entries.firstIndex(where: { $0.id == id }) else { return }
        entries[index].isEnabled.toggle()
        save()
    }

    /// Populates `entries` from disk.
    ///
    /// - File cannot be read: the defaults are used and written out.
    /// - File decodes: its contents are used.
    /// - File is read but cannot be decoded: the defaults are used in memory
    ///   and the unreadable bytes are copied to a backup file first.
    private func load() {
        guard let data = try? Data(contentsOf: fileURL) else {
            entries = Self.defaultEntries
            if !entries.isEmpty {
                save()
            }
            return
        }

        if let decoded = try? JSONDecoder().decode([ReplacementEntry].self, from: data) {
            entries = decoded
            return
        }

        // Not saving here leaves the unreadable file in place for now, but the
        // next add/update/delete/toggle calls save() and overwrites it. Keep a
        // copy so a bad decode cannot wipe out the user's custom replacements.
        preserveUnreadableFile(data)
        entries = Self.defaultEntries
    }

    /// Copies `data` to `<fileURL>.unreadable` unless a backup already exists,
    /// so the earliest unreadable copy is the one that is kept. Best effort:
    /// a write failure is ignored, matching `save()`.
    private func preserveUnreadableFile(_ data: Data) {
        let backupURL = fileURL.appendingPathExtension("unreadable")
        guard !FileManager.default.fileExists(atPath: backupURL.path) else { return }
        try? data.write(to: backupURL, options: .atomic)
    }

    /// Encodes `entries` as JSON and writes them atomically to `fileURL`.
    /// Encoding and write failures are ignored (best effort).
    private func save() {
        guard let data = try? JSONEncoder().encode(entries) else { return }
        try? data.write(to: fileURL, options: .atomic)
    }

    /// Rules used when the file cannot be read or decoded.
    private static let defaultEntries: [ReplacementEntry] = [
        ReplacementEntry(trigger: "executorch", replacement: "ExecuTorch", requiresWordBoundary: false),
        ReplacementEntry(trigger: "pytorch", replacement: "PyTorch", requiresWordBoundary: false),
        ReplacementEntry(trigger: "parakeet", replacement: "Parakeet", requiresWordBoundary: false),
        ReplacementEntry(trigger: "hugging face", replacement: "Hugging Face"),
        ReplacementEntry(trigger: "macos", replacement: "macOS", requiresWordBoundary: false),
    ]
}
+ */ + +import Foundation +import os + +private let runnerLog = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "RunnerBridge") + +private final class DataAccumulator: @unchecked Sendable { + private let lock = NSLock() + private var data = Data() + + func append(_ chunk: Data) { + lock.lock() + data.append(chunk) + lock.unlock() + } + + func stringValue() -> String { + lock.lock() + defer { lock.unlock() } + return String(decoding: data, as: UTF8.self) + } + + func byteCount() -> Int { + lock.lock() + defer { lock.unlock() } + return data.count + } + + func preview(limit: Int = 400) -> String { + lock.lock() + defer { lock.unlock() } + let preview = String(decoding: data.prefix(limit), as: UTF8.self) + if data.count > limit { + return preview + "\n...[truncated \(data.count - limit) bytes]" + } + return preview + } +} + +private final class LineAccumulator: @unchecked Sendable { + private let lock = NSLock() + private var buffer = Data() + private let lineHandler: @Sendable (String) -> Void + + init(lineHandler: @escaping @Sendable (String) -> Void) { + self.lineHandler = lineHandler + } + + func append(_ chunk: Data) { + lock.lock() + buffer.append(chunk) + var readyLines: [Data] = [] + while let newlineIndex = buffer.firstIndex(of: 10) { + let line = buffer.prefix(upTo: newlineIndex) + readyLines.append(Data(line)) + buffer.removeSubrange(...newlineIndex) + } + lock.unlock() + + for lineData in readyLines { + let trimmed = String(decoding: lineData, as: UTF8.self) + .trimmingCharacters(in: .whitespacesAndNewlines) + if !trimmed.isEmpty { + lineHandler(trimmed) + } + } + } + + func flush() { + lock.lock() + let remainderData = buffer + buffer.removeAll() + lock.unlock() + + let remainder = String(decoding: remainderData, as: UTF8.self) + .trimmingCharacters(in: .whitespacesAndNewlines) + if !remainder.isEmpty { + lineHandler(remainder) + } + } +} + +private struct HelperLaunchConfiguration: Equatable, Sendable { + let runnerPath: String + let 
modelPath: String + let tokenizerPath: String +} + +protocol RunnerBridgeClient: Sendable { + func runtimeSnapshot() async -> RunnerBridge.RuntimeSnapshot + + func prepare( + runnerPath: String, + modelPath: String, + tokenizerPath: String + ) async throws + + func shutdown() async + + func transcribe( + runnerPath: String, + modelPath: String, + tokenizerPath: String, + audioPath: String, + options: RunnerBridge.RunOptions + ) async -> AsyncThrowingStream + + func transcribePCM( + runnerPath: String, + modelPath: String, + tokenizerPath: String, + pcmData: Data, + options: RunnerBridge.RunOptions + ) async -> AsyncThrowingStream +} + +actor RunnerBridge { + enum Event: Sendable { + case status(String) + case completed(TranscriptionResult) + } + + enum ResidencyState: Sendable, Equatable { + case unloaded + case loading + case warm + case failed + } + + struct RuntimeSnapshot: Sendable, Equatable { + let state: ResidencyState + let runnerPath: String? + let modelPath: String? + let tokenizerPath: String? + } + + struct RunOptions: Sendable, Equatable { + var enableRuntimeProfile: Bool = false + + static func fromEnvironment(_ environment: [String: String]) -> Self { + Self(enableRuntimeProfile: environment["EXECUWHISPER_ENABLE_RUNTIME_PROFILE"] == "1") + } + } + + struct TranscriptionResult: Sendable, Equatable { + let text: String + let stdout: String + let stderr: String + let stats: String? + let runtimeProfile: String? + } + + private struct PendingRequest { + let requestID: String + let continuation: AsyncThrowingStream.Continuation + } + + private var process: Process? + private var stdinHandle: FileHandle? + private var stderrAccumulator = DataAccumulator() + private var activeConfiguration: HelperLaunchConfiguration? + private var activeTraceID: String? + private var runtimeState: ResidencyState = .unloaded + private var lastError: RunnerError? + private var pendingRequest: PendingRequest? 
+ + func health() -> RuntimeSnapshot { + RuntimeSnapshot( + state: runtimeState, + runnerPath: activeConfiguration?.runnerPath, + modelPath: activeConfiguration?.modelPath, + tokenizerPath: activeConfiguration?.tokenizerPath + ) + } + + func runtimeSnapshot() -> RuntimeSnapshot { + health() + } + + func prepare( + runnerPath: String, + modelPath: String, + tokenizerPath: String + ) async throws { + let configuration = HelperLaunchConfiguration( + runnerPath: runnerPath, + modelPath: modelPath, + tokenizerPath: tokenizerPath + ) + + if activeConfiguration != configuration { + await shutdown() + } + + if process?.isRunning == true, runtimeState == .warm, activeConfiguration == configuration { + return + } + + if process?.isRunning != true || activeConfiguration != configuration { + try launchHelper(configuration) + } + + try await waitForWarmRuntime(expected: configuration) + } + + func shutdown() async { + finishPendingRequest( + throwing: RunnerError.transcriptionFailed(description: "Transcription was cancelled.") + ) + + if let stdinHandle { + if let payload = try? RunnerBridge.encodeJSONLine(["type": "shutdown", "version": ParakeetHelperProtocol.version]) { + try? stdinHandle.write(contentsOf: payload) + } + try? 
stdinHandle.close() + } + + if let process, process.isRunning { + process.terminate() + } + + process = nil + stdinHandle = nil + stderrAccumulator = DataAccumulator() + activeConfiguration = nil + activeTraceID = nil + runtimeState = .unloaded + lastError = nil + } + + func transcribe( + runnerPath: String, + modelPath: String, + tokenizerPath: String, + audioPath: String, + options: RunOptions = .init() + ) async -> AsyncThrowingStream { + do { + let pcmData = try Self.loadPCMFloat32MonoWAV(from: URL(fileURLWithPath: audioPath)) + return await transcribePCM( + runnerPath: runnerPath, + modelPath: modelPath, + tokenizerPath: tokenizerPath, + pcmData: pcmData, + options: options + ) + } catch { + return AsyncThrowingStream { continuation in + continuation.finish(throwing: error) + } + } + } + + func transcribePCM( + runnerPath: String, + modelPath: String, + tokenizerPath: String, + pcmData: Data, + options: RunOptions = .init() + ) async -> AsyncThrowingStream { + let configuration = HelperLaunchConfiguration( + runnerPath: runnerPath, + modelPath: modelPath, + tokenizerPath: tokenizerPath + ) + + let shouldWarm = activeConfiguration != configuration || runtimeState != .warm || process?.isRunning != true + return AsyncThrowingStream { continuation in + Task { + do { + if shouldWarm { + continuation.yield(.status("Warming model...")) + } + try await self.prepare( + runnerPath: runnerPath, + modelPath: modelPath, + tokenizerPath: tokenizerPath + ) + try await self.sendTranscriptionRequest( + configuration: configuration, + pcmData: pcmData, + options: options, + continuation: continuation + ) + } catch { + continuation.finish(throwing: error) + } + } + } + } + + private func launchHelper(_ configuration: HelperLaunchConfiguration) throws { + let traceID = String(UUID().uuidString.prefix(8)) + let process = Process() + let stdinPipe = Pipe() + let stdoutPipe = Pipe() + let stderrPipe = Pipe() + stderrAccumulator = DataAccumulator() + activeConfiguration = 
configuration + activeTraceID = traceID + runtimeState = .loading + lastError = nil + + process.executableURL = URL(fileURLWithPath: configuration.runnerPath) + process.arguments = [ + "--model_path", configuration.modelPath, + "--tokenizer_path", configuration.tokenizerPath, + ] + process.currentDirectoryURL = URL(fileURLWithPath: configuration.modelPath).deletingLastPathComponent() + + var environment = ProcessInfo.processInfo.environment + let bundleResources = Bundle.main.resourcePath ?? "" + var dyldEntries: [String] = [] + if !bundleResources.isEmpty { + dyldEntries.append(bundleResources) + } + if let existing = environment["DYLD_LIBRARY_PATH"], !existing.isEmpty { + dyldEntries.append(contentsOf: existing.components(separatedBy: ":")) + } + let uniqueEntries = Array(NSOrderedSet(array: dyldEntries)).compactMap { $0 as? String } + environment["DYLD_LIBRARY_PATH"] = uniqueEntries.joined(separator: ":") + process.environment = environment + + process.standardInput = stdinPipe + process.standardOutput = stdoutPipe + process.standardError = stderrPipe + + let stdoutLines = LineAccumulator { line in + Task { await self.handleHelperLine(line, traceID: traceID) } + } + + process.terminationHandler = { process in + Task { + await self.handleTermination( + exitCode: process.terminationStatus, + reason: String(describing: process.terminationReason), + traceID: traceID + ) + } + } + + do { + try process.run() + runnerLog.info( + "RunnerBridge[\(traceID, privacy: .public)] launched helper runnerPath=\(configuration.runnerPath, privacy: .public) modelPath=\(configuration.modelPath, privacy: .public) tokenizerPath=\(configuration.tokenizerPath, privacy: .public) pid=\(process.processIdentifier)" + ) + } catch { + runtimeState = .failed + lastError = .launchFailed(description: error.localizedDescription) + throw lastError! 
+ } + + self.process = process + self.stdinHandle = stdinPipe.fileHandleForWriting + + DispatchQueue.global(qos: .userInitiated).async { + let handle = stdoutPipe.fileHandleForReading + while true { + let data = handle.availableData + if data.isEmpty { + break + } + stdoutLines.append(data) + } + stdoutLines.flush() + } + + let stderrAccumulator = self.stderrAccumulator + DispatchQueue.global(qos: .utility).async { + let handle = stderrPipe.fileHandleForReading + while true { + let data = handle.availableData + if data.isEmpty { + break + } + stderrAccumulator.append(data) + } + } + } + + private func waitForWarmRuntime(expected configuration: HelperLaunchConfiguration) async throws { + let deadline = Date().addingTimeInterval(30) + while Date() < deadline { + if activeConfiguration != configuration { + throw RunnerError.launchFailed(description: "Parakeet helper configuration changed during warmup.") + } + + switch runtimeState { + case .warm: + return + case .failed: + throw lastError ?? RunnerError.launchFailed(description: "Parakeet helper failed to warm.") + case .unloaded: + throw lastError ?? 
RunnerError.launchFailed(description: "Parakeet helper exited before becoming ready.") + case .loading: + try await Task.sleep(for: .milliseconds(50)) + } + } + throw RunnerError.launchFailed(description: "Timed out waiting for the Parakeet helper to become ready.") + } + + private func sendTranscriptionRequest( + configuration: HelperLaunchConfiguration, + pcmData: Data, + options: RunOptions, + continuation: AsyncThrowingStream.Continuation + ) async throws { + guard activeConfiguration == configuration, runtimeState == .warm else { + throw RunnerError.launchFailed(description: "Parakeet helper is not warm.") + } + guard pendingRequest == nil else { + throw RunnerError.transcriptionFailed(description: "Parakeet helper is already processing another request.") + } + guard let stdinHandle else { + throw RunnerError.launchFailed(description: "Parakeet helper stdin is unavailable.") + } + + let requestID = UUID().uuidString + pendingRequest = PendingRequest(requestID: requestID, continuation: continuation) + + let header = ParakeetHelperProtocol.TranscribeRequest( + requestID: requestID, + audio: .init( + encoding: .float32LittleEndian, + sampleRate: 16_000, + channelCount: 1, + payloadByteCount: pcmData.count + ), + enableRuntimeProfile: options.enableRuntimeProfile + ) + + do { + let headerData = try JSONEncoder().encode(header) + Data("\n".utf8) + try stdinHandle.write(contentsOf: headerData) + try stdinHandle.write(contentsOf: pcmData) + } catch { + finishPendingRequest(throwing: RunnerError.launchFailed(description: error.localizedDescription)) + throw RunnerError.launchFailed(description: error.localizedDescription) + } + } + + private func handleHelperLine(_ line: String, traceID: String) async { + guard traceID == activeTraceID else { return } + guard let data = line.data(using: .utf8) else { + runnerLog.error("RunnerBridge[\(traceID, privacy: .public)] helper emitted non-utf8 line") + return + } + + do { + let message = try 
JSONDecoder().decode(ParakeetHelperProtocol.HelperMessage.self, from: data) + switch message { + case .ready: + runtimeState = .warm + lastError = nil + runnerLog.info("RunnerBridge[\(traceID, privacy: .public)] helper reported ready") + case .status(let status): + if status.requestID == pendingRequest?.requestID { + pendingRequest?.continuation.yield(.status(status.message)) + } + case .result(let result): + guard result.requestID == pendingRequest?.requestID else { return } + let transcription = TranscriptionResult( + text: result.text, + stdout: result.stdout, + stderr: result.stderr, + stats: RunnerBridge.statsLine(from: result.stdout), + runtimeProfile: result.runtimeProfile + ) + pendingRequest?.continuation.yield(.completed(transcription)) + pendingRequest?.continuation.finish() + pendingRequest = nil + case .error(let errorMessage): + let description = errorMessage.details ?? errorMessage.message + if errorMessage.requestID == pendingRequest?.requestID { + finishPendingRequest(throwing: RunnerError.transcriptionFailed(description: description)) + } else { + runtimeState = .failed + lastError = .launchFailed(description: description) + } + } + } catch { + runnerLog.error( + "RunnerBridge[\(traceID, privacy: .public)] failed to parse helper message: \(error.localizedDescription, privacy: .public)\n\(line, privacy: .public)" + ) + } + } + + private func handleTermination(exitCode: Int32, reason: String, traceID: String) async { + guard traceID == activeTraceID else { return } + let stderr = stderrAccumulator.stringValue() + runnerLog.info( + "RunnerBridge[\(traceID, privacy: .public)] helper terminated exitCode=\(exitCode) reason=\(reason, privacy: .public)" + ) + + let error = RunnerError.runnerCrashed( + exitCode: exitCode, + stderr: stderr.isEmpty ? "Parakeet helper terminated unexpectedly." : stderr + ) + + if pendingRequest != nil { + finishPendingRequest(throwing: error) + } + + if activeConfiguration != nil { + runtimeState = exitCode == 0 ? 
.unloaded : .failed + lastError = exitCode == 0 ? nil : error + } else { + runtimeState = .unloaded + lastError = nil + } + + process = nil + stdinHandle = nil + activeTraceID = nil + } + + private func finishPendingRequest(throwing error: Error) { + pendingRequest?.continuation.finish(throwing: error) + pendingRequest = nil + } + + private static func statsLine(from stdout: String) -> String? { + stdout + .components(separatedBy: .newlines) + .first(where: { $0.contains("PyTorchObserver") }) + .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } + } + + private static func loadPCMFloat32MonoWAV(from url: URL) throws -> Data { + let data = try Data(contentsOf: url) + guard data.count > 44 else { + throw RunnerError.transcriptionFailed(description: "Recorded WAV file is too small.") + } + + func readUInt16(at offset: Int) -> UInt16 { + data.withUnsafeBytes { bytes in + bytes.load(fromByteOffset: offset, as: UInt16.self).littleEndian + } + } + + func readUInt32(at offset: Int) -> UInt32 { + data.withUnsafeBytes { bytes in + bytes.load(fromByteOffset: offset, as: UInt32.self).littleEndian + } + } + + guard String(decoding: data.prefix(4), as: UTF8.self) == "RIFF", + String(decoding: data[8..<12], as: UTF8.self) == "WAVE" else { + throw RunnerError.transcriptionFailed(description: "Recorded file is not a WAV container.") + } + + let audioFormat = readUInt16(at: 20) + let channelCount = readUInt16(at: 22) + let bitsPerSample = readUInt16(at: 34) + guard audioFormat == 3, channelCount == 1, bitsPerSample == 32 else { + throw RunnerError.transcriptionFailed(description: "Expected float32 mono WAV audio.") + } + + var offset = 12 + while offset + 8 <= data.count { + let chunkID = String(decoding: data[offset..<(offset + 4)], as: UTF8.self) + let chunkSize = Int(readUInt32(at: offset + 4)) + let chunkStart = offset + 8 + let chunkEnd = chunkStart + chunkSize + guard chunkEnd <= data.count else { + throw RunnerError.transcriptionFailed(description: "Recorded WAV data 
chunk is truncated.") + } + if chunkID == "data" { + return data.subdata(in: chunkStart.. Data { + let data = try JSONSerialization.data(withJSONObject: dictionary) + return data + Data("\n".utf8) + } +} + +extension RunnerBridge: RunnerBridgeClient {} diff --git a/ExecuWhisper/ExecuWhisper/Services/TextPipeline.swift b/ExecuWhisper/ExecuWhisper/Services/TextPipeline.swift new file mode 100644 index 0000000000..4e6932f150 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Services/TextPipeline.swift @@ -0,0 +1,268 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import os + +private let pipelineLog = Logger(subsystem: "org.pytorch.executorch.ExecuWhisper", category: "TextPipeline") + +struct TextProcessingResult: Sendable, Equatable { + let rawText: String + let outputText: String + let tags: [String] + + var transformed: Bool { + rawText != outputText + } +} + +@MainActor +final class TextPipeline { + private static let safeFormatterInputWordBudget = 260 + private static let stopwords: Set = [ + "a", "an", "and", "are", "can", "do", "does", "for", "i", "in", "is", "it", "me", + "of", "on", "or", "so", "that", "the", "this", "to", "um", "uh", "we", "what", + "when", "where", "who", "why", "you" + ] + + enum Context: Sendable { + case standard + case dictation + } + + struct FormatterPaths: Sendable { + let runnerPath: String + let modelPath: String + let tokenizerPath: String + let tokenizerConfigPath: String + } + + private let replacementStore: ReplacementStore + private let formatterBridge: (any FormatterBridgeClient)? + private let formatterPathsProvider: @MainActor () -> FormatterPaths? + + init( + replacementStore: ReplacementStore, + formatterBridge: (any FormatterBridgeClient)? = nil, + formatterPathsProvider: @escaping @MainActor () -> FormatterPaths? 
= { nil } + ) { + self.replacementStore = replacementStore + self.formatterBridge = formatterBridge + self.formatterPathsProvider = formatterPathsProvider + } + + func process(_ text: String, context: Context = .standard) -> TextProcessingResult { + processReplacementsOnly(text) + } + + func process( + _ text: String, + context: Context = .standard, + smartFormattingEnabled: Bool + ) async -> TextProcessingResult { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { + return TextProcessingResult(rawText: text, outputText: "", tags: []) + } + + if DiagnosticLogging.shouldLogTranscriptsPublicly { + pipelineLog.info("Parakeet transcript: \(trimmed, privacy: .public)") + } else { + pipelineLog.info("Parakeet transcript: \(trimmed, privacy: .private)") + } + + guard smartFormattingEnabled else { + pipelineLog.info("Smart formatting disabled; using replacement-only path") + return processReplacementsOnly(trimmed) + } + + guard let formatterBridge, let formatterPaths = formatterPathsProvider() else { + pipelineLog.info("Formatter unavailable; falling back to replacement-only text") + return fallbackResult(for: trimmed) + } + + do { + let prompt = FormatterPromptBuilder.prompt(transcript: trimmed) + guard Self.shouldUseFormatter(prompt: prompt, transcript: trimmed) else { + pipelineLog.info("Formatter skipped because transcript exceeds context budget") + return fallbackResult(for: trimmed, extraTags: ["formatter-skipped-context"]) + } + let formatted = try await formatterBridge.format( + runnerPath: formatterPaths.runnerPath, + modelPath: formatterPaths.modelPath, + tokenizerPath: formatterPaths.tokenizerPath, + tokenizerConfigPath: formatterPaths.tokenizerConfigPath, + prompt: prompt, + maxNewTokens: FormatterPromptBuilder.maxNewTokens(for: trimmed), + temperature: FormatterPromptBuilder.temperature + ) + if DiagnosticLogging.shouldLogTranscriptsPublicly { + pipelineLog.info("LFM2.5 raw output: \(formatted.text, privacy: 
.public)") + } else { + pipelineLog.info("LFM2.5 raw output: \(formatted.text, privacy: .private)") + } + guard let validated = validateFormatterOutput( + formatted.text, + prompt: prompt, + transcript: trimmed + ) else { + pipelineLog.info("LFM2.5 output rejected by validator; falling back to transcript") + return fallbackResult(for: trimmed) + } + let replaced = applyReplacements(to: validated) + var tags = ["formatted"] + if replaced != validated { + tags.append("replacement") + } + if DiagnosticLogging.shouldLogTranscriptsPublicly { + pipelineLog.info("LFM2.5 final output: \(replaced, privacy: .public) tags=\(tags, privacy: .public)") + } else { + pipelineLog.info("LFM2.5 final output: \(replaced, privacy: .private) tags=\(tags, privacy: .public)") + } + return TextProcessingResult(rawText: trimmed, outputText: replaced, tags: tags) + } catch { + pipelineLog.error("Formatter error: \(error.localizedDescription, privacy: .public)") + return fallbackResult(for: trimmed) + } + } + + func processReplacementsOnly(_ text: String) -> TextProcessingResult { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { + return TextProcessingResult(rawText: text, outputText: "", tags: []) + } + + let replaced = applyReplacements(to: trimmed) + let styled = applyStyle(to: replaced) + let tags = styled == trimmed ? 
[] : ["replacement"] + return TextProcessingResult(rawText: trimmed, outputText: styled, tags: tags) + } + + private func fallbackResult(for trimmed: String, extraTags: [String] = ["formatter-fallback"]) -> TextProcessingResult { + let replacementOnly = processReplacementsOnly(trimmed) + return TextProcessingResult( + rawText: replacementOnly.rawText, + outputText: replacementOnly.outputText, + tags: replacementOnly.tags + extraTags + ) + } + + private func applyReplacements(to text: String) -> String { + replacementStore.entries + .filter(\.isEnabled) + .sorted { $0.trigger.count > $1.trigger.count } + .reduce(text) { partial, entry in + replace(entry: entry, in: partial) + } + } + + private func replace(entry: ReplacementEntry, in text: String) -> String { + guard !entry.trigger.isEmpty else { return text } + + let escaped = NSRegularExpression.escapedPattern(for: entry.trigger) + let pattern = entry.requiresWordBoundary ? #"\b\#(escaped)\b"# : escaped + let options: NSRegularExpression.Options = entry.isCaseSensitive ? [] : [.caseInsensitive] + + guard let regex = try? 
NSRegularExpression(pattern: pattern, options: options) else { + return text + } + + let matches = regex.matches(in: text, range: NSRange(text.startIndex..., in: text)) + guard !matches.isEmpty else { return text } + + var output = text + for match in matches.reversed() { + guard let matchRange = Range(match.range, in: output) else { continue } + let original = String(output[matchRange]) + let replacement = preserveCaseIfNeeded(original: original, replacement: entry.replacement) + output.replaceSubrange(matchRange, with: replacement) + } + return output + } + + private func preserveCaseIfNeeded(original: String, replacement: String) -> String { + if original == original.uppercased() { + return replacement.uppercased() + } + if original == original.lowercased() { + return replacement + } + if let first = original.first, String(first) == String(first).uppercased() { + return replacement.prefix(1).uppercased() + replacement.dropFirst() + } + return replacement + } + + private func applyStyle(to text: String) -> String { + text + } + + private func validateFormatterOutput(_ output: String, prompt: String, transcript: String) -> String? 
{ + let trimmed = output.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return nil } + guard trimmed != prompt else { return nil } + + let lowercased = trimmed.lowercased() + if lowercased.hasPrefix("here is") + || lowercased.hasPrefix("sure,") + || lowercased.hasPrefix("sure.") { + return nil + } + if lowercased.hasPrefix("mode:") { + return nil + } + if lowercased.hasPrefix("options:") + || lowercased.hasPrefix("examples:") + || lowercased.contains("does it feel like real-time processing?") + || lowercased.contains("what is the next step?") + || lowercased.contains("okay, so the plan is finish the build") { + return nil + } + if trimmed.contains("<|startoftext|>") + || trimmed.contains("<|im_start|>") + || trimmed.contains("Transcript:\n\"\"\"") { + return nil + } + if Self.isFormatterOutputSuspiciouslyShort(transcript: transcript, output: trimmed) { + return nil + } + if Self.hasNoMeaningfulTokenOverlap(transcript: transcript, output: trimmed) { + return nil + } + return trimmed + } + + static func shouldUseFormatter(prompt: String, transcript: String) -> Bool { + let promptWordCount = prompt.split(whereSeparator: \.isWhitespace).count + let transcriptWordCount = transcript.split(whereSeparator: \.isWhitespace).count + let expectedOutputWordCount = max(32, transcriptWordCount * 2) + return promptWordCount + expectedOutputWordCount <= safeFormatterInputWordBudget + } + + static func isFormatterOutputSuspiciouslyShort(transcript: String, output: String) -> Bool { + let inputWordCount = transcript.split(whereSeparator: \.isWhitespace).count + guard inputWordCount >= 3 else { return false } + let outputWordCount = output.split(whereSeparator: \.isWhitespace).count + let minimumExpected = max(2, Int((Double(inputWordCount) * 0.4).rounded(.up))) + return outputWordCount < minimumExpected + } + + static func hasNoMeaningfulTokenOverlap(transcript: String, output: String) -> Bool { + let inputTokens = meaningfulTokens(in: transcript) + 
/// Extracts the process identifier that follows the first `--pid` flag.
///
/// - Parameter arguments: Command-line arguments (without the executable name).
/// - Returns: The parsed identifier, or `nil` when the flag is absent, is the last
///   argument, or is followed by text that does not parse as an `Int32`.
private func targetPID(from arguments: [String]) -> pid_t? {
    guard let flagPosition = arguments.firstIndex(of: "--pid") else {
        return nil
    }

    let valuePosition = arguments.index(after: flagPosition)
    guard valuePosition < arguments.endIndex else {
        return nil
    }

    return Int32(arguments[valuePosition])
}
/// Polls the target application's focused accessibility element until it looks like
/// a text input, or until the timeout elapses.
///
/// Returns immediately when `targetPID` is `nil`. Otherwise the focused element of the
/// application is queried every 50 ms; the function returns as soon as its role is a
/// text field, text area, or combo box, and returns silently once `timeoutSeconds`
/// has passed. The caller cannot tell which of the two happened.
///
/// - Parameters:
///   - targetPID: Process whose focused element is inspected.
///   - timeoutSeconds: Maximum time to keep polling.
private func waitForLikelyTextTarget(targetPID: pid_t?, timeoutSeconds: TimeInterval = 1.0) {
    guard let targetPID else { return }
    let appElement = AXUIElementCreateApplication(targetPID)
    let deadline = Date().addingTimeInterval(timeoutSeconds)
    let textRoles: Set<String> = [
        kAXTextFieldRole as String,
        kAXTextAreaRole as String,
        kAXComboBoxRole as String,
    ]

    while Date() < deadline {
        var focused: CFTypeRef?
        let focusedStatus = AXUIElementCopyAttributeValue(
            appElement,
            kAXFocusedUIElementAttribute as CFString,
            &focused
        )
        // The attribute value arrives as an untyped CFTypeRef. Confirm it really is an
        // AXUIElement before force-casting, so an unexpected value type skips this
        // poll instead of trapping the helper.
        if focusedStatus == .success,
           let focused,
           CFGetTypeID(focused) == AXUIElementGetTypeID() {
            let focusedElement = focused as! AXUIElement
            var role: CFTypeRef?
            let roleStatus = AXUIElementCopyAttributeValue(
                focusedElement,
                kAXRoleAttribute as CFString,
                &role
            )
            if roleStatus == .success,
               let roleString = role as? String,
               textRoles.contains(roleString) {
                return
            }
        }
        usleep(50_000)
    }
}
/// Namespace for diagnostic switches and process measurements.
enum DiagnosticLogging {
    /// `UserDefaults` key read by `shouldLogTranscriptsPublicly`.
    static let transcriptDebugKey = "EXECUWHISPER_DEBUG_LOG_TRANSCRIPTS"

    /// `true` when the boolean stored under `transcriptDebugKey` in the standard
    /// user defaults is set.
    static var shouldLogTranscriptsPublicly: Bool {
        UserDefaults.standard.bool(forKey: transcriptDebugKey)
    }

    /// Resident memory size of the current task as reported by
    /// `task_info(MACH_TASK_BASIC_INFO)`.
    ///
    /// - Returns: `resident_size` from the kernel, or `nil` when `task_info` does not
    ///   return `KERN_SUCCESS`.
    static func residentMemoryBytes() -> UInt64? {
        var info = mach_task_basic_info()
        // task_info expects the buffer length in units of integer_t, so derive it from
        // the two type sizes instead of hard-coding the divisor.
        var count = mach_msg_type_number_t(
            MemoryLayout<mach_task_basic_info>.size / MemoryLayout<integer_t>.size
        )
        let result = withUnsafeMutablePointer(to: &info) {
            $0.withMemoryRebound(to: integer_t.self, capacity: Int(count)) {
                task_info(mach_task_self_, task_flavor_t(MACH_TASK_BASIC_INFO), $0, &count)
            }
        }
        guard result == KERN_SUCCESS else { return nil }
        return UInt64(info.resident_size)
    }
}
/// Failures surfaced by the app: runner, model, permission, download, hotkey,
/// transcription and export errors.
enum RunnerError: Error, Sendable {
    case binaryNotFound(path: String)
    case modelMissing(file: String)
    case runtimeLibraryMissing(path: String)
    case microphonePermissionDenied
    case accessibilityPermissionDenied
    case microphoneNotAvailable
    case invalidRunnerOutput(stdout: String)
    case downloadFailed(file: String, description: String)
    case hotKeyRegistrationFailed(description: String)
    case runnerCrashed(exitCode: Int32, stderr: String)
    case transcriptionFailed(description: String)
    case exportFailed(description: String)
    case launchFailed(description: String)
    case dictationNotActive
}

// MARK: - LocalizedError

extension RunnerError: LocalizedError {
    /// Message for each case; `dictationNotActive` deliberately has none.
    var errorDescription: String? {
        switch self {
        case let .binaryNotFound(path):
            "Runner binary not found at \(path)"
        case let .modelMissing(file):
            "Required model file is missing: \(file)"
        case let .runtimeLibraryMissing(path):
            "Required runtime library is missing at \(path)"
        case .microphonePermissionDenied:
            "Microphone access denied. Enable it in System Settings -> Privacy & Security -> Microphone, then relaunch ExecuWhisper."
        case .accessibilityPermissionDenied:
            "Accessibility access is required to auto-paste dictated text. Enable ExecuWhisper Paste Helper or ExecuWhisper in System Settings -> Privacy & Security -> Accessibility."
        case .microphoneNotAvailable:
            "No audio input is available. Connect or enable a microphone and try again."
        case let .invalidRunnerOutput(stdout):
            "Parakeet runner finished without returning a transcript.\n\n\(stdout)"
        case let .downloadFailed(file, description):
            "Failed to download \(file): \(description)"
        case let .hotKeyRegistrationFailed(description):
            "Global hotkey registration failed: \(description)"
        case let .runnerCrashed(exitCode, stderr):
            "Parakeet runner exited with code \(exitCode).\n\n\(stderr)"
        case let .transcriptionFailed(description):
            "Transcription failed: \(description)"
        case let .exportFailed(description):
            "Export failed: \(description)"
        case let .launchFailed(description):
            "Failed to launch the runner: \(description)"
        case .dictationNotActive:
            nil
        }
    }
}

// MARK: - Stickiness

extension RunnerError {
    /// `true` for the permission and missing-file cases listed in the first branch;
    /// `false` for every other case.
    var isStickyUserActionError: Bool {
        switch self {
        case .accessibilityPermissionDenied,
             .microphonePermissionDenied,
             .binaryNotFound,
             .modelMissing,
             .runtimeLibraryMissing:
            true
        case .microphoneNotAvailable,
             .invalidRunnerOutput,
             .downloadFailed,
             .hotKeyRegistrationFailed,
             .runnerCrashed,
             .transcriptionFailed,
             .exportFailed,
             .launchFailed,
             .dictationNotActive:
            false
        }
    }
}
/// File formats a session can be exported to.
enum SessionExportFormat: String, CaseIterable, Sendable {
    case text
    case json
    case srt

    /// Human-readable name of the format.
    var title: String {
        switch self {
        case .text:
            return "Plain Text"
        case .json:
            return "JSON"
        case .srt:
            return "SubRip (.srt)"
        }
    }

    /// File extension without the leading dot.
    var fileExtension: String {
        switch self {
        case .text:
            return "txt"
        case .json:
            return "json"
        case .srt:
            return "srt"
        }
    }

    /// Uniform type for the format; `.srt` falls back to plain text when the system
    /// has no type registered for that extension.
    var contentType: UTType {
        switch self {
        case .text:
            return .plainText
        case .json:
            return .json
        case .srt:
            return UTType(filenameExtension: "srt") ?? .plainText
        }
    }

    /// Renders `session` in this format.
    ///
    /// - `.text`: the transcript as-is.
    /// - `.json`: pretty-printed, key-sorted JSON with ISO 8601 dates; `{}` if encoding fails.
    /// - `.srt`: a single cue from `00:00:00,000` to the session duration (at least one
    ///   second), with blank lines removed from the transcript.
    func render(_ session: Session) -> String {
        switch self {
        case .text:
            return session.transcript
        case .json:
            let payload = ExportPayload(
                title: session.displayTitle,
                date: session.date,
                transcript: session.transcript,
                rawTranscript: session.rawTranscript,
                duration: session.duration,
                tags: session.tags
            )
            let encoder = JSONEncoder()
            encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
            encoder.dateEncodingStrategy = .iso8601
            let data = (try? encoder.encode(payload)) ?? Data("{}".utf8)
            return String(decoding: data, as: UTF8.self)
        case .srt:
            // A non-finite duration would trap in the Int conversion below; treat it
            // like a missing duration and use the one-second minimum.
            let end = session.duration.isFinite ? max(session.duration, 1) : 1
            return """
            1
            00:00:00,000 --> \(srtTimestamp(end))
            \(srtCueText(session.transcript))
            """
        }
    }

    /// Removes empty and whitespace-only lines: in SubRip a blank line terminates the
    /// cue, so a transcript with paragraph breaks would otherwise be cut short.
    private func srtCueText(_ transcript: String) -> String {
        transcript
            .split(whereSeparator: \.isNewline)
            .filter { !$0.allSatisfy(\.isWhitespace) }
            .joined(separator: "\n")
    }

    /// Formats a duration in seconds as `HH:MM:SS,mmm`.
    private func srtTimestamp(_ interval: TimeInterval) -> String {
        // Negative or non-finite input cannot be a valid timestamp; clamp to zero.
        let safeInterval = interval.isFinite ? max(interval, 0) : 0
        let totalMilliseconds = Int((safeInterval * 1000).rounded())
        let hours = totalMilliseconds / 3_600_000
        let minutes = (totalMilliseconds / 60_000) % 60
        let seconds = (totalMilliseconds / 1_000) % 60
        let milliseconds = totalMilliseconds % 1_000
        return String(format: "%02d:%02d:%02d,%03d", hours, minutes, seconds, milliseconds)
    }

    /// JSON shape written by the `.json` export.
    private struct ExportPayload: Codable {
        let title: String
        let date: Date
        let transcript: String
        let rawTranscript: String?
        let duration: TimeInterval
        let tags: [String]
    }
}
/// Names the history bucket for `date` relative to `referenceDate`.
///
/// Both dates are truncated to the start of their day in `calendar`. A whole-day
/// difference of 0 is "Today", 1 is "Yesterday", and anything else (including
/// negative differences) is "Earlier". A missing day component counts as 0.
private static func bucketTitle(for date: Date, referenceDate: Date, calendar: Calendar) -> String {
    let elapsed = calendar.dateComponents(
        [.day],
        from: calendar.startOfDay(for: date),
        to: calendar.startOfDay(for: referenceDate)
    )

    guard let daysAgo = elapsed.day, daysAgo != 0 else {
        return "Today"
    }
    if daysAgo == 1 {
        return "Yesterday"
    }
    return "Earlier"
}
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +struct AudioLevelView: View { + let level: Float + let barCount: Int + + @State private var barHeights: [CGFloat] + + init(level: Float, barCount: Int = 24) { + self.level = level + self.barCount = barCount + _barHeights = State(initialValue: Array(repeating: 0.08, count: barCount)) + } + + var body: some View { + HStack(spacing: 2) { + ForEach(0.. Color { + if height > 0.7 { return .orange } + if height > 0.15 { return .accentColor } + return .secondary.opacity(0.4) + } +} diff --git a/ExecuWhisper/ExecuWhisper/Views/ContentView.swift b/ExecuWhisper/ExecuWhisper/Views/ContentView.swift new file mode 100644 index 0000000000..a2291b1b04 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Views/ContentView.swift @@ -0,0 +1,172 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import SwiftUI + +struct ContentView: View { + @Environment(TranscriptStore.self) private var store + @Environment(DictationManager.self) private var dictationManager + @State private var columnVisibility: NavigationSplitViewVisibility = .doubleColumn + @State private var activePage: SidebarPage = .home + @State private var selectedSidebarPages: Set = [.home] + @State private var isAudioDropTargeted = false + + var body: some View { + NavigationSplitView(columnVisibility: $columnVisibility) { + SidebarView(selectedPages: $selectedSidebarPages) + .navigationSplitViewColumnWidth(min: 180, ideal: 220, max: 320) + } detail: { + detailContent + .frame(maxWidth: .infinity, maxHeight: .infinity) + } + .navigationSplitViewStyle(.balanced) + .toolbar { RecordingControls() } + .overlay(alignment: .top) { + if store.currentError != nil { + ErrorBannerView() + .transition(.move(edge: .top).combined(with: .opacity)) + } + } + .animation(.easeInOut(duration: 0.25), value: store.currentError != nil) + .onChange(of: selectedSidebarPages) { _, newSelection in + let selectedSessionIDs = Set(newSelection.compactMap { page -> UUID? 
in + if case .session(let id) = page { + return id + } + return nil + }) + store.selectedHistorySessionIDs = selectedSessionIDs + + guard newSelection.count == 1, let selectedPage = newSelection.first else { + if case .session(let id) = activePage, selectedSessionIDs.contains(id) { + store.selectedSessionID = id + } else { + store.selectedSessionID = nil + if newSelection.isEmpty { + activePage = .home + } + } + return + } + + activePage = selectedPage + if case .session(let id) = selectedPage { + store.selectedSessionID = id + } else { + store.selectedSessionID = nil + } + } + .onChange(of: store.selectedSessionID) { _, newID in + if let newID { + let page = SidebarPage.session(newID) + activePage = page + selectedSidebarPages = [page] + } + } + .onChange(of: store.sessions) { _, newSessions in + let existingIDs = Set(newSessions.map(\.id)) + let filteredSelection = Set(selectedSidebarPages.compactMap { page -> SidebarPage? in + if case .session(let id) = page, !existingIDs.contains(id) { + return nil + } + return page + }) + let resolvedSelection = filteredSelection.isEmpty ? 
Set([SidebarPage.home]) : filteredSelection + if resolvedSelection != selectedSidebarPages { + selectedSidebarPages = resolvedSelection + } + } + .task { + await store.initialize() + dictationManager.registerHotKey() + if let selectedSessionID = store.selectedSessionID { + let page = SidebarPage.session(selectedSessionID) + activePage = page + selectedSidebarPages = [page] + } else { + activePage = .home + selectedSidebarPages = [.home] + } + } + } + + @ViewBuilder + private var detailContent: some View { + if store.hasActiveSession { + TranscriptView( + text: store.liveTranscript, + isLive: true, + isRecording: store.isRecording, + isTranscribing: store.isTranscribing, + audioLevel: store.audioLevel, + statusMessage: store.statusMessage + ) + } else { + switch activePage { + case .replacements: + ReplacementManagementView() + .padding() + .navigationTitle("Replacements") + case .settings: + SettingsView() + .padding() + .navigationTitle("Settings") + case .session(let id): + if let session = store.sessions.first(where: { $0.id == id }) { + TranscriptView( + text: session.transcript, + isLive: false + ) + .navigationTitle(session.displayTitle) + } else { + homeContent + } + case .home: + homeContent + } + } + } + + @ViewBuilder + private var homeContent: some View { + Group { + if !store.resourcesReady { + SetupGuideView() + } else { + WelcomeView(isDropTargeted: acceptsDroppedAudioFiles && isAudioDropTargeted) + } + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + .contentShape(Rectangle()) + .dropDestination( + for: URL.self, + action: handleDroppedAudioFiles(_:_:), + isTargeted: { isTargeted in + isAudioDropTargeted = isTargeted + } + ) + } + + private var acceptsDroppedAudioFiles: Bool { + activePage == .home && !store.hasActiveSession + } + + private func handleDroppedAudioFiles(_ urls: [URL], _: CGPoint) -> Bool { + guard acceptsDroppedAudioFiles else { return false } + + guard let url = ImportedAudioDecoder.importableAudioFile(from: urls) else { + 
/// Floating card shown while dictating: an audio level meter above a hint that names
/// the hotkey which finishes dictation.
struct DictationOverlayView: View {
    @Environment(TranscriptStore.self) private var store
    @Environment(DictationManager.self) private var dictationManager

    /// Rounded outline shared by the material background and the hairline border.
    private var cardShape: RoundedRectangle {
        RoundedRectangle(cornerRadius: 14)
    }

    /// Live level meter driven by the store's current audio level.
    private var levelMeter: some View {
        AudioLevelView(level: store.audioLevel, barCount: 20)
            .frame(height: 36)
    }

    /// Caption telling the user how to finish dictation.
    private var finishHint: some View {
        Text("\(dictationManager.hotKeyDisplayText) to finish")
            .font(.caption2)
            .foregroundStyle(.tertiary)
    }

    var body: some View {
        VStack(spacing: 10) {
            levelMeter
            finishHint
        }
        .padding(16)
        .frame(width: 300)
        .background(.ultraThinMaterial, in: cardShape)
        .overlay(
            cardShape
                .strokeBorder(.quaternary, lineWidth: 0.5)
        )
    }
}
/// Borderless, non-activating floating panel that hosts a SwiftUI view.
final class DictationPanel: NSPanel {
    /// Creates a 320×140 panel and installs `contentView` in a hosting view that
    /// fills the panel's content rectangle.
    init(contentView: some View) {
        let initialRect = NSRect(x: 0, y: 0, width: 320, height: 140)
        let mask: NSWindow.StyleMask = [.nonactivatingPanel, .fullSizeContentView, .borderless]
        super.init(contentRect: initialRect, styleMask: mask, backing: .buffered, defer: true)

        // Window chrome: transparent, shadowed, draggable by its background.
        isOpaque = false
        backgroundColor = .clear
        hasShadow = true
        isMovableByWindowBackground = true

        // Window-server behaviour: float above normal windows on every Space,
        // including alongside full-screen apps.
        level = .floating
        collectionBehavior = [.canJoinAllSpaces, .fullScreenAuxiliary]
        animationBehavior = .utilityWindow

        let hostingView = NSHostingView(rootView: contentView)
        hostingView.frame = contentRect(forFrameRect: frame)
        self.contentView = hostingView
    }

    /// Places the panel horizontally centred and 100 points above the vertical centre
    /// of the screen's visible frame, then orders it front without activating the app.
    /// Does nothing when neither `preferredScreen` nor a main screen is available.
    func showCentered(on preferredScreen: NSScreen? = nil) {
        guard let targetScreen = preferredScreen ?? NSScreen.main else { return }
        let visible = targetScreen.visibleFrame
        let origin = NSPoint(
            x: visible.midX - frame.width / 2,
            y: visible.midY - frame.height / 2 + 100
        )
        setFrameOrigin(origin)
        orderFrontRegardless()
    }

    /// Removes the panel from the screen.
    func dismiss() {
        orderOut(nil)
    }
}
/// Red banner that displays `store.currentError` with a dismiss button.
///
/// Errors whose `isStickyUserActionError` is `false` are cleared automatically five
/// seconds after they appear; sticky errors stay until dismissed.
struct ErrorBannerView: View {
    @Environment(TranscriptStore.self) private var store

    var body: some View {
        if let error = store.currentError {
            HStack(spacing: 12) {
                Image(systemName: "exclamationmark.triangle.fill")
                    .foregroundStyle(.white)
                Text(error.localizedDescription)
                    .font(.callout)
                    .foregroundStyle(.white)
                    .lineLimit(3)
                Spacer()
                Button {
                    store.clearError()
                } label: {
                    Image(systemName: "xmark")
                        .foregroundStyle(.white.opacity(0.8))
                }
                .buttonStyle(.plain)
            }
            .padding(.horizontal, 16)
            .padding(.vertical, 10)
            .background(.red.gradient, in: RoundedRectangle(cornerRadius: 8))
            .padding(.horizontal, 16)
            .padding(.top, 8)
            // The task restarts whenever the displayed message changes.
            .task(id: error.localizedDescription) {
                guard !error.isStickyUserActionError else { return }
                let shownDescription = error.localizedDescription
                do {
                    try await Task.sleep(for: .seconds(5))
                } catch {
                    // Cancelled: the banner was dismissed or replaced. Stop here so a
                    // stale task cannot clear an error raised after it was cancelled.
                    return
                }
                if store.currentError?.localizedDescription == shownDescription {
                    store.clearError()
                }
            }
        }
    }
}
/// Primary toolbar group: optional bulk delete, helper preload indicator, the record
/// button, and export/copy actions for the transcript currently in view.
struct RecordingControls: ToolbarContent {
    @Environment(TranscriptStore.self) private var store
    @Environment(ModelDownloader.self) private var downloader
    // NOTE(review): `preferences` is not read anywhere in this struct — confirm whether
    // it is still needed.
    @Environment(Preferences.self) private var preferences

    var body: some ToolbarContent {
        ToolbarItem(placement: .primaryAction) {
            HStack(spacing: 6) {
                if shouldShowBulkDeleteButton {
                    deleteSelectedButton
                }
                if store.isHelperLoading {
                    preloadIndicator
                }
                recordButton
                // Export needs a saved session; copy only needs non-empty text.
                if let session = currentSession {
                    exportButton(for: session)
                }
                if !currentTranscript.isEmpty {
                    copyButton
                }
            }
        }
    }

    /// Spinner plus "Preloading..." capsule; the tooltip shows the store's helper
    /// status message when it is non-empty.
    private var preloadIndicator: some View {
        HStack(spacing: 6) {
            ProgressView()
                .controlSize(.small)
            Text("Preloading...")
                .font(.caption)
                .foregroundStyle(.secondary)
        }
        .padding(.horizontal, 8)
        .padding(.vertical, 4)
        .background(.background.secondary, in: Capsule())
        .help(store.helperStatusMessage.isEmpty ? "Preloading model" : store.helperStatusMessage)
        .accessibilityElement(children: .combine)
        .accessibilityLabel("Preloading model")
    }

    /// Starts recording, or stops and transcribes when a recording is in progress.
    /// The label follows `store.sessionState`.
    private var recordButton: some View {
        Button {
            Task {
                if store.isRecording {
                    await store.stopRecordingAndTranscribe()
                } else {
                    await store.startRecording()
                }
            }
        } label: {
            switch store.sessionState {
            case .idle:
                Label("Record", systemImage: "mic.fill")
            case .recording:
                Label("Stop and Transcribe", systemImage: "stop.circle.fill")
                    .foregroundStyle(.orange)
            case .transcribing:
                ProgressView()
                    .controlSize(.small)
            }
        }
        .keyboardShortcut("R", modifiers: [.command, .shift])
        // Disabled while transcribing or downloading, and while the model is not ready —
        // except that an in-progress recording can always be stopped.
        .disabled(store.isTranscribing || downloader.isDownloading || (!store.isModelReady && !store.isRecording))
        .help(recordButtonHelp)
    }

    /// Replaces the general pasteboard contents with `currentTranscript`.
    private var copyButton: some View {
        Button {
            NSPasteboard.general.clearContents()
            NSPasteboard.general.setString(currentTranscript, forType: .string)
        } label: {
            Label("Copy Transcript", systemImage: "doc.on.doc")
        }
        .keyboardShortcut("C", modifiers: [.command, .shift])
        .help("Copy the selected transcript")
    }

    /// Menu with one entry per `SessionExportFormat` that exports `session` through the store.
    private func exportButton(for session: Session) -> some View {
        Menu {
            ForEach(SessionExportFormat.allCases, id: \.rawValue) { format in
                Button(format.title) {
                    store.exportSession(session, format: format)
                }
            }
        } label: {
            Label("Export", systemImage: "square.and.arrow.down")
        }
        .help("Export the selected transcript")
    }

    /// Deletes every session in `store.selectedHistorySessionIDs`.
    private var deleteSelectedButton: some View {
        Button(role: .destructive) {
            store.deleteSessions(ids: store.selectedHistorySessionIDs)
        } label: {
            Label("Delete Selected", systemImage: "trash")
        }
        .help("Delete \(store.selectedHistorySessionIDs.count) selected history items")
    }

    /// The saved session matching `store.selectedSessionID`, or `nil` while a live
    /// session is active or nothing is selected.
    private var currentSession: Session? {
        guard !store.hasActiveSession, let id = store.selectedSessionID else { return nil }
        return store.sessions.first(where: { $0.id == id })
    }

    /// Live transcript during an active session, otherwise the selected session's
    /// transcript (empty when there is none).
    private var currentTranscript: String {
        if store.hasActiveSession {
            return store.liveTranscript
        }
        return currentSession?.transcript ?? ""
    }

    /// Tooltip for the record button in each session state.
    private var recordButtonHelp: String {
        switch store.sessionState {
        case .idle:
            return "Start recording (Cmd-Shift-R)"
        case .recording:
            return "Stop recording and transcribe (Cmd-Shift-R)"
        case .transcribing:
            return "Transcribing..."
        }
    }

    /// Bulk delete is offered only with more than one history item selected and no
    /// active session.
    private var shouldShowBulkDeleteButton: Bool {
        !store.hasActiveSession && store.selectedHistorySessionIDs.count > 1
    }
}
/// Sheet payload: the entry being edited and whether it already exists in the store.
/// Each value gets a fresh `id`, so assigning a new item always presents a new sheet.
private struct ReplacementEditorItem: Identifiable {
    let id = UUID()
    let entry: ReplacementEntry
    let isEditing: Bool
}

/// Searchable list of replacement entries with add, edit, enable/disable and delete.
struct ReplacementManagementView: View {
    @Environment(ReplacementStore.self) private var replacementStore
    @State private var searchText = ""
    @State private var editorItem: ReplacementEditorItem?

    var body: some View {
        VStack(alignment: .leading, spacing: 12) {
            HStack {
                TextField("Search replacements", text: $searchText)
                    .textFieldStyle(.roundedBorder)
                Button("Add") {
                    editorItem = ReplacementEditorItem(entry: ReplacementEntry(), isEditing: false)
                }
                .buttonStyle(.borderedProminent)
            }

            if filteredEntries.isEmpty {
                ContentUnavailableView(
                    "No Replacements",
                    systemImage: "arrow.2.squarepath",
                    description: Text("Add names, acronyms, and product terms you want corrected automatically.")
                )
            } else {
                List {
                    ForEach(filteredEntries) { entry in
                        HStack(alignment: .top, spacing: 12) {
                            Toggle("", isOn: binding(for: entry.id))
                                .labelsHidden()
                            VStack(alignment: .leading, spacing: 4) {
                                Text(entry.replacement)
                                    .font(.headline)
                                Text("Trigger: \(entry.trigger)")
                                    .font(.caption)
                                    .foregroundStyle(.secondary)
                                if !entry.notes.isEmpty {
                                    Text(entry.notes)
                                        .font(.caption)
                                        .foregroundStyle(.tertiary)
                                }
                            }
                            Spacer()
                            Button("Edit") {
                                editorItem = ReplacementEditorItem(entry: entry, isEditing: true)
                            }
                            .buttonStyle(.borderless)
                        }
                        .contextMenu {
                            Button("Edit") {
                                editorItem = ReplacementEditorItem(entry: entry, isEditing: true)
                            }
                            Button(entry.isEnabled ? "Disable" : "Enable") {
                                replacementStore.toggleEnabled(for: entry.id)
                            }
                            Divider()
                            Button("Delete", role: .destructive) {
                                replacementStore.delete(entry)
                            }
                        }
                    }
                }
                .listStyle(.inset)
            }
        }
        .sheet(item: $editorItem) { item in
            ReplacementEntryEditor(entry: item.entry, isEditing: item.isEditing) { entry in
                // Existing entries are updated in place; new ones are appended.
                if item.isEditing {
                    replacementStore.update(entry)
                } else {
                    replacementStore.add(entry)
                }
                editorItem = nil
            } onCancel: {
                editorItem = nil
            }
            .frame(width: 420)
        }
    }

    /// All entries when the search field is empty; otherwise entries whose trigger,
    /// replacement, or notes contain the search text (case-insensitive).
    private var filteredEntries: [ReplacementEntry] {
        guard !searchText.isEmpty else { return replacementStore.entries }
        return replacementStore.entries.filter {
            $0.trigger.localizedCaseInsensitiveContains(searchText) ||
                $0.replacement.localizedCaseInsensitiveContains(searchText) ||
                $0.notes.localizedCaseInsensitiveContains(searchText)
        }
    }

    /// Binding to the enabled flag of the entry with the given `id`.
    ///
    /// Reads `false` when the entry no longer exists. Writes go through
    /// `toggleEnabled(for:)`, but only when the requested value differs from the stored
    /// one, so a redundant write of the current value cannot flip the flag.
    private func binding(for id: UUID) -> Binding<Bool> {
        Binding(
            get: {
                replacementStore.entries.first(where: { $0.id == id })?.isEnabled ?? false
            },
            set: { newValue in
                guard let current = replacementStore.entries.first(where: { $0.id == id })?.isEnabled,
                      current != newValue
                else { return }
                replacementStore.toggleEnabled(for: id)
            }
        )
    }
}
"Edit Replacement" : "Add Replacement") + .font(.headline) + + TextField("Trigger phrase", text: $entry.trigger) + .textFieldStyle(.roundedBorder) + TextField("Replacement", text: $entry.replacement) + .textFieldStyle(.roundedBorder) + TextField("Notes (optional)", text: $entry.notes) + .textFieldStyle(.roundedBorder) + + Toggle("Case sensitive", isOn: $entry.isCaseSensitive) + Toggle("Require word boundary", isOn: $entry.requiresWordBoundary) + Toggle("Enabled", isOn: $entry.isEnabled) + + HStack { + Spacer() + Button("Cancel", role: .cancel) { + onCancel() + } + Button("Save") { + onSave(entry) + } + .keyboardShortcut(.defaultAction) + .disabled( + entry.trigger.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty || + entry.replacement.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + ) + } + } + .padding(24) + } +} diff --git a/ExecuWhisper/ExecuWhisper/Views/SettingsView.swift b/ExecuWhisper/ExecuWhisper/Views/SettingsView.swift new file mode 100644 index 0000000000..fdd16f185f --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Views/SettingsView.swift @@ -0,0 +1,228 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import SwiftUI + +struct SettingsView: View { + @Environment(Preferences.self) private var preferences + @Environment(DictationManager.self) private var dictationManager + var usesFixedWindowSize = false + + var body: some View { + @Bindable var prefs = preferences + let availableMicrophones = AudioRecorder.availableInputDevices() + let resolvedMicrophone = AudioRecorder.resolvePreferredMicrophone( + selectedMicrophoneID: prefs.selectedMicrophoneID, + availableDevices: availableMicrophones + ) + + Form { + Section("Helper") { + LabeledContent("Binary path") { + HStack { + TextField("Path to parakeet_helper", text: $prefs.runnerPath) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.runnerPath) + } + } + } + + Section("Model Files") { + LabeledContent("Model directory") { + HStack { + TextField("Path to downloaded model files", text: $prefs.modelDirectory) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.modelDirectory, directory: true) + } + } + + LabeledContent("Files") { + VStack(alignment: .leading, spacing: 4) { + fileStatus("model.pte", path: prefs.modelPath) + fileStatus("tokenizer.model", path: prefs.tokenizerPath) + } + } + } + + Section("Smart Formatting") { + Toggle("Smart formatting with LFM2.5", isOn: $prefs.enableSmartFormatting) + + Text("Uses one smart prompt for final paste-ready text: clean up dictation, preserve meaning, and use bullets only when the transcript clearly reads as a list.") + .font(.caption) + .foregroundStyle(.secondary) + + LabeledContent("Formatter helper") { + HStack { + TextField("Path to lfm25_formatter_helper", text: $prefs.formatterRunnerPath) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.formatterRunnerPath) + } + } + + LabeledContent("Formatter model directory") { + HStack { + TextField("Path to LFM2.5 formatter files", text: $prefs.formatterModelDirectory) + .textFieldStyle(.roundedBorder) + browseButton(for: $prefs.formatterModelDirectory, directory: true) + } + } + + 
LabeledContent("Formatter files") { + VStack(alignment: .leading, spacing: 4) { + fileStatus("lfm2_5_350m_mlx_4w.pte", path: prefs.formatterModelPath) + fileStatus("tokenizer.json", path: prefs.formatterTokenizerPath) + fileStatus("tokenizer_config.json", path: prefs.formatterTokenizerConfigPath) + } + } + + Text("If formatter assets are unavailable, ExecuWhisper falls back to replacement-only transcript text.") + .font(.caption) + .foregroundStyle(.secondary) + } + + Section("Audio Input") { + LabeledContent("Microphone") { + Picker("Microphone", selection: $prefs.selectedMicrophoneID) { + Text("Auto (System Default)").tag("") + ForEach(availableMicrophones) { microphone in + Text(microphone.displayName).tag(microphone.id) + } + } + .labelsHidden() + .frame(width: 260) + .disabled(availableMicrophones.isEmpty) + } + + LabeledContent("Status") { + Group { + if availableMicrophones.isEmpty { + Text("No audio inputs detected") + .foregroundStyle(.orange) + } else if prefs.selectedMicrophoneID.isEmpty { + Text("Using \(resolvedMicrophone?.device.displayName ?? 
"system default microphone")") + .foregroundStyle(.secondary) + } else if let resolvedMicrophone, resolvedMicrophone.usedFallback { + Text("Saved mic unavailable; using \(resolvedMicrophone.device.displayName)") + .foregroundStyle(.orange) + } else if let resolvedMicrophone { + Text("Using \(resolvedMicrophone.device.name)") + .foregroundStyle(.secondary) + } + } + } + + Text("Applies to manual recording and system dictation.") + .font(.caption) + .foregroundStyle(.secondary) + } + + Section("System Dictation") { + Toggle("Enable dictation hotkey", isOn: $prefs.enableGlobalHotkey) + + LabeledContent("Shortcut") { + ShortcutRecorderView(shortcut: $prefs.dictationShortcut) { + dictationManager.refreshHotKeyRegistration() + } + } + + LabeledContent("Hotkey status") { + VStack(alignment: .trailing, spacing: 4) { + Text(dictationManager.hotKeyStatusText) + .foregroundStyle(dictationManager.hotKeyRegistrationError == nil ? Color.secondary : Color.orange) + if dictationManager.hotKeyRegistrationError != nil { + Text("Common conflicts: macOS input source switcher uses Ctrl-Space; Spotlight uses Cmd-Space.") + .font(.caption2) + .foregroundStyle(.secondary) + .multilineTextAlignment(.trailing) + } + } + } + + LabeledContent("Accessibility") { + VStack(alignment: .trailing, spacing: 6) { + Text(DictationManager.checkAccessibility() ? "Enabled" : "Required for auto-paste") + .foregroundStyle(DictationManager.checkAccessibility() ? 
Color.secondary : Color.orange) + Text("Grant \(PasteController.helperIdentifier)") + .font(.caption2) + .foregroundStyle(.tertiary) + if !DictationManager.checkAccessibility() { + Button("Request Access") { + dictationManager.promptForAccessibilityAccess() + } + .controlSize(.small) + } + } + } + + LabeledContent("Silence threshold") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.silenceThreshold, in: 0.005...0.1, step: 0.005) + .frame(width: 200) + Text(String(format: "%.3f RMS", prefs.silenceThreshold)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + + LabeledContent("Auto-stop delay") { + VStack(alignment: .trailing, spacing: 4) { + Slider(value: $prefs.silenceTimeout, in: 0.5...5.0, step: 0.25) + .frame(width: 200) + Text(String(format: "%.2fs after silence", prefs.silenceTimeout)) + .font(.caption) + .foregroundStyle(.secondary) + .monospacedDigit() + } + } + } + + Section("Defaults") { + Text("ExecuWhisper downloads the Hugging Face model into Application Support on first launch.") + .font(.caption) + .foregroundStyle(.secondary) + Text(preferences.downloadedModelDirectoryURL.path(percentEncoded: false)) + .font(.system(.caption, design: .monospaced)) + .textSelection(.enabled) + } + } + .formStyle(.grouped) + .padding() + .frame( + width: usesFixedWindowSize ? 640 : nil, + height: usesFixedWindowSize ? 
680 : nil + ) + .onChange(of: prefs.enableGlobalHotkey) { _, _ in + dictationManager.refreshHotKeyRegistration() + } + } + + private func browseButton(for binding: Binding, directory: Bool = false) -> some View { + Button("Browse...") { + let panel = NSOpenPanel() + panel.canChooseFiles = !directory + panel.canChooseDirectories = directory + panel.allowsMultipleSelection = false + if panel.runModal() == .OK, let url = panel.url { + binding.wrappedValue = url.path(percentEncoded: false) + } + } + .controlSize(.small) + } + + private func fileStatus(_ name: String, path: String) -> some View { + let exists = FileManager.default.fileExists(atPath: path) + return HStack(spacing: 4) { + Image(systemName: exists ? "checkmark.circle.fill" : "xmark.circle.fill") + .foregroundStyle(exists ? .green : .red) + .font(.caption) + Text(name) + .font(.caption) + } + } +} diff --git a/ExecuWhisper/ExecuWhisper/Views/SetupGuideView.swift b/ExecuWhisper/ExecuWhisper/Views/SetupGuideView.swift new file mode 100644 index 0000000000..6a3a9e4f09 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Views/SetupGuideView.swift @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import SwiftUI + +struct SetupGuideView: View { + @Environment(TranscriptStore.self) private var store + @Environment(Preferences.self) private var preferences + @Environment(ModelDownloader.self) private var downloader + + var body: some View { + VStack(spacing: 24) { + Image(systemName: "arrow.down.circle") + .font(.system(size: 48)) + .foregroundStyle(Color.accentColor) + + Text("Preparing ExecuWhisper") + .font(.title2.bold()) + + if let result = store.healthResult { + VStack(alignment: .leading, spacing: 12) { + checkRow("Helper binary", ok: result.runnerAvailable) + checkRow("model.pte", ok: result.modelAvailable) + checkRow("tokenizer.model", ok: result.tokenizerAvailable) + } + .padding() + .background(.background.secondary, in: RoundedRectangle(cornerRadius: 8)) + } + + if downloader.isDownloading { + VStack(spacing: 10) { + ProgressView(value: max(downloader.overallProgress, 0.02)) + Text(downloader.statusMessage.isEmpty ? "Downloading model..." : downloader.statusMessage) + .font(.callout) + if !downloader.currentFileName.isEmpty { + Text(downloader.currentFileName) + .font(.caption.monospaced()) + .foregroundStyle(.secondary) + } + } + .padding() + .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10)) + } else if store.healthResult?.shouldOfferModelDownload == true { + Text("The app downloads the Parakeet ASR model and LFM2.5 formatter artifacts into Application Support the first time it launches.") + .font(.callout) + .foregroundStyle(.secondary) + .multilineTextAlignment(.center) + + Button("Retry Download") { + Task { await store.downloadModel() } + } + .buttonStyle(.borderedProminent) + } else { + Text("The Parakeet helper is missing or invalid. 
Build it locally or choose a valid binary path in Settings before retrying.") + .font(.callout) + .foregroundStyle(.secondary) + .multilineTextAlignment(.center) + } + + developerInstructions + + Button("Recheck") { + Task { await store.runHealthCheck() } + } + .buttonStyle(.bordered) + } + .padding(40) + .frame(maxWidth: 560) + } + + private func checkRow(_ label: String, ok: Bool) -> some View { + HStack { + Image(systemName: ok ? "checkmark.circle.fill" : "xmark.circle.fill") + .foregroundStyle(ok ? .green : .red) + Text(label) + Spacer() + } + } + + private var developerInstructions: some View { + VStack(alignment: .leading, spacing: 8) { + Text("Developer notes") + .font(.headline) + + Text(""" + cd ~/executorch + gh pr checkout https://github.com/pytorch/executorch/pull/18861 + conda activate et-metal + make parakeet-metal + conda activate et-mlx + make lfm_2_5_formatter-mlx + """) + .font(.system(.caption, design: .monospaced)) + .textSelection(.enabled) + .padding(8) + .background(.background.tertiary, in: RoundedRectangle(cornerRadius: 4)) + + Text("Helper path:") + .font(.caption) + .foregroundStyle(.secondary) + Text(preferences.runnerPath) + .font(.system(.caption, design: .monospaced)) + .textSelection(.enabled) + + Text("Download location:") + .font(.caption) + .foregroundStyle(.secondary) + Text(preferences.modelDirectory) + .font(.system(.caption, design: .monospaced)) + .textSelection(.enabled) + + Text("Formatter location:") + .font(.caption) + .foregroundStyle(.secondary) + Text(preferences.formatterModelDirectory) + .font(.system(.caption, design: .monospaced)) + .textSelection(.enabled) + } + .frame(maxWidth: .infinity, alignment: .leading) + } +} diff --git a/ExecuWhisper/ExecuWhisper/Views/ShortcutRecorderView.swift b/ExecuWhisper/ExecuWhisper/Views/ShortcutRecorderView.swift new file mode 100644 index 0000000000..2d9c879bed --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Views/ShortcutRecorderView.swift @@ -0,0 +1,82 @@ +/* + * Copyright (c) 
Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

import AppKit
import Carbon.HIToolbox
import SwiftUI

/// Control for capturing a new dictation shortcut.
///
/// Clicking the main button starts or stops capture. While capturing, a local
/// key-down monitor consumes every key press: Esc cancels, a press that
/// `DictationShortcut(event:)` accepts is stored, anything else beeps.
struct ShortcutRecorderView: View {
    @Binding var shortcut: DictationShortcut
    /// Invoked after `shortcut` is changed by a capture or by Reset.
    let onChange: () -> Void

    @State private var isRecording = false
    @State private var keyMonitor: Any?

    var body: some View {
        HStack(spacing: 8) {
            Button(isRecording ? "Type shortcut" : shortcut.displayString, action: toggleRecording)
                .buttonStyle(.bordered)
                .font(.system(.body, design: .monospaced))
                .frame(minWidth: 140)

            Button("Reset") {
                shortcut = .controlSpace
                onChange()
            }
            .controlSize(.small)

            if isRecording {
                Text("Press Esc to cancel")
                    .font(.caption)
                    .foregroundStyle(.secondary)
            }
        }
        // Make sure the event monitor never outlives the view.
        .onDisappear(perform: stopRecording)
    }

    /// Starts capture when idle, stops it when already capturing.
    private func toggleRecording() {
        if isRecording {
            stopRecording()
        } else {
            beginRecording()
        }
    }

    /// Installs the local key-down monitor; no-op if capture is already active.
    private func beginRecording() {
        if isRecording { return }
        isRecording = true
        keyMonitor = NSEvent.addLocalMonitorForEvents(matching: .keyDown) { keyEvent in
            handleKeyDown(keyEvent)
            // Returning nil consumes the event so it is not dispatched further.
            return nil
        }
    }

    /// Leaves capture mode and removes the key monitor if one is installed.
    private func stopRecording() {
        isRecording = false
        guard let installedMonitor = keyMonitor else { return }
        NSEvent.removeMonitor(installedMonitor)
        keyMonitor = nil
    }

    /// Handles one key press during capture.
    private func handleKeyDown(_ event: NSEvent) {
        guard event.keyCode != UInt16(kVK_Escape) else {
            stopRecording()
            return
        }

        if let captured = DictationShortcut(event: event) {
            shortcut = captured
            onChange()
            stopRecording()
        } else {
            // Not usable as a shortcut: signal it and keep listening.
            NSSound.beep()
        }
    }
}
diff --git a/ExecuWhisper/ExecuWhisper/Views/SidebarView.swift b/ExecuWhisper/ExecuWhisper/Views/SidebarView.swift new file mode 100644 index 0000000000..a45776d59a --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Views/SidebarView.swift @@ -0,0 +1,222 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import SwiftUI + +enum SidebarPage: Hashable { + case home + case replacements + case settings + case session(UUID) +} + +struct SidebarView: View { + @Environment(TranscriptStore.self) private var store + @Binding var selectedPages: Set + @State private var searchText = "" + @State private var renamingSessionID: UUID? + @State private var renameText = "" + + var body: some View { + List(selection: $selectedPages) { + Section { + Label("Home", systemImage: "house") + .tag(SidebarPage.home) + Label("Replacements", systemImage: "arrow.2.squarepath") + .tag(SidebarPage.replacements) + Label("Settings", systemImage: "gear") + .tag(SidebarPage.settings) + } + + if store.hasActiveSession { + liveRow + } + + if !pinnedSessions.isEmpty { + Section("Pinned") { + ForEach(pinnedSessions) { session in + sessionRow(session) + .tag(SidebarPage.session(session.id)) + .contextMenu { sessionContextMenu(session) } + } + } + } + + ForEach(historySections) { section in + Section(section.title) { + ForEach(section.sessions) { session in + sessionRow(session) + .tag(SidebarPage.session(session.id)) + .contextMenu { sessionContextMenu(session) } + } + } + } + } + .listStyle(.sidebar) + .searchable(text: $searchText, placement: .sidebar, prompt: "Search history") + .onDeleteCommand { + deleteSelectedSessions() + } + .overlay { + if store.sessions.isEmpty && !store.hasActiveSession { + ContentUnavailableView( + "No History", + systemImage: "waveform", + description: Text("Record audio to create your first transcript") + ) + } + } + .sheet(item: renamingBinding) { session in + RenameSheet(title: renameText) { newTitle in + store.renameSession(session, to: newTitle) + renamingSessionID = nil + } onCancel: { + renamingSessionID = nil + } + } + } + + private var renamingBinding: Binding { + Binding( + get: { + guard let id = renamingSessionID else { 
return nil } + return store.sessions.first { $0.id == id } + }, + set: { _ in renamingSessionID = nil } + ) + } + + private var pinnedSessions: [Session] { + SessionHistory.pinnedSessions(in: store.sessions, matching: searchText) + } + + private var historySections: [SessionHistorySection] { + SessionHistory.sections(in: store.sessions, matching: searchText) + } + + private var liveRow: some View { + HStack { + if store.isRecording { + AudioLevelView(level: store.audioLevel, barCount: 6) + .frame(width: 24) + } else { + ProgressView() + .controlSize(.small) + .frame(width: 24) + } + + VStack(alignment: .leading) { + Text(store.isRecording ? "Recording..." : "Transcribing...") + .font(.headline) + Text(store.statusMessage) + .font(.caption) + .foregroundStyle(.secondary) + .lineLimit(1) + } + } + .listRowBackground(Color.accentColor.opacity(0.08)) + } + + private func sessionRow(_ session: Session) -> some View { + VStack(alignment: .leading, spacing: 6) { + HStack(spacing: 6) { + if session.pinned { + Image(systemName: "pin.fill") + .font(.caption2) + .foregroundStyle(.yellow) + } + Text(session.displayTitle) + .font(.headline) + .lineLimit(1) + } + Text(session.previewText.prefix(100).description) + .font(.caption) + .foregroundStyle(.secondary) + .lineLimit(2) + HStack(spacing: 6) { + Text(session.date, format: .dateTime.month(.abbreviated).day().hour().minute()) + Text("·") + Text(formattedDuration(session.duration)) + ForEach(session.tags.prefix(2), id: \.self) { tag in + Text(tag) + } + } + .font(.caption2) + .foregroundStyle(.tertiary) + } + .padding(.vertical, 2) + } + + @ViewBuilder + private func sessionContextMenu(_ session: Session) -> some View { + Button(session.pinned ? 
"Unpin" : "Pin") { + store.togglePinned(session) + } + Button("Rename...") { + renameText = session.title + renamingSessionID = session.id + } + Button("Copy Transcript") { + NSPasteboard.general.clearContents() + NSPasteboard.general.setString(session.transcript, forType: .string) + } + Menu("Export") { + ForEach(SessionExportFormat.allCases, id: \.rawValue) { format in + Button(format.title) { + store.exportSession(session, format: format) + } + } + } + Divider() + if store.selectedHistorySessionIDs.count > 1 && store.selectedHistorySessionIDs.contains(session.id) { + Button("Delete Selected (\(store.selectedHistorySessionIDs.count))", role: .destructive) { + deleteSelectedSessions() + } + Divider() + } + Button("Delete", role: .destructive) { + store.deleteSession(session) + } + } + + private func formattedDuration(_ duration: TimeInterval) -> String { + let minutes = Int(duration) / 60 + let seconds = Int(duration) % 60 + return String(format: "%d:%02d", minutes, seconds) + } + + private func deleteSelectedSessions() { + guard store.selectedHistorySessionIDs.count > 1 else { return } + store.deleteSessions(ids: store.selectedHistorySessionIDs) + } +} + +private struct RenameSheet: View { + @State var title: String + let onSave: (String) -> Void + let onCancel: () -> Void + + var body: some View { + VStack(spacing: 16) { + Text("Rename") + .font(.headline) + TextField("Title", text: $title) + .textFieldStyle(.roundedBorder) + .frame(minWidth: 250) + .onSubmit { onSave(title) } + HStack { + Button("Cancel", role: .cancel) { onCancel() } + .keyboardShortcut(.cancelAction) + Button("Save") { onSave(title) } + .keyboardShortcut(.defaultAction) + .disabled(title.trimmingCharacters(in: .whitespaces).isEmpty) + } + } + .padding(20) + } +} diff --git a/ExecuWhisper/ExecuWhisper/Views/TranscriptView.swift b/ExecuWhisper/ExecuWhisper/Views/TranscriptView.swift new file mode 100644 index 0000000000..d1f88eff4b --- /dev/null +++ 
b/ExecuWhisper/ExecuWhisper/Views/TranscriptView.swift @@ -0,0 +1,101 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

import SwiftUI

/// Scrollable transcript text that keeps itself scrolled to the bottom as the
/// text changes, with a placeholder and a floating status capsule for live
/// sessions.
struct TranscriptView: View {
    let text: String
    let isLive: Bool
    var isRecording: Bool = false
    var isTranscribing: Bool = false
    var audioLevel: Float = 0
    var statusMessage: String = ""

    /// The capsule only has content while recording or transcribing; showing
    /// it in any other live state would draw an empty pill.
    private var showsStatusIndicator: Bool {
        isLive && (isRecording || isTranscribing)
    }

    var body: some View {
        ScrollViewReader { proxy in
            ScrollView {
                VStack(alignment: .leading, spacing: 0) {
                    if text.isEmpty && isLive {
                        livePlaceholder
                    } else {
                        Text(text)
                            .font(.body)
                            .textSelection(.enabled)
                            .frame(maxWidth: .infinity, alignment: .leading)
                            .padding()
                    }

                    // Invisible anchor used as the scroll-to-bottom target.
                    Color.clear
                        .frame(height: 1)
                        .id("bottom")
                }
            }
            .onChange(of: text) {
                withAnimation(.easeOut(duration: 0.15)) {
                    proxy.scrollTo("bottom", anchor: .bottom)
                }
            }
            .onAppear {
                proxy.scrollTo("bottom", anchor: .bottom)
            }
        }
        .overlay(alignment: .bottom) {
            if showsStatusIndicator {
                statusIndicator
                    .padding(.bottom, 12)
            }
        }
    }

    /// Shown for a live session that has produced no text yet.
    private var livePlaceholder: some View {
        VStack(spacing: 16) {
            Spacer()

            if isRecording {
                AudioLevelView(level: audioLevel)
            } else {
                ProgressView()
                    .controlSize(.large)
            }

            Text(isRecording ? "Recording..." : "Transcribing...")
                .font(.title3)
                .foregroundStyle(.secondary)

            if !statusMessage.isEmpty {
                Text(statusMessage)
                    .font(.caption)
                    .foregroundStyle(.tertiary)
            }

            Spacer()
        }
        .frame(maxWidth: .infinity, maxHeight: .infinity)
        .padding()
    }

    /// Floating capsule: level meter while recording, spinner plus status
    /// text while transcribing.
    private var statusIndicator: some View {
        HStack(spacing: 8) {
            if isRecording {
                AudioLevelView(level: audioLevel, barCount: 12)
                Text("Recording")
                    .font(.caption)
                    .foregroundStyle(.secondary)
            } else if isTranscribing {
                ProgressView()
                    .controlSize(.small)
                Text(statusMessage.isEmpty ? "Transcribing" : statusMessage)
                    .font(.caption)
                    .foregroundStyle(.secondary)
            }
        }
        .padding(.horizontal, 14)
        .padding(.vertical, 8)
        .background(.ultraThinMaterial, in: Capsule())
    }
}
diff --git a/ExecuWhisper/ExecuWhisper/Views/WelcomeView.swift b/ExecuWhisper/ExecuWhisper/Views/WelcomeView.swift new file mode 100644 index 0000000000..00cbbdc245 --- /dev/null +++ b/ExecuWhisper/ExecuWhisper/Views/WelcomeView.swift @@ -0,0 +1,282 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

import SwiftUI

/// Home screen: model/helper status, optional preload controls, and the
/// record / import actions once the model is ready.
struct WelcomeView: View {
    @Environment(TranscriptStore.self) private var store
    @Environment(ModelDownloader.self) private var downloader
    @Environment(Preferences.self) private var preferences
    /// `true` while a drag is hovering over the view; switches the drop hint
    /// text and the card background.
    var isDropTargeted: Bool = false

    var body: some View {
        VStack(spacing: 18) {
            Image(systemName: "waveform")
                .font(.system(size: 56))
                .foregroundStyle(.secondary)

            Text("ExecuWhisper")
                .font(.title.bold())

            Text("On-device dictation and formatting powered by ExecuTorch")
                .font(.subheadline)
                .foregroundStyle(.secondary)

            modelSection

            // Everything below depends on the model being usable.
            if store.isModelReady {
                preloadSection

                formattingSection

                Button {
                    Task { await store.startRecording() }
                } label: {
                    Label("Start Recording", systemImage: "mic.fill")
                        .frame(minWidth: 180)
                }
                .buttonStyle(.borderedProminent)
                .controlSize(.large)
                .keyboardShortcut("R", modifiers: [.command, .shift])

                Button {
                    store.importAudioFileWithPanel()
                } label: {
                    Label("Import Audio...", systemImage: "square.and.arrow.down")
                        .frame(minWidth: 180)
                }
                .buttonStyle(.bordered)
                .controlSize(.large)

                dropHint
            }

            Text(preferences.modelDirectory)
                .font(.caption.monospaced())
                .foregroundStyle(.tertiary)
                .lineLimit(2)
                .multilineTextAlignment(.center)

            shortcutHints
        }
        .padding(40)
        .frame(maxWidth: 520)
        .background(backgroundStyle, in: RoundedRectangle(cornerRadius: 18))
    }

    /// Card stating whether smart formatting is on, with a one-line explanation.
    private var formattingSection: some View {
        VStack(spacing: 8) {
            Text("Smart formatting")
                .font(.caption)
                .foregroundStyle(.secondary)
            Text(preferences.enableSmartFormatting ? "On" : "Off")
                .font(.headline)
            if preferences.enableSmartFormatting {
                Text("LFM2.5 rewrites the Parakeet transcript before paste/save.")
                    .font(.caption)
                    .foregroundStyle(.secondary)
                    .multilineTextAlignment(.center)
            } else {
                Text("Smart formatting is off; replacements still apply.")
                    .font(.caption)
                    .foregroundStyle(.secondary)
                    .multilineTextAlignment(.center)
            }
        }
        .padding()
        .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))
    }

    /// Model status. The first matching condition wins, in this order: helper
    /// missing, download in progress, model ready, health check running,
    /// download offered, and finally a generic "incomplete" notice.
    @ViewBuilder
    private var modelSection: some View {
        if store.healthResult?.runnerAvailable == false {
            VStack(spacing: 10) {
                Label("Helper setup required", systemImage: "wrench.and.screwdriver")
                    .font(.headline)
                Text("Build `parakeet_helper` or choose an existing binary in Settings before recording.")
                    .font(.callout)
                    .foregroundStyle(.secondary)
                    .multilineTextAlignment(.center)
            }
            .padding()
            .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))
        } else if downloader.isDownloading || store.modelState == .downloading {
            VStack(spacing: 12) {
                // Never show less than 2% so the bar is visible from the start.
                ProgressView(value: max(downloader.overallProgress, 0.02))
                    .frame(minWidth: 220)
                Text(downloader.statusMessage.isEmpty ? "Downloading model..." : downloader.statusMessage)
                    .font(.callout)
                if !downloader.currentFileName.isEmpty {
                    Text(downloader.currentFileName)
                        .font(.caption.monospaced())
                        .foregroundStyle(.secondary)
                }
            }
            .padding()
            .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))
        } else if store.isModelReady {
            HStack(spacing: 8) {
                Image(systemName: "checkmark.circle.fill")
                    .foregroundStyle(.green)
                Text("Model files ready")
                    .foregroundStyle(.secondary)
            }
            .font(.callout)
        } else if store.modelState == .checking {
            VStack(spacing: 12) {
                ProgressView()
                    .controlSize(.regular)
                Text("Checking model...")
                    .font(.callout)
                    .foregroundStyle(.secondary)
            }
            .padding()
            .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))
        } else if store.healthResult?.shouldOfferModelDownload == true {
            Button {
                Task { await store.downloadModel() }
            } label: {
                Label("Download Model", systemImage: "arrow.down.circle")
                    .frame(minWidth: 180)
            }
            .buttonStyle(.bordered)
            .controlSize(.large)
        } else {
            VStack(spacing: 10) {
                Label("Model setup incomplete", systemImage: "exclamationmark.triangle")
                    .font(.headline)
                Text(store.statusMessage)
                    .font(.callout)
                    .foregroundStyle(.secondary)
            }
            .padding()
            .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))
        }
    }

    /// Preload controls, one layout per `store.helperState` case.
    @ViewBuilder
    private var preloadSection: some View {
        switch store.helperState {
        case .unloaded:
            VStack(spacing: 12) {
                Text("Preload the helper to reduce stop-to-text latency.")
                    .font(.callout)
                    .foregroundStyle(.secondary)
                    .multilineTextAlignment(.center)

                Button {
                    Task { await store.preloadModel() }
                } label: {
                    Label("Preload Model", systemImage: "bolt.fill")
                        .frame(minWidth: 180)
                }
                .buttonStyle(.bordered)
            }
            .padding()
            .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))

        case .loading:
            VStack(spacing: 12) {
                ProgressView()
                    .controlSize(.regular)
                VStack(spacing: 4) {
                    Text("Preloading model...")
                        .font(.callout)
                    Text(store.helperStatusMessage.isEmpty ? "Warming helper to reduce first transcription latency." : store.helperStatusMessage)
                        .font(.caption)
                        .foregroundStyle(.secondary)
                        .multilineTextAlignment(.center)
                }
            }
            .padding()
            .frame(minWidth: 220)
            .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))

        case .warm:
            HStack(spacing: 8) {
                Image(systemName: "bolt.circle.fill")
                    .foregroundStyle(.green)
                Text("Model preloaded")
                    .foregroundStyle(.secondary)
                Button {
                    Task { await store.unloadModel() }
                } label: {
                    Label("Unload", systemImage: "xmark.circle")
                        .labelStyle(.iconOnly)
                        .font(.callout)
                }
                .buttonStyle(.plain)
                .foregroundStyle(.secondary)
                .help("Unload the helper to free resources")
            }
            .font(.callout)

        case .failed:
            VStack(spacing: 12) {
                Label("Warmup failed", systemImage: "exclamationmark.triangle")
                    .foregroundStyle(.orange)
                Text(store.helperStatusMessage.isEmpty ? "The helper could not preload." : store.helperStatusMessage)
                    .font(.callout)
                    .foregroundStyle(.secondary)
                    .multilineTextAlignment(.center)
                Button("Retry Preload") {
                    Task { await store.preloadModel() }
                }
                .buttonStyle(.bordered)
            }
            .padding()
            .background(.background.secondary, in: RoundedRectangle(cornerRadius: 10))
        }
    }

    /// Row of keyboard-shortcut badges; the dictation badge reflects the
    /// configured shortcut.
    private var shortcutHints: some View {
        HStack(spacing: 12) {
            shortcutBadge(preferences.dictationShortcut.displayString, label: "Dictation")
            Divider()
                .frame(height: 24)
            shortcutBadge("⌘⇧R", label: "Record / Stop")
            shortcutBadge("⌘⇧C", label: "Copy")
        }
        .padding(.top, 4)
    }

    /// Drag-and-drop hint; emphasised while a drag is over the view.
    private var dropHint: some View {
        VStack(spacing: 8) {
            Text(isDropTargeted ? "Drop audio to transcribe" : "Drop a WAV or MP3 file here to transcribe it")
                .font(.callout.weight(isDropTargeted ? .semibold : .regular))
                .foregroundStyle(isDropTargeted ? Color.accentColor : Color.secondary)
                .multilineTextAlignment(.center)
            Text("Imported transcripts are saved to History using the filename as the title.")
                .font(.caption)
                .foregroundStyle(.tertiary)
                .multilineTextAlignment(.center)
        }
        .padding(.top, 4)
    }

    /// Card background: secondary fill while drop-targeted, clear otherwise.
    private var backgroundStyle: some ShapeStyle {
        isDropTargeted ? AnyShapeStyle(.background.secondary) : AnyShapeStyle(.clear)
    }

    /// Monospaced shortcut text in a rounded badge with a caption underneath.
    private func shortcutBadge(_ shortcut: String, label: String) -> some View {
        VStack(spacing: 4) {
            Text(shortcut)
                .font(.caption.monospaced())
                .padding(.horizontal, 6)
                .padding(.vertical, 3)
                .background(.quaternary, in: RoundedRectangle(cornerRadius: 4))
            Text(label)
                .font(.caption2)
                .foregroundStyle(.tertiary)
        }
    }
}
diff --git a/ExecuWhisper/ExecuWhisperTests/AudioRecorderTests.swift b/ExecuWhisper/ExecuWhisperTests/AudioRecorderTests.swift new file mode 100644 index 0000000000..2c4770d93b --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/AudioRecorderTests.swift @@ -0,0 +1,156 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
+ */ + +import AVFoundation +import CoreAudio +import Foundation +import Testing + +struct AudioRecorderTests { + @Test + func resolvePreferredMicrophoneUsesExactSavedDeviceWhenAvailable() { + let available = [ + AudioRecorder.InputDevice(id: "default", name: "MacBook Microphone", isDefault: true), + AudioRecorder.InputDevice(id: "usb", name: "USB Audio Device", isDefault: false), + ] + + let resolved = AudioRecorder.resolvePreferredMicrophone( + selectedMicrophoneID: "usb", + availableDevices: available + ) + + #expect(resolved?.device.id == "usb") + #expect(resolved?.usedFallback == false) + } + + @Test + func resolvePreferredMicrophoneFallsBackToDefaultWhenSavedDeviceIsMissing() { + let available = [ + AudioRecorder.InputDevice(id: "default", name: "MacBook Microphone", isDefault: true), + AudioRecorder.InputDevice(id: "usb", name: "USB Audio Device", isDefault: false), + ] + + let resolved = AudioRecorder.resolvePreferredMicrophone( + selectedMicrophoneID: "missing", + availableDevices: available + ) + + #expect(resolved?.device.id == "default") + #expect(resolved?.usedFallback == true) + } + + @Test + func resolvePreferredMicrophoneReturnsNilWhenNoDevicesAreAvailable() { + let resolved = AudioRecorder.resolvePreferredMicrophone( + selectedMicrophoneID: "missing", + availableDevices: [] + ) + + #expect(resolved == nil) + } + + @Test + func selectInputDeviceIDUsesExactUIDAndRequiresInputChannels() { + let records = [ + AudioRecorder.CoreAudioDeviceRecord( + id: AudioDeviceID(100), + uid: "same-name-output", + name: "AirPods Pro", + inputChannelCount: 0 + ), + AudioRecorder.CoreAudioDeviceRecord( + id: AudioDeviceID(101), + uid: "airpods-left", + name: "AirPods Pro", + inputChannelCount: 1 + ), + AudioRecorder.CoreAudioDeviceRecord( + id: AudioDeviceID(102), + uid: "airpods-right", + name: "AirPods Pro", + inputChannelCount: 1 + ), + ] + + #expect(AudioRecorder.selectInputDeviceID(forUID: "airpods-right", from: records) == AudioDeviceID(102)) + 
#expect(AudioRecorder.selectInputDeviceID(forUID: "same-name-output", from: records) == nil) + #expect(AudioRecorder.selectInputDeviceID(forUID: "missing", from: records) == nil) + } + + @Test + func trimTrailingPCMRemovesConfiguredTailFromLongCapture() { + let pcmData = makePCMData(sampleCount: 16_000) + + let trimmed = AudioRecorder.trimTrailingPCM( + pcmData, + sampleRate: 16_000, + trimDurationMs: 256 + ) + + let expectedTrimmedSamples = 16_000 - 4_096 + #expect(trimmed.count == expectedTrimmedSamples * MemoryLayout.size) + } + + @Test + func trimTrailingPCMPreservesShortCapture() { + let pcmData = makePCMData(sampleCount: 2_000) + + let trimmed = AudioRecorder.trimTrailingPCM( + pcmData, + sampleRate: 16_000, + trimDurationMs: 256 + ) + + #expect(trimmed == pcmData) + } + + @Test + func nativeCaptureWriterCreatesReadableWAVFile() throws { + let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: 44_100, + channels: 2, + interleaved: false + )! + let frameCount: AVAudioFrameCount = 4_410 + let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)! + buffer.frameLength = frameCount + for channel in 0.. Data { + var samples = (0...size) + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/DictationManagerTests.swift b/ExecuWhisper/ExecuWhisperTests/DictationManagerTests.swift new file mode 100644 index 0000000000..54fc2c835f --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/DictationManagerTests.swift @@ -0,0 +1,76 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import Carbon.HIToolbox +import Foundation +import Testing + +@MainActor +struct DictationManagerTests { + @Test + func dictationStartSetsListeningState() async { + let manager = DictationManager.preview() + + await manager.beginPreviewDictation() + + #expect(manager.state == .listening) + #expect(manager.overlayStatusText.isEmpty) + } + + @Test + func beginPreviewTranscriptionSetsTranscribingState() async { + let manager = DictationManager.preview() + + await manager.beginPreviewTranscription() + + #expect(manager.state == .transcribing) + #expect(manager.overlayStatusText.isEmpty) + } + + @Test + func finishPreviewDictationReturnsToIdle() async { + let manager = DictationManager.preview() + await manager.beginPreviewDictation() + + await manager.finishPreviewDictation() + + #expect(manager.state == .idle) + } + + @Test + func silenceTimeoutSchedulesStopRequestAsynchronously() async { + var didRequestStop = false + let manager = DictationManager.preview { + didRequestStop = true + } + await manager.beginPreviewDictation() + + manager.triggerSilenceTimeoutForTesting() + await Task.yield() + + #expect(didRequestStop) + } + + @Test + func hotKeyStatusUsesConfiguredShortcutDisplay() { + let suiteName = "DictationManagerTests.\(UUID().uuidString)" + let defaults = UserDefaults(suiteName: suiteName)! 
+ defer { defaults.removePersistentDomain(forName: suiteName) } + + let preferences = Preferences(defaults: defaults) + preferences.dictationShortcut = DictationShortcut( + keyCode: UInt32(kVK_ANSI_D), + carbonModifiers: UInt32(controlKey | shiftKey), + keyDisplay: "D" + ) + + let manager = DictationManager(preferences: preferences) + + #expect(manager.hotKeyStatusText == "⌃⇧D ready") + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/FormatterBridgeTests.swift b/ExecuWhisper/ExecuWhisperTests/FormatterBridgeTests.swift new file mode 100644 index 0000000000..09bd55bc2c --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/FormatterBridgeTests.swift @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import Testing + +struct FormatterBridgeTests { + @Test + func warmFormatterHelperIsReusedAcrossRequests() async throws { + let sandbox = makeSandbox() + let launchCountURL = sandbox.appendingPathComponent("launch_count.txt") + let helperURL = try makeFakeFormatterHelper(in: sandbox, launchCountURL: launchCountURL) + let modelURL = createDummyFile(named: "lfm2_5_350m_mlx_4w.pte", in: sandbox) + let tokenizerURL = createDummyFile(named: "tokenizer.json", in: sandbox) + let tokenizerConfigURL = createDummyFile(named: "tokenizer_config.json", in: sandbox) + let bridge = FormatterBridge() + + try await bridge.prepare( + runnerPath: helperURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path, + tokenizerConfigPath: tokenizerConfigURL.path + ) + let first = try await bridge.format( + runnerPath: helperURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path, + tokenizerConfigPath: tokenizerConfigURL.path, + prompt: "first prompt", + maxNewTokens: 96, + temperature: 0.0 + ) + let second = try await bridge.format( + runnerPath: 
helperURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path, + tokenizerConfigPath: tokenizerConfigURL.path, + prompt: "second prompt", + maxNewTokens: 96, + temperature: 0.0 + ) + await bridge.shutdown() + + #expect(first.text == "formatted:first prompt") + #expect(second.text == "formatted:second prompt") + #expect(second.tokensPerSecond == 24.0) + + let launches = try String(contentsOf: launchCountURL, encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines) + #expect(launches == "1") + } + + @Test + func prepareValidatesFormatterAssetsBeforeLaunch() async throws { + let sandbox = makeSandbox() + let helperURL = try makeFakeFormatterHelper( + in: sandbox, + launchCountURL: sandbox.appendingPathComponent("launch_count.txt") + ) + let modelURL = createDummyFile(named: "lfm2_5_350m_mlx_4w.pte", in: sandbox) + let tokenizerURL = createDummyFile(named: "tokenizer.json", in: sandbox) + let missingTokenizerConfig = sandbox.appendingPathComponent("tokenizer_config.json") + let bridge = FormatterBridge() + + await #expect(throws: RunnerError.self) { + try await bridge.prepare( + runnerPath: helperURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path, + tokenizerConfigPath: missingTokenizerConfig.path + ) + } + } + + private func makeFakeFormatterHelper(in sandbox: URL, launchCountURL: URL) throws -> URL { + let helperURL = sandbox.appendingPathComponent("formatter_helper.py") + let script = """ + #!/usr/bin/env python3 + import json + import pathlib + import sys + + launch_path = pathlib.Path(\(pythonStringLiteral(launchCountURL.path))) + launch_count = 0 + if launch_path.exists(): + launch_count = int(launch_path.read_text().strip() or "0") + launch_path.write_text(str(launch_count + 1)) + + sys.stdout.write(json.dumps({"type": "ready", "version": 1}) + "\\n") + sys.stdout.flush() + + while True: + line = sys.stdin.readline() + if not line: + break + request = json.loads(line) + request_type = request.get("type") + if 
request_type == "shutdown": + break + if request_type != "format": + sys.stdout.write(json.dumps({ + "type": "error", + "version": 1, + "message": "unsupported request" + }) + "\\n") + sys.stdout.flush() + continue + + request_id = request["request_id"] + prompt = request["prompt"] + sys.stdout.write(json.dumps({ + "type": "status", + "version": 1, + "request_id": request_id, + "phase": "formatting", + "message": "Formatting..." + }) + "\\n") + sys.stdout.write(json.dumps({ + "type": "result", + "version": 1, + "request_id": request_id, + "text": "formatted:" + prompt, + "stdout": "", + "stderr": "", + "tokens_per_second": 24.0 + }) + "\\n") + sys.stdout.flush() + """ + try script.write(to: helperURL, atomically: true, encoding: .utf8) + try FileManager.default.setAttributes([.posixPermissions: 0o755], ofItemAtPath: helperURL.path) + return helperURL + } + + private func createDummyFile(named name: String, in sandbox: URL) -> URL { + let url = sandbox.appendingPathComponent(name) + FileManager.default.createFile(atPath: url.path, contents: Data("x".utf8)) + return url + } + + private func pythonStringLiteral(_ value: String) -> String { + let escaped = value + .replacingOccurrences(of: "\\", with: "\\\\") + .replacingOccurrences(of: "\"", with: "\\\"") + return "\"\(escaped)\"" + } + + private func makeSandbox() -> URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent("formatter-bridge-\(UUID().uuidString)", isDirectory: true) + try? FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + return directory + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/FormatterHelperProtocolTests.swift b/ExecuWhisper/ExecuWhisperTests/FormatterHelperProtocolTests.swift new file mode 100644 index 0000000000..9b91eeb684 --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/FormatterHelperProtocolTests.swift @@ -0,0 +1,64 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import Testing + +struct FormatterHelperProtocolTests { + @Test + func formatRequestEncodesStableWireFormat() throws { + let request = FormatterHelperProtocol.FormatRequest( + requestID: "fmt-123", + prompt: "format this", + maxNewTokens: 128, + temperature: 0.0 + ) + + let data = try JSONEncoder().encode(request) + let json = try #require( + JSONSerialization.jsonObject(with: data) as? [String: Any] + ) + + #expect(json["type"] as? String == "format") + #expect(json["version"] as? Int == 1) + #expect(json["request_id"] as? String == "fmt-123") + #expect(json["prompt"] as? String == "format this") + #expect(json["max_new_tokens"] as? Int == 128) + #expect(json["temperature"] as? Double == 0.0) + } + + @Test + func helperMessageDecodesFormatterResultEnvelope() throws { + let data = Data( + """ + { + "type": "result", + "version": 1, + "request_id": "fmt-123", + "text": "Polished output.", + "stdout": "tokens=12", + "stderr": "", + "tokens_per_second": 42.5 + } + """.utf8 + ) + + let message = try JSONDecoder().decode(FormatterHelperProtocol.HelperMessage.self, from: data) + + guard case .result(let result) = message else { + Issue.record("Expected result message") + return + } + + #expect(result.requestID == "fmt-123") + #expect(result.text == "Polished output.") + #expect(result.stdout == "tokens=12") + #expect(result.stderr == "") + #expect(result.tokensPerSecond == 42.5) + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/FormatterPromptBuilderTests.swift b/ExecuWhisper/ExecuWhisperTests/FormatterPromptBuilderTests.swift new file mode 100644 index 0000000000..6ff2dd7acc --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/FormatterPromptBuilderTests.swift @@ -0,0 +1,115 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Testing + +struct FormatterPromptBuilderTests { + @Test + func smartPromptInstructsRewriteAndForbidsAnswering() { + let prompt = FormatterPromptBuilder.prompt( + transcript: "um please clean this up" + ) + + #expect(prompt.contains("You rewrite spoken dictation into clean final text.")) + #expect(prompt.contains("You are not a chat assistant.")) + #expect(prompt.contains("Never answer or respond to the dictation")) + #expect(prompt.contains("Fix casing, punctuation, filler, and speech disfluencies.")) + #expect(prompt.contains("Preserve meaning and detail.")) + #expect(prompt.contains("Use bullets only when it clearly reads as a list.")) + #expect(prompt.contains("Do not summarize or invent information.")) + #expect(prompt.contains("Output only the rewritten dictation.")) + #expect(!prompt.contains("Mode:")) + #expect(prompt.contains("um please clean this up")) + #expect(prompt.hasSuffix("<|im_start|>assistant\n")) + } + + @Test + func smartPromptIncludesQuestionExampleSoModelDoesNotAnswerQuestions() { + let prompt = FormatterPromptBuilder.prompt( + transcript: "does it feel like real-time processing?" 
+ ) + + #expect(prompt.contains("Examples:")) + #expect(prompt.contains("Dictation: um does it feel like real time processing")) + #expect(prompt.contains("Output: Does it feel like real-time processing?")) + #expect(prompt.contains("Dictation: does it feel like real-time processing?")) + #expect(prompt.contains("Output:")) + } + + @Test + func smartPromptDoesNotContainLegacyModeInstructions() { + let prompt = FormatterPromptBuilder.prompt( + transcript: "new section launch notes bullet first item" + ) + + #expect(!prompt.contains("Mode: Bullet Notes")) + #expect(!prompt.contains("Write the final email body.")) + #expect(!prompt.contains("Custom rewrite instruction:")) + #expect(!prompt.contains("Meeting Notes")) + #expect(!prompt.contains("Action Items")) + #expect(!prompt.contains("Summary")) + } + + @Test + func smartPromptCanGuideListLikeDictationWithoutDedicatedMode() { + let prompt = FormatterPromptBuilder.prompt( + transcript: "todo update helper docs test formatter download follow up with Alex" + ) + + #expect(prompt.contains("Use bullets only when it clearly reads as a list.")) + #expect(prompt.contains("todo update helper docs test formatter download follow up with Alex")) + #expect(!prompt.contains("Mode: Bullet Notes")) + #expect(!prompt.contains("recipient")) + #expect(!prompt.contains("signoff")) + } + + @Test + func smartPromptCanHandleSpokenEmailIntentWithoutDedicatedMode() { + let prompt = FormatterPromptBuilder.prompt( + transcript: "turn this into an email thanks for the update" + ) + + #expect(prompt.contains("You rewrite spoken dictation into clean final text.")) + #expect(prompt.contains("turn this into an email thanks for the update")) + #expect(!prompt.contains("Mode: Email")) + #expect(!prompt.contains("Write the final email body.")) + } + + @Test + func smartPromptIgnoresRemovedCustomInstructionSurface() { + let prompt = FormatterPromptBuilder.prompt( + transcript: "write this for a design review" + ) + + #expect(!prompt.contains("Mode: 
Custom")) + #expect(!prompt.contains("Custom rewrite instruction:")) + #expect(!prompt.contains("Make this sound crisp and decisive.")) + #expect(prompt.contains("Preserve meaning and detail.")) + #expect(prompt.contains("Output only the rewritten dictation.")) + } + + @Test + func promptPreservesRawTranscriptVerbatim() { + let transcript = "line one\n\"quoted text\"\nnew paragraph" + + let prompt = FormatterPromptBuilder.prompt( + transcript: transcript + ) + + #expect(prompt.contains(transcript)) + #expect(!prompt.contains("Transcript:\n\"\"\"")) + } + + @Test + func maxNewTokensScalesWithTranscriptLengthWithinBounds() { + #expect(FormatterPromptBuilder.maxNewTokens(for: "short text") == 96) + + let longTranscript = Array(repeating: "word", count: 400).joined(separator: " ") + #expect(FormatterPromptBuilder.maxNewTokens(for: longTranscript) == 512) + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/ImportedAudioDecoderTests.swift b/ExecuWhisper/ExecuWhisperTests/ImportedAudioDecoderTests.swift new file mode 100644 index 0000000000..7c6a99ed56 --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/ImportedAudioDecoderTests.swift @@ -0,0 +1,80 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import AVFoundation +import Foundation +import Testing + +struct ImportedAudioDecoderTests { + @Test + func supportsWavAndMp3ExtensionsOnly() { + #expect(ImportedAudioDecoder.supportsAudioFile(URL(fileURLWithPath: "/tmp/sample.wav"))) + #expect(ImportedAudioDecoder.supportsAudioFile(URL(fileURLWithPath: "/tmp/sample.mp3"))) + #expect(!ImportedAudioDecoder.supportsAudioFile(URL(fileURLWithPath: "/tmp/sample.m4a"))) + #expect(!ImportedAudioDecoder.supportsAudioFile(URL(fileURLWithPath: "/tmp/sample.txt"))) + } + + @Test + func importableAudioFileRequiresExactlyOneSupportedFile() { + let wavURL = URL(fileURLWithPath: "/tmp/sample.wav") + let mp3URL = URL(fileURLWithPath: "/tmp/sample.mp3") + let txtURL = URL(fileURLWithPath: "/tmp/sample.txt") + + #expect(ImportedAudioDecoder.importableAudioFile(from: [wavURL]) == wavURL) + #expect(ImportedAudioDecoder.importableAudioFile(from: [mp3URL]) == mp3URL) + #expect(ImportedAudioDecoder.importableAudioFile(from: [txtURL]) == nil) + #expect(ImportedAudioDecoder.importableAudioFile(from: [wavURL, mp3URL]) == nil) + } + + @Test + func decodeAudioFileNormalizesWavToFloat32Mono16kPCM() throws { + let sandbox = makeSandbox() + let inputURL = sandbox.appendingPathComponent("input.wav") + try writeTestWAV(to: inputURL, sampleRate: 44_100, channelCount: 2, frameCount: 4_410) + + let decoded = try ImportedAudioDecoder().decodeAudioFile(at: inputURL) + + #expect(decoded.duration > 0.09 && decoded.duration < 0.11) + #expect(decoded.pcmData.count % MemoryLayout.size == 0) + let sampleCount = decoded.pcmData.count / MemoryLayout.size + #expect(sampleCount > 1_500 && sampleCount < 1_700) + } + + private func writeTestWAV( + to url: URL, + sampleRate: Double, + channelCount: AVAudioChannelCount, + frameCount: AVAudioFrameCount + ) throws { + let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: sampleRate, + channels: channelCount, + interleaved: false + )! 
+ let file = try AVAudioFile(forWriting: url, settings: format.settings) + let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)! + buffer.frameLength = frameCount + + for channel in 0.. URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString, isDirectory: true) + try? FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + return directory + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/ParakeetHelperProtocolTests.swift b/ExecuWhisper/ExecuWhisperTests/ParakeetHelperProtocolTests.swift new file mode 100644 index 0000000000..5709c9c322 --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/ParakeetHelperProtocolTests.swift @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import Testing + +struct ParakeetHelperProtocolTests { + @Test + func transcribeRequestEncodesStableWireFormat() throws { + let request = ParakeetHelperProtocol.TranscribeRequest( + requestID: "req-123", + audio: .init( + encoding: .float32LittleEndian, + sampleRate: 16_000, + channelCount: 1, + payloadByteCount: 6400 + ), + enableRuntimeProfile: true + ) + + let data = try JSONEncoder().encode(request) + let json = try #require( + JSONSerialization.jsonObject(with: data) as? [String: Any] + ) + + #expect(json["type"] as? String == "transcribe") + #expect(json["version"] as? Int == 1) + #expect(json["request_id"] as? String == "req-123") + #expect(json["enable_runtime_profile"] as? Bool == true) + + let audio = try #require(json["audio"] as? [String: Any]) + #expect(audio["encoding"] as? String == "f32le") + #expect(audio["sample_rate"] as? Int == 16_000) + #expect(audio["channel_count"] as? Int == 1) + #expect(audio["payload_byte_count"] as? 
Int == 6400) + } + + @Test + func helperMessageDecodesResultEnvelope() throws { + let data = Data( + """ + { + "type": "result", + "version": 1, + "request_id": "req-123", + "text": "hello world", + "stdout": "PyTorchObserver foo", + "stderr": "", + "runtime_profile": "decode_loop_ms=12.5" + } + """.utf8 + ) + + let message = try JSONDecoder().decode(ParakeetHelperProtocol.HelperMessage.self, from: data) + + guard case .result(let result) = message else { + Issue.record("Expected result message") + return + } + + #expect(result.requestID == "req-123") + #expect(result.text == "hello world") + #expect(result.stdout == "PyTorchObserver foo") + #expect(result.stderr == "") + #expect(result.runtimeProfile == "decode_loop_ms=12.5") + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/PersistenceRegressionTests.swift b/ExecuWhisper/ExecuWhisperTests/PersistenceRegressionTests.swift new file mode 100644 index 0000000000..625886685d --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/PersistenceRegressionTests.swift @@ -0,0 +1,108 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import Foundation +import Testing + +@MainActor +struct PersistenceRegressionTests { + @Test + func unreadableReplacementFileIsNotOverwritten() throws { + let sandbox = makeSandbox() + let fileURL = sandbox.appendingPathComponent("replacements.json") + try Data("not-json".utf8).write(to: fileURL, options: .atomic) + + let store = ReplacementStore(fileURL: fileURL) + + #expect(!store.entries.isEmpty) + let contents = try String(contentsOf: fileURL) + #expect(contents == "not-json") + } + + @Test + func staleSavedRunnerPathFallsBackToBundledRunnerPath() { + let resolved = Preferences.resolveRunnerPath( + savedRunnerPath: "/tmp/custom-runner", + savedRunnerExists: false, + bundledRunnerPath: "/tmp/bundled-runner", + bundledRunnerExists: true, + buildRunnerPath: "/tmp/build-runner" + ) + + #expect(resolved == "/tmp/bundled-runner") + } + + @Test + func validSavedRunnerPathStillBeatsBundledRunnerPath() { + let resolved = Preferences.resolveRunnerPath( + savedRunnerPath: "/tmp/custom-runner", + savedRunnerExists: true, + bundledRunnerPath: "/tmp/bundled-runner", + bundledRunnerExists: true, + buildRunnerPath: "/tmp/build-runner" + ) + + #expect(resolved == "/tmp/custom-runner") + } + + @Test + func staleSavedModelDirectoryFallsBackToDownloadedModelDirectory() { + let resolved = Preferences.resolveModelDirectory( + savedModelDirectory: "/tmp/stale-models", + bundledModelDirectory: nil, + downloadedModelDirectory: "/tmp/downloaded-models" + ) { candidate in + candidate == "/tmp/downloaded-models" + } + + #expect(resolved == "/tmp/downloaded-models") + } + + @Test + func savedMicrophoneSelectionPersistsAcrossPreferencesReload() { + let suiteName = "PersistenceRegressionTests.\(UUID().uuidString)" + let defaults = UserDefaults(suiteName: suiteName)! 
+ defer { defaults.removePersistentDomain(forName: suiteName) } + + let preferences = Preferences(defaults: defaults) + preferences.selectedMicrophoneID = "usb-mic" + + let reloaded = Preferences(defaults: defaults) + + #expect(reloaded.selectedMicrophoneID == "usb-mic") + } + + @Test + func healthCheckGuidanceDistinguishesRunnerSetupFromModelDownload() { + let runnerMissing = HealthCheck.Result( + runnerAvailable: false, + modelAvailable: false, + tokenizerAvailable: false, + micPermission: .authorized + ) + let modelMissing = HealthCheck.Result( + runnerAvailable: true, + modelAvailable: false, + tokenizerAvailable: false, + micPermission: .authorized + ) + + #expect(runnerMissing.setupStatusMessage == "Helper setup required") + #expect(runnerMissing.missingFiles == ["parakeet_helper", "model.pte", "tokenizer.model"]) + #expect(!runnerMissing.shouldOfferModelDownload) + #expect(modelMissing.setupStatusMessage == "Model download required") + #expect(modelMissing.shouldOfferModelDownload) + } + + private func makeSandbox() -> URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString, isDirectory: true) + try? FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + return directory + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/PreferencesFormattingTests.swift b/ExecuWhisper/ExecuWhisperTests/PreferencesFormattingTests.swift new file mode 100644 index 0000000000..c847e921a3 --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/PreferencesFormattingTests.swift @@ -0,0 +1,52 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +import Foundation +import Testing + +@MainActor +struct PreferencesFormattingTests { + @Test + func formatterDefaultsUseSmartFormattingAndDownloadedModelDirectory() { + let suiteName = "formatter-defaults-\(UUID().uuidString)" + let defaults = UserDefaults(suiteName: suiteName)! + defaults.removePersistentDomain(forName: suiteName) + + let preferences = Preferences(defaults: defaults) + + #expect(preferences.enableSmartFormatting) + #expect(preferences.formatterModelDirectory == preferences.downloadedFormatterModelDirectoryURL.path(percentEncoded: false)) + #expect(preferences.formatterModelPath.hasSuffix("lfm2_5_350m_mlx_4w.pte")) + #expect(preferences.formatterTokenizerPath.hasSuffix("tokenizer.json")) + #expect(preferences.formatterTokenizerConfigPath.hasSuffix("tokenizer_config.json")) + } + + @Test + func formatterSettingsPersistAcrossPreferencesInstances() { + let suiteName = "formatter-persistence-\(UUID().uuidString)" + let defaults = UserDefaults(suiteName: suiteName)! + defaults.removePersistentDomain(forName: suiteName) + + let first = Preferences(defaults: defaults) + first.enableSmartFormatting = false + first.formatterRunnerPath = "/tmp/lfm25_formatter_helper" + let formatterDirectory = URL(fileURLWithPath: "/tmp/lfm25-\(UUID().uuidString)", isDirectory: true) + try? FileManager.default.createDirectory(at: formatterDirectory, withIntermediateDirectories: true) + try? Data("pte".utf8).write(to: formatterDirectory.appendingPathComponent("lfm2_5_350m_mlx_4w.pte")) + try? Data("tokenizer".utf8).write(to: formatterDirectory.appendingPathComponent("tokenizer.json")) + try? 
Data("config".utf8).write(to: formatterDirectory.appendingPathComponent("tokenizer_config.json")) + first.formatterModelDirectory = formatterDirectory.path(percentEncoded: false) + + let second = Preferences(defaults: defaults) + + #expect(!second.enableSmartFormatting) + #expect(second.formatterRunnerPath == "/tmp/lfm25_formatter_helper") + #expect(second.formatterModelDirectory == formatterDirectory.path(percentEncoded: false)) + } + +} diff --git a/ExecuWhisper/ExecuWhisperTests/RunnerBridgeTests.swift b/ExecuWhisper/ExecuWhisperTests/RunnerBridgeTests.swift new file mode 100644 index 0000000000..1cd77f24d9 --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/RunnerBridgeTests.swift @@ -0,0 +1,229 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +import Foundation +import Testing + +struct RunnerBridgeTests { + @Test + func runOptionsCanEnableRuntimeProfilingFromEnvironment() { + let enabled = RunnerBridge.RunOptions.fromEnvironment([ + "EXECUWHISPER_ENABLE_RUNTIME_PROFILE": "1" + ]) + let disabled = RunnerBridge.RunOptions.fromEnvironment([:]) + + #expect(enabled.enableRuntimeProfile) + #expect(!disabled.enableRuntimeProfile) + } + + @Test + func warmHelperIsReusedAcrossTwoRequests() async throws { + let sandbox = makeSandbox() + let launchCountURL = sandbox.appendingPathComponent("launch_count.txt") + let helperURL = try makeFakeHelper( + in: sandbox, + name: "helper_a.py", + launchCountURL: launchCountURL, + transcriptPrefix: "warm" + ) + let modelURL = createDummyFile(named: "model.pte", in: sandbox) + let tokenizerURL = createDummyFile(named: "tokenizer.model", in: sandbox) + let bridge = RunnerBridge() + + try await bridge.prepare( + runnerPath: helperURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path + ) + let first = try await collectResult( + from: await 
bridge.transcribePCM( + runnerPath: helperURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path, + pcmData: makePCMData(sampleCount: 1600), + options: .init() + ) + ) + let second = try await collectResult( + from: await bridge.transcribePCM( + runnerPath: helperURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path, + pcmData: makePCMData(sampleCount: 3200), + options: .init(enableRuntimeProfile: true) + ) + ) + await bridge.shutdown() + + #expect(first.text == "warm:1600") + #expect(second.text == "warm:3200") + #expect(second.runtimeProfile == "RUNTIME_PROFILE decode_loop_ms=1.0 host_overhead_ms=0.2") + + let launches = try String(contentsOf: launchCountURL, encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines) + #expect(launches == "1") + } + + @Test + func prepareRestartsWarmHelperWhenBinaryPathChanges() async throws { + let sandbox = makeSandbox(named: "runner bridge sandbox") + let launchCountAURL = sandbox.appendingPathComponent("launch_count_a.txt") + let launchCountBURL = sandbox.appendingPathComponent("launch_count_b.txt") + let helperAURL = try makeFakeHelper( + in: sandbox, + name: "helper_a.py", + launchCountURL: launchCountAURL, + transcriptPrefix: "alpha" + ) + let helperBURL = try makeFakeHelper( + in: sandbox, + name: "helper_b.py", + launchCountURL: launchCountBURL, + transcriptPrefix: "beta" + ) + let modelURL = createDummyFile(named: "model.pte", in: sandbox) + let tokenizerURL = createDummyFile(named: "tokenizer.model", in: sandbox) + let bridge = RunnerBridge() + + try await bridge.prepare( + runnerPath: helperAURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path + ) + try await bridge.prepare( + runnerPath: helperBURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path + ) + let result = try await collectResult( + from: await bridge.transcribePCM( + runnerPath: helperBURL.path, + modelPath: modelURL.path, + tokenizerPath: tokenizerURL.path, + pcmData: 
makePCMData(sampleCount: 800), + options: .init() + ) + ) + await bridge.shutdown() + + #expect(result.text == "beta:800") + + let launchesA = try String(contentsOf: launchCountAURL, encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines) + let launchesB = try String(contentsOf: launchCountBURL, encoding: .utf8) + .trimmingCharacters(in: .whitespacesAndNewlines) + #expect(launchesA == "1") + #expect(launchesB == "1") + } + + private func collectResult( + from events: AsyncThrowingStream + ) async throws -> RunnerBridge.TranscriptionResult { + var completed: RunnerBridge.TranscriptionResult? + for try await event in events { + if case .completed(let result) = event { + completed = result + } + } + return try #require(completed) + } + + private func makeFakeHelper( + in sandbox: URL, + name: String, + launchCountURL: URL, + transcriptPrefix: String + ) throws -> URL { + let helperURL = sandbox.appendingPathComponent(name) + let script = """ + #!/usr/bin/env python3 + import json + import pathlib + import sys + + launch_path = pathlib.Path(\(pythonStringLiteral(launchCountURL.path))) + launch_count = 0 + if launch_path.exists(): + launch_count = int(launch_path.read_text().strip() or "0") + launch_path.write_text(str(launch_count + 1)) + + transcript_prefix = \(pythonStringLiteral(transcriptPrefix)) + sys.stdout.write(json.dumps({"type": "ready", "version": 1}) + "\\n") + sys.stdout.flush() + + while True: + header = sys.stdin.buffer.readline() + if not header: + break + request = json.loads(header.decode("utf-8")) + request_type = request.get("type") + if request_type == "shutdown": + break + if request_type != "transcribe": + sys.stdout.write(json.dumps({ + "type": "error", + "version": 1, + "message": "unsupported request" + }) + "\\n") + sys.stdout.flush() + continue + + request_id = request["request_id"] + payload_size = request["audio"]["payload_byte_count"] + payload = sys.stdin.buffer.read(payload_size) + sample_count = payload_size // 4 + 
sys.stdout.write(json.dumps({ + "type": "status", + "version": 1, + "request_id": request_id, + "phase": "running_encoder", + "message": "Running encoder..." + }) + "\\n") + result_payload = { + "type": "result", + "version": 1, + "request_id": request_id, + "text": f"{transcript_prefix}:{sample_count}", + "stdout": "PyTorchObserver {}", + "stderr": "" + } + if request.get("enable_runtime_profile"): + result_payload["runtime_profile"] = "RUNTIME_PROFILE decode_loop_ms=1.0 host_overhead_ms=0.2" + sys.stdout.write(json.dumps(result_payload) + "\\n") + sys.stdout.flush() + """ + try script.write(to: helperURL, atomically: true, encoding: .utf8) + try FileManager.default.setAttributes([.posixPermissions: 0o755], ofItemAtPath: helperURL.path) + return helperURL + } + + private func makePCMData(sampleCount: Int) -> Data { + var samples = (0...size) + } + + private func createDummyFile(named name: String, in sandbox: URL) -> URL { + let url = sandbox.appendingPathComponent(name) + FileManager.default.createFile(atPath: url.path, contents: Data("x".utf8)) + return url + } + + private func pythonStringLiteral(_ value: String) -> String { + let escaped = value + .replacingOccurrences(of: "\\", with: "\\\\") + .replacingOccurrences(of: "\"", with: "\\\"") + return "\"\(escaped)\"" + } + + private func makeSandbox(named name: String = UUID().uuidString) -> URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent("\(name)-\(UUID().uuidString)", isDirectory: true) + try? FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + return directory + } +} diff --git a/ExecuWhisper/ExecuWhisperTests/SessionCompatibilityTests.swift b/ExecuWhisper/ExecuWhisperTests/SessionCompatibilityTests.swift new file mode 100644 index 0000000000..0c3ce33ab4 --- /dev/null +++ b/ExecuWhisper/ExecuWhisperTests/SessionCompatibilityTests.swift @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
/// The app-support folder is named "ExecuWhisper" and the sessions file, models directory
/// and replacements file are all direct children of it.
@Test
func persistencePathsUseExecuWhisperAppSupportDirectory() {
    let appSupport = PersistencePaths.appSupportDirectory
    #expect(appSupport.lastPathComponent == "ExecuWhisper")

    let persistedLocations = [
        PersistencePaths.sessionsURL,
        PersistencePaths.modelsDirectoryURL,
        PersistencePaths.replacementsURL,
    ]
    for location in persistedLocations {
        #expect(location.deletingLastPathComponent() == appSupport)
    }
}
/// Covers rendering a `Session` through `SessionExportFormat` and writing an export to disk
/// through `TranscriptStore.writeSessionExport`.
@MainActor
struct SessionExportTests {
    /// The JSON export contains the raw transcript, the tags array and the title.
    @Test
    func jsonExportIncludesRawTranscriptAndTags() throws {
        // `#require` reports a malformed UUID literal as a test failure instead of trapping.
        let sessionID = try #require(UUID(uuidString: "6BDF20D0-6E25-43EB-81A4-34748EF304F6"))
        let session = Session(
            id: sessionID,
            date: Date(timeIntervalSince1970: 1_742_814_600),
            title: "Meeting",
            transcript: "clean transcript",
            duration: 12.5,
            rawTranscript: "spoken transcript",
            tags: ["replacement"]
        )

        let json = SessionExportFormat.json.render(session)

        #expect(json.contains("\"rawTranscript\" : \"spoken transcript\""))
        #expect(json.contains("\"tags\" : ["))
        #expect(json.contains("\"title\" : \"Meeting\""))
    }

    /// The SRT cue runs from zero to the session duration and carries the transcript text.
    @Test
    func srtExportUsesSessionDuration() {
        let session = Session(
            title: "Timed",
            transcript: "subtitle line",
            duration: 12.5
        )

        let srt = SessionExportFormat.srt.render(session)

        #expect(srt.contains("00:00:00,000 --> 00:00:12,500"))
        #expect(srt.contains("subtitle line"))
    }

    /// Writing a JSON export produces a file containing both transcript variants.
    @Test
    func transcriptStoreWritesExportFiles() throws {
        let sandbox = makeSandbox()
        let sessionsURL = sandbox.appendingPathComponent("sessions.json")
        let store = TranscriptStore(
            preferences: Preferences(),
            downloader: ModelDownloader(),
            sessionsURL: sessionsURL
        )
        let session = Session(
            title: "Export me",
            transcript: "clean transcript",
            duration: 7,
            rawTranscript: "spoken transcript",
            tags: ["replacement"]
        )
        let exportURL = sandbox.appendingPathComponent("export.json")

        try store.writeSessionExport(session, format: .json, to: exportURL)

        // Read back with an explicit encoding rather than relying on encoding detection.
        let contents = try String(contentsOf: exportURL, encoding: .utf8)
        #expect(contents.contains("\"clean transcript\""))
        #expect(contents.contains("\"spoken transcript\""))
    }

    /// Creates a uniquely named scratch directory under the system temporary directory.
    ///
    /// A creation failure is recorded as a test issue so the root cause is visible; the
    /// URL is still returned so the signature stays non-throwing.
    private func makeSandbox() -> URL {
        let directory = FileManager.default.temporaryDirectory
            .appendingPathComponent(UUID().uuidString, isDirectory: true)
        do {
            try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
        } catch {
            Issue.record("Could not create sandbox directory at \(directory.path): \(error)")
        }
        return directory
    }
}
/// Covers `SessionHistory` grouping and search plus the pin/delete mutations on
/// `TranscriptStore`.
@MainActor
struct SessionHistoryTests {
    /// Pinned sessions are reported separately and never appear in the dated sections.
    @Test
    func historySectionsGroupByRecencyAndExcludePinnedSessions() {
        let calendar = Calendar(identifier: .gregorian)
        let referenceDate = Date(timeIntervalSinceReferenceDate: 86_400 * 10)
        let sessions = [
            Session(
                // Anchored to `referenceDate` like the other fixtures, so the test does not
                // depend on whatever default date `Session` supplies.
                date: referenceDate,
                title: "Today pinned",
                transcript: "latest transcript",
                duration: 8,
                rawTranscript: "latest raw",
                tags: ["replacement"],
                pinned: true
            ),
            Session(
                date: referenceDate,
                title: "Today",
                transcript: "today transcript",
                duration: 6
            ),
            Session(
                date: referenceDate.addingTimeInterval(-86_400),
                title: "Yesterday",
                transcript: "yesterday transcript",
                duration: 5
            ),
            Session(
                date: referenceDate.addingTimeInterval(-86_400 * 3),
                title: "Earlier",
                transcript: "earlier transcript",
                duration: 4
            ),
        ]

        let pinned = SessionHistory.pinnedSessions(
            in: sessions,
            matching: "",
            referenceDate: referenceDate,
            calendar: calendar
        )
        let sections = SessionHistory.sections(
            in: sessions,
            matching: "",
            referenceDate: referenceDate,
            calendar: calendar
        )

        #expect(pinned.count == 1)
        #expect(pinned.first?.title == "Today pinned")
        #expect(sections.map(\.title) == ["Today", "Yesterday", "Earlier"])
        #expect(sections.flatMap(\.sessions).allSatisfy { !$0.pinned })
    }

    /// Each query is expected to match exactly one of the two fixtures: "spoken" (raw
    /// transcript) and "replacement" (tag) for the first, "meeting" (title, different
    /// case) and "summary" (transcript) for the second.
    @Test
    func historySearchMatchesTranscriptTitleRawTranscriptAndTags() {
        let referenceDate = Date(timeIntervalSinceReferenceDate: 86_400 * 5)
        let sessions = [
            Session(
                date: referenceDate,
                title: "Tagged",
                transcript: "clean transcript",
                duration: 3,
                rawTranscript: "spoken words",
                tags: ["replacement"]
            ),
            Session(
                date: referenceDate.addingTimeInterval(-86_400),
                title: "Meeting Notes",
                transcript: "summary text",
                duration: 4
            ),
        ]

        #expect(SessionHistory.visibleSessions(in: sessions, matching: "spoken").count == 1)
        #expect(SessionHistory.visibleSessions(in: sessions, matching: "replacement").count == 1)
        #expect(SessionHistory.visibleSessions(in: sessions, matching: "meeting").count == 1)
        #expect(SessionHistory.visibleSessions(in: sessions, matching: "summary").count == 1)
    }

    /// Toggling the pin on a session loaded from disk marks the stored copy as pinned.
    @Test
    func togglePinnedUpdatesStoredSession() throws {
        let sessionsURL = makeSandbox().appendingPathComponent("sessions.json")
        let initial = Session(title: "Pin me", transcript: "text", duration: 3)
        try JSONEncoder().encode([initial]).write(to: sessionsURL, options: .atomic)

        let store = TranscriptStore(
            preferences: Preferences(),
            downloader: ModelDownloader(),
            sessionsURL: sessionsURL
        )

        store.togglePinned(initial)

        let updated = try #require(store.sessions.first)
        #expect(updated.pinned)
    }

    /// Deleting several sessions removes them all and clears both selection properties.
    @Test
    func deleteSessionsRemovesAllSelectedHistoryItems() throws {
        let sessionsURL = makeSandbox().appendingPathComponent("sessions.json")
        let first = Session(title: "First", transcript: "first", duration: 3)
        let second = Session(title: "Second", transcript: "second", duration: 4)
        let keep = Session(title: "Keep", transcript: "keep", duration: 5)
        try JSONEncoder().encode([first, second, keep]).write(to: sessionsURL, options: .atomic)

        let store = TranscriptStore(
            preferences: Preferences(),
            downloader: ModelDownloader(),
            sessionsURL: sessionsURL
        )
        store.selectedSessionID = first.id
        store.selectedHistorySessionIDs = [first.id, second.id]

        store.deleteSessions(ids: [first.id, second.id])

        #expect(store.sessions.map(\.id) == [keep.id])
        #expect(store.selectedSessionID == nil)
        #expect(store.selectedHistorySessionIDs.isEmpty)
    }

    /// Creates a uniquely named scratch directory under the system temporary directory.
    ///
    /// A creation failure is recorded as a test issue so the root cause is visible; the
    /// URL is still returned so the signature stays non-throwing.
    private func makeSandbox() -> URL {
        let directory = FileManager.default.temporaryDirectory
            .appendingPathComponent(UUID().uuidString, isDirectory: true)
        do {
            try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
        } catch {
            Issue.record("Could not create sandbox directory at \(directory.path): \(error)")
        }
        return directory
    }
}
/// With smart formatting off, the formatter receives no prompt and only the replacement
/// rules are applied.
@Test
func disabledSmartFormattingBypassesFormatterAndAppliesReplacements() async {
    let rulesURL = makeSandbox().appendingPathComponent("replacements.json")
    let rules = ReplacementStore(fileURL: rulesURL)
    rules.entries = [
        ReplacementEntry(trigger: "executorch", replacement: "ExecuTorch"),
    ]
    let stubFormatter = StubFormatterBridge(result: "should not be used")
    let pipeline = TextPipeline(
        replacementStore: rules,
        formatterBridge: stubFormatter,
        formatterPathsProvider: formatterPaths
    )

    let processed = await pipeline.process(
        "executorch raw text",
        smartFormattingEnabled: false
    )
    let recordedPrompts = await stubFormatter.prompts

    #expect(processed.outputText == "ExecuTorch raw text")
    #expect(processed.tags == ["replacement"])
    #expect(recordedPrompts.isEmpty)
}
/// When the formatter throws, the replacement-only text is returned and tagged as a
/// formatter fallback.
@Test
func formatterFailureFallsBackToReplacementOnlyText() async {
    let rulesURL = makeSandbox().appendingPathComponent("replacements.json")
    let rules = ReplacementStore(fileURL: rulesURL)
    rules.entries = [
        ReplacementEntry(trigger: "executorch", replacement: "ExecuTorch"),
    ]
    let failure = RunnerError.transcriptionFailed(description: "boom")
    let failingFormatter = StubFormatterBridge(error: failure)
    let pipeline = TextPipeline(
        replacementStore: rules,
        formatterBridge: failingFormatter,
        formatterPathsProvider: formatterPaths
    )

    let processed = await pipeline.process(
        "executorch fallback",
        smartFormattingEnabled: true
    )

    #expect(processed.outputText == "ExecuTorch fallback")
    #expect(processed.tags == ["replacement", "formatter-fallback"])
}
/// A 400-word transcript is returned unchanged, tagged "formatter-skipped-context", and
/// the formatter receives no prompt.
@Test
func longTranscriptSkipsFormatterBeforeContextOverflow() async {
    let rulesURL = makeSandbox().appendingPathComponent("replacements.json")
    let rules = ReplacementStore(fileURL: rulesURL)
    rules.entries = []
    let stubFormatter = StubFormatterBridge(result: "should not be used")
    let pipeline = TextPipeline(
        replacementStore: rules,
        formatterBridge: stubFormatter,
        formatterPathsProvider: formatterPaths
    )
    // 400 copies of "context" separated by single spaces.
    let oversized = repeatElement("context", count: 400).joined(separator: " ")

    let processed = await pipeline.process(
        oversized,
        smartFormattingEnabled: true
    )
    let recordedPrompts = await stubFormatter.prompts

    #expect(processed.outputText == oversized)
    #expect(processed.tags == ["formatter-skipped-context"])
    #expect(recordedPrompts.isEmpty)
}
/// With smart formatting on, the saved session keeps the raw text and stores the
/// formatter output after replacements, tagged "formatted" then "replacement".
@Test
func transcriptStorePersistsFormatterOutputWhenSmartFormattingIsEnabled() async throws {
    let sandbox = makeSandbox()

    let rules = ReplacementStore(fileURL: sandbox.appendingPathComponent("replacements.json"))
    rules.entries = [
        ReplacementEntry(trigger: "executorch", replacement: "ExecuTorch"),
    ]
    let pipeline = TextPipeline(
        replacementStore: rules,
        formatterBridge: StubFormatterBridge(result: "executorch rocks."),
        formatterPathsProvider: formatterPaths
    )

    let preferences = Preferences()
    preferences.enableSmartFormatting = true
    let store = TranscriptStore(
        preferences: preferences,
        downloader: ModelDownloader(),
        sessionsURL: sandbox.appendingPathComponent("sessions.json"),
        textPipeline: pipeline
    )

    await store.storeCompletedTranscription(rawText: "um executorch rocks", duration: 3)

    let persisted = try #require(store.sessions.first)
    #expect(persisted.rawTranscript == "um executorch rocks")
    #expect(persisted.transcript == "ExecuTorch rocks.")
    #expect(persisted.tags == ["formatted", "replacement"])
}
/// Test double for `FormatterBridgeClient` that records every prompt passed to `format`
/// and then either returns a canned text or throws a canned error.
actor StubFormatterBridge: FormatterBridgeClient {
    /// The canned outcome of `format`: the text to return or the error to throw.
    private let outcome: Result<String, Error>

    /// Prompts received by `format`, in call order.
    private(set) var prompts: [String] = []

    /// Creates a stub whose `format` calls succeed with `result`.
    init(result: String) {
        outcome = .success(result)
    }

    /// Creates a stub whose `format` calls throw `error`.
    init(error: Error) {
        outcome = .failure(error)
    }

    /// Always reports a warm formatter with no paths and the status "Formatter ready".
    func runtimeSnapshot() async -> FormatterBridge.RuntimeSnapshot {
        FormatterBridge.RuntimeSnapshot(
            state: .warm,
            runnerPath: nil,
            modelPath: nil,
            tokenizerPath: nil,
            tokenizerConfigPath: nil,
            statusMessage: "Formatter ready"
        )
    }

    /// No-op.
    func prepare(
        runnerPath: String,
        modelPath: String,
        tokenizerPath: String,
        tokenizerConfigPath: String
    ) async throws {}

    /// No-op.
    func shutdown() async {}

    /// Records `prompt`, then returns the canned text or throws the canned error.
    /// All other arguments are ignored.
    func format(
        runnerPath: String,
        modelPath: String,
        tokenizerPath: String,
        tokenizerConfigPath: String,
        prompt: String,
        maxNewTokens: Int,
        temperature: Double
    ) async throws -> FormatterBridge.FormatResult {
        // The prompt is recorded before a failure is thrown.
        prompts.append(prompt)
        let text = try outcome.get()
        return FormatterBridge.FormatResult(
            text: text,
            stdout: "",
            stderr: "",
            tokensPerSecond: nil
        )
    }
}
/// Captured audio goes to the runner through the PCM entry point exactly once, with the
/// captured bytes, and never through the audio-path entry point.
@Test
func transcribeCapturedAudioUsesPCMHelperPath() async throws {
    let runner = FakeRunnerBridge()
    let sessionsURL = makeSandbox().appendingPathComponent("sessions.json")
    let store = TranscriptStore(
        preferences: Preferences(),
        downloader: ModelDownloader(),
        sessionsURL: sessionsURL,
        runner: runner
    )
    let capturedAudio = makePCMData(sampleCount: 1600)

    let transcription = try await store.transcribeCapturedAudio(capturedAudio)
    let calls = await runner.snapshot()

    #expect(transcription.text == "direct-pcm")
    #expect(calls.audioPathCallCount == 0)
    #expect(calls.pcmCallCount == 1)
    #expect(calls.lastPCMData == capturedAudio)
}
/// When the runner fails during an import, the import reports failure, both selection
/// properties still point at the existing session, and an error is set on the store.
@Test
func importAudioFileRestoresSelectionWhenRunnerFails() async throws {
    let sandbox = makeSandbox()
    let preferences = try makeReadyPreferences(in: sandbox)

    // Seed the history file with one session that starts out selected.
    let sessionsURL = sandbox.appendingPathComponent("sessions.json")
    let existing = Session(title: "Existing", transcript: "saved", duration: 1)
    try JSONEncoder().encode([existing]).write(to: sessionsURL, options: .atomic)

    let workingDecoder = FakeImportedAudioDecoder(
        decodedAudio: .init(
            pcmData: makePCMData(sampleCount: 1_600),
            duration: 1
        )
    )
    let crashingRunner = FakeRunnerBridge(
        pcmError: RunnerError.runnerCrashed(exitCode: 1, stderr: "boom")
    )
    let store = TranscriptStore(
        preferences: preferences,
        downloader: ModelDownloader(),
        sessionsURL: sessionsURL,
        audioDecoder: workingDecoder,
        runner: crashingRunner
    )
    store.selectedSessionID = existing.id
    store.selectedHistorySessionIDs = [existing.id]

    let importedURL = sandbox.appendingPathComponent("broken.wav")
    let didImport = await store.importAudioFile(importedURL)

    #expect(didImport == false)
    #expect(store.selectedSessionID == existing.id)
    #expect(store.selectedHistorySessionIDs == [existing.id])
    #expect(store.currentError != nil)
}
sandbox.appendingPathComponent("sessions.json"), + runner: fakeRunner + ) + + await store.initialize() + await fakeRunner.forceRuntimeState(.unloaded) + await store.runHealthCheck() + let snapshot = await fakeRunner.snapshot() + + #expect(store.modelState == .ready) + #expect(store.helperState == .warm) + #expect(store.helperStatusMessage == "Model preloaded") + #expect(snapshot.prepareCallCount == 2) + } + + private func makePCMData(sampleCount: Int) -> Data { + var samples = (0...size) + } + + private func makeSandbox() -> URL { + let directory = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString, isDirectory: true) + try? FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + return directory + } + + private func makeReadyPreferences(in sandbox: URL) throws -> Preferences { + let suiteName = "TranscriptStoreLatencyTests.\(UUID().uuidString)" + let defaults = UserDefaults(suiteName: suiteName)! + defaults.removePersistentDomain(forName: suiteName) + + let preferences = Preferences(defaults: defaults) + let runnerPath = sandbox.appendingPathComponent("parakeet_helper").path(percentEncoded: false) + FileManager.default.createFile(atPath: runnerPath, contents: Data(), attributes: [.posixPermissions: 0o755]) + let modelDirectory = sandbox.appendingPathComponent("model", isDirectory: true) + try FileManager.default.createDirectory(at: modelDirectory, withIntermediateDirectories: true) + try Data("model".utf8).write(to: modelDirectory.appendingPathComponent("model.pte"), options: .atomic) + try Data("tokenizer".utf8).write(to: modelDirectory.appendingPathComponent("tokenizer.model"), options: .atomic) + + preferences.runnerPath = runnerPath + preferences.modelDirectory = modelDirectory.path(percentEncoded: false) + return preferences + } +} + +private actor FakeRunnerBridge: RunnerBridgeClient { + struct Snapshot: Sendable { + let audioPathCallCount: Int + let pcmCallCount: Int + let lastPCMData: Data? 
+ let prepareCallCount: Int + let shutdownCallCount: Int + let runtimeState: RunnerBridge.ResidencyState + } + + private var audioPathCallCount = 0 + private var pcmCallCount = 0 + private var lastPCMData: Data? + private var prepareCallCount = 0 + private var shutdownCallCount = 0 + private var runtimeState: RunnerBridge.ResidencyState = .unloaded + private let pcmError: Error? + + init(pcmError: Error? = nil) { + self.pcmError = pcmError + } + + func runtimeSnapshot() async -> RunnerBridge.RuntimeSnapshot { + RunnerBridge.RuntimeSnapshot( + state: runtimeState, + runnerPath: nil, + modelPath: nil, + tokenizerPath: nil + ) + } + + func prepare( + runnerPath: String, + modelPath: String, + tokenizerPath: String + ) async throws { + prepareCallCount += 1 + runtimeState = .warm + } + + func shutdown() async { + shutdownCallCount += 1 + runtimeState = .unloaded + } + + func transcribe( + runnerPath: String, + modelPath: String, + tokenizerPath: String, + audioPath: String, + options: RunnerBridge.RunOptions + ) async -> AsyncThrowingStream { + audioPathCallCount += 1 + return AsyncThrowingStream { continuation in + continuation.yield(.completed(.init( + text: "legacy-wav", + stdout: "", + stderr: "", + stats: nil, + runtimeProfile: nil + ))) + continuation.finish() + } + } + + func transcribePCM( + runnerPath: String, + modelPath: String, + tokenizerPath: String, + pcmData: Data, + options: RunnerBridge.RunOptions + ) async -> AsyncThrowingStream { + pcmCallCount += 1 + lastPCMData = pcmData + return AsyncThrowingStream { continuation in + if let pcmError { + continuation.finish(throwing: pcmError) + return + } + continuation.yield(.completed(.init( + text: "direct-pcm", + stdout: "", + stderr: "", + stats: nil, + runtimeProfile: nil + ))) + continuation.finish() + } + } + + func snapshot() -> Snapshot { + Snapshot( + audioPathCallCount: audioPathCallCount, + pcmCallCount: pcmCallCount, + lastPCMData: lastPCMData, + prepareCallCount: prepareCallCount, + 
shutdownCallCount: shutdownCallCount, + runtimeState: runtimeState + ) + } + + func forceRuntimeState(_ state: RunnerBridge.ResidencyState) { + runtimeState = state + } +} + +private struct FakeImportedAudioDecoder: ImportedAudioDecoding { + let decodedAudio: DecodedImportedAudioFile + + func decodeAudioFile(at url: URL) throws -> DecodedImportedAudioFile { + decodedAudio + } +} + +private struct FailingImportedAudioDecoder: ImportedAudioDecoding { + func decodeAudioFile(at url: URL) throws -> DecodedImportedAudioFile { + throw RunnerError.transcriptionFailed(description: "decode failed") + } +} diff --git a/ExecuWhisper/README.md b/ExecuWhisper/README.md new file mode 100644 index 0000000000..66bbd1f69f --- /dev/null +++ b/ExecuWhisper/README.md @@ -0,0 +1,229 @@ +# ExecuWhisper + +`ExecuWhisper` is a native macOS app for on-device dictation with Parakeet TDT on ExecuTorch + Metal and optional LFM2.5-350M formatting on ExecuTorch + MLX. It keeps the app workflow local: + +- record audio from the microphone +- stop recording +- keep `parakeet_helper` warm and send the captured PCM directly for transcription +- optionally rewrite the transcript with `lfm25_formatter_helper` +- save manual recording transcripts to local history or paste formatted dictation text + +Unlike `VoxtralRealtime`, this app still does **not** do live token streaming, wake-word detection, or `silero_vad`. System dictation is available in a batch-compatible form: the default shortcut is `Ctrl+Space`, users can customize it in Settings, and the overlay pastes the final formatted text when recording stops. 
+ +## Features + +- Record-then-transcribe flow with local microphone capture +- Auto-detected microphone selection for both manual recording and system dictation +- Batch-compatible system dictation with a customizable global shortcut and floating overlay +- First-launch model download from `younghan-meta/Parakeet-TDT-ExecuTorch-Metal` +- Single smart formatting prompt backed by `younghan-meta/LFM2.5-ExecuTorch-MLX` +- Searchable session history with rename, pinning, and recency grouping +- Text replacements for product names, acronyms, and domain terms +- Snippets for exact-match dictated templates +- Session export to `.txt`, `.json`, and `.srt` +- Lightweight DMG packaging by default, with optional bundled-model builds + +## Requirements + +- macOS 14.0+ +- Apple Silicon +- Xcode 16+ +- Conda +- `xcodegen` +- `libomp` + +Install the host tools: + +```bash +brew install xcodegen libomp +``` + +## Usage + +### First launch + +The default app build is intentionally small. On first launch, `ExecuWhisper` downloads: + +- `model.pte` +- `tokenizer.model` +- `formatter/lfm2_5_350m_mlx_4w.pte` +- `formatter/tokenizer.json` +- `formatter/tokenizer_config.json` + +into: + +```text +~/Library/Application Support/ExecuWhisper/models +``` + +Session history is stored at: + +```text +~/Library/Application Support/ExecuWhisper/sessions.json +``` + +Replacements are stored at: + +```text +~/Library/Application Support/ExecuWhisper/replacements.json +``` + +### Keyboard shortcuts + +| Shortcut | Action | +|---|---| +| `Cmd+Shift+R` | Start recording / stop and transcribe | +| `Cmd+Shift+C` | Copy the current transcript | +| `Ctrl+Space` | Toggle system dictation by default; change it in Settings | + +## Build From Source + +### 1. Activate the Metal environment + +```bash +conda create -n et-metal python=3.12 -y +conda activate et-metal +``` + +### 2. 
Build the Parakeet helper + +`parakeet_helper` is provided by [pytorch/executorch#18861](https://github.com/pytorch/executorch/pull/18861). Until that PR lands, check it out before building: + +```bash +cd ~/executorch +gh pr checkout https://github.com/pytorch/executorch/pull/18861 +make parakeet-metal +``` + +The helper is expected at: + +```text +~/executorch/cmake-out/examples/models/parakeet/parakeet_helper +``` + +### 3. Build the LFM2.5 formatter helper + +```bash +cd ~/executorch +make lfm_2_5_formatter-mlx +``` + +The helper is expected at: + +```text +~/executorch/cmake-out/examples/models/llama/lfm25_formatter_helper +``` + +### 4. Build the macOS app + +```bash +cd /path/to/executorch-examples/ExecuWhisper +./scripts/build.sh +``` + +That produces: + +```text +./build/Build/Products/Release/ExecuWhisper.app +``` + +### Optional: download or bundle models during the build + +Download model artifacts into `MODEL_DIR` before building: + +```bash +./scripts/build.sh --download-models +``` + +This downloads Parakeet from [younghan-meta/Parakeet-TDT-ExecuTorch-Metal](https://huggingface.co/younghan-meta/Parakeet-TDT-ExecuTorch-Metal) and the formatter artifacts from `younghan-meta/LFM2.5-ExecuTorch-MLX`. 
+ +Build a self-contained `.app` that already includes Parakeet and LFM2.5 formatter artifacts: + +```bash +./scripts/build.sh --bundle-models +``` + +You can override the default paths with: + +```bash +export EXECUTORCH_PATH="$HOME/executorch" +export PARAKEET_HELPER_PATH="$HOME/executorch/cmake-out/examples/models/parakeet/parakeet_helper" +export FORMATTER_HELPER_PATH="$HOME/executorch/cmake-out/examples/models/llama/lfm25_formatter_helper" +export FORMATTER_METALLIB_PATH="$HOME/executorch/cmake-out/examples/models/llama/mlx.metallib" +export MODEL_DIR="$HOME/parakeet_metal" +export FORMATTER_MODEL_DIR="$HOME/lfm2_5_mlx" +``` + +## Create A DMG + +After building the app: + +```bash +./scripts/create_dmg.sh \ + "./build/Build/Products/Release/ExecuWhisper.app" \ + "./ExecuWhisper.dmg" +``` + +Behavior: + +- If the app bundle contains only the helpers and runtime libraries, the DMG stays lightweight and the app downloads models on first launch. +- If the app bundle already contains Parakeet and LFM2.5 artifacts, the script validates all files and creates a bundled-model DMG. 
+ +## Run Tests + +```bash +xcodegen generate +xcodebuild \ + -project ExecuWhisper.xcodeproj \ + -scheme ExecuWhisper \ + -derivedDataPath build \ + -destination "platform=macOS" \ + test +``` + +Current regression coverage includes: + +- helper reuse and restart behavior in the warm bridge +- direct PCM handoff from the recorder into the helper +- preload and unload state handling for the helper lifecycle +- session compatibility for older `sessions.json` payloads +- replacement pipeline behavior +- LFM2.5 formatter prompt construction, protocol, bridge reuse, and fallback behavior +- session history grouping and pinning logic +- export rendering and file writing + +## Manual Latency Gate + +Use the helper benchmark to compare the first cold request against a second warm +request on the same helper process: + +```bash +python3 ./scripts/benchmark_helper.py \ + --helper "$HOME/executorch/cmake-out/examples/models/parakeet/parakeet_helper" \ + --model "$HOME/parakeet_metal/model.pte" \ + --tokenizer "$HOME/parakeet_metal/tokenizer.model" \ + --audio /path/to/16khz_mono_float32.wav +``` + +Notes: + +- The script exits non-zero if the warm request is not faster than the cold request. +- Pass `--min-speedup-ratio 0.2` to require at least a 20% warmup win. +- If you do not have a sample WAV handy, omit `--audio` and the script will use a generated synthetic clip. 
+ +## Troubleshooting + +- `Parakeet helper not found`: check out [pytorch/executorch#18861](https://github.com/pytorch/executorch/pull/18861), then run `conda activate et-metal && cd ~/executorch && make parakeet-metal` +- `LFM2.5 formatter helper not found`: run `conda activate et-mlx && cd ~/executorch && make lfm_2_5_formatter-mlx` +- `mlx.metallib not found`: rerun `conda activate et-mlx && cd ~/executorch && make lfm_2_5_formatter-mlx`; the app bundles `mlx.metallib` next to `lfm25_formatter_helper` +- `libomp.dylib not found`: run `brew install libomp` +- Model download fails on first launch: check network access and verify the Parakeet and LFM2.5 Hugging Face repos are reachable from your machine +- macOS repeatedly prompts for Accessibility access during Xcode development: use the Settings access prompt and grant `ExecuWhisper Paste Helper` (`org.pytorch.executorch.ExecuWhisper.PasteHelper`); ExecuWhisper installs this stable helper app under Application Support so rebuilds can keep auto-paste working. +- DMG script says bundled-model files are missing: rebuild with `./scripts/build.sh --bundle-models`, or create a lightweight DMG instead +- Existing history is visible even if model assets are currently missing: use the `Home` page to repair downloads while keeping old transcripts accessible from the sidebar +- To reset the macOS Accessibility permission (granted to the paste helper) and the Microphone permission (granted to the app) during testing: + +```bash +tccutil reset Accessibility org.pytorch.executorch.ExecuWhisper.PasteHelper +tccutil reset Microphone org.pytorch.executorch.ExecuWhisper +``` diff --git a/ExecuWhisper/docs/RELEASE_QA_CHECKLIST.md b/ExecuWhisper/docs/RELEASE_QA_CHECKLIST.md new file mode 100644 index 0000000000..dd9ff40610 --- /dev/null +++ b/ExecuWhisper/docs/RELEASE_QA_CHECKLIST.md @@ -0,0 +1,52 @@ +# ExecuWhisper Release QA Checklist + +Use this checklist before distributing an internal DMG. + +## Build And Package + +- Run `xcodebuild test -scheme ExecuWhisper -destination 'platform=macOS'`. 
+- Run `./scripts/build.sh`. +- Run `./scripts/create_dmg.sh ./build/Build/Products/Release/ExecuWhisper.app ./ExecuWhisper.dmg`. +- Run `./scripts/verify_release.sh ./ExecuWhisper.dmg`. + +## First Launch + +- Start from a clean machine or remove `~/Library/Application Support/ExecuWhisper/models`. +- Launch from the DMG-installed app. +- Verify that the model downloads complete. +- Verify that the Parakeet preload spinner appears and transitions to ready. +- Grant Microphone permission to ExecuWhisper. +- Grant Accessibility permission to ExecuWhisper Paste Helper. + +## Audio Devices + +Verify both manual Record and overlay dictation for each available device class: + +- Built-in Mac microphone. +- Bluetooth headset in hands-free mode. +- USB audio interface at 44.1 kHz. +- USB audio interface at 96 kHz, if available. +- Virtual or aggregate input such as Sokuji, BlackHole, or Loopback. + +For each device: + +- First dictation captures non-empty PCM. +- Two consecutive dictations both transcribe. +- Console shows `Audio recording engine bound` with the expected device. +- Console shows no `Format mismatch` or `Failed to create tap, config change pending!` messages. + +## Formatter + +- Dictate `does it feel like real-time processing?`; final text must remain a question. +- Dictate `Hello, can you hear me?`; final text must not contain prompt examples or `Options:`. +- Dictate a long input that exceeds the formatter context budget; output should fall back to the unformatted Parakeet transcript and include `formatter-skipped-context` in logs/session tags. + +## Long Running + +- Leave manual recording running until the max duration is reached; it should stop through the normal stop path. +- Confirm the temp capture file is removed after transcription. + +## Upgrade + +- Install over a previous ExecuWhisper build. +- Verify the paste helper is upgraded and Accessibility can still be granted to `org.pytorch.executorch.ExecuWhisper.PasteHelper`. 
diff --git a/ExecuWhisper/docs/SUPPORT_RUNBOOK.md b/ExecuWhisper/docs/SUPPORT_RUNBOOK.md new file mode 100644 index 0000000000..a82c64c9d8 --- /dev/null +++ b/ExecuWhisper/docs/SUPPORT_RUNBOOK.md @@ -0,0 +1,63 @@ +# ExecuWhisper Support Runbook + +## Logs + +Default logs hide transcript contents. + +```bash +log stream --predicate 'subsystem == "org.pytorch.executorch.ExecuWhisper"' --info +``` + +To debug transcript contents on a local machine: + +```bash +defaults write org.pytorch.executorch.ExecuWhisper EXECUWHISPER_DEBUG_LOG_TRANSCRIPTS -bool YES +``` + +Disable it after diagnosis: + +```bash +defaults delete org.pytorch.executorch.ExecuWhisper EXECUWHISPER_DEBUG_LOG_TRANSCRIPTS +``` + +## Reset Permissions + +```bash +tccutil reset Microphone org.pytorch.executorch.ExecuWhisper +tccutil reset Accessibility org.pytorch.executorch.ExecuWhisper.PasteHelper +``` + +## Reset Model Cache + +```bash +rm -rf "$HOME/Library/Application Support/ExecuWhisper/models" +``` + +The app downloads the models again on the next launch. + +## Common Symptoms + +### No audio captured + +Check the logs for: + +- `Audio recording engine bound` +- `Format mismatch` +- `Failed to create tap, config change pending!` +- selected microphone UID and device name + +If the selected device is virtual or Bluetooth, retry with the built-in microphone to isolate routing problems. + +### Auto-paste does not work + +Grant Accessibility to `ExecuWhisper Paste Helper`, not only to `ExecuWhisper`. + +### Formatter returns wrong text + +Look for these log entries: + +- `formatter-fallback` +- `formatter-skipped-context` +- `LFM2.5 output rejected by validator` + +With transcript debug logging disabled, support logs should not contain dictated text. 
diff --git a/ExecuWhisper/project.yml b/ExecuWhisper/project.yml new file mode 100644 index 0000000000..948e63e4cc --- /dev/null +++ b/ExecuWhisper/project.yml @@ -0,0 +1,216 @@ +name: ExecuWhisper +options: + bundleIdPrefix: org.pytorch.executorch + deploymentTarget: + macOS: "14.0" + xcodeVersion: "16.0" + createIntermediateGroups: true + generateEmptyDirectories: true + +settings: + base: + SWIFT_VERSION: "5.10" + MACOSX_DEPLOYMENT_TARGET: "14.0" + ENABLE_HARDENED_RUNTIME: YES + DEAD_CODE_STRIPPING: YES + # Our post-compile script copies helpers from outside the build dir + # (parakeet_helper / lfm25_formatter_helper) and runs codesign / install_name_tool. + # User script sandboxing would block those reads. Keep it explicitly off. + ENABLE_USER_SCRIPT_SANDBOXING: NO + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS: YES + LOCALIZATION_PREFERS_STRING_CATALOGS: YES + +schemes: + ExecuWhisper: + build: + targets: + ExecuWhisper: all + ExecuWhisperTests: [test] + run: + environmentVariables: + EXECUTORCH_PATH: /Users/younghan/executorch + PARAKEET_HELPER_PATH: /Users/younghan/project/executorch/cmake-out/examples/models/parakeet/parakeet_helper + FORMATTER_HELPER_PATH: /Users/younghan/executorch/cmake-out/examples/models/llama/lfm25_formatter_helper + test: + gatherCoverageData: true + targets: + - ExecuWhisperTests + +targets: + ExecuWhisper: + type: application + platform: macOS + sources: + - path: ExecuWhisper + excludes: + - "Resources/**" + - "Support/PasteHelper/**" + - path: ExecuWhisper/Resources/Assets.xcassets + - path: ExecuWhisper/Resources/model_manifest.json + settings: + base: + PRODUCT_BUNDLE_IDENTIFIER: org.pytorch.executorch.ExecuWhisper + PRODUCT_NAME: ExecuWhisper + INFOPLIST_FILE: ExecuWhisper/Info.plist + CODE_SIGN_ENTITLEMENTS: ExecuWhisper/ExecuWhisper.entitlements + CODE_SIGN_STYLE: Automatic + DEVELOPMENT_TEAM: XSDG2DKA58 + EXECUTORCH_PATH: /Users/younghan/executorch + PARAKEET_HELPER_PATH: 
/Users/younghan/project/executorch/cmake-out/examples/models/parakeet/parakeet_helper + FORMATTER_HELPER_PATH: /Users/younghan/executorch/cmake-out/examples/models/llama/lfm25_formatter_helper + GENERATE_INFOPLIST_FILE: YES + INFOPLIST_KEY_NSMicrophoneUsageDescription: "ExecuWhisper needs microphone access to record audio for on-device transcription." + COMBINE_HIDPI_IMAGES: YES + ASSETCATALOG_COMPILER_APPICON_NAME: AppIcon + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME: AccentColor + entitlements: + path: ExecuWhisper/ExecuWhisper.entitlements + properties: + com.apple.security.cs.disable-library-validation: true + com.apple.security.device.audio-input: true + com.apple.security.network.client: true + postCompileScripts: + - script: | + set -euo pipefail + + ET_PATH="${EXECUTORCH_PATH:-${HOME}/executorch}" + HELPER_SRC="${PARAKEET_HELPER_PATH:-${ET_PATH}/cmake-out/examples/models/parakeet/parakeet_helper}" + FORMATTER_HELPER_SRC="${FORMATTER_HELPER_PATH:-${ET_PATH}/cmake-out/examples/models/llama/lfm25_formatter_helper}" + MLX_METALLIB_SRC="${FORMATTER_METALLIB_PATH:-$(dirname "${FORMATTER_HELPER_SRC}")/mlx.metallib}" + MODEL_DIR="${MODEL_DIR:-${HOME}/parakeet_metal}" + FORMATTER_MODEL_DIR="${FORMATTER_MODEL_DIR:-${HOME}/lfm2_5_mlx}" + BUNDLE_MODEL_ARTIFACTS="${BUNDLE_MODEL_ARTIFACTS:-0}" + LIBOMP_HOMEBREW="/opt/homebrew/opt/libomp/lib/libomp.dylib" + LIBOMP_LLVM="/opt/llvm-openmp/lib/libomp.dylib" + DEST="${BUILT_PRODUCTS_DIR}/${CONTENTS_FOLDER_PATH}/Resources" + + mkdir -p "${DEST}" + + resolve_libomp() { + if [ -f "${LIBOMP_HOMEBREW}" ]; then + echo "${LIBOMP_HOMEBREW}" + elif [ -f "${LIBOMP_LLVM}" ]; then + echo "${LIBOMP_LLVM}" + fi + } + + copy_if_newer() { + local src="$1" dst="$2" + if [ ! -f "${src}" ]; then + echo "warning: Not found: ${src}" + return + fi + if [ ! 
-f "${dst}" ] || [ "${src}" -nt "${dst}" ]; then + cp -fL "${src}" "${dst}" + echo "Bundled $(basename "${dst}")" + fi + } + + copy_if_newer "${HELPER_SRC}" "${DEST}/parakeet_helper" + chmod +x "${DEST}/parakeet_helper" 2>/dev/null || true + copy_if_newer "${FORMATTER_HELPER_SRC}" "${DEST}/lfm25_formatter_helper" + chmod +x "${DEST}/lfm25_formatter_helper" 2>/dev/null || true + copy_if_newer "${MLX_METALLIB_SRC}" "${DEST}/mlx.metallib" + PASTE_HELPER_APP="${DEST}/ExecuWhisper Paste Helper.app" + PASTE_HELPER_CONTENTS="${PASTE_HELPER_APP}/Contents" + PASTE_HELPER_MACOS="${PASTE_HELPER_CONTENTS}/MacOS" + mkdir -p "${PASTE_HELPER_MACOS}" + xcrun swiftc "${PROJECT_DIR}/ExecuWhisper/Support/PasteHelper/main.swift" -o "${PASTE_HELPER_MACOS}/execuwhisper_paste_helper" + chmod +x "${PASTE_HELPER_MACOS}/execuwhisper_paste_helper" 2>/dev/null || true + cat > "${PASTE_HELPER_CONTENTS}/Info.plist" <<'PLIST' + + + + + CFBundleExecutable + execuwhisper_paste_helper + CFBundleIdentifier + org.pytorch.executorch.ExecuWhisper.PasteHelper + CFBundleName + ExecuWhisper Paste Helper + CFBundlePackageType + APPL + CFBundleShortVersionString + 1.0 + CFBundleVersion + 3 + LSBackgroundOnly + + + + PLIST + codesign --force --sign - --identifier "org.pytorch.executorch.ExecuWhisper.PasteHelper" "${PASTE_HELPER_APP}" 2>/dev/null || true + rm -f "${DEST}/parakeet_runner" + + LIBOMP_SRC="$(resolve_libomp)" + if [ -n "${LIBOMP_SRC:-}" ]; then + copy_if_newer "${LIBOMP_SRC}" "${DEST}/libomp.dylib" + else + echo "warning: libomp.dylib not found in expected locations" + fi + + # libc++ is provided by the macOS dyld shared cache; we no longer bundle it. 
+ rm -f "${DEST}/libc++.1.dylib" 2>/dev/null || true + + if [ "${BUNDLE_MODEL_ARTIFACTS}" = "1" ]; then + copy_if_newer "$MODEL_DIR/model.pte" "${DEST}/model.pte" + copy_if_newer "$MODEL_DIR/tokenizer.model" "${DEST}/tokenizer.model" + copy_if_newer "$FORMATTER_MODEL_DIR/lfm2_5_350m_mlx_4w.pte" "${DEST}/lfm2_5_350m_mlx_4w.pte" + copy_if_newer "$FORMATTER_MODEL_DIR/tokenizer.json" "${DEST}/tokenizer.json" + copy_if_newer "$FORMATTER_MODEL_DIR/tokenizer_config.json" "${DEST}/tokenizer_config.json" + else + rm -f "${DEST}/model.pte" "${DEST}/tokenizer.model" "${DEST}/lfm2_5_350m_mlx_4w.pte" "${DEST}/tokenizer.json" "${DEST}/tokenizer_config.json" + fi + + if [ -f "${DEST}/parakeet_helper" ] && [ -f "${DEST}/libomp.dylib" ]; then + install_name_tool -change /opt/homebrew/opt/libomp/lib/libomp.dylib @executable_path/libomp.dylib "${DEST}/parakeet_helper" 2>/dev/null || true + install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @executable_path/libomp.dylib "${DEST}/parakeet_helper" 2>/dev/null || true + install_name_tool -add_rpath @executable_path "${DEST}/parakeet_helper" 2>/dev/null || true + install_name_tool -add_rpath /usr/lib "${DEST}/parakeet_helper" 2>/dev/null || true + fi + + if [ -f "${DEST}/lfm25_formatter_helper" ] && [ -f "${DEST}/libomp.dylib" ]; then + install_name_tool -change /opt/homebrew/opt/libomp/lib/libomp.dylib @executable_path/libomp.dylib "${DEST}/lfm25_formatter_helper" 2>/dev/null || true + install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @executable_path/libomp.dylib "${DEST}/lfm25_formatter_helper" 2>/dev/null || true + install_name_tool -add_rpath @executable_path "${DEST}/lfm25_formatter_helper" 2>/dev/null || true + install_name_tool -add_rpath /usr/lib "${DEST}/lfm25_formatter_helper" 2>/dev/null || true + fi + + name: Bundle Helper and Model Artifacts + basedOnDependencyAnalysis: false + ExecuWhisperTests: + type: bundle.unit-test + platform: macOS + sources: + - path: ExecuWhisper/Models/DictationShortcut.swift + - 
path: ExecuWhisper/Models/Preferences.swift + - path: ExecuWhisper/Models/ReplacementEntry.swift + - path: ExecuWhisper/Models/Session.swift + - path: ExecuWhisper/Models/TranscriptStore.swift + - path: ExecuWhisper/Utilities/PersistencePaths.swift + - path: ExecuWhisper/Utilities/SessionHistory.swift + - path: ExecuWhisper/Utilities/SessionExportFormat.swift + - path: ExecuWhisper/Utilities/RunnerError.swift + - path: ExecuWhisper/Utilities/DiagnosticLogging.swift + - path: ExecuWhisper/Services/AudioRecorder.swift + - path: ExecuWhisper/Services/DictationManager.swift + - path: ExecuWhisper/Services/FormatterBridge.swift + - path: ExecuWhisper/Services/FormatterHelperProtocol.swift + - path: ExecuWhisper/Services/GlobalHotKeyManager.swift + - path: ExecuWhisper/Services/HealthCheck.swift + - path: ExecuWhisper/Services/ImportedAudioDecoder.swift + - path: ExecuWhisper/Services/FormatterPromptBuilder.swift + - path: ExecuWhisper/Services/ModelDownloader.swift + - path: ExecuWhisper/Services/ParakeetHelperProtocol.swift + - path: ExecuWhisper/Services/PasteController.swift + - path: ExecuWhisper/Services/ReplacementStore.swift + - path: ExecuWhisper/Services/RunnerBridge.swift + - path: ExecuWhisper/Services/TextPipeline.swift + - path: ExecuWhisper/Views/AudioLevelView.swift + - path: ExecuWhisper/Views/DictationOverlayView.swift + - path: ExecuWhisper/Views/DictationPanel.swift + - path: ExecuWhisperTests + settings: + base: + PRODUCT_BUNDLE_IDENTIFIER: org.pytorch.executorch.ExecuWhisperTests + GENERATE_INFOPLIST_FILE: YES diff --git a/ExecuWhisper/scripts/benchmark_helper.py b/ExecuWhisper/scripts/benchmark_helper.py new file mode 100644 index 0000000000..d3b2e54934 --- /dev/null +++ b/ExecuWhisper/scripts/benchmark_helper.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# + +import argparse +import json +import math +import os +import struct +import subprocess +import sys +import time +import uuid +from typing import Tuple + + +PROTOCOL_VERSION = 1 +DEFAULT_SAMPLE_RATE = 16_000 + + +def parse_args() -> argparse.Namespace: + home = os.path.expanduser("~") + default_helper = os.path.join(home, "executorch", "cmake-out", "examples", "models", "parakeet", "parakeet_helper") + default_model = os.path.join(home, "parakeet_metal", "model.pte") + default_tokenizer = os.path.join(home, "parakeet_metal", "tokenizer.model") + + parser = argparse.ArgumentParser(description="Benchmark cold vs warm Parakeet helper latency.") + parser.add_argument("--helper", default=default_helper, help="Path to parakeet_helper") + parser.add_argument("--model", default=default_model, help="Path to model.pte") + parser.add_argument("--tokenizer", default=default_tokenizer, help="Path to tokenizer.model") + parser.add_argument( + "--audio", + help="Optional path to a 16kHz mono float32 WAV file. If omitted, a synthetic clip is generated.", + ) + parser.add_argument( + "--synthetic-duration-s", + type=float, + default=2.5, + help="Duration of the generated synthetic clip when --audio is omitted.", + ) + parser.add_argument( + "--min-speedup-ratio", + type=float, + default=0.0, + help="Optional minimum fractional warm-speedup required. 
0.2 means warm must be at least 20%% faster.", + ) + return parser.parse_args() + + +def load_float32_mono_wav(path: str) -> bytes: + with open(path, "rb") as handle: + data = handle.read() + + if data[0:4] != b"RIFF" or data[8:12] != b"WAVE": + raise ValueError(f"{path} is not a RIFF/WAVE file") + + offset = 12 + fmt_chunk = None + pcm_chunk = None + while offset + 8 <= len(data): + chunk_id = data[offset:offset + 4] + chunk_size = struct.unpack_from(" len(data): + raise ValueError(f"{path} has a truncated {chunk_id.decode('ascii', 'ignore')} chunk") + + if chunk_id == b"fmt ": + fmt_chunk = data[chunk_start:chunk_end] + elif chunk_id == b"data": + pcm_chunk = data[chunk_start:chunk_end] + + offset = chunk_end + (chunk_size % 2) + + if fmt_chunk is None or pcm_chunk is None: + raise ValueError(f"{path} is missing fmt or data chunks") + + audio_format, channels, sample_rate = struct.unpack_from(" bytes: + sample_count = max(1, int(duration_s * DEFAULT_SAMPLE_RATE)) + amplitude = 0.08 + frequency_hz = 220.0 + samples = bytearray() + for index in range(sample_count): + sample = amplitude * math.sin(2.0 * math.pi * frequency_hz * (index / DEFAULT_SAMPLE_RATE)) + samples.extend(struct.pack(" subprocess.Popen[bytes]: + return subprocess.Popen( + [helper_path, "--model_path", model_path, "--tokenizer_path", tokenizer_path], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=0, + ) + + +def read_json_line(stream) -> dict: + line = stream.readline() + if not line: + raise RuntimeError("Helper closed stdout unexpectedly") + try: + return json.loads(line.decode("utf-8")) + except json.JSONDecodeError as exc: + raise RuntimeError(f"Helper emitted invalid JSON: {line!r}") from exc + + +def wait_for_ready(process: subprocess.Popen[bytes]) -> None: + message = read_json_line(process.stdout) + if message.get("type") != "ready": + raise RuntimeError(f"Expected ready message, got: {message}") + + +def run_request(process: subprocess.Popen[bytes], 
pcm_data: bytes) -> Tuple[float, dict]: + request_id = uuid.uuid4().hex + header = { + "type": "transcribe", + "version": PROTOCOL_VERSION, + "request_id": request_id, + "audio": { + "encoding": "f32le", + "sample_rate": DEFAULT_SAMPLE_RATE, + "channel_count": 1, + "payload_byte_count": len(pcm_data), + }, + "enable_runtime_profile": True, + } + + start = time.perf_counter() + process.stdin.write(json.dumps(header).encode("utf-8") + b"\n") + process.stdin.write(pcm_data) + process.stdin.flush() + + while True: + message = read_json_line(process.stdout) + message_type = message.get("type") + if message_type == "status": + continue + if message_type == "result" and message.get("request_id") == request_id: + elapsed_ms = (time.perf_counter() - start) * 1000.0 + return elapsed_ms, message + if message_type == "error": + raise RuntimeError(message.get("details") or message.get("message") or "Helper returned an error") + + +def shutdown_helper(process: subprocess.Popen[bytes]) -> None: + if process.stdin is not None: + try: + process.stdin.write(json.dumps({"type": "shutdown", "version": PROTOCOL_VERSION}).encode("utf-8") + b"\n") + process.stdin.flush() + except BrokenPipeError: + pass + + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.terminate() + process.wait(timeout=5) + + +def main() -> int: + args = parse_args() + + if not os.path.isfile(args.helper): + raise FileNotFoundError(f"Helper not found: {args.helper}") + if not os.path.isfile(args.model): + raise FileNotFoundError(f"Model not found: {args.model}") + if not os.path.isfile(args.tokenizer): + raise FileNotFoundError(f"Tokenizer not found: {args.tokenizer}") + + if args.audio: + pcm_data = load_float32_mono_wav(args.audio) + audio_description = args.audio + else: + pcm_data = generate_synthetic_pcm(args.synthetic_duration_s) + audio_description = f"synthetic {args.synthetic_duration_s:.2f}s tone" + + process = start_helper(args.helper, args.model, args.tokenizer) + stderr_output 
= b"" + try: + wait_for_ready(process) + cold_ms, cold_result = run_request(process, pcm_data) + warm_ms, warm_result = run_request(process, pcm_data) + finally: + shutdown_helper(process) + if process.stderr is not None: + try: + stderr_output = process.stderr.read() + except Exception: + stderr_output = b"" + + speedup = 0.0 + if cold_ms > 0: + speedup = max(0.0, (cold_ms - warm_ms) / cold_ms) + + print(f"Audio source: {audio_description}") + print(f"Cold request: {cold_ms:.1f} ms") + print(f"Warm request: {warm_ms:.1f} ms") + print(f"Warm speedup: {speedup * 100.0:.1f}%") + + cold_profile = cold_result.get("runtime_profile") + warm_profile = warm_result.get("runtime_profile") + if cold_profile: + print(f"Cold runtime profile: {cold_profile}") + if warm_profile: + print(f"Warm runtime profile: {warm_profile}") + + if stderr_output.strip(): + print("\nHelper stderr:") + print(stderr_output.decode("utf-8", errors="replace")) + + if warm_ms >= cold_ms: + print("\nFAIL: warm request was not faster than cold request.", file=sys.stderr) + return 1 + if speedup < args.min_speedup_ratio: + print( + f"\nFAIL: warm speedup {speedup * 100.0:.1f}% is below the required {(args.min_speedup_ratio * 100.0):.1f}%.", + file=sys.stderr, + ) + return 1 + + print("\nPASS: warm request beat the cold request.") + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except Exception as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(1) diff --git a/ExecuWhisper/scripts/build.sh b/ExecuWhisper/scripts/build.sh new file mode 100755 index 0000000000..15d07399e3 --- /dev/null +++ b/ExecuWhisper/scripts/build.sh @@ -0,0 +1,286 @@ +#!/usr/bin/env bash +# +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build the ExecuWhisper macOS app. 
+#
+# By default this builds a lightweight app bundle that downloads the model on
+# first launch. Pass --bundle-models if you want to embed model artifacts into
+# the app bundle for offline testing/distribution.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+export EXECUTORCH_PATH="${EXECUTORCH_PATH:-${HOME}/executorch}"
+export MODEL_DIR="${MODEL_DIR:-${HOME}/parakeet_metal}"
+export FORMATTER_MODEL_DIR="${FORMATTER_MODEL_DIR:-${HOME}/lfm2_5_mlx}"
+export PARAKEET_HELPER_PATH="${PARAKEET_HELPER_PATH:-${EXECUTORCH_PATH}/cmake-out/examples/models/parakeet/parakeet_helper}"
+export FORMATTER_HELPER_PATH="${FORMATTER_HELPER_PATH:-${EXECUTORCH_PATH}/cmake-out/examples/models/llama/lfm25_formatter_helper}"
+export FORMATTER_METALLIB_PATH="${FORMATTER_METALLIB_PATH:-$(dirname "${FORMATTER_HELPER_PATH}")/mlx.metallib}"
+LIBOMP_HOMEBREW="/opt/homebrew/opt/libomp/lib/libomp.dylib"
+LIBOMP_LLVM="/opt/llvm-openmp/lib/libomp.dylib"
+EXPECTED_CONDA_ENV="et-metal"
+
+BUILD_DIR="${PROJECT_DIR}/build"
+SCHEME="ExecuWhisper"
+CONFIG="Release"
+APP_NAME="ExecuWhisper"
+
+DOWNLOAD_MODELS=false
+BUNDLE_MODELS=false
+CHECK_ONLY=false
+
+for arg in "$@"; do
+  case "${arg}" in
+    --download-models) DOWNLOAD_MODELS=true ;;
+    --bundle-models) BUNDLE_MODELS=true ;;
+    --check) CHECK_ONLY=true ;;
+    -h|--help)
+      echo "Usage: ./scripts/build.sh [--download-models] [--bundle-models] [--check]"
+      echo ""
+      echo "Builds the ExecuWhisper macOS app."
+      echo ""
+      echo "Options:"
+      echo "  --download-models   Download Parakeet and LFM2.5 artifacts before building"
+      echo "  --bundle-models     Copy ASR and formatter model artifacts into the app bundle"
+      echo "  --check             Verify generated project settings and exit"
+      echo "  -h, --help          Show this help message"
+      echo ""
+      echo "Environment variables:"
+      echo "  EXECUTORCH_PATH          Path to executorch repo (default: ~/executorch)"
+      echo "  PARAKEET_HELPER_PATH     Path to parakeet_helper (default: EXECUTORCH_PATH/cmake-out/examples/models/parakeet/parakeet_helper)"
+      echo "  FORMATTER_HELPER_PATH    Path to lfm25_formatter_helper (default: EXECUTORCH_PATH/cmake-out/examples/models/llama/lfm25_formatter_helper)"
+      echo "  FORMATTER_METALLIB_PATH  Path to mlx.metallib (default: directory of FORMATTER_HELPER_PATH/mlx.metallib)"
+      echo "  MODEL_DIR                Path to Parakeet model artifacts (default: ~/parakeet_metal)"
+      echo "  FORMATTER_MODEL_DIR      Path to LFM2.5 formatter artifacts (default: ~/lfm2_5_mlx)"
+      echo ""
+      echo "Typical local setup:"
+      echo "  cd ~/executorch"
+      echo "  gh pr checkout https://github.com/pytorch/executorch/pull/18861  # until parakeet_helper lands"
+      echo "  conda activate et-metal"
+      echo "  make parakeet-metal"
+      echo "  make lfm_2_5_formatter-mlx"
+      echo "  cd ${PROJECT_DIR}"
+      echo "  ./scripts/build.sh"
+      echo ""
+      echo "Create a DMG after building:"
+      echo "  ./scripts/create_dmg.sh \"./build/Build/Products/Release/ExecuWhisper.app\" \"./ExecuWhisper.dmg\""
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: ${arg}" >&2
+      exit 1
+      ;;
+  esac
+done
+
+echo ""
+echo "=== ExecuWhisper Build ==="
+echo ""
+
+echo "--- Step 0: Checking environment ---"
+if [[ -z "${CONDA_DEFAULT_ENV:-}" ]]; then
+  echo "WARNING: No conda environment is active." >&2
+  echo "         Expected: ${EXPECTED_CONDA_ENV}" >&2
+elif [[ "${CONDA_DEFAULT_ENV}" != "${EXPECTED_CONDA_ENV}" ]]; then
+  echo "WARNING: Active conda env is '${CONDA_DEFAULT_ENV}', expected '${EXPECTED_CONDA_ENV}'." >&2
+fi
+
+ERRORS=()  # prerequisite problems are collected here and reported together below
+
+if ! command -v xcodegen >/dev/null 2>&1; then
+  ERRORS+=("xcodegen not found - install with: brew install xcodegen")
+fi
+
+if ! command -v xcodebuild >/dev/null 2>&1; then
+  ERRORS+=("xcodebuild not found - install Xcode from the App Store")
+fi
+
+if [[ ! -d "${EXECUTORCH_PATH}" ]]; then
+  ERRORS+=("ExecuTorch repo not found at ${EXECUTORCH_PATH}")
+fi
+
+if [[ ! -f "${PARAKEET_HELPER_PATH}" ]]; then
+  ERRORS+=("Parakeet helper not found at ${PARAKEET_HELPER_PATH}")
+  ERRORS+=("  Build it from pytorch/executorch#18861 with: cd ${EXECUTORCH_PATH} && gh pr checkout https://github.com/pytorch/executorch/pull/18861 && conda activate et-metal && make parakeet-metal")
+fi
+
+if [[ ! -f "${FORMATTER_HELPER_PATH}" ]]; then
+  ERRORS+=("LFM2.5 formatter helper not found at ${FORMATTER_HELPER_PATH}")
+  ERRORS+=("  Build it with: conda activate et-mlx && cd ${EXECUTORCH_PATH} && make lfm_2_5_formatter-mlx")
+fi
+
+if [[ ! -f "${FORMATTER_METALLIB_PATH}" ]]; then
+  ERRORS+=("MLX metallib not found at ${FORMATTER_METALLIB_PATH}")
+  ERRORS+=("  Rebuild the formatter helper with: conda activate et-mlx && cd ${EXECUTORCH_PATH} && make lfm_2_5_formatter-mlx")
+fi
+
+if [[ ! -f "${LIBOMP_HOMEBREW}" && ! -f "${LIBOMP_LLVM}" ]]; then
+  ERRORS+=("libomp.dylib not found in expected locations")
+  ERRORS+=("  Install it with: brew install libomp")
+fi
+
+if [[ "${DOWNLOAD_MODELS}" == true ]]; then
+  echo "--- Step 1: Downloading models ---"
+  if ! command -v hf >/dev/null 2>&1; then
+    ERRORS+=("The 'hf' CLI is required for --download-models. Install with: pip install huggingface_hub")
+  else
+    hf download younghan-meta/Parakeet-TDT-ExecuTorch-Metal --local-dir "${MODEL_DIR}"
+    hf download younghan-meta/LFM2.5-ExecuTorch-MLX \
+      lfm2_5_350m_mlx_4w.pte tokenizer.json tokenizer_config.json \
+      --local-dir "${FORMATTER_MODEL_DIR}"
+    echo "Downloaded Parakeet artifacts to ${MODEL_DIR}"
+    echo "Downloaded LFM2.5 formatter artifacts to ${FORMATTER_MODEL_DIR}"
+  fi
+fi
+
+if [[ "${BUNDLE_MODELS}" == true ]]; then
+  for file in model.pte tokenizer.model; do
+    if [[ ! -f "${MODEL_DIR}/${file}" ]]; then
+      ERRORS+=("Missing ${MODEL_DIR}/${file} required for --bundle-models")
+    fi
+  done
+  for file in lfm2_5_350m_mlx_4w.pte tokenizer.json tokenizer_config.json; do
+    if [[ ! -f "${FORMATTER_MODEL_DIR}/${file}" ]]; then
+      ERRORS+=("Missing ${FORMATTER_MODEL_DIR}/${file} required for --bundle-models")
+    fi
+  done
+fi
+
+if [[ ${#ERRORS[@]} -gt 0 ]]; then
+  echo "" >&2
+  echo "ERROR: Missing prerequisites:" >&2
+  for error in "${ERRORS[@]}"; do
+    echo "  - ${error}" >&2
+  done
+  exit 1
+fi
+
+echo "xcodegen: $(command -v xcodegen)"
+echo "xcodebuild: $(command -v xcodebuild)"
+echo "ExecuTorch: ${EXECUTORCH_PATH}"
+echo "Parakeet helper: ${PARAKEET_HELPER_PATH}"
+echo "Formatter helper: ${FORMATTER_HELPER_PATH}"
+echo "Formatter metallib: ${FORMATTER_METALLIB_PATH}"
+echo "Bundle models: ${BUNDLE_MODELS}"
+echo ""
+
+echo "--- Step 2: Generating Xcode project ---"
+cd "${PROJECT_DIR}"
+xcodegen generate
+echo "Generated ${SCHEME}.xcodeproj"
+echo ""
+
+if [[ "${CHECK_ONLY}" == true ]]; then
+  echo "--- Step 3: Verifying project settings ---"
+  ./scripts/verify_project_settings.sh
+  exit 0
+fi
+
+echo "--- Step 3: Building app ---"
+mkdir -p "${BUILD_DIR}"
+BUILD_LOG="${BUILD_DIR}/build.log"
+
+set +e  # the xcodebuild status is inspected by hand so the log tail can be shown on failure
+BUNDLE_MODEL_ARTIFACTS=$([[ "${BUNDLE_MODELS}" == true ]] && echo 1 || echo 0) \
+xcodebuild \
+  -project "${SCHEME}.xcodeproj" \
+  -scheme "${SCHEME}" \
+  -configuration "${CONFIG}" \
+  -derivedDataPath "${BUILD_DIR}" \
+  build \
+  > "${BUILD_LOG}" 2>&1
+BUILD_EXIT=$?
+set -e
+
+if [[ ${BUILD_EXIT} -ne 0 ]]; then
+  echo "" >&2
+  echo "ERROR: xcodebuild failed (exit code ${BUILD_EXIT})." >&2
+  echo "Last 30 lines:" >&2
+  tail -n 30 "${BUILD_LOG}" >&2
+  echo "" >&2
+  echo "Full log: ${BUILD_LOG}" >&2
+  exit 1
+fi
+
+APP_PATH="${BUILD_DIR}/Build/Products/${CONFIG}/${APP_NAME}.app"
+if [[ ! -d "${APP_PATH}" ]]; then
+  echo "ERROR: Build succeeded but app not found at ${APP_PATH}" >&2
+  echo "Full log: ${BUILD_LOG}" >&2
+  exit 1
+fi
+
+SIGNING_IDENTITY="$(
+  xcodebuild \
+    -project "${SCHEME}.xcodeproj" \
+    -scheme "${SCHEME}" \
+    -configuration "${CONFIG}" \
+    -showBuildSettings 2>/dev/null \
+    | awk -F= '
+        /EXPANDED_CODE_SIGN_IDENTITY =/ { gsub(/^[ \t]+|[ \t]+$/, "", $2); if ($2 != "") expanded=$2 }
+        /CODE_SIGN_IDENTITY =/ { gsub(/^[ \t]+|[ \t]+$/, "", $2); if ($2 != "") identity=$2 }
+        END { if (expanded != "") print expanded; else if (identity != "") print identity }
+      '
+)"
+DEVELOPMENT_TEAM="$(
+  xcodebuild \
+    -project "${SCHEME}.xcodeproj" \
+    -scheme "${SCHEME}" \
+    -configuration "${CONFIG}" \
+    -showBuildSettings 2>/dev/null \
+    | awk -F= '/DEVELOPMENT_TEAM =/ { gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2; exit }'
+)"
+
+if [[ "${SIGNING_IDENTITY}" == "Apple Development" && -n "${DEVELOPMENT_TEAM}" ]]; then
+  RESOLVED_IDENTITY="$(
+    security find-identity -v -p codesigning 2>/dev/null \
+      | awk -v team="(${DEVELOPMENT_TEAM})" '
+          index($0, team) && $0 !~ /CSSMERR/ {
+            sub(/^[[:space:]]*[0-9]+\)[[:space:]]*/, "", $0)
+            print $1
+            exit
+          }
+        '
+  )"
+  if [[ -n "${RESOLVED_IDENTITY}" ]]; then
+    SIGNING_IDENTITY="${RESOLVED_IDENTITY}"
+  fi
+fi
+if [[ "${SIGNING_IDENTITY}" == "Apple Development" ]]; then
+  RESOLVED_IDENTITY="$(
+    security find-identity -v -p codesigning 2>/dev/null \
+      | awk '
+          /Apple Development:/ && $0 !~ /CSSMERR/ {
+            sub(/^[[:space:]]*[0-9]+\)[[:space:]]*/, "", $0)
+            print $1
+            exit
+          }
+        '
+  )"
+  if [[ -n "${RESOLVED_IDENTITY}" ]]; then
+    SIGNING_IDENTITY="${RESOLVED_IDENTITY}"
+  fi
+fi
+
+if [[ -n "${SIGNING_IDENTITY}" && "${SIGNING_IDENTITY}" != "-" ]]; then
+  echo "--- Step 4: Signing bundled helpers ---"
+  ./scripts/sign_release.sh "${APP_PATH}" "${SIGNING_IDENTITY}"
+else
+  echo "--- Step 4: Skipping helper signing (ad-hoc or unsigned build) ---"
+fi
+
+echo "Built app: ${APP_PATH}"
+echo "Build log: ${BUILD_LOG}"
+echo ""
+echo "On first launch, ExecuWhisper downloads model artifacts into:"
+echo "  ~/Library/Application Support/ExecuWhisper/models"
+echo ""
+echo "To create a DMG:"
+echo "  ./scripts/create_dmg.sh \"${APP_PATH}\" \"${PROJECT_DIR}/ExecuWhisper.dmg\""
+echo ""
diff --git a/ExecuWhisper/scripts/create_dmg.sh b/ExecuWhisper/scripts/create_dmg.sh
new file mode 100755
index 0000000000..af2cefab8f
--- /dev/null
+++ b/ExecuWhisper/scripts/create_dmg.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Package a built ExecuWhisper.app into a compressed, drag-to-install DMG.
+# Usage: create_dmg.sh /path/to/ExecuWhisper.app /path/to/output.dmg [Volume Name]
+#
+set -euo pipefail
+
+APP_PATH="${1:-}"
+DMG_PATH="${2:-}"
+VOLUME_NAME="${3:-ExecuWhisper}"
+
+if [[ -z "${APP_PATH}" || -z "${DMG_PATH}" ]]; then
+  echo "Usage: $(basename "$0") /path/to/ExecuWhisper.app /path/to/output.dmg [Volume Name]" >&2
+  exit 1
+fi
+
+if [[ ! -d "${APP_PATH}" ]]; then
+  echo "Error: App not found: ${APP_PATH}" >&2
+  exit 1
+fi
+
+# Helper binaries and resources that every build must carry.
+RESOURCES="${APP_PATH}/Contents/Resources"
+REQUIRED_FILES=(
+  "parakeet_helper"
+  "lfm25_formatter_helper"
+  "mlx.metallib"
+  "libomp.dylib"
+)
+REQUIRED_DIRS=(
+  "ExecuWhisper Paste Helper.app"
+)
+
+MISSING=()
+for file in "${REQUIRED_FILES[@]}"; do
+  if [[ ! -f "${RESOURCES}/${file}" ]]; then
+    MISSING+=("${file}")
+  fi
+done
+for entry in "${REQUIRED_DIRS[@]}"; do
+  if [[ ! -d "${RESOURCES}/${entry}" ]]; then
+    MISSING+=("${entry}")
+  fi
+done
+
+if [[ ${#MISSING[@]} -gt 0 ]]; then
+  echo "Error: The following required entries are missing from ${RESOURCES}:" >&2
+  for entry in "${MISSING[@]}"; do
+    echo "  - ${entry}" >&2
+  done
+  exit 1
+fi
+
+# Any one model artifact marks a bundled-model build, which must then carry the full set.
+HAS_MODEL=false
+if [[ -f "${RESOURCES}/model.pte" || -f "${RESOURCES}/tokenizer.model" || -f "${RESOURCES}/lfm2_5_350m_mlx_4w.pte" ]]; then
+  HAS_MODEL=true
+  for file in model.pte tokenizer.model lfm2_5_350m_mlx_4w.pte tokenizer.json tokenizer_config.json; do
+    if [[ ! -f "${RESOURCES}/${file}" ]]; then
+      echo "Error: ${file} is missing from ${RESOURCES}." >&2
+      echo "The app appears to be a bundled-model build, so all ASR and formatter model files must be present." >&2
+      exit 1
+    fi
+  done
+fi
+
+if [[ "${HAS_MODEL}" == true ]]; then
+  echo "✓ Creating self-contained bundled-model DMG"
+else
+  echo "✓ Creating lightweight DMG (model downloads on first launch)"
+fi
+
+APP_NAME="$(basename "${APP_PATH}")"
+WORK_DIR="$(mktemp -d)"
+STAGING_DIR="${WORK_DIR}/staging"
+DMG_RW="${WORK_DIR}/tmp.dmg"
+
+cleanup() {
+  rm -rf "${WORK_DIR}"
+}
+trap cleanup EXIT
+
+mkdir -p "${STAGING_DIR}"
+cp -R "${APP_PATH}" "${STAGING_DIR}/"
+ln -s /Applications "${STAGING_DIR}/Applications"
+cat > "${STAGING_DIR}/READ ME FIRST.txt" <<'EOF'
+ExecuWhisper internal install notes
+
+1. Drag ExecuWhisper.app to Applications.
+2. First launch may show a Gatekeeper warning because this internal build uses an Apple Development certificate.
+   Right-click ExecuWhisper.app and choose Open, then Open again.
+   If needed:
+     xattr -d com.apple.quarantine "/Applications/ExecuWhisper.app"
+3. Grant Microphone permission to ExecuWhisper.
+4. Grant Accessibility permission to ExecuWhisper Paste Helper when prompted. This enables auto-paste.
+5. The lightweight build downloads model files on first launch into:
+   ~/Library/Application Support/ExecuWhisper/models
+EOF
+
+# Build a read-write image first so the Finder window layout can be customized.
+hdiutil create -volname "${VOLUME_NAME}" -srcfolder "${STAGING_DIR}" -ov -format UDRW "${DMG_RW}" >/dev/null
+
+DEVICE=""
+if ATTACH_OUTPUT="$(hdiutil attach -readwrite -noverify -noautoopen "${DMG_RW}" 2>/dev/null)"; then
+  DEVICE="$(printf "%s\n" "${ATTACH_OUTPUT}" | awk 'NR==1{print $1}')"
+  if [[ -n "${DEVICE}" ]]; then
+    # Layout is cosmetic: a failure (e.g. no Finder session) is reported but never fatal.
+    osascript <<EOF >/dev/null 2>&1 && echo "✓ DMG window layout configured" || echo "· Skipped DMG window layout (Finder unavailable)"
+tell application "Finder"
+  tell disk "${VOLUME_NAME}"
+    open
+    set current view of container window to icon view
+    set toolbar visible of container window to false
+    set statusbar visible of container window to false
+    set the bounds of container window to {100, 100, 700, 420}
+    set icon size of icon view options of container window to 128
+    set arrangement of icon view options of container window to not arranged
+    set position of item "${APP_NAME}" of container window to {150, 200}
+    set position of item "Applications" of container window to {500, 200}
+    update without registering applications
+    delay 1
+    close
+  end tell
+end tell
+EOF
+    # Retry with -force so a busy volume does not leave the image attached for the convert step.
+    if ! hdiutil detach "${DEVICE}" >/dev/null 2>&1; then
+      hdiutil detach -force "${DEVICE}" >/dev/null 2>&1 || true
+    fi
+  fi
+else
+  echo "· Skipped DMG layout configuration (could not attach read-write image in this context)"
+fi
+
+if [[ -e "${DMG_PATH}" ]]; then
+  rm -f "${DMG_PATH}"
+fi
+
+hdiutil convert "${DMG_RW}" -format UDZO -o "${DMG_PATH}" >/dev/null
+
+DMG_SIZE="$(du -sh "${DMG_PATH}" | cut -f1)"
+echo "✓ Created ${DMG_PATH} (${DMG_SIZE})"
diff --git a/ExecuWhisper/scripts/sign_release.sh b/ExecuWhisper/scripts/sign_release.sh
new file mode 100755
index 0000000000..5179d24c8e
--- /dev/null
+++ b/ExecuWhisper/scripts/sign_release.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+#
+# Re-sign the helpers bundled in ExecuWhisper.app and then the app itself.
+# Usage: sign_release.sh /path/to/ExecuWhisper.app SIGNING_IDENTITY_OR_SHA
+#
+set -euo pipefail
+
+APP_PATH="${1:-}"
+IDENTITY="${2:-}"
+
+if [[ -z "${APP_PATH}" || -z "${IDENTITY}" ]]; then
+  echo "Usage: $(basename "$0") /path/to/ExecuWhisper.app SIGNING_IDENTITY_OR_SHA" >&2
+  exit 1
+fi
+
+if [[ ! -d "${APP_PATH}" ]]; then
+  echo "Error: App not found: ${APP_PATH}" >&2
+  exit 1
+fi
+
+RESOURCES="${APP_PATH}/Contents/Resources"
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+# BSD/macOS mktemp only substitutes trailing Xs, so the template must not end in
+# a suffix such as ".plist"; otherwise the file name is fixed and predictable.
+HELPER_ENT="$(mktemp "${TMPDIR:-/tmp}/execuwhisper_helper_entitlements.XXXXXX")"
+cleanup() {
+  rm -f "${HELPER_ENT}"
+}
+trap cleanup EXIT
+
+# Entitlements applied to the standalone helper executables.
+cat > "${HELPER_ENT}" <<'PLIST'
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>com.apple.security.cs.disable-library-validation</key>
+    <true/>
+</dict>
+</plist>
+PLIST
+
+# Sign PATH with the hardened runtime when it exists; extra arguments go to codesign.
+sign_if_present() {
+  local path="$1"
+  shift
+  if [[ -e "${path}" ]]; then
+    codesign --force --options runtime "$@" --sign "${IDENTITY}" "${path}"
+  fi
+}
+
+# Nested code is signed before the enclosing app bundle.
+sign_if_present "${RESOURCES}/libomp.dylib"
+sign_if_present "${RESOURCES}/parakeet_helper" --entitlements "${HELPER_ENT}"
+sign_if_present "${RESOURCES}/lfm25_formatter_helper" --entitlements "${HELPER_ENT}"
+sign_if_present "${RESOURCES}/ExecuWhisper Paste Helper.app" --identifier "org.pytorch.executorch.ExecuWhisper.PasteHelper"
+
+codesign \
+  --force \
+  --options runtime \
+  --entitlements "${ROOT_DIR}/ExecuWhisper/ExecuWhisper.entitlements" \
+  --sign "${IDENTITY}" \
+  "${APP_PATH}"
+
+codesign --verify --deep --strict --verbose=2 "${APP_PATH}"
diff --git a/ExecuWhisper/scripts/verify_project_settings.sh b/ExecuWhisper/scripts/verify_project_settings.sh
new file mode 100755
index 0000000000..c647361049
--- /dev/null
+++ b/ExecuWhisper/scripts/verify_project_settings.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+#
+# Check that the xcodegen project spec declares the build settings release builds rely on.
+#
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+cd "${ROOT_DIR}"
+
+python3 - <<'PY'
+import json
+import subprocess
+import sys
+
+# Each key must appear somewhere in the dumped spec with exactly this value.
+required = {
+    "ENABLE_USER_SCRIPT_SANDBOXING": "NO",
+    "DEAD_CODE_STRIPPING": "YES",
+    "ENABLE_HARDENED_RUNTIME": "YES",
+    "DEVELOPMENT_TEAM": "XSDG2DKA58",
+    "CODE_SIGN_STYLE": "Automatic",
+}
+
+raw = subprocess.check_output(["xcodegen", "dump", "--type", "json"], text=True)
+spec = json.loads(raw)
+
+# Yield every dict nested anywhere inside value (through dicts and lists).
+def walk(value):
+    if isinstance(value, dict):
+        yield value
+        for child in value.values():
+            yield from walk(child)
+    elif isinstance(value, list):
+        for child in value:
+            yield from walk(child)
+
+# Render JSON booleans as YES/NO; everything else as its string form.
+def normalize(value):
+    if isinstance(value, bool):
+        return "YES" if value else "NO"
+    return str(value)
+
+# Collect every value each required key takes anywhere in the spec.
+seen = {}
+for node in walk(spec):
+    for key in required:
+        if key in node:
+            seen.setdefault(key, set()).add(normalize(node[key]))
+
+errors = []
+for key, expected in required.items():
+    values = seen.get(key, set())
+    if expected not in values:
+        errors.append(f"{key} expected {expected}, saw {sorted(values) or '<missing>'}")
+
+if errors:
+    for error in errors:
+        print(f"ERROR: {error}", file=sys.stderr)
+    sys.exit(1)
+
+print("Project settings verified")
+PY
diff --git a/ExecuWhisper/scripts/verify_release.sh b/ExecuWhisper/scripts/verify_release.sh
new file mode 100755
index 0000000000..824e87754d
--- /dev/null
+++ b/ExecuWhisper/scripts/verify_release.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+#
+# Mount a release DMG read-only and verify the code signatures of the app and its bundled helpers.
+# Usage: verify_release.sh /path/to/ExecuWhisper.dmg
+#
+set -euo pipefail
+
+DMG_PATH="${1:-}"
+
+if [[ -z "${DMG_PATH}" ]]; then
+  echo "Usage: $(basename "$0") /path/to/ExecuWhisper.dmg" >&2
+  exit 1
+fi
+
+if [[ ! -f "${DMG_PATH}" ]]; then
+  echo "Error: DMG not found: ${DMG_PATH}" >&2
+  exit 1
+fi
+
+hdiutil verify "${DMG_PATH}" >/dev/null
+
+ATTACH_OUTPUT="$(hdiutil attach -readonly -nobrowse -noautoopen "${DMG_PATH}")"
+DEVICE="$(printf "%s\n" "${ATTACH_OUTPUT}" | awk -F'\t' '/\/Volumes\// {print $1; exit}' | xargs)"
+MOUNT="$(printf "%s\n" "${ATTACH_OUTPUT}" | awk -F'\t' '/\/Volumes\// {print $NF; exit}' | sed 's/^[[:space:]]*//')"
+cleanup() {
+  if [[ -n "${DEVICE:-}" ]]; then
+    hdiutil detach "${DEVICE}" >/dev/null 2>&1 || true
+  fi
+}
+trap cleanup EXIT
+
+# Fail with a precise message when the attach output could not be parsed.
+if [[ -z "${DEVICE}" || -z "${MOUNT}" ]]; then
+  echo "Error: Could not determine the mount point for ${DMG_PATH}" >&2
+  exit 1
+fi
+
+APP_PATH="${MOUNT}/ExecuWhisper.app"
+if [[ ! -d "${APP_PATH}" ]]; then
+  echo "Error: ExecuWhisper.app missing from DMG" >&2
+  exit 1
+fi
+
+codesign --verify --deep --strict --verbose=2 "${APP_PATH}" >/dev/null
+
+echo "App signature:"
+codesign -dv "${APP_PATH}" 2>&1 | grep -E "Identifier|Authority|TeamIdentifier|Runtime" || true
+
+for entry in \
+  "${APP_PATH}/Contents/Resources/parakeet_helper" \
+  "${APP_PATH}/Contents/Resources/lfm25_formatter_helper" \
+  "${APP_PATH}/Contents/Resources/libomp.dylib" \
+  "${APP_PATH}/Contents/Resources/ExecuWhisper Paste Helper.app"; do
+  if [[ ! -e "${entry}" ]]; then
+    echo "Error: Missing signed entry: ${entry}" >&2
+    exit 1
+  fi
+  codesign --verify --strict --verbose=2 "${entry}" >/dev/null
+  echo "Signed entry: $(basename "${entry}")"
+  codesign -dv "${entry}" 2>&1 | grep -E "Identifier|Authority|TeamIdentifier|Runtime" || true
+done
+
+echo "Release verification passed: ${DMG_PATH}"