diff --git a/.github/workflows/validate-repo-maintenance.yml b/.github/workflows/validate-repo-maintenance.yml index 771d91d..afd62d7 100644 --- a/.github/workflows/validate-repo-maintenance.yml +++ b/.github/workflows/validate-repo-maintenance.yml @@ -12,9 +12,13 @@ on: jobs: validate: name: validate - runs-on: macos-latest + runs-on: macos-26 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6.0.2 + - name: Report selected Xcode + run: xcode-select --print-path + - name: Report Swift toolchain + run: xcrun swift --version - name: Install Swift repo-maintenance tools run: brew install swiftformat swiftlint - name: Run repo-maintenance validation diff --git a/AGENTS.md b/AGENTS.md index 34b7dcc..0edff84 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,7 +21,7 @@ Check these surfaces before reading broadly: ### Change Scope -Keep work retrieval-first and Apple-first. If a change starts pulling the repo toward answer generation, chat orchestration, agents, remote APIs, persistence layers, or connector-heavy ingestion, stop and surface that scope change explicitly before implementing it. +Keep work retrieval-first and Apple-first. RAGKit/SwiftlyFetch semantic index persistence is an approved retrieval-state pattern; if a change starts pulling the repo toward answer generation, chat orchestration, agents, remote APIs, other persistence layers, or connector-heavy ingestion, stop and surface that scope change explicitly before implementing it. ### Source of Truth @@ -66,7 +66,7 @@ Work is done when the package still builds and tests cleanly, repo-maintenance v ### Never Do -- Never widen this package into generation, chat orchestration, agents, PDF ingestion, persistence layers, or remote provider APIs without explicit approval. +- Never widen this package into generation, chat orchestration, agents, PDF ingestion, persistence layers beyond the approved RAGKit/SwiftlyFetch semantic index state, or remote provider APIs without explicit approval. 
- Never make the main test suite depend on downloaded Apple embedding assets. - Never add external dependencies for this v1 retrieval package without explicit approval. - Never hand-edit generated package-manager outputs such as `Package.resolved` if they appear later. @@ -100,6 +100,7 @@ There are no deeper `AGENTS.md` files in the current repository tree. If more sp - Prefer `swift package` subcommands for structural package edits before manually editing `Package.swift`. - Edit `Package.swift` intentionally and keep it readable; agents may modify it when package structure, targets, products, or dependencies need to change, and should try to keep package graph updates consolidated in one change when possible. - Keep `Package.swift` explicit about its package-wide Swift language mode. On current Swift 6-era manifests, prefer `swiftLanguageModes: [.v6]` as the default declaration, treat `swiftLanguageVersions` as a legacy alias used only when an older manifest surface requires it, and remember that lowering the manifest's `// swift-tools-version:` from the bootstrap default is often appropriate when the package should support an older Swift 6 toolchain, but never below `6.0`. +- Keep the shared Swift-package dependency baseline in mind when creating new Swift package repos or when this package's dependency policy is explicitly opened: the current baseline package is `swift-configuration` from `https://github.com/apple/swift-configuration`, with the `Configuration` product and package traits `.defaults`, `Reloading`, `YAML`, and `CommandLineArguments`. The optional `PropertyList` trait is available when property-list parsing is needed, and the optional `Logging` trait is available when configuration access should integrate with `SwiftLog.Logger`. This repository's v1 safety boundary still applies: do not add new external dependencies to `SwiftlyFetch` without explicit approval. 
- Avoid adding unnecessary dependency-provenance detail or switching to branch/revision-based requirements unless the user explicitly asks for that level of control. - Treat `Package.resolved` and similar package-manager outputs as generated files; do not hand-edit them. - Prefer Swift Testing by default unless an external constraint requires XCTest. diff --git a/Package.swift b/Package.swift index 569c30e..94be1bc 100644 --- a/Package.swift +++ b/Package.swift @@ -26,6 +26,10 @@ let package = Package( name: "RAGKit", targets: ["RAGKit"] ), + .library( + name: "SwiftlyFetch", + targets: ["SwiftlyFetch"] + ), ], dependencies: [ .package(url: "https://github.com/swiftlang/swift-markdown.git", from: "0.7.3"), @@ -48,6 +52,20 @@ let package = Package( .product(name: "Markdown", package: "swift-markdown"), ] ), + .target( + name: "SwiftlyFetch", + dependencies: [ + "FetchCore", + "FetchKit", + "RAGCore", + "RAGKit", + ] + ), + .target( + name: "SwiftlyFetchTestFixtures", + dependencies: ["FetchCore"], + path: "Tests/SwiftlyFetchTestFixtures" + ), .testTarget( name: "RAGCoreTests", dependencies: ["RAGCore"] @@ -58,7 +76,7 @@ let package = Package( ), .testTarget( name: "FetchKitTests", - dependencies: ["FetchKit", "FetchCore"] + dependencies: ["FetchKit", "FetchCore", "SwiftlyFetchTestFixtures"] ), .testTarget( name: "RAGKitTests", @@ -68,6 +86,10 @@ let package = Package( name: "RAGKitIntegrationTests", dependencies: ["RAGKit", "RAGCore"] ), + .testTarget( + name: "SwiftlyFetchTests", + dependencies: ["SwiftlyFetch", "FetchCore", "RAGCore", "RAGKit", "SwiftlyFetchTestFixtures"] + ), ], swiftLanguageModes: [.v6] ) diff --git a/README.md b/README.md index 7cf1b0d..dbd6df6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # SwiftlyFetch -An Apple-first Swift Package family for local document search and semantic retrieval. +Apple-first local search and semantic retrieval for Swift apps. 
## Table of Contents @@ -13,34 +13,69 @@ An Apple-first Swift Package family for local document search and semantic retri ## Overview -### Status +SwiftlyFetch is a Swift package family for apps that need to index local documents, search them, and assemble useful retrieval context without sending the job to a remote service. The current package is early, but useful: it ships semantic retrieval through `RAGCore` and `RAGKit`, conventional search through `FetchCore` and `FetchKit`, and the first umbrella `SwiftlyFetch` facade for one-corpus ingestion. -`v0.1.2` is the current tagged package release and is stable enough to try locally. +Use SwiftlyFetch when you want: -### What This Project Is +- local semantic retrieval over plain text and markdown +- deterministic hashing embeddings for tests, previews, and examples +- Apple Natural Language embeddings on supported Apple platforms +- persistent semantic chunks and embeddings through Core Data +- coordinated one-corpus ingestion through `SwiftlyFetchLibrary` +- conventional document search with title/body evidence, query-aware snippets, and a macOS SearchKit-backed index path +- one package family that keeps retrieval, indexing, and context assembly separate from chat, generation, agents, and remote-provider workflows -SwiftlyFetch is the umbrella product direction for a small family of Apple-first local search packages. The product goal is simple: hand the system a local corpus and get back a real search engine, with conventional search and semantic retrieval both living under one coherent Swift-native story. In practical terms, SwiftlyFetch is the family for "drop in a corpus, get back local search," with `FetchKit` covering conventional full-document search and `RAGKit` covering semantic retrieval over the same broader corpus model. 
+The package family is intentionally split by job: -Today, the package exposes shipped semantic retrieval work through `RAGCore` and `RAGKit`, plus the first conventional-search foundation through `FetchCore` and `FetchKit`. `FetchCore` now owns the portable conventional-search vocabulary, the durable document-record model, and the indexing-changeset boundary. That record model carries first-class typed lifecycle and source fields like `kind`, `language`, `createdAt`, `updatedAt`, `sourceURI`, and `lastIndexedAt`, while leaving the freeform metadata bag string-based. `FetchCore` also distinguishes between the durable stored record, the lean search-facing document view, and the richer index-facing payload used by the sync boundary. `FetchKitLibrary` now supports a default in-memory construction path, and `FetchKit` includes a Core Data-backed `FetchDocumentStore`, a persisted pending-sync queue, and the first thin macOS SearchKit-backed index. Conventional-search results carry field evidence through `matchedFields` and `snippetField`, so UI code can tell whether a result matched title text, body text, or both. The default in-memory search path now also rewards tighter all-term evidence, so a focused passage can rank ahead of a scattered near-miss instead of relying on document ID tie-breaking. +- `RAGCore` defines semantic retrieval vocabulary. +- `RAGKit` provides the default semantic retrieval implementation and `KnowledgeBase` actor. +- `FetchCore` defines portable conventional-search models. +- `FetchKit` provides the first conventional-search facade, Core Data document storage, pending index-sync tracking, and a macOS SearchKit backend. +- `SwiftlyFetch` composes both sibling package families so callers can add a document once, then use conventional search and semantic retrieval over the same corpus. 
-The intended family split is: +SwiftlyFetch has tagged releases stable enough to try locally, and the umbrella `SwiftlyFetch` surface is available in the current codebase. See GitHub Releases for the latest published version details. -- `RAGKit` for semantic retrieval, knowledge-base assembly, and the retrieval-quality chunking, embedding, and indexing work that supports that job -- `FetchCore` for the portable document-search vocabulary that will stay backend-agnostic as `FetchKit` grows -- `FetchKit` for traditional search, with `FetchKitLibrary` as the first public facade and Core Data plus SearchKit as the intended Apple implementation model -- `SwiftlyFetch` as the umbrella story tying those sibling package surfaces together over time +SwiftlyFetch is not a chat framework, LLM SDK, agent runtime, or remote-provider abstraction. Its job is local retrieval: document preparation, indexing, search, filtering, and context assembly. -That intended split does not change the current package boundary: `RAGKit` still owns semantic retrieval work, not conventional document search. The next family step is caller-driven polish, not first existence: keep conventional-search result quality under pressure with fixture and app corpora, keep the persistent library surface polished, and continue making it realistic for one local corpus to support both traditional search and semantic retrieval without forcing those jobs into one module. +## Quick Start -Platform-wise, the family target is still "macOS and iOS are both first-class," but the first concrete full-text backend is intentionally macOS-first. Apple documents Search Kit as a Mac app indexing and search framework, while Core Spotlight is the more obvious Apple-side indexing/search direction for iOS later. That means the current plan is not to pretend one backend fits both platforms immediately. 
Instead, `FetchCore` stays portable, `FetchKit` starts with the honest macOS path, and iOS remains a first-class family target through a future sibling backend rather than through fake cross-platform wording. +Add SwiftlyFetch to your `Package.swift` dependencies: -### Motivation +```swift +.package(url: "https://github.com/gaelic-ghost/SwiftlyFetch", from: "0.2.0"), +``` -The goal is to make local search feel native and pleasant in Swift apps without turning the package into a chat framework or a giant AI abstraction layer. +Then add the library product to your target dependencies: -## Quick Start +```swift +.product(name: "SwiftlyFetch", package: "SwiftlyFetch"), +``` + +The package is still a bit early, but the retrieval surface is real enough to try locally. For the coordinated corpus surface, import `SwiftlyFetch`: + +```swift +import FetchCore +import RAGCore +import SwiftlyFetch + +let library = try await SwiftlyFetchLibrary.default() + +try await library.addDocument( + FetchDocumentRecord( + id: "guide", + title: "Fruit Guide", + body: "Apples are bright and crisp.", + contentType: .markdown, + kind: .guide, + language: "en" + ) +) + +let searchResults = try await library.search(FetchSearchQuery("fruit guide")) +let semanticResults = try await library.retrieve(SearchQuery("bright fruit")) +``` -The package is still early, but the retrieval surface is real enough to try locally: +For lower-level semantic retrieval, import `RAGCore` and `RAGKit` directly: ```swift import RAGCore @@ -69,21 +104,55 @@ let context = try await kb.makeContext(for: "bright fruit") ## Usage -The current public surface centers on four library products: +The current public surface centers on five library products: `RAGCore`, `RAGKit`, `FetchCore`, `FetchKit`, and `SwiftlyFetch`. 
+ +For coordinated one-corpus ingestion, use `SwiftlyFetchLibrary` from `SwiftlyFetch`: ```swift import FetchCore -import FetchKit +import RAGCore +import SwiftlyFetch + +let library = try await SwiftlyFetchLibrary.default() + +let mutation = try await library.addDocument( + FetchDocumentRecord( + id: "guide", + title: "Apple Guide", + body: "Apples are bright and crisp.", + contentType: .markdown + ) +) + +let conventionalResults = try await library.search(FetchSearchQuery("apple guide")) +let semanticResults = try await library.retrieve(SearchQuery("bright crisp")) +let sideBySideResults = try await library.searchAndRetrieve( + conventional: FetchSearchQuery("apple guide"), + semantic: SearchQuery("bright crisp") +) +``` + +`SwiftlyFetchMutationResult` reports conventional and semantic outcomes separately. If the corpus write succeeds but semantic indexing fails, the facade queues a semantic retry instead of pretending the whole write failed. +`retrySemanticIndexing(limit:)` respects retry cooldowns through `nextRetryAt` and reports deferred document IDs separately from completed, missing, and failed retries. +`searchAndRetrieve(...)` returns conventional and semantic results side by side without combining scores; ranked hybrid search remains future work. 
+ +For semantic retrieval, use `KnowledgeBase` from `RAGKit`: + +```swift import RAGCore import RAGKit let localKB = try await KnowledgeBase.hashingDefault() let appleKB = try await KnowledgeBase.naturalLanguageDefault(languageHint: "en") -let fetchQuery = FetchSearchQuery("apple guide", kind: .allTerms) -let library = FetchKitLibrary() +let semanticStore = FileManager.default + .temporaryDirectory + .appendingPathComponent("SwiftlyFetchSemantic.sqlite") +let persistentKB = try await KnowledgeBase.persistentHashingDefault( + configuration: .init(store: .sqlite(semanticStore)) +) ``` -The conventional-search side is still early, but the intended top-level shape is already visible: +For conventional search, use `FetchKitLibrary` from `FetchKit`: ```swift import FetchCore @@ -108,7 +177,7 @@ let matchedFields = firstResult?.matchedFields let snippetField = firstResult?.snippetField ``` -`matchedFields` identifies every indexed field that contributed to a search result. `snippetField` identifies the field used to build the returned snippet. Title-only hits intentionally keep the title as the snippet source, so simple result lists still have an immediate explanation for why the result appeared, while richer UIs can render title evidence differently from body evidence. +`matchedFields` identifies every indexed field that contributed to a search result. `snippetField` identifies the field used to build the returned snippet. Simple result lists can show why a result appeared immediately, while richer UIs can render title evidence differently from body evidence. 
On macOS, the persistent conventional-search surface is now also shaped around one library storage location instead of separate store and index URLs: @@ -136,6 +205,7 @@ Current defaults: - markdown link destinations stay out of chunk text by default, but `HeadingAwareMarkdownChunker(linkDestinationMetadataMode: .include)` can record raw destinations in chunk metadata when downstream indexing or fetch-oriented work needs them - `hashingDefault()` gives a deterministic local path for tests and examples - `naturalLanguageDefault()` uses the Apple Natural Language backend on supported platforms +- `persistentHashingDefault(configuration:dimension:)` and `persistentNaturalLanguageDefault(configuration:languageHint:)` use the same retrieval defaults with a Core Data-backed semantic vector index - metadata filtering supports explicit exclusions, ordered comparisons for `int`, `double`, and `date`, plus case-insensitive `startsWith` and `endsWith` string matching - markdown list items keep heading and immediate lead-in context in chunk text, and also carry structured chunk metadata for list kind, lead-in, ordinal, and heading path - markdown block quotes stay secondary by default, but are promoted into the primary retrieval stream when they make up more than one third of the document's chunkable block structure @@ -148,19 +218,6 @@ Current defaults: - conventional-search results report `matchedFields` and `snippetField`, keeping title-only snippets visible while letting consumers distinguish title evidence from body evidence - `makeContext(...)` suppresses redundant same-document chunk text, groups annotated output by document, and skips annotated sections that only have room for labels -Supported today: - -- build a local knowledge base from plain text and markdown documents -- use deterministic hashing embeddings for tests, previews, and fully local examples -- use Apple Natural Language embeddings for on-device semantic retrieval on supported platforms -- use 
`FetchKitLibrary()` with a default in-memory backend or inject custom `FetchDocumentStore` and `FetchIndex` implementations explicitly -- use a real Core Data-backed `FetchDocumentStore` in `FetchKit` with the first thin macOS SearchKit index backend -- persist and retry pending index-sync work through `FetchKitLibrary.pendingIndexSyncs()` and `retryPendingIndexSyncs(...)` -- return conventional-search results with query-aware snippets, field-aware ranking, compact-evidence ranking in the default in-memory path, matched-field metadata, and snippet-source metadata across title and body matches -- narrow retrieval with typed metadata filters -- preserve meaningful markdown structure for retrieval, including heading paths, list semantics, quote-heavy documents, code-heavy documents, short section breaks, images, and a narrow raw-HTML whitelist -- turn ranked search results into plain or annotated context text for downstream UI or model consumers - ## Package Status SwiftlyFetch is usable today as a local Apple-first package family, but it is still early in the broader product arc. 
@@ -168,21 +225,23 @@ SwiftlyFetch is usable today as a local Apple-first package family, but it is st Good current fits: - app-level semantic retrieval over local plain-text and markdown corpora -- conventional-search experimentation through the first `FetchCore` and `FetchKit` surfaces +- conventional-search experimentation through `FetchCore` and `FetchKit` - Apple-first local search prototypes where Core Data, SearchKit, and on-device retrieval matter +- downstream UI or model features that need ranked search results or assembled context, but do not need SwiftlyFetch to own generation Current constraints: - the SearchKit backend is macOS-first - Natural Language asset-backed verification runs in local maintainer validation by default, but stays out of the default GitHub-hosted CI lane because hosted macOS still stalls in the asset-backed step - the package family direction is broader than the currently shipped polished surface, especially on the `FetchKit` side +- hybrid search still waits on follow-up result-shape work; the umbrella facade currently exposes conventional `search` and semantic `retrieve` separately - conventional-search quality coverage uses a small checked-in Project Gutenberg fixture corpus plus synthetic near-miss and longer-body records; larger app-like corpora are still future validation work -If you want to contribute to the package itself, use [CONTRIBUTING.md](./CONTRIBUTING.md). Maintainer planning and architecture notes live under [docs/maintainers/](./docs/maintainers/). +For contributor setup, branch workflow, verification commands, and review expectations, use [CONTRIBUTING.md](./CONTRIBUTING.md). Maintainer planning and architecture notes live under [docs/maintainers/](./docs/maintainers/). ## Release Notes -Tagged releases should be created with `scripts/repo-maintenance/release.sh`, and each published tag should get matching GitHub release notes that summarize what changed and how it was verified. 
Maintainer planning and architecture notes live under `docs/maintainers/`. +See the repository's GitHub releases for published package notes. Release workflow details belong in [CONTRIBUTING.md](./CONTRIBUTING.md) and the maintainer docs, not in this user-facing README. ## License diff --git a/ROADMAP.md b/ROADMAP.md index 1559586..54ac88d 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -12,7 +12,8 @@ Use this roadmap to track milestone-level delivery through checklist sections. - [Milestone 2: Post-v0.1.0 Refinement](#milestone-2-post-v010-refinement) - [Milestone 3: FetchKit Foundation](#milestone-3-fetchkit-foundation) - [Milestone 4: FetchKit Refinement](#milestone-4-fetchkit-refinement) -- [Milestone 5: FetchKit Platform And CI Decisions](#milestone-5-fetchkit-platform-and-ci-decisions) +- [Milestone 5: Semantic Index Persistence](#milestone-5-semantic-index-persistence) +- [Milestone 6: FetchKit Platform And CI Decisions](#milestone-6-fetchkit-platform-and-ci-decisions) - [Backlog Candidates](#backlog-candidates) - [History](#history) @@ -33,7 +34,8 @@ Use this roadmap to track milestone-level delivery through checklist sections. - Milestone 2: Post-v0.1.0 Refinement - Completed - Milestone 3: FetchKit Foundation - Completed - Milestone 4: FetchKit Refinement - In Progress -- Milestone 5: FetchKit Platform And CI Decisions - Planned +- Milestone 5: Semantic Index Persistence - In Progress +- Milestone 6: FetchKit Platform And CI Decisions - Planned ## Milestone 0: Foundation @@ -187,6 +189,7 @@ In Progress - [x] Decide that title-only hits should keep title snippets while exposing `matchedFields` and `snippetField` so consumers can distinguish title evidence from body evidence. - [x] Add broader fixture-corpus pressure for near-miss all-term ranking and longer-body snippet selection across the default in-memory path and the macOS SearchKit-backed path. 
- [x] Refine the default in-memory all-term ranker so tighter evidence beats scattered term mentions instead of falling through to document ID tie-breaking. +- [x] Add a second checked-in text source for corpus-based tests so fixture coverage is not only Gutenberg-derived. - [ ] Audit larger app-like corpus result quality now that field-aware ranking, compact all-term evidence, phrase weighting, truncation cues, multi-term snippets, and field-evidence metadata are in place. - [ ] Keep the persistent `FetchKitLibrary` construction and search API surface under review as real callers exercise the current design. - [ ] Explore an opt-in extended snippet surface that can use idle time to precompute short document summaries for larger records, with Apple's [`FoundationModels`](https://developer.apple.com/documentation/foundationmodels) or another local summarization path as the first candidate instead of making foreground full-text search wait on summarization. @@ -197,7 +200,39 @@ In Progress - [x] The SearchKit-backed path runs in normal local validation and the default GitHub CI lane. - [ ] `FetchKitLibrary` still reads like a small Swift-native facade instead of exposing backend detail drift. -## Milestone 5: FetchKit Platform And CI Decisions +## Milestone 5: Semantic Index Persistence + +### Status + +In Progress + +### Scope + +- [x] Add a `RAGKit`-owned persisted semantic vector index while keeping the public contract behind `VectorIndex`. +- [x] Keep semantic chunks and embeddings as derived retrieval state instead of moving them into the `FetchKit` corpus store. +- [x] Add persistent `KnowledgeBase` convenience constructors that preserve the current chunking and embedding defaults. +- [x] Add RAG-owned semantic status and fingerprint records for persisted semantic indexes. +- [x] Design the future one-corpus ingestion facade that coordinates `FetchKit` corpus writes with `RAGKit` semantic indexing. 
+- [x] Decide semantic indexing retry and stale-state policy for failures after durable corpus writes succeed. + +### Tickets + +- [x] Add `CoreDataVectorIndex` as the first persisted semantic index backend. +- [x] Persist chunk identity, document identity, chunk text, metadata, source position, embedding vectors, and update timestamps. +- [x] Cover persisted vector-index round trips, replacement, filtering, document removal, remove-all behavior, and `KnowledgeBase` convenience reuse. +- [x] Record the hybrid search persistence ownership model in maintainer docs. +- [x] Keep retry scheduling above the sibling packages while storing semantic health truth in `RAGKit`. +- [x] Plan the first `SwiftlyFetch` umbrella facade in maintainer docs. +- [x] Add a narrow bridge from `FetchDocumentRecord` to `RAGCore.Document`. +- [x] Add an umbrella ingestion surface only after the semantic persisted index is stable. + +### Exit Criteria + +- [x] Semantic retrieval can survive process restarts without re-chunking and re-embedding the corpus. +- [x] `RAGKit` owns semantic persistence without making `FetchCore` depend on `RAGCore`. +- [x] The repo has a concrete next-step plan for one ingestion surface and both search modes. + +## Milestone 6: FetchKit Platform And CI Decisions ### Status @@ -226,7 +261,7 @@ Planned - [ ] If parser-backed markdown chunking still leaves retrieval-quality gaps, add retrieval-specific chunking heuristics on top of the chosen markdown parser instead of rebuilding markdown parsing rules locally. - [ ] If asset-backed automation becomes important again, evaluate a self-hosted macOS runner with prewarmed assets before retrying a hosted GitHub Actions lane. - [ ] Consider a follow-on conventional-search quality pass only if real corpora show ranking, snippet, or result-presentation gaps beyond the current field-aware heuristics. 
-- [ ] Evaluate whether fixture-corpus coverage should grow through additional checked-in micro-records, generated local fixtures, or an opt-in live dataset lane before adopting a Swift Hub dependency. +- [ ] Revisit fixture-corpus growth only if real result-quality gaps show that the current Gutenberg plus TinyStories micro-records are too narrow. ## History @@ -274,3 +309,6 @@ Planned - Promoted the Natural Language integration lane into default local maintainer validation, but kept it out of GitHub-hosted CI after another hosted experiment remained stuck in the asset-backed step for minutes while the local path completed in seconds. - Opened the next roadmap phase around SearchKit/Natural Language verification strategy, iOS conventional-search backend direction, and another caller-driven `FetchKitLibrary` polish pass if real usage shows it is needed. - Broadened the checked-in fixture corpus with synthetic near-miss and longer-body records, added in-memory and SearchKit parity coverage for those cases, and refined in-memory all-term ranking so compact evidence beats scattered mentions. +- Added the first `SwiftlyFetch` umbrella facade with a document mapper, one-corpus ingestion, separate conventional and semantic mutation outcomes, semantic retry storage, and macOS persistent construction. +- Added a TinyStories-derived mini corpus as a second checked-in text source and introduced side-by-side `SwiftlyFetchLibrary.searchAndRetrieve(...)` without ranked hybrid score mixing. +- Tightened `SwiftlyFetchLibrary.retrySemanticIndexing(limit:)` so semantic retries respect `nextRetryAt` cooldowns and report deferred document IDs separately. 
diff --git a/Sources/RAGKit/CoreDataVectorIndex.swift b/Sources/RAGKit/CoreDataVectorIndex.swift new file mode 100644 index 0000000..9dca6be --- /dev/null +++ b/Sources/RAGKit/CoreDataVectorIndex.swift @@ -0,0 +1,538 @@ +@preconcurrency import CoreData +import Foundation +import RAGCore + +public actor CoreDataVectorIndex: VectorIndex, SemanticIndexStateStore { + public struct Configuration: Hashable, Sendable { + public enum Store: Hashable, Sendable { + case inMemory + case sqlite(URL) + } + + public var store: Store + + public init(store: Store = .inMemory) { + self.store = store + } + + public static let inMemory = Configuration() + } + + public enum StoreError: Error, LocalizedError { + case loadFailed(String) + case encodeFailed(String) + case decodeFailed(String) + + public var errorDescription: String? { + switch self { + case let .loadFailed(message): + "RAGKit could not load the Core Data vector index store. \(message)" + case let .encodeFailed(message): + "RAGKit could not encode a semantic index record for persistence. \(message)" + case let .decodeFailed(message): + "RAGKit could not decode a persisted semantic index record. 
\(message)" + } + } + } + + private static let modelName = "RAGKitVectorIndex" + + private let persistentContainer: NSPersistentContainer + private let managedObjectContext: NSManagedObjectContext + + public init(configuration: Configuration = .inMemory) async throws { + let persistentContainer = try await Self.makePersistentContainer(configuration: configuration) + self.persistentContainer = persistentContainer + managedObjectContext = Self.makeManagedObjectContext(using: persistentContainer) + } + + private static func fetchStoredChunks( + matching ids: [String], + in context: NSManagedObjectContext + ) throws -> [NSManagedObject] { + let request = NSFetchRequest(entityName: EntityName.indexedChunk.rawValue) + request.predicate = NSPredicate(format: "%K IN %@", StoredChunkProperty.id.rawValue, ids) + return try context.fetch(request) + } + + private static func apply( + indexedChunk: IndexedChunk, + to storedChunk: NSManagedObject + ) throws { + storedChunk.setValue(indexedChunk.chunk.id.rawValue, forKey: StoredChunkProperty.id.rawValue) + storedChunk.setValue(indexedChunk.chunk.documentID.rawValue, forKey: StoredChunkProperty.documentID.rawValue) + storedChunk.setValue(indexedChunk.chunk.text, forKey: StoredChunkProperty.text.rawValue) + storedChunk.setValue(Int64(indexedChunk.chunk.position.chunkIndex), forKey: StoredChunkProperty.chunkIndex.rawValue) + storedChunk.setValue(Int64(indexedChunk.chunk.position.startOffset), forKey: StoredChunkProperty.startOffset.rawValue) + storedChunk.setValue(Int64(indexedChunk.chunk.position.endOffset), forKey: StoredChunkProperty.endOffset.rawValue) + try storedChunk.setValue(encode(indexedChunk.chunk.metadata), forKey: StoredChunkProperty.metadataData.rawValue) + try storedChunk.setValue(encode(indexedChunk.embedding), forKey: StoredChunkProperty.embeddingData.rawValue) + storedChunk.setValue(Date(), forKey: StoredChunkProperty.updatedAt.rawValue) + } + + private static func makeIndexedChunk(from storedChunk: 
NSManagedObject) throws -> IndexedChunk { + guard + let id = storedChunk.value(forKey: StoredChunkProperty.id.rawValue) as? String, + let documentID = storedChunk.value(forKey: StoredChunkProperty.documentID.rawValue) as? String, + let text = storedChunk.value(forKey: StoredChunkProperty.text.rawValue) as? String, + let metadataData = storedChunk.value(forKey: StoredChunkProperty.metadataData.rawValue) as? Data, + let embeddingData = storedChunk.value(forKey: StoredChunkProperty.embeddingData.rawValue) as? Data + else { + throw StoreError.decodeFailed("A persisted semantic chunk is missing one or more required fields.") + } + + let chunkIndex = intValue(for: StoredChunkProperty.chunkIndex, in: storedChunk) + let startOffset = intValue(for: StoredChunkProperty.startOffset, in: storedChunk) + let endOffset = intValue(for: StoredChunkProperty.endOffset, in: storedChunk) + let documentIDValue = DocumentID(documentID) + + let chunk = try Chunk( + id: ChunkID(id), + documentID: documentIDValue, + text: text, + metadata: decode(ChunkMetadata.self, from: metadataData), + position: ChunkPosition( + documentID: documentIDValue, + chunkIndex: chunkIndex, + startOffset: startOffset, + endOffset: endOffset + ) + ) + + return try IndexedChunk( + chunk: chunk, + embedding: decode(EmbeddingVector.self, from: embeddingData) + ) + } + + private static func fetchStoredState( + for documentID: DocumentID, + in context: NSManagedObjectContext + ) throws -> NSManagedObject? 
{
+        let request = NSFetchRequest(entityName: EntityName.semanticState.rawValue)
+        request.fetchLimit = 1
+        request.predicate = NSPredicate(
+            format: "%K == %@",
+            SemanticStateProperty.documentID.rawValue,
+            documentID.rawValue
+        )
+        return try context.fetch(request).first
+    }
+
+    /// Fetches the stored semantic-state rows for `documentIDs` (order unspecified).
+    private static func fetchStoredStates(
+        for documentIDs: [DocumentID],
+        in context: NSManagedObjectContext
+    ) throws -> [NSManagedObject] {
+        guard !documentIDs.isEmpty else {
+            return []
+        }
+
+        let request = NSFetchRequest(entityName: EntityName.semanticState.rawValue)
+        request.predicate = NSPredicate(
+            format: "%K IN %@",
+            SemanticStateProperty.documentID.rawValue,
+            documentIDs.map(\.rawValue)
+        )
+        return try context.fetch(request)
+    }
+
+    /// Inserts or updates the single state row for `documentID`, stamping `updatedAt` with now.
+    private static func upsertState(
+        documentID: DocumentID,
+        status: SemanticIndexStatus,
+        fingerprint: SemanticIndexFingerprint?,
+        lastIndexedAt: Date?,
+        lastFailure: String?,
+        in context: NSManagedObjectContext
+    ) throws {
+        let storedState = try fetchStoredState(for: documentID, in: context)
+            ?? NSEntityDescription.insertNewObject(
+                forEntityName: EntityName.semanticState.rawValue,
+                into: context
+            )
+
+        storedState.setValue(documentID.rawValue, forKey: SemanticStateProperty.documentID.rawValue)
+        storedState.setValue(status.rawValue, forKey: SemanticStateProperty.statusRaw.rawValue)
+        try storedState.setValue(fingerprint.map(encode), forKey: SemanticStateProperty.fingerprintData.rawValue)
+        storedState.setValue(lastIndexedAt, forKey: SemanticStateProperty.lastIndexedAt.rawValue)
+        storedState.setValue(lastFailure, forKey: SemanticStateProperty.lastFailure.rawValue)
+        storedState.setValue(Date(), forKey: SemanticStateProperty.updatedAt.rawValue)
+    }
+
+    /// Rebuilds a `SemanticIndexState` value from a persisted state row.
+    private static func makeSemanticIndexState(from storedState: NSManagedObject) throws -> SemanticIndexState {
+        guard
+            let documentID = storedState.value(forKey: SemanticStateProperty.documentID.rawValue) as? 
String,
+            let statusRaw = storedState.value(forKey: SemanticStateProperty.statusRaw.rawValue) as? String
+        else {
+            throw StoreError.decodeFailed("A persisted semantic index state is missing one or more required fields.")
+        }
+
+        let fingerprint: SemanticIndexFingerprint?
+        if let fingerprintData = storedState.value(forKey: SemanticStateProperty.fingerprintData.rawValue) as? Data {
+            fingerprint = try decode(SemanticIndexFingerprint.self, from: fingerprintData)
+        } else {
+            fingerprint = nil
+        }
+
+        // Unknown status strings (e.g. written by a newer schema) degrade to .failed instead of throwing.
+        let status = SemanticIndexStatus(rawValue: statusRaw) ?? .failed
+        let updatedAt = (storedState.value(forKey: SemanticStateProperty.updatedAt.rawValue) as? Date) ?? .distantPast
+
+        return SemanticIndexState(
+            documentID: DocumentID(documentID),
+            status: status,
+            fingerprint: fingerprint,
+            lastIndexedAt: storedState.value(forKey: SemanticStateProperty.lastIndexedAt.rawValue) as? Date,
+            lastFailure: storedState.value(forKey: SemanticStateProperty.lastFailure.rawValue) as? String,
+            updatedAt: updatedAt
+        )
+    }
+
+    /// Reads an integer column that may surface as `Int` or `Int64`; missing/mismatched values default to 0.
+    private static func intValue(
+        for property: StoredChunkProperty,
+        in storedChunk: NSManagedObject
+    ) -> Int {
+        if let value = storedChunk.value(forKey: property.rawValue) as? Int {
+            return value
+        }
+
+        if let value = storedChunk.value(forKey: property.rawValue) as? 
Int64 {
+            return Int(value)
+        }
+
+        return 0
+    }
+
+    /// JSON-encodes `value`, mapping encoder failures to `StoreError.encodeFailed`.
+    /// NOTE(review): the generic parameter list was garbled in transit; restored as `<T: Encodable>` — confirm against the original file.
+    private static func encode<T: Encodable>(_ value: T) throws -> Data {
+        do {
+            return try JSONEncoder().encode(value)
+        } catch {
+            throw StoreError.encodeFailed(error.localizedDescription)
+        }
+    }
+
+    /// JSON-decodes `data` as `type`, mapping decoder failures to `StoreError.decodeFailed`.
+    /// NOTE(review): generic parameter list restored as `<T: Decodable>` — confirm against the original file.
+    private static func decode<T: Decodable>(_ type: T.Type, from data: Data) throws -> T {
+        do {
+            return try JSONDecoder().decode(type, from: data)
+        } catch {
+            throw StoreError.decodeFailed(error.localizedDescription)
+        }
+    }
+
+    /// Creates the container for `configuration` and loads its store synchronously on the
+    /// loader's callback, surfacing failures as `StoreError.loadFailed`.
+    private static func makePersistentContainer(configuration: Configuration) async throws -> NSPersistentContainer {
+        let container = NSPersistentContainer(
+            name: modelName,
+            managedObjectModel: makeManagedObjectModel()
+        )
+
+        let description = NSPersistentStoreDescription()
+        description.shouldAddStoreAsynchronously = false
+        description.shouldInferMappingModelAutomatically = true
+        description.shouldMigrateStoreAutomatically = true
+
+        switch configuration.store {
+        case .inMemory:
+            description.type = NSInMemoryStoreType
+        case let .sqlite(url):
+            description.type = NSSQLiteStoreType
+            description.url = url
+        }
+
+        container.persistentStoreDescriptions = [description]
+
+        return try await withCheckedThrowingContinuation { continuation in
+            container.loadPersistentStores { _, error in
+                if let error {
+                    continuation.resume(
+                        throwing: StoreError.loadFailed(
+                            error.localizedDescription
+                        )
+                    )
+                } else {
+                    continuation.resume(returning: container)
+                }
+            }
+        }
+    }
+
+    /// Background context used for every read and write; merges parent changes automatically
+    /// with in-memory values winning on conflict.
+    private static func makeManagedObjectContext(using container: NSPersistentContainer) -> NSManagedObjectContext {
+        let context = container.newBackgroundContext()
+        context.automaticallyMergesChangesFromParent = true
+        context.mergePolicy = NSMergePolicy(merge: .mergeByPropertyObjectTrumpMergePolicyType)
+        return context
+    }
+
+    /// Builds the programmatic model: one entity for indexed chunks, one for per-document semantic state.
+    private static func makeManagedObjectModel() -> NSManagedObjectModel {
+        let model = NSManagedObjectModel()
+
+        let indexedChunkEntity = NSEntityDescription()
+        indexedChunkEntity.name = EntityName.indexedChunk.rawValue
+
indexedChunkEntity.managedObjectClassName = NSStringFromClass(NSManagedObject.self)
+        indexedChunkEntity.properties = [
+            makeAttribute(name: StoredChunkProperty.id.rawValue, type: .stringAttributeType),
+            makeAttribute(name: StoredChunkProperty.documentID.rawValue, type: .stringAttributeType),
+            makeAttribute(name: StoredChunkProperty.text.rawValue, type: .stringAttributeType),
+            makeAttribute(name: StoredChunkProperty.metadataData.rawValue, type: .binaryDataAttributeType),
+            makeAttribute(name: StoredChunkProperty.embeddingData.rawValue, type: .binaryDataAttributeType),
+            makeAttribute(name: StoredChunkProperty.chunkIndex.rawValue, type: .integer64AttributeType),
+            makeAttribute(name: StoredChunkProperty.startOffset.rawValue, type: .integer64AttributeType),
+            makeAttribute(name: StoredChunkProperty.endOffset.rawValue, type: .integer64AttributeType),
+            makeAttribute(name: StoredChunkProperty.updatedAt.rawValue, type: .dateAttributeType),
+        ]
+        // Chunk IDs are unique store-wide so upserts can match existing rows.
+        indexedChunkEntity.uniquenessConstraints = [[StoredChunkProperty.id.rawValue]]
+
+        let semanticStateEntity = NSEntityDescription()
+        semanticStateEntity.name = EntityName.semanticState.rawValue
+        semanticStateEntity.managedObjectClassName = NSStringFromClass(NSManagedObject.self)
+        semanticStateEntity.properties = [
+            makeAttribute(name: SemanticStateProperty.documentID.rawValue, type: .stringAttributeType),
+            makeAttribute(name: SemanticStateProperty.statusRaw.rawValue, type: .stringAttributeType),
+            makeAttribute(name: SemanticStateProperty.fingerprintData.rawValue, type: .binaryDataAttributeType, isOptional: true),
+            makeAttribute(name: SemanticStateProperty.lastIndexedAt.rawValue, type: .dateAttributeType, isOptional: true),
+            makeAttribute(name: SemanticStateProperty.lastFailure.rawValue, type: .stringAttributeType, isOptional: true),
+            makeAttribute(name: SemanticStateProperty.updatedAt.rawValue, type: .dateAttributeType),
+        ]
+        // Exactly one state row per document.
+        semanticStateEntity.uniquenessConstraints = [[SemanticStateProperty.documentID.rawValue]]
+
+        model.entities = [indexedChunkEntity, semanticStateEntity]
+        return model
+    }
+
+    /// Convenience builder for a single attribute description.
+    private static func makeAttribute(
+        name: String,
+        type: NSAttributeType,
+        isOptional: Bool = false
+    ) -> NSAttributeDescription {
+        let attribute = NSAttributeDescription()
+        attribute.name = name
+        attribute.attributeType = type
+        attribute.isOptional = isOptional
+        return attribute
+    }
+
+    /// Inserts new chunk rows or overwrites existing ones matched by chunk ID.
+    public func upsert(_ chunks: [IndexedChunk]) async throws {
+        guard !chunks.isEmpty else {
+            return
+        }
+
+        try await performWrite { context in
+            let existingChunks = try Self.fetchStoredChunks(
+                matching: chunks.map(\.chunk.id.rawValue),
+                in: context
+            )
+            var existingByID: [String: NSManagedObject] = Dictionary(
+                uniqueKeysWithValues: existingChunks.compactMap { storedChunk in
+                    guard let id = storedChunk.value(forKey: StoredChunkProperty.id.rawValue) as? String else {
+                        return nil
+                    }
+
+                    return (id, storedChunk)
+                }
+            )
+
+            for indexedChunk in chunks {
+                let storedChunk = existingByID[indexedChunk.chunk.id.rawValue]
+                    ?? NSEntityDescription.insertNewObject(
+                        forEntityName: EntityName.indexedChunk.rawValue,
+                        into: context
+                    )
+                // Re-registering handles duplicate IDs within `chunks`: later entries update the same row.
+                existingByID[indexedChunk.chunk.id.rawValue] = storedChunk
+                try Self.apply(indexedChunk: indexedChunk, to: storedChunk)
+            }
+        }
+    }
+
+    /// Exhaustive cosine-similarity search over every persisted chunk, filtered and truncated per `query`.
+    /// NOTE(review): this decodes the entire store per query — acceptable for v1 corpus sizes; revisit before scaling.
+    public func search(_ query: SearchQuery, embedding: EmbeddingVector) async throws -> [SearchResult] {
+        guard query.limit > 0 else {
+            return []
+        }
+
+        let indexedChunks = try await performRead { context in
+            let request = NSFetchRequest(entityName: EntityName.indexedChunk.rawValue)
+            return try context.fetch(request).map(Self.makeIndexedChunk)
+        }
+
+        let ranked = indexedChunks.compactMap { indexedChunk -> SearchResult? 
in
+            if let filter = query.filter, !filter.matches(indexedChunk.chunk.metadata) {
+                return nil
+            }
+
+            let score = embedding.cosineSimilarity(to: indexedChunk.embedding)
+            return SearchResult(chunk: indexedChunk.chunk, score: score)
+        }
+
+        // Score ties break deterministically by chunk ID so results are stable across runs.
+        return ranked
+            .sorted { lhs, rhs in
+                if lhs.score == rhs.score {
+                    return lhs.chunk.id.rawValue < rhs.chunk.id.rawValue
+                }
+
+                return lhs.score > rhs.score
+            }
+            .prefix(query.limit)
+            .map { $0 }
+    }
+
+    /// Deletes every chunk for `documentID` and records the document's semantic state as `.missing`.
+    public func removeChunks(for documentID: DocumentID) async throws {
+        try await performWrite { context in
+            let request = NSFetchRequest(entityName: EntityName.indexedChunk.rawValue)
+            request.predicate = NSPredicate(
+                format: "%K == %@",
+                StoredChunkProperty.documentID.rawValue,
+                documentID.rawValue
+            )
+
+            for storedChunk in try context.fetch(request) {
+                context.delete(storedChunk)
+            }
+
+            try Self.upsertState(
+                documentID: documentID,
+                status: .missing,
+                fingerprint: nil,
+                lastIndexedAt: nil,
+                lastFailure: nil,
+                in: context
+            )
+        }
+    }
+
+    /// Clears every chunk row and every semantic-state row.
+    /// Per-object deletion (rather than NSBatchDeleteRequest) keeps in-memory stores supported.
+    public func removeAll() async throws {
+        try await performWrite { context in
+            let request = NSFetchRequest(entityName: EntityName.indexedChunk.rawValue)
+            for storedChunk in try context.fetch(request) {
+                context.delete(storedChunk)
+            }
+
+            let stateRequest = NSFetchRequest(entityName: EntityName.semanticState.rawValue)
+            for storedState in try context.fetch(stateRequest) {
+                context.delete(storedState)
+            }
+        }
+    }
+
+    /// Returns the persisted semantic-index state for `documentID`, if any.
+    public func state(for documentID: DocumentID) async throws -> SemanticIndexState? 
{
+        try await performRead { context in
+            try Self.fetchStoredState(for: documentID, in: context).map(Self.makeSemanticIndexState)
+        }
+    }
+
+    /// Returns the states for `documentIDs`, sorted by document ID for deterministic output.
+    public func states(for documentIDs: [DocumentID]) async throws -> [SemanticIndexState] {
+        try await performRead { context in
+            try Self.fetchStoredStates(for: documentIDs, in: context)
+                .map(Self.makeSemanticIndexState)
+                .sorted { $0.documentID.rawValue < $1.documentID.rawValue }
+        }
+    }
+
+    /// Marks `documentID` as indexing-in-progress with the fingerprint being built.
+    public func markIndexing(documentID: DocumentID, fingerprint: SemanticIndexFingerprint) async throws {
+        try await performWrite { context in
+            try Self.upsertState(
+                documentID: documentID,
+                status: .indexing,
+                fingerprint: fingerprint,
+                lastIndexedAt: nil,
+                lastFailure: nil,
+                in: context
+            )
+        }
+    }
+
+    /// Marks `documentID` as fully indexed now, clearing any previous failure message.
+    public func markCurrent(documentID: DocumentID, fingerprint: SemanticIndexFingerprint) async throws {
+        try await performWrite { context in
+            try Self.upsertState(
+                documentID: documentID,
+                status: .current,
+                fingerprint: fingerprint,
+                lastIndexedAt: Date(),
+                lastFailure: nil,
+                in: context
+            )
+        }
+    }
+
+    /// Marks `documentID` stale, preserving its last-known fingerprint and index time.
+    public func markStale(documentID: DocumentID, reason: String?) async throws {
+        try await performWrite { context in
+            let currentState = try Self.fetchStoredState(for: documentID, in: context)
+                .map(Self.makeSemanticIndexState)
+            try Self.upsertState(
+                documentID: documentID,
+                status: .stale,
+                fingerprint: currentState?.fingerprint,
+                lastIndexedAt: currentState?.lastIndexedAt,
+                lastFailure: reason,
+                in: context
+            )
+        }
+    }
+
+    /// Marks `documentID` failed with `reason`; keeps the previous fingerprint when none is supplied.
+    public func markFailed(
+        documentID: DocumentID,
+        fingerprint: SemanticIndexFingerprint?,
+        reason: String
+    ) async throws {
+        try await performWrite { context in
+            let currentState = try Self.fetchStoredState(for: documentID, in: context)
+                .map(Self.makeSemanticIndexState)
+            try Self.upsertState(
+                documentID: documentID,
+                status: .failed,
+                fingerprint: fingerprint ?? 
currentState?.fingerprint, + lastIndexedAt: currentState?.lastIndexedAt, + lastFailure: reason, + in: context + ) + } + } + + private func performRead( + _ operation: @escaping @Sendable (NSManagedObjectContext) throws -> T + ) async throws -> T { + let context = managedObjectContext + return try await context.perform { + try operation(context) + } + } + + private func performWrite( + _ operation: @escaping @Sendable (NSManagedObjectContext) throws -> Void + ) async throws { + let context = managedObjectContext + try await context.perform { + do { + try operation(context) + + if context.hasChanges { + try context.save() + } + } catch { + context.rollback() + throw error + } + } + } +} + +private enum EntityName: String { + case indexedChunk = "RAGIndexedChunk" + case semanticState = "RAGSemanticIndexState" +} + +private enum StoredChunkProperty: String { + case id + case documentID + case text + case metadataData + case embeddingData + case chunkIndex + case startOffset + case endOffset + case updatedAt +} + +private enum SemanticStateProperty: String { + case documentID + case statusRaw + case fingerprintData + case lastIndexedAt + case lastFailure + case updatedAt +} diff --git a/Sources/RAGKit/DefaultChunker.swift b/Sources/RAGKit/DefaultChunker.swift index 1275d99..ab27a84 100644 --- a/Sources/RAGKit/DefaultChunker.swift +++ b/Sources/RAGKit/DefaultChunker.swift @@ -1,8 +1,8 @@ import RAGCore public struct DefaultChunker: Chunker, Sendable { - private let paragraphChunker: ParagraphChunker - private let markdownChunker: HeadingAwareMarkdownChunker + let paragraphChunker: ParagraphChunker + let markdownChunker: HeadingAwareMarkdownChunker public init( paragraphChunker: ParagraphChunker = ParagraphChunker(), @@ -14,10 +14,10 @@ public struct DefaultChunker: Chunker, Sendable { public func chunks(for document: Document) throws -> [Chunk] { switch document.content { - case .text: - return try paragraphChunker.chunks(for: document) - case .markdown: - return try 
markdownChunker.chunks(for: document) + case .text: + return try paragraphChunker.chunks(for: document) + case .markdown: + return try markdownChunker.chunks(for: document) } } } diff --git a/Sources/RAGKit/HeadingAwareMarkdownChunker.swift b/Sources/RAGKit/HeadingAwareMarkdownChunker.swift index 9a8d1ca..3d6ab0e 100644 --- a/Sources/RAGKit/HeadingAwareMarkdownChunker.swift +++ b/Sources/RAGKit/HeadingAwareMarkdownChunker.swift @@ -7,8 +7,9 @@ public enum MarkdownLinkDestinationMetadataMode: Sendable { } public struct HeadingAwareMarkdownChunker: Chunker, Sendable { + let linkDestinationMetadataMode: MarkdownLinkDestinationMetadataMode + private let paragraphChunker: ParagraphChunker - private let linkDestinationMetadataMode: MarkdownLinkDestinationMetadataMode public init( paragraphChunker: ParagraphChunker = ParagraphChunker(), @@ -19,7 +20,7 @@ public struct HeadingAwareMarkdownChunker: Chunker, Sendable { } public func chunks(for document: Document) throws -> [Chunk] { - guard case .markdown(let text) = document.content else { + guard case let .markdown(text) = document.content else { return try paragraphChunker.chunks(for: document) } @@ -32,6 +33,7 @@ public struct HeadingAwareMarkdownChunker: Chunker, Sendable { guard scanResult.shouldFallbackToParagraphChunker else { return [] } + return try paragraphChunker.chunks(for: document) } diff --git a/Sources/RAGKit/KnowledgeBase+NaturalLanguage.swift b/Sources/RAGKit/KnowledgeBase+NaturalLanguage.swift index 8c9a575..26d1207 100644 --- a/Sources/RAGKit/KnowledgeBase+NaturalLanguage.swift +++ b/Sources/RAGKit/KnowledgeBase+NaturalLanguage.swift @@ -1,7 +1,7 @@ import RAGCore -extension KnowledgeBase { - public static func hashingDefault(dimension: Int = 64) async throws -> KnowledgeBase { +public extension KnowledgeBase { + static func hashingDefault(dimension: Int = 64) async throws -> KnowledgeBase { KnowledgeBase( chunker: DefaultChunker(), embedder: HashingEmbedder(dimension: dimension), @@ -9,11 +9,33 @@ 
extension KnowledgeBase { ) } - public static func naturalLanguageDefault(languageHint: String? = nil) async throws -> KnowledgeBase { + static func naturalLanguageDefault(languageHint: String? = nil) async throws -> KnowledgeBase { try KnowledgeBase( chunker: DefaultChunker(), embedder: NaturalLanguageEmbedder(languageHint: languageHint), index: InMemoryVectorIndex() ) } + + static func persistentHashingDefault( + configuration: CoreDataVectorIndex.Configuration, + dimension: Int = 64 + ) async throws -> KnowledgeBase { + try await KnowledgeBase( + chunker: DefaultChunker(), + embedder: HashingEmbedder(dimension: dimension), + index: CoreDataVectorIndex(configuration: configuration) + ) + } + + static func persistentNaturalLanguageDefault( + configuration: CoreDataVectorIndex.Configuration, + languageHint: String? = nil + ) async throws -> KnowledgeBase { + try await KnowledgeBase( + chunker: DefaultChunker(), + embedder: NaturalLanguageEmbedder(languageHint: languageHint), + index: CoreDataVectorIndex(configuration: configuration) + ) + } } diff --git a/Sources/RAGKit/KnowledgeBase.swift b/Sources/RAGKit/KnowledgeBase.swift index f946e5e..b0bba71 100644 --- a/Sources/RAGKit/KnowledgeBase.swift +++ b/Sources/RAGKit/KnowledgeBase.swift @@ -32,28 +32,54 @@ public actor KnowledgeBase { } public func addDocument(_ document: Document) async throws { - let chunks = try chunker.chunks(for: document) - let embeddings = try await embedder.embed(chunks: chunks) + let fingerprint = SemanticFingerprintFactory.fingerprint( + for: document, + chunker: chunker, + embedder: embedder + ) + let stateStore = index as? 
any SemanticIndexStateStore + + do { + try await stateStore?.markIndexing(documentID: document.id, fingerprint: fingerprint) + let chunks = try chunker.chunks(for: document) + let embeddings = try await embedder.embed(chunks: chunks) + + guard chunks.count == embeddings.count else { + throw KnowledgeBaseError.embedderReturnedUnexpectedVectorCount( + expected: chunks.count, + actual: embeddings.count + ) + } - guard chunks.count == embeddings.count else { - throw KnowledgeBaseError.embedderReturnedUnexpectedVectorCount( - expected: chunks.count, - actual: embeddings.count - ) - } + let indexedChunks = zip(chunks, embeddings).map { chunk, embedding in + IndexedChunk(chunk: chunk, embedding: embedding) + } - let indexedChunks = zip(chunks, embeddings).map { chunk, embedding in - IndexedChunk(chunk: chunk, embedding: embedding) + try await index.removeChunks(for: document.id) + try await index.upsert(indexedChunks) + try await stateStore?.markCurrent(documentID: document.id, fingerprint: fingerprint) + } catch { + try? await stateStore?.markFailed( + documentID: document.id, + fingerprint: fingerprint, + reason: String(describing: error) + ) + throw error } - - try await index.removeChunks(for: document.id) - try await index.upsert(indexedChunks) } public func removeDocument(_ documentID: DocumentID) async throws { try await index.removeChunks(for: documentID) } + public func semanticIndexState(for documentID: DocumentID) async throws -> SemanticIndexState? { + try await (index as? any SemanticIndexStateStore)?.state(for: documentID) + } + + public func semanticIndexStates(for documentIDs: [DocumentID]) async throws -> [SemanticIndexState] { + try await (index as? any SemanticIndexStateStore)?.states(for: documentIDs) ?? 
[] + } + public func search(_ query: SearchQuery) async throws -> [SearchResult] { let embedding = try await embedder.embed(query: query) return try await index.search(query, embedding: embedding) @@ -69,7 +95,7 @@ public actor KnowledgeBase { public func makeContext( for query: SearchQuery, - budget: ContextBudget = .characters(4_000), + budget: ContextBudget = .characters(4000), style: ContextStyle = .plain ) async throws -> String { let results = try await search(query) @@ -92,7 +118,6 @@ public actor KnowledgeBase { guard remainingBudget != 0 else { break } - guard let section = renderSection( result: result, style: style, @@ -126,7 +151,7 @@ public actor KnowledgeBase { for query: String, limit: Int = 5, filter: MetadataFilter? = nil, - budget: ContextBudget = .characters(4_000), + budget: ContextBudget = .characters(4000), style: ContextStyle = .plain ) async throws -> String { try await makeContext( @@ -143,35 +168,35 @@ public actor KnowledgeBase { limit: Int? ) -> ContextSection? 
{ switch style { - case .plain: - let fittedBody = fittedText(result.chunk.text, limit: limit) - guard !fittedBody.isEmpty else { - return nil - } - - return ContextSection( - text: fittedBody, - comparisonText: normalizedComparisonText(result.chunk.text) - ) - case .annotated: - let score = String(format: "%.4f", result.score) - let header = annotatedHeader( - for: result, - score: score, - startsNewDocument: startsNewDocument - ) - guard let fittedBody = fittedAnnotatedBody( - result.chunk.text, - prefix: header, - limit: limit - ) else { - return nil - } - - return ContextSection( - text: "\(header)\n\(fittedBody)", - comparisonText: normalizedComparisonText(result.chunk.text) - ) + case .plain: + let fittedBody = fittedText(result.chunk.text, limit: limit) + guard !fittedBody.isEmpty else { + return nil + } + + return ContextSection( + text: fittedBody, + comparisonText: normalizedComparisonText(result.chunk.text) + ) + case .annotated: + let score = String(format: "%.4f", result.score) + let header = annotatedHeader( + for: result, + score: score, + startsNewDocument: startsNewDocument + ) + guard let fittedBody = fittedAnnotatedBody( + result.chunk.text, + prefix: header, + limit: limit + ) else { + return nil + } + + return ContextSection( + text: "\(header)\n\(fittedBody)", + comparisonText: normalizedComparisonText(result.chunk.text) + ) } } @@ -181,10 +206,10 @@ public actor KnowledgeBase { separatorCount: Int ) -> Int? 
{ switch budget { - case .characters(let limit): - return max(0, limit - currentCharacterCount - separatorCount) - case .unlimited: - return nil + case let .characters(limit): + return max(0, limit - currentCharacterCount - separatorCount) + case .unlimited: + return nil } } @@ -290,7 +315,6 @@ public actor KnowledgeBase { guard let limit else { return text } - guard limit > 0 else { return "" } diff --git a/Sources/RAGKit/NaturalLanguageEmbedder.swift b/Sources/RAGKit/NaturalLanguageEmbedder.swift index 63c0aa5..cf21ac5 100644 --- a/Sources/RAGKit/NaturalLanguageEmbedder.swift +++ b/Sources/RAGKit/NaturalLanguageEmbedder.swift @@ -1,13 +1,16 @@ import RAGCore public struct NaturalLanguageEmbedder: Embedder, Sendable { + let languageHint: String? private let backend: any ContextualEmbeddingBackend public init(languageHint: String? = nil) throws { - self.backend = try AppleContextualEmbeddingBackend(languageHint: languageHint) + self.languageHint = languageHint + backend = try AppleContextualEmbeddingBackend(languageHint: languageHint) } init(backend: any ContextualEmbeddingBackend) { + languageHint = nil self.backend = backend } diff --git a/Sources/RAGKit/SemanticFingerprints.swift b/Sources/RAGKit/SemanticFingerprints.swift new file mode 100644 index 0000000..af2d239 --- /dev/null +++ b/Sources/RAGKit/SemanticFingerprints.swift @@ -0,0 +1,121 @@ +import Foundation +import RAGCore + +extension ParagraphChunker: SemanticFingerprintProviding { + public var semanticFingerprint: String { + "ragkit.paragraph-chunker.v1" + } +} + +extension HeadingAwareMarkdownChunker: SemanticFingerprintProviding { + public var semanticFingerprint: String { + switch linkDestinationMetadataMode { + case .omit: + "ragkit.heading-aware-markdown.v1.links-omit" + case .include: + "ragkit.heading-aware-markdown.v1.links-include" + } + } +} + +extension DefaultChunker: SemanticFingerprintProviding { + public var semanticFingerprint: String { + 
"\(paragraphChunker.semanticFingerprint)|\(markdownChunker.semanticFingerprint)"
+    }
+}
+
+extension HashingEmbedder: SemanticFingerprintProviding {
+    /// Fingerprint varies with the embedding dimension, which changes vector geometry.
+    public var semanticFingerprint: String {
+        "ragkit.hashing.\(dimension)"
+    }
+}
+
+extension NaturalLanguageEmbedder: SemanticFingerprintProviding {
+    /// Fingerprint varies with the language hint; "automatic" stands in for nil.
+    public var semanticFingerprint: String {
+        let language = languageHint ?? "automatic"
+        return "ragkit.apple-natural-language.\(language)"
+    }
+}
+
+/// Builds the (source, chunker, embedder) fingerprint triple used to decide whether a
+/// document's persisted semantic index is still current.
+enum SemanticFingerprintFactory {
+    static func fingerprint(
+        for document: Document,
+        chunker: any Chunker,
+        embedder: any Embedder
+    ) -> SemanticIndexFingerprint {
+        SemanticIndexFingerprint(
+            source: sourceFingerprint(for: document),
+            chunker: componentFingerprint(for: chunker, fallbackPrefix: "chunker"),
+            embedder: componentFingerprint(for: embedder, fallbackPrefix: "embedder")
+        )
+    }
+
+    /// Uses the component's own fingerprint when it provides one; otherwise falls back to its
+    /// fully-qualified type name, which still changes if the implementation type changes.
+    private static func componentFingerprint(
+        for component: Any,
+        fallbackPrefix: String
+    ) -> String {
+        if let provider = component as? any SemanticFingerprintProviding {
+            return provider.semanticFingerprint
+        }
+
+        return "custom.\(fallbackPrefix).\(String(reflecting: type(of: component)))"
+    }
+
+    /// Stable FNV-1a digest over the document's ID, content kind + text, and sorted metadata.
+    private static func sourceFingerprint(for document: Document) -> String {
+        var hasher = StableFNV1A64()
+        hasher.append("document-id")
+        hasher.append(document.id.rawValue)
+        hasher.append("content-kind")
+
+        switch document.content {
+        case let .text(text):
+            hasher.append("text")
+            hasher.append(text)
+        case let .markdown(markdown):
+            hasher.append("markdown")
+            hasher.append(markdown)
+        }
+
+        // Keys are sorted so dictionary iteration order cannot change the fingerprint.
+        hasher.append("metadata")
+        for key in document.metadata.values.keys.sorted() {
+            hasher.append(key)
+            hasher.append(metadataValueDescription(document.metadata.values[key]))
+        }
+
+        return hasher.hexDigest
+    }
+
+    /// Canonical, locale-independent textual form for each metadata value.
+    private static func metadataValueDescription(_ value: MetadataValue?) 
-> String {
+        guard let value else {
+            return "nil"
+        }
+
+        switch value {
+        case let .string(string):
+            return "string:\(string)"
+        case let .int(int):
+            return "int:\(int)"
+        case let .double(double):
+            // %.17g round-trips every Double; the POSIX locale pins the decimal separator.
+            return "double:\(String(format: "%.17g", locale: Locale(identifier: "en_US_POSIX"), double))"
+        case let .bool(bool):
+            return "bool:\(bool)"
+        case let .date(date):
+            return "date:\(date.timeIntervalSince1970)"
+        }
+    }
+}
+
+/// Minimal 64-bit FNV-1a hasher. A 0xFF terminator is folded in after each appended field so
+/// that concatenated fields cannot collide (e.g. "ab"+"c" vs "a"+"bc").
+private struct StableFNV1A64 {
+    // FNV-1a 64-bit offset basis.
+    private var value: UInt64 = 14_695_981_039_346_656_037
+
+    var hexDigest: String {
+        String(format: "%016llx", value)
+    }
+
+    mutating func append(_ text: String) {
+        for byte in text.utf8 {
+            value = (value ^ UInt64(byte)) &* 1_099_511_628_211
+        }
+        value = (value ^ 0xFF) &* 1_099_511_628_211
+    }
+}
diff --git a/Sources/RAGKit/SemanticIndexState.swift b/Sources/RAGKit/SemanticIndexState.swift
new file mode 100644
index 0000000..595177d
--- /dev/null
+++ b/Sources/RAGKit/SemanticIndexState.swift
@@ -0,0 +1,68 @@
+import Foundation
+import RAGCore
+
+/// Lifecycle of a document's semantic index entry.
+public enum SemanticIndexStatus: String, Hashable, Codable, Sendable {
+    case missing
+    case indexing
+    case current
+    case stale
+    case failed
+}
+
+/// Identifies the exact (source content, chunker, embedder) combination an index entry was built from.
+public struct SemanticIndexFingerprint: Hashable, Codable, Sendable {
+    public var source: String
+    public var chunker: String
+    public var embedder: String
+
+    public init(
+        source: String,
+        chunker: String,
+        embedder: String
+    ) {
+        self.source = source
+        self.chunker = chunker
+        self.embedder = embedder
+    }
+}
+
+/// Persisted bookkeeping for one document's semantic index.
+public struct SemanticIndexState: Hashable, Codable, Sendable {
+    public var documentID: DocumentID
+    public var status: SemanticIndexStatus
+    public var fingerprint: SemanticIndexFingerprint?
+    public var lastIndexedAt: Date?
+    public var lastFailure: String?
+    public var updatedAt: Date
+
+    public init(
+        documentID: DocumentID,
+        status: SemanticIndexStatus,
+        fingerprint: SemanticIndexFingerprint? = nil,
+        lastIndexedAt: Date? = nil,
+        lastFailure: String? 
= nil,
+        updatedAt: Date = .now
+    ) {
+        self.documentID = documentID
+        self.status = status
+        self.fingerprint = fingerprint
+        self.lastIndexedAt = lastIndexedAt
+        self.lastFailure = lastFailure
+        self.updatedAt = updatedAt
+    }
+}
+
+/// Adopted by vector indexes that can persist per-document semantic index state.
+public protocol SemanticIndexStateStore: Sendable {
+    func state(for documentID: DocumentID) async throws -> SemanticIndexState?
+    func states(for documentIDs: [DocumentID]) async throws -> [SemanticIndexState]
+    func markIndexing(documentID: DocumentID, fingerprint: SemanticIndexFingerprint) async throws
+    func markCurrent(documentID: DocumentID, fingerprint: SemanticIndexFingerprint) async throws
+    func markStale(documentID: DocumentID, reason: String?) async throws
+    func markFailed(
+        documentID: DocumentID,
+        fingerprint: SemanticIndexFingerprint?,
+        reason: String
+    ) async throws
+}
+
+/// Adopted by chunkers/embedders that can describe their configuration as a stable fingerprint string.
+public protocol SemanticFingerprintProviding: Sendable {
+    var semanticFingerprint: String { get }
+}
diff --git a/Sources/SwiftlyFetch/CoreDataSwiftlyFetchSemanticRetryStore.swift b/Sources/SwiftlyFetch/CoreDataSwiftlyFetchSemanticRetryStore.swift
new file mode 100644
index 0000000..d62f80b
--- /dev/null
+++ b/Sources/SwiftlyFetch/CoreDataSwiftlyFetchSemanticRetryStore.swift
@@ -0,0 +1,248 @@
+import CoreData
+import FetchCore
+import Foundation
+
+/// Core Data-backed store for the semantic indexing retries queued by SwiftlyFetch.
+public actor CoreDataSwiftlyFetchSemanticRetryStore: SwiftlyFetchSemanticRetryStore {
+    public struct Configuration: Hashable, Sendable {
+        public enum Store: Hashable, Sendable {
+            case inMemory
+            case sqlite(URL)
+        }
+
+        public var store: Store
+
+        public init(store: Store = .inMemory) {
+            self.store = store
+        }
+
+        public static let inMemory = Configuration(store: .inMemory)
+    }
+
+    public enum StoreError: Error, LocalizedError {
+        case loadFailed(String)
+        case decodeFailed(String)
+
+        public var errorDescription: String? {
+            switch self {
+            case let .loadFailed(message):
+                "SwiftlyFetch could not load the semantic retry Core Data store. 
\(message)"
+            case let .decodeFailed(message):
+                "SwiftlyFetch could not decode a persisted semantic retry record. \(message)"
+            }
+        }
+    }
+
+    // Name of the programmatic NSManagedObjectModel built in makeModel().
+    private static let modelName = "SwiftlyFetchSemanticRetryStore"
+
+    private let persistentContainer: NSPersistentContainer
+    private let managedObjectContext: NSManagedObjectContext
+
+    /// Loads the persistent container for `configuration` and prepares a background context.
+    /// - Throws: `StoreError.loadFailed` when the persistent store cannot be loaded.
+    public init(configuration: Configuration = .inMemory) async throws {
+        let persistentContainer = try await Self.makePersistentContainer(configuration: configuration)
+        self.persistentContainer = persistentContainer
+        managedObjectContext = Self.makeManagedObjectContext(using: persistentContainer)
+    }
+
+    /// Returns the retry row for `documentID`, if one exists (document IDs are unique).
+    private static func fetchStoredRetry(
+        for documentID: FetchDocumentID,
+        in context: NSManagedObjectContext
+    ) throws -> NSManagedObject? {
+        let request = NSFetchRequest(entityName: EntityName.retry.rawValue)
+        request.predicate = NSPredicate(
+            format: "%K == %@",
+            RetryProperty.documentID.rawValue,
+            documentID.rawValue
+        )
+        request.fetchLimit = 1
+        return try context.fetch(request).first
+    }
+
+    /// Rebuilds a `SwiftlyFetchSemanticRetry` from a persisted row.
+    /// - Throws: `StoreError.decodeFailed` when a required field is missing or has an unknown operation.
+    private static func makeRetry(from storedRetry: NSManagedObject) throws -> SwiftlyFetchSemanticRetry {
+        guard let documentID = storedRetry.value(forKey: RetryProperty.documentID.rawValue) as? String,
+              let operationRaw = storedRetry.value(forKey: RetryProperty.operation.rawValue) as? String,
+              let operation = SwiftlyFetchSemanticRetryOperation(rawValue: operationRaw),
+              let reason = storedRetry.value(forKey: RetryProperty.reason.rawValue) as? String,
+              let createdAt = storedRetry.value(forKey: RetryProperty.createdAt.rawValue) as? Date
+        else {
+            throw StoreError.decodeFailed("A semantic retry entry is missing its document ID, operation, reason, or creation date.")
+        }
+
+        let attemptCount = (storedRetry.value(forKey: RetryProperty.attemptCount.rawValue) as? Int64).map(Int.init) ?? 
0
+
+        return SwiftlyFetchSemanticRetry(
+            documentID: FetchDocumentID(documentID),
+            operation: operation,
+            reason: reason,
+            attemptCount: attemptCount,
+            createdAt: createdAt,
+            lastAttemptAt: storedRetry.value(forKey: RetryProperty.lastAttemptAt.rawValue) as? Date,
+            nextRetryAt: storedRetry.value(forKey: RetryProperty.nextRetryAt.rawValue) as? Date,
+            lastFailure: storedRetry.value(forKey: RetryProperty.lastFailure.rawValue) as? String
+        )
+    }
+
+    // NOTE(review): force-unwraps the entity description; safe only while this entity name stays
+    // in sync with makeModel() — consider failing with a descriptive precondition instead.
+    private static func retryEntity(in context: NSManagedObjectContext) -> NSEntityDescription {
+        NSEntityDescription.entity(forEntityName: EntityName.retry.rawValue, in: context)!
+    }
+
+    // NOTE(review): unlike the RAGKit store's context, this one does not set
+    // automaticallyMergesChangesFromParent — confirm this single background context is the only writer.
+    private static func makeManagedObjectContext(using container: NSPersistentContainer) -> NSManagedObjectContext {
+        let context = container.newBackgroundContext()
+        context.mergePolicy = NSMergePolicy(merge: .mergeByPropertyObjectTrumpMergePolicyType)
+        return context
+    }
+
+    /// Creates the container for `configuration` and loads its store, surfacing failures as `StoreError.loadFailed`.
+    private static func makePersistentContainer(configuration: Configuration) async throws -> NSPersistentContainer {
+        let container = NSPersistentContainer(name: modelName, managedObjectModel: makeModel())
+        let description: NSPersistentStoreDescription
+
+        switch configuration.store {
+        case .inMemory:
+            description = NSPersistentStoreDescription()
+            description.type = NSInMemoryStoreType
+        case let .sqlite(url):
+            description = NSPersistentStoreDescription(url: url)
+            description.type = NSSQLiteStoreType
+        }
+
+        container.persistentStoreDescriptions = [description]
+
+        return try await withCheckedThrowingContinuation { continuation in
+            container.loadPersistentStores { _, error in
+                if let error {
+                    continuation.resume(
+                        throwing: StoreError.loadFailed(String(describing: error))
+                    )
+                } else {
+                    continuation.resume(returning: container)
+                }
+            }
+        }
+    }
+
+    /// Programmatic model: a single retry entity keyed uniquely by document ID.
+    private static func makeModel() -> NSManagedObjectModel {
+        let model = NSManagedObjectModel()
+        let retryEntity = NSEntityDescription()
+        retryEntity.name = EntityName.retry.rawValue
+
retryEntity.managedObjectClassName = NSStringFromClass(NSManagedObject.self) + retryEntity.properties = [ + stringAttribute(RetryProperty.documentID.rawValue), + stringAttribute(RetryProperty.operation.rawValue), + stringAttribute(RetryProperty.reason.rawValue), + integerAttribute(RetryProperty.attemptCount.rawValue), + dateAttribute(RetryProperty.createdAt.rawValue), + dateAttribute(RetryProperty.lastAttemptAt.rawValue, optional: true), + dateAttribute(RetryProperty.nextRetryAt.rawValue, optional: true), + stringAttribute(RetryProperty.lastFailure.rawValue, optional: true), + ] + retryEntity.uniquenessConstraints = [[RetryProperty.documentID.rawValue]] + model.entities = [retryEntity] + return model + } + + private static func stringAttribute(_ name: String, optional: Bool = false) -> NSAttributeDescription { + let attribute = NSAttributeDescription() + attribute.name = name + attribute.attributeType = .stringAttributeType + attribute.isOptional = optional + return attribute + } + + private static func integerAttribute(_ name: String) -> NSAttributeDescription { + let attribute = NSAttributeDescription() + attribute.name = name + attribute.attributeType = .integer64AttributeType + attribute.isOptional = false + attribute.defaultValue = 0 + return attribute + } + + private static func dateAttribute(_ name: String, optional: Bool = false) -> NSAttributeDescription { + let attribute = NSAttributeDescription() + attribute.name = name + attribute.attributeType = .dateAttributeType + attribute.isOptional = optional + return attribute + } + + public func upsert(_ retry: SwiftlyFetchSemanticRetry) async throws { + try await performWrite { context in + let storedRetry = try Self.fetchStoredRetry(for: retry.documentID, in: context) + ?? 
NSManagedObject(entity: Self.retryEntity(in: context), insertInto: context) + + storedRetry.setValue(retry.documentID.rawValue, forKey: RetryProperty.documentID.rawValue) + storedRetry.setValue(retry.operation.rawValue, forKey: RetryProperty.operation.rawValue) + storedRetry.setValue(retry.reason, forKey: RetryProperty.reason.rawValue) + storedRetry.setValue(Int64(retry.attemptCount), forKey: RetryProperty.attemptCount.rawValue) + storedRetry.setValue(retry.createdAt, forKey: RetryProperty.createdAt.rawValue) + storedRetry.setValue(retry.lastAttemptAt, forKey: RetryProperty.lastAttemptAt.rawValue) + storedRetry.setValue(retry.nextRetryAt, forKey: RetryProperty.nextRetryAt.rawValue) + storedRetry.setValue(retry.lastFailure, forKey: RetryProperty.lastFailure.rawValue) + } + } + + public func pendingRetries(limit: Int? = nil) async throws -> [SwiftlyFetchSemanticRetry] { + try await performRead { context in + let request = NSFetchRequest(entityName: EntityName.retry.rawValue) + request.sortDescriptors = [ + NSSortDescriptor(key: RetryProperty.createdAt.rawValue, ascending: true), + NSSortDescriptor(key: RetryProperty.documentID.rawValue, ascending: true), + ] + + if let limit { + request.fetchLimit = max(0, limit) + } + + return try context.fetch(request).map(Self.makeRetry(from:)) + } + } + + public func removeRetries(for documentIDs: [FetchDocumentID]) async throws { + let uniqueIDs = Set(documentIDs) + guard !uniqueIDs.isEmpty else { + return + } + + try await performWrite { context in + let request = NSFetchRequest(entityName: EntityName.retry.rawValue) + request.predicate = NSPredicate( + format: "%K IN %@", + RetryProperty.documentID.rawValue, + uniqueIDs.map(\.rawValue) + ) + + for storedRetry in try context.fetch(request) { + context.delete(storedRetry) + } + } + } + + private func performRead(_ work: @escaping @Sendable (NSManagedObjectContext) throws -> T) async throws -> T { + try await managedObjectContext.perform { [self] in + try 
work(self.managedObjectContext) + } + } + + private func performWrite(_ work: @escaping @Sendable (NSManagedObjectContext) throws -> Void) async throws { + try await managedObjectContext.perform { [self] in + try work(self.managedObjectContext) + if self.managedObjectContext.hasChanges { + try self.managedObjectContext.save() + } + } + } +} + +private enum EntityName: String { + case retry = "SwiftlyFetchSemanticRetry" +} + +private enum RetryProperty: String { + case documentID + case operation + case reason + case attemptCount + case createdAt + case lastAttemptAt + case nextRetryAt + case lastFailure +} diff --git a/Sources/SwiftlyFetch/SwiftlyFetchDocumentMapper.swift b/Sources/SwiftlyFetch/SwiftlyFetchDocumentMapper.swift new file mode 100644 index 0000000..1322da4 --- /dev/null +++ b/Sources/SwiftlyFetch/SwiftlyFetchDocumentMapper.swift @@ -0,0 +1,91 @@ +import FetchCore +import Foundation +import RAGCore + +public struct SwiftlyFetchDocumentMapper: Hashable, Sendable { + public init() {} + + public func documentID(for fetchDocumentID: FetchDocumentID) -> DocumentID { + DocumentID(fetchDocumentID.rawValue) + } + + public func document(from record: FetchDocumentRecord) -> Document { + Document( + id: documentID(for: record.id), + content: content(from: record), + metadata: metadata(from: record) + ) + } + + private func content(from record: FetchDocumentRecord) -> DocumentContent { + let body = semanticSourceText(from: record) + + switch record.contentType { + case .plainText: + return .text(body) + case .markdown: + return .markdown(body) + } + } + + private func semanticSourceText(from record: FetchDocumentRecord) -> String { + guard let title = normalizedTitle(from: record) else { + return record.body + } + + switch record.contentType { + case .plainText: + return "Title: \(title)\n\n\(record.body)" + case .markdown: + return "# \(title)\n\n\(record.body)" + } + } + + private func metadata(from record: FetchDocumentRecord) -> DocumentMetadata { + var values 
= record.metadata.mapValues(MetadataValue.string)
        values["contentType"] = .string(record.contentType.rawValue)

        if let title = normalizedTitle(from: record) {
            values["title"] = .string(title)
        }

        if let kind = record.kind {
            values["kind"] = .string(kind.rawValue)
        }

        if let language = record.language {
            values["language"] = .string(language)
        }

        if let sourceURI = record.sourceURI {
            values["sourceURI"] = .string(sourceURI)
        }

        if let createdAt = record.createdAt {
            values["createdAt"] = .date(createdAt)
        }

        if let updatedAt = record.updatedAt {
            values["updatedAt"] = .date(updatedAt)
        }

        return DocumentMetadata(values)
    }

    /// Collapses runs of whitespace to single spaces and returns `nil` for a
    /// missing or blank title.
    private func normalizedTitle(from record: FetchDocumentRecord) -> String? {
        guard let rawTitle = record.title else {
            return nil
        }

        let title = rawTitle
            .split(whereSeparator: \.isWhitespace)
            .joined(separator: " ")
            .trimmingCharacters(in: .whitespacesAndNewlines)

        guard !title.isEmpty else {
            return nil
        }

        return title
    }
}

// MARK: - Sources/SwiftlyFetch/SwiftlyFetchLibrary+Persistence.swift

#if os(macOS)
import FetchKit
import Foundation
import RAGKit

/// Describes where and how a persistent `SwiftlyFetchLibrary` keeps its state.
public struct SwiftlyFetchPersistentConfiguration: Hashable, Sendable {
    public enum StorageLocation: Hashable, Sendable {
        case directory(URL)
        case applicationSupportDirectory(appendingPath: String)

        public static let `default` = StorageLocation.applicationSupportDirectory(
            appendingPath: "SwiftlyFetch"
        )
    }

    public enum SemanticBackend: Hashable, Sendable {
        case hashing(dimension: Int)
        case naturalLanguage(languageHint: String?)

        public static let `default` = SemanticBackend.hashing(dimension: 64)
    }

    public var location: StorageLocation
    public var semanticBackend: SemanticBackend
    /// When `true`, queued semantic retries are drained as part of library init.
    public var retryPendingSyncsOnInit: Bool

    public init(
        location: StorageLocation = .default,
        semanticBackend: SemanticBackend = .default,
        retryPendingSyncsOnInit: Bool = true
    ) {
        self.location = location
        self.semanticBackend = semanticBackend
        self.retryPendingSyncsOnInit = retryPendingSyncsOnInit
    }

    public static let `default` = SwiftlyFetchPersistentConfiguration()
}

public extension SwiftlyFetchLibrary {
    enum PersistentLibraryError: Error, LocalizedError {
        case applicationSupportDirectoryUnavailable

        public var errorDescription: String? {
            switch self {
            case .applicationSupportDirectoryUnavailable:
                "SwiftlyFetch could not resolve the user Application Support directory for persistent library storage."
            }
        }
    }

    /// Resolved on-disk layout for one persistent library root.
    /// `internal` is deliberate: it downgrades the `public` default of this extension.
    internal struct ResolvedPersistentPaths: Hashable {
        let rootURL: URL
        let fetchKitDirectoryURL: URL
        let semanticIndexURL: URL
        let semanticRetryURL: URL
    }

    /// Creates (or reopens) a persistent library rooted per `configuration`,
    /// optionally draining queued semantic retries before returning.
    /// - Throws: `PersistentLibraryError`, file-system errors, and any error
    ///   raised while opening the underlying stores.
    static func macOSPersistentLibrary(
        configuration: SwiftlyFetchPersistentConfiguration = .default
    ) async throws -> SwiftlyFetchLibrary {
        let resolvedPaths = try configuration.resolvedPaths()

        try FileManager.default.createDirectory(
            at: resolvedPaths.rootURL,
            withIntermediateDirectories: true
        )

        let fetchLibrary = try await FetchKitLibrary.macOSPersistentLibrary(
            configuration: .init(
                location: .directory(resolvedPaths.fetchKitDirectoryURL),
                retryPendingSyncsOnInit: configuration.retryPendingSyncsOnInit
            )
        )
        let knowledgeBase = try await makePersistentKnowledgeBase(
            configuration: configuration,
            semanticIndexURL: resolvedPaths.semanticIndexURL
        )
        let retryStore = try await CoreDataSwiftlyFetchSemanticRetryStore(
            configuration: .init(store: .sqlite(resolvedPaths.semanticRetryURL))
        )

        let library = SwiftlyFetchLibrary(
            fetchLibrary: fetchLibrary,
            knowledgeBase: knowledgeBase,
            retryStore: retryStore
        )

        if configuration.retryPendingSyncsOnInit {
            // `retrySemanticIndexing` is @discardableResult, so no `_ =` needed;
            // failures still propagate as thrown errors.
            try await library.retrySemanticIndexing()
        }

        return library
    }

    /// Convenience overload rooting the library at an explicit directory.
    static func macOSPersistentLibrary(
        at directoryURL: URL,
        semanticBackend: SwiftlyFetchPersistentConfiguration.SemanticBackend = .default,
        retryPendingSyncsOnInit: Bool = true
    ) async throws -> SwiftlyFetchLibrary {
        try await macOSPersistentLibrary(
            configuration: .init(
                location: .directory(directoryURL),
                semanticBackend: semanticBackend,
                retryPendingSyncsOnInit: retryPendingSyncsOnInit
            )
        )
    }

    /// Builds the persistent knowledge base for the configured semantic backend.
    private static func makePersistentKnowledgeBase(
        configuration: SwiftlyFetchPersistentConfiguration,
        semanticIndexURL: URL
    ) async throws -> KnowledgeBase {
        let vectorConfiguration = CoreDataVectorIndex.Configuration(store: .sqlite(semanticIndexURL))

        switch configuration.semanticBackend {
        case let .hashing(dimension):
            return try await KnowledgeBase.persistentHashingDefault(
                configuration: vectorConfiguration,
                dimension: dimension
            )
        case let .naturalLanguage(languageHint):
            return try await KnowledgeBase.persistentNaturalLanguageDefault(
                configuration: vectorConfiguration,
                languageHint: languageHint
            )
        }
    }
}

extension SwiftlyFetchPersistentConfiguration {
    /// Derives the fixed file layout under the configured root directory.
    func resolvedPaths(fileManager: FileManager = .default) throws -> SwiftlyFetchLibrary.ResolvedPersistentPaths {
        let rootURL = try resolveRootURL(fileManager: fileManager)
        return .init(
            rootURL: rootURL,
            fetchKitDirectoryURL: rootURL.appendingPathComponent("FetchKit", isDirectory: true),
            semanticIndexURL: rootURL.appendingPathComponent("SemanticIndex.sqlite"),
            semanticRetryURL: rootURL.appendingPathComponent("SemanticRetries.sqlite")
        )
    }

    /// Resolves the storage root, expanding the Application Support shorthand.
    private func resolveRootURL(fileManager: FileManager) throws -> URL {
        switch location {
        case let .directory(url):
            return url
        case let .applicationSupportDirectory(appendingPath):
            guard let baseURL = fileManager.urls(for: .applicationSupportDirectory, in: .userDomainMask).first else {
                throw SwiftlyFetchLibrary.PersistentLibraryError.applicationSupportDirectoryUnavailable
            }

            return baseURL.appendingPathComponent(appendingPath, isDirectory: true)
        }
    }
}
#endif

// MARK: - Sources/SwiftlyFetch/SwiftlyFetchLibrary.swift

import FetchCore
import FetchKit
import Foundation
import RAGCore
import RAGKit

/// Facade pairing a conventional FetchKit corpus with a semantic knowledge base,
/// keeping the two in sync and queueing retries when semantic work fails.
public actor SwiftlyFetchLibrary {
    private let fetchLibrary: FetchKitLibrary
    private let knowledgeBase: KnowledgeBase
    private let retryStore: any SwiftlyFetchSemanticRetryStore
    private let documentMapper: SwiftlyFetchDocumentMapper

    public init(
        fetchLibrary: FetchKitLibrary,
        knowledgeBase: KnowledgeBase,
        retryStore: any SwiftlyFetchSemanticRetryStore,
        documentMapper: SwiftlyFetchDocumentMapper = SwiftlyFetchDocumentMapper()
    ) {
        self.fetchLibrary = fetchLibrary
        self.knowledgeBase = knowledgeBase
        self.retryStore = retryStore
        self.documentMapper = documentMapper
    }

    /// In-memory library: default corpus, hashing embeddings, in-memory retries.
    public static func `default`() async throws -> SwiftlyFetchLibrary {
        try await SwiftlyFetchLibrary(
            fetchLibrary: .default(),
            knowledgeBase: .hashingDefault(),
            retryStore: InMemorySwiftlyFetchSemanticRetryStore()
        )
    }

    /// Adds `record` to the corpus, then indexes it semantically
    /// (queueing a retry if the semantic step fails).
    @discardableResult
    public func addDocument(_ record: FetchDocumentRecord) async throws -> SwiftlyFetchMutationResult {
        let conventionalResult = try await fetchLibrary.addDocument(record)
        return try await indexSemantics(
            record,
            conventionalResult: conventionalResult
        )
    }

    /// Updates `record` in the corpus, then re-indexes it semantically.
    @discardableResult
    public func updateDocument(_ record: FetchDocumentRecord) async throws -> SwiftlyFetchMutationResult {
        let conventionalResult = try await fetchLibrary.updateDocument(record)
        return try await indexSemantics(
            record,
            conventionalResult: 
conventionalResult
        )
    }

    /// Removes the record from the corpus and the semantic index; when the
    /// semantic cleanup fails, the removal is queued for retry while the
    /// conventional removal still reports success.
    @discardableResult
    public func removeDocument(withID id: FetchDocumentID) async throws -> SwiftlyFetchMutationResult {
        let conventionalResult = try await fetchLibrary.removeDocument(withID: id)
        let semanticDocumentID = documentMapper.documentID(for: id)

        do {
            try await knowledgeBase.removeDocument(semanticDocumentID)
            try await retryStore.removeRetries(for: [id])

            return SwiftlyFetchMutationResult(
                documentIDs: conventionalResult.documentIDs,
                conventional: .succeeded,
                semantic: .succeeded(state: await bestEffortSemanticIndexState(for: id))
            )
        } catch {
            let retry = SwiftlyFetchSemanticRetry(
                documentID: id,
                operation: .removeDocument,
                reason: "SwiftlyFetch removed the corpus record, but semantic chunk cleanup failed.",
                lastFailure: String(describing: error)
            )
            try await retryStore.upsert(retry)

            return SwiftlyFetchMutationResult(
                documentIDs: conventionalResult.documentIDs,
                conventional: .succeeded,
                semantic: SwiftlyFetchSemanticMutationStage(
                    status: .queuedRetry,
                    state: await bestEffortSemanticIndexState(for: id),
                    retry: retry,
                    failureDescription: retry.lastFailure
                )
            )
        }
    }

    /// Conventional (keyword) search over the Fetch corpus.
    public func search(_ query: FetchSearchQuery) async throws -> [FetchSearchResult] {
        try await fetchLibrary.search(query)
    }

    /// Semantic retrieval over the knowledge base.
    public func retrieve(_ query: SearchQuery) async throws -> [SearchResult] {
        try await knowledgeBase.search(query)
    }

    /// Runs both query styles and pairs their results.
    public func searchAndRetrieve(
        _ query: SwiftlyFetchSearchAndRetrieveQuery
    ) async throws -> SwiftlyFetchSearchAndRetrieveResult {
        let conventionalResults = try await search(query.conventional)
        let semanticResults = try await retrieve(query.semantic)

        return SwiftlyFetchSearchAndRetrieveResult(
            conventional: conventionalResults,
            semantic: semanticResults
        )
    }

    /// Convenience overload taking the two queries separately.
    public func searchAndRetrieve(
        conventional conventionalQuery: FetchSearchQuery,
        semantic semanticQuery: SearchQuery
    ) async throws -> SwiftlyFetchSearchAndRetrieveResult {
        try await searchAndRetrieve(
            SwiftlyFetchSearchAndRetrieveQuery(
                conventional: conventionalQuery,
                semantic: semanticQuery
            )
        )
    }

    /// Replays queued semantic work: re-indexes or removes documents whose
    /// earlier semantic step failed.
    /// - Parameter limit: Caps how many *due* retries are attempted this pass.
    /// - Returns: A summary of completed, stale-removed, deferred, and failed retries.
    @discardableResult
    public func retrySemanticIndexing(limit: Int? = nil) async throws -> SwiftlyFetchSemanticRetryResult {
        // All retries are fetched so deferred entries can still be reported;
        // `limit` only caps the due retries attempted here. Due retries beyond
        // the cap stay queued untouched and are not reported in this result.
        let retries = try await retryStore.pendingRetries(limit: nil)
        var dueRetries: [SwiftlyFetchSemanticRetry] = []
        var completedDocumentIDs: [FetchDocumentID] = []
        var removedMissingDocumentIDs: [FetchDocumentID] = []
        var deferredDocumentIDs: [FetchDocumentID] = []
        var failedRetries: [SwiftlyFetchSemanticRetry] = []
        let now = Date()

        for retry in retries {
            if let nextRetryAt = retry.nextRetryAt, nextRetryAt > now {
                deferredDocumentIDs.append(retry.documentID)
            } else if limit.map({ dueRetries.count < max(0, $0) }) ?? true {
                dueRetries.append(retry)
            }
        }

        for retry in dueRetries {
            do {
                switch retry.operation {
                case .indexDocument:
                    // The corpus record may have vanished since the retry was
                    // queued; drop the stale retry instead of failing forever.
                    guard let record = try await fetchLibrary.document(withID: retry.documentID) else {
                        try await retryStore.removeRetries(for: [retry.documentID])
                        removedMissingDocumentIDs.append(retry.documentID)
                        continue
                    }

                    try await knowledgeBase.addDocument(documentMapper.document(from: record))
                case .removeDocument:
                    try await knowledgeBase.removeDocument(documentMapper.documentID(for: retry.documentID))
                }

                try await retryStore.removeRetries(for: [retry.documentID])
                completedDocumentIDs.append(retry.documentID)
            } catch {
                let failedRetry = failedSemanticRetry(from: retry, error: error)
                try await retryStore.upsert(failedRetry)
                failedRetries.append(failedRetry)
            }
        }

        return SwiftlyFetchSemanticRetryResult(
            completedDocumentIDs: uniqueDocumentIDs(completedDocumentIDs),
            removedMissingDocumentIDs: uniqueDocumentIDs(removedMissingDocumentIDs),
            deferredDocumentIDs: uniqueDocumentIDs(deferredDocumentIDs),
            failedRetries: failedRetries
        )
    }

    /// Indexes the stored corpus record semantically; on failure the work is
    /// queued as a retry and the mutation still reports conventional success.
    private func indexSemantics(
        _ record: FetchDocumentRecord,
        conventionalResult: FetchKitLibrary.BatchResult
    ) async throws -> SwiftlyFetchMutationResult {
        // Prefer the stored copy so the semantic index reflects what the corpus
        // actually persisted; fall back to the caller's record if unavailable.
        let storedRecord = try await fetchLibrary.document(withID: record.id) ?? record

        do {
            try await knowledgeBase.addDocument(documentMapper.document(from: storedRecord))
            try await retryStore.removeRetries(for: [record.id])

            return SwiftlyFetchMutationResult(
                documentIDs: conventionalResult.documentIDs,
                conventional: .succeeded,
                semantic: .succeeded(state: await bestEffortSemanticIndexState(for: record.id))
            )
        } catch {
            let retry = SwiftlyFetchSemanticRetry(
                documentID: record.id,
                operation: .indexDocument,
                reason: "SwiftlyFetch stored the corpus record, but semantic indexing failed.",
                lastFailure: String(describing: error)
            )
            try await retryStore.upsert(retry)

            return SwiftlyFetchMutationResult(
                documentIDs: conventionalResult.documentIDs,
                conventional: .succeeded,
                semantic: SwiftlyFetchSemanticMutationStage(
                    status: .queuedRetry,
                    state: await bestEffortSemanticIndexState(for: record.id),
                    retry: retry,
                    failureDescription: retry.lastFailure
                )
            )
        }
    }

    /// Looks up the semantic index state for a corpus document ID.
    private func semanticIndexState(for fetchDocumentID: FetchDocumentID) async throws -> SemanticIndexState? {
        try await knowledgeBase.semanticIndexState(
            for: documentMapper.documentID(for: fetchDocumentID)
        )
    }

    /// Best-effort probe of the semantic index state; errors become `nil`.
    private func bestEffortSemanticIndexState(for fetchDocumentID: FetchDocumentID) async -> SemanticIndexState? {
        try? await semanticIndexState(for: fetchDocumentID)
    }

    /// Records a failed attempt: bumps the counter and schedules the next try
    /// with a fixed 60-second backoff.
    private func failedSemanticRetry(
        from retry: SwiftlyFetchSemanticRetry,
        error: Error
    ) -> SwiftlyFetchSemanticRetry {
        let now = Date()
        var failedRetry = retry
        failedRetry.attemptCount += 1
        failedRetry.lastAttemptAt = now
        failedRetry.nextRetryAt = now.addingTimeInterval(60)
        failedRetry.lastFailure = String(describing: error)
        return failedRetry
    }

    /// Order-preserving de-duplication of document IDs.
    private func uniqueDocumentIDs(_ documentIDs: [FetchDocumentID]) -> [FetchDocumentID] {
        var seen = Set<FetchDocumentID>()
        return documentIDs.filter { seen.insert($0).inserted }
    }
}

// MARK: - Sources/SwiftlyFetch/SwiftlyFetchMutationResult.swift

import FetchCore
import RAGKit

/// Outcome of one stage (conventional or semantic) of a library mutation.
public enum SwiftlyFetchMutationStageStatus: String, Hashable, Codable, Sendable {
    case succeeded
    case skipped
    case queuedRetry
    case failed
}

/// Result of the conventional (corpus) half of a mutation.
public struct SwiftlyFetchMutationStage: Hashable, Codable, Sendable {
    public var status: SwiftlyFetchMutationStageStatus
    public var failureDescription: String?

    public init(
        status: SwiftlyFetchMutationStageStatus,
        failureDescription: String? = nil
    ) {
        self.status = status
        self.failureDescription = failureDescription
    }

    public static let succeeded = SwiftlyFetchMutationStage(status: .succeeded)
    public static let skipped = SwiftlyFetchMutationStage(status: .skipped)
}

/// Result of the semantic half of a mutation, including any queued retry.
public struct SwiftlyFetchSemanticMutationStage: Hashable, Codable, Sendable {
    public var status: SwiftlyFetchMutationStageStatus
    public var state: SemanticIndexState?
    public var retry: SwiftlyFetchSemanticRetry?
    public var failureDescription: String?

    public init(
        status: SwiftlyFetchMutationStageStatus,
        state: SemanticIndexState? = nil,
        retry: SwiftlyFetchSemanticRetry? = nil,
        failureDescription: String? 
= nil
    ) {
        self.status = status
        self.state = state
        self.retry = retry
        self.failureDescription = failureDescription
    }

    /// Convenience for a successful semantic stage with an optional index state.
    public static func succeeded(state: SemanticIndexState? = nil) -> SwiftlyFetchSemanticMutationStage {
        SwiftlyFetchSemanticMutationStage(status: .succeeded, state: state)
    }

    public static let skipped = SwiftlyFetchSemanticMutationStage(status: .skipped)
}

/// Combined outcome of a corpus mutation and its semantic follow-up.
public struct SwiftlyFetchMutationResult: Hashable, Codable, Sendable {
    public var documentIDs: [FetchDocumentID]
    public var conventional: SwiftlyFetchMutationStage
    public var semantic: SwiftlyFetchSemanticMutationStage

    public init(
        documentIDs: [FetchDocumentID],
        conventional: SwiftlyFetchMutationStage,
        semantic: SwiftlyFetchSemanticMutationStage
    ) {
        self.documentIDs = documentIDs
        self.conventional = conventional
        self.semantic = semantic
    }
}

// MARK: - Sources/SwiftlyFetch/SwiftlyFetchSearchAndRetrieve.swift

import FetchCore
import RAGCore

/// Pairs a keyword query with a semantic query for combined execution.
public struct SwiftlyFetchSearchAndRetrieveQuery: Hashable, Codable, Sendable {
    public var conventional: FetchSearchQuery
    public var semantic: SearchQuery

    public init(
        conventional: FetchSearchQuery,
        semantic: SearchQuery
    ) {
        self.conventional = conventional
        self.semantic = semantic
    }
}

/// Results of both query styles, kept separate for the caller to merge.
public struct SwiftlyFetchSearchAndRetrieveResult: Hashable, Codable, Sendable {
    public var conventional: [FetchSearchResult]
    public var semantic: [SearchResult]

    public init(
        conventional: [FetchSearchResult],
        semantic: [SearchResult]
    ) {
        self.conventional = conventional
        self.semantic = semantic
    }
}

// MARK: - Sources/SwiftlyFetch/SwiftlyFetchSemanticRetry.swift

import FetchCore
import Foundation

/// The semantic-index operation that still needs to be replayed.
public enum SwiftlyFetchSemanticRetryOperation: String, Hashable, Codable, Sendable {
    case indexDocument
    case removeDocument
}

/// One queued unit of semantic-index work for a single document.
public struct SwiftlyFetchSemanticRetry: Hashable, Codable, Sendable {
    public var documentID: FetchDocumentID
    public var operation: SwiftlyFetchSemanticRetryOperation
    public var reason: String
    public var attemptCount: Int
    public var createdAt: Date
    public var lastAttemptAt: Date?
    public var nextRetryAt: Date?
    public var lastFailure: String?

    public init(
        documentID: FetchDocumentID,
        operation: SwiftlyFetchSemanticRetryOperation,
        reason: String,
        attemptCount: Int = 0,
        createdAt: Date = .now,
        lastAttemptAt: Date? = nil,
        nextRetryAt: Date? = nil,
        lastFailure: String? = nil
    ) {
        self.documentID = documentID
        self.operation = operation
        self.reason = reason
        // Clamp: a negative attempt count is meaningless.
        self.attemptCount = max(0, attemptCount)
        self.createdAt = createdAt
        self.lastAttemptAt = lastAttemptAt
        self.nextRetryAt = nextRetryAt
        self.lastFailure = lastFailure
    }
}

/// Persistence abstraction for queued semantic retries.
public protocol SwiftlyFetchSemanticRetryStore: Sendable {
    func upsert(_ retry: SwiftlyFetchSemanticRetry) async throws
    func pendingRetries(limit: Int?) async throws -> [SwiftlyFetchSemanticRetry]
    func removeRetries(for documentIDs: [FetchDocumentID]) async throws
}

/// Non-persistent store that keeps retries in first-insertion order.
/// NOTE(review): insertion order is used here, while the Core Data store sorts
/// by `createdAt` — confirm callers do not depend on identical ordering.
public actor InMemorySwiftlyFetchSemanticRetryStore: SwiftlyFetchSemanticRetryStore {
    private var retriesByDocumentID: [FetchDocumentID: SwiftlyFetchSemanticRetry] = [:]
    private var documentIDOrder: [FetchDocumentID] = []

    public init() {}

    /// Inserts or replaces the retry; the first insertion fixes its position.
    public func upsert(_ retry: SwiftlyFetchSemanticRetry) async throws {
        let previous = retriesByDocumentID.updateValue(retry, forKey: retry.documentID)
        if previous == nil {
            documentIDOrder.append(retry.documentID)
        }
    }

    /// Returns retries in insertion order, optionally capped at `limit`
    /// (non-positive limits yield an empty array).
    public func pendingRetries(limit: Int? = nil) async throws -> [SwiftlyFetchSemanticRetry] {
        let orderedRetries = documentIDOrder.compactMap { retriesByDocumentID[$0] }
        guard let limit else {
            return orderedRetries
        }
        return Array(orderedRetries.prefix(max(0, limit)))
    }

    /// Drops every stored retry whose document ID appears in `documentIDs`.
    public func removeRetries(for documentIDs: [FetchDocumentID]) async throws {
        let idsToRemove = Set(documentIDs)

        for documentID in idsToRemove {
            retriesByDocumentID.removeValue(forKey: documentID)
        }

        documentIDOrder.removeAll { idsToRemove.contains($0) }
    }
}

/// Summary of one `retrySemanticIndexing` pass.
public struct SwiftlyFetchSemanticRetryResult: Hashable, Codable, Sendable {
    public var completedDocumentIDs: [FetchDocumentID]
    public var removedMissingDocumentIDs: [FetchDocumentID]
    public var deferredDocumentIDs: [FetchDocumentID]
    public var failedRetries: [SwiftlyFetchSemanticRetry]

    public init(
        completedDocumentIDs: [FetchDocumentID],
        removedMissingDocumentIDs: [FetchDocumentID] = [],
        deferredDocumentIDs: [FetchDocumentID] = [],
        failedRetries: [SwiftlyFetchSemanticRetry] = []
    ) {
        self.completedDocumentIDs = completedDocumentIDs
        self.removedMissingDocumentIDs = removedMissingDocumentIDs
        self.deferredDocumentIDs = deferredDocumentIDs
        self.failedRetries = failedRetries
    }

    /// Retries resolved this pass (completed plus stale removals).
    public var count: Int {
        completedDocumentIDs.count + removedMissingDocumentIDs.count
    }
}

diff --git a/Tests/FetchKitTests/FixtureCorpusQualityTests.swift b/Tests/FetchKitTests/FixtureCorpusQualityTests.swift
index d8910e4..3fb83e5 100644
--- a/Tests/FetchKitTests/FixtureCorpusQualityTests.swift
+++ b/Tests/FetchKitTests/FixtureCorpusQualityTests.swift
@@ -1,8 +1,9 @@
 import FetchCore
-import Testing
 @testable import FetchKit
+import SwiftlyFetchTestFixtures
+import Testing
 
-@Suite("FetchKit fixture corpus quality", .serialized)
+@Suite(.serialized)
 struct FixtureCorpusQualityTests {
     @Test("Fixture corpus records carry source attribution")
     func fixtureCorpusRecordsCarrySourceAttribution() {
@@ -11,6 +12,11 @@ struct FixtureCorpusQualityTests {
         #expect(GutenbergMiniCorpus.source.split == "train")
#expect(GutenbergMiniCorpus.records.allSatisfy { $0.sourceURI == GutenbergMiniCorpus.source.url }) #expect(GutenbergMiniCorpus.records.allSatisfy { $0.metadata["fixture.dataset"] == GutenbergMiniCorpus.source.datasetID }) + #expect(TinyStoriesMiniCorpus.source.datasetID == "roneneldan/TinyStories") + #expect(TinyStoriesMiniCorpus.source.config == "default") + #expect(TinyStoriesMiniCorpus.source.split == "train") + #expect(TinyStoriesMiniCorpus.records.allSatisfy { $0.sourceURI == TinyStoriesMiniCorpus.source.url }) + #expect(TinyStoriesMiniCorpus.records.allSatisfy { $0.metadata["fixture.dataset"] == TinyStoriesMiniCorpus.source.datasetID }) } @Test("Fixture corpus retrieves a body-driven chapter hit") @@ -117,9 +123,32 @@ struct FixtureCorpusQualityTests { #expect(firstResult.snippetField == .body) } + @Test("Fixture corpus includes a second text source for simple story searches") + func fixtureCorpusIncludesSecondTextSource() async throws { + let library = try await indexedFixtureLibrary() + + let sewingResults = try await library.search( + "needle sew shirt", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + let fuelResults = try await library.search( + "healthy fuel car", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + + #expect(sewingResults.first?.document.id == "tinystories-row-0-needle") + #expect(sewingResults.first?.snippet?.text.localizedCaseInsensitiveContains("needle") == true) + #expect(fuelResults.first?.document.id == "tinystories-row-1-beep") + #expect(fuelResults.first?.matchedFields.contains(.body) == true) + } + private func indexedFixtureLibrary() async throws -> FetchKitLibrary { let library = FetchKitLibrary() - try await library.addDocuments(GutenbergMiniCorpus.records) + try await library.addDocuments(GutenbergMiniCorpus.records + TinyStoriesMiniCorpus.records) return library } } diff --git a/Tests/FetchKitTests/SearchKitFetchIndexTests.swift b/Tests/FetchKitTests/SearchKitFetchIndexTests.swift index 
371ac93..92b5957 100644 --- a/Tests/FetchKitTests/SearchKitFetchIndexTests.swift +++ b/Tests/FetchKitTests/SearchKitFetchIndexTests.swift @@ -1,10 +1,15 @@ #if os(macOS) -import Foundation -import XCTest import FetchCore @testable import FetchKit +import Foundation +import SwiftlyFetchTestFixtures +import XCTest final class SearchKitFetchIndexTests: XCTestCase { + private var fixtureRecords: [FetchDocumentRecord] { + GutenbergMiniCorpus.records + TinyStoriesMiniCorpus.records + } + func testSearchKitFetchIndexIndexesAndSearchesText() async throws { let index = try SearchKitFetchIndex( configuration: .init( @@ -212,7 +217,7 @@ final class SearchKitFetchIndexTests: XCTestCase { try await index.apply( FetchIndexingChangeset( - GutenbergMiniCorpus.records.map { .upsert($0.indexDocument) } + fixtureRecords.map { .upsert($0.indexDocument) } ) ) @@ -247,7 +252,7 @@ final class SearchKitFetchIndexTests: XCTestCase { try await index.apply( FetchIndexingChangeset( - GutenbergMiniCorpus.records.map { .upsert($0.indexDocument) } + fixtureRecords.map { .upsert($0.indexDocument) } ) ) @@ -272,6 +277,31 @@ final class SearchKitFetchIndexTests: XCTestCase { XCTAssertEqual(longBodyResults.first?.snippetField, .body) } + func testSearchKitFetchIndexMatchesSecondFixtureCorpusSource() async throws { + let index = try SearchKitFetchIndex( + configuration: .init( + storage: .inMemory, + indexNamePrefix: "SearchKitFetchIndexTests-\(UUID().uuidString)" + ) + ) + + try await index.apply( + FetchIndexingChangeset( + fixtureRecords.map { .upsert($0.indexDocument) } + ) + ) + + let results = try await index.search( + FetchSearchQuery("needle sew shirt", kind: .allTerms, fields: [.title, .body], limit: 4) + ) + let needleResult = try XCTUnwrap( + results.first { $0.document.id == "tinystories-row-0-needle" } + ) + + XCTAssertEqual(needleResult.matchedFields.contains(.body), true) + XCTAssertEqual(needleResult.snippet?.text.localizedCaseInsensitiveContains("needle"), true) + } + func 
testFetchKitLibraryBuildsPersistentPair() async throws { let temporaryDirectory = URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true) .appendingPathComponent(UUID().uuidString, isDirectory: true) diff --git a/Tests/RAGKitTests/CoreDataVectorIndexTests.swift b/Tests/RAGKitTests/CoreDataVectorIndexTests.swift new file mode 100644 index 0000000..89d4ce7 --- /dev/null +++ b/Tests/RAGKitTests/CoreDataVectorIndexTests.swift @@ -0,0 +1,349 @@ +import RAGCore +@testable import RAGKit +import XCTest + +final class CoreDataVectorIndexTests: XCTestCase { + func testCoreDataVectorIndexPersistsChunksAcrossReopen() async throws { + let storeURL = temporaryStoreURL() + let documentID = DocumentID("doc-fruit") + let indexedChunks = [ + makeIndexedChunk( + id: "doc-fruit#0", + documentID: documentID, + text: "Apples are bright and crisp.", + embedding: [1, 0, 0], + metadata: ["kind": .string("guide")] + ), + makeIndexedChunk( + id: "doc-fruit#1", + documentID: documentID, + text: "Oranges are juicy and sweet.", + embedding: [0, 1, 0], + metadata: ["kind": .string("guide")] + ), + ] + + let index = try await CoreDataVectorIndex( + configuration: .init(store: .sqlite(storeURL)) + ) + try await index.upsert(indexedChunks) + + let reopenedIndex = try await CoreDataVectorIndex( + configuration: .init(store: .sqlite(storeURL)) + ) + let results = try await reopenedIndex.search( + SearchQuery("fruit", limit: 2), + embedding: EmbeddingVector([1, 0, 0]) + ) + + XCTAssertEqual(results.map(\.chunk.id), ["doc-fruit#0", "doc-fruit#1"]) + XCTAssertEqual(results.first?.chunk.text, "Apples are bright and crisp.") + XCTAssertEqual(results.first?.chunk.metadata["kind"], .string("guide")) + } + + func testCoreDataVectorIndexReplacesExistingChunks() async throws { + let index = try await CoreDataVectorIndex() + let original = makeIndexedChunk( + id: "doc-fruit#0", + documentID: "doc-fruit", + text: "Original apple text.", + embedding: [1, 0] + ) + let replacement = makeIndexedChunk( + id: 
"doc-fruit#0", + documentID: "doc-fruit", + text: "Updated orange text.", + embedding: [0, 1] + ) + + try await index.upsert([original]) + try await index.upsert([replacement]) + + let appleResults = try await index.search( + SearchQuery("apple", limit: 1), + embedding: EmbeddingVector([1, 0]) + ) + let orangeResults = try await index.search( + SearchQuery("orange", limit: 1), + embedding: EmbeddingVector([0, 1]) + ) + + XCTAssertEqual(appleResults.first?.chunk.text, "Updated orange text.") + XCTAssertEqual(orangeResults.first?.chunk.text, "Updated orange text.") + XCTAssertEqual(orangeResults.first?.score, 1.0) + } + + func testCoreDataVectorIndexFiltersAndRemovesByDocumentID() async throws { + let index = try await CoreDataVectorIndex() + try await index.upsert([ + makeIndexedChunk( + id: "doc-guide#0", + documentID: "doc-guide", + text: "Apples are bright and crisp.", + embedding: [1, 0], + metadata: ["kind": .string("guide")] + ), + makeIndexedChunk( + id: "doc-note#0", + documentID: "doc-note", + text: "Oranges are juicy and sweet.", + embedding: [0, 1], + metadata: ["kind": .string("note")] + ), + ]) + + let filteredResults = try await index.search( + SearchQuery( + "fruit", + limit: 5, + filter: .equals("kind", .string("guide")) + ), + embedding: EmbeddingVector([1, 0]) + ) + + XCTAssertEqual(filteredResults.map(\.chunk.documentID), ["doc-guide"]) + + try await index.removeChunks(for: "doc-guide") + let remainingResults = try await index.search( + SearchQuery("fruit", limit: 5), + embedding: EmbeddingVector([1, 0]) + ) + + XCTAssertEqual(remainingResults.map(\.chunk.documentID), ["doc-note"]) + } + + func testCoreDataVectorIndexRemoveAllClearsPersistedChunks() async throws { + let storeURL = temporaryStoreURL() + let index = try await CoreDataVectorIndex( + configuration: .init(store: .sqlite(storeURL)) + ) + try await index.upsert([ + makeIndexedChunk( + id: "doc-fruit#0", + documentID: "doc-fruit", + text: "Apples are bright and crisp.", + embedding: [1, 
0] + ), + ]) + + try await index.removeAll() + + let reopenedIndex = try await CoreDataVectorIndex( + configuration: .init(store: .sqlite(storeURL)) + ) + let results = try await reopenedIndex.search( + SearchQuery("fruit", limit: 5), + embedding: EmbeddingVector([1, 0]) + ) + + XCTAssertTrue(results.isEmpty) + } + + func testPersistentKnowledgeBaseConvenienceReusesStoredSemanticIndex() async throws { + let storeURL = temporaryStoreURL() + let configuration = CoreDataVectorIndex.Configuration(store: .sqlite(storeURL)) + let knowledgeBase = try await KnowledgeBase.persistentHashingDefault( + configuration: configuration + ) + + try await knowledgeBase.addDocument( + Document( + id: "doc-fruit", + content: .markdown( + """ + # Fruit Guide + + ## Apples + + Apples are bright and crisp. + """ + ) + ) + ) + + let reopenedKnowledgeBase = try await KnowledgeBase.persistentHashingDefault( + configuration: configuration + ) + let results = try await reopenedKnowledgeBase.search("bright fruit", limit: 1) + + XCTAssertEqual(results.first?.chunk.documentID, "doc-fruit") + XCTAssertEqual(results.first?.chunk.text, "Fruit Guide\nApples\n\nApples are bright and crisp.") + } + + func testKnowledgeBaseMarksSemanticStateCurrentAfterIndexing() async throws { + let index = try await CoreDataVectorIndex() + let knowledgeBase = KnowledgeBase( + chunker: DefaultChunker(), + embedder: HashingEmbedder(dimension: 16), + index: index + ) + + try await knowledgeBase.addDocument( + Document( + id: "doc-fruit", + content: .text("Apples are bright and crisp."), + metadata: ["kind": .string("guide")] + ) + ) + + let state = try await index.state(for: "doc-fruit") + + XCTAssertEqual(state?.status, .current) + XCTAssertEqual(state?.documentID, "doc-fruit") + XCTAssertEqual(state?.fingerprint?.chunker, "ragkit.paragraph-chunker.v1|ragkit.heading-aware-markdown.v1.links-omit") + XCTAssertEqual(state?.fingerprint?.embedder, "ragkit.hashing.16") + XCTAssertNotNil(state?.fingerprint?.source) + 
XCTAssertNotNil(state?.lastIndexedAt) + XCTAssertNil(state?.lastFailure) + } + + func testKnowledgeBaseMarksSemanticStateFailedWhenEmbeddingFails() async throws { + let index = try await CoreDataVectorIndex() + let knowledgeBase = KnowledgeBase( + chunker: DefaultChunker(), + embedder: FailingEmbedder(), + index: index + ) + + do { + try await knowledgeBase.addDocument( + Document( + id: "doc-fruit", + content: .text("Apples are bright and crisp.") + ) + ) + XCTFail("Expected semantic indexing to surface the embedding failure.") + } catch {} + + let state = try await index.state(for: "doc-fruit") + + XCTAssertEqual(state?.status, .failed) + XCTAssertTrue(state?.fingerprint?.embedder.hasPrefix("custom.embedder.") == true) + XCTAssertTrue(state?.fingerprint?.embedder.contains("FailingEmbedder") == true) + XCTAssertEqual(state?.lastFailure, "embeddingUnavailable") + XCTAssertNil(state?.lastIndexedAt) + } + + func testCoreDataVectorIndexCanMarkStateStale() async throws { + let index = try await CoreDataVectorIndex() + let fingerprint = SemanticIndexFingerprint( + source: "source-a", + chunker: "chunker-a", + embedder: "embedder-a" + ) + + try await index.markCurrent(documentID: "doc-fruit", fingerprint: fingerprint) + try await index.markStale( + documentID: "doc-fruit", + reason: "Source fingerprint changed." 
+ ) + + let state = try await index.state(for: "doc-fruit") + + XCTAssertEqual(state?.status, .stale) + XCTAssertEqual(state?.fingerprint, fingerprint) + XCTAssertEqual(state?.lastFailure, "Source fingerprint changed.") + XCTAssertNotNil(state?.lastIndexedAt) + } + + func testSemanticSourceFingerprintChangesWithDocumentContent() async throws { + let index = try await CoreDataVectorIndex() + let firstKnowledgeBase = KnowledgeBase( + chunker: DefaultChunker(), + embedder: HashingEmbedder(), + index: index + ) + try await firstKnowledgeBase.addDocument( + Document( + id: "doc-fruit", + content: .text("Apples are bright and crisp.") + ) + ) + let firstFingerprint = try await index.state(for: "doc-fruit")?.fingerprint?.source + + let secondKnowledgeBase = KnowledgeBase( + chunker: DefaultChunker(), + embedder: HashingEmbedder(), + index: index + ) + try await secondKnowledgeBase.addDocument( + Document( + id: "doc-fruit", + content: .text("Oranges are juicy and sweet.") + ) + ) + let secondFingerprint = try await index.state(for: "doc-fruit")?.fingerprint?.source + + XCTAssertNotNil(firstFingerprint) + XCTAssertNotNil(secondFingerprint) + XCTAssertNotEqual(firstFingerprint, secondFingerprint) + } + + private func makeIndexedChunk( + id: ChunkID, + documentID: DocumentID, + text: String, + embedding: [Double], + metadata: ChunkMetadata = ChunkMetadata(), + chunkIndex: Int = 0 + ) -> IndexedChunk { + IndexedChunk( + chunk: Chunk( + id: id, + documentID: documentID, + text: text, + metadata: metadata, + position: ChunkPosition( + documentID: documentID, + chunkIndex: chunkIndex, + startOffset: 0, + endOffset: text.count + ) + ), + embedding: EmbeddingVector(embedding) + ) + } + + private func temporaryStoreURL( + file: StaticString = #filePath, + line: UInt = #line + ) -> URL { + let directory = FileManager.default + .temporaryDirectory + .appendingPathComponent("SwiftlyFetchTests", isDirectory: true) + .appendingPathComponent(UUID().uuidString, isDirectory: true) + + do { 
+ try FileManager.default.createDirectory( + at: directory, + withIntermediateDirectories: true + ) + } catch { + XCTFail( + "RAGKit could not create a temporary Core Data vector index test directory. \(error.localizedDescription)", + file: file, + line: line + ) + } + + return directory.appendingPathComponent("RAGKitVectorIndex.sqlite") + } +} + +private struct FailingEmbedder: Embedder { + enum Failure: Error, CustomStringConvertible { + case embeddingUnavailable + + var description: String { + "embeddingUnavailable" + } + } + + func embed(chunks _: [Chunk]) async throws -> [EmbeddingVector] { + throw Failure.embeddingUnavailable + } + + func embed(query _: SearchQuery) async throws -> EmbeddingVector { + throw Failure.embeddingUnavailable + } +} diff --git a/Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift b/Tests/SwiftlyFetchTestFixtures/GutenbergMiniCorpus.swift similarity index 88% rename from Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift rename to Tests/SwiftlyFetchTestFixtures/GutenbergMiniCorpus.swift index 1dd572a..59b8b2a 100644 --- a/Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift +++ b/Tests/SwiftlyFetchTestFixtures/GutenbergMiniCorpus.swift @@ -1,15 +1,29 @@ import FetchCore -enum GutenbergMiniCorpus { - struct Source: Hashable, Sendable { - let datasetID: String - let config: String - let split: String - let license: String - let url: String +public enum GutenbergMiniCorpus { + public struct Source: Hashable, Sendable { + public let datasetID: String + public let config: String + public let split: String + public let license: String + public let url: String + + public init( + datasetID: String, + config: String, + split: String, + license: String, + url: String + ) { + self.datasetID = datasetID + self.config = config + self.split = split + self.license = license + self.url = url + } } - static let source = Source( + public static let source = Source( datasetID: "zkeown/gutenberg-corpus", config: "chapters", split: "train", @@ 
-17,7 +31,7 @@ enum GutenbergMiniCorpus { url: "https://huggingface.co/datasets/zkeown/gutenberg-corpus" ) - static let records: [FetchDocumentRecord] = [ + public static let records: [FetchDocumentRecord] = [ FetchDocumentRecord( id: "gutenberg-78430-chapter-1", title: "A practical course in botany: Chapter I. The Seed", diff --git a/Tests/SwiftlyFetchTestFixtures/TinyStoriesMiniCorpus.swift b/Tests/SwiftlyFetchTestFixtures/TinyStoriesMiniCorpus.swift new file mode 100644 index 0000000..1d15782 --- /dev/null +++ b/Tests/SwiftlyFetchTestFixtures/TinyStoriesMiniCorpus.swift @@ -0,0 +1,77 @@ +import FetchCore + +public enum TinyStoriesMiniCorpus { + public static let source = GutenbergMiniCorpus.Source( + datasetID: "roneneldan/TinyStories", + config: "default", + split: "train", + license: "CDLA-Sharing-1.0; synthetic short stories generated by GPT-3.5 and GPT-4", + url: "https://huggingface.co/datasets/roneneldan/TinyStories" + ) + + public static let records: [FetchDocumentRecord] = [ + FetchDocumentRecord( + id: "tinystories-row-0-needle", + title: "Lily Shares a Needle", + body: """ + One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt. + + Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt." + + Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. 
+ """, + kind: .article, + language: "en", + sourceURI: source.url, + metadata: [ + "fixture.dataset": source.datasetID, + "fixture.config": source.config, + "fixture.split": source.split, + "fixture.row": "0", + "fixture.topic": "sewing", + ] + ), + FetchDocumentRecord( + id: "tinystories-row-1-beep", + title: "Beep Plays in Falling Leaves", + body: """ + Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. + + One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. + + When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. + """, + kind: .article, + language: "en", + sourceURI: source.url, + metadata: [ + "fixture.dataset": source.datasetID, + "fixture.config": source.config, + "fixture.split": source.split, + "fixture.row": "1", + "fixture.topic": "car", + ] + ), + FetchDocumentRecord( + id: "tinystories-row-2-fin", + title: "Fin Helps a Cold Crab", + body: """ + One day, a little fish named Fin was swimming near the shore. He saw a big crab and wanted to be friends. The crab looked at Fin and said he was cold and did not feel fine. + + Fin wanted to help the crab feel better. He called to the sun, asking it to help his new friend feel fine and not freeze. + + The sun shone warm light on the shore. The crab started to feel better, thanked Fin, and played with him. 
+ """, + kind: .article, + language: "en", + sourceURI: source.url, + metadata: [ + "fixture.dataset": source.datasetID, + "fixture.config": source.config, + "fixture.split": source.split, + "fixture.row": "2", + "fixture.topic": "shore", + ] + ), + ] +} diff --git a/Tests/SwiftlyFetchTests/SwiftlyFetchDocumentMapperTests.swift b/Tests/SwiftlyFetchTests/SwiftlyFetchDocumentMapperTests.swift new file mode 100644 index 0000000..cb4e128 --- /dev/null +++ b/Tests/SwiftlyFetchTests/SwiftlyFetchDocumentMapperTests.swift @@ -0,0 +1,73 @@ +import FetchCore +import Foundation +import RAGCore +import SwiftlyFetch +import Testing + +struct SwiftlyFetchDocumentMapperTests { + @Test("Mapper includes title in markdown source text and typed metadata") + func mapperIncludesTitleInMarkdownSourceAndMetadata() { + let createdAt = Date(timeIntervalSince1970: 100) + let updatedAt = Date(timeIntervalSince1970: 200) + let mapper = SwiftlyFetchDocumentMapper() + let record = FetchDocumentRecord( + id: "doc-guide", + title: "Apple Guide", + body: "Apples are bright and crisp.", + contentType: .markdown, + kind: .guide, + language: "en", + sourceURI: "file:///Guides/apple.md", + createdAt: createdAt, + updatedAt: updatedAt, + metadata: ["section": "fruit"] + ) + + let document = mapper.document(from: record) + + #expect(document.id == "doc-guide") + #expect(document.content == .markdown("# Apple Guide\n\nApples are bright and crisp.")) + #expect(document.metadata["title"] == .string("Apple Guide")) + #expect(document.metadata["contentType"] == .string("markdown")) + #expect(document.metadata["kind"] == .string("guide")) + #expect(document.metadata["language"] == .string("en")) + #expect(document.metadata["sourceURI"] == .string("file:///Guides/apple.md")) + #expect(document.metadata["createdAt"] == .date(createdAt)) + #expect(document.metadata["updatedAt"] == .date(updatedAt)) + #expect(document.metadata["section"] == .string("fruit")) + } + + @Test("Mapper preserves plain text body when 
title is empty") + func mapperPreservesPlainTextBodyWhenTitleIsEmpty() { + let mapper = SwiftlyFetchDocumentMapper() + let record = FetchDocumentRecord( + id: "doc-note", + title: " ", + body: "A standalone note.", + contentType: .plainText + ) + + let document = mapper.document(from: record) + + #expect(document.content == .text("A standalone note.")) + #expect(document.metadata["title"] == nil) + #expect(document.metadata["contentType"] == .string("plainText")) + } + + @Test("Mapper includes normalized title in plain text source and metadata") + func mapperIncludesNormalizedTitleInPlainTextSourceAndMetadata() { + let mapper = SwiftlyFetchDocumentMapper() + let record = FetchDocumentRecord( + id: "doc-note", + title: " Note\nTitle\tDraft ", + body: "A standalone note.", + contentType: .plainText + ) + + let document = mapper.document(from: record) + + #expect(document.content == .text("Title: Note Title Draft\n\nA standalone note.")) + #expect(document.metadata["title"] == .string("Note Title Draft")) + #expect(document.metadata["contentType"] == .string("plainText")) + } +} diff --git a/Tests/SwiftlyFetchTests/SwiftlyFetchLibraryTests.swift b/Tests/SwiftlyFetchTests/SwiftlyFetchLibraryTests.swift new file mode 100644 index 0000000..8e47568 --- /dev/null +++ b/Tests/SwiftlyFetchTests/SwiftlyFetchLibraryTests.swift @@ -0,0 +1,496 @@ +import FetchCore +import FetchKit +import Foundation +import RAGCore +import RAGKit +import SwiftlyFetch +import SwiftlyFetchTestFixtures +import Testing + +@Suite(.serialized) +struct SwiftlyFetchLibraryTests { + @Test("Default facade ingests one document into conventional and semantic search") + func defaultFacadeIngestsOneDocumentIntoBothSearchModes() async throws { + let library = try await SwiftlyFetchLibrary.default() + let record = FetchDocumentRecord( + id: "doc-apple", + title: "Apple Guide", + body: "Apples are bright and crisp.", + contentType: .markdown, + kind: .guide, + language: "en" + ) + + let mutation = try await 
library.addDocument(record) + let conventionalResults = try await library.search(FetchSearchQuery("apple guide", fields: [.title])) + let semanticResults = try await library.retrieve(SearchQuery("bright crisp fruit", limit: 1)) + + #expect(mutation.documentIDs == ["doc-apple"]) + #expect(mutation.conventional.status == .succeeded) + #expect(mutation.semantic.status == .succeeded) + #expect(conventionalResults.first?.document.id == "doc-apple") + #expect(semanticResults.first?.chunk.documentID == "doc-apple") + } + + @Test("Semantic indexing failure queues an index retry after the corpus write succeeds") + func semanticIndexingFailureQueuesRetry() async throws { + let fetchLibrary = FetchKitLibrary() + let retryStore = InMemorySwiftlyFetchSemanticRetryStore() + let failingKnowledgeBase = KnowledgeBase( + chunker: ThrowingChunker(), + embedder: HashingEmbedder(), + index: InMemoryVectorIndex() + ) + let library = SwiftlyFetchLibrary( + fetchLibrary: fetchLibrary, + knowledgeBase: failingKnowledgeBase, + retryStore: retryStore + ) + let record = FetchDocumentRecord( + id: "doc-apple", + title: "Apple Guide", + body: "Apples are bright and crisp." 
+ ) + + let mutation = try await library.addDocument(record) + let storedRecord = try await fetchLibrary.document(withID: "doc-apple") + let retries = try await retryStore.pendingRetries() + + #expect(mutation.conventional.status == .succeeded) + #expect(mutation.semantic.status == .queuedRetry) + #expect(mutation.semantic.retry?.operation == .indexDocument) + #expect(storedRecord == record) + #expect(retries.map(\.documentID) == ["doc-apple"]) + #expect(retries.first?.operation == .indexDocument) + } + + @Test("Semantic retry re-reads the latest corpus record") + func semanticRetryReadsLatestCorpusRecord() async throws { + let fetchLibrary = FetchKitLibrary() + let retryStore = InMemorySwiftlyFetchSemanticRetryStore() + try await fetchLibrary.addDocument( + FetchDocumentRecord( + id: "doc-apple", + title: "Apple Guide", + body: "Apples are bright and crisp." + ) + ) + try await retryStore.upsert( + SwiftlyFetchSemanticRetry( + documentID: "doc-apple", + operation: .indexDocument, + reason: "Test retry" + ) + ) + let library = try SwiftlyFetchLibrary( + fetchLibrary: fetchLibrary, + knowledgeBase: await KnowledgeBase.hashingDefault(), + retryStore: retryStore + ) + + let retryResult = try await library.retrySemanticIndexing() + let semanticResults = try await library.retrieve(SearchQuery("bright crisp", limit: 1)) + let retriesAfterRetry = try await retryStore.pendingRetries() + + #expect(retryResult.completedDocumentIDs == ["doc-apple"]) + #expect(retryResult.failedRetries.isEmpty) + #expect(retriesAfterRetry.isEmpty) + #expect(semanticResults.first?.chunk.documentID == "doc-apple") + } + + @Test("Semantic retry skips records until their next retry date") + func semanticRetrySkipsRecordsUntilNextRetryDate() async throws { + let fetchLibrary = FetchKitLibrary() + let retryStore = InMemorySwiftlyFetchSemanticRetryStore() + try await fetchLibrary.addDocument( + FetchDocumentRecord( + id: "doc-due", + title: "Due Apple Guide", + body: "Due apples are bright and 
crisp." + ) + ) + try await fetchLibrary.addDocument( + FetchDocumentRecord( + id: "doc-deferred", + title: "Deferred Apple Guide", + body: "Deferred apples are bright and crisp." + ) + ) + try await retryStore.upsert( + SwiftlyFetchSemanticRetry( + documentID: "doc-deferred", + operation: .indexDocument, + reason: "Test deferred retry", + nextRetryAt: Date().addingTimeInterval(3600) + ) + ) + try await retryStore.upsert( + SwiftlyFetchSemanticRetry( + documentID: "doc-due", + operation: .indexDocument, + reason: "Test due retry", + nextRetryAt: Date().addingTimeInterval(-60) + ) + ) + let library = try SwiftlyFetchLibrary( + fetchLibrary: fetchLibrary, + knowledgeBase: await KnowledgeBase.hashingDefault(), + retryStore: retryStore + ) + + let retryResult = try await library.retrySemanticIndexing() + let retriesAfterRetry = try await retryStore.pendingRetries() + + #expect(retryResult.completedDocumentIDs == ["doc-due"]) + #expect(retryResult.deferredDocumentIDs == ["doc-deferred"]) + #expect(retryResult.failedRetries.isEmpty) + #expect(retriesAfterRetry.map(\.documentID) == ["doc-deferred"]) + } + + @Test("Failed semantic retries wait for their next retry date") + func failedSemanticRetriesWaitForNextRetryDate() async throws { + let fetchLibrary = FetchKitLibrary() + let retryStore = InMemorySwiftlyFetchSemanticRetryStore() + try await fetchLibrary.addDocument( + FetchDocumentRecord( + id: "doc-apple", + title: "Apple Guide", + body: "Apples are bright and crisp." 
+ ) + ) + try await retryStore.upsert( + SwiftlyFetchSemanticRetry( + documentID: "doc-apple", + operation: .indexDocument, + reason: "Test retry" + ) + ) + let library = SwiftlyFetchLibrary( + fetchLibrary: fetchLibrary, + knowledgeBase: KnowledgeBase( + chunker: ThrowingChunker(), + embedder: HashingEmbedder(), + index: InMemoryVectorIndex() + ), + retryStore: retryStore + ) + + let firstRetryResult = try await library.retrySemanticIndexing() + let retriesAfterFailure = try await retryStore.pendingRetries() + let retryAfterFailure = try #require(retriesAfterFailure.first) + let secondRetryResult = try await library.retrySemanticIndexing() + let retriesAfterDeferral = try await retryStore.pendingRetries() + let retryAfterDeferral = try #require(retriesAfterDeferral.first) + + #expect(firstRetryResult.completedDocumentIDs.isEmpty) + #expect(firstRetryResult.failedRetries.map(\.documentID) == ["doc-apple"]) + #expect(firstRetryResult.deferredDocumentIDs.isEmpty) + #expect(retryAfterFailure.attemptCount == 1) + #expect(retryAfterFailure.nextRetryAt != nil) + #expect(secondRetryResult.completedDocumentIDs.isEmpty) + #expect(secondRetryResult.failedRetries.isEmpty) + #expect(secondRetryResult.deferredDocumentIDs == ["doc-apple"]) + #expect(retryAfterDeferral.attemptCount == 1) + } + + @Test("Semantic remove failure queues a remove retry") + func semanticRemoveFailureQueuesRemoveRetry() async throws { + let fetchLibrary = FetchKitLibrary() + let retryStore = InMemorySwiftlyFetchSemanticRetryStore() + let library = SwiftlyFetchLibrary( + fetchLibrary: fetchLibrary, + knowledgeBase: KnowledgeBase( + chunker: DefaultChunker(), + embedder: HashingEmbedder(), + index: RemoveFailingVectorIndex() + ), + retryStore: retryStore + ) + try await fetchLibrary.addDocument( + FetchDocumentRecord( + id: "doc-apple", + body: "Apples are bright and crisp." 
+ ) + ) + + let mutation = try await library.removeDocument(withID: "doc-apple") + let retries = try await retryStore.pendingRetries() + + #expect(mutation.conventional.status == .succeeded) + #expect(mutation.semantic.status == .queuedRetry) + #expect(mutation.semantic.retry?.operation == .removeDocument) + #expect(retries.first?.operation == .removeDocument) + } + + @Test("Semantic state read failures do not queue semantic retries") + func semanticStateReadFailuresDoNotQueueRetries() async throws { + let fetchLibrary = FetchKitLibrary() + let retryStore = InMemorySwiftlyFetchSemanticRetryStore() + let library = SwiftlyFetchLibrary( + fetchLibrary: fetchLibrary, + knowledgeBase: KnowledgeBase( + chunker: DefaultChunker(), + embedder: HashingEmbedder(), + index: StateReadFailingVectorIndex() + ), + retryStore: retryStore + ) + let record = FetchDocumentRecord( + id: "doc-apple", + title: "Apple Guide", + body: "Apples are bright and crisp." + ) + + let addMutation = try await library.addDocument(record) + let removeMutation = try await library.removeDocument(withID: "doc-apple") + let retries = try await retryStore.pendingRetries() + + #expect(addMutation.semantic.status == .succeeded) + #expect(addMutation.semantic.state == nil) + #expect(removeMutation.semantic.status == .succeeded) + #expect(removeMutation.semantic.state == nil) + #expect(retries.isEmpty) + } + + @Test("Retry removes missing index retries") + func retryRemovesMissingIndexRetries() async throws { + let retryStore = InMemorySwiftlyFetchSemanticRetryStore() + try await retryStore.upsert( + SwiftlyFetchSemanticRetry( + documentID: "doc-missing", + operation: .indexDocument, + reason: "Test retry" + ) + ) + let library = try SwiftlyFetchLibrary( + fetchLibrary: FetchKitLibrary(), + knowledgeBase: await KnowledgeBase.hashingDefault(), + retryStore: retryStore + ) + + let result = try await library.retrySemanticIndexing() + let retries = try await retryStore.pendingRetries() + + 
#expect(result.removedMissingDocumentIDs == ["doc-missing"]) + #expect(retries.isEmpty) + } + + @Test("Facade returns conventional and semantic corpus results side by side") + func facadeReturnsConventionalAndSemanticCorpusResultsSideBySide() async throws { + let library = try await indexedFixtureLibrary() + + let botanyResult = try await library.searchAndRetrieve( + conventional: FetchSearchQuery( + "storage food seeds", + kind: .allTerms, + fields: [.body], + limit: 3 + ), + semantic: SearchQuery("food stored inside seeds for growing plants", limit: 3) + ) + let storyResult = try await library.searchAndRetrieve( + conventional: FetchSearchQuery( + "needle sew shirt", + kind: .allTerms, + fields: [.title, .body], + limit: 3 + ), + semantic: SearchQuery("a child and mother fix a shirt with a needle", limit: 3) + ) + + #expect(botanyResult.conventional.first?.document.id == "gutenberg-78430-chapter-1") + #expect(botanyResult.semantic.map(\.chunk.documentID).contains("gutenberg-78430-chapter-1")) + #expect(storyResult.conventional.first?.document.id == "tinystories-row-0-needle") + #expect(storyResult.semantic.map(\.chunk.documentID).contains("tinystories-row-0-needle")) + } + +#if os(macOS) + @Test("Persistent facade reopens conventional and semantic state") + func persistentFacadeReopensConventionalAndSemanticState() async throws { + let directory = try temporaryDirectory() + + do { + let firstLibrary = try await SwiftlyFetchLibrary.macOSPersistentLibrary(at: directory) + try await firstLibrary.addDocument( + FetchDocumentRecord( + id: "doc-apple", + title: "Apple Guide", + body: "Apples are bright and crisp.", + contentType: .markdown + ) + ) + } + + let reopenedLibrary = try await SwiftlyFetchLibrary.macOSPersistentLibrary(at: directory) + let conventionalResults = try await reopenedLibrary.search(FetchSearchQuery("apple guide", fields: [.title])) + let semanticResults = try await reopenedLibrary.retrieve(SearchQuery("bright crisp", limit: 1)) + + 
#expect(conventionalResults.first?.document.id == "doc-apple") + #expect(semanticResults.first?.chunk.documentID == "doc-apple") + } + + @Test("Core Data semantic retry store reopens pending retries") + func coreDataSemanticRetryStoreReopensPendingRetries() async throws { + let directory = try temporaryDirectory() + let storeURL = directory.appendingPathComponent("SemanticRetries.sqlite") + let olderDate = try #require(Calendar.current.date(from: DateComponents(year: 2026, month: 1, day: 1))) + let newerDate = try #require(Calendar.current.date(from: DateComponents(year: 2026, month: 1, day: 2))) + + do { + let store = try await CoreDataSwiftlyFetchSemanticRetryStore( + configuration: .init(store: .sqlite(storeURL)) + ) + try await store.upsert( + SwiftlyFetchSemanticRetry( + documentID: "doc-newer", + operation: .removeDocument, + reason: "Test persisted remove retry", + attemptCount: 2, + createdAt: newerDate, + lastAttemptAt: newerDate, + nextRetryAt: newerDate.addingTimeInterval(60), + lastFailure: "Test remove failed." + ) + ) + try await store.upsert( + SwiftlyFetchSemanticRetry( + documentID: "doc-older", + operation: .indexDocument, + reason: "Test persisted index retry", + createdAt: olderDate, + lastFailure: "Test indexing failed." 
+ ) + ) + } + + let reopenedStore = try await CoreDataSwiftlyFetchSemanticRetryStore( + configuration: .init(store: .sqlite(storeURL)) + ) + let pendingRetries = try await reopenedStore.pendingRetries() + let limitedRetries = try await reopenedStore.pendingRetries(limit: 1) + + #expect(pendingRetries.map(\.documentID) == ["doc-older", "doc-newer"]) + #expect(pendingRetries.first?.operation == .indexDocument) + #expect(pendingRetries.first?.reason == "Test persisted index retry") + #expect(pendingRetries.first?.lastFailure == "Test indexing failed.") + #expect(pendingRetries.last?.operation == .removeDocument) + #expect(pendingRetries.last?.attemptCount == 2) + #expect(pendingRetries.last?.lastFailure == "Test remove failed.") + #expect(limitedRetries.map(\.documentID) == ["doc-older"]) + + try await reopenedStore.removeRetries(for: ["doc-older"]) + + let finalStore = try await CoreDataSwiftlyFetchSemanticRetryStore( + configuration: .init(store: .sqlite(storeURL)) + ) + let finalRetries = try await finalStore.pendingRetries() + + #expect(finalRetries.map(\.documentID) == ["doc-newer"]) + } +#endif +} + +private func indexedFixtureLibrary() async throws -> SwiftlyFetchLibrary { + let library = try await SwiftlyFetchLibrary.default() + + for record in GutenbergMiniCorpus.records + TinyStoriesMiniCorpus.records { + try await library.addDocument(record) + } + + return library +} + +private struct ThrowingChunker: Chunker { + func chunks(for document: Document) throws -> [Chunk] { + throw TestFailure.chunkingFailed + } +} + +private actor RemoveFailingVectorIndex: VectorIndex { + func upsert(_ chunks: [IndexedChunk]) async throws {} + + func search(_ query: SearchQuery, embedding: EmbeddingVector) async throws -> [SearchResult] { + [] + } + + func removeChunks(for documentID: DocumentID) async throws { + throw TestFailure.semanticRemoveFailed + } + + func removeAll() async throws {} +} + +private actor StateReadFailingVectorIndex: VectorIndex, SemanticIndexStateStore { 
+ private var chunksByDocumentID: [DocumentID: [IndexedChunk]] = [:] + + func upsert(_ chunks: [IndexedChunk]) async throws { + for chunk in chunks { + chunksByDocumentID[chunk.chunk.documentID, default: []].append(chunk) + } + } + + func search(_ query: SearchQuery, embedding: EmbeddingVector) async throws -> [SearchResult] { + [] + } + + func removeChunks(for documentID: DocumentID) async throws { + chunksByDocumentID[documentID] = nil + } + + func removeAll() async throws { + chunksByDocumentID.removeAll() + } + + func state(for documentID: DocumentID) async throws -> SemanticIndexState? { + throw TestFailure.semanticStateReadFailed + } + + func states(for documentIDs: [DocumentID]) async throws -> [SemanticIndexState] { + throw TestFailure.semanticStateReadFailed + } + + func markIndexing(documentID: DocumentID, fingerprint: SemanticIndexFingerprint) async throws {} + + func markCurrent(documentID: DocumentID, fingerprint: SemanticIndexFingerprint) async throws {} + + func markStale(documentID: DocumentID, reason: String?) async throws {} + + func markFailed( + documentID: DocumentID, + fingerprint: SemanticIndexFingerprint?, + reason: String + ) async throws {} +} + +private enum TestFailure: Error, CustomStringConvertible { + case chunkingFailed + case semanticRemoveFailed + case semanticStateReadFailed + + var description: String { + switch self { + case .chunkingFailed: + "Test chunker intentionally failed while building semantic chunks." + case .semanticRemoveFailed: + "Test vector index intentionally failed while removing semantic chunks." + case .semanticStateReadFailed: + "Test semantic index state store intentionally failed while reading state." 
+ } + } +} + +#if os(macOS) +private func temporaryDirectory() throws -> URL { + let directory = FileManager.default + .temporaryDirectory + .appendingPathComponent("SwiftlyFetchTests", isDirectory: true) + .appendingPathComponent(UUID().uuidString, isDirectory: true) + + try FileManager.default.createDirectory( + at: directory, + withIntermediateDirectories: true + ) + + return directory +} +#endif diff --git a/docs/maintainers/fixture-corpus.md b/docs/maintainers/fixture-corpus.md index 8bfa0e5..cd9062a 100644 --- a/docs/maintainers/fixture-corpus.md +++ b/docs/maintainers/fixture-corpus.md @@ -18,7 +18,9 @@ Why this source fits the first pass: - the `chapters` config has chapter titles and chapter text, which is a useful shape for document-search quality tests - the corpus can be inspected through the Hugging Face Dataset Viewer APIs without adding a Swift dependency -The fixture records live in `Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift`. Each source-derived record carries dataset, config, split, row, and Gutenberg ID metadata so the sample remains attributable and replaceable. The fixture also includes small synthetic near-miss and longer-body records derived from the same topic shape. Those synthetic records exist to stress ranking and snippet selection without expanding the checked-in corpus into a large text dump. +The second source-derived mini corpus is derived from the [`roneneldan/TinyStories`](https://huggingface.co/datasets/roneneldan/TinyStories) dataset on Hugging Face. TinyStories is useful here because its rows are short synthetic English stories, so the fixture can add a second text source without adding long copyrighted excerpts or a live dataset download to ordinary tests. The dataset card identifies the corpus as synthetic GPT-3.5/GPT-4-generated stories and tags it with `license:cdla-sharing-1.0`. + +The shared fixture records live in `Tests/SwiftlyFetchTestFixtures/`. 
Each source-derived record carries dataset, config, split, row, and source metadata so the sample remains attributable and replaceable. `GutenbergMiniCorpus` also includes small synthetic near-miss and longer-body records derived from the same topic shape. Those synthetic records exist to stress ranking and snippet selection without expanding the checked-in corpus into a large text dump. Current synthetic records: @@ -38,6 +40,8 @@ In practical terms, simple result lists can keep rendering a snippet for every e The second fixture pass added a compact-evidence ranking expectation for the default in-memory path. For `allTerms` search, a document that places all terms close together should beat a near-miss that satisfies the same terms only through scattered mentions. That keeps the default backend closer to what an app user means by "this result is about my query" without turning `FetchCore` into a larger ranking DSL. +The third fixture pass moved source-derived records into a shared test fixture target and added TinyStories micro-records. Corpus-based tests now have at least two attributed text sources available: one public-domain Gutenberg-derived source and one synthetic story source. + ## Hugging Face Dependency Boundary Do not add a Hugging Face Swift dependency for the default fixture lane yet. The current checked-in fixture keeps CI deterministic and avoids adding a network, token, cache, or package-resolution requirement to ordinary tests. 
@@ -54,6 +58,8 @@ The fixture was inspected with read-only Dataset Viewer calls: curl -s 'https://datasets-server.huggingface.co/splits?dataset=zkeown/gutenberg-corpus' curl -s 'https://datasets-server.huggingface.co/rows?dataset=zkeown/gutenberg-corpus&config=books&split=train&offset=1&length=5' curl -s 'https://datasets-server.huggingface.co/rows?dataset=zkeown/gutenberg-corpus&config=chapters&split=train&offset=1&length=3' +curl -s 'https://datasets-server.huggingface.co/splits?dataset=roneneldan/TinyStories' +curl -s 'https://datasets-server.huggingface.co/first-rows?dataset=roneneldan/TinyStories&config=default&split=train' ``` Hugging Face documents dataset parquet discovery through the Dataset Viewer service in the [`huggingface_hub` CLI guide](https://huggingface.co/docs/huggingface_hub/guides/cli) and the Dataset Viewer [Parquet conversion guide](https://huggingface.co/docs/dataset-viewer/parquet). diff --git a/docs/maintainers/hybrid-search-persistence-plan.md b/docs/maintainers/hybrid-search-persistence-plan.md new file mode 100644 index 0000000..37ffb82 --- /dev/null +++ b/docs/maintainers/hybrid-search-persistence-plan.md @@ -0,0 +1,143 @@ +# Hybrid Search Persistence Plan + +## Purpose + +This note records the persistence direction for the package-family step where `SwiftlyFetch` becomes one local corpus with both conventional and semantic search. + +The chosen direction is: + +- `FetchKit` owns the durable corpus store. +- `FetchKit` derives and maintains the conventional full-text index. +- `RAGKit` owns the semantic chunk and vector index as a derived store. +- A future umbrella surface coordinates one ingestion call across both search modes. + +In plain language: app code should eventually add a document once, then get both keyword search and semantic retrieval over that same corpus. Internally, the two search systems should remain sibling derived indexes instead of being forced into one module. 
+ +## Current Problem + +The package already has durable conventional-search storage and sync recovery through `FetchKit`. + +The semantic side has had the right public protocol shape through `VectorIndex`, but the default implementation was memory-only. That meant an app could persist its conventional search corpus and SearchKit index, while semantic retrieval had to be rebuilt after restart by re-chunking and re-embedding the corpus. + +That asymmetry is the concrete behavior this plan fixes. + +## Ownership Model + +### FetchKit + +`FetchKit` owns the durable source corpus. + +Its responsibilities are: + +- store document records +- expose typed document mutation and lookup operations +- derive conventional full-text indexing changes +- keep SearchKit or future conventional-search backends current + +`FetchKit` should not start owning semantic chunks, embeddings, or vector search behavior. + +### RAGKit + +`RAGKit` owns semantic derived state. + +Its responsibilities are: + +- chunk source documents +- embed chunks and queries +- persist semantic chunks and vectors +- persist per-document semantic index health +- search vectors through the `VectorIndex` protocol +- remove semantic chunks by document identifier + +The first persisted semantic backend is `CoreDataVectorIndex`. Core Data is a practical first backend because it is already part of the package's Apple-first persistence story, but the public contract remains `VectorIndex` so a future backend can replace it without changing `KnowledgeBase`. + +The semantic index state model is RAG-owned, not umbrella-owned. 
It records whether a document's semantic derived state is: + +- `missing` +- `indexing` +- `current` +- `stale` +- `failed` + +That state carries a semantic fingerprint made from: + +- a source fingerprint for the document content and retrieval-relevant metadata +- a chunker fingerprint for the chunking policy +- an embedder fingerprint for the embedding policy + +This lets `RAGKit` answer whether its own semantic index is trustworthy without needing to inspect a future retry queue. + +### Future Umbrella Surface + +The future umbrella surface should coordinate the two sibling systems. + +Its job should be: + +- accept one app-level document ingestion call +- write the durable corpus record through `FetchKit` +- update the conventional search index +- derive the semantic document input for `RAGKit` +- update the semantic vector index +- enqueue document IDs for semantic retry if semantic indexing fails after the corpus write succeeds +- expose conventional, semantic, and later hybrid search entry points + +That facade should land after the semantic index is persistent. Otherwise it would hide a real durability mismatch behind a nicer API. + +The umbrella facade should own retry scheduling because retry needs to fetch the latest corpus record from `FetchKit` before re-indexing. `RAGKit` should own semantic health truth because it knows whether its chunks and vectors are current, stale, failed, or missing. + +## First Implementation Slice + +The first slice adds `CoreDataVectorIndex` in `RAGKit`. + +It persists: + +- chunk ID +- document ID +- chunk text +- chunk metadata +- chunk position +- embedding vector +- update timestamp +- per-document semantic index status +- semantic index fingerprint +- last indexed timestamp +- last failure description + +This is a durable building-block change. It gives `KnowledgeBase` restart-safe semantic retrieval without changing `RAGCore.VectorIndex` or making `FetchCore` depend on `RAGCore`. 
+ +The first convenience constructors are: + +- `KnowledgeBase.persistentHashingDefault(configuration:dimension:)` +- `KnowledgeBase.persistentNaturalLanguageDefault(configuration:languageHint:)` + +These constructors keep the same chunker and embedder defaults as the in-memory defaults while swapping in the Core Data-backed vector index. + +## Follow-Up Design Work + +The next architecture work should focus on shared corpus ingestion rather than another standalone index backend. The detailed umbrella plan lives in [swiftlyfetch-facade-plan.md](./swiftlyfetch-facade-plan.md). + +Recommended order: + +1. Expand and harden the existing bridge from `FetchDocumentRecord` to `RAGCore.Document`. +2. Extend the umbrella library facade ingestion lifecycle beyond one-corpus add/search/retrieve basics. +3. Evolve the umbrella-owned semantic retry queue policy with attempt strategy, cooldown tuning, and backlog management. +4. Add hybrid result packaging only after conventional and semantic result paths are each independently durable. + +## Open Questions + +- Should the umbrella facade return one combined mutation result or separate conventional and semantic mutation summaries? +- Should hybrid search combine scores inside the umbrella facade, or should it expose side-by-side result sets first? + +## Non-Goals + +Do not use this work to add: + +- generation +- chat orchestration +- agents +- remote embedding providers +- PDF ingestion +- connector-heavy ingestion +- a broad query language + +The owned job is still local search and retrieval over app-owned corpora. 
diff --git a/docs/maintainers/retrieval-package-plan.md b/docs/maintainers/retrieval-package-plan.md index 76fb4cb..8ed7da3 100644 --- a/docs/maintainers/retrieval-package-plan.md +++ b/docs/maintainers/retrieval-package-plan.md @@ -153,11 +153,14 @@ Implemented today: - `HeadingAwareMarkdownChunker` - `DefaultChunker` - `InMemoryVectorIndex` + - `CoreDataVectorIndex` - `HashingEmbedder` - `KnowledgeBase` - `NaturalLanguageEmbedder` - `AppleContextualEmbeddingBackend` - - convenience constructors for `hashingDefault()` and `naturalLanguageDefault()` + - convenience constructors for `hashingDefault()`, `naturalLanguageDefault()`, `persistentHashingDefault(configuration:dimension:)`, and `persistentNaturalLanguageDefault(configuration:languageHint:)` +- semantic index persistence now exists as a `RAGKit`-owned derived store through `CoreDataVectorIndex`, keeping semantic chunks and embeddings behind the existing `VectorIndex` protocol instead of pushing vector-storage concerns into `FetchKit` +- persisted semantic index health now exists as a `RAGKit` concern through document-level status and fingerprints, while retry scheduling remains reserved for the future umbrella ingestion surface - markdown chunking now uses a parser-backed internal section model built on [swift-markdown](https://github.com/swiftlang/swift-markdown) instead of the earlier line-based heading scanner - list-item chunks now preserve immediate lead-in context in chunk text and also expose chunk metadata for block kind, list kind, lead-in, ordinal, and heading path - block quotes stay secondary by default but are promoted into the primary retrieval stream when they make up more than one third of the document's chunkable block structure @@ -168,10 +171,14 @@ Implemented today: - markdown tables now produce one retrieval chunk per body row with header-aware text and table-row metadata - inline links and reference links now default to visible anchor text in chunk text, while raw destinations and 
reference definitions stay secondary and do not become standalone retrieval chunks unless a caller explicitly opts into chunk metadata for destinations - deterministic tests cover the main retrieval flow and the Natural Language wrapper seam +- Core Data-backed vector-index tests cover persisted semantic chunks, replacement, filtering, document removal, remove-all behavior, and the persistent `KnowledgeBase` convenience path +- semantic-state tests cover current, stale, failed, and source-fingerprint-change behavior - a real Natural Language integration test target exists, now runs in default local maintainer validation, and remains out of the default GitHub-hosted lane because the hosted `macos-15` path stalled in the Natural Language step Still intentionally incomplete: +- deeper lifecycle expansion of the one-corpus umbrella ingestion facade beyond the initial `FetchKit` write and `RAGKit` semantic indexing surface +- retry-policy refinements beyond the current umbrella-owned semantic retry scheduling, including backoff tuning, prioritization, and operational controls - markdown policy refinement for additional block kinds and future evolution - optional future retrieval-default refinements only if concrete caller needs emerge beyond the current exclusion, ordered-comparison, and grouped-context defaults diff --git a/docs/maintainers/swiftlyfetch-facade-plan.md b/docs/maintainers/swiftlyfetch-facade-plan.md new file mode 100644 index 0000000..9596c42 --- /dev/null +++ b/docs/maintainers/swiftlyfetch-facade-plan.md @@ -0,0 +1,302 @@ +# SwiftlyFetch Facade Plan + +## Purpose + +This note defines the first umbrella facade for the package family. + +The goal is one consumer-facing surface that accepts a corpus document once and keeps both conventional search and semantic retrieval current. + +The facade should not erase the package-family boundaries: + +- `FetchKit` owns durable corpus storage and conventional full-text search. 
+- `RAGKit` owns semantic chunks, embeddings, vector search, and semantic index health. +- `SwiftlyFetch` coordinates one ingestion surface, semantic retry scheduling, and later hybrid search. + +In plain language: app code should be able to say "add this document" once, then use keyword search, semantic retrieval, and eventually hybrid search over the same corpus. + +## New Package Surface + +Add a new library product and target named `SwiftlyFetch`. + +The target should depend on: + +- `FetchCore` +- `FetchKit` +- `RAGCore` +- `RAGKit` + +This is a durable building-block change. The practical effect is that consumers can import one umbrella module when they want the coordinated experience, while still being able to import the sibling packages directly for lower-level control. + +Do not move existing `FetchKit` or `RAGKit` APIs into the umbrella target. The facade should compose them. + +## First Public Facade + +Add a `SwiftlyFetchLibrary` actor. + +Its first job is coordinated corpus ingestion and separate search entry points: + +```swift +public actor SwiftlyFetchLibrary { + public func addDocument(_ record: FetchDocumentRecord) async throws -> SwiftlyFetchMutationResult + public func updateDocument(_ record: FetchDocumentRecord) async throws -> SwiftlyFetchMutationResult + public func removeDocument(withID id: FetchDocumentID) async throws -> SwiftlyFetchMutationResult + + public func search(_ query: FetchSearchQuery) async throws -> [FetchSearchResult] + public func retrieve(_ query: SearchQuery) async throws -> [SearchResult] +} +``` + +Use `search` for conventional search and `retrieve` for semantic retrieval. Do not add `hybridSearch` in the first facade slice. Hybrid ranking should wait until the one-corpus ingestion path and retry behavior are stable. 
+ +## Construction Shape + +The facade should support dependency injection first: + +```swift +public init( + fetchLibrary: FetchKitLibrary, + knowledgeBase: KnowledgeBase, + retryStore: any SwiftlyFetchSemanticRetryStore +) +``` + +Then add a default in-memory constructor for tests and examples: + +```swift +public static func `default`() async throws -> SwiftlyFetchLibrary +``` + +On macOS, add a persistent constructor after the injected path is proven: + +```swift +public static func macOSPersistentLibrary( + configuration: SwiftlyFetchPersistentConfiguration = .default +) async throws -> SwiftlyFetchLibrary +``` + +The persistent constructor should create: + +- `FetchKitLibrary.macOSPersistentLibrary(...)` +- a persistent `KnowledgeBase` backed by `CoreDataVectorIndex` +- an umbrella-owned semantic retry store + +The persistent configuration should be shaped around one storage root rather than asking callers to assemble separate store URLs for every internal component. + +## Document Mapping + +The bridge from `FetchDocumentRecord` to `RAGCore.Document` belongs in the `SwiftlyFetch` target. + +`FetchCore` should not import `RAGCore`, and `RAGKit` should not import `FetchCore`. + +First mapping policy: + +- `FetchDocumentRecord.id.rawValue` maps to `DocumentID`. +- `.plainText` maps to `.text`. +- `.markdown` maps to `.markdown`. +- string metadata maps into `DocumentMetadata` string values. +- `kind`, `language`, `sourceURI`, `createdAt`, and `updatedAt` map into semantic metadata when present. +- `title` should be included in semantic metadata. +- `title` should also be included in semantic source text by default. + +Title text should be part of semantic source text because many local corpus records are title-heavy. If the title only exists in metadata, semantic retrieval can miss the same document that conventional title search finds easily. 
+
+Recommended first text shaping:
+
+```text
+Title: <title>
+
+<body>
+```
+
+For markdown records, use a markdown heading:
+
+```markdown
+# <title>
+
+<body>
+```
+
+This title policy must be part of the source fingerprint, because changing it changes semantic derived state.
+
+## Mutation Flow
+
+For add and update:
+
+1. Write the `FetchDocumentRecord` through `FetchKitLibrary`.
+2. Map the stored record into a `RAGCore.Document`.
+3. Ask `KnowledgeBase` to index the semantic document.
+4. Return a mutation result with separate conventional and semantic outcomes.
+5. If semantic indexing fails after the corpus write succeeds, enqueue a semantic retry by document ID and return a degraded mutation result rather than pretending the whole corpus write failed.
+
+For remove:
+
+1. Remove the document through `FetchKitLibrary`.
+2. Remove semantic chunks through `KnowledgeBase`.
+3. Remove any pending semantic retry for that document.
+4. Return a mutation result with separate conventional and semantic outcomes.
+
+The facade should make partial success explicit. A durable corpus write followed by a semantic indexing failure is not the same failure as a rejected corpus write.
+
+The first facade should stay singular-only. Batch mutation APIs can follow after the single-document result model proves useful and readable.
+
+## Mutation Result Shape
+
+The first result should expose separate summaries rather than one flattened success flag.
+
+Suggested shape:
+
+```swift
+public struct SwiftlyFetchMutationResult: Hashable, Sendable {
+    public var documentIDs: [FetchDocumentID]
+    public var conventional: SwiftlyFetchMutationStage
+    public var semantic: SwiftlyFetchSemanticMutationStage
+}
+```
+
+Where conventional and semantic stages can say:
+
+- succeeded
+- skipped
+- queuedRetry
+- failed
+
+The semantic stage should carry the semantic index state when available. This lets callers show that conventional search is current while semantic retrieval is queued or degraded. 
+ +Conventional failures should throw before semantic work starts. Semantic failures after a successful corpus write should return `queuedRetry` and include the semantic failure detail instead of flattening the whole operation into one success flag. + +## Semantic Retry Ownership + +The umbrella facade owns retry scheduling. + +RAGKit owns semantic health truth, but it does not know how to fetch the latest durable corpus record. The facade can fetch the latest `FetchDocumentRecord` from `FetchKit`, map it into a `RAGCore.Document`, and ask `KnowledgeBase` to index it again. + +Add an umbrella-owned retry store protocol: + +```swift +public protocol SwiftlyFetchSemanticRetryStore: Sendable { + func upsert(_ retry: SwiftlyFetchSemanticRetry) async throws + func pendingRetries(limit: Int?) async throws -> [SwiftlyFetchSemanticRetry] + func removeRetries(for documentIDs: [FetchDocumentID]) async throws +} +``` + +Suggested retry record: + +```swift +public struct SwiftlyFetchSemanticRetry: Hashable, Codable, Sendable { + public var documentID: FetchDocumentID + public var operation: SwiftlyFetchSemanticRetryOperation + public var reason: String + public var attemptCount: Int + public var createdAt: Date + public var lastAttemptAt: Date? + public var nextRetryAt: Date? + public var lastFailure: String? +} +``` + +The retry operation should distinguish semantic indexing from semantic removal. Index retries re-read the latest durable corpus record before mapping and indexing. Removal retries cannot re-read a deleted corpus record; they should retry semantic chunk cleanup by document ID and then clear any pending retry on success. + +Add a retry entry point: + +```swift +public func retrySemanticIndexing(limit: Int? = nil) async throws -> SwiftlyFetchSemanticRetryResult +``` + +Retry behavior: + +1. Read pending retry records. +2. Defer records whose `nextRetryAt` is still in the future. +3. For index retries, fetch the latest document record from `FetchKitLibrary`. +4. 
If an index retry's document no longer exists, remove the retry. +5. Map the record to a semantic document. +6. Ask `KnowledgeBase` to index it. +7. For remove retries, ask `KnowledgeBase` to remove semantic chunks for the document ID. +8. On success, remove the retry. +9. On failure, update attempt count, last attempt date, next retry date, and last failure. + +Use a simple first retry schedule. Exponential backoff can come later if real use demands it. + +The default in-memory constructor should use the deterministic hashing semantic backend. Do not make `SwiftlyFetchLibrary.default()` depend on Apple Natural Language assets. + +## Search Surface + +The first facade should expose separate conventional and semantic calls: + +- `search(_:)` returns `FetchSearchResult` from `FetchKit`. +- `retrieve(_:)` returns `SearchResult` from `RAGKit`. + +Do not combine scores yet. Conventional scores and semantic cosine scores do not mean the same thing. + +Once one-corpus ingestion and retry are stable, add either: + +- a side-by-side hybrid response containing conventional and semantic result arrays, or +- a ranked hybrid response with an explicit score-combination policy. + +Side-by-side should be the first hybrid experiment unless a concrete caller needs one ranked list immediately. + +The first side-by-side API should be named `searchAndRetrieve` rather than `hybridSearch`. Reserve `hybridSearch` for a later ranked API with an explicit score-combination policy. + +## First Implementation Slices + +### Slice 1: Package Surface And Mapping + +- Add the `SwiftlyFetch` product and target. +- Add the `SwiftlyFetchTests` target. +- Add `SwiftlyFetchDocumentMapper`. +- Cover title/body/content-type/metadata mapping. +- Update README and roadmap to name the umbrella target. + +### Slice 2: Facade With In-Memory Dependencies + +- Add `SwiftlyFetchLibrary`. +- Add injected construction. +- Add default in-memory construction. 
+- Implement add, update, remove, `search`, and `retrieve`. +- Return separate conventional and semantic mutation outcomes. +- Cover successful one-corpus ingestion into both `FetchKit` and `RAGKit`. + +### Slice 3: Retry Store + +- Add `SwiftlyFetchSemanticRetryStore`. +- Add an in-memory retry store. +- Add a Core Data retry store for persistent facade construction. +- Queue retry when semantic indexing fails after corpus write succeeds. +- Add `retrySemanticIndexing(limit:)`. +- Cover success, failed retry update, and missing-document cleanup. + +### Slice 4: Persistent Construction + +- Add `SwiftlyFetchLibrary.macOSPersistentLibrary(...)`. +- Shape configuration around one storage root. +- Create persistent conventional search, semantic vector index, and retry store under that root. +- Cover persistent reopen behavior. + +### Slice 5: Hybrid Search Planning + +- Inspect the first facade result behavior against a small corpus. +- Decide whether the first hybrid response should be side-by-side or ranked. +- Add side-by-side `searchAndRetrieve` after the ingestion and retry model is stable. +- Keep ranked `hybridSearch` out of this slice. + +## Definition Of Done For First Facade Milestone + +- A caller can add one document once and query both conventional and semantic search. +- Conventional and semantic mutation outcomes are visible separately. +- Semantic indexing failures after corpus writes are queued for retry. +- RAG-owned semantic state reports current or failed state for documents touched by the facade. +- Retry fetches the latest corpus record before re-indexing. +- README, roadmap, and maintainer docs describe the split honestly. 
+ +## Non-Goals + +Do not add these in the first facade milestone: + +- hybrid ranking +- answer generation +- chat sessions +- agents +- PDF ingestion +- remote search or embedding providers +- connector-heavy ingestion +- a broad query language diff --git a/docs/media/swiftlyfetch-local-retrieval-promo.mp3 b/docs/media/swiftlyfetch-local-retrieval-promo.mp3 new file mode 100644 index 0000000..93d00c6 Binary files /dev/null and b/docs/media/swiftlyfetch-local-retrieval-promo.mp3 differ diff --git a/docs/releases/v0.2.0.md b/docs/releases/v0.2.0.md new file mode 100644 index 0000000..f09e159 --- /dev/null +++ b/docs/releases/v0.2.0.md @@ -0,0 +1,26 @@ +# SwiftlyFetch v0.2.0 + +## What Changed + +- added the first `SwiftlyFetch` umbrella facade for one-corpus ingestion across conventional search and semantic retrieval +- persisted semantic vector index state and document-level semantic health through Core Data-backed `RAGKit` storage +- added semantic retry storage, retry cooldown handling, persistent facade construction, and side-by-side `searchAndRetrieve(...)` +- expanded corpus-based coverage with a TinyStories-derived fixture source alongside the existing Gutenberg-derived fixture records +- hardened release resume behavior and refreshed the quick-start documentation with package dependency guidance and promo media + +## Breaking Changes + +- None. This is a backward-compatible minor release on top of `v0.1.2`. + +## Migration Or Upgrade Notes + +- Existing `RAGCore`, `RAGKit`, `FetchCore`, and `FetchKit` callers can keep using those products directly. +- New callers that want coordinated corpus writes can import `SwiftlyFetch` and use `SwiftlyFetchLibrary`. +- `SwiftlyFetchLibrary.searchAndRetrieve(...)` returns conventional and semantic results side by side; ranked hybrid search remains intentionally reserved for a later score-policy API. 
+- The default umbrella facade still uses deterministic hashing embeddings so tests, previews, and examples do not depend on downloaded Apple embedding assets.
+
+## Verification Performed
+
+- `scripts/repo-maintenance/validate-all.sh`
+- `swift test --filter SwiftlyFetchLibraryTests`
+- `swiftformat --lint Sources/SwiftlyFetch/SwiftlyFetchLibrary.swift Sources/SwiftlyFetch/SwiftlyFetchSemanticRetry.swift Tests/SwiftlyFetchTests/SwiftlyFetchLibraryTests.swift`
diff --git a/scripts/repo-maintenance/config/release.env b/scripts/repo-maintenance/config/release.env
index 50c4868..5cb1e1c 100644
--- a/scripts/repo-maintenance/config/release.env
+++ b/scripts/repo-maintenance/config/release.env
@@ -1,3 +1,16 @@
 # Repo-maintenance release defaults.
 REPO_MAINTENANCE_DEFAULT_RELEASE_MODE=standard
 REPO_MAINTENANCE_RELEASE_BRANCH=main
+REPO_MAINTENANCE_REMOTE_CI_MODE=full
+
+# GitHub can accept branch, tag, PR, check, review, and release mutations before
+# those surfaces become readable. These defaults keep release scripts
+# explicit about intentional waits instead of failing on transient indexing gaps.
+REPO_MAINTENANCE_GH_WAIT_TIMEOUT_SECONDS=120
+REPO_MAINTENANCE_GH_WAIT_POLL_SECONDS=5
+
+# Keep full local validation as the default release gate. For repositories whose
+# GitHub CI is intentionally heavy, use --remote-ci-mode defer so release.sh
+# pauses after branch push, PR creation, and initial check discovery. Codex can
+# then use a native thread Timer/Wakeup or heartbeat automation to resume later
+# instead of leaving a long-running shell process open just to poll GitHub.
diff --git a/scripts/repo-maintenance/lib/common.sh b/scripts/repo-maintenance/lib/common.sh index 1e0405b..b456429 100755 --- a/scripts/repo-maintenance/lib/common.sh +++ b/scripts/repo-maintenance/lib/common.sh @@ -38,6 +38,104 @@ load_profile_env() { load_env_file "$REPO_MAINTENANCE_ROOT/config/profile.env" } +positive_integer_or_default() { + value="$1" + default_value="$2" + + case "$value" in + ''|*[!0-9]*) + printf '%s\n' "$default_value" + ;; + 0) + printf '%s\n' "$default_value" + ;; + *) + printf '%s\n' "$value" + ;; + esac +} + +github_wait_timeout() { + value="$1" + default_timeout="$(positive_integer_or_default "${REPO_MAINTENANCE_GH_WAIT_TIMEOUT_SECONDS:-120}" 120)" + positive_integer_or_default "$value" "$default_timeout" +} + +github_wait_poll_seconds() { + value="$1" + default_poll_seconds="$(positive_integer_or_default "${REPO_MAINTENANCE_GH_WAIT_POLL_SECONDS:-5}" 5)" + positive_integer_or_default "$value" "$default_poll_seconds" +} + +wait_for_remote_branch() { + branch_name="$1" + timeout_seconds="$(github_wait_timeout "${REPO_MAINTENANCE_REMOTE_BRANCH_TIMEOUT_SECONDS:-}")" + poll_seconds="$(github_wait_poll_seconds "${REPO_MAINTENANCE_REMOTE_BRANCH_POLL_SECONDS:-}")" + elapsed_seconds="0" + + log "Waiting up to ${timeout_seconds}s for remote branch origin/$branch_name to become visible." + + while :; do + if git -C "$REPO_ROOT" ls-remote --exit-code --heads origin "$branch_name" >/dev/null 2>&1; then + log "Remote branch origin/$branch_name is visible." + return 0 + fi + + if [ "$elapsed_seconds" -ge "$timeout_seconds" ]; then + die "Remote branch origin/$branch_name was not visible after ${timeout_seconds}s. Confirm the branch push succeeded and that the origin remote is reachable before rerunning release.sh." 
+ fi + + sleep "$poll_seconds" + elapsed_seconds=$((elapsed_seconds + poll_seconds)) + done +} + +wait_for_remote_tag() { + tag_name="$1" + timeout_seconds="$(github_wait_timeout "${REPO_MAINTENANCE_REMOTE_TAG_TIMEOUT_SECONDS:-}")" + poll_seconds="$(github_wait_poll_seconds "${REPO_MAINTENANCE_REMOTE_TAG_POLL_SECONDS:-}")" + elapsed_seconds="0" + + log "Waiting up to ${timeout_seconds}s for remote tag $tag_name to become visible." + + while :; do + if git -C "$REPO_ROOT" ls-remote --exit-code --tags origin "refs/tags/$tag_name" >/dev/null 2>&1; then + log "Remote tag $tag_name is visible." + return 0 + fi + + if [ "$elapsed_seconds" -ge "$timeout_seconds" ]; then + die "Remote tag $tag_name was not visible after ${timeout_seconds}s. Confirm the tag push succeeded and that GitHub has indexed the tag before rerunning release.sh." + fi + + sleep "$poll_seconds" + elapsed_seconds=$((elapsed_seconds + poll_seconds)) + done +} + +wait_for_github_release() { + tag_name="$1" + timeout_seconds="$(github_wait_timeout "${REPO_MAINTENANCE_GH_RELEASE_TIMEOUT_SECONDS:-}")" + poll_seconds="$(github_wait_poll_seconds "${REPO_MAINTENANCE_GH_RELEASE_POLL_SECONDS:-}")" + elapsed_seconds="0" + + log "Waiting up to ${timeout_seconds}s for GitHub release $tag_name to become readable." + + while :; do + if gh release view "$tag_name" >/dev/null 2>&1; then + log "GitHub release $tag_name is readable." + return 0 + fi + + if [ "$elapsed_seconds" -ge "$timeout_seconds" ]; then + die "GitHub release $tag_name was not readable after ${timeout_seconds}s. Confirm release creation succeeded and GitHub has indexed the release before rerunning release.sh." + fi + + sleep "$poll_seconds" + elapsed_seconds=$((elapsed_seconds + poll_seconds)) + done +} + ensure_git_repo() { git -C "$REPO_ROOT" rev-parse --is-inside-work-tree >/dev/null 2>&1 || die "maintain-project-repo must run inside a git worktree rooted at $REPO_ROOT." 
} diff --git a/scripts/repo-maintenance/release.sh b/scripts/repo-maintenance/release.sh index 5665b83..9449e99 100755 --- a/scripts/repo-maintenance/release.sh +++ b/scripts/repo-maintenance/release.sh @@ -17,6 +17,7 @@ base_branch="${REPO_MAINTENANCE_RELEASE_BRANCH:-main}" review_comments_addressed="false" skip_branch_cleanup="false" dry_run="false" +remote_ci_mode="${REPO_MAINTENANCE_REMOTE_CI_MODE:-full}" while [ "$#" -gt 0 ]; do case "$1" in @@ -48,6 +49,10 @@ while [ "$#" -gt 0 ]; do review_comments_addressed="true" shift ;; + --remote-ci-mode) + remote_ci_mode="${2:-}" + shift 2 + ;; --skip-branch-cleanup) skip_branch_cleanup="true" shift @@ -59,7 +64,7 @@ while [ "$#" -gt 0 ]; do -h|--help) cat <<'USAGE' Usage: - release.sh --mode standard --version <vX.Y.Z> [--base-branch main] [--skip-validate] [--skip-version-bump] [--skip-gh-release] [--review-comments-addressed] [--skip-branch-cleanup] [--dry-run] + release.sh --mode standard --version <vX.Y.Z> [--base-branch main] [--skip-validate] [--skip-version-bump] [--skip-gh-release] [--review-comments-addressed] [--remote-ci-mode full|defer] [--skip-branch-cleanup] [--dry-run] release.sh --mode submodule --version <vX.Y.Z> [--skip-validate] [--skip-gh-release] [--dry-run] USAGE exit 0 @@ -76,6 +81,7 @@ export REPO_MAINTENANCE_RELEASE_MODE="$mode" export RELEASE_TAG="$release_tag" export REPO_MAINTENANCE_SKIP_GH_RELEASE="$skip_gh_release" export REPO_MAINTENANCE_DRY_RUN="$dry_run" +export REPO_MAINTENANCE_REMOTE_CI_MODE="$remote_ci_mode" ensure_clean_worktree() { status_output="$(git -C "$REPO_ROOT" status --porcelain)" @@ -96,6 +102,16 @@ ensure_semver_tag() { esac } +ensure_remote_ci_mode() { + case "$REPO_MAINTENANCE_REMOTE_CI_MODE" in + full|defer) + ;; + *) + die "Remote CI mode must be either full or defer. Use full to watch GitHub checks in this script, or defer to pause after initial check discovery and continue from a Codex wakeup." 
+ ;; + esac +} + current_branch() { git -C "$REPO_ROOT" symbolic-ref --quiet --short HEAD || true } @@ -110,12 +126,18 @@ ensure_branch_release_context() { run_version_bump() { release_version="${RELEASE_TAG#v}" version_bump_script="$SELF_DIR/version-bump.sh" + version_bump_subject="release: bump versions for $RELEASE_TAG" if [ "$skip_version_bump" = "true" ]; then log "Skipping repo version bump because --skip-version-bump was requested." return 0 fi + if git -C "$REPO_ROOT" log --format=%s "$base_branch"..HEAD | grep -Fxq "$version_bump_subject"; then + log "Version bump commit for $RELEASE_TAG is already on this branch; continuing the release resume path." + return 0 + fi + [ -x "$version_bump_script" ] || die "Standard release mode expected an executable repo-specific version bump hook at $version_bump_script. Add that hook so the repo's version surfaces move together, or rerun with --skip-version-bump when this release intentionally has no version-bearing files." if [ "$REPO_MAINTENANCE_DRY_RUN" = "true" ]; then @@ -139,7 +161,8 @@ create_release_tag() { tag_sha="$(git -C "$REPO_ROOT" rev-parse -q --verify "refs/tags/$RELEASE_TAG" 2>/dev/null || true)" if [ -n "$tag_sha" ]; then - [ "$tag_sha" = "$head_sha" ] || die "Tag $RELEASE_TAG already exists and does not point at HEAD." + tag_commit_sha="$(git -C "$REPO_ROOT" rev-list -n 1 "$RELEASE_TAG")" + [ "$tag_commit_sha" = "$head_sha" ] || die "Tag $RELEASE_TAG already exists and does not point at HEAD." log "Tag $RELEASE_TAG already points at HEAD." return 0 fi @@ -153,17 +176,28 @@ create_release_tag() { log "Created annotated tag $RELEASE_TAG." } -push_branch_and_tag() { +push_release_branch() { branch_name="$1" if [ "$REPO_MAINTENANCE_DRY_RUN" = "true" ]; then - log "Would push branch $branch_name and tag $RELEASE_TAG to origin." + log "Would push branch $branch_name to origin." return 0 fi git -C "$REPO_ROOT" push -u origin "$branch_name" + log "Pushed branch $branch_name." 
+ wait_for_remote_branch "$branch_name" +} + +push_release_tag() { + if [ "$REPO_MAINTENANCE_DRY_RUN" = "true" ]; then + log "Would push tag $RELEASE_TAG to origin." + return 0 + fi + git -C "$REPO_ROOT" push origin "$RELEASE_TAG" - log "Pushed branch $branch_name and tag $RELEASE_TAG." + log "Pushed tag $RELEASE_TAG." + wait_for_remote_tag "$RELEASE_TAG" } create_or_update_pr() { @@ -184,11 +218,11 @@ create_or_update_pr() { - prepares $RELEASE_TAG from branch \`$branch_name\` - keeps protected \`$base_branch\` updates behind pull request review and CI -- release tag \`$RELEASE_TAG\` was created locally before this PR so the reviewed release candidate is preserved exactly +- release tag \`$RELEASE_TAG\` will be created after CI and the review-comment gate pass, so failed or still-discussed release candidates do not get tagged ## Review Loop -Before merge, \`scripts/repo-maintenance/release.sh\` watches CI and stops on review comments unless the maintainer has already addressed or resolved them and reruns with \`--review-comments-addressed\`. +Before merge and tagging, \`scripts/repo-maintenance/release.sh\` watches CI and stops on review comments unless the maintainer has already addressed or resolved them and reruns with \`--review-comments-addressed\`. EOF pr_number="$(gh pr list --head "$branch_name" --base "$base_branch" --json number --jq '.[0].number // empty' --limit 1)" @@ -224,6 +258,86 @@ watch_ci() { log "CI is green for PR #$pr_number." } +defer_remote_ci_if_requested() { + pr_number="$1" + branch_name="$2" + + [ "$REPO_MAINTENANCE_REMOTE_CI_MODE" = "defer" ] || return 1 + + if [ "$REPO_MAINTENANCE_DRY_RUN" = "true" ]; then + log "Would defer remote CI after PR #$pr_number reports initial checks." + return 0 + fi + + pr_url="$(gh pr view "$pr_number" --json url --jq '.url')" + log "Remote CI mode is defer, so release.sh is pausing after local validation, branch push, PR creation, and initial check discovery." + log "Release is not complete yet. 
Let GitHub finish CI for PR #$pr_number, then continue from branch $branch_name with:" + log " bash scripts/repo-maintenance/release.sh --mode standard --version $RELEASE_TAG" + log "Codex should use a native thread Timer/Wakeup or heartbeat automation for this wait when available, then resume by checking $pr_url and rerunning the command above instead of leaving a shell script open to poll GitHub." + return 0 +} + +wait_for_initial_pr_checks() { + pr_number="$1" + timeout_seconds="$(github_wait_timeout "${REPO_MAINTENANCE_INITIAL_CHECK_TIMEOUT_SECONDS:-}")" + poll_seconds="$(github_wait_poll_seconds "${REPO_MAINTENANCE_INITIAL_CHECK_POLL_SECONDS:-}")" + elapsed_seconds="0" + last_state="no check data returned yet" + + log "Waiting up to ${timeout_seconds}s for GitHub to report initial checks on PR #$pr_number." + + while :; do + last_state="$(gh pr view "$pr_number" --json statusCheckRollup --jq '[.statusCheckRollup[]? | .name + ":" + ((.status // .state // .conclusion // "") | ascii_downcase)] | join(", ")' 2>/dev/null || printf 'no checks reported')" + check_count="$(gh pr view "$pr_number" --json statusCheckRollup --jq '(.statusCheckRollup // []) | length' 2>/dev/null || printf '0')" + case "$check_count" in + ''|*[!0-9]*) + check_count="0" + ;; + esac + + if [ "$check_count" -gt 0 ]; then + log "Found $check_count initial check(s) for PR #$pr_number." + return 0 + fi + + if [ "$elapsed_seconds" -ge "$timeout_seconds" ]; then + die "No checks were reported for PR #$pr_number after ${timeout_seconds}s. Last observed state: $last_state. Confirm the GitHub Actions workflow triggers for the release branch, Actions is enabled, and the branch push succeeded before rerunning release.sh." 
+ fi + + sleep "$poll_seconds" + elapsed_seconds=$((elapsed_seconds + poll_seconds)) + done +} + +wait_for_pr_review_state() { + pr_number="$1" + timeout_seconds="$(github_wait_timeout "${REPO_MAINTENANCE_PR_REVIEW_TIMEOUT_SECONDS:-}")" + poll_seconds="$(github_wait_poll_seconds "${REPO_MAINTENANCE_PR_REVIEW_POLL_SECONDS:-}")" + elapsed_seconds="0" + last_state="PR review/comment state has not been read yet" + + log "Waiting up to ${timeout_seconds}s for GitHub review/comment state on PR #$pr_number." + + while :; do + last_state="$(gh pr view "$pr_number" --json reviewDecision,comments,reviews --jq '"reviewDecision=" + (.reviewDecision // "") + ", comments=" + ((.comments | length) | tostring) + ", reviews=" + ((.reviews | length) | tostring)' 2>/dev/null || printf 'GitHub did not return PR review/comment state')" + case "$last_state" in + "GitHub did not return PR review/comment state") + ;; + *) + log "GitHub review/comment state is readable for PR #$pr_number: $last_state." + return 0 + ;; + esac + + if [ "$elapsed_seconds" -ge "$timeout_seconds" ]; then + die "GitHub review/comment state for PR #$pr_number was not readable after ${timeout_seconds}s. Last observed state: $last_state. Confirm the PR exists and GitHub is returning review data before rerunning release.sh." + fi + + sleep "$poll_seconds" + elapsed_seconds=$((elapsed_seconds + poll_seconds)) + done +} + check_pr_comments() { pr_number="$1" @@ -232,8 +346,10 @@ check_pr_comments() { return 0 fi + wait_for_pr_review_state "$pr_number" + review_decision="$(gh pr view "$pr_number" --json reviewDecision --jq '.reviewDecision // ""')" - comment_count="$(gh pr view "$pr_number" --json comments,reviews --jq '([.comments[]?, .reviews[]?] | length)')" + comment_count="$(gh pr view "$pr_number" --json comments,reviews --jq '([.comments[]?, (.reviews[]? 
| select(.state == "COMMENTED" or ((.body // "") | length > 0)))] | length)')" if [ "$review_decision" = "CHANGES_REQUESTED" ]; then gh pr view "$pr_number" --comments @@ -271,7 +387,7 @@ fast_forward_base_branch() { git -C "$REPO_ROOT" pull --ff-only origin "$base_branch" log "Fast-forwarded local $base_branch." else - warn "Could not check out local $base_branch, likely because another worktree owns it. Fast-forward $base_branch from origin/$base_branch in that checkout before cleanup." + die "Could not check out local $base_branch, likely because another worktree owns it. Fast-forward $base_branch from origin/$base_branch in that checkout, then rerun release.sh so the release tag is created from the reviewed base branch." fi } @@ -293,6 +409,7 @@ create_github_release() { gh release create "$RELEASE_TAG" --verify-tag --generate-notes log "Created GitHub release $RELEASE_TAG." + wait_for_github_release "$RELEASE_TAG" } cleanup_merged_branches() { @@ -325,6 +442,7 @@ run_standard_release() { ensure_git_repo ensure_gh_cli ensure_semver_tag + ensure_remote_ci_mode branch_name="$(ensure_branch_release_context)" ensure_clean_worktree @@ -334,14 +452,20 @@ run_standard_release() { run_version_bump ensure_clean_worktree - create_release_tag - push_branch_and_tag "$branch_name" + push_release_branch "$branch_name" create_or_update_pr "$branch_name" pr_number="$PR_NUMBER" + wait_for_initial_pr_checks "$pr_number" + if defer_remote_ci_if_requested "$pr_number" "$branch_name"; then + log "Standard release flow paused before remote CI watch for $RELEASE_TAG." + return 0 + fi watch_ci "$pr_number" check_pr_comments "$pr_number" merge_pr "$pr_number" fast_forward_base_branch + create_release_tag + push_release_tag create_github_release cleanup_merged_branches "$branch_name" log "Standard release flow completed successfully for $RELEASE_TAG." 
diff --git a/scripts/repo-maintenance/release/30-push-release.sh b/scripts/repo-maintenance/release/30-push-release.sh index 81d22d0..54de388 100755 --- a/scripts/repo-maintenance/release/30-push-release.sh +++ b/scripts/repo-maintenance/release/30-push-release.sh @@ -13,5 +13,7 @@ if [ "${REPO_MAINTENANCE_DRY_RUN:-false}" = "true" ]; then fi git -C "$REPO_ROOT" push -u origin "$branch_name" +wait_for_remote_branch "$branch_name" git -C "$REPO_ROOT" push origin "$RELEASE_TAG" +wait_for_remote_tag "$RELEASE_TAG" log "Pushed branch $branch_name and tag $RELEASE_TAG." diff --git a/scripts/repo-maintenance/release/40-github-release.sh b/scripts/repo-maintenance/release/40-github-release.sh index 348cfcc..0be674d 100755 --- a/scripts/repo-maintenance/release/40-github-release.sh +++ b/scripts/repo-maintenance/release/40-github-release.sh @@ -27,3 +27,4 @@ fi gh release create "$RELEASE_TAG" --verify-tag --generate-notes log "Created GitHub release $RELEASE_TAG." +wait_for_github_release "$RELEASE_TAG" diff --git a/scripts/repo-maintenance/version-bump.sh b/scripts/repo-maintenance/version-bump.sh index aa81e72..7933d59 100755 --- a/scripts/repo-maintenance/version-bump.sh +++ b/scripts/repo-maintenance/version-bump.sh @@ -25,12 +25,17 @@ import sys readme_path = Path(sys.argv[1]) tmp_path = Path(sys.argv[2]) tag = sys.argv[3] +version = tag.removeprefix("v") text = readme_path.read_text() -pattern = re.compile(r"`v\d+\.\d+\.\d+` is .*") -replacement = f"`{tag}` is the current tagged package release and is stable enough to try locally." -updated, count = pattern.subn(replacement, text, count=1) -if count != 1: +status_pattern = re.compile(r"(?:`v\d+\.\d+\.\d+` is .*|SwiftlyFetch has tagged releases stable enough to try locally, and the umbrella `SwiftlyFetch` surface is available in the current codebase\. 
See GitHub Releases for the latest published version details\.)") +status_replacement = "SwiftlyFetch has tagged releases stable enough to try locally, and the umbrella `SwiftlyFetch` surface is available in the current codebase. See GitHub Releases for the latest published version details." +updated, status_count = status_pattern.subn(status_replacement, text, count=1) +if status_count != 1: raise SystemExit("Could not find the README status line to update.") +dependency_pattern = re.compile(r'from: "[^"]+"') +updated, dependency_count = dependency_pattern.subn(f'from: "{version}"', updated, count=1) +if dependency_count != 1: + raise SystemExit("Could not find the README package dependency version to update.") tmp_path.write_text(updated) PY @@ -43,25 +48,26 @@ cat >"$release_notes_path" <<EOF ## What Changed -- refined conventional-search ranking so title hits get a modest boost, Search Kit scores normalize per field, and cross-field matches accumulate more intentionally -- replaced the old single-term snippet behavior with shared query-aware snippets that can highlight multiple query terms and show visible truncation markers when context is cropped -- added stronger default-path and Search Kit coverage for ranking preference, phrase behavior, and snippet presentation -- documented the new conventional-search refinement state in the README, roadmap, and maintainer notes +- added the first \`SwiftlyFetch\` umbrella facade for one-corpus ingestion across conventional search and semantic retrieval +- persisted semantic vector index state and document-level semantic health through Core Data-backed \`RAGKit\` storage +- added semantic retry storage, retry cooldown handling, persistent facade construction, and side-by-side \`searchAndRetrieve(...)\` +- expanded corpus-based coverage with a TinyStories-derived fixture source alongside the existing Gutenberg-derived fixture records +- hardened release resume behavior and refreshed the quick-start documentation with 
package dependency guidance and promo media ## Breaking Changes -- None. This is a backward-compatible patch release on top of \`v0.1.1\`. +- None. This is a backward-compatible minor release on top of \`v0.1.2\`. ## Migration Or Upgrade Notes -- \`RAGCore\` and \`RAGKit\` continue to provide the shipped semantic retrieval surface from \`v0.1.1\`. -- \`FetchCore\` and \`FetchKit\` still expose the same conventional-search foundation, but \`FetchKitLibrary\` search results now rank title and phrase matches more intentionally and return richer snippets by default. -- Real Natural Language integration coverage now runs in local maintainer validation by default, while GitHub-hosted CI still skips the asset-backed lane. -- The Search Kit verification path now runs in normal validation, with \`scripts/repo-maintenance/run-searchkit-tests.sh\` kept as a focused local helper. +- Existing \`RAGCore\`, \`RAGKit\`, \`FetchCore\`, and \`FetchKit\` callers can keep using those products directly. +- New callers that want coordinated corpus writes can import \`SwiftlyFetch\` and use \`SwiftlyFetchLibrary\`. +- \`SwiftlyFetchLibrary.searchAndRetrieve(...)\` returns conventional and semantic results side by side; ranked hybrid search remains intentionally reserved for a later score-policy API. +- The default umbrella facade still uses deterministic hashing embeddings so tests, previews, and examples do not depend on downloaded Apple embedding assets. ## Verification Performed -- \`swift test\` - \`scripts/repo-maintenance/validate-all.sh\` -- \`scripts/repo-maintenance/run-searchkit-tests.sh\` +- \`swift test --filter SwiftlyFetchLibraryTests\` +- \`swiftformat --lint Sources/SwiftlyFetch/SwiftlyFetchLibrary.swift Sources/SwiftlyFetch/SwiftlyFetchSemanticRetry.swift Tests/SwiftlyFetchTests/SwiftlyFetchLibraryTests.swift\` EOF