Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/workflows/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,33 @@ jobs:
cd packages/agentic-db
pnpm test -- --forceExit --detectOpenHandles __tests__/rag.test.ts __tests__/rag-unified-search.test.ts __tests__/cli-search-integration.test.ts

  # Unit tests for the new documents-loader package. Kept as a separate job so
  # it can run in parallel with the other package test jobs and fail fast
  # (10-minute cap) without blocking the longer e2e jobs.
  documents-loader-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 10

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # pnpm must be installed before setup-node so the 'pnpm' cache works.
      - name: Setup pnpm
        uses: pnpm/action-setup@v2
        with:
          version: 10.22.0

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: 'pnpm'

      - name: Install dependencies
        run: pnpm install --frozen-lockfile

      # Runs the package's own jest suite (no services needed — pure unit tests).
      - name: Run documents-loader tests
        run: |
          cd packages/documents-loader
          pnpm test

cli-e2e-tests:
runs-on: ubuntu-latest
timeout-minutes: 25
Expand Down Expand Up @@ -487,6 +514,9 @@ jobs:
- name: Build SDK
run: pnpm --filter @agentic-db/sdk run build

- name: Build documents-loader
run: pnpm --filter @agentic-db/documents-loader run build

- name: Wait for Ollama and pull model
run: |
echo "Waiting for Ollama to be ready..."
Expand Down
75 changes: 75 additions & 0 deletions packages/documents-loader/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# @agentic-db/documents-loader

Load, import, and export text-based files (markdown, MDX, plain text, etc.) into the agentic-db `documents` table.

## Features

- **Import** a directory of markdown/text files into the documents table
- **Export** documents back to disk as files, preserving directory structure
- **Bidirectional sync** between Git repositories and the database
- **Frontmatter parsing** for `.md` and `.mdx` files (title, tags, metadata)
- **Last-write-wins** conflict resolution for seamless workflows
- Supports `.md`, `.mdx`, `.txt`, `.rst`, `.html`, `.xml`, `.json`, `.yaml`, `.yml`, `.csv`, `.tsv`

## Usage

### As a library

```typescript
import {
importDirectory,
exportDocuments,
createDocumentClient,
} from '@agentic-db/documents-loader';
import { createClient } from '@agentic-db/sdk';

const sdk = createClient({ endpoint: 'https://your-endpoint', headers: { Authorization: 'Bearer <token>' } });
const client = createDocumentClient(sdk);

// Import files from a directory
const importStats = await importDirectory('./my-docs', client, {
repoName: 'my-repo',
tags: ['docs'],
commitHash: 'abc123',
});

// Export documents back to disk
const exportStats = await exportDocuments('./output', client, {
repoName: 'my-repo',
includeFrontmatter: true,
});
```

### Via the CLI

```bash
# Import a directory of docs
agentic-db docs import ./my-docs --repo my-repo --tags docs,internal

# Export documents to a directory
agentic-db docs export ./output --repo my-repo

# List documents for a repo
agentic-db docs list --repo my-repo
```

## How it works

### Import

1. Scans the directory for supported text files
2. Parses frontmatter from `.md`/`.mdx` files to extract title, tags, and metadata
3. Matches files to existing documents by `repo_name + file_path`
4. Creates new documents or updates existing ones (last-write-wins)
5. The database's auto-embed triggers handle embedding generation

### Export

1. Fetches all documents for the specified `repo_name`
2. Writes each document to disk at its `file_path`
3. Optionally includes frontmatter (title, tags, metadata) in markdown files
4. Creates nested directories as needed

### Conflict Resolution

Uses **last-write-wins**: whichever operation runs last (import or manual DB edit) determines the current state. This keeps the workflow simple and predictable.
278 changes: 278 additions & 0 deletions packages/documents-loader/__tests__/exporter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
import { mkdtempSync, readFileSync, rmSync, existsSync } from 'fs';
import { join } from 'path';
import { tmpdir } from 'os';

import { exportDocuments } from '../src/exporter';
import { DocumentClient, DocumentRecord } from '../src/importer';

/**
 * Builds an in-memory DocumentClient backed by a fixed array of records.
 * Lookups search the supplied array; create/update return synthesized
 * records and never mutate the backing list.
 */
function createMockClientWithDocs(docs: DocumentRecord[]): DocumentClient {
  const byId = (id: string) => docs.find((record) => record.id === id);

  return {
    findByRepoAndPath: async (repoName: string, filePath: string) => {
      const match = docs.find(
        (record) => record.repoName === repoName && record.filePath === filePath,
      );
      return match ?? null;
    },

    findByRepo: async (repoName: string) =>
      docs.filter((record) => record.repoName === repoName),

    create: async (doc) => ({
      id: 'new',
      ...doc,
      updatedAt: null,
      tags: doc.tags || null,
      metadata: doc.metadata || null,
      commitHash: doc.commitHash || null,
    }),

    update: async (id, patch) => {
      const existing = byId(id);
      if (!existing) throw new Error('Not found');
      return { ...existing, ...patch };
    },

    delete: async () => {
      // intentionally a no-op for these tests
    },
  };
}

describe('exporter', () => {
  let tempDir: string;

  // Fixture factory: every DocumentRecord field defaults to a null-ish value
  // and each test overrides only what it cares about.
  const makeDoc = (overrides: Partial<DocumentRecord>): DocumentRecord =>
    ({
      id: '1',
      title: null,
      content: '',
      repoName: 'test-repo',
      filePath: null,
      commitHash: null,
      tags: null,
      metadata: null,
      updatedAt: null,
      ...overrides,
    }) as DocumentRecord;

  // Every test writes into its own throwaway directory.
  beforeEach(() => {
    tempDir = mkdtempSync(join(tmpdir(), 'docloader-export-'));
  });

  afterEach(() => {
    rmSync(tempDir, { recursive: true, force: true });
  });

  it('should export documents as markdown files', async () => {
    const client = createMockClientWithDocs([
      makeDoc({
        title: 'Getting Started',
        content: '# Getting Started\n\nWelcome!',
        filePath: 'getting-started.md',
        tags: ['docs'],
        updatedAt: '2024-01-01T00:00:00Z',
      }),
    ]);

    const stats = await exportDocuments(tempDir, client, { repoName: 'test-repo' });

    expect(stats.written).toBe(1);
    expect(stats.errors).toBe(0);

    // Both the frontmatter and the original body should survive the round trip.
    const written = readFileSync(join(tempDir, 'getting-started.md'), 'utf-8');
    expect(written).toContain('---');
    expect(written).toContain('title: "Getting Started"');
    expect(written).toContain('tags: ["docs"]');
    expect(written).toContain('# Getting Started');
    expect(written).toContain('Welcome!');
  });

  it('should create nested directories for file paths', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ title: 'API Ref', content: '# API Reference', filePath: 'docs/api/reference.md' }),
    ]);

    await exportDocuments(tempDir, client, { repoName: 'test-repo' });

    const target = join(tempDir, 'docs', 'api', 'reference.md');
    expect(existsSync(target)).toBe(true);
    expect(readFileSync(target, 'utf-8')).toContain('# API Reference');
  });

  it('should skip documents without file_path or title', async () => {
    // No filePath and no title: the exporter has nothing to name the file by.
    const client = createMockClientWithDocs([makeDoc({ content: 'No path or title' })]);

    const stats = await exportDocuments(tempDir, client, { repoName: 'test-repo' });

    expect(stats.skipped).toBe(1);
    expect(stats.written).toBe(0);
  });

  it('should generate file path from title if no file_path', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ title: 'My Great Document', content: 'Content here' }),
    ]);

    await exportDocuments(tempDir, client, { repoName: 'test-repo' });

    // The title should be slugified into a markdown filename.
    expect(existsSync(join(tempDir, 'my-great-document.md'))).toBe(true);
  });

  it('should export without frontmatter when disabled', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ title: 'Raw Doc', content: '# Raw Content', filePath: 'raw.md', tags: ['tag1'] }),
    ]);

    await exportDocuments(tempDir, client, {
      repoName: 'test-repo',
      includeFrontmatter: false,
    });

    // Output is exactly the raw content — no frontmatter fence at all.
    const written = readFileSync(join(tempDir, 'raw.md'), 'utf-8');
    expect(written).not.toContain('---');
    expect(written).toBe('# Raw Content');
  });

  it('should include metadata in frontmatter', async () => {
    const client = createMockClientWithDocs([
      makeDoc({
        title: 'Meta Doc',
        content: 'Content',
        filePath: 'meta.md',
        metadata: { author: 'Dan', category: 'guide' },
      }),
    ]);

    await exportDocuments(tempDir, client, { repoName: 'test-repo' });

    const written = readFileSync(join(tempDir, 'meta.md'), 'utf-8');
    expect(written).toContain('author: "Dan"');
    expect(written).toContain('category: "guide"');
  });

  it('should report progress events', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ title: 'Doc', content: 'Content', filePath: 'doc.md' }),
    ]);

    const events: string[] = [];
    await exportDocuments(tempDir, client, {
      repoName: 'test-repo',
      onProgress: (event) => events.push(event.type),
    });

    expect(events).toContain('exporting');
    expect(events).toContain('written');
    expect(events).toContain('done');
  });

  it('should handle empty repo', async () => {
    const client = createMockClientWithDocs([]);

    const stats = await exportDocuments(tempDir, client, { repoName: 'empty-repo' });

    expect(stats.total).toBe(0);
    expect(stats.written).toBe(0);
  });

  it('should export multiple documents', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ id: '1', title: 'Doc A', content: 'Content A', filePath: 'a.md' }),
      makeDoc({ id: '2', title: 'Doc B', content: 'Content B', filePath: 'b.md' }),
      makeDoc({
        id: '3',
        title: 'Other Repo',
        content: 'Should not appear',
        repoName: 'other-repo',
        filePath: 'c.md',
      }),
    ]);

    const stats = await exportDocuments(tempDir, client, { repoName: 'test-repo' });

    // Only test-repo documents are exported; the other repo's file is absent.
    expect(stats.total).toBe(2);
    expect(stats.written).toBe(2);
    expect(existsSync(join(tempDir, 'a.md'))).toBe(true);
    expect(existsSync(join(tempDir, 'b.md'))).toBe(true);
    expect(existsSync(join(tempDir, 'c.md'))).toBe(false);
  });
});
Loading
Loading