Skip to content

Commit e89226b

Browse files
JoshuaRamirez and claude committed
Add multi-directory support for stores
Stores can now index documents from multiple directories by specifying --docs multiple times:

    rag store new my-store --docs ./path1 --docs ./path2

Changes:
- CLI: --docs flag accepts multiple paths via action="append"
- store.py: docs_dir -> docs_dirs (list), with backwards compat
- ingest.py: load_documents_from_dirs() aggregates from all paths
- migrate.py: passes docs_dir as single-element list to create_store
- All tests updated for new list-based interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 76b3a15 commit e89226b

8 files changed

Lines changed: 171 additions & 101 deletions

File tree

src/rag/cli.py

Lines changed: 37 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -100,18 +100,24 @@ def cmd_store_new(args: argparse.Namespace) -> int:
100100
from pathlib import Path
101101

102102
name = args.name
103-
docs_path = args.docs
103+
docs_paths = args.docs # List of paths or None
104104

105105
# Prompt for docs path if not provided
106-
if not docs_path:
107-
docs_path = console.input(f"[bold]Documents path for '{name}':[/bold] ").strip()
108-
if not docs_path:
106+
if not docs_paths:
107+
docs_input = console.input(f"[bold]Documents path for '{name}':[/bold] ").strip()
108+
if not docs_input:
109109
console.print("[red]Error: Documents path is required.[/red]")
110110
return 1
111+
docs_paths = [docs_input]
112+
113+
# Convert to Path objects
114+
docs_dirs = [Path(p) for p in docs_paths]
111115

112116
try:
113-
create_store(name, Path(docs_path))
117+
create_store(name, docs_dirs)
114118
console.print(f"[green]Store '{name}' created successfully.[/green]")
119+
if len(docs_dirs) > 1:
120+
console.print(f"[dim]Indexing {len(docs_dirs)} directories.[/dim]")
115121

116122
# Set as default if it's the first store
117123
stores = list_stores()
@@ -138,7 +144,7 @@ def cmd_store_list(args: argparse.Namespace) -> int:
138144
table = Table(title="Vector Bot Stores")
139145
table.add_column("Name", style="cyan")
140146
table.add_column("Default", style="green")
141-
table.add_column("Documents Path", style="dim")
147+
table.add_column("Documents", style="dim")
142148
table.add_column("Last Indexed", style="dim")
143149
table.add_column("Chunks", style="dim", justify="right")
144150

@@ -157,10 +163,19 @@ def cmd_store_list(args: argparse.Namespace) -> int:
157163
chunk_count = store.get("chunk_count")
158164
chunks_str = str(chunk_count) if chunk_count is not None else "-"
159165

166+
# Format docs_dirs for display
167+
docs_dirs = store.get("docs_dirs", [])
168+
if len(docs_dirs) == 0:
169+
docs_display = "-"
170+
elif len(docs_dirs) == 1:
171+
docs_display = docs_dirs[0]
172+
else:
173+
docs_display = f"{len(docs_dirs)} directories"
174+
160175
table.add_row(
161176
store["name"],
162177
is_default,
163-
store.get("docs_dir", "-"),
178+
docs_display,
164179
last_indexed,
165180
chunks_str,
166181
)
@@ -183,7 +198,17 @@ def cmd_store_info(args: argparse.Namespace) -> int:
183198

184199
console.print(f"[bold]Store: {name}[/bold]")
185200
console.print(f" Default: {'yes' if is_default else 'no'}")
186-
console.print(f" Documents path: {store.get('docs_dir', '-')}")
201+
202+
# Display document directories
203+
docs_dirs = store.get("docs_dirs", [])
204+
if len(docs_dirs) == 0:
205+
console.print(" Documents: -")
206+
elif len(docs_dirs) == 1:
207+
console.print(f" Documents: {docs_dirs[0]}")
208+
else:
209+
console.print(f" Documents: {len(docs_dirs)} directories")
210+
for d in docs_dirs:
211+
console.print(f" - {d}")
187212

188213
index_dir = get_store_index_dir(name)
189214
console.print(f" Index path: {index_dir}")
@@ -338,7 +363,8 @@ def cmd_migrate(args: argparse.Namespace) -> int:
338363
console.print()
339364
console.print("[bold]New store info:[/bold]")
340365
console.print(f" Name: {store_config['name']}")
341-
console.print(f" Documents: {store_config['docs_dir']}")
366+
docs_dirs = store_config.get('docs_dirs', [])
367+
console.print(f" Documents: {docs_dirs[0] if docs_dirs else '-'}")
342368
console.print(f" Index: {get_store_index_dir(store_name)}")
343369
console.print(f" Set as default: yes")
344370
except (ValueError, FileNotFoundError) as e:
@@ -472,7 +498,8 @@ def main(argv: Optional[List[str]] = None) -> int:
472498
store_new_parser.add_argument(
473499
"--docs",
474500
metavar="PATH",
475-
help="Path to documents directory (prompts if not given)",
501+
action="append",
502+
help="Path to documents directory (can be specified multiple times)",
476503
default=None,
477504
)
478505

src/rag/ingest.py

Lines changed: 39 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -88,25 +88,25 @@ def setup_llm_settings() -> None:
8888

8989

9090
def load_documents(docs_dir: Path) -> List:
91-
"""Load documents from the specified directory."""
91+
"""Load documents from a single directory."""
9292
if not docs_dir.exists():
9393
docs_dir.mkdir(parents=True, exist_ok=True)
9494
console.print(f"[yellow]Created docs directory: {docs_dir}[/yellow]")
9595
return []
96-
96+
9797
# Define file extensions to load
9898
extensions = [".txt", ".md", ".pdf", ".json", ".csv"]
99-
99+
100100
# Check for documents
101101
doc_files: list[Path] = []
102102
for ext in extensions:
103103
doc_files.extend(docs_dir.glob(f"*{ext}"))
104104
doc_files.extend(docs_dir.glob(f"**/*{ext}"))
105-
105+
106106
if not doc_files:
107107
console.print(f"[yellow]No documents found in {docs_dir}[/yellow]")
108108
return []
109-
109+
110110
# Filter out large files (>20MB)
111111
valid_files: list[Path] = []
112112
for file in doc_files:
@@ -115,27 +115,47 @@ def load_documents(docs_dir: Path) -> List:
115115
console.print(f"[yellow]Skipping large file (>{size_mb:.1f}MB): {file.name}[/yellow]")
116116
else:
117117
valid_files.append(file)
118-
118+
119119
if not valid_files:
120120
console.print("[yellow]No valid documents to index[/yellow]")
121121
return []
122-
123-
console.print(f"[blue]Loading {len(valid_files)} documents...[/blue]")
124-
122+
123+
console.print(f"[blue]Loading {len(valid_files)} documents from {docs_dir}...[/blue]")
124+
125125
# Load documents
126126
reader = SimpleDirectoryReader(
127127
input_dir=str(docs_dir),
128128
recursive=True,
129129
exclude_hidden=True,
130130
required_exts=extensions,
131131
)
132-
132+
133133
documents = reader.load_data()
134134
console.print(f"[green]Loaded {len(documents)} document chunks[/green]")
135-
135+
136136
return documents
137137

138138

139+
def load_documents_from_dirs(docs_dirs: List[Path]) -> List:
140+
"""Load documents from multiple directories.
141+
142+
Args:
143+
docs_dirs: List of paths to document directories.
144+
145+
Returns:
146+
Combined list of documents from all directories.
147+
"""
148+
all_documents = []
149+
for docs_dir in docs_dirs:
150+
documents = load_documents(docs_dir)
151+
all_documents.extend(documents)
152+
153+
if len(docs_dirs) > 1:
154+
console.print(f"[blue]Total: {len(all_documents)} chunks from {len(docs_dirs)} directories[/blue]")
155+
156+
return all_documents
157+
158+
139159
def ingest(
140160
store_name: str | None = None,
141161
verbose: bool = False,
@@ -155,11 +175,14 @@ def ingest(
155175
store_name = resolve_store(store_name)
156176
store = get_store(store_name)
157177

158-
docs_dir = Path(store["docs_dir"])
178+
# Get document directories (handles backwards compatibility)
179+
docs_dirs = [Path(d) for d in store.get("docs_dirs", [])]
159180
index_dir = get_store_index_dir(store_name)
160181

161182
console.print("[bold]Document Ingestion[/bold]")
162183
console.print(f"[blue]Store: {store_name}[/blue]")
184+
if len(docs_dirs) > 1:
185+
console.print(f"[dim]Indexing {len(docs_dirs)} directories[/dim]")
163186

164187
# Setup LLM settings
165188
setup_llm_settings()
@@ -194,10 +217,11 @@ def ingest(
194217
if force and index_dir.exists():
195218
console.print("[yellow]Force flag set - rebuilding index...[/yellow]")
196219

197-
# Load documents
198-
documents = load_documents(docs_dir)
220+
# Load documents from all directories
221+
documents = load_documents_from_dirs(docs_dirs)
199222
if not documents:
200-
raise RuntimeError(f"No documents found in {docs_dir}")
223+
dirs_str = ", ".join(str(d) for d in docs_dirs)
224+
raise RuntimeError(f"No documents found in: {dirs_str}")
201225

202226
# Count unique source files from loaded documents
203227
source_files = set()

src/rag/migrate.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -133,8 +133,8 @@ def migrate_legacy_index(
133133
if not docs_dir.is_absolute():
134134
docs_dir = docs_dir.resolve()
135135

136-
# Create the new store
137-
store_config = create_store(store_name, docs_dir)
136+
# Create the new store (pass docs_dir as a list)
137+
store_config = create_store(store_name, [docs_dir])
138138

139139
# Copy index files from legacy to new store
140140
new_index_dir = get_store_index_dir(store_name)

src/rag/store.py

Lines changed: 34 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -131,7 +131,7 @@ def list_stores() -> list[dict[str, Any]]:
131131
Returns:
132132
List of store info dictionaries, each containing:
133133
- name: Store name
134-
- docs_dir: Path to documents directory
134+
- docs_dirs: List of paths to documents directories
135135
- last_indexed: ISO timestamp of last indexing, or None
136136
- chunk_count: Number of indexed chunks
137137
"""
@@ -153,9 +153,16 @@ def list_stores() -> list[dict[str, Any]]:
153153
with open(store_config_path, "r", encoding="utf-8") as f:
154154
store_data = json.load(f)
155155

156+
# Handle both old (docs_dir) and new (docs_dirs) formats
157+
docs_dirs = store_data.get("docs_dirs")
158+
if docs_dirs is None:
159+
# Backwards compatibility: convert single docs_dir to list
160+
old_docs_dir = store_data.get("docs_dir")
161+
docs_dirs = [old_docs_dir] if old_docs_dir else []
162+
156163
stores.append({
157164
"name": store_data.get("name", store_dir.name),
158-
"docs_dir": store_data.get("docs_dir"),
165+
"docs_dirs": docs_dirs,
159166
"last_indexed": store_data.get("last_indexed"),
160167
"chunk_count": store_data.get("chunk_count", 0),
161168
})
@@ -174,6 +181,7 @@ def get_store(name: str) -> dict[str, Any] | None:
174181
175182
Returns:
176183
Store configuration dictionary, or None if store doesn't exist.
184+
Always returns docs_dirs as a list (converts legacy docs_dir if needed).
177185
"""
178186
_validate_store_name(name)
179187
store_config_path = _get_store_dir(name) / STORE_CONFIG_FILE
@@ -184,22 +192,27 @@ def get_store(name: str) -> dict[str, Any] | None:
184192
try:
185193
with open(store_config_path, "r", encoding="utf-8") as f:
186194
result: dict[str, Any] = json.load(f)
187-
return result
195+
196+
# Handle backwards compatibility: convert docs_dir to docs_dirs
197+
if "docs_dirs" not in result and "docs_dir" in result:
198+
result["docs_dirs"] = [result["docs_dir"]]
199+
200+
return result
188201
except (json.JSONDecodeError, OSError) as e:
189202
console.print(f"[yellow]Warning: Could not read store config: {e}[/yellow]")
190203
return None
191204

192205

193206
def create_store(
194207
name: str,
195-
docs_dir: Path,
208+
docs_dirs: list[Path],
196209
chat_model: str | None = None,
197210
) -> dict[str, Any]:
198211
"""Create a new document store.
199212
200213
Args:
201214
name: Unique name for the store.
202-
docs_dir: Path to the documents directory.
215+
docs_dirs: List of paths to documents directories.
203216
chat_model: Optional chat model override for this store.
204217
205218
Returns:
@@ -214,9 +227,12 @@ def create_store(
214227
if store_dir.exists():
215228
raise ValueError(f"Store '{name}' already exists.")
216229

217-
# Convert to absolute path if relative
218-
if not docs_dir.is_absolute():
219-
docs_dir = docs_dir.resolve()
230+
# Convert to absolute paths if relative
231+
resolved_dirs = []
232+
for docs_dir in docs_dirs:
233+
if not docs_dir.is_absolute():
234+
docs_dir = docs_dir.resolve()
235+
resolved_dirs.append(str(docs_dir))
220236

221237
# Create store directory structure
222238
store_dir.mkdir(parents=True, exist_ok=True)
@@ -226,7 +242,7 @@ def create_store(
226242
# Create store configuration
227243
store_config = {
228244
"name": name,
229-
"docs_dir": str(docs_dir),
245+
"docs_dirs": resolved_dirs,
230246
"created": datetime.now(timezone.utc).isoformat(),
231247
"last_indexed": None,
232248
"chunk_count": 0,
@@ -266,12 +282,15 @@ def update_store(name: str, **kwargs: Any) -> dict[str, Any]:
266282

267283
# Update fields
268284
for key, value in kwargs.items():
269-
if key == "docs_dir" and value is not None:
270-
# Ensure docs_dir is stored as absolute path string
271-
path = Path(value)
272-
if not path.is_absolute():
273-
path = path.resolve()
274-
store_config[key] = str(path)
285+
if key == "docs_dirs" and value is not None:
286+
# Ensure docs_dirs are stored as absolute path strings
287+
resolved = []
288+
for p in value:
289+
path = Path(p) if isinstance(p, str) else p
290+
if not path.is_absolute():
291+
path = path.resolve()
292+
resolved.append(str(path))
293+
store_config[key] = resolved
275294
else:
276295
store_config[key] = value
277296

tests/unit/test_cli.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -451,7 +451,7 @@ def test_StoreNew_WithDocsArg_CreatesStore(self, mock_console: Any) -> None:
451451

452452
# Assert
453453
assert result == 0
454-
mock_create.assert_called_once_with("my-store", Path("/path/to/docs"))
454+
mock_create.assert_called_once_with("my-store", [Path("/path/to/docs")])
455455

456456
def test_StoreNew_WithExistingStore_ReturnsOne(self, mock_console: Any) -> None:
457457
"""Test that store new returns 1 when store already exists."""
@@ -690,7 +690,7 @@ def test_StoreNew_WithoutDocsArg_PromptsAndSucceeds(self, mock_console: Any) ->
690690

691691
# Assert
692692
assert result == 0
693-
mock_create.assert_called_once_with("my-store", Path("/path/to/docs"))
693+
mock_create.assert_called_once_with("my-store", [Path("/path/to/docs")])
694694

695695
def test_StoreNew_WithEmptyDocsInput_ReturnsOne(self, mock_console: Any) -> None:
696696
"""Test that store new returns 1 when user inputs empty docs path."""

0 commit comments

Comments
 (0)