diff --git a/src/semble/cli.py b/src/semble/cli.py index 509d99b..e9de6de 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -34,13 +34,18 @@ def _mcp_main() -> None: help="Local directory or git URL to pre-index at startup (optional).", ) parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") + parser.add_argument( + "--include-text-files", + action="store_true", + help="Also index non-code text files (.md, .yaml, .json, etc.).", + ) args = parser.parse_args() if any(find_spec(dep) is None for dep in get_package_extras("semble", "mcp")): print("MCP dependencies are not installed. Run: pip install 'semble[mcp]'", file=sys.stderr) raise SystemExit(1) from semble.mcp import serve - asyncio.run(serve(args.path, ref=args.ref)) + asyncio.run(serve(args.path, ref=args.ref, include_text_files=args.include_text_files)) def _run_init(*, force: bool = False) -> None: @@ -66,12 +71,22 @@ def _cli_main() -> None: search_p.add_argument( "-m", "--mode", default="hybrid", choices=["hybrid", "semantic", "bm25"], help="Search mode (default: hybrid)." ) + search_p.add_argument( + "--include-text-files", + action="store_true", + help="Also index non-code text files (.md, .yaml, .json, etc.).", + ) related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") related_p.add_argument("file_path", help="File path as shown in search results.") related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") + related_p.add_argument( + "--include-text-files", + action="store_true", + help="Also index non-code text files (.md, .yaml, .json, etc.).", + ) init_p = sub.add_parser("init", help="Write .claude/agents/semble-search.md for Claude Code sub-agent support.") init_p.add_argument("--force", action="store_true", help="Overwrite if the file already exists.") @@ -82,7 +97,12 @@ def _cli_main() -> None: _run_init(force=args.force) return - index = SembleIndex.from_git(args.path) if _is_git_url(args.path) else SembleIndex.from_path(args.path) + include_text = args.include_text_files + index = ( + SembleIndex.from_git(args.path, include_text_files=include_text) + if _is_git_url(args.path) + else SembleIndex.from_path(args.path, include_text_files=include_text) + ) if args.command == "search": results = index.search(args.query, top_k=args.top_k, mode=args.mode) diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 42e814d..38d170c 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -116,10 +116,10 @@ async def find_related( return server -async def serve(path: str | None = None, ref: str | None = None) -> None: +async def serve(path: str | None = None, ref: str | None = None, include_text_files: bool = False) -> None: """Start an MCP stdio server, optionally pre-indexing a default source.""" model = await asyncio.to_thread(load_model) - cache = _IndexCache(model=model) + cache = _IndexCache(model=model, include_text_files=include_text_files) if path: await cache.get(path, ref=ref) if not _is_git_url(path): @@ -132,9 +132,10 @@ async def serve(path: str | None = None, ref: str | None = None) -> None: class _IndexCache: """Cache of indexed repos and local paths for the lifetime of the MCP server process.""" - def __init__(self, model: Encoder) -> None: + def __init__(self, model: Encoder, include_text_files: bool = False) -> None: """Initialise an empty cache with a shared embedding model.""" self._model = model + self._include_text_files = include_text_files self._tasks: OrderedDict[str, asyncio.Task[SembleIndex]] = OrderedDict() # ordered for LRU eviction self._watcher_task: asyncio.Task[None] | None = None @@ -173,11 +174,19 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: self._tasks.popitem(last=False) if _is_git_url(source): self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread(SembleIndex.from_git, source, ref=ref, model=self._model) + asyncio.to_thread( + SembleIndex.from_git, + source, + ref=ref, + model=self._model, + include_text_files=self._include_text_files, + ) ) else: self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread(SembleIndex.from_path, cache_key, model=self._model) + asyncio.to_thread( + SembleIndex.from_path, cache_key, model=self._model, include_text_files=self._include_text_files + ) ) task = self._tasks[cache_key] try: