From b0cb5f6ee1825c1c61a6901a83b56bfc8ed00eb1 Mon Sep 17 00:00:00 2001 From: Fazle Elahee Date: Tue, 12 May 2026 11:24:14 +0100 Subject: [PATCH 1/3] fix: setup and first-run pain points (#67) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five connected setup-path bugs from AZagatti's report. All live in the install / first-run / hook path, all silent failures that look like working features. 1. Drop --changed-only from the post-commit hook template (`indexer/git_hooks.py`). The flag no longer exists on `cce index`, so every commit silently errored with "No such option: --changed-only". `cce index` (no flag) already does incremental indexing via the manifest's content-hash check, so the hook just needs the flag dropped. 2. Persistent fastembed cache + .incomplete sweep (`indexer/embedder.py`). Three problems folded together: - Default cache moves from `$TMPDIR/fastembed_cache` to `$XDG_CACHE_HOME/fastembed` (or `~/.cache/fastembed`) so it survives WSL/Ubuntu's systemd-tmpfiles `/tmp` wipe on every boot. - New `CCE_FASTEMBED_CACHE_PATH` env var (precedence over `FASTEMBED_CACHE_PATH`) lets users with multiple fastembed tools isolate CCE's cache. - `_sweep_incomplete_downloads()` runs at Embedder init and removes any `*.incomplete` blobs left behind by a stalled huggingface_hub download. Without this, one flaky network event produces a zero-byte ONNX file that crashes every subsequent run with NO_SUCHFILE until the user manually nukes the cache. - Embedder error message now points at the cache dir and suggests the rm-rf recovery so the failure mode is at least debuggable. 3. TCP liveness probe in hook scripts (`memory/hook_installer.py`). POSIX hook adds a bash /dev/tcp probe; Windows .cmd hook adds a PowerShell TcpClient probe. A stale `serve.port` no longer burns 1-2s/curl call when nothing's listening — which on long Claude Code sessions with hundreds of PostToolUse/UserPromptSubmit events adds up to many minutes of dead wait. 4. Empty-index MCP returns a status response instead of blocking (`integration/mcp_server.py`). First `context_search` on an empty index used to silently run a full reindex and only respond when it finished — from the MCP client's side it looked indistinguishable from a hang. The new behaviour kicks indexing off as a background task and immediately returns a one-line status telling the user to retry or run `cce index`. Tests: - test_git_hooks.py: asserts --changed-only no longer appears in any installed hook - test_embedder.py +6 cases: cache dir resolution order (CCE_FASTEMBED_CACHE_PATH > FASTEMBED_CACHE_PATH > XDG_CACHE_HOME > ~/.cache), .incomplete sweep removes stale partials but preserves good cache files, missing-dir is a no-op - test_hook_installer.py +2 cases: POSIX and Windows hook templates contain a TCP liveness probe before any curl call - tests/integration/test_mcp_empty_index.py (new, 3 cases): empty index returns a status text, populated index runs real retrieval, _ensure_indexed returns the right boolean for each state Full suite: 887 passed, 1 skipped, 0 failed. Ruff clean. --- src/context_engine/indexer/embedder.py | 75 ++++++++++++- src/context_engine/indexer/git_hooks.py | 7 +- src/context_engine/integration/mcp_server.py | 77 ++++++++++--- src/context_engine/memory/hook_installer.py | 18 ++++ tests/indexer/test_embedder.py | 64 +++++++++++ tests/indexer/test_git_hooks.py | 17 +++ tests/integration/test_mcp_empty_index.py | 108 +++++++++++++++++++ tests/memory/test_hook_installer.py | 28 +++++ 8 files changed, 377 insertions(+), 17 deletions(-) create mode 100644 tests/integration/test_mcp_empty_index.py diff --git a/src/context_engine/indexer/embedder.py b/src/context_engine/indexer/embedder.py index 38cc504..9664fcd 100644 --- a/src/context_engine/indexer/embedder.py +++ b/src/context_engine/indexer/embedder.py @@ -10,6 +10,7 @@ import os import sys from functools import lru_cache +from pathlib import Path from fastembed import TextEmbedding @@ -20,6 +21,66 @@ _DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" + +def _resolve_cache_dir() -> Path: + """Pick a persistent fastembed cache location. + + Precedence: + 1. ``FASTEMBED_CACHE_PATH`` (the env var fastembed itself recognises) + 2. ``CCE_FASTEMBED_CACHE_PATH`` (CCE-specific override, takes + priority over fastembed's own var so users with multiple tools + using fastembed can isolate CCE's cache) + 3. ``$XDG_CACHE_HOME/fastembed`` if XDG_CACHE_HOME is set + 4. ``~/.cache/fastembed`` + + Previously CCE accepted fastembed's default of + ``$TMPDIR/fastembed_cache`` which on WSL/Ubuntu's systemd-tmpfiles + layout (``/tmp`` cleared on every boot) meant the model got + re-downloaded on each restart — and re-hit any flaky-network failure + along with it (issue #67). + """ + cce_override = os.environ.get("CCE_FASTEMBED_CACHE_PATH", "").strip() + if cce_override: + return Path(cce_override).expanduser() + fast_override = os.environ.get("FASTEMBED_CACHE_PATH", "").strip() + if fast_override: + return Path(fast_override).expanduser() + xdg = os.environ.get("XDG_CACHE_HOME", "").strip() + if xdg: + return Path(xdg).expanduser() / "fastembed" + return Path.home() / ".cache" / "fastembed" + + +def _sweep_incomplete_downloads(cache_dir: Path) -> int: + """Delete any ``*.incomplete`` files from a previous stalled download. + + Without this sweep, a stalled ``huggingface_hub`` download leaves a + zero-byte ``model_optimized.onnx.incomplete`` file alongside the + expected ``model_optimized.onnx`` — and fastembed will then fail at + load time with a confusing ``NO_SUCHFILE`` error on every subsequent + run until the user manually nukes the cache (#67). + + Returns the number of files removed. + """ + if not cache_dir.exists(): + return 0 + removed = 0 + try: + for path in cache_dir.rglob("*.incomplete"): + try: + path.unlink() + removed += 1 + except OSError as exc: + log.warning("Could not remove stale fastembed file %s: %s", path, exc) + except OSError as exc: + log.warning("Failed to scan fastembed cache at %s: %s", cache_dir, exc) + if removed: + log.info( + "Cleared %d stale `*.incomplete` file(s) from fastembed cache at %s", + removed, cache_dir, + ) + return removed + # Passed straight to fastembed's `parallel` argument: # None → no data-parallel mp; use onnxruntime's own threading # N>0 → spawn N forkserver workers around onnxruntime @@ -60,12 +121,24 @@ def __init__( resolved = f"sentence-transformers/{model_name}" else: resolved = model_name + # Use a persistent cache dir that survives reboot, and clear out any + # partial downloads from a previously interrupted run so we never + # try to load a zero-byte ONNX file (#67). + cache_dir = _resolve_cache_dir() + try: + cache_dir.mkdir(parents=True, exist_ok=True) + except OSError as exc: + log.warning("Could not create fastembed cache dir %s: %s", cache_dir, exc) + _sweep_incomplete_downloads(cache_dir) try: - self._model = TextEmbedding(resolved) + self._model = TextEmbedding(resolved, cache_dir=str(cache_dir)) except Exception as exc: raise RuntimeError( f"Failed to load embedding model '{model_name}'. " f"Ensure fastembed is installed and the model name is valid. " + f"Cache dir: {cache_dir}. " + f"If a previous download was interrupted, removing the cache " + f"and retrying may help: rm -rf {cache_dir}. " f"Supported models: TextEmbedding.list_supported_models(). " f"Original error: {exc}" ) from exc diff --git a/src/context_engine/indexer/git_hooks.py b/src/context_engine/indexer/git_hooks.py index 1056668..baca88d 100644 --- a/src/context_engine/indexer/git_hooks.py +++ b/src/context_engine/indexer/git_hooks.py @@ -31,9 +31,14 @@ def _resolve_cce_binary() -> str: def _hook_script() -> str: + # `cce index` without any flag already performs incremental indexing + # via the on-disk manifest's content-hash check. The old + # `--changed-only` flag was removed but the hook template hadn't been + # updated — every commit silently errored with + # "No such option: --changed-only" (issue #67). bin_path = _resolve_cce_binary() return f"""{HOOK_MARKER} -{bin_path} index --changed-only >/dev/null 2>&1 & +{bin_path} index >/dev/null 2>&1 & """ diff --git a/src/context_engine/integration/mcp_server.py b/src/context_engine/integration/mcp_server.py index 2d5e8d8..8a14897 100644 --- a/src/context_engine/integration/mcp_server.py +++ b/src/context_engine/integration/mcp_server.py @@ -1,4 +1,5 @@ """MCP server exposing context engine tools to Claude Code.""" +import asyncio import json import logging import re @@ -883,24 +884,57 @@ async def call_tool(name: str, arguments: dict): # ── tool handlers ─────────────────────────────────────────────────────── - async def _ensure_indexed(self) -> None: - """Lazy indexing: if the index is empty, trigger indexing on first query.""" - if self._lazy_indexed: - return - self._lazy_indexed = True + async def _ensure_indexed(self) -> bool: + """Lazy indexing on empty index. + + Returns True iff the index already has content. When the index is + empty, indexing is kicked off as a background task and the method + returns False so the caller can surface an informative message + instead of blocking the MCP request for the duration of the + first-time index. The old behaviour silently blocked context_search + for many seconds (or many minutes on a large repo); from the + client's side it looked like the call had hung (#67). + """ try: count = self._backend._vector_store.count() if count > 0: - return + return True except Exception: - pass - # Index is empty — trigger on-the-fly indexing - log.info("Index empty — triggering lazy indexing for %s", self._project_name) + # If we can't tell, assume it's populated rather than kick off + # an erroneous reindex. + return True + + if self._lazy_indexed: + # Already kicked off — index is presumably building in the + # background. Keep telling callers to wait until it shows up. + return False + self._lazy_indexed = True + + log.info( + "Index empty — kicking off background indexing for %s", + self._project_name, + ) + + async def _bg_index(): + try: + from context_engine.indexer.pipeline import run_indexing + await run_indexing(self._config, self._project_dir, full=False) + log.info("Background indexing complete for %s", self._project_name) + except Exception as exc: + log.warning("Background indexing failed: %s", exc) + try: - from context_engine.indexer.pipeline import run_indexing - await run_indexing(self._config, self._project_dir, full=False) - except Exception as exc: - log.warning("Lazy indexing failed: %s", exc) + asyncio.create_task(_bg_index()) + except RuntimeError: + # No running loop (synchronous test harness, etc.) — fall back + # to blocking. Better than swallowing the request. + try: + from context_engine.indexer.pipeline import run_indexing + await run_indexing(self._config, self._project_dir, full=False) + return True + except Exception as exc: + log.warning("Lazy indexing failed: %s", exc) + return False async def _handle_context_search(self, args): query = (args.get("query") or "").strip() @@ -914,8 +948,21 @@ async def _handle_context_search(self, args): ) ] - # Lazy index if this is the first query and index is empty - await self._ensure_indexed() + # Lazy index if this is the first query and index is empty. + # When the index is empty we return immediately with a status line + # rather than blocking the call for the full indexing run — the + # MCP client otherwise sees a multi-second/minute hang with no + # progress indicator (#67). + indexed = await self._ensure_indexed() + if not indexed: + return [TextContent( + type="text", + text=( + f"Index for {self._project_name} is empty — indexing has " + "been started in the background. Retry this search in a " + "few seconds, or run `cce index` for a synchronous index." + ), + )] top_k = _clamp_top_k(args.get("top_k", 10)) max_tokens = args.get("max_tokens", 8000) diff --git a/src/context_engine/memory/hook_installer.py b/src/context_engine/memory/hook_installer.py index 78ec653..6046b8d 100644 --- a/src/context_engine/memory/hook_installer.py +++ b/src/context_engine/memory/hook_installer.py @@ -78,6 +78,17 @@ def _is_windows() -> bool: PORT="$(cat "${PORT_FILE}" 2>/dev/null)" [ -n "${PORT}" ] || exit 0 +# Quick TCP liveness probe via bash's /dev/tcp — if nothing is listening on +# the port (i.e. cce serve died but left its serve.port behind), bail out +# immediately instead of letting curl burn its full 1-2s timeout per hook +# call. On long Claude Code sessions with hundreds of PostToolUse events +# that adds up to many seconds of wasted wait (#67). If bash isn't +# available (rare on POSIX), fall through to the timed curl below — slower +# but still correct. +if command -v bash >/dev/null 2>&1; then + bash -c "exec 3<>/dev/tcp/127.0.0.1/${PORT}" 2>/dev/null || exit 0 +fi + if [ "${HOOK_NAME}" = "SessionStart" ]; then RESPONSE="$(curl -sf -m 2 -X POST -H "Content-Type: application/json" \\ --data-binary @- "http://127.0.0.1:${PORT}/hooks/${HOOK_NAME}" \\ @@ -115,6 +126,13 @@ def _is_windows() -> bool: set /p PORT=<"%PORT_FILE%" if "%PORT%"=="" exit /b 0 +REM Liveness probe before spending curl's full timeout. PowerShell's +REM TcpClient is universally available on supported Windows targets and +REM exits in ~50ms when nothing's listening, vs. curl's 1-2s. +REM See the POSIX comment above for the motivation (#67). +powershell -NoProfile -Command "$ErrorActionPreference='Stop'; try { (New-Object Net.Sockets.TcpClient).Connect('127.0.0.1',%PORT%); exit 0 } catch { exit 1 }" >nul 2>&1 +if errorlevel 1 exit /b 0 + if /i "%HOOK_NAME%"=="SessionStart" ( set "TMP_RESP=%TEMP%\\cce_hook_resp_%RANDOM%.txt" curl -sf -m 2 -X POST -H "Content-Type: application/json" ^ diff --git a/tests/indexer/test_embedder.py b/tests/indexer/test_embedder.py index f876f6d..d18e941 100644 --- a/tests/indexer/test_embedder.py +++ b/tests/indexer/test_embedder.py @@ -64,3 +64,67 @@ def test_resolve_parallel_invalid_env_falls_through(monkeypatch): monkeypatch.setattr("sys.platform", "linux") monkeypatch.setattr("os.cpu_count", lambda: 2) assert _resolve_parallel() == 2 + + +# ─── Issue #67 regression coverage ────────────────────────────────────── + + +def test_resolve_cache_dir_default(monkeypatch, tmp_path): + """No env var set → ~/.cache/fastembed, NOT /tmp.""" + from context_engine.indexer.embedder import _resolve_cache_dir + monkeypatch.delenv("CCE_FASTEMBED_CACHE_PATH", raising=False) + monkeypatch.delenv("FASTEMBED_CACHE_PATH", raising=False) + monkeypatch.delenv("XDG_CACHE_HOME", raising=False) + fake_home = tmp_path / "home" + monkeypatch.setattr("pathlib.Path.home", classmethod(lambda cls: fake_home)) + got = _resolve_cache_dir() + assert got == fake_home / ".cache" / "fastembed" + + +def test_resolve_cache_dir_respects_fastembed_env(monkeypatch, tmp_path): + from context_engine.indexer.embedder import _resolve_cache_dir + monkeypatch.delenv("CCE_FASTEMBED_CACHE_PATH", raising=False) + monkeypatch.setenv("FASTEMBED_CACHE_PATH", str(tmp_path / "custom")) + assert _resolve_cache_dir() == tmp_path / "custom" + + +def test_resolve_cache_dir_cce_override_wins(monkeypatch, tmp_path): + """CCE_FASTEMBED_CACHE_PATH overrides fastembed's own env var so users + with multiple tools sharing the fastembed default can isolate CCE's + cache.""" + from context_engine.indexer.embedder import _resolve_cache_dir + monkeypatch.setenv("CCE_FASTEMBED_CACHE_PATH", str(tmp_path / "cce_path")) + monkeypatch.setenv("FASTEMBED_CACHE_PATH", str(tmp_path / "fast_path")) + assert _resolve_cache_dir() == tmp_path / "cce_path" + + +def test_resolve_cache_dir_xdg(monkeypatch, tmp_path): + from context_engine.indexer.embedder import _resolve_cache_dir + monkeypatch.delenv("CCE_FASTEMBED_CACHE_PATH", raising=False) + monkeypatch.delenv("FASTEMBED_CACHE_PATH", raising=False) + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "xdg")) + assert _resolve_cache_dir() == tmp_path / "xdg" / "fastembed" + + +def test_sweep_incomplete_removes_stale_partial(tmp_path): + """Issue #67: a stalled huggingface_hub download leaves a 0-byte + `model_optimized.onnx.incomplete` file that crashes every subsequent + load. We must remove these on startup.""" + from context_engine.indexer.embedder import _sweep_incomplete_downloads + nested = tmp_path / "models--qdrant--bge" / "snapshots" / "abc" + nested.mkdir(parents=True) + bad = nested / "model_optimized.onnx.incomplete" + bad.write_bytes(b"") + good = nested / "tokenizer.json" + good.write_text("{}") + + removed = _sweep_incomplete_downloads(tmp_path) + assert removed == 1 + assert not bad.exists() + assert good.exists() # other cache files must survive + + +def test_sweep_incomplete_missing_dir_is_noop(tmp_path): + from context_engine.indexer.embedder import _sweep_incomplete_downloads + missing = tmp_path / "does_not_exist_yet" + assert _sweep_incomplete_downloads(missing) == 0 diff --git a/tests/indexer/test_git_hooks.py b/tests/indexer/test_git_hooks.py index a1e2e87..2bd4db0 100644 --- a/tests/indexer/test_git_hooks.py +++ b/tests/indexer/test_git_hooks.py @@ -46,3 +46,20 @@ def test_install_hooks_returns_empty_for_non_git(tmp_path): """Non-git directory should return empty list, not raise.""" result = install_hooks(project_dir=str(tmp_path)) assert result == [] + + +# ─── Issue #67 regression coverage ────────────────────────────────────── + + +def test_post_commit_hook_does_not_use_removed_flag(git_repo): + """`cce index --changed-only` flag was removed but the template still + referenced it, so every commit silently errored. The hook must invoke + bare `cce index` (which already does incremental indexing).""" + install_hooks(project_dir=str(git_repo)) + for name in ("post-commit", "post-checkout", "post-merge"): + content = (git_repo / ".git" / "hooks" / name).read_text() + assert "--changed-only" not in content, ( + f"{name} still references the removed --changed-only flag" + ) + # Sanity: the hook still triggers an index. + assert " index " in content or "index>" in content diff --git a/tests/integration/test_mcp_empty_index.py b/tests/integration/test_mcp_empty_index.py new file mode 100644 index 0000000..1882b12 --- /dev/null +++ b/tests/integration/test_mcp_empty_index.py @@ -0,0 +1,108 @@ +"""Tests for context_search behaviour on an empty index (#67). + +Previously, the first context_search against an empty index silently +blocked while a full reindex ran underneath — from the MCP client's +side the call looked hung. The fix returns a status response and +spawns the reindex as a background task instead. +""" +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest + +from context_engine.config import Config +from context_engine.integration.mcp_server import ContextEngineMCP + + +def _make_mcp(tmp_path, monkeypatch, *, chunk_count: int): + project_dir = tmp_path / "demo" + project_dir.mkdir(parents=True, exist_ok=True) + storage_path = tmp_path / "storage" + monkeypatch.chdir(project_dir) + config = Config( + storage_path=str(storage_path), + embedding_model="BAAI/bge-small-en-v1.5", + ) + backend = MagicMock() + backend._vector_store.count.return_value = chunk_count + compressor = MagicMock() + embedder = MagicMock() + retriever = MagicMock() + return ContextEngineMCP( + retriever=retriever, backend=backend, compressor=compressor, + embedder=embedder, config=config, + ) + + +@pytest.mark.asyncio +async def test_context_search_returns_status_when_index_empty( + tmp_path, monkeypatch +): + mcp = _make_mcp(tmp_path, monkeypatch, chunk_count=0) + + # Patch run_indexing so the background task can't actually fire off a + # real pipeline run during the test (it would try to read the empty + # tmp dir but the import would still cost real wall-time). + indexing_called: list[bool] = [] + + async def _fake_run_indexing(*a, **kw): + indexing_called.append(True) + + monkeypatch.setattr( + "context_engine.indexer.pipeline.run_indexing", + _fake_run_indexing, + ) + + result = await mcp._handle_context_search({"query": "anything"}) + assert len(result) == 1 + text = result[0].text + # Must NOT pretend the search ran — has to tell the user the index + # is empty and what to do. + assert "empty" in text.lower() + assert "cce index" in text.lower() or "indexing" in text.lower() + + +@pytest.mark.asyncio +async def test_context_search_uses_real_retrieval_when_index_populated( + tmp_path, monkeypatch +): + """A populated index must NOT return the empty-index status message — + confirms the guard runs the right way around.""" + mcp = _make_mcp(tmp_path, monkeypatch, chunk_count=5) + + # Stub the retriever + compressor to keep the path lightweight. + async def _fake_retrieve(*a, **kw): + return [] + + async def _fake_compress(chunks, level): + return chunks + + mcp._retriever.retrieve = _fake_retrieve + mcp._compressor.compress = _fake_compress + + result = await mcp._handle_context_search({"query": "anything"}) + assert len(result) == 1 + text = result[0].text + # The empty-index banner must not appear when the store has rows. + assert "indexing has been started" not in text.lower() + assert "is empty" not in text.lower() + + +@pytest.mark.asyncio +async def test_ensure_indexed_returns_false_for_empty_and_true_for_populated( + tmp_path, monkeypatch +): + mcp_empty = _make_mcp(tmp_path / "a", monkeypatch, chunk_count=0) + + async def _fake_run_indexing(*a, **kw): + return None + + monkeypatch.setattr( + "context_engine.indexer.pipeline.run_indexing", _fake_run_indexing, + ) + + assert await mcp_empty._ensure_indexed() is False + + mcp_full = _make_mcp(tmp_path / "b", monkeypatch, chunk_count=10) + assert await mcp_full._ensure_indexed() is True diff --git a/tests/memory/test_hook_installer.py b/tests/memory/test_hook_installer.py index 0ba20c1..8e2890e 100644 --- a/tests/memory/test_hook_installer.py +++ b/tests/memory/test_hook_installer.py @@ -63,6 +63,34 @@ def test_posix_hook_script_body_when_platform_not_win(monkeypatch): assert "curl -sf" in body +# ─── Issue #67 regression coverage: liveness probe in hook scripts ───── + + +def test_posix_hook_has_tcp_liveness_probe(monkeypatch): + """POSIX hook must short-circuit when nothing's listening on the port + so a stale serve.port doesn't waste 1-2s/curl call (#67).""" + monkeypatch.setattr(hi, "_is_windows", lambda: False) + body = hi._hook_script_body() + # The probe uses bash's /dev/tcp pseudo-device. + assert "/dev/tcp/127.0.0.1" in body + # The probe gates curl: if it fails we exit BEFORE the curl call. + probe_pos = body.index("/dev/tcp/127.0.0.1") + curl_pos = body.index("curl -sf") + assert probe_pos < curl_pos, ( + "Liveness probe must run before any curl invocation" + ) + + +def test_windows_hook_has_tcp_liveness_probe(monkeypatch): + monkeypatch.setattr(hi, "_is_windows", lambda: True) + body = hi._hook_script_body() + # TcpClient is the cheapest port probe available without extra deps. + assert "TcpClient" in body + probe_pos = body.index("TcpClient") + curl_pos = body.index("curl -sf") + assert probe_pos < curl_pos + + def test_install_settings_uses_cmd_quoting_on_windows(tmp_path: Path, monkeypatch): """On Windows, the hook command must use cmd.exe-style double quotes around the path. POSIX single-quotes (shlex.quote) would not dequote From 077d14ba2e79fb1312c25ed5eb0c9b00911ec353 Mon Sep 17 00:00:00 2001 From: Fazle Elahee Date: Tue, 12 May 2026 11:33:44 +0100 Subject: [PATCH 2/3] ci: share fastembed cache between warm step and tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Embedder now defaults its fastembed cache to ~/.cache/fastembed (#67). CI's "Warm fastembed model cache" step still downloaded to fastembed's natural default (/tmp/fastembed_cache), so tests came up with my cache_dir empty — under pytest-xdist's parallel workers, several started concurrent downloads to that fresh dir and produced partial-snapshot races that ended in NO_SUCHFILE on the ONNX load. Both the warm step and the test step now point at \$RUNNER_TEMP/fastembed-cache via FASTEMBED_CACHE_PATH. fastembed itself honours that var, so the warm step still writes via the natural TextEmbedding default, and the tests' Embedder resolves to the same directory through the new _resolve_cache_dir() chain. --- .github/workflows/ci.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f1d1ad3..bb2fce7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,12 +70,21 @@ jobs: - name: Install package run: pip install -e ".[dev]" + # Pin fastembed's cache for BOTH the warm step and the test step so + # they share state. Embedder defaults to ~/.cache/fastembed under + # #67, and fastembed itself honours FASTEMBED_CACHE_PATH — setting + # this env var keeps the warm step and the tests pointed at the + # same directory without needing explicit kwargs. - name: Warm fastembed model cache + env: + FASTEMBED_CACHE_PATH: ${{ runner.temp }}/fastembed-cache run: | python -c "from fastembed import TextEmbedding; TextEmbedding('BAAI/bge-small-en-v1.5')" python -c "from fastembed import TextEmbedding; TextEmbedding('sentence-transformers/all-MiniLM-L6-v2')" - name: Run tests + env: + FASTEMBED_CACHE_PATH: ${{ runner.temp }}/fastembed-cache run: pytest -n ${{ runner.os == 'Windows' && '0' || '4' }} --cov=context_engine --cov-report=xml --cov-report=term # Windows GitHub Actions runners send spurious SIGINT during pytest # teardown, causing exit code 1 even when all tests pass (779/779). From 85a13c329330c4ccc0a7a6cd8faf23f9cc9e2122 Mon Sep 17 00:00:00 2001 From: Fazle Elahee Date: Tue, 12 May 2026 14:02:06 +0100 Subject: [PATCH 3/3] fix: address Copilot review feedback on #70 Six review findings, all resolved: 1. Security: validate PORT before shell interpolation (POSIX). A corrupted/hostile serve.port containing $(...) or backticks would be evaluated by the shell when bash -c interpolates "${PORT}". Refuse anything that isn't a digit-only string in the valid TCP range (1-65535) before any shell expansion. 2. Security: same validation on the Windows .cmd hook before the PowerShell -Command line interpolates %PORT%. findstr regex enforces digits-only; numeric range checks cap the value. 3. Shell-quote bin_path in git_hooks. Resolved binary paths commonly contain spaces (Windows user profiles, macOS users with a space in their name) and the unquoted path tokenised at the space. shlex.quote handles both cases. 4. _resolve_cache_dir docstring listed FASTEMBED_CACHE_PATH as priority 1 but the code checks CCE_FASTEMBED_CACHE_PATH first. Reordered the doc to match the implementation. 5. Replace `rm -rf {cache_dir}` recovery hint with a platform-neutral "delete the cache directory" message. 6. test_mcp_empty_index recorded `indexing_called` but never asserted on it. Added a `await asyncio.sleep(0)` to let the create_task'd coroutine run, then assert the list contains exactly one True. New regression tests: - test_posix_hook_rejects_non_numeric_port - test_windows_hook_rejects_non_numeric_port - test_post_commit_hook_quotes_bin_path_with_spaces 890 passed, 1 skipped, 0 failed. Ruff clean. --- src/context_engine/indexer/embedder.py | 14 +++++----- src/context_engine/indexer/git_hooks.py | 11 +++++++- src/context_engine/memory/hook_installer.py | 17 ++++++++++++ tests/indexer/test_git_hooks.py | 29 +++++++++++++++++++++ tests/integration/test_mcp_empty_index.py | 9 +++++++ tests/memory/test_hook_installer.py | 29 +++++++++++++++++++++ 6 files changed, 101 insertions(+), 8 deletions(-) diff --git a/src/context_engine/indexer/embedder.py b/src/context_engine/indexer/embedder.py index 9664fcd..261ee4d 100644 --- a/src/context_engine/indexer/embedder.py +++ b/src/context_engine/indexer/embedder.py @@ -25,11 +25,11 @@ def _resolve_cache_dir() -> Path: """Pick a persistent fastembed cache location. - Precedence: - 1. ``FASTEMBED_CACHE_PATH`` (the env var fastembed itself recognises) - 2. ``CCE_FASTEMBED_CACHE_PATH`` (CCE-specific override, takes - priority over fastembed's own var so users with multiple tools - using fastembed can isolate CCE's cache) + Precedence (highest first): + 1. ``CCE_FASTEMBED_CACHE_PATH`` (CCE-specific override; takes + priority over fastembed's own var so users running multiple + tools that share fastembed can isolate CCE's cache) + 2. ``FASTEMBED_CACHE_PATH`` (the env var fastembed itself recognises) 3. ``$XDG_CACHE_HOME/fastembed`` if XDG_CACHE_HOME is set 4. ``~/.cache/fastembed`` @@ -137,8 +137,8 @@ def __init__( f"Failed to load embedding model '{model_name}'. " f"Ensure fastembed is installed and the model name is valid. " f"Cache dir: {cache_dir}. " - f"If a previous download was interrupted, removing the cache " - f"and retrying may help: rm -rf {cache_dir}. " + f"If a previous download was interrupted, deleting the cache " + f"directory and retrying may help. " f"Supported models: TextEmbedding.list_supported_models(). " f"Original error: {exc}" ) from exc diff --git a/src/context_engine/indexer/git_hooks.py b/src/context_engine/indexer/git_hooks.py index baca88d..0af4276 100644 --- a/src/context_engine/indexer/git_hooks.py +++ b/src/context_engine/indexer/git_hooks.py @@ -1,4 +1,5 @@ """Git hook installer and handler for triggering re-indexing.""" +import shlex import shutil import stat import sys @@ -36,7 +37,15 @@ def _hook_script() -> str: # `--changed-only` flag was removed but the hook template hadn't been # updated — every commit silently errored with # "No such option: --changed-only" (issue #67). - bin_path = _resolve_cce_binary() + # + # bin_path must be shell-quoted because resolved paths commonly + # include spaces (e.g. C:\Users\Alice Smith\... on Windows, or + # /Users/Firstname Lastname/.venv/bin/cce on macOS). git's hook + # runner invokes the file via POSIX sh on every platform — even + # git-for-windows ships a bundled sh — so single-quoting with + # shlex.quote produces a correct token for the shell that actually + # runs the hook (Copilot review). + bin_path = shlex.quote(_resolve_cce_binary()) return f"""{HOOK_MARKER} {bin_path} index >/dev/null 2>&1 & """ diff --git a/src/context_engine/memory/hook_installer.py b/src/context_engine/memory/hook_installer.py index 6046b8d..813d6dd 100644 --- a/src/context_engine/memory/hook_installer.py +++ b/src/context_engine/memory/hook_installer.py @@ -77,6 +77,14 @@ def _is_windows() -> bool: [ -r "${PORT_FILE}" ] || exit 0 PORT="$(cat "${PORT_FILE}" 2>/dev/null)" [ -n "${PORT}" ] || exit 0 +# PORT is interpolated into a `bash -c` command and a curl URL below. +# A corrupted (or hostile) port file containing $(...) / backticks / +# newlines would otherwise be evaluated by the shell. Refuse anything +# that isn't a plain integer in the valid TCP range (Copilot review). +case "${PORT}" in + ''|*[!0-9]*) exit 0 ;; +esac +[ "${PORT}" -ge 1 ] 2>/dev/null && [ "${PORT}" -le 65535 ] 2>/dev/null || exit 0 # Quick TCP liveness probe via bash's /dev/tcp — if nothing is listening on # the port (i.e. cce serve died but left its serve.port behind), bail out @@ -125,6 +133,15 @@ def _is_windows() -> bool: set /p PORT=<"%PORT_FILE%" if "%PORT%"=="" exit /b 0 +REM PORT is interpolated into the PowerShell -Command string and the +REM curl URL below. A corrupted (or hostile) port file containing +REM ');something(' would otherwise terminate the intended expression +REM and inject. Refuse anything that isn't a positive integer in the +REM valid TCP range (Copilot review). +echo %PORT%|findstr /R "^[1-9][0-9]*$" >nul 2>&1 +if errorlevel 1 exit /b 0 +if %PORT% LSS 1 exit /b 0 +if %PORT% GTR 65535 exit /b 0 REM Liveness probe before spending curl's full timeout. PowerShell's REM TcpClient is universally available on supported Windows targets and diff --git a/tests/indexer/test_git_hooks.py b/tests/indexer/test_git_hooks.py index 2bd4db0..ef25dcb 100644 --- a/tests/indexer/test_git_hooks.py +++ b/tests/indexer/test_git_hooks.py @@ -63,3 +63,32 @@ def test_post_commit_hook_does_not_use_removed_flag(git_repo): ) # Sanity: the hook still triggers an index. assert " index " in content or "index>" in content + + +def test_post_commit_hook_quotes_bin_path_with_spaces(tmp_path, monkeypatch): + """A resolved cce binary path with spaces (e.g. Alice Smith) must be + shell-quoted in the generated hook so it isn't tokenised by sh + (Copilot review on #70).""" + from context_engine.indexer import git_hooks + + spaced = tmp_path / "Alice Smith" / "bin" / "cce" + spaced.parent.mkdir(parents=True) + spaced.write_text("#!/bin/sh\necho cce\n") + monkeypatch.setattr( + git_hooks, "_resolve_cce_binary", lambda: str(spaced), + ) + + git_repo = tmp_path / "repo" + (git_repo / ".git" / "hooks").mkdir(parents=True) + git_hooks.install_hooks(project_dir=str(git_repo)) + + body = (git_repo / ".git" / "hooks" / "post-commit").read_text() + # The path must be quoted as one shell token. shlex.quote single- + # quotes anything containing whitespace, so the literal `Alice Smith` + # must live inside `'...'`. Confirm a `'` appears before any space + # in the path. + assert "'" in body, f"bin_path should be shell-quoted: {body!r}" + assert "Alice Smith" in body + quoted_idx = body.index("'") + space_in_path = body.index("Alice Smith") + len("Alice") + assert quoted_idx < space_in_path diff --git a/tests/integration/test_mcp_empty_index.py b/tests/integration/test_mcp_empty_index.py index 1882b12..1d96a8e 100644 --- a/tests/integration/test_mcp_empty_index.py +++ b/tests/integration/test_mcp_empty_index.py @@ -7,6 +7,7 @@ """ from __future__ import annotations +import asyncio from unittest.mock import MagicMock import pytest @@ -61,6 +62,14 @@ async def _fake_run_indexing(*a, **kw): # is empty and what to do. assert "empty" in text.lower() assert "cce index" in text.lower() or "indexing" in text.lower() + # Background indexing must have been scheduled. Yielding once lets + # the create_task'd coroutine run (Copilot review: the previous + # version of the test recorded `indexing_called` but never + # asserted on it). + await asyncio.sleep(0) + assert indexing_called == [True], ( + "Background indexing should have been scheduled exactly once" + ) @pytest.mark.asyncio diff --git a/tests/memory/test_hook_installer.py b/tests/memory/test_hook_installer.py index 8e2890e..7adeea1 100644 --- a/tests/memory/test_hook_installer.py +++ b/tests/memory/test_hook_installer.py @@ -91,6 +91,35 @@ def test_windows_hook_has_tcp_liveness_probe(monkeypatch): assert probe_pos < curl_pos +# ─── Copilot review: PORT must be validated before shell interpolation ─ + + +def test_posix_hook_rejects_non_numeric_port(monkeypatch): + """Corrupted/hostile serve.port (containing $() or backticks) must + not be interpolated into the bash -c command (Copilot security + review on #70).""" + monkeypatch.setattr(hi, "_is_windows", lambda: False) + body = hi._hook_script_body() + assert "*[!0-9]*" in body, "POSIX script must reject non-digit PORT" + assert "-le 65535" in body, "POSIX script must cap PORT at 65535" + validation_pos = body.index("*[!0-9]*") + probe_pos = body.index("/dev/tcp/127.0.0.1") + curl_pos = body.index("curl -sf") + assert validation_pos < probe_pos < curl_pos + + +def test_windows_hook_rejects_non_numeric_port(monkeypatch): + monkeypatch.setattr(hi, "_is_windows", lambda: True) + body = hi._hook_script_body() + assert "findstr /R" in body + assert "LSS 1" in body + assert "GTR 65535" in body + validation_pos = body.index("findstr /R") + probe_pos = body.index("TcpClient") + curl_pos = body.index("curl -sf") + assert validation_pos < probe_pos < curl_pos + + def test_install_settings_uses_cmd_quoting_on_windows(tmp_path: Path, monkeypatch): """On Windows, the hook command must use cmd.exe-style double quotes around the path. POSIX single-quotes (shlex.quote) would not dequote