From 262eeefb8a1c22d4d937724c9e567374bd433082 Mon Sep 17 00:00:00 2001 From: Fazle Elahee Date: Tue, 5 May 2026 21:52:47 +0100 Subject: [PATCH 1/2] feat(indexer): HTML AST chunking + user-defined extension mappings - Wire tree-sitter-html parser into Chunker. + + + +""" + +HTML_PLAIN = """ +

Just text

no script or style here

+""" + + +def test_chunk_html_extracts_script_and_style(chunker): + chunks = chunker.chunk(HTML_WITH_BLOCKS, file_path="page.html", language="html") + types = [c.chunk_type for c in chunks] + # script and style become MODULE chunks + assert types.count(ChunkType.MODULE) >= 2 + contents = [c.content for c in chunks if c.chunk_type == ChunkType.MODULE] + assert any("console.log" in c for c in contents) + assert any("color: red" in c for c in contents) + + +def test_chunk_html_without_blocks_falls_back_to_whole_file(chunker): + chunks = chunker.chunk(HTML_PLAIN, file_path="plain.html", language="html") + # No script/style → fallback path returns single MODULE chunk for the file + assert len(chunks) == 1 + assert chunks[0].chunk_type == ChunkType.MODULE + assert "Just text" in chunks[0].content diff --git a/tests/indexer/test_language_resolution.py b/tests/indexer/test_language_resolution.py new file mode 100644 index 0000000..d70f46a --- /dev/null +++ b/tests/indexer/test_language_resolution.py @@ -0,0 +1,35 @@ +"""Unit tests for `_resolve_language` — the indexer hook that lets users add +custom file-extension → language mappings via `indexer.extensions` in +`.context-engine.yaml`. +""" +from context_engine.indexer.pipeline import _resolve_language + + +def test_builtin_extension_resolves_to_known_language(): + assert _resolve_language(".py", {}) == "python" + + +def test_unknown_extension_falls_back_to_plaintext(): + assert _resolve_language(".xyz", {}) == "plaintext" + + +def test_custom_alias_overrides_builtin(): + # .h normally maps to c; custom mapping flips it to cpp. + assert _resolve_language(".h", {".h": "cpp"}) == "cpp" + + +def test_custom_alias_for_unknown_extension(): + assert _resolve_language(".tpl", {".tpl": "html"}) == "html" + + +def test_custom_empty_value_means_plaintext(): + # User opts into indexing the file but knows there's no parser. 
+ assert _resolve_language(".liquid", {".liquid": ""}) == "plaintext" + + +def test_lookup_is_case_insensitive(): + # Extension comes from Path.suffix which preserves case (.HTML on Windows + # mounts, .R for R files); custom map keys are normalised to lowercase + # at config load time, so the lookup must lowercase the suffix too. + assert _resolve_language(".HTML", {}) == "html" + assert _resolve_language(".TPL", {".tpl": "html"}) == "html" diff --git a/tests/test_config.py b/tests/test_config.py index 70e65fe..ece5dcc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -85,3 +85,55 @@ def test_ollama_url_yaml_type_validation(tmp_path): })) with pytest.raises(ValueError, match="ollama_url"): load_config(global_path=config_file) + + +def test_indexer_extensions_default_empty(): + assert Config().indexer_extensions == {} + + +def test_indexer_extensions_loads_and_normalizes(tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": "html", ".MJS": "javascript", ".liquid": "", ".erb": None}}, + })) + config = load_config(global_path=config_file) + # Keys lowercased, null coerced to empty string. 
+ assert config.indexer_extensions == { + ".tpl": "html", + ".mjs": "javascript", + ".liquid": "", + ".erb": "", + } + + +def test_indexer_extensions_rejects_key_without_dot(tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml.dump({ + "indexer": {"extensions": {"tpl": "html"}}, + })) + with pytest.raises(ValueError, match="must be a string starting with"): + load_config(global_path=config_file) + + +def test_indexer_extensions_rejects_non_string_value(tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": 123}}, + })) + with pytest.raises(ValueError, match="must be a string or null"): + load_config(global_path=config_file) + + +def test_indexer_extensions_project_overrides_global(tmp_path): + global_file = tmp_path / "config.yaml" + global_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": "html"}}, + })) + project_file = tmp_path / ".context-engine.yaml" + project_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": "javascript", ".vue": "vue"}}, + })) + config = load_config(global_path=global_file, project_path=project_file) + # Project entry wins for .tpl; .vue exists only in the project file and is kept because deep_merge merges dicts. + assert config.indexer_extensions[".tpl"] == "javascript" + assert config.indexer_extensions[".vue"] == "vue" From c73bca6e40d9c161fe11dfaacee38123d42daa91 Mon Sep 17 00:00:00 2001 From: Fazle Elahee Date: Tue, 5 May 2026 21:59:40 +0100 Subject: [PATCH 2/2] docs(wiki): document indexer.extensions for custom file extensions New "Custom File Extensions" section covering: how to alias an extension to a built-in parser, how to opt into plaintext indexing with an empty value, override behavior per-project, and which languages currently have AST chunking vs. plaintext fallback. 
--- docs/wiki/Configuration.md | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/docs/wiki/Configuration.md b/docs/wiki/Configuration.md index 0f70bd5..9ccdaa3 100644 --- a/docs/wiki/Configuration.md +++ b/docs/wiki/Configuration.md @@ -137,6 +137,48 @@ Files matching `.gitignore` are also skipped automatically. --- +## Custom File Extensions + +By default the indexer recognises common source file extensions (`.py`, `.ts`, `.go`, `.html`, `.css`, …) and routes each to the right tree-sitter parser when one is available. If your project uses an extension CCE doesn't know about — a template language, a rebranded JS extension, a config DSL — register it under `indexer.extensions`: + +```yaml +indexer: + extensions: + .tpl: html # alias to an existing parser + .mjs: javascript + .cts: typescript + .liquid: "" # index as plaintext (no AST chunking) + .erb: "" +``` + +**Rules:** + +- **Keys** must start with `.` and are matched case-insensitively against file suffixes (`.HTML` and `.html` resolve the same way). +- **Values** are language strings — anything in the built-in `_LANGUAGE_MAP` works (`html`, `javascript`, `typescript`, `python`, `go`, `rust`, `java`, `php`, etc.). Unknown values are accepted and fall back to plaintext at chunk time. +- **Empty string or `null`** indexes the file as a single plaintext chunk. Useful when you want the file searchable but know there's no parser for it. 
+- **User entries override built-ins.** For example, force `.h` to be parsed as C++ instead of C: + + ```yaml + indexer: + extensions: + .h: cpp + ``` + +**Where to put it:** + +- Global default: `~/.cce/config.yaml` +- Project-specific: `.context-engine.yaml` in the project root (overrides the global entry per-extension) + +**After editing**, re-run indexing so existing files get re-chunked under the new mapping: + +```bash +cce index --full +``` + +**Parsers with full AST chunking** (semantic chunks for functions, classes, blocks): Python, JavaScript, TypeScript/TSX, PHP, Go, Rust, Java, HTML. Other languages (`css`, `markdown`, `json`, `yaml`, …) are mapped for metadata but indexed as a single plaintext chunk per file. + +--- + ## Changing the Embedding Model ```yaml