From a922ff4e5bb55ecc4873030f8ebe1df03c4eb8b4 Mon Sep 17 00:00:00 2001 From: Abi Date: Wed, 20 May 2026 11:07:11 -0700 Subject: [PATCH 1/2] refactor: record passage_id_scheme in meta.json (default "sequential") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sub-PR 1 of 5 from the plan in #329. Purely additive — no behavior change for any caller; existing index loaders ignore the field. Writes a new `passage_id_scheme: "sequential"` field into the .meta.json produced by both build_index and build_index_from_arrays. Bumps version to "1.1" for human-inspectable schema tracking (no code reads version today, so the bump is safe). Module-level constants PASSAGE_ID_SCHEME_SEQUENTIAL / _CONTENT_HASH document the value space; the content-hash scheme itself ships in sub-PR 2. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/leann-core/src/leann/api.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 26f23f29..48c64be0 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -30,6 +30,15 @@ logger = logging.getLogger(__name__) +# Passage ID schemes recorded in .meta.json["passage_id_scheme"]. +# - "sequential": today's default; IDs are str(insertion_index) (api.py:add_text). +# - "content-hash": planned in #329; IDs are sha256(text)[:16], stable across +# file moves and reorderings. +# Older indexes have no passage_id_scheme field — readers must default to +# "sequential" when the key is absent. See #329 for the rollout plan. 
+PASSAGE_ID_SCHEME_SEQUENTIAL = "sequential" +PASSAGE_ID_SCHEME_CONTENT_HASH = "content-hash" + def get_registered_backends() -> list[str]: """Get list of registered backend names.""" @@ -550,12 +559,13 @@ def build_index(self, index_path: str): builder_instance.build(embeddings, string_ids, index_path, **current_backend_kwargs) leann_meta_path = index_dir / f"{index_name}.meta.json" meta_data = { - "version": "1.0", + "version": "1.1", "backend_name": self.backend_name, "embedding_model": self.embedding_model, "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, + "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, "passage_sources": [ { "type": "jsonl", @@ -675,12 +685,13 @@ def build_index_from_arrays(self, index_path: str, ids: list, embeddings: np.nda # Create metadata file leann_meta_path = index_dir / f"{index_name}.meta.json" meta_data = { - "version": "1.0", + "version": "1.1", "backend_name": self.backend_name, "embedding_model": self.embedding_model, "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, + "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, "passage_sources": [ { "type": "jsonl", From e66b74df15f01a2f9cf65867e9baf3002f122742 Mon Sep 17 00:00:00 2001 From: Abi Date: Wed, 20 May 2026 11:17:29 -0700 Subject: [PATCH 2/2] feat: content-hash passage IDs via --id-scheme content-hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sub-PR 2 of 5 from #329. Builds on #330 (which added the meta.json field). New behavior: - `LeannBuilder(..., passage_id_scheme="content-hash")` makes add_text() key passages by sha256(text)[:16] instead of insertion index. Stable across file moves, reorderings, and re-runs of the same corpus. - `leann build --id-scheme content-hash` exposes it at the CLI. - Default unchanged ("sequential"). Existing indexes continue to work identically; no migration triggered. 
Identical-text chunks collide (same hash). For this sub-PR the second occurrence overwrites the first in the offset map — that's the dedup behavior I'd want by default. A `--preserve-duplicates` escape hatch can land later if needed (see the open question in #329). Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/leann-core/src/leann/api.py | 30 +++++++++++++++++++++++++--- packages/leann-core/src/leann/cli.py | 12 +++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 48c64be0..069fa416 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -370,8 +370,19 @@ def __init__( dimensions: Optional[int] = None, embedding_mode: str = "sentence-transformers", embedding_options: Optional[dict[str, Any]] = None, + passage_id_scheme: str = PASSAGE_ID_SCHEME_SEQUENTIAL, **backend_kwargs, ): + if passage_id_scheme not in ( + PASSAGE_ID_SCHEME_SEQUENTIAL, + PASSAGE_ID_SCHEME_CONTENT_HASH, + ): + raise ValueError( + f"Unknown passage_id_scheme: {passage_id_scheme!r}. " + f"Expected one of: {PASSAGE_ID_SCHEME_SEQUENTIAL!r}, " + f"{PASSAGE_ID_SCHEME_CONTENT_HASH!r}." + ) + self.passage_id_scheme = passage_id_scheme self.backend_name = backend_name # Normalize incompatible combinations early (for consistent metadata) if backend_name == "hnsw": @@ -466,10 +477,23 @@ def __init__( self.backend_kwargs = backend_kwargs self.chunks: list[dict[str, Any]] = [] + def _generate_passage_id(self, text: str) -> str: + """Generate a passage ID per the configured scheme. + + sequential: str(insertion index) — fast, position-dependent, current default. + content-hash: sha256(text)[:16] — content-stable, dedup-friendly across + file moves and reorderings. See #329 for the design. 
+ """ + if self.passage_id_scheme == PASSAGE_ID_SCHEME_CONTENT_HASH: + import hashlib + + return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] + return str(len(self.chunks)) + def add_text(self, text: str, metadata: Optional[dict[str, Any]] = None): if metadata is None: metadata = {} - passage_id = metadata.get("id", str(len(self.chunks))) + passage_id = metadata.get("id") or self._generate_passage_id(text) chunk_data = {"id": passage_id, "text": text, "metadata": metadata} self.chunks.append(chunk_data) @@ -565,7 +589,7 @@ def build_index(self, index_path: str): "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, - "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, + "passage_id_scheme": self.passage_id_scheme, "passage_sources": [ { "type": "jsonl", @@ -691,7 +715,7 @@ def build_index_from_arrays(self, index_path: str, ids: list, embeddings: np.nda "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, - "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, + "passage_id_scheme": self.passage_id_scheme, "passage_sources": [ { "type": "jsonl", diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py index 15a8a187..1a6e05ef 100644 --- a/packages/leann-core/src/leann/cli.py +++ b/packages/leann-core/src/leann/cli.py @@ -341,6 +341,16 @@ def create_parser(self) -> argparse.ArgumentParser: default=True, help="Fall back to traditional chunking if AST chunking fails (default: True)", ) + build_parser.add_argument( + "--id-scheme", + choices=["sequential", "content-hash"], + default="sequential", + help=( + "How passage IDs are assigned. 'sequential' (default) keys by insertion " + "order; 'content-hash' uses sha256(text)[:16], stable across file moves " + "and reorderings. See #329." 
+ ), + ) # Watch command watch_parser = subparsers.add_parser( @@ -1885,6 +1895,7 @@ def _make_incremental_builder(self, args) -> "LeannBuilder": is_compact=args.compact, is_recompute=args.recompute, num_threads=args.num_threads, + passage_id_scheme=getattr(args, "id_scheme", "sequential"), ) def _incremental_add_only( @@ -2378,6 +2389,7 @@ async def build_index(self, args): is_compact=args.compact, is_recompute=args.recompute, num_threads=args.num_threads, + passage_id_scheme=getattr(args, "id_scheme", "sequential"), ) for chunk in all_texts: