Skip to content

Commit 30c0fb1

Browse files
committed
feat: Enhance chunking metadata and add comprehensive docstrings to core components.
1 parent fb54235 commit 30c0fb1

18 files changed

Lines changed: 330 additions & 46 deletions

src/knowcode/background_indexer.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,11 @@ class BackgroundIndexer:
1212
"""Runs indexing in background thread."""
1313

1414
def __init__(self, indexer: Indexer) -> None:
15+
"""Initialize the background worker with an Indexer instance.
16+
17+
Args:
18+
indexer: Indexer used to process queued files.
19+
"""
1520
self.indexer = indexer
1621
self._queue: queue.Queue = queue.Queue()
1722
self._thread: Optional[threading.Thread] = None
@@ -31,16 +36,21 @@ def stop(self) -> None:
3136
self._thread.join(timeout=5.0)
3237

3338
def queue_file(self, path: Path) -> None:
34-
"""Queue a file for indexing."""
39+
"""Queue a file for indexing.
40+
41+
Args:
42+
path: File path to enqueue for processing.
43+
"""
3544
self._queue.put(path)
3645

3746
def _worker(self) -> None:
38-
"""Worker thread that processes indexing queue."""
47+
"""Worker thread that processes the indexing queue."""
3948
while self._running:
4049
try:
4150
# Use timeout to allow checking self._running
4251
path = self._queue.get(timeout=1.0)
4352
if path is None:
53+
self._queue.task_done()
4454
break
4555
self.indexer.index_file(path)
4656
self._queue.task_done()

src/knowcode/chunk_repository.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,24 +39,29 @@ class InMemoryChunkRepository(ChunkRepository):
3939
"""In-memory implementation of ChunkRepository."""
4040

4141
def __init__(self) -> None:
42+
"""Initialize the in-memory storage structures."""
4243
self._chunks: dict[str, CodeChunk] = {}
4344
self._by_entity: dict[str, list[str]] = {} # entity_id -> chunk_ids
4445

4546
def add(self, chunk: CodeChunk) -> None:
47+
"""Add a chunk to the in-memory index."""
4648
self._chunks[chunk.id] = chunk
4749
if chunk.entity_id not in self._by_entity:
4850
self._by_entity[chunk.entity_id] = []
4951
if chunk.id not in self._by_entity[chunk.entity_id]:
5052
self._by_entity[chunk.entity_id].append(chunk.id)
5153

5254
def get(self, chunk_id: str) -> Optional[CodeChunk]:
55+
"""Fetch a chunk by its ID."""
5356
return self._chunks.get(chunk_id)
5457

5558
def get_by_entity(self, entity_id: str) -> list[CodeChunk]:
59+
"""Return all chunks associated with an entity."""
5660
chunk_ids = self._by_entity.get(entity_id, [])
5761
return [self._chunks[cid] for cid in chunk_ids if cid in self._chunks]
5862

5963
def search_by_tokens(self, tokens: list[str], limit: int = 10) -> list[CodeChunk]:
64+
"""Perform a simple token-overlap search over stored chunks."""
6065
# Simple token overlap scoring
6166
scores: list[tuple[float, CodeChunk]] = []
6267
query_set = set(tokens)

src/knowcode/chunker.py

Lines changed: 31 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -12,14 +12,21 @@
1212

1313

1414
class Chunker:
15-
"""Chunks code entities into smaller units."""
15+
"""Chunks code entities into smaller, searchable units."""
1616

1717
def __init__(self, config: Optional[ChunkingConfig] = None) -> None:
1818
self.config = config or ChunkingConfig()
1919
self.chunks: list[CodeChunk] = []
2020

2121
def process_parse_result(self, result: ParseResult) -> list[CodeChunk]:
22-
"""Process a ParseResult and generate chunks."""
22+
"""Convert a ParseResult into a list of CodeChunk objects.
23+
24+
Args:
25+
result: Parsed entities, relationships, and errors for a single file.
26+
27+
Returns:
28+
List of generated CodeChunk objects in priority order.
29+
"""
2330
self.chunks = [] # Single initialization at start of process
2431

2532
file_path = result.file_path
@@ -42,16 +49,19 @@ def process_parse_result(self, result: ParseResult) -> list[CodeChunk]:
4249

4350
# 2. Entity Chunks (Classes, Functions, Methods)
4451
for entity in result.entities:
45-
if entity.kind == EntityKind.MODULE:
46-
continue
4752
if entity.kind == EntityKind.MODULE:
4853
continue
4954
self._chunk_entity(entity, last_modified)
5055

5156
return self.chunks
5257

5358
def _emit_module_chunks(self, file_path: str, source: str) -> None:
54-
"""Extract module-level header and imports."""
59+
"""Extract module-level header and imports into dedicated chunks.
60+
61+
Args:
62+
file_path: File path used to namespace chunk IDs.
63+
source: Full source code for the module.
64+
"""
5565
# Module Header
5666
header = self._extract_module_header(source)
5767
if header:
@@ -77,7 +87,7 @@ def _emit_module_chunks(self, file_path: str, source: str) -> None:
7787
self.chunks.append(import_chunk)
7888

7989
def _extract_module_header(self, source: str) -> str:
80-
"""Extract first docstring and module definition."""
90+
"""Extract the leading module header and docstring block."""
8191
lines = source.splitlines()
8292
header_lines = []
8393
in_docstring = False
@@ -112,7 +122,7 @@ def _extract_module_header(self, source: str) -> str:
112122
return "\n".join(header_lines).strip()
113123

114124
def _extract_imports(self, source: str) -> str:
115-
"""Extract all import statements."""
125+
"""Extract all import statements from the source."""
116126
lines = []
117127
for line in source.splitlines():
118128
stripped = line.strip()
@@ -121,7 +131,12 @@ def _extract_imports(self, source: str) -> str:
121131
return "\n".join(lines).strip()
122132

123133
def _chunk_entity(self, entity: Entity, last_modified: Optional[str] = None) -> None:
124-
"""Create chunks for an entity."""
134+
"""Create chunks for an entity and append them to the in-memory list.
135+
136+
Args:
137+
entity: Entity to chunk (class, function, method, etc.).
138+
last_modified: Optional timestamp used for ranking signals.
139+
"""
125140
content = ""
126141

127142
if self.config.include_signatures and entity.signature:
@@ -136,8 +151,10 @@ def _chunk_entity(self, entity: Entity, last_modified: Optional[str] = None) ->
136151
content += entity.name
137152

138153
# Sliding window chunking
154+
has_docstring = "true" if entity.docstring else "false"
155+
139156
if len(content) <= self.config.max_chunk_size:
140-
metadata = {"kind": entity.kind.value}
157+
metadata = {"kind": entity.kind.value, "has_docstring": has_docstring}
141158
if last_modified:
142159
metadata["last_modified"] = last_modified
143160

@@ -157,7 +174,11 @@ def _chunk_entity(self, entity: Entity, last_modified: Optional[str] = None) ->
157174
end = min(start + self.config.max_chunk_size, len(content))
158175
chunk_content = content[start:end]
159176

160-
metadata = {"kind": entity.kind.value, "chunk_index": str(chunk_index)}
177+
metadata = {
178+
"kind": entity.kind.value,
179+
"chunk_index": str(chunk_index),
180+
"has_docstring": has_docstring,
181+
}
161182
if last_modified:
162183
metadata["last_modified"] = last_modified
163184

src/knowcode/cli.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
import click
99

1010
from knowcode import __version__
11-
from knowcode.models import EntityKind
11+
from knowcode.models import EntityKind, RelationshipKind
1212
from knowcode.service import KnowCodeService
1313
from knowcode.knowledge_store import KnowledgeStore
1414

@@ -431,7 +431,7 @@ def history(target: Optional[str], store: str, limit: int) -> None:
431431
author_rels = knowledge.get_incoming_relationships(commit.id)
432432
author = "Unknown"
433433
for rel in author_rels:
434-
if rel.kind == "authored":
434+
if rel.kind == RelationshipKind.AUTHORED:
435435
# rel.source_id is author
436436
a_ent = knowledge.get_entity(rel.source_id)
437437
if a_ent:
@@ -460,7 +460,7 @@ def history(target: Optional[str], store: str, limit: int) -> None:
460460
rels = knowledge.get_outgoing_relationships(entity.id)
461461
changes = []
462462
for rel in rels:
463-
if rel.kind == "changed_by":
463+
if rel.kind == RelationshipKind.CHANGED_BY:
464464
commit = knowledge.get_entity(rel.target_id)
465465
if commit:
466466
# Get modification stats from edge metadata

src/knowcode/completeness.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,18 @@ def expand_dependencies(
1515
knowledge_store: KnowledgeStore,
1616
max_depth: int = 1
1717
) -> list[CodeChunk]:
18-
"""Expand chunk to include dependency context.
19-
18+
"""Expand a chunk to include dependency context.
2019
Uses knowledge graph to find related entities,
2120
then retrieves their chunks.
21+
22+
Args:
23+
chunk: Starting chunk whose dependencies should be expanded.
24+
chunk_repo: Repository used to fetch chunks by entity.
25+
knowledge_store: Graph store used to resolve dependencies.
26+
max_depth: Depth of dependency expansion (1 = direct callees only).
27+
28+
Returns:
29+
List of chunks including the input chunk and its dependencies.
2230
"""
2331
expanded: list[CodeChunk] = [chunk]
2432
visited: set[str] = {chunk.entity_id}

src/knowcode/embedding.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ class EmbeddingProvider(ABC):
1212
"""Abstract interface for generating embeddings."""
1313

1414
def __init__(self, config: EmbeddingConfig) -> None:
15+
"""Initialize the provider with the embedding configuration."""
1516
self.config = config
1617

1718
@abstractmethod
@@ -29,6 +30,11 @@ class OpenAIEmbeddingProvider(EmbeddingProvider):
2930
"""OpenAI embedding provider."""
3031

3132
def __init__(self, config: EmbeddingConfig) -> None:
33+
"""Create an OpenAI-backed embedding provider.
34+
35+
Args:
36+
config: Embedding configuration settings.
37+
"""
3238
super().__init__(config)
3339
api_key = os.environ.get("OPENAI_API_KEY")
3440
if not api_key:
@@ -38,6 +44,7 @@ def __init__(self, config: EmbeddingConfig) -> None:
3844
self.client = OpenAI(api_key=api_key)
3945

4046
def _get_client(self) -> OpenAI:
47+
"""Return an initialized OpenAI client, loading credentials if needed."""
4148
if not self.client:
4249
api_key = os.environ.get("OPENAI_API_KEY")
4350
if not api_key:
@@ -46,6 +53,14 @@ def _get_client(self) -> OpenAI:
4653
return self.client
4754

4855
def embed(self, texts: list[str]) -> list[list[float]]:
56+
"""Generate embeddings for a batch of texts.
57+
58+
Args:
59+
texts: Input texts to embed.
60+
61+
Returns:
62+
List of embedding vectors (one per input).
63+
"""
4964
if not texts:
5065
return []
5166

@@ -62,9 +77,11 @@ def embed(self, texts: list[str]) -> list[list[float]]:
6277
return embeddings
6378

6479
def embed_single(self, text: str) -> list[float]:
80+
"""Generate an embedding for a single text input."""
6581
return self.embed([text])[0]
6682

6783
def _normalize(self, vec: list[float]) -> list[float]:
84+
"""Normalize a vector to unit length for cosine similarity."""
6885
import math
6986
norm = math.sqrt(sum(x*x for x in vec))
7087
return [x / norm for x in vec] if norm > 0 else vec

src/knowcode/hybrid_index.py

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,13 @@ def __init__(
1717
vector_store: VectorStore,
1818
alpha: float = 0.5 # Weight for dense vs sparse (0.5 = equal weight)
1919
) -> None:
20+
"""Initialize the hybrid index.
21+
22+
Args:
23+
chunk_repo: Repository providing BM25-style token search.
24+
vector_store: Dense vector store for semantic similarity.
25+
alpha: Blend weight for dense vs sparse results.
26+
"""
2027
self.chunk_repo = chunk_repo
2128
self.vector_store = vector_store
2229
self.alpha = alpha
@@ -27,9 +34,17 @@ def search(
2734
query_embedding: list[float],
2835
limit: int = 10
2936
) -> list[tuple[CodeChunk, float]]:
30-
"""Search using hybrid retrieval.
31-
37+
"""Search using hybrid retrieval.
38+
Combines BM25 sparse retrieval with dense vector search.
3239
Returns chunks with combined scores using Reciprocal Rank Fusion (RRF).
40+
41+
Args:
42+
query: Raw query string for sparse matching.
43+
query_embedding: Dense embedding of the query.
44+
limit: Maximum number of chunks to return.
45+
46+
Returns:
47+
List of (chunk, score) tuples ranked by reciprocal rank fusion.
3348
"""
3449
# 1. BM25 Search
3550
query_tokens = tokenize_code(query)

src/knowcode/indexer.py

Lines changed: 33 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,27 @@ def __init__(
2323
chunk_repo: Optional[InMemoryChunkRepository] = None,
2424
vector_store: Optional[VectorStore] = None,
2525
) -> None:
26+
"""Initialize an indexer with optional storage backends.
27+
28+
Args:
29+
embedding_provider: Provider used to generate chunk embeddings.
30+
chunk_repo: Optional chunk repository (defaults to in-memory).
31+
vector_store: Optional vector store (defaults to FAISS-backed store).
32+
"""
2633
self.embedding_provider = embedding_provider
2734
self.chunk_repo = chunk_repo or InMemoryChunkRepository()
2835
self.vector_store = vector_store or VectorStore(dimension=embedding_provider.config.dimension)
2936
self.chunker = Chunker()
3037

3138
def index_directory(self, root_dir: str | Path) -> int:
32-
"""Index all files in a directory."""
39+
"""Index all supported files under a directory.
40+
41+
Args:
42+
root_dir: Root directory to scan for supported files.
43+
44+
Returns:
45+
Total number of chunks added to the index.
46+
"""
3347
root_path = Path(root_dir)
3448

3549
# Use existing GraphBuilder to get semantic entities
@@ -67,7 +81,11 @@ def index_directory(self, root_dir: str | Path) -> int:
6781
return total_chunks
6882

6983
def save(self, path: str | Path) -> None:
70-
"""Save the entire index to disk."""
84+
"""Persist vector index and chunk metadata to disk.
85+
86+
Args:
87+
path: Directory path to write index files into.
88+
"""
7189
path = Path(path)
7290
path.mkdir(parents=True, exist_ok=True)
7391

@@ -92,7 +110,11 @@ def save(self, path: str | Path) -> None:
92110
json.dump(metadata, f)
93111

94112
def load(self, path: str | Path) -> None:
95-
"""Load the entire index from disk."""
113+
"""Load the entire vector index and chunk metadata from disk.
114+
115+
Args:
116+
path: Directory path containing previously saved index files.
117+
"""
96118
path = Path(path)
97119

98120
# Load vector store
@@ -111,7 +133,14 @@ def load(self, path: str | Path) -> None:
111133
self.chunk_repo.add(chunk)
112134

113135
def index_file(self, file_path: str | Path) -> int:
114-
"""Index a single file (incremental)."""
136+
"""Index a single file for incremental updates.
137+
138+
Args:
139+
file_path: File path to process.
140+
141+
Returns:
142+
Number of chunks created for the file.
143+
"""
115144
file_path = Path(file_path)
116145
# Simplified for Task 3.6
117146
builder = GraphBuilder()

0 commit comments

Comments
 (0)