1212
1313
1414class Chunker :
15- """Chunks code entities into smaller units."""
15+ """Chunks code entities into smaller, searchable units."""
1616
1717 def __init__ (self , config : Optional [ChunkingConfig ] = None ) -> None :
1818 self .config = config or ChunkingConfig ()
1919 self .chunks : list [CodeChunk ] = []
2020
2121 def process_parse_result (self , result : ParseResult ) -> list [CodeChunk ]:
22- """Process a ParseResult and generate chunks."""
22+ """Convert a ParseResult into a list of CodeChunk objects.
23+
24+ Args:
25+ result: Parsed entities, relationships, and errors for a single file.
26+
27+ Returns:
28+ List of generated CodeChunk objects, in the order they were emitted.
29+ """
2330 self .chunks = [] # Single initialization at start of process
2431
2532 file_path = result .file_path
@@ -42,16 +49,19 @@ def process_parse_result(self, result: ParseResult) -> list[CodeChunk]:
4249
4350 # 2. Entity Chunks (Classes, Functions, Methods)
4451 for entity in result .entities :
45- if entity .kind == EntityKind .MODULE :
46- continue
4752 if entity .kind == EntityKind .MODULE :
4853 continue
4954 self ._chunk_entity (entity , last_modified )
5055
5156 return self .chunks
5257
5358 def _emit_module_chunks (self , file_path : str , source : str ) -> None :
54- """Extract module-level header and imports."""
59+ """Extract module-level header and imports into dedicated chunks.
60+
61+ Args:
62+ file_path: File path used to namespace chunk IDs.
63+ source: Full source code for the module.
64+ """
5565 # Module Header
5666 header = self ._extract_module_header (source )
5767 if header :
@@ -77,7 +87,7 @@ def _emit_module_chunks(self, file_path: str, source: str) -> None:
7787 self .chunks .append (import_chunk )
7888
7989 def _extract_module_header (self , source : str ) -> str :
80- """Extract first docstring and module definition ."""
90+ """Extract the leading module header and docstring block ."""
8191 lines = source .splitlines ()
8292 header_lines = []
8393 in_docstring = False
@@ -112,7 +122,7 @@ def _extract_module_header(self, source: str) -> str:
112122 return "\n " .join (header_lines ).strip ()
113123
114124 def _extract_imports (self , source : str ) -> str :
115- """Extract all import statements."""
125+ """Extract all import statements from the source ."""
116126 lines = []
117127 for line in source .splitlines ():
118128 stripped = line .strip ()
@@ -121,7 +131,12 @@ def _extract_imports(self, source: str) -> str:
121131 return "\n " .join (lines ).strip ()
122132
123133 def _chunk_entity (self , entity : Entity , last_modified : Optional [str ] = None ) -> None :
124- """Create chunks for an entity."""
134+ """Create chunks for an entity and append them to the in-memory list.
135+
136+ Args:
137+ entity: Entity to chunk (class, function, method, etc.).
138+ last_modified: Optional timestamp; when truthy it is stored in each chunk's metadata under "last_modified".
139+ """
125140 content = ""
126141
127142 if self .config .include_signatures and entity .signature :
@@ -136,8 +151,10 @@ def _chunk_entity(self, entity: Entity, last_modified: Optional[str] = None) ->
136151 content += entity .name
137152
138153 # Sliding window chunking
154+ has_docstring = "true" if entity .docstring else "false"
155+
139156 if len (content ) <= self .config .max_chunk_size :
140- metadata = {"kind" : entity .kind .value }
157+ metadata = {"kind" : entity .kind .value , "has_docstring" : has_docstring }
141158 if last_modified :
142159 metadata ["last_modified" ] = last_modified
143160
@@ -157,7 +174,11 @@ def _chunk_entity(self, entity: Entity, last_modified: Optional[str] = None) ->
157174 end = min (start + self .config .max_chunk_size , len (content ))
158175 chunk_content = content [start :end ]
159176
160- metadata = {"kind" : entity .kind .value , "chunk_index" : str (chunk_index )}
177+ metadata = {
178+ "kind" : entity .kind .value ,
179+ "chunk_index" : str (chunk_index ),
180+ "has_docstring" : has_docstring ,
181+ }
161182 if last_modified :
162183 metadata ["last_modified" ] = last_modified
163184
0 commit comments