Skip to content

Commit 1fae933

Browse files
sidmohan0 and claude committed
fix(text): implement proper structured output for multi-chunk processing
- Fix annotate_text_sync to return List[Span] when structured=True for chunked text
- Previously returned dict instead of structured spans for text > chunk_length
- Add proper span position adjustment across chunk boundaries
- Resolves benchmark test failure in test_structured_output_performance

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 8e8eac7 commit 1fae933

File tree

1 file changed

+24
-9
lines changed

1 file changed

+24
-9
lines changed

datafog/services/text_service.py

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -132,19 +132,34 @@ def annotate_text_sync(
132132
else:
133133
# Multi-chunk processing
134134
chunks = self._chunk_text(text)
135-
chunk_annotations = []
136-
137-
for chunk in chunks:
138-
chunk_result = self.annotate_text_sync(chunk, structured=False)
139-
chunk_annotations.append(chunk_result)
140135

141136
if structured:
142-
# For structured output with chunking, we need to recalculate positions
143-
# This is more complex, so for now return dict format
137+
# For structured output, we need to handle span positions across chunks
138+
all_spans = []
139+
current_offset = 0
140+
141+
for chunk in chunks:
142+
chunk_spans = self.annotate_text_sync(chunk, structured=True)
143+
# Adjust span positions to account for chunk offset
144+
for span in chunk_spans:
145+
adjusted_span = Span(
146+
start=span.start + current_offset,
147+
end=span.end + current_offset,
148+
text=span.text,
149+
label=span.label,
150+
)
151+
all_spans.append(adjusted_span)
152+
current_offset += len(chunk)
153+
154+
return all_spans
155+
else:
156+
# Dictionary format - combine annotations
157+
chunk_annotations = []
158+
for chunk in chunks:
159+
chunk_result = self.annotate_text_sync(chunk, structured=False)
160+
chunk_annotations.append(chunk_result)
144161
return self._combine_annotations(chunk_annotations)
145162

146-
return self._combine_annotations(chunk_annotations)
147-
148163
async def annotate_text_async(
149164
self, text: str, structured: bool = False
150165
) -> Union[Dict[str, List[str]], List[Span]]:

0 commit comments

Comments
 (0)