Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ async def run(self, text: str) -> TextChunks:
chunks.append(TextChunk(text=chunk_text, index=index))
index += 1

approximate_start = start + step
# Guarantee forward progress: advance based on end position minus
# overlap, but never move backward from the current approximate_start.
# This prevents infinite loops when _adjust_chunk_start walks start
# backward past the step boundary (e.g., long runs without whitespace).
next_start = end - self.chunk_overlap
approximate_start = max(next_start, approximate_start + step)

return TextChunks(chunks=chunks)
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,16 @@ def test_adjust_chunk_end(
True,
["12345", "56789", "90"],
),
# Case: text with a single long word exceeding chunk_size (regression for #471)
# Previously caused infinite loop when _adjust_chunk_start walked backward
# past the step boundary, causing approximate_start to never advance.
(
"a" * 20,
5,
1,
True,
["aaaaa", "aaaaa", "aaaaa", "aaaaa", "aaaa"],
),
],
)
async def test_fixed_size_splitter_run(
Expand Down Expand Up @@ -212,3 +222,26 @@ async def test_fixed_size_splitter_run(
assert text_chunks.chunks[i].text == expected_text
assert isinstance(text_chunks.chunks[i], TextChunk)
assert text_chunks.chunks[i].index == i


@pytest.mark.asyncio
@pytest.mark.timeout(5)
async def test_no_infinite_loop_on_long_word_without_spaces() -> None:
"""
Regression test for #471.

When a text contains a long run of characters with no whitespace and
approximate=True, _adjust_chunk_start could walk backward past the step
boundary, causing approximate_start to stall or decrease and the splitter
to loop forever.

The fix ensures approximate_start always advances by at least `step`
(chunk_size - chunk_overlap) characters per iteration.
"""
long_text = "x" * 5000
splitter = FixedSizeSplitter(chunk_size=100, chunk_overlap=20, approximate=True)
result = await splitter.run(long_text)
assert len(result.chunks) > 0
# Every chunk should be non-empty
for chunk in result.chunks:
assert len(chunk.text) > 0
Loading