neo4j · ADunfield · Mar 18, 2026
@@ -144,6 +144,11 @@ async def run(self, text: str) -> TextChunks:
             chunks.append(TextChunk(text=chunk_text, index=index))
             index += 1
 
-            approximate_start = start + step
+            # Guarantee forward progress: advance based on end position minus
+            # overlap, but never move backward from the current approximate_start.
+            # This prevents infinite loops when _adjust_chunk_start walks start
+            # backward past the step boundary (e.g., long runs without whitespace).
+            next_start = end - self.chunk_overlap
+            approximate_start = max(next_start, approximate_start + step)
 
         return TextChunks(chunks=chunks)
@@ -184,6 +184,16 @@ def test_adjust_chunk_end(
             True,
             ["12345", "56789", "90"],
         ),
+        # Case: text with a single long word exceeding chunk_size (regression for #471)
+        # Previously caused infinite loop when _adjust_chunk_start walked backward
+        # past the step boundary, causing approximate_start to never advance.
+        (
+            "a" * 20,
+            5,
+            1,
+            True,
+            ["aaaaa", "aaaaa", "aaaaa", "aaaaa", "aaaa"],
+        ),
     ],
 )
 async def test_fixed_size_splitter_run(
@@ -212,3 +222,26 @@ async def test_fixed_size_splitter_run(
         assert text_chunks.chunks[i].text == expected_text
         assert isinstance(text_chunks.chunks[i], TextChunk)
         assert text_chunks.chunks[i].index == i
+
+
+@pytest.mark.asyncio
+@pytest.mark.timeout(5)
+async def test_no_infinite_loop_on_long_word_without_spaces() -> None:
+    """
+    Regression test for #471.
+
+    When a text contains a long run of characters with no whitespace and
+    approximate=True, _adjust_chunk_start could walk backward past the step
+    boundary, causing approximate_start to stall or decrease and the splitter
+    to loop forever.
+
+    The fix ensures approximate_start always advances by at least `step`
+    (chunk_size - chunk_overlap) characters per iteration.
+    """
+    long_text = "x" * 5000
+    splitter = FixedSizeSplitter(chunk_size=100, chunk_overlap=20, approximate=True)
+    result = await splitter.run(long_text)
+    assert len(result.chunks) > 0
+    # Every chunk should be non-empty
+    for chunk in result.chunks:
+        assert len(chunk.text) > 0