From b01ebb750d2cfdfbe18b22c5b512d833a6442067 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Fri, 20 Feb 2026 07:41:10 +0000 Subject: [PATCH 1/2] Optimize _byte_to_line_index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **3872% speedup** (from 29.2ms to 736μs) by replacing a manual reverse linear search with Python's built-in `bisect_right` function from the bisect module. **What changed:** - **Original approach**: Iterated backwards through `line_byte_starts` using a Python for-loop, comparing `byte_offset` against each element until finding the first match - **Optimized approach**: Uses `bisect_right(line_byte_starts, byte_offset) - 1` to perform a binary search in O(log n) time instead of O(n) **Why this is faster:** 1. **Algorithm complexity**: Binary search (O(log n)) vs linear search (O(n)). For 1000 lines, this means ~10 comparisons instead of up to 1000 2. **C-level implementation**: `bisect_right` is implemented in C and highly optimized, eliminating Python interpreter overhead for the search loop 3. **Reduced memory access**: The line profiler shows the original code spent 57.5% of time on array indexing (`line_byte_starts[i]`) across many iterations. 
The optimized version performs far fewer array accesses. **Performance characteristics from tests:** - **Small lists** (2-4 elements): ~50-130% faster (modest gains due to setup overhead) - **Medium lists** (100-300 elements): ~200-500% faster (binary search advantage becomes clear) - **Large lists** (1000 elements): ~3000-6400% faster (dramatic improvement as the gap between O(log n) and O(n) widens) - The test `test_large_scale_sequential_mapping` with 1000 lines shows a **4495% speedup** (13.5ms → 293μs), confirming the optimization's effectiveness at scale. **Edge cases preserved:** - Empty lists correctly return 0 - Negative offsets work correctly - Offsets before the first element return 0 - The conditional `if idx >= 0 else 0` handles the edge case where `bisect_right` returns 0 (offset before all elements), which would otherwise make `idx` equal to -1. This optimization is particularly valuable when `_byte_to_line_index` is called repeatedly with large `line_byte_starts` lists, as is typical in code instrumentation scenarios where files have hundreds or thousands of lines. 
--- codeflash/languages/java/instrumentation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/codeflash/languages/java/instrumentation.py b/codeflash/languages/java/instrumentation.py index 18fdb1409..25646de2a 100644 --- a/codeflash/languages/java/instrumentation.py +++ b/codeflash/languages/java/instrumentation.py @@ -17,6 +17,7 @@ import logging import re from typing import TYPE_CHECKING +from bisect import bisect_right if TYPE_CHECKING: from collections.abc import Sequence @@ -230,10 +231,8 @@ def _collect_calls(node, wrapper_bytes, body_bytes, prefix_len, func_name, analy def _byte_to_line_index(byte_offset: int, line_byte_starts: list[int]) -> int: """Map a byte offset in body_text to a body_lines index.""" - for i in range(len(line_byte_starts) - 1, -1, -1): - if byte_offset >= line_byte_starts[i]: - return i - return 0 + idx = bisect_right(line_byte_starts, byte_offset) - 1 + return idx if idx >= 0 else 0 def _infer_array_cast_type(line: str) -> str | None: From 0fb931ae61bdbe44daf0d44ba200a76992c18f6f Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 20 Feb 2026 07:42:53 +0000 Subject: [PATCH 2/2] style: auto-fix linting issues --- codeflash/languages/java/instrumentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codeflash/languages/java/instrumentation.py b/codeflash/languages/java/instrumentation.py index 25646de2a..0bb2515a3 100644 --- a/codeflash/languages/java/instrumentation.py +++ b/codeflash/languages/java/instrumentation.py @@ -16,8 +16,8 @@ import logging import re -from typing import TYPE_CHECKING from bisect import bisect_right +from typing import TYPE_CHECKING if TYPE_CHECKING: from collections.abc import Sequence @@ -232,7 +232,7 @@ def _collect_calls(node, wrapper_bytes, body_bytes, prefix_len, func_name, analy def _byte_to_line_index(byte_offset: int, line_byte_starts: list[int]) -> int: """Map a byte offset in body_text to a 
body_lines index.""" idx = bisect_right(line_byte_starts, byte_offset) - 1 - return idx if idx >= 0 else 0 + return max(idx, 0) def _infer_array_cast_type(line: str) -> str | None: