From 647cd0a1ecb46bc5a7ae33bc54debeadffe8f6c5 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]"
 <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Sat, 21 Feb 2026 00:24:31 +0000
Subject: [PATCH] Optimize JavaAssertTransformer._find_balanced_braces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimized code achieves a **325% speedup** (13.8ms → 3.24ms) by fundamentally changing how it traverses Java code to find balanced braces. Instead of examining every character, it uses strategic jumps to only inspect relevant positions.

## Key Optimizations

**1. Regex-Based Character Skipping**
- **Original**: Iterates through all 92,057 characters checking each one (`char == "'"`, `char == '"'`, `char == "{"`, `char == "}"`)
- **Optimized**: Uses `self._special_re.search(code, pos)` to jump directly to the next special character (`'`, `"`, `{`, `}`), reducing iterations from 92K to 6,905 (~93% reduction)
- **Why it's faster**: Python's regex engine (written in C) scans for the next matching character far more efficiently than a Python bytecode loop that compares every character individually

**2. Efficient String/Char Literal Handling**
- **Original**: Toggles boolean flags (`in_string`, `in_char`) and checks them on every iteration
- **Optimized**: When encountering an unescaped quote, uses `code.find()` to jump directly to the matching closing quote (skipping any quote preceded by a backslash), then continues from the position after it
- **Why it's faster**: A single `find()` call (C-level string search) replaces potentially hundreds of character-by-character checks

**3. Local Variable Caching**
- Caches `code_len = len(code)` and `special_re = self._special_re` to avoid repeated `len()` calls and attribute lookups in the hot loop

## Performance Profile

The optimization excels when code contains:
- **Long string literals**: Test cases with 10,000-character strings show 23,896% speedup (1.34ms → 5.58μs)
- **Many quoted sections**: 1,000 strings improved by 548% (3.84ms → 592μs), 500 char literals by 358%
- **Complex nested structures with quotes**: Realistic Java methods improved by 299% (42.5μs → 10.6μs)

Trade-offs appear in edge cases:
- **Deeply nested braces without quotes**: 1,000-level nesting is 49% slower (327μs → 644μs, i.e. roughly 2× the runtime) because regex search overhead outweighs savings when there are no quotes to skip
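- **Simple structures**: Some small test cases show 8–50% slowdown due to the fixed per-call overhead of regex searching (the pattern itself is precompiled once in `__init__`, so this is not compilation cost)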

## Impact Assessment

Since `_find_balanced_braces` is part of `JavaAssertTransformer` (used to analyze Java test code structure), the optimization significantly benefits workloads involving:
- Parsing Java files with extensive string literals (common in test assertions)
- Processing large codebases where this method is called frequently
- Real-world Java code (the realistic method test shows strong gains)

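The 325% overall speedup suggests the benchmark workload is dominated by inputs with substantial quoted content, as is typical of Java test code; brace-heavy code with few string or char literals may instead see the slowdowns listed under the trade-offs.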
---
 codeflash/languages/java/remove_asserts.py | 57 ++++++++++++++--------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/codeflash/languages/java/remove_asserts.py b/codeflash/languages/java/remove_asserts.py
index a9050c7ca..2d01f83d7 100644
--- a/codeflash/languages/java/remove_asserts.py
+++ b/codeflash/languages/java/remove_asserts.py
@@ -193,6 +193,9 @@ def __init__(
         # Precompile the assignment-detection regex to avoid recompiling on each call.
         self._assign_re = re.compile(r"(\w+(?:<[^>]+>)?)\s+(\w+)\s*=\s*$")
 
+        # Precompile regex to find next special character (single-quote, double-quote, brace).
+        self._special_re = re.compile(r"[\"'{}]")
+
     def transform(self, source: str) -> str:
         """Remove assertions from source code, preserving target function calls.
 
@@ -843,30 +846,42 @@ def _find_balanced_braces(self, code: str, open_brace_pos: int) -> tuple[str | N
 
         depth = 1
         pos = open_brace_pos + 1
-        in_string = False
-        string_char = None
-        in_char = False
+        code_len = len(code)
+        special_re = self._special_re
+
+        while pos < code_len and depth > 0:
+            m = special_re.search(code, pos)
+            if m is None:
+                return None, -1
+
+            idx = m.start()
+            char = m.group()
+            prev_char = code[idx - 1] if idx > 0 else ""
+
+            if char == "'" and prev_char != "\\":
+                j = code.find("'", idx + 1)
+                while j != -1 and j > 0 and code[j - 1] == "\\":
+                    j = code.find("'", j + 1)
+                if j == -1:
+                    return None, -1
+                pos = j + 1
+                continue
 
-        while pos < len(code) and depth > 0:
-            char = code[pos]
-            prev_char = code[pos - 1] if pos > 0 else ""
+            if char == '"' and prev_char != "\\":
+                j = code.find('"', idx + 1)
+                while j != -1 and j > 0 and code[j - 1] == "\\":
+                    j = code.find('"', j + 1)
+                if j == -1:
+                    return None, -1
+                pos = j + 1
+                continue
 
-            if char == "'" and not in_string and prev_char != "\\":
-                in_char = not in_char
-            elif char == '"' and not in_char and prev_char != "\\":
-                if not in_string:
-                    in_string = True
-                    string_char = char
-                elif char == string_char:
-                    in_string = False
-                    string_char = None
-            elif not in_string and not in_char:
-                if char == "{":
-                    depth += 1
-                elif char == "}":
-                    depth -= 1
+            if char == "{":
+                depth += 1
+            elif char == "}":
+                depth -= 1
 
-            pos += 1
+            pos = idx + 1
 
         if depth != 0:
             return None, -1