From 647cd0a1ecb46bc5a7ae33bc54debeadffe8f6c5 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Sat, 21 Feb 2026 00:24:31 +0000 Subject: [PATCH] Optimize JavaAssertTransformer._find_balanced_braces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **325% speedup** (13.8ms → 3.24ms) by fundamentally changing how it traverses Java code to find balanced braces. Instead of examining every character in a Python-level loop, it jumps directly between the only positions that matter: quotes and braces. ## Key Optimizations **1. Regex-Based Character Skipping** - **Original**: Iterates through all 92,057 characters checking each one (`char == "'"`, `char == '"'`, `char == "{"`, `char == "}"`) - **Optimized**: Uses `self._special_re.search(code, pos)` to jump directly to the next special character (`'`, `"`, `{`, `}`), reducing iterations from 92K to 6,905 (~93% reduction) - **Why it's faster**: Python's regex engine (written in C) scans for the next matching character far more efficiently than a Python bytecode loop with repeated character comparisons **2. Efficient String/Char Literal Handling** - **Original**: Toggles boolean flags (`in_string`, `in_char`) and checks them on every iteration - **Optimized**: When encountering a quote, uses `code.find()` to jump directly to the matching closing quote (repeating the search past any quote preceded by a backslash), then continues from the position just after it - **Why it's faster**: Typically a single `find()` call (C-level string search) replaces potentially hundreds of character-by-character checks **3. 
Local Variable Caching** - Caches `code_len = len(code)` and `special_re = self._special_re` to avoid repeated attribute lookups in the hot loop ## Performance Profile The optimization excels when code contains: - **Long string literals**: Test cases with 10,000-character strings show a 23,896% speedup (1.34ms → 5.58μs) - **Many quoted sections**: 1,000 strings improved by 548% (3.84ms → 592μs), 500 char literals by 358% - **Complex nested structures with quotes**: Realistic Java methods improved by 299% (42.5μs → 10.6μs) Trade-offs appear in edge cases: - **Deeply nested braces without quotes**: 1,000-level nesting is 49% slower (327μs → 644μs) because regex search overhead outweighs savings when there are no quotes to skip - **Simple structures**: Some small test cases show an 8-50% slowdown due to the fixed per-call overhead of the regex search (the pattern itself is compiled once in `__init__`) ## Impact Assessment Since `_find_balanced_braces` is part of `JavaAssertTransformer` (used to analyze Java test code structure), the optimization significantly benefits workloads involving: - Parsing Java files with extensive string literals (common in test assertions) - Processing large codebases where this method is called frequently - Real-world Java code (the realistic method test shows strong gains) The 325% overall speedup suggests that the benchmark workload is dominated by the quote-heavy patterns that are common in Java test code. --- codeflash/languages/java/remove_asserts.py | 57 ++++++++++++++-------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/codeflash/languages/java/remove_asserts.py b/codeflash/languages/java/remove_asserts.py index a9050c7ca..2d01f83d7 100644 --- a/codeflash/languages/java/remove_asserts.py +++ b/codeflash/languages/java/remove_asserts.py @@ -193,6 +193,9 @@ def __init__( # Precompile the assignment-detection regex to avoid recompiling on each call. 
self._assign_re = re.compile(r"(\w+(?:<[^>]+>)?)\s+(\w+)\s*=\s*$") + # Precompile regex to find next special character (single-quote, double-quote, brace). + self._special_re = re.compile(r"[\"'{}]") + def transform(self, source: str) -> str: """Remove assertions from source code, preserving target function calls. @@ -843,30 +846,42 @@ def _find_balanced_braces(self, code: str, open_brace_pos: int) -> tuple[str | N depth = 1 pos = open_brace_pos + 1 - in_string = False - string_char = None - in_char = False + code_len = len(code) + special_re = self._special_re + + while pos < code_len and depth > 0: + m = special_re.search(code, pos) + if m is None: + return None, -1 + + idx = m.start() + char = m.group() + prev_char = code[idx - 1] if idx > 0 else "" + + if char == "'" and prev_char != "\\": + j = code.find("'", idx + 1) + while j != -1 and j > 0 and code[j - 1] == "\\": + j = code.find("'", j + 1) + if j == -1: + return None, -1 + pos = j + 1 + continue - while pos < len(code) and depth > 0: - char = code[pos] - prev_char = code[pos - 1] if pos > 0 else "" + if char == '"' and prev_char != "\\": + j = code.find('"', idx + 1) + while j != -1 and j > 0 and code[j - 1] == "\\": + j = code.find('"', j + 1) + if j == -1: + return None, -1 + pos = j + 1 + continue - if char == "'" and not in_string and prev_char != "\\": - in_char = not in_char - elif char == '"' and not in_char and prev_char != "\\": - if not in_string: - in_string = True - string_char = char - elif char == string_char: - in_string = False - string_char = None - elif not in_string and not in_char: - if char == "{": - depth += 1 - elif char == "}": - depth -= 1 + if char == "{": + depth += 1 + elif char == "}": + depth -= 1 - pos += 1 + pos = idx + 1 if depth != 0: return None, -1