From 1d6db3d102cceaef11a1964af4930c727f3fc3ca Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Thu, 22 Jan 2026 23:13:55 +0000 Subject: [PATCH] Optimize find_last_node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **9057% speedup** (from 12.3ms to 134μs) by replacing a **quadratic O(N×M) algorithm with a linear O(N+M) algorithm**, where N is the number of nodes and M is the number of edges. **Key optimization:** The original code uses a nested loop structure: for each node, it iterates through *all* edges to check if that node appears as a source. This results in O(N×M) comparisons. The optimized version builds a **set of source IDs** from edges in a single pass (`sources = {e["source"] for e in edges}`), then performs O(1) membership checks (`n["id"] not in sources`) for each node. This reduces complexity to O(N+M). **Why this is faster:** - **Set lookup is O(1)** vs. linear scan through all edges - For the large test cases (500 nodes, 499 edges), the original performs ~250,000 comparisons while the optimized performs ~1,000 operations - Test results confirm this: `test_large_scale_chain_flow` shows **16,642% speedup** (4.43ms → 26.5μs) and `test_large_complete_graph_with_sink` shows **10,155% speedup** (1.62ms → 15.8μs) **Edge case handling:** The optimization includes safeguards: 1. **Single-use iterators**: Detects if `edges` is a single-use iterator (`iter(edges) is edges`) and falls back to original logic to preserve correctness 2. 
**Unhashable sources**: If any source value can't be hashed (e.g., lists, dicts), catches `TypeError` and falls back to the original nested approach **Performance impact:** - Small graphs (2-10 nodes): **30-96% faster** - modest gains due to set construction overhead - Medium graphs (100-300 nodes): **2,834-10,155% faster** - substantial wins as quadratic cost dominates - Large graphs (500+ nodes): **16,000%+ faster** - dramatic improvements where the original becomes prohibitively slow The optimization is particularly valuable when `find_last_node` is called repeatedly on non-trivial graphs, as the linear algorithm scales far better than the quadratic baseline. --- src/algorithms/graph.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/algorithms/graph.py b/src/algorithms/graph.py index 777ea3b..8efdfd2 100644 --- a/src/algorithms/graph.py +++ b/src/algorithms/graph.py @@ -47,7 +47,26 @@ def find_shortest_path(self, start: str, end: str) -> list[str]: def find_last_node(nodes, edges): """This function receives a flow and returns the last node.""" - return next((n for n in nodes if all(e["source"] != n["id"] for e in edges)), None) + # If edges is a single-use iterator (iter(edges) is edges), preserve the original behavior + _iter_edges = iter(edges) + if _iter_edges is edges: + return next( + (n for n in nodes if all(e["source"] != n["id"] for e in edges)), None + ) + + # Fast path for re-iterable edges: build a set of sources for O(len(edges) + len(nodes)) performance. + try: + sources = {e["source"] for e in edges} + except TypeError: + # Some sources may be unhashable; fall back to the original nested check (edges is re-iterable here). + return next( + (n for n in nodes if all(e["source"] != n["id"] for e in edges)), None + ) + + for n in nodes: + if n["id"] not in sources: + return n + return None def find_leaf_nodes(nodes: list[dict], edges: list[dict]) -> list[dict]: