Skip to content

⚡️ Speed up function find_leaf_nodes by 37,712% #301

Open
codeflash-ai[bot] wants to merge 1 commit into next-gen from
codeflash/optimize-find_leaf_nodes-mmub3pdy
Open

⚡️ Speed up function find_leaf_nodes by 37,712% #301
codeflash-ai[bot] wants to merge 1 commit into next-gen from
codeflash/optimize-find_leaf_nodes-mmub3pdy

Conversation

@codeflash-ai
Copy link

@codeflash-ai codeflash-ai bot commented Mar 17, 2026

📄 37,712% (377.12x) speedup for find_leaf_nodes in src/algorithms/graph.py

⏱️ Runtime : 59.4 milliseconds → 157 microseconds (best of 250 runs)

📝 Explanation and details

The optimized function cuts runtime from 59.4 ms to 0.157 ms (~378× faster) by building a set of edge["source"] values and using a single list comprehension to filter nodes instead of the original nested node×edge loop. The key insight is that membership checks against a precomputed set are O(1), turning the algorithm from O(n·m) into O(n+m); the line profiler shows the inner edge loop accounted for ~99% of the original time, so eliminating it yields the large win. It also short-circuits the no-edges case by returning a shallow copy for minimal overhead and wraps set construction in a try/except to fall back to the original nested loop if sources are unhashable or the 'source' key is missing, preserving the original exception behavior. Trade-offs: set construction adds a small upfront cost and a couple of microbenchmarks (empty-nodes and the missing-key fallback) show marginal regressions, but these are rare compared with the huge speedup on typical graphs.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 20 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 2 Passed
📊 Tests Coverage 100.0%
🌀 Click to see Generated Regression Tests
import pytest  # used for our unit tests
# import the function under test from the provided module path
from src.algorithms.graph import find_leaf_nodes

def test_single_node_no_edges():
    """A lone node in a graph without edges is reported as a leaf.

    Recorded timing for this call: 458ns -> 291ns (57.4% faster).
    """
    only_node = {"id": "A"}
    leaves = find_leaf_nodes([only_node], [])
    # Equality check: the result holds exactly that one node.
    assert leaves == [only_node]

def test_two_nodes_one_edge():
    """With the single edge A -> B, A is not a leaf and B is.

    Recorded timing for this call: 834ns -> 792ns (5.30% faster).
    """
    node_a, node_b = {"id": "A"}, {"id": "B"}
    a_to_b = {"source": "A", "target": "B"}
    leaves = find_leaf_nodes([node_a, node_b], [a_to_b])
    # Only B comes back.
    assert leaves == [node_b]

def test_multiple_leaves_preserved_order():
    """Leaves are returned in the order they appear in the input node list.

    Graph: A -> B, with C isolated; expected leaves are B then C.
    Recorded timing for this call: 917ns -> 833ns (10.1% faster).
    """
    node_a, node_b, node_c = {"id": "A"}, {"id": "B"}, {"id": "C"}
    leaves = find_leaf_nodes(
        [node_a, node_b, node_c],
        [{"source": "A", "target": "B"}],
    )
    assert leaves == [node_b, node_c]

def test_empty_nodes_returns_empty_list():
    """An empty node list yields no leaves, whatever edges are supplied.

    Recorded timing for this call: 333ns -> 583ns (42.9% slower).
    """
    unrelated_edges = [{"source": "X", "target": "Y"}]
    assert find_leaf_nodes([], unrelated_edges) == []

def test_edges_with_unknown_sources_do_not_mark_nodes_non_leaf():
    """An edge whose source matches no node id leaves every node a leaf.

    Neither "A" nor "B" is the source of any edge, so both are expected back.
    Recorded timing for this call: 834ns -> 750ns (11.2% faster).
    """
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    stray_edge = {"source": "UNKNOWN", "target": "A"}
    leaves = find_leaf_nodes(graph_nodes, [stray_edge])
    assert leaves == [graph_nodes[0], graph_nodes[1]]

def test_node_id_none_with_matching_edge_source():
    """A node whose id is None is a non-leaf when an edge has source None.

    Recorded timing for this call: 875ns -> 791ns (10.6% faster).
    """
    node_without_id = {"id": None, "label": "nil"}
    numbered_node = {"id": 1}
    leaves = find_leaf_nodes(
        [node_without_id, numbered_node],
        [{"source": None, "target": 1}],
    )
    # The None-id node has an outgoing edge, so only the other node remains.
    assert leaves == [numbered_node]

def test_duplicate_node_ids_are_preserved_when_no_outgoing_edges():
    """Two distinct node dicts sharing one id are both returned when no edges exist.

    Recorded timing for this call: 625ns -> 292ns (114% faster).
    """
    first_a = {"id": "A"}
    second_a = {"id": "A"}  # equal id, separate dict object
    leaves = find_leaf_nodes([first_a, second_a], [])
    assert leaves == [first_a, second_a]
    assert len(leaves) == 2

def test_multiple_outgoing_edges_from_same_source():
    """Several edges sharing one source still mark just that source as non-leaf.

    A points at both B and C; B and C have no outgoing edges.
    Recorded timing for this call: 1.17μs -> 916ns (27.4% faster).
    """
    graph_nodes = [{"id": "A"}, {"id": "B"}, {"id": "C"}]
    fan_out = [{"source": "A", "target": dest} for dest in ("B", "C")]
    leaves = find_leaf_nodes(graph_nodes, fan_out)
    assert leaves == [graph_nodes[1], graph_nodes[2]]

def test_missing_source_in_edge_raises_keyerror():
    """An edge dict lacking the 'source' key makes the call raise KeyError.

    Recorded timing for this call: 917ns -> 1.38μs (33.3% slower).
    """
    edge_without_source = {"target": "A"}
    with pytest.raises(KeyError):
        find_leaf_nodes([{"id": "A"}], [edge_without_source])

def test_large_chain_graph_with_single_leaf():
    """In the chain 0 -> 1 -> ... -> 999 only the final node is a leaf.

    Recorded timing for this call: 16.0ms -> 47.8μs (33350% faster).
    """
    size = 1000
    chain_nodes = [{"id": idx} for idx in range(size)]
    # One edge per consecutive pair: idx -> idx + 1.
    chain_edges = [{"source": idx, "target": idx + 1} for idx in range(size - 1)]
    leaves = find_leaf_nodes(chain_nodes, chain_edges)
    assert leaves == [chain_nodes[-1]]
    # Redundant with the equality above, kept as an explicit sanity check.
    assert len(leaves) == 1

def test_large_graph_all_nodes_are_leaves_when_no_edges():
    """With 1000 nodes and no edges, every node comes back in input order.

    Recorded timing for this call: 45.3μs -> 1.75μs (2488% faster).
    """
    count = 1000
    isolated = [{"id": f"node-{idx}"} for idx in range(count)]
    leaves = find_leaf_nodes(isolated, [])
    assert leaves == isolated
    assert len(leaves) == count
import pytest  # used for our unit tests
from src.algorithms.graph import find_leaf_nodes  # function under test

def test_empty_nodes_list_returns_empty_list():
    """No nodes means no leaves; the result is an empty list object.

    Recorded timing for this call: 333ns -> 583ns (42.9% slower).
    """
    ignored_edges = [{"source": 1, "target": 2}]
    leaves = find_leaf_nodes([], ignored_edges)
    assert leaves == []
    # The return value must be a list, not merely something equal to [].
    assert isinstance(leaves, list)

def test_no_edges_all_nodes_are_leaves():
    """Without edges, all three nodes are returned, in order, as the same objects.

    Recorded timing for this call: 709ns -> 250ns (184% faster).
    """
    graph_nodes = [{"id": "A"}, {"id": "B"}, {"id": "C"}]
    leaves = find_leaf_nodes(graph_nodes, [])
    assert leaves == graph_nodes
    # Identity, not just equality: the input dicts themselves are returned.
    for position in range(3):
        assert leaves[position] is graph_nodes[position]

def test_simple_tree_identifies_leaves_correctly():
    """Root 1 points at 2 and 3, so 2 and 3 are the leaves.

    Recorded timing for this call: 1.17μs -> 875ns (33.4% faster).
    """
    root, left, right = {"id": 1}, {"id": 2}, {"id": 3}
    tree_edges = [{"source": 1, "target": child} for child in (2, 3)]
    leaves = find_leaf_nodes([root, left, right], tree_edges)
    # Children are returned in input order.
    assert leaves == [left, right]
    # Every returned item is still a dict.
    assert all(isinstance(item, dict) for item in leaves)

def test_edge_sources_not_present_in_nodes_are_ignored():
    """An edge from "C", which is not a node id, does not affect leaf detection.

    Recorded timing for this call: 875ns -> 750ns (16.7% faster).
    """
    graph_nodes = [{"id": "A"}, {"id": "B"}]
    foreign_edge = {"source": "C", "target": "A"}
    leaves = find_leaf_nodes(graph_nodes, [foreign_edge])
    # Neither "A" nor "B" is an edge source, so both are leaves.
    assert leaves == graph_nodes

def test_duplicate_node_ids_handled_as_separate_objects():
    """An edge from id 1 makes every node dict carrying id 1 a non-leaf.

    Recorded timing for this call: 1.00μs -> 750ns (33.3% faster).
    """
    first_dup = {"id": 1, "label": "first"}
    second_dup = {"id": 1, "label": "second"}
    survivor = {"id": 2, "label": "third"}
    leaves = find_leaf_nodes(
        [first_dup, second_dup, survivor],
        [{"source": 1, "target": 2}],
    )
    # Only the id-2 node is left.
    assert leaves == [survivor]
    # Neither duplicate may appear in the result.
    assert not (first_dup in leaves or second_dup in leaves)

def test_none_id_and_source_match():
    """An edge with source None marks the None-id node as non-leaf.

    Recorded timing for this call: 1.00μs -> 792ns (26.3% faster).
    """
    none_node, zero_node = {"id": None}, {"id": 0}
    leaves = find_leaf_nodes(
        [none_node, zero_node],
        [{"source": None, "target": 0}],
    )
    # Only the id-0 node is reported.
    assert leaves == [zero_node]

def test_preserves_input_objects_and_order_for_mixed_cases():
    """The result holds the original dict objects in their input order.

    Graph: a -> b with c isolated, so 'a' is excluded and 'b', 'c' remain.
    Recorded timing for this call: 1.08μs -> 792ns (36.7% faster).
    """
    node_a, node_b, node_c = {"id": "a"}, {"id": "b"}, {"id": "c"}
    leaves = find_leaf_nodes(
        [node_a, node_b, node_c],
        [{"source": "a", "target": "b"}],
    )
    assert leaves == [node_b, node_c]
    # Identity check: the same instances are handed back.
    assert leaves[0] is node_b
    assert leaves[1] is node_c

def test_large_chain_of_1000_nodes_only_last_is_leaf():
    """For the chain 0 -> 1 -> ... -> 999 the lone leaf is node 999.

    Recorded timing for this call: 16.0ms -> 46.7μs (34207% faster).
    """
    length = 1000
    chain_nodes = [{"id": idx} for idx in range(length)]
    # Link each node idx (0..998) to its successor idx + 1.
    chain_edges = [{"source": idx, "target": idx + 1} for idx in range(length - 1)]
    leaves = find_leaf_nodes(chain_nodes, chain_edges)
    assert len(leaves) == 1
    tail = leaves[0]
    # Same dict object as the last input node.
    assert tail is chain_nodes[-1]
    assert tail["id"] == length - 1

def test_large_star_graph_many_leaves_and_duplicate_edges():
    """In a star with centre 0 and spokes to 1..999, every non-centre node is a leaf.

    Some spokes are repeated to exercise duplicate-edge handling.
    Recorded timing for this call: 27.3ms -> 48.1μs (56714% faster).
    """
    total = 1000
    star_nodes = [{"id": idx} for idx in range(total)]
    spokes = [{"source": 0, "target": idx} for idx in range(1, total)]
    # Every 100th spoke again, as duplicates.
    repeated_spokes = [{"source": 0, "target": idx} for idx in range(1, total, 100)]
    leaves = find_leaf_nodes(star_nodes, spokes + repeated_spokes)
    # Everything except the centre.
    assert len(leaves) == total - 1
    # Input order is kept: first leaf is node 1, last leaf is node 999.
    assert leaves[0] is star_nodes[1]
    assert leaves[-1] is star_nodes[-1]
    # The centre never shows up among the leaves.
    assert not any(leaf["id"] == 0 for leaf in leaves)
from src.algorithms.graph import find_leaf_nodes

def test_find_leaf_nodes():
    """Concolic case: the only edge's source (4) matches no node id.

    Node 3 is never an edge source, so it must come back as the single leaf.
    The original version only called the function; asserting on the result
    turns a does-not-raise check into a behavioral check.
    """
    node = {'id': 3}
    assert find_leaf_nodes([node], [{'source': 4}]) == [node]

def test_find_leaf_nodes_2():
    """Concolic case: the only edge's source (3) equals the only node's id.

    Node 3 has an outgoing edge, so no leaves are expected. The original
    version only called the function; asserting on the result turns a
    does-not-raise check into a behavioral check.
    """
    assert find_leaf_nodes([{'id': 3}], [{'source': 3}]) == []
🔎 Click to see Concolic Coverage Tests

To edit these changes git checkout codeflash/optimize-find_leaf_nodes-mmub3pdy and push.

Codeflash Static Badge

The optimized function cuts runtime from 59.4 ms to 0.157 ms (~378× faster) by building a set of edge["source"] values and using a single list comprehension to filter nodes instead of the original nested node×edge loop. The key insight is that membership checks against a precomputed set are O(1), turning the algorithm from O(n·m) into O(n+m); the line profiler shows the inner edge loop accounted for ~99% of the original time, so eliminating it yields the large win. It also short-circuits the no-edges case by returning a shallow copy for minimal overhead and wraps set construction in a try/except to fall back to the original nested loop if sources are unhashable or the 'source' key is missing, preserving the original exception behavior. Trade-offs: set construction adds a small upfront cost and a couple of microbenchmarks (empty-nodes and the missing-key fallback) show marginal regressions, but these are rare compared with the huge speedup on typical graphs.
@codeflash-ai codeflash-ai bot requested a review from KRRT7 March 17, 2026 07:44
@codeflash-ai codeflash-ai bot added ⚡️ codeflash Optimization PR opened by Codeflash AI 🎯 Quality: High Optimization Quality according to Codeflash labels Mar 17, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI 🎯 Quality: High Optimization Quality according to Codeflash

Projects

None yet

Development

Successfully merging this pull request may close these issues.

0 participants