Skip to content

⚡️ Speed up function find_leaf_nodes by 14,900% #303

Open
codeflash-ai[bot] wants to merge 1 commit into main from
codeflash/optimize-find_leaf_nodes-mmuc1iro
Open

⚡️ Speed up function find_leaf_nodes by 14,900% #303
codeflash-ai[bot] wants to merge 1 commit into main from
codeflash/optimize-find_leaf_nodes-mmuc1iro

Conversation

@codeflash-ai
Copy link

@codeflash-ai codeflash-ai bot commented Mar 17, 2026

📄 14,900% (149.00x) speedup for find_leaf_nodes in src/algorithms/graph.py

⏱️ Runtime : 46.5 milliseconds → 310 microseconds (best of 250 runs)

📝 Explanation and details

The optimization replaces the nested O(N×M) loop structure with a two-pass O(N+M) algorithm: first building a set of all source node IDs in a single pass over edges, then filtering nodes via a list comprehension that performs O(1) membership checks against that set instead of scanning all edges for each node. Line profiler confirms the nested loop previously consumed 99.5% of runtime (708ms total), while the optimized version completes in 0.52ms—a 1367× speedup. Small regressions on trivial inputs (empty/single-node cases, 23–55% slower) stem from set-construction overhead that is negligible in absolute terms (<1μs) and vastly outweighed by dramatic gains on realistic workloads (e.g., 1000-node tests improve 286–312×).

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 38 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 2 Passed
📊 Tests Coverage 100.0%
🌀 Click to see Generated Regression Tests
import pytest  # used for our unit tests

# Import the function exactly from its source module as provided.
from src.algorithms.graph import find_leaf_nodes


def test_single_outgoing_edge_identifies_leaf():
    """A node that is the source of an edge is excluded; its target stays a leaf."""
    source_node, target_node = {"id": 1}, {"id": 2}
    nodes = [source_node, target_node]
    edges = [{"source": 1, "target": 2}]  # single edge 1 -> 2

    leaves = find_leaf_nodes(nodes, edges)  # 962ns -> 952ns (1.05% faster)

    # The result is a one-element list holding the very same dict object for node 2.
    assert isinstance(leaves, list)
    assert len(leaves) == 1
    (only_leaf,) = leaves
    assert only_leaf is target_node
    assert only_leaf["id"] == 2


def test_no_edges_all_nodes_are_leaves_and_order_preserved():
    """With an empty edge list every node comes back, in order, by identity."""
    nodes = [{"id": label} for label in ("a", "b", "c")]

    leaves = find_leaf_nodes(nodes, [])  # 732ns -> 951ns (23.0% slower)

    assert leaves == nodes
    # Equality alone would accept copies; require the original objects.
    assert all(returned is original for original, returned in zip(nodes, leaves))


def test_empty_nodes_returns_empty_list():
    """No nodes means no leaves, even when edges are supplied."""
    lone_edge = {"source": 1, "target": 2}
    leaves = find_leaf_nodes([], [lone_edge])  # 311ns -> 672ns (53.7% slower)
    assert leaves == []


def test_missing_source_key_in_an_edge_raises_keyerror():
    """An edge dict without a "source" key makes the call raise KeyError."""
    malformed_edge = {"target": 2}  # deliberately missing "source"

    with pytest.raises(KeyError):
        find_leaf_nodes([{"id": 1}], [malformed_edge])  # 981ns -> 1.22μs (19.8% slower)


def test_string_vs_int_id_type_mismatch_counts_as_no_match():
    """The string "1" as an edge source does not match a node whose id is the int 1."""
    int_node = {"id": 1}
    str_source_edge = {"source": "1", "target": None}

    leaves = find_leaf_nodes([int_node], [str_source_edge])  # 792ns -> 862ns (8.12% slower)

    # The node is still reported as a leaf because "1" != 1.
    assert len(leaves) == 1
    assert leaves[0]["id"] == 1


def test_duplicate_node_ids_with_outgoing_edge_marks_all_duplicates_non_leaf():
    """Two distinct node dicts sharing an id are both excluded by one edge from that id."""
    shared_id = 42
    nodes = [
        {"id": shared_id, "label": "first"},
        {"id": shared_id, "label": "second"},  # separate object, same id
    ]
    edges = [{"source": shared_id, "target": 99}]

    leaves = find_leaf_nodes(nodes, edges)  # 881ns -> 902ns (2.33% slower)

    # Matching is by id value, so neither duplicate survives.
    assert leaves == []


def test_none_id_and_none_source_match_correctly():
    """None is a usable id: an edge whose source is None excludes the None-id node."""
    none_node, zero_node = {"id": None}, {"id": 0}
    edges = [{"source": None, "target": 1}]

    leaves = find_leaf_nodes([none_node, zero_node], edges)  # 1.04μs -> 981ns (6.22% faster)

    # Only the node with id 0 remains.
    assert len(leaves) == 1
    assert leaves[0]["id"] == 0


def test_mutating_returned_leaf_mutates_original_node_object():
    """Returned leaves are the input dicts themselves, so edits show through."""
    nodes = [{"id": 7, "value": 10}]

    leaves = find_leaf_nodes(nodes, [])  # 491ns -> 721ns (31.9% slower)
    leaves[0]["value"] = 999  # write through the returned reference

    assert nodes[0]["value"] == 999


def test_large_scale_half_non_leaf_half_leaf():
    """1000 nodes where ids 0..499 each have an outgoing edge; ids 500..999 are the leaves."""
    n = 1000
    half = n // 2
    nodes = [{"id": node_id} for node_id in range(n)]
    edges = [{"source": src, "target": src + 1} for src in range(half)]

    leaves = find_leaf_nodes(nodes, edges)  # 13.0ms -> 45.4μs (28626% faster)

    assert len(leaves) == n - half
    # Comparing the id sequence checks both membership and ordering.
    leaf_ids = [leaf["id"] for leaf in leaves]
    assert leaf_ids == list(range(half, n))


def test_large_scale_all_non_leaf_when_every_node_has_outgoing_edge():
    """A 1000-node ring gives every node an outgoing edge, so nothing is a leaf."""
    n = 1000
    nodes = []
    edges = []
    for i in range(n):
        nodes.append({"id": i})
        edges.append({"source": i, "target": (i + 1) % n})  # wrap around to close the ring

    leaves = find_leaf_nodes(nodes, edges)  # 17.9ms -> 57.3μs (31168% faster)

    assert leaves == []
import pytest
from src.algorithms.graph import find_leaf_nodes


def test_single_leaf_node():
    """A lone node with no edges is returned as the only leaf."""
    only_node = {"id": 1, "name": "A"}
    result = find_leaf_nodes([only_node], [])  # 631ns -> 841ns (25.0% slower)
    assert result == [{"id": 1, "name": "A"}]


def test_single_non_leaf_node():
    """A lone node that is the source of an edge is not a leaf."""
    only_node = {"id": 1, "name": "A"}
    outgoing = {"source": 1, "target": 2}
    result = find_leaf_nodes([only_node], [outgoing])  # 711ns -> 762ns (6.69% slower)
    assert result == []


def test_two_nodes_linear_chain():
    """In the two-node chain A -> B, only B is a leaf."""
    node_a = {"id": 1, "name": "A"}
    node_b = {"id": 2, "name": "B"}
    a_to_b = {"source": 1, "target": 2}
    result = find_leaf_nodes([node_a, node_b], [a_to_b])  # 912ns -> 871ns (4.71% faster)
    assert result == [{"id": 2, "name": "B"}]


def test_multiple_leaf_nodes():
    """Every node other than the single source A is reported as a leaf."""
    nodes = [{"id": i, "name": name} for i, name in enumerate("ABCD", start=1)]
    edges = [{"source": 1, "target": target} for target in (2, 3)]
    result = find_leaf_nodes(nodes, edges)  # 1.22μs -> 982ns (24.4% faster)
    assert len(result) == 3
    # B, C and D must each be present; ordering is not checked here.
    for expected in nodes[1:]:
        assert expected in result


def test_all_nodes_are_leaves():
    """With no edges at all, the full node list is returned."""
    nodes = [{"id": i, "name": name} for i, name in zip((1, 2, 3), "ABC")]
    result = find_leaf_nodes(nodes, [])  # 701ns -> 821ns (14.6% slower)
    assert result == nodes


def test_no_leaf_nodes():
    """In the 3-cycle 1 -> 2 -> 3 -> 1 every node has an outgoing edge, so none is a leaf."""
    nodes = [{"id": i, "name": name} for i, name in enumerate("ABC", start=1)]
    cycle = [(1, 2), (2, 3), (3, 1)]
    edges = [{"source": src, "target": dst} for src, dst in cycle]
    result = find_leaf_nodes(nodes, edges)  # 1.07μs -> 951ns (12.7% faster)
    assert result == []


def test_star_topology():
    """The hub of a star is excluded; each of its three spokes is a leaf."""
    center = {"id": 1, "name": "center"}
    spokes = [{"id": k + 1, "name": f"leaf{k}"} for k in (1, 2, 3)]
    nodes = [center, *spokes]
    edges = [{"source": 1, "target": spoke["id"]} for spoke in spokes]
    result = find_leaf_nodes(nodes, edges)  # 1.26μs -> 1.02μs (23.6% faster)
    assert len(result) == 3
    # Membership only; this test does not pin the order of the spokes.
    for spoke in spokes:
        assert spoke in result


def test_node_with_multiple_outgoing_edges():
    """A node with several outgoing edges is excluded; the remaining nodes stay leaves."""
    nodes = [{"id": i, "name": name} for i, name in enumerate("ABC", start=1)]
    edges = [{"source": 1, "target": dst} for dst in (2, 3)]
    result = find_leaf_nodes(nodes, edges)  # 1.02μs -> 871ns (17.3% faster)
    assert result == [{"id": 2, "name": "B"}, {"id": 3, "name": "C"}]


def test_incoming_edges_ignored():
    """Test that incoming edges do not prevent a node from being a leaf.

    Graph under test: 2 -> 3 -> 1. Node 1 is only ever an edge *target*, so it
    must be reported as a leaf, while nodes 2 and 3 (both edge sources) must not.
    """
    nodes = [
        {"id": 1, "name": "A"},
        {"id": 2, "name": "B"},
        {"id": 3, "name": "C"},
    ]
    # No edge has node 1 as its source. (An extra 1 -> 2 edge would close the
    # cycle and leave the graph with no leaves at all.)
    edges = [
        {"source": 2, "target": 3},
        {"source": 3, "target": 1},  # incoming edge to node 1
    ]
    result = find_leaf_nodes(nodes, edges)
    # Node 1 has an incoming edge but no outgoing edges, so it is the only leaf.
    assert result == [{"id": 1, "name": "A"}]


def test_empty_nodes_list():
    """Empty nodes together with empty edges produce an empty result."""
    result = find_leaf_nodes([], [])  # 320ns -> 571ns (44.0% slower)
    assert result == []


def test_empty_nodes_with_edges():
    """Edges alone, with no nodes to classify, still produce an empty result."""
    stray_edge = {"source": 1, "target": 2}
    result = find_leaf_nodes([], [stray_edge])  # 291ns -> 642ns (54.7% slower)
    assert result == []


def test_nodes_with_string_ids():
    """String ids are matched against edge sources just like integer ids."""
    node_a = {"id": "node_a", "name": "A"}
    node_b = {"id": "node_b", "name": "B"}
    a_to_b = {"source": "node_a", "target": "node_b"}
    result = find_leaf_nodes([node_a, node_b], [a_to_b])  # 1.03μs -> 902ns (14.4% faster)
    assert result == [{"id": "node_b", "name": "B"}]


def test_nodes_with_extra_attributes():
    """Attributes beyond "id" are carried through untouched on returned leaves."""
    red_node = {"id": 1, "name": "A", "color": "red", "size": 10}
    blue_node = {"id": 2, "name": "B", "color": "blue", "size": 20}
    result = find_leaf_nodes(
        [red_node, blue_node],
        [{"source": 1, "target": 2}],
    )  # 842ns -> 832ns (1.20% faster)
    assert result == [{"id": 2, "name": "B", "color": "blue", "size": 20}]


def test_edges_with_extra_attributes():
    """Extra keys on an edge (weight, label, ...) have no effect on the outcome."""
    decorated_edge = {"source": 1, "target": 2, "weight": 5, "label": "connects"}
    nodes = [{"id": i, "name": name} for i, name in enumerate("AB", start=1)]
    result = find_leaf_nodes(nodes, [decorated_edge])  # 781ns -> 771ns (1.30% faster)
    assert result == [{"id": 2, "name": "B"}]


def test_self_loop():
    """A self-loop counts as an outgoing edge, so the node is not a leaf."""
    looping_node = {"id": 1, "name": "A"}
    self_edge = {"source": 1, "target": 1}
    result = find_leaf_nodes([looping_node], [self_edge])  # 581ns -> 681ns (14.7% slower)
    assert result == []


def test_duplicate_edges():
    """Repeating the same edge twice gives the same answer as listing it once."""
    nodes = [{"id": i, "name": name} for i, name in enumerate("AB", start=1)]
    # Two separate but equal edge dicts, both 1 -> 2.
    edges = [{"source": 1, "target": 2} for _ in range(2)]
    result = find_leaf_nodes(nodes, edges)  # 841ns -> 862ns (2.44% slower)
    assert result == [{"id": 2, "name": "B"}]


def test_node_id_zero():
    """An id of 0 (falsy) is still matched against edge sources."""
    zero = {"id": 0, "name": "zero"}
    one = {"id": 1, "name": "one"}
    result = find_leaf_nodes([zero, one], [{"source": 0, "target": 1}])  # 751ns -> 801ns (6.24% slower)
    assert result == [{"id": 1, "name": "one"}]


def test_negative_node_ids():
    """Negative integers work as node ids and edge sources."""
    negative = {"id": -1, "name": "negative"}
    positive = {"id": 1, "name": "positive"}
    result = find_leaf_nodes([negative, positive], [{"source": -1, "target": 1}])  # 761ns -> 902ns (15.6% slower)
    assert result == [{"id": 1, "name": "positive"}]


def test_complex_node_attributes():
    """Nodes carrying nested dicts and lists are handled like any other node."""
    first = {"id": 1, "data": {"nested": "value"}, "tags": [1, 2, 3]}
    second = {"id": 2, "data": {"other": "data"}, "tags": [4, 5]}
    result = find_leaf_nodes([first, second], [{"source": 1, "target": 2}])  # 781ns -> 781ns (0.000% faster)
    # Only the id of the surviving node is checked.
    assert len(result) == 1
    assert result[0]["id"] == 2


def test_edge_with_nonexistent_source():
    """An edge whose source matches no node leaves every node a leaf."""
    dangling_edge = {"source": 999, "target": 1}
    result = find_leaf_nodes([{"id": 1, "name": "A"}], [dangling_edge])  # 642ns -> 801ns (19.9% slower)
    # Id 1 never appears as a source, so node 1 is returned.
    assert result == [{"id": 1, "name": "A"}]


def test_preserves_node_order():
    """Leaves come back in input order, not sorted by id."""
    # Ids are deliberately out of numeric order: 3, 1, 2.
    nodes = [{"id": i, "name": name} for i, name in ((3, "C"), (1, "A"), (2, "B"))]
    result = find_leaf_nodes(nodes, [])  # 682ns -> 842ns (19.0% slower)
    assert result == nodes


def test_large_linear_chain():
    """In a 100-node chain 0 -> 1 -> ... -> 99, only the final node is a leaf."""
    num_nodes = 100
    last_id = num_nodes - 1
    nodes = [{"id": i, "name": f"node_{i}"} for i in range(num_nodes)]
    edges = [{"source": i, "target": i + 1} for i in range(last_id)]
    result = find_leaf_nodes(nodes, edges)  # 173μs -> 6.58μs (2539% faster)
    assert result == [{"id": last_id, "name": f"node_{last_id}"}]


def test_large_star_topology():
    """Test with a large star topology with 100 leaf nodes.

    Node 0 is the center and the source of every edge; nodes 1..100 are the
    spokes and never appear as a source.
    """
    center_node = {"id": 0, "name": "center"}
    leaf_nodes = [{"id": i, "name": f"leaf_{i}"} for i in range(1, 101)]
    nodes = [center_node] + leaf_nodes
    edges = [{"source": 0, "target": i} for i in range(1, 101)]
    result = find_leaf_nodes(nodes, edges)  # 324μs -> 6.39μs (4971% faster)
    assert len(result) == 100
    # Exactly the spokes, in input order. A single list comparison also avoids
    # the quadratic `node in result` scan per spoke.
    assert result == leaf_nodes


def test_large_complete_graph():
    """A 50-node complete directed graph has no leaves."""
    num_nodes = 50
    nodes = [{"id": i, "name": f"node_{i}"} for i in range(num_nodes)]
    # Every ordered pair (src, dst) with src != dst becomes an edge.
    edges = []
    for src in range(num_nodes):
        for dst in range(num_nodes):
            if src != dst:
                edges.append({"source": src, "target": dst})
    result = find_leaf_nodes(nodes, edges)  # 1.94ms -> 50.6μs (3736% faster)
    assert result == []


def test_large_two_level_hierarchy():
    """Test with a two-level hierarchy: parent nodes connecting to many children.

    Ten parents (ids 0..9) each point at the first ten children
    (ids 1000..1009); the other ninety children have no edges at all.
    """
    parents = [{"id": i, "name": f"parent_{i}"} for i in range(10)]
    children = [{"id": 1000 + i, "name": f"child_{i}"} for i in range(100)]
    nodes = parents + children
    edges = [{"source": i, "target": 1000 + j} for i in range(10) for j in range(10)]
    result = find_leaf_nodes(nodes, edges)  # 368μs -> 7.70μs (4690% faster)
    # Every parent is the source of an edge, so no parent is a leaf.
    # Children only ever appear as targets (or not at all), so all 100 of them
    # are leaves, returned in input order.
    assert result == children


def test_large_number_of_edges():
    """Test with many nodes and many edges.

    The 500 edges are deterministic: edge ``i`` goes from ``i % 100`` to
    ``(i + 1) % 100``, i.e. a 100-node ring traversed five times.
    """
    num_nodes = 100
    nodes = [{"id": i, "name": f"node_{i}"} for i in range(num_nodes)]
    # 500 edges cycling through the ring; every id 0..99 appears as a source.
    edges = [
        {"source": i % num_nodes, "target": (i + 1) % num_nodes} for i in range(500)
    ]
    result = find_leaf_nodes(nodes, edges)  # 173μs -> 14.5μs (1095% faster)
    # Every node has an outgoing edge, so there are no leaves.
    assert result == []


def test_large_acyclic_graph():
    """A layered DAG (10 levels x 20 nodes, fully connected level to level) has leaves only in its last level."""
    num_levels = 10
    nodes_per_level = 20

    nodes = []
    for level in range(num_levels):
        base = level * nodes_per_level
        for i in range(nodes_per_level):
            nodes.append({"id": base + i, "name": f"node_L{level}_N{i}"})

    # Each node in a level points at every node in the next level.
    edges = [
        {
            "source": level * nodes_per_level + i,
            "target": (level + 1) * nodes_per_level + j,
        }
        for level in range(num_levels - 1)
        for i in range(nodes_per_level)
        for j in range(nodes_per_level)
    ]

    result = find_leaf_nodes(nodes, edges)  # 12.5ms -> 79.6μs (15668% faster)

    # Nodes in the last level are never a source, so they are the leaves.
    first_last_level_id = (num_levels - 1) * nodes_per_level
    assert len(result) == nodes_per_level
    assert all(node["id"] >= first_last_level_id for node in result)


def test_very_large_node_list():
    """500 isolated nodes (no edges) are all returned as leaves."""
    num_nodes = 500
    nodes = [{"id": i, "name": f"node_{i}"} for i in range(num_nodes)]
    result = find_leaf_nodes(nodes, [])  # 24.2μs -> 15.9μs (52.8% faster)
    assert result == nodes
    assert len(result) == num_nodes
from src.algorithms.graph import find_leaf_nodes


def test_find_leaf_nodes():
    """Concolic case: the only edge's source (4) matches no node, so node 3 is a leaf."""
    # Pin the result so the test checks more than "does not raise".
    assert find_leaf_nodes([{"id": 3}], [{"source": 4}]) == [{"id": 3}]


def test_find_leaf_nodes_2():
    """Concolic case: the only edge's source (3) matches the node, so there is no leaf."""
    # Pin the result so the test checks more than "does not raise".
    assert find_leaf_nodes([{"id": 3}], [{"source": 3}]) == []
🔎 Click to see Concolic Coverage Tests

To edit these changes git checkout codeflash/optimize-find_leaf_nodes-mmuc1iro and push.

Codeflash Static Badge

The optimization replaces the nested O(N×M) loop structure with a two-pass O(N+M) algorithm: first building a set of all source node IDs in a single pass over edges, then filtering nodes via a list comprehension that performs O(1) membership checks against that set instead of scanning all edges for each node. Line profiler confirms the nested loop previously consumed 99.5% of runtime (708ms total), while the optimized version completes in 0.52ms—a 1367× speedup. Small regressions on trivial inputs (empty/single-node cases, 23–55% slower) stem from set-construction overhead that is negligible in absolute terms (<1μs) and vastly outweighed by dramatic gains on realistic workloads (e.g., 1000-node tests improve 286–312×).
@codeflash-ai codeflash-ai bot requested a review from KRRT7 March 17, 2026 08:10
@codeflash-ai codeflash-ai bot added ⚡️ codeflash Optimization PR opened by Codeflash AI 🎯 Quality: High Optimization Quality according to Codeflash labels Mar 17, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI 🎯 Quality: High Optimization Quality according to Codeflash

Projects

None yet

Development

Successfully merging this pull request may close these issues.

0 participants