From 4464decdc5c71116e990670eeaf2f1f04bb082fd Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Tue, 14 Apr 2026 20:00:46 -0500
Subject: [PATCH 1/8] Prep work for Hybrid model integration

---
 tests/unit/test_optional_submodule.py         | 887 ++++++++++++++++++
 .../benchmarks/weight_processing.py           | 136 ++-
 transformer_lens/model_bridge/bridge.py       | 343 ++++++-
 .../model_bridge/component_setup.py           |  46 +-
 .../model_bridge/composition_scores.py        | 102 ++
 .../generalized_components/base.py            |   7 +
 .../generalized_components/block.py           |  15 +
 .../model_bridge/get_params_util.py           | 146 ++-
 transformer_lens/weight_processing.py         |  15 +
 9 files changed, 1555 insertions(+), 142 deletions(-)
 create mode 100644 tests/unit/test_optional_submodule.py
 create mode 100644 transformer_lens/model_bridge/composition_scores.py

diff --git a/tests/unit/test_optional_submodule.py b/tests/unit/test_optional_submodule.py
new file mode 100644
index 000000000..4bc44e6bc
--- /dev/null
+++ b/tests/unit/test_optional_submodule.py
@@ -0,0 +1,887 @@
+"""Unit tests for the optional submodule framework.
+
+Tests the `optional` flag on GeneralizedComponent and the `blocks_with()`
+capability query API on TransformerBridge, which together enable hybrid
+architectures where layers have structurally different submodules.
+"""
+
+import pytest
+import torch
+import torch.nn as nn
+
+from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
+from transformer_lens.model_bridge.component_setup import setup_submodules
+from transformer_lens.model_bridge.generalized_components.base import (
+    GeneralizedComponent,
+)
+from transformer_lens.model_bridge.generalized_components.block import BlockBridge
+from transformer_lens.model_bridge.generalized_components.linear import LinearBridge
+
+# ============================================================================
+# Fixtures: synthetic hybrid model
+# ============================================================================
+
+
+class FakeSubmodule(nn.Module):
+    """Test module wrapping one bias-free nn.Linear, registered as ``proj``."""
+
+    def __init__(self, dim: int = 4):
+        super().__init__()
+        self.proj = nn.Linear(dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.proj(x)
+
+
+class HybridLayer(nn.Module):
+    """A layer that conditionally has a 'foo' submodule."""
+
+    def __init__(self, has_foo: bool, dim: int = 4):
+        super().__init__()
+        self.bar = nn.Linear(dim, dim, bias=False)
+        if has_foo:
+            self.foo = FakeSubmodule(dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "foo"):
+            x = self.foo(x)
+        return self.bar(x)
+
+
+class HybridModel(nn.Module):
+    """Model with 4 layers: layers 0-2 have 'foo', layer 3 does not."""
+
+    def __init__(self, dim: int = 4):
+        super().__init__()
+        self.layers = nn.ModuleList([HybridLayer(has_foo=(i < 3), dim=dim) for i in range(4)])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for layer in self.layers:
+            x = layer(x)
+        return x
+
+
+class MinimalAdapter(ArchitectureAdapter):
+    """Minimal adapter for testing optional submodule setup."""
+
+    def __init__(self, optional: bool = True):
+        self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})()
+        self.component_mapping = {}
+        self._optional = optional
+
+    def make_block_template(self) -> BlockBridge:
+        return BlockBridge(
+            name="layers",
+            submodules={
+                "bar": LinearBridge(name="bar"),
+                "foo": LinearBridge(name="foo", optional=self._optional),
+            },
+        )
+
+
+# ============================================================================
+# Tests: optional flag on GeneralizedComponent
+# ============================================================================
+
+
+class TestOptionalFlag:
+    """Test that the optional flag is properly stored and defaults to False."""
+
+    def test_default_is_false(self):
+        comp = GeneralizedComponent(name="test")
+        assert comp.optional is False
+
+    def test_optional_true(self):
+        comp = GeneralizedComponent(name="test", optional=True)
+        assert comp.optional is True
+
+    def test_optional_false_explicit(self):
+        comp = GeneralizedComponent(name="test", optional=False)
+        assert comp.optional is False
+
+
+# ============================================================================
+# Tests: setup_submodules with optional
+# ============================================================================
+
+
+class TestOptionalSubmoduleSetup:
+    """Test that optional submodules are skipped cleanly during setup."""
+
+    def test_optional_submodule_skipped_on_missing_layers(self):
+        """Layers 0-2 have 'foo', layer 3 does not. Setup should succeed."""
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=True)
+        template = adapter.make_block_template()
+
+        # Simulate what setup_blocks_bridge does: deepcopy + setup per layer
+        import copy
+
+        blocks = []
+        for i, layer in enumerate(model.layers):
+            block = copy.deepcopy(template)
+            block.name = f"layers.{i}"
+            block.set_original_component(layer)
+            setup_submodules(block, adapter, layer)
+            blocks.append(block)
+
+        # Layers 0-2 should have 'foo' in real_components
+        for i in range(3):
+            assert "foo" in blocks[i].real_components, f"Block {i} should have 'foo'"
+            assert hasattr(blocks[i], "foo"), f"Block {i} should have foo module"
+
+        # Layer 3 should NOT have 'foo' in any lookup path
+        assert (
+            "foo" not in blocks[3].real_components
+        ), "Block 3 should not have 'foo' in real_components"
+        assert "foo" not in blocks[3]._modules, "Block 3 should not have 'foo' in _modules"
+        assert "foo" not in blocks[3].submodules, "Block 3 should not have 'foo' in submodules"
+
+        # All layers should have 'bar'
+        for i in range(4):
+            assert "bar" in blocks[i].real_components, f"Block {i} should have 'bar'"
+
+    def test_non_optional_missing_submodule_raises(self):
+        """When optional=False, missing submodule should raise AttributeError."""
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=False)
+        template = adapter.make_block_template()
+
+        import copy
+
+        # Layer 3 lacks 'foo' and optional=False, so this should raise
+        block = copy.deepcopy(template)
+        block.name = "layers.3"
+        block.set_original_component(model.layers[3])
+        with pytest.raises(AttributeError):
+            setup_submodules(block, adapter, model.layers[3])
+
+
+# ============================================================================
+# Tests: blocks_with() API
+# ============================================================================
+
+
+class TestBlocksWith:
+    """Test the blocks_with() capability query on TransformerBridge."""
+
+    def test_blocks_with_returns_matching_blocks(self):
+        """blocks_with('foo') should return only blocks that have 'foo'."""
+        from transformer_lens.model_bridge.bridge import TransformerBridge
+
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=True)
+        template = adapter.make_block_template()
+
+        import copy
+
+        blocks = nn.ModuleList()
+        for i, layer in enumerate(model.layers):
+            block = copy.deepcopy(template)
+            block.name = f"layers.{i}"
+            block.set_original_component(layer)
+            setup_submodules(block, adapter, layer)
+            blocks.append(block)
+
+        # Create a minimal bridge-like object with blocks attribute
+        # We test blocks_with as a standalone method
+        bridge = TransformerBridge.__new__(TransformerBridge)
+        nn.Module.__init__(bridge)
+        bridge.add_module("blocks", blocks)
+
+        foo_blocks = bridge.blocks_with("foo")
+        assert len(foo_blocks) == 3
+        assert [idx for idx, _ in foo_blocks] == [0, 1, 2]
+
+        bar_blocks = bridge.blocks_with("bar")
+        assert len(bar_blocks) == 4
+
+        missing_blocks = bridge.blocks_with("nonexistent")
+        assert len(missing_blocks) == 0
+
+    def test_blocks_with_no_blocks_attribute(self):
+        """blocks_with() should return empty list if no blocks attribute."""
+        from transformer_lens.model_bridge.bridge import TransformerBridge
+
+        bridge = TransformerBridge.__new__(TransformerBridge)
+        nn.Module.__init__(bridge)
+        assert bridge.blocks_with("attn") == []
+
+
+# ============================================================================
+# Tests: _stack_block_params with hybrid blocks
+# ============================================================================
+
+
+class TestStackBlockParamsHybridSafe:
+    """Test _stack_block_params on hybrid blocks: warn on partial match, raise on none."""
+
+    def test_logs_warning_and_returns_subset_on_hybrid(self, caplog):
+        """On hybrid blocks, should log warning and return tensor for matching blocks only."""
+        import logging
+
+        from transformer_lens.model_bridge.bridge import TransformerBridge
+
+        # Build blocks where block 3 lacks 'foo' but blocks 0-2 have it
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=True)
+        template = adapter.make_block_template()
+
+        import copy
+
+        blocks = nn.ModuleList()
+        for i, layer in enumerate(model.layers):
+            block = copy.deepcopy(template)
+            block.name = f"layers.{i}"
+            block.set_original_component(layer)
+            setup_submodules(block, adapter, layer)
+            blocks.append(block)
+
+        # Verify precondition: block 3 lacks 'foo'
+        assert "foo" in blocks[0]._modules
+        assert "foo" not in blocks[3]._modules
+
+        bridge = TransformerBridge.__new__(TransformerBridge)
+        nn.Module.__init__(bridge)
+        bridge.add_module("blocks", blocks)
+
+        # Should succeed with a log warning, returning only matching blocks.
+        # logging.warning always emits (no deduplication), so researchers see
+        # the index mapping notice on every access — not just the first.
+        with caplog.at_level(logging.WARNING):
+            result = bridge._stack_block_params("foo.proj.weight")
+        assert any("Hybrid model" in msg for msg in caplog.messages)
+        assert any("stack_params_for" in msg for msg in caplog.messages)
+        # 3 blocks have 'foo', not 4
+        assert result.shape[0] == 3
+
+        # Verify it logs again on a second call (no deduplication)
+        caplog.clear()
+        with caplog.at_level(logging.WARNING):
+            bridge._stack_block_params("foo.proj.weight")
+        assert any(
+            "Hybrid model" in msg for msg in caplog.messages
+        ), "Warning should emit on every call, not just the first"
+
+    def test_raises_when_no_blocks_have_submodule(self):
+        """Should raise AttributeError when zero blocks have the submodule."""
+        # The hybrid bridge only bridges 'foo' and 'bar'; no block exposes
+        # "nonexistent", so there is nothing to stack and the call must raise.
+        bridge = _make_hybrid_bridge()
+        with pytest.raises(AttributeError, match="No blocks have"):
+            bridge._stack_block_params("nonexistent")
+
+    def test_succeeds_on_universal_submodule(self):
+        """Should succeed when all blocks have the requested submodule."""
+        from transformer_lens.model_bridge.bridge import TransformerBridge
+
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=True)
+        template = adapter.make_block_template()
+
+        import copy
+
+        blocks = nn.ModuleList()
+        for i, layer in enumerate(model.layers):
+            block = copy.deepcopy(template)
+            block.name = f"layers.{i}"
+            block.set_original_component(layer)
+            setup_submodules(block, adapter, layer)
+            blocks.append(block)
+
+        bridge = TransformerBridge.__new__(TransformerBridge)
+        nn.Module.__init__(bridge)
+        bridge.add_module("blocks", blocks)
+
+        # 'bar' exists on all blocks → should succeed
+        result = bridge._stack_block_params("bar.weight")
+        assert result.shape[0] == 4  # 4 layers
+
+
+# ============================================================================
+# Tests: refactor_factored_attn_matrices with missing layers
+# ============================================================================
+
+
+class TestRefactorFactoredAttnHybrid:
+    """Test that refactor_factored_attn_matrices skips layers without attn."""
+
+    def test_skips_missing_attn_layers(self):
+        """Should process layers with attn keys and skip those without."""
+        from transformer_lens.config.TransformerLensConfig import TransformerLensConfig
+        from transformer_lens.weight_processing import ProcessWeights
+
+        n_heads = 2
+        d_head = 4
+        d_model = n_heads * d_head
+        cfg = TransformerLensConfig(
+            n_layers=4,
+            n_heads=n_heads,
+            d_head=d_head,
+            d_model=d_model,
+            n_ctx=16,
+            positional_embedding_type="standard",
+        )
+
+        # Create state_dict with attn weights for layers 0-2 only.
+        # W_Q/W_K/W_V: [n_heads, d_model, d_head], W_O: [n_heads, d_head, d_model]
+        # b_Q/b_K/b_V: [n_heads, d_head], b_O: [d_model]
+        state_dict = {}
+        for layer_idx in range(3):  # layers 0-2 have attention
+            state_dict[f"blocks.{layer_idx}.attn.W_Q"] = torch.randn(n_heads, d_model, d_head)
+            state_dict[f"blocks.{layer_idx}.attn.W_K"] = torch.randn(n_heads, d_model, d_head)
+            state_dict[f"blocks.{layer_idx}.attn.W_V"] = torch.randn(n_heads, d_model, d_head)
+            state_dict[f"blocks.{layer_idx}.attn.W_O"] = torch.randn(n_heads, d_head, d_model)
+            state_dict[f"blocks.{layer_idx}.attn.b_Q"] = torch.randn(n_heads, d_head)
+            state_dict[f"blocks.{layer_idx}.attn.b_K"] = torch.randn(n_heads, d_head)
+            state_dict[f"blocks.{layer_idx}.attn.b_V"] = torch.randn(n_heads, d_head)
+            state_dict[f"blocks.{layer_idx}.attn.b_O"] = torch.randn(d_model)
+
+        # Layer 3 has NO attention keys — should be skipped, not crash
+        result = ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg)
+
+        # Layers 0-2 should still have their attn keys (now refactored)
+        for layer_idx in range(3):
+            assert f"blocks.{layer_idx}.attn.W_Q" in result
+            assert f"blocks.{layer_idx}.attn.W_K" in result
+
+        # Layer 3 should have no attn keys
+        assert "blocks.3.attn.W_Q" not in result
+
+
+# ============================================================================
+# Tests: weight distribution with ragged blocks
+# ============================================================================
+
+
+class TestWeightDistributionRagged:
+    """Test that weight distribution handles heterogeneous real_components."""
+
+    def test_distribute_weights_skips_empty_blocks(self):
+        """Weight distribution should not crash when block 3 has no 'foo' weights."""
+        from transformer_lens.weight_processing import ProcessWeights
+
+        # Build a minimal real_components mapping with ragged blocks
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=True)
+        template = adapter.make_block_template()
+
+        import copy
+
+        blocks = []
+        for i, layer in enumerate(model.layers):
+            block = copy.deepcopy(template)
+            block.name = f"layers.{i}"
+            block.set_original_component(layer)
+            setup_submodules(block, adapter, layer)
+            blocks.append(block)
+
+        # Construct state_dict with 'foo' weights for blocks 0-2 only
+        state_dict = {}
+        for i in range(3):
+            state_dict[f"blocks.{i}.foo.weight"] = torch.randn(4, 4)
+        for i in range(4):
+            state_dict[f"blocks.{i}.bar.weight"] = torch.randn(4, 4)
+
+        # Build the component mapping
+        component_mapping = {
+            "blocks": ("layers", blocks),
+        }
+
+        # This should not crash
+        ProcessWeights.distribute_weights_to_components(
+            state_dict=state_dict,
+            component_mapping=component_mapping,
+        )
+
+
+# ============================================================================
+# Helpers for bridge-level tests
+# ============================================================================
+
+
+def _make_hybrid_bridge():
+    """Build a minimal TransformerBridge with hybrid blocks for testing.
+
+    Uses 'foo' and 'bar' as submodule names. Layers 0-2 have 'foo', layer 3 does not.
+    """
+    import copy
+
+    from transformer_lens.model_bridge.bridge import TransformerBridge
+
+    model = HybridModel()
+    adapter = MinimalAdapter(optional=True)
+    template = adapter.make_block_template()
+
+    blocks = nn.ModuleList()
+    for i, layer in enumerate(model.layers):
+        block = copy.deepcopy(template)
+        block.name = f"layers.{i}"
+        block.set_original_component(layer)
+        setup_submodules(block, adapter, layer)
+        blocks.append(block)
+
+    bridge = TransformerBridge.__new__(TransformerBridge)
+    nn.Module.__init__(bridge)
+    bridge.add_module("blocks", blocks)
+
+    # Minimal cfg for accumulated_bias
+    bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4})()
+    return bridge
+
+
+class AttnAdapter(ArchitectureAdapter):
+    """Adapter using 'attn' as the optional submodule name (matches real adapters)."""
+
+    def __init__(self):
+        self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})()
+        self.component_mapping = {}
+
+    def make_block_template(self) -> BlockBridge:
+        return BlockBridge(
+            name="layers",
+            submodules={
+                "bar": LinearBridge(name="bar"),
+                "attn": LinearBridge(name="foo", optional=True),
+            },
+        )
+
+
+def _make_hybrid_bridge_with_attn():
+    """Build a hybrid bridge where 'attn' is the optional submodule.
+
+    Layers 0-2 have 'attn' (mapped from 'foo'), layer 3 does not.
+    Used for testing APIs that specifically look for 'attn' (composition scores, labels).
+    """
+    import copy
+
+    from transformer_lens.model_bridge.bridge import TransformerBridge
+
+    model = HybridModel()
+    adapter = AttnAdapter()
+    template = adapter.make_block_template()
+
+    blocks = nn.ModuleList()
+    for i, layer in enumerate(model.layers):
+        block = copy.deepcopy(template)
+        block.name = f"layers.{i}"
+        block.set_original_component(layer)
+        setup_submodules(block, adapter, layer)
+        blocks.append(block)
+
+    bridge = TransformerBridge.__new__(TransformerBridge)
+    nn.Module.__init__(bridge)
+    bridge.add_module("blocks", blocks)
+    bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4, "n_heads": 2})()
+    return bridge
+
+
+# ============================================================================
+# Tests: blocks_with uses _modules not hasattr
+# ============================================================================
+
+
+class TestBlocksWithModulesCheck:
+    """blocks_with() should only find bridged submodules, not HF attrs."""
+
+    def test_does_not_find_hf_internal_attrs(self):
+        """blocks_with should not match HF attributes that aren't bridged."""
+        bridge = _make_hybrid_bridge()
+        # 'bar' is a bridged submodule (in _modules), should be found
+        assert len(bridge.blocks_with("bar")) == 4
+        # 'training' exists as an attr on nn.Module but is not a bridged submodule
+        assert len(bridge.blocks_with("training")) == 0
+
+    def test_finds_only_bridged_optional_submodules(self):
+        """Optional submodules should be found only on layers where they were bound."""
+        bridge = _make_hybrid_bridge()
+        foo_blocks = bridge.blocks_with("foo")
+        assert [idx for idx, _ in foo_blocks] == [0, 1, 2]
+
+
+# ============================================================================
+# Tests: accumulated_bias on hybrid models
+# ============================================================================
+
+
+class TestAccumulatedBiasHybrid:
+    """accumulated_bias should not crash on hybrid models."""
+
+    def test_accumulated_bias_skips_non_attn_layers(self):
+        """Should not crash when some layers lack attention."""
+        bridge = _make_hybrid_bridge()
+        # Should run without error through all 4 layers (none of them bridge 'attn')
+        result = bridge.accumulated_bias(layer=4)
+        assert result.shape == (4,)
+
+    def test_accumulated_bias_mlp_input_on_non_attn_layer(self):
+        """mlp_input=True on a non-attention layer should not crash."""
+        bridge = _make_hybrid_bridge()
+        # Layer 3 has no attn — should still work with mlp_input=True
+        result = bridge.accumulated_bias(layer=3, mlp_input=True)
+        assert result.shape == (4,)
+
+
+# ============================================================================
+# Tests: block_submodules and layer_types introspection
+# ============================================================================
+
+
+class TestBlockIntrospection:
+    """Test layer introspection APIs."""
+
+    def test_block_submodules(self):
+        """block_submodules should list bridged submodules per layer."""
+        bridge = _make_hybrid_bridge()
+        # Layer 0 has both foo and bar
+        subs_0 = bridge.block_submodules(0)
+        assert "foo" in subs_0
+        assert "bar" in subs_0
+        # Layer 3 has only bar
+        subs_3 = bridge.block_submodules(3)
+        assert "foo" not in subs_3
+        assert "bar" in subs_3
+
+    def test_layer_types(self):
+        """layer_types should return a list with one entry per block."""
+        bridge = _make_hybrid_bridge()
+        types = bridge.layer_types()
+        assert len(types) == 4
+        # Layers 0-2 have 'foo', layer 3 does not
+        for i in range(3):
+            assert "foo" in types[i]
+        assert "foo" not in types[3]
+
+
+# ============================================================================
+# Tests: stack_params_for hybrid API
+# ============================================================================
+
+
+class TestStackParamsFor:
+    """Test stack_params_for on hybrid bridges."""
+
+    def test_returns_correct_indices_and_tensors(self):
+        """stack_params_for should return only matching blocks."""
+        bridge = _make_hybrid_bridge()
+        indices, stacked = bridge.stack_params_for("foo", "foo.proj.weight")
+        assert indices == [0, 1, 2]
+        assert stacked.shape[0] == 3
+
+    def test_raises_on_no_matching_blocks(self):
+        """Should raise ValueError when no blocks have the submodule."""
+        bridge = _make_hybrid_bridge()
+        with pytest.raises(ValueError, match="No blocks have submodule"):
+            bridge.stack_params_for("nonexistent", "nonexistent.weight")
+
+
+# ============================================================================
+# Tests: refactor guard validates all attn keys
+# ============================================================================
+
+
+class TestRefactorGuardConsistency:
+    """Test that refactor raises on inconsistent attn keys (W_Q present, W_K missing)."""
+
+    def test_raises_on_partial_attn_keys(self):
+        """If W_Q is present but W_K is missing, should raise ValueError."""
+        from transformer_lens.config.TransformerLensConfig import TransformerLensConfig
+        from transformer_lens.weight_processing import ProcessWeights
+
+        cfg = TransformerLensConfig(
+            n_layers=1,
+            n_heads=2,
+            d_head=4,
+            d_model=8,
+            n_ctx=16,
+            positional_embedding_type="standard",
+        )
+        # Only W_Q present, missing W_K/W_V/W_O
+        state_dict = {
+            "blocks.0.attn.W_Q": torch.randn(2, 8, 4),
+        }
+        with pytest.raises(ValueError, match="Inconsistent attention weights"):
+            ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg)
+
+
+# ============================================================================
+# Tests: __setattr__ whitelist includes optional
+# ============================================================================
+
+
+class TestSetAttrWhitelist:
+    """Test that 'optional' is in the __setattr__ whitelist."""
+
+    def test_optional_set_on_bridge_not_hf_model(self):
+        """Setting optional after set_original_component should stay on bridge."""
+        comp = LinearBridge(name="test")
+        fake_hf = nn.Linear(4, 4, bias=False)
+        comp.set_original_component(fake_hf)
+        comp.optional = True
+        # Should be on the bridge, not on the HF module
+        assert comp.optional is True
+        assert not hasattr(fake_hf, "optional")
+
+
+# ============================================================================
+# Tests: attn_head_labels matches composition scores dimensions
+# ============================================================================
+
+
+class TestAttnHeadLabels:
+    """attn_head_labels should match all_composition_scores dimensions."""
+
+    def test_attn_head_labels_excludes_non_attn_layers(self):
+        """Labels should only cover attention layers, not SSM/linear-attn."""
+        bridge = _make_hybrid_bridge_with_attn()
+        bridge.cfg.n_heads = 2
+        labels = bridge.attn_head_labels
+        # 3 attention layers (0, 1, 2) * 2 heads = 6 labels
+        assert len(labels) == 6
+        assert labels == ["L0H0", "L0H1", "L1H0", "L1H1", "L2H0", "L2H1"]
+        # Should NOT contain L3 (non-attention layer)
+        assert all("L3" not in lbl for lbl in labels)
+
+    def test_all_head_labels_includes_all_layers(self):
+        """all_head_labels should still include every layer."""
+        bridge = _make_hybrid_bridge_with_attn()
+        bridge.cfg.n_heads = 2
+        labels = bridge.all_head_labels
+        # 4 layers * 2 heads = 8 labels
+        assert len(labels) == 8
+
+
+# ============================================================================
+# Tests: hook propagation through optional submodules
+# ============================================================================
+
+
+class TestHookPropagation:
+    """Verify hooks fire on present optional submodules and don't exist on absent ones."""
+
+    def _build_hybrid_model_and_blocks(self):
+        """Build a hybrid model with setup done so hooks are wired."""
+        import copy
+
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=True)
+        template = adapter.make_block_template()
+
+        blocks = []
+        for i, layer in enumerate(model.layers):
+            block = copy.deepcopy(template)
+            block.name = f"layers.{i}"
+            block.set_original_component(layer)
+            setup_submodules(block, adapter, layer)
+            blocks.append(block)
+
+        return model, blocks
+
+    def test_hooks_fire_on_present_optional_submodule(self):
+        """hook_in and hook_out should fire on blocks where the optional submodule exists."""
+        _, blocks = self._build_hybrid_model_and_blocks()
+
+        # Block 0 has 'foo' — its hook_in and hook_out should fire
+        foo_bridge = blocks[0].foo
+        hook_in_fired = []
+        hook_out_fired = []
+
+        foo_bridge.hook_in.add_hook(lambda tensor, hook: hook_in_fired.append(True) or tensor)
+        foo_bridge.hook_out.add_hook(lambda tensor, hook: hook_out_fired.append(True) or tensor)
+
+        # Invoke the bridged submodule directly instead of running the HF layer:
+        # blocks[0].foo is the bridge the hooks were just attached to, so this
+        # single call is expected to trigger hook_in and hook_out exactly once.
+        x = torch.randn(1, 4)
+        _ = blocks[0].foo(x)
+
+        assert len(hook_in_fired) == 1, "hook_in should fire on present optional submodule"
+        assert len(hook_out_fired) == 1, "hook_out should fire on present optional submodule"
+
+    def test_absent_optional_submodule_has_no_hooks(self):
+        """Block 3 should not have 'foo' at all — no hooks to fire."""
+        _, blocks = self._build_hybrid_model_and_blocks()
+
+        # Block 3 lacks 'foo' — it shouldn't be in _modules
+        assert "foo" not in blocks[3]._modules
+        # Attempting to access hooks on the absent submodule should fail
+        assert not hasattr(blocks[3], "foo")
+
+    def test_hooks_on_present_dont_affect_absent(self):
+        """Running all blocks should fire hooks only on blocks with the optional submodule."""
+        model, blocks = self._build_hybrid_model_and_blocks()
+
+        # Track which blocks fire foo.hook_out
+        fired_block_indices = []
+        for i, block in enumerate(blocks):
+            if "foo" in block._modules:
+                block.foo.hook_out.add_hook(
+                    lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor
+                )
+
+        # Run forward through all HF layers
+        x = torch.randn(1, 4)
+        for layer in model.layers:
+            x = layer(x)
+
+        # Hooks should fire on layers 0, 1, 2 (have foo) but not 3
+        assert fired_block_indices == [0, 1, 2]
+
+    def test_universal_submodule_hooks_fire_on_all_blocks(self):
+        """'bar' is universal — its hooks should fire on every block."""
+        model, blocks = self._build_hybrid_model_and_blocks()
+
+        fired_block_indices = []
+        for i, block in enumerate(blocks):
+            block.bar.hook_out.add_hook(
+                lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor
+            )
+
+        x = torch.randn(1, 4)
+        for layer in model.layers:
+            x = layer(x)
+
+        assert fired_block_indices == [0, 1, 2, 3]
+
+
+# ============================================================================
+# Tests: CompositionScores tensor protocol
+# ============================================================================
+
+
+class TestCompositionScoresProtocol:
+    """CompositionScores should behave like a tensor for existing research code."""
+
+    def _make_scores(self):
+        from transformer_lens.model_bridge.composition_scores import CompositionScores
+
+        t = torch.randn(3, 2, 3, 2)
+        return CompositionScores(t, [0, 2, 5], ["L0H0", "L0H1", "L2H0", "L2H1", "L5H0", "L5H1"])
+
+    def test_shape(self):
+        cs = self._make_scores()
+        assert cs.shape == torch.Size([3, 2, 3, 2])
+
+    def test_device_and_dtype(self):
+        cs = self._make_scores()
+        assert cs.device == torch.device("cpu")
+        assert cs.dtype == torch.float32
+
+    def test_indexing_returns_tensor(self):
+        cs = self._make_scores()
+        sliced = cs[0, :, 1, :]
+        assert isinstance(sliced, torch.Tensor)
+        assert sliced.shape == (2, 2)
+
+    def test_torch_isnan(self):
+        """torch.isnan(scores) must work — used in existing integration tests."""
+        cs = self._make_scores()
+        result = torch.isnan(cs)
+        assert isinstance(result, torch.Tensor)
+        assert result.shape == cs.shape
+        assert not result.any()
+
+    def test_torch_where(self):
+        cs = self._make_scores()
+        result = torch.where(cs > 0, cs.scores, torch.zeros_like(cs.scores))
+        assert isinstance(result, torch.Tensor)
+
+    def test_comparison_gt(self):
+        cs = self._make_scores()
+        mask = cs > 0
+        assert isinstance(mask, torch.Tensor)
+        assert mask.shape == cs.shape
+
+    def test_comparison_ne(self):
+        """scores != 0 must return a tensor, not raise RuntimeError."""
+        cs = self._make_scores()
+        result = cs != 0
+        assert isinstance(result, torch.Tensor)
+        assert result.shape == cs.shape
+
+    def test_comparison_eq(self):
+        cs = self._make_scores()
+        result = cs == 0
+        assert isinstance(result, torch.Tensor)
+
+    def test_tensor_method_abs(self):
+        """scores.abs() must work via __getattr__ delegation."""
+        cs = self._make_scores()
+        result = cs.abs()
+        assert isinstance(result, torch.Tensor)
+
+    def test_tensor_method_sum(self):
+        cs = self._make_scores()
+        result = cs.sum()
+        assert isinstance(result, torch.Tensor)
+
+    def test_tensor_method_any(self):
+        cs = self._make_scores()
+        result = cs.any()
+        assert isinstance(result, torch.Tensor)
+
+    def test_chained_indexing_and_method(self):
+        """scores[l1, :, l2, :].abs().sum() — the exact pattern from integration tests."""
+        cs = self._make_scores()
+        result = cs[0, :, 1, :].abs().sum()
+        assert isinstance(result, torch.Tensor)
+        assert result.ndim == 0  # scalar
+
+    def test_metadata_accessible(self):
+        cs = self._make_scores()
+        assert cs.layer_indices == [0, 2, 5]
+        assert len(cs.head_labels) == 6
+
+    def test_repr(self):
+        cs = self._make_scores()
+        r = repr(cs)
+        assert "CompositionScores" in r
+        assert "layer_indices" in r
+
+
+# ============================================================================
+# Tests: get_bridge_params with hybrid blocks
+# ============================================================================
+
+
+class TestGetBridgeParamsHybrid:
+    """get_bridge_params should skip attn keys for non-attention layers."""
+
+    def test_no_attn_keys_for_non_attn_layers(self):
+        """Block 3 has no 'attn' bridge, so no ``blocks.3.attn.*`` params may appear."""
+        from transformer_lens.model_bridge.get_params_util import get_bridge_params
+
+        bridge = _make_hybrid_bridge_with_attn()
+        bridge.cfg.d_vocab = 10
+        bridge.cfg.n_ctx = 8
+        bridge.cfg.d_mlp = 16
+        bridge.cfg.n_heads = 2
+        bridge.cfg.d_head = 2
+
+        # Add minimal embed/unembed so get_bridge_params doesn't fail
+        bridge.embed = nn.Embedding(10, 4)
+        bridge.pos_embed = type("PE", (), {"weight": torch.randn(8, 4)})()
+        bridge.unembed = type(
+            "UE",
+            (),
+            {
+                "weight": torch.randn(10, 4),
+                "b_U": torch.zeros(10),
+            },
+        )()
+
+        params = get_bridge_params(bridge)
+
+        # Blocks 0-2 have 'attn' mapped, but it is a synthetic LinearBridge
+        # wrapping FakeSubmodule whose internal structure may not match the
+        # q/k/v/o layout of a real attention bridge. Attn keys for those
+        # blocks may or may not be present depending on structure, so nothing
+        # is asserted about them; this test only pins the negative case for
+        # block 3.
+
+        # Block 3 has NO 'attn' — must not have any attn keys
+        attn_keys_for_block3 = [k for k in params if k.startswith("blocks.3.attn.")]
+        assert not attn_keys_for_block3, (
+            "Block 3 (non-attention layer) should have no attn keys, "
+            f"but found: {attn_keys_for_block3}"
+        )
diff --git a/transformer_lens/benchmarks/weight_processing.py b/transformer_lens/benchmarks/weight_processing.py
index eeeabbb91..326c53df7 100644
--- a/transformer_lens/benchmarks/weight_processing.py
+++ b/transformer_lens/benchmarks/weight_processing.py
@@ -68,8 +68,16 @@ def benchmark_weight_processing(
                 )
 
             # Check weight centering - writing weights should be approximately centered
-            bridge_w_out = bridge.blocks[0].mlp.W_out
-            reference_w_out = reference_model.blocks[0].mlp.W_out  # type: ignore[union-attr]
+            mlp_blocks = bridge.blocks_with("mlp")
+            if not mlp_blocks:
+                return BenchmarkResult(
+                    name="weight_processing",
+                    severity=BenchmarkSeverity.WARNING,
+                    message="No blocks have MLP submodule — cannot check centering",
+                )
+            _mlp_idx, mlp_block = mlp_blocks[0]
+            bridge_w_out = mlp_block.mlp.W_out
+            reference_w_out = reference_model.blocks[_mlp_idx].mlp.W_out  # type: ignore[union-attr]
 
             bridge_mean = torch.mean(torch.abs(torch.mean(bridge_w_out, dim=-1, keepdim=True)))
             reference_mean = torch.mean(
@@ -141,10 +149,20 @@ def benchmark_weight_sharing(
         if reference_model is not None:
             reference_original = reference_model(test_text, return_type="loss")
 
+            # Find first block with attention (hybrid models may not have attn on block 0)
+            bridge_attn_blocks = bridge.blocks_with("attn")
+            if not bridge_attn_blocks:
+                return BenchmarkResult(
+                    name="weight_sharing",
+                    severity=BenchmarkSeverity.INFO,
+                    message="No blocks have attention submodule — skipping weight sharing check",
+                )
+            bridge_attn_idx, bridge_attn_block = bridge_attn_blocks[0]
+
             # Verify weights are identical before modification
-            bridge_W_V = torch.clone(cast(torch.Tensor, bridge.blocks[0].attn.W_V))
+            bridge_W_V = torch.clone(cast(torch.Tensor, bridge_attn_block.attn.W_V))
             reference_W_V = torch.clone(
-                cast(torch.Tensor, reference_model.blocks[0].attn.W_V)  # type: ignore[union-attr]
+                cast(torch.Tensor, reference_model.blocks[bridge_attn_idx].attn.W_V)  # type: ignore[union-attr]
             )
 
             # Check if models have GQA (different head counts for K/V vs Q)
@@ -188,8 +206,8 @@ def benchmark_weight_sharing(
 
             # Modify weights in both models
             with torch.no_grad():
-                bridge.blocks[0].attn.W_V[0, :, :] = 0  # type: ignore[union-attr,operator]
-                reference_model.blocks[0].attn.W_V[0, :, :] = 0  # type: ignore[union-attr,operator]
+                bridge_attn_block.attn.W_V[0, :, :] = 0  # type: ignore[union-attr,operator]
+                reference_model.blocks[bridge_attn_idx].attn.W_V[0, :, :] = 0  # type: ignore[union-attr,operator]
 
             # Test modified losses
             bridge_modified = bridge(test_text, return_type="loss")
@@ -200,8 +218,8 @@ def benchmark_weight_sharing(
 
             # Restore weights
             with torch.no_grad():
-                bridge.blocks[0].attn.W_V.copy_(bridge_W_V)  # type: ignore[union-attr,operator,arg-type]
-                reference_model.blocks[0].attn.W_V.copy_(reference_W_V)  # type: ignore[union-attr,operator,arg-type]
+                bridge_attn_block.attn.W_V.copy_(bridge_W_V)  # type: ignore[union-attr,operator,arg-type]
+                reference_model.blocks[bridge_attn_idx].attn.W_V.copy_(reference_W_V)  # type: ignore[union-attr,operator,arg-type]
 
             diff = abs(bridge_change - reference_change)
             if diff < atol:
@@ -220,16 +238,26 @@ def benchmark_weight_sharing(
                 )
 
         # No reference model - just verify modification has an effect
-        original_W_V = bridge.blocks[0].attn.W_V.clone()
+        # Find first block with attention (hybrid models may not have attn on block 0)
+        bridge_attn_blocks = bridge.blocks_with("attn")
+        if not bridge_attn_blocks:
+            return BenchmarkResult(
+                name="weight_sharing",
+                severity=BenchmarkSeverity.INFO,
+                message="No blocks have attention submodule — skipping weight sharing check",
+            )
+        _ws_idx, ws_attn_block = bridge_attn_blocks[0]
+
+        original_W_V = ws_attn_block.attn.W_V.clone()
         with torch.no_grad():
-            bridge.blocks[0].attn.W_V[0, :, :] = 0
+            ws_attn_block.attn.W_V[0, :, :] = 0
 
         bridge_modified = bridge(test_text, return_type="loss")
         change = abs(bridge_modified - bridge_original)
 
         # Restore weights
         with torch.no_grad():
-            bridge.blocks[0].attn.W_V.copy_(original_W_V)
+            ws_attn_block.attn.W_V.copy_(original_W_V)
 
         if change < 1e-6:
             return BenchmarkResult(
@@ -274,16 +302,26 @@ def benchmark_weight_modification(
         # Get original loss
         original_loss = bridge(test_text, return_type="loss")
 
+        # Find first block with attention (hybrid models may not have attn on block 0)
+        wm_attn_blocks = bridge.blocks_with("attn")
+        if not wm_attn_blocks:
+            return BenchmarkResult(
+                name="weight_modification",
+                severity=BenchmarkSeverity.INFO,
+                message="No blocks have attention submodule — skipping weight modification check",
+            )
+        _wm_idx, wm_attn_block = wm_attn_blocks[0]
+
         # Modify W_V weights
         with torch.no_grad():
-            original_w_v = bridge.blocks[0].attn.W_V.clone()
+            original_w_v = wm_attn_block.attn.W_V.clone()
             # Check dimensionality - GQA models may have 2D tensors instead of 3D
             if original_w_v.ndim == 3:
                 # Standard 3D tensor: [n_heads, d_model, d_head]
-                bridge.blocks[0].attn.W_V[0, :, :] = 0
+                wm_attn_block.attn.W_V[0, :, :] = 0
             elif original_w_v.ndim == 2:
                 # 2D tensor (e.g., GQA models): [n_heads * d_head, d_model] or similar
-                bridge.blocks[0].attn.W_V[0, :] = 0
+                wm_attn_block.attn.W_V[0, :] = 0
             else:
                 return BenchmarkResult(
                     name="weight_modification",
@@ -298,7 +336,7 @@ def benchmark_weight_modification(
         except Exception as forward_error:
             # Restore weights before reporting error
             with torch.no_grad():
-                bridge.blocks[0].attn.W_V.copy_(original_w_v)
+                wm_attn_block.attn.W_V.copy_(original_w_v)
 
             # Some models (e.g., models with complex attention mechanisms) may have
             # forward pass issues after weight modification. Report as skipped.
@@ -311,7 +349,7 @@ def benchmark_weight_modification(
 
         # Restore weights
         with torch.no_grad():
-            bridge.blocks[0].attn.W_V.copy_(original_w_v)
+            wm_attn_block.attn.W_V.copy_(original_w_v)
 
         # Loss should change
         change = abs(modified_loss - original_loss)
@@ -321,13 +359,17 @@ def benchmark_weight_modification(
             # is separate from the combined QKV weight used in forward.
             # Try MLP weight modification as fallback.
             mlp_fallback_error = None
+            mlp_blocks = bridge.blocks_with("mlp")
+            mlp_block = mlp_blocks[0][1] if mlp_blocks else None
             try:
+                if mlp_block is None:
+                    raise AttributeError("No blocks have mlp submodule")
                 with torch.no_grad():
-                    original_mlp_w = bridge.blocks[0].mlp.out.weight.clone()
-                    bridge.blocks[0].mlp.out.weight[0, :] = 0
+                    original_mlp_w = mlp_block.mlp.out.weight.clone()
+                    mlp_block.mlp.out.weight[0, :] = 0
                 mlp_modified_loss = bridge(test_text, return_type="loss")
                 with torch.no_grad():
-                    bridge.blocks[0].mlp.out.weight.copy_(original_mlp_w)
+                    mlp_block.mlp.out.weight.copy_(original_mlp_w)
                 mlp_change = abs(mlp_modified_loss - original_loss)
                 if mlp_change > 1e-6:
                     return BenchmarkResult(
@@ -516,35 +558,51 @@ def benchmark_attention_output_centering(
                 message="Skipped for tiny/test model (random weights don't center meaningfully)",
             )
 
-        # Check if W_O exists and is accessible
-        if not hasattr(bridge.blocks[0].attn, "W_O"):
+        # Find blocks with attention (hybrid architectures may not have attn on all blocks)
+        attn_blocks = bridge.blocks_with("attn")
+        if not attn_blocks:
             return BenchmarkResult(
                 name="attention_output_centering",
                 severity=BenchmarkSeverity.WARNING,
-                message="W_O not accessible on bridge model",
+                message="No blocks have attention submodule",
                 passed=False,
             )
 
-        w_o = bridge.blocks[0].attn.W_O
-
-        # Compute mean along output dimension
-        mean_abs = torch.mean(torch.abs(torch.mean(w_o, dim=-1))).item()
+        # Check W_O accessibility on first attention block
+        first_idx, first_attn_block = attn_blocks[0]
+        if not hasattr(first_attn_block.attn, "W_O"):
+            return BenchmarkResult(
+                name="attention_output_centering",
+                severity=BenchmarkSeverity.WARNING,
+                message="W_O not accessible on bridge model",
+                passed=False,
+            )
 
+        # Compute mean across all attention blocks
         tolerance = 0.01  # 1% tolerance
+        worst_mean = 0.0
+        for idx, block in attn_blocks:
+            w_o = block.attn.W_O
+            mean_abs = torch.mean(torch.abs(torch.mean(w_o, dim=-1))).item()
+            worst_mean = max(worst_mean, mean_abs)
 
-        if mean_abs < tolerance:
+        n_attn = len(attn_blocks)
+        n_total = len(bridge.blocks)
+        block_info = f" ({n_attn}/{n_total} blocks have attention)" if n_attn < n_total else ""
+
+        if worst_mean < tolerance:
             return BenchmarkResult(
                 name="attention_output_centering",
                 severity=BenchmarkSeverity.INFO,
-                message=f"Attention output centering verified (mean={mean_abs:.6f})",
-                details={"mean": mean_abs, "tolerance": tolerance},
+                message=f"Attention output centering verified (worst_mean={worst_mean:.6f}){block_info}",
+                details={"mean": worst_mean, "tolerance": tolerance, "n_attn_blocks": n_attn},
             )
         else:
             return BenchmarkResult(
                 name="attention_output_centering",
                 severity=BenchmarkSeverity.WARNING,
-                message=f"Attention output weights not well-centered (mean={mean_abs:.6f})",
-                details={"mean": mean_abs, "tolerance": tolerance},
+                message=f"Attention output weights not well-centered (worst_mean={worst_mean:.6f}){block_info}",
+                details={"mean": worst_mean, "tolerance": tolerance, "n_attn_blocks": n_attn},
                 passed=False,
             )
 
@@ -743,8 +801,20 @@ def benchmark_value_bias_folding(
                     },
                 )
 
+        # Find blocks with attention (hybrid architectures may not have attn on all blocks)
+        attn_blocks = bridge.blocks_with("attn")
+        if not attn_blocks:
+            return BenchmarkResult(
+                name="value_bias_folding",
+                severity=BenchmarkSeverity.INFO,
+                message="No blocks have attention submodule (expected for hybrid models without mapped attn)",
+                details={"has_bias": False},
+            )
+
+        first_idx, first_attn_block = attn_blocks[0]
+
         # Check if b_V exists
-        if not hasattr(bridge.blocks[0].attn, "b_V"):
+        if not hasattr(first_attn_block.attn, "b_V"):
             return BenchmarkResult(
                 name="value_bias_folding",
                 severity=BenchmarkSeverity.INFO,
@@ -752,7 +822,7 @@ def benchmark_value_bias_folding(
                 details={"has_bias": False},
             )
 
-        b_v = bridge.blocks[0].attn.b_V
+        b_v = first_attn_block.attn.b_V
 
         if b_v is None:
             return BenchmarkResult(
diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py
index f23703234..372b53bf5 100644
--- a/transformer_lens/model_bridge/bridge.py
+++ b/transformer_lens/model_bridge/bridge.py
@@ -3,7 +3,9 @@
 This module provides the bridge components that wrap remote model components and provide
 a consistent interface for accessing their weights and performing operations.
 """
+import logging
 import re
+import warnings
 from contextlib import contextmanager
 from functools import lru_cache
 from typing import (
@@ -32,10 +34,17 @@
 from transformer_lens.hook_points import HookPoint
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
 from transformer_lens.model_bridge.component_setup import set_original_components
+from transformer_lens.model_bridge.composition_scores import CompositionScores
 from transformer_lens.model_bridge.exceptions import StopAtLayerException
 from transformer_lens.model_bridge.generalized_components.base import (
     GeneralizedComponent,
 )
+from transformer_lens.model_bridge.generalized_components.block import (
+    _BLOCK_INTERNAL_MODULES,
+    _NORM_PREFIXES,
+    _VARIANT_SUBMODULE_SET,
+    VARIANT_SUBMODULE_NAMES,
+)
 from transformer_lens.model_bridge.get_params_util import get_bridge_params
 from transformer_lens.utilities.aliases import resolve_alias
 from transformer_lens.utilities.devices import move_to_and_update_config
@@ -47,6 +56,14 @@
 _BLOCK_PATTERN = re.compile("blocks\\.(\\d+)")
 
 
+def _resolve_attr_path(obj: nn.Module, attr_path: str) -> torch.Tensor:
+    """Follow ``attr_path`` (dot-separated) from ``obj`` via getattr and return the result."""
+    current = obj
+    for segment in attr_path.split("."):
+        current = getattr(current, segment)
+    return cast(torch.Tensor, current)
+
+
 def build_alias_to_canonical_map(hook_dict, prefix=""):
     """Build a mapping from alias hook names to their canonical names.
 
@@ -247,7 +264,7 @@ def _set_processed_weight_attributes(self) -> None:
         if not hasattr(self, "blocks"):
             return
         for block in self.blocks:
-            if not hasattr(block, "attn"):
+            if "attn" not in block._modules:
                 continue
             attn = block.attn
             if not (hasattr(attn, "q") and hasattr(attn.q, "weight")):
@@ -1003,20 +1020,114 @@ def to_single_str_token(self, int_token: int) -> str:
             return str(token[0])
         raise AssertionError("Expected a single string token.")
 
+    def blocks_with(self, submodule: str) -> List[Tuple[int, "GeneralizedComponent"]]:
+        """Find every block on which ``submodule`` is a bridged component.
+
+        Blocks of a hybrid architecture are heterogeneous (attention on some
+        layers, SSM or linear attention on others), so blocks[0] cannot be
+        assumed representative; query the layers of interest with this method.
+
+        Membership is tested against each block's ``_modules`` registry, so a
+        submodule that merely exists on the underlying HF model does not count.
+
+        Args:
+            submodule: Bridged submodule name to look for (e.g., "attn", "mamba").
+
+        Returns:
+            ``(layer_index, block)`` tuples in layer order; an empty list when
+            nothing matches or the bridge has no ``blocks`` attribute.
+        """
+        if hasattr(self, "blocks"):
+            return [(i, blk) for i, blk in enumerate(self.blocks) if submodule in blk._modules]
+        return []
+
+    def stack_params_for(
+        self, submodule: str, attr_path: str, reshape_fn: Optional[Callable] = None
+    ) -> Tuple[List[int], torch.Tensor]:
+        """Gather one parameter from every block owning ``submodule`` and stack it.
+
+        Intended for hybrid architectures in which only a subset of blocks carry
+        attention (or SSM, etc.): the result covers the matching blocks only and
+        is paired with their layer indices so each slice can be traced back.
+
+        Args:
+            submodule: Submodule to filter on (e.g., "attn", "mamba")
+            attr_path: Dot-separated attr path from block (e.g., "attn.W_K")
+            reshape_fn: Optional function applied to each weight before stacking
+
+        Returns:
+            Tuple of (layer_indices, stacked_tensor); row i of the tensor (dim 0)
+            belongs to original layer ``layer_indices[i]``.
+
+        Raises:
+            ValueError: If no blocks have the requested submodule.
+        """
+        found = self.blocks_with(submodule)
+        if not found:
+            raise ValueError(
+                f"No blocks have submodule '{submodule}'. "
+                f"Available submodules can be checked with blocks_with()."
+            )
+
+        def _load(block: "GeneralizedComponent") -> torch.Tensor:
+            # Resolve first, then reshape, one block at a time in layer order.
+            tensor = _resolve_attr_path(block, attr_path)
+            return tensor if reshape_fn is None else reshape_fn(tensor)
+
+        layer_ids = [idx for idx, _ in found]
+        stacked = torch.stack([_load(block) for _, block in found], dim=0)
+        return layer_ids, stacked
+
     def _stack_block_params(
         self, attr_path: str, reshape_fn: Optional[Callable] = None
     ) -> torch.Tensor:
-        """Stack a parameter across all blocks.
+        """Stack a parameter across all blocks, or across matching blocks for hybrids.
+
+        For homogeneous models, returns a tensor of shape [n_layers, ...].
+        For hybrid models where some blocks lack the requested submodule,
+        returns a tensor of shape [n_matching_blocks, ...] and logs a warning
+        about the index mapping on every such call (it is not de-duplicated).
 
         Args:
             attr_path: Dot-separated attribute path from block (e.g., "attn.W_K")
             reshape_fn: Optional function to reshape each weight before stacking
+
+        Note:
+            The guard checks only that the first path segment is a bridged
+            submodule (in _modules). Deeper segments resolve via standard
+            getattr, which may fall through to HF model attributes. This is
+            intentional — properties like W_Q are exposed via __getattr__
+            delegation to the underlying weight tensors.
         """
-        weights = []
-        for block in self.blocks:
-            w = block
-            for attr in attr_path.split("."):
-                w = getattr(w, attr)
+        first_attr = attr_path.split(".")[0]
+        matching_blocks = [
+            (i, block) for i, block in enumerate(self.blocks) if first_attr in block._modules
+        ]
+
+        if len(matching_blocks) == 0:
+            raise AttributeError(
+                f"No blocks have submodule '{first_attr}'. "
+                f"Use bridge.blocks_with('{first_attr}') to check availability."
+            )
+
+        if len(matching_blocks) < len(self.blocks):
+            indices = [i for i, _ in matching_blocks]
+            logging.warning(
+                "Hybrid model: only %d/%d blocks have '%s'. Returning stacked tensor "
+                "for layers %s only. Tensor index i corresponds to original layer "
+                "indices[i], not layer i. For explicit index mapping, use "
+                "bridge.stack_params_for('%s', '%s').",
+                len(matching_blocks),
+                len(self.blocks),
+                first_attr,
+                indices,
+                first_attr,
+                attr_path,
+            )
+
+        weights: List[torch.Tensor] = []
+        for _, block in matching_blocks:
+            w = _resolve_attr_path(block, attr_path)
             if reshape_fn is not None:
                 w = reshape_fn(w)
             weights.append(w)
@@ -1120,12 +1231,46 @@ def W_E(self) -> torch.Tensor:
 
     @property
     def QK(self):
+        """QK circuit as a FactoredMatrix of W_Q and W_K (last two dims transposed).
+
+        On hybrid models, returns the circuit for attention layers only (with
+        a warning about index mapping). For explicit index control, use
+        QK_for_attn_layers() which returns (layer_indices, FactoredMatrix).
+        """
         return FactoredMatrix(self.W_Q, self.W_K.transpose(-2, -1))
 
     @property
     def OV(self):
+        """OV circuit as a FactoredMatrix with W_V as left and W_O as right factor.
+
+        On hybrid models, returns the circuit for attention layers only (with
+        a warning about index mapping). For explicit index control, use
+        OV_for_attn_layers() which returns (layer_indices, FactoredMatrix).
+        """
         return FactoredMatrix(self.W_V, self.W_O)
 
+    def QK_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]:
+        """Build the QK circuit from attention-bearing blocks only (hybrid-safe).
+
+        Returns:
+            ``(layer_indices, FactoredMatrix)``; entry i along the matrix's
+            leading dim belongs to original layer ``layer_indices[i]``.
+        """
+        layer_ids, queries = self.stack_params_for("attn", "attn.W_Q", self._reshape_qkv)
+        keys = self.stack_params_for("attn", "attn.W_K", self._reshape_qkv)[1]
+        return layer_ids, FactoredMatrix(queries, keys.transpose(-2, -1))
+
+    def OV_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]:
+        """Build the OV circuit from attention-bearing blocks only (hybrid-safe).
+
+        Returns:
+            ``(layer_indices, FactoredMatrix)``; entry i along the matrix's
+            leading dim belongs to original layer ``layer_indices[i]``.
+        """
+        layer_ids, values = self.stack_params_for("attn", "attn.W_V", self._reshape_qkv)
+        outputs = self.stack_params_for("attn", "attn.W_O", self._reshape_o)[1]
+        return layer_ids, FactoredMatrix(values, outputs)
+
     # ------------------------------------------------------------------
     # Mechanistic interpretability analysis methods
     # ------------------------------------------------------------------
@@ -1169,18 +1314,56 @@ def tokens_to_residual_directions(
             residual_direction = self.W_U[:, token]
             return residual_direction
 
+    # Output bias attribute names by variant type. Attention uses "b_O"
+    # (a processed-weight alias). SSM/linear-attn variants use their output
+    # projection's bias. Maps variant name → tuple of dotted attribute paths.
+    _VARIANT_OUTPUT_BIAS_ATTRS: Dict[str, tuple] = {
+        "attn": ("b_O",),
+        "linear_attn": ("out_proj.bias",),
+        "mamba": ("out_proj.bias",),
+        "mixer": ("out_proj.bias",),
+        "ssm": ("out_proj.bias",),
+    }
+
+    def _get_block_variant_bias(self, block: "GeneralizedComponent") -> Optional[torch.Tensor]:
+        """Look up the output bias of the variant submodule bridged on ``block``.
+
+        Variants are tried in VARIANT_SUBMODULE_NAMES order, each with the
+        attribute paths listed for it in _VARIANT_OUTPUT_BIAS_ATTRS (e.g. b_O
+        for attention, out_proj.bias for SSM variants). The first path that
+        resolves to a tensor wins; None is returned when nothing resolves.
+        """
+        bridged = block._modules
+        for variant_name in (n for n in VARIANT_SUBMODULE_NAMES if n in bridged):
+            for dotted in self._VARIANT_OUTPUT_BIAS_ATTRS.get(variant_name, ()):
+                value = bridged[variant_name]
+                try:
+                    for part in dotted.split("."):
+                        value = getattr(value, part)
+                except AttributeError:
+                    continue  # this path is absent on the variant; try the next
+                # Non-tensor results (including None) are skipped.
+                if isinstance(value, torch.Tensor):
+                    return value
+        return None
+
     def accumulated_bias(
         self,
         layer: int,
         mlp_input: bool = False,
         include_mlp_biases: bool = True,
     ) -> torch.Tensor:
-        """Sum of attention and MLP output biases up to the input of a given layer.
+        """Sum of biases that contribute to the residual stream up to a given layer.
+
+        Includes output biases from whatever variant submodule each block has
+        (attention, Mamba, linear attention, etc.) plus MLP output biases.
+        For hybrid models, non-attention layers still contribute their variant
+        submodule's output bias to the residual stream.
 
         Args:
             layer: Layer number in [0, n_layers]. 0 means no layers, n_layers means all.
-            mlp_input: If True, include the attention output bias of the target layer
-                (i.e. bias up to the MLP input of that layer).
+            mlp_input: If True, include the variant submodule's output bias of
+                the target layer (i.e. bias up to the MLP input of that layer).
             include_mlp_biases: Whether to include MLP biases. Useful to set False when
                 expanding attn_out into individual heads but keeping mlp_out as-is.
 
@@ -1190,55 +1373,163 @@ def accumulated_bias(
         accumulated = torch.zeros(self.cfg.d_model, device=self.cfg.device)
         for i in range(layer):
             block = self.blocks[i]
-            b_O = getattr(block.attn, "b_O", None)
+            b_O = self._get_block_variant_bias(block)
             if b_O is not None:
                 accumulated = accumulated + b_O
-            if include_mlp_biases:
+            if include_mlp_biases and "mlp" in block._modules:
                 b_out = getattr(block.mlp, "b_out", None)
                 if b_out is not None:
                     accumulated = accumulated + b_out
         if mlp_input:
             assert layer < self.cfg.n_layers, "Cannot include attn_bias from beyond the final layer"
             block = self.blocks[layer]
-            b_O = getattr(block.attn, "b_O", None)
+            b_O = self._get_block_variant_bias(block)
             if b_O is not None:
                 accumulated = accumulated + b_O
         return accumulated
 
-    def all_composition_scores(self, mode: str) -> torch.Tensor:
-        """Composition scores for all pairs of heads.
+    def all_composition_scores(self, mode: str) -> CompositionScores:
+        """Composition scores for all pairs of attention heads.
+
+        Returns a ``CompositionScores`` containing the scores tensor, the
+        original layer indices, and human-readable head labels.  The scores
+        tensor has shape (n_attn_layers, n_heads, n_attn_layers, n_heads) and
+        is upper triangular on the layer axes.
 
-        Returns an (n_layers, n_heads, n_layers, n_heads) tensor that is upper
-        triangular on the layer axes (a head can only compose with later heads).
+        For hybrid models, only attention layers are included; ``layer_indices``
+        maps tensor position *i* back to the original layer number.  Raises
+        ``ValueError`` when no block has attention or ``mode`` is not Q/K/V.
 
         See https://transformer-circuits.pub/2021/framework/index.html
 
         Args:
             mode: One of "Q", "K", "V" — which composition type to compute.
         """
-        left = self.OV
+        # Single blocks_with call — all weight stacking uses these same blocks
+        attn_blocks = self.blocks_with("attn")
+        if not attn_blocks:
+            raise ValueError("No attention layers found — cannot compute composition scores.")
+
+        indices = [idx for idx, _ in attn_blocks]
+        blocks_list = [block for _, block in attn_blocks]
+
+        def _stack(attr_path: str, reshape_fn: Optional[Callable] = None) -> torch.Tensor:
+            weights: List[torch.Tensor] = []
+            for block in blocks_list:
+                w = _resolve_attr_path(block, attr_path)
+                if reshape_fn is not None:
+                    w = reshape_fn(w)
+                weights.append(w)
+            return torch.stack(weights, dim=0)
+
+        W_V = _stack("attn.W_V", self._reshape_qkv)
+        W_O = _stack("attn.W_O", self._reshape_o)
+        left = FactoredMatrix(W_V, W_O)
+
         if mode == "Q":
-            right = self.QK
+            W_Q = _stack("attn.W_Q", self._reshape_qkv)
+            W_K = _stack("attn.W_K", self._reshape_qkv)
+            right = FactoredMatrix(W_Q, W_K.transpose(-2, -1))
         elif mode == "K":
-            right = self.QK.T
+            W_Q = _stack("attn.W_Q", self._reshape_qkv)
+            W_K = _stack("attn.W_K", self._reshape_qkv)
+            right = FactoredMatrix(W_Q, W_K.transpose(-2, -1)).T
         elif mode == "V":
-            right = self.OV
+            right = left
         else:
             raise ValueError(f"mode must be one of ['Q', 'K', 'V'] not {mode}")
 
         scores = utils.composition_scores(left, right, broadcast_dims=True)
-        mask = (
-            torch.arange(self.cfg.n_layers, device=self.cfg.device)[:, None, None, None]
-            < torch.arange(self.cfg.n_layers, device=self.cfg.device)[None, None, :, None]
-        )
+        # Build the mask on the scores' own device; cfg.device is not guaranteed to match it.
+        idx_tensor = torch.arange(len(indices), device=scores.device)
+        mask = idx_tensor[:, None, None, None] < idx_tensor[None, None, :, None]
         scores = torch.where(mask, scores, torch.zeros_like(scores))
-        return scores
+
+        labels = [f"L{l}H{h}" for l in indices for h in range(self.cfg.n_heads)]
+        return CompositionScores(scores=scores, layer_indices=indices, head_labels=labels)
+
+    def composition_layer_indices(self) -> List[int]:
+        """List the original layer numbers of the blocks that have attention.
+
+        Position i along a layer axis of all_composition_scores() output
+        corresponds to entry i here: [0, 1, ..., n-1] when every block has
+        attention, only the attention layers' indices on hybrid models.
+        """
+        return [layer for layer, _block in self.blocks_with("attn")]
+
+    def block_hooks(self, layer_idx: int) -> List[str]:
+        """List the hook point names registered under one block.
+
+        Handy on hybrid architectures where hookable submodules differ by layer
+        (e.g., hook_q/hook_k on attention layers, hook_in_proj/hook_conv on SSM ones).
+
+        Args:
+            layer_idx: Layer index to inspect.
+
+        Returns:
+            Sorted block-relative names, e.g. ["attn.hook_q", "hook_in", "hook_out"].
+        """
+        prefix = f"blocks.{layer_idx}."
+        local_names = [hook[len(prefix) :] for hook in self.hook_dict if hook.startswith(prefix)]
+        return sorted(local_names)
+
+    def block_submodules(self, layer_idx: int) -> List[str]:
+        """Name the bridged submodules registered on one block.
+
+        Args:
+            layer_idx: Layer index to inspect.
+
+        Returns:
+            Submodule names in registration order, excluding entries of
+            _BLOCK_INTERNAL_MODULES (e.g., ["ln1", "ln2", "attn", "mlp"]).
+        """
+        return [n for n in self.blocks[layer_idx]._modules if n not in _BLOCK_INTERNAL_MODULES]
+
+    def layer_types(self) -> List[str]:
+        """Describe each block by the bridged submodules it carries.
+
+        A block's label joins, with "+", its variant submodules (those named in
+        ``generalized_components.block.VARIANT_SUBMODULE_NAMES``, in that
+        sequence's order) followed by its remaining submodules sorted
+        alphabetically. Block-internal modules and norm-prefixed names are left out.
+
+        Homogeneous models yield the same label for every block.
+
+        Returns:
+            One label per block, e.g. ["attn+mlp", "ssm+mlp", "attn+mlp", ...];
+            a block with nothing to report is labelled "unknown".
+        """
+
+        def _is_universal(name: str) -> bool:
+            # Anything that is not a variant, not block-internal, and not a norm.
+            if name in _VARIANT_SUBMODULE_SET or name in _BLOCK_INTERNAL_MODULES:
+                return False
+            return not name.startswith(_NORM_PREFIXES)
+
+        labels: List[str] = []
+        for block in self.blocks:
+            registered = block._modules
+            variant_names = [v for v in VARIANT_SUBMODULE_NAMES if v in registered]
+            parts = variant_names + sorted(filter(_is_universal, registered))
+            labels.append("+".join(parts) if parts else "unknown")
+        return labels
 
     @property
     def all_head_labels(self) -> list[str]:
         """Human-readable labels for all attention heads, e.g. ['L0H0', 'L0H1', ...]."""
         return [f"L{l}H{h}" for l in range(self.cfg.n_layers) for h in range(self.cfg.n_heads)]
 
+    @property
+    def attn_head_labels(self) -> list[str]:
+        """Head labels restricted to attention layers (aligned with all_composition_scores()).
+
+        Matches all_head_labels for homogeneous models; hybrid models omit non-attention layers.
+        """
+        labels: list[str] = []
+        for layer in self.composition_layer_indices():
+            labels.extend(f"L{layer}H{head}" for head in range(self.cfg.n_heads))
+        return labels
+
     def parameters(self, recurse: bool = True) -> Iterator[nn.Parameter]:
         """Returns parameters following standard PyTorch semantics.
 
diff --git a/transformer_lens/model_bridge/component_setup.py b/transformer_lens/model_bridge/component_setup.py
index d32f787df..79d2abc2a 100644
--- a/transformer_lens/model_bridge/component_setup.py
+++ b/transformer_lens/model_bridge/component_setup.py
@@ -2,8 +2,11 @@
 
 "Component setup utilities for creating and configuring bridged components."
 import copy
+import logging
 from typing import TYPE_CHECKING, Any, cast
 
+logger = logging.getLogger(__name__)
+
 import torch.nn as nn
 
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
@@ -67,6 +70,7 @@ def setup_submodules(
         architecture_adapter: The architecture adapter
         original_model: The original model to get components from
     """
+    skipped_optional: list[str] = []
     for module_name, submodule in component.submodules.items():
         if submodule.is_list_item:
             if submodule.name is None:
@@ -95,9 +99,39 @@ def setup_submodules(
                     original_subcomponent = original_model
                 else:
                     remote_path = submodule.name
-                    original_subcomponent = architecture_adapter.get_remote_component(
-                        original_model, remote_path
-                    )
+                    is_optional = getattr(submodule, "optional", False)
+                    # Fast path: if the first path segment is absent, skip
+                    # immediately. This catches the common hybrid case (e.g.,
+                    # "self_attn" absent on an SSM layer) without entering
+                    # get_remote_component.
+                    first_segment = remote_path.split(".")[0]
+                    if is_optional and not hasattr(original_model, first_segment):
+                        logger.debug(
+                            "Optional submodule '%s' (path '%s') absent on %s — skipping",
+                            module_name,
+                            remote_path,
+                            getattr(component, "name", "unknown"),
+                        )
+                        skipped_optional.append(module_name)
+                        continue  # hybrid layer lacks this submodule; skip binding
+                    # Full resolution — also catches deeper path failures
+                    # (e.g., "self_attn.q_proj" where self_attn exists as a
+                    # stub but q_proj is missing).
+                    try:
+                        original_subcomponent = architecture_adapter.get_remote_component(
+                            original_model, remote_path
+                        )
+                    except AttributeError:
+                        if is_optional:
+                            logger.debug(
+                                "Optional submodule '%s' (path '%s') partially absent on %s — skipping",
+                                module_name,
+                                remote_path,
+                                getattr(component, "name", "unknown"),
+                            )
+                            skipped_optional.append(module_name)
+                            continue
+                        raise
                 submodule.set_original_component(original_subcomponent)
                 setup_submodules(submodule, architecture_adapter, original_subcomponent)
                 if submodule.name is not None:
@@ -111,6 +145,12 @@ def setup_submodules(
             if not submodule.is_list_item and submodule.name is not None:
                 component.real_components[module_name] = (submodule.name, submodule)
 
+    # Remove skipped optional submodules from the template so that
+    # architecture_adapter traversal code (which reads .submodules) doesn't
+    # find them and try to resolve against the HF model.
+    for name in skipped_optional:
+        component.submodules.pop(name, None)
+
 
 def setup_components(
     components: dict[str, Any],
diff --git a/transformer_lens/model_bridge/composition_scores.py b/transformer_lens/model_bridge/composition_scores.py
new file mode 100644
index 000000000..9073fddb2
--- /dev/null
+++ b/transformer_lens/model_bridge/composition_scores.py
@@ -0,0 +1,153 @@
+"""CompositionScores — tensor-like container for composition score results."""
+from typing import List
+
+import torch
+
+
+class CompositionScores:
+    """Composition scores bundled with layer-index metadata.
+
+    Behaves like a tensor for backward compatibility: indexing, ``.shape``,
+    comparisons, basic arithmetic (``+ - * /``, unary ``-``, ``abs()``) and
+    ``torch.*`` namespace functions all operate on the underlying scores
+    tensor and return plain tensors, so the metadata is not propagated.
+    Other tensor attributes and methods (``.sum()``, ``.any()``, ...) are
+    forwarded by ``__getattr__``. The additional ``layer_indices`` and
+    ``head_labels`` attributes provide metadata that prevents silent
+    misinterpretation of indices on hybrid models.
+
+    For hybrid models, the scores tensor has shape
+    (n_attn_layers, n_heads, n_attn_layers, n_heads) where n_attn_layers
+    may be less than n_layers. ``layer_indices`` maps tensor position i
+    to the original layer number.
+
+    Attributes:
+        scores: Upper-triangular composition score tensor.
+        layer_indices: Original layer numbers for each position in scores.
+            E.g., [0, 2, 5] means position 0 = layer 0, position 1 = layer 2, etc.
+        head_labels: Labels like ["L0H0", "L0H1", "L2H0", ...] matching scores dims.
+    """
+
+    def __init__(self, scores: torch.Tensor, layer_indices: List[int], head_labels: List[str]):
+        self.scores = scores
+        self.layer_indices = layer_indices
+        self.head_labels = head_labels
+
+    # --- Tensor protocol ---
+
+    @staticmethod
+    def _unwrap(value):
+        """Return ``value`` with every CompositionScores replaced by its tensor.
+
+        Lists, tuples and dicts are searched recursively so sequence arguments
+        such as ``torch.stack([a, b])`` are unwrapped too. Lists and tuples that
+        hold no CompositionScores are returned unchanged (same object).
+        """
+        if isinstance(value, CompositionScores):
+            return value.scores
+        if isinstance(value, (list, tuple)):
+            items = [CompositionScores._unwrap(item) for item in value]
+            if all(new is old for new, old in zip(items, value)):
+                return value
+            return items if isinstance(value, list) else tuple(items)
+        if isinstance(value, dict):
+            return {key: CompositionScores._unwrap(item) for key, item in value.items()}
+        return value
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        """Delegate torch.* calls (torch.isnan, torch.where, etc.) to .scores."""
+        # Re-invoking func with a still-wrapped argument would dispatch straight
+        # back here, so nested containers must be unwrapped as well.
+        plain_args = cls._unwrap(tuple(args))
+        plain_kwargs = cls._unwrap(dict(kwargs or {}))
+        return func(*plain_args, **plain_kwargs)
+
+    @property
+    def shape(self) -> torch.Size:
+        return self.scores.shape
+
+    @property
+    def device(self) -> torch.device:
+        return self.scores.device
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.scores.dtype
+
+    # Python 3 automatically sets __hash__ = None when __eq__ is defined,
+    # making instances unhashable. No explicit __hash__ needed.
+
+    def __getitem__(self, key):
+        return self.scores[key]
+
+    def __getattr__(self, name):
+        # Delegate tensor methods (.abs(), .sum(), .any(), etc.) to .scores.
+        # Guard against infinite recursion during pickling/unpickling where
+        # self.scores may not exist yet.
+        try:
+            scores = object.__getattribute__(self, "scores")
+        except AttributeError:
+            raise AttributeError(name) from None
+        return getattr(scores, name)
+
+    # --- Arithmetic ---
+    # Operators are looked up on the type, so __getattr__ never sees them;
+    # without these, ``scores * 2`` or ``1 - scores`` would raise TypeError.
+
+    def __add__(self, other):
+        return self.scores + self._unwrap(other)
+
+    def __radd__(self, other):
+        return other + self.scores
+
+    def __sub__(self, other):
+        return self.scores - self._unwrap(other)
+
+    def __rsub__(self, other):
+        return other - self.scores
+
+    def __mul__(self, other):
+        return self.scores * self._unwrap(other)
+
+    def __rmul__(self, other):
+        return other * self.scores
+
+    def __truediv__(self, other):
+        return self.scores / self._unwrap(other)
+
+    def __rtruediv__(self, other):
+        return other / self.scores
+
+    def __neg__(self):
+        return -self.scores
+
+    def __abs__(self):
+        return abs(self.scores)
+
+    # --- Comparisons (elementwise, like torch.Tensor) ---
+
+    def __gt__(self, other):
+        return self.scores > self._unwrap(other)
+
+    def __lt__(self, other):
+        return self.scores < self._unwrap(other)
+
+    def __ge__(self, other):
+        return self.scores >= self._unwrap(other)
+
+    def __le__(self, other):
+        return self.scores <= self._unwrap(other)
+
+    def __eq__(self, other):
+        return self.scores == self._unwrap(other)
+
+    def __ne__(self, other):
+        return self.scores != self._unwrap(other)
+
+    def __repr__(self) -> str:
+        return (
+            f"CompositionScores(shape={self.shape}, "
+            f"layer_indices={self.layer_indices}, "
+            f"n_head_labels={len(self.head_labels)})"
+        )
diff --git a/transformer_lens/model_bridge/generalized_components/base.py b/transformer_lens/model_bridge/generalized_components/base.py
index db270a644..1af033efb 100644
--- a/transformer_lens/model_bridge/generalized_components/base.py
+++ b/transformer_lens/model_bridge/generalized_components/base.py
@@ -34,6 +34,7 @@ def __init__(
         submodules: Optional[Dict[str, "GeneralizedComponent"]] = None,
         conversion_rule: Optional[BaseTensorConversion] = None,
         hook_alias_overrides: Optional[Dict[str, str]] = None,
+        optional: bool = False,
     ):
         """Initialize the generalized component.
 
@@ -45,12 +46,17 @@ def __init__(
             hook_alias_overrides: Optional dictionary to override default hook aliases.
                 For example, {"hook_attn_out": "ln1_post.hook_out"} will make hook_attn_out
                 point to ln1_post.hook_out instead of the default value in self.hook_aliases.
+            optional: If True, this entire subtree may be absent on some layers.
+                When the remote model lacks this component, setup will skip it
+                cleanly instead of raising AttributeError. Used for hybrid
+                architectures where layers have structurally different submodules.
         """
         super().__init__()
         self.name = name
         self.config = config
         self.submodules = submodules or {}
         self.conversion_rule = conversion_rule
+        self.optional = optional
         self._hook_registry: Dict[str, HookPoint] = {}
         self._hook_alias_registry: Dict[str, Union[str, List[str]]] = {}
         self._property_alias_registry: Dict[str, str] = {}
@@ -337,6 +343,7 @@ def __setattr__(self, name: str, value: Any) -> None:
             "conversion_rule",
             "compatibility_mode",
             "disable_warnings",
+            "optional",
         ]:
             super().__setattr__(name, value)
             return
diff --git a/transformer_lens/model_bridge/generalized_components/block.py b/transformer_lens/model_bridge/generalized_components/block.py
index 48147a9d2..1005fd4f2 100644
--- a/transformer_lens/model_bridge/generalized_components/block.py
+++ b/transformer_lens/model_bridge/generalized_components/block.py
@@ -15,6 +15,21 @@
     GeneralizedComponent,
 )
 
+# Submodule names that mark a block's layer-type variant in hybrid architectures.
+# layer_types() lists whichever of these a block has, in this tuple's order, so
+# labels stay deterministic even when a block carries more than one variant.
+# Also intended for _get_block_variant_bias() (bias accumulation).  Adapters
+# that introduce a new variant type should add its submodule name here.
+VARIANT_SUBMODULE_NAMES: tuple[str, ...] = ("attn", "linear_attn", "mamba", "mixer", "ssm")
+_VARIANT_SUBMODULE_SET: frozenset[str] = frozenset(VARIANT_SUBMODULE_NAMES)
+
+# Block-internal modules (hook points and the wrapped HF component) that
+# layer_types() never reports: they are infrastructure, not user-facing submodules.
+_BLOCK_INTERNAL_MODULES: frozenset[str] = frozenset({"hook_in", "hook_out", "_original_component"})
+
+# Name prefixes (matched with str.startswith) that layer_types() omits as normalization.
+_NORM_PREFIXES: tuple[str, ...] = ("ln", "layer_norm", "norm", "rms")
+
 
 class BlockBridge(GeneralizedComponent):
     """Bridge component for transformer blocks.
diff --git a/transformer_lens/model_bridge/get_params_util.py b/transformer_lens/model_bridge/get_params_util.py
index f63ab9386..f27e3a97f 100644
--- a/transformer_lens/model_bridge/get_params_util.py
+++ b/transformer_lens/model_bridge/get_params_util.py
@@ -1,8 +1,11 @@
 """Utility function for getting model parameters in TransformerLens format."""
+import logging
 from typing import Dict
 
 import torch
 
+logger = logging.getLogger(__name__)
+
 
 def _get_n_kv_heads(cfg) -> int:
     """Resolve the number of key/value heads, falling back to n_heads."""
@@ -36,14 +39,17 @@ def _get_or_create_bias(bias, n_heads: int, d_head: int, device, dtype) -> torch
 def get_bridge_params(bridge) -> Dict[str, torch.Tensor]:
     """Access to model parameters in the format expected by SVDInterpreter.
 
-    For missing weights, returns zero tensors of appropriate shape instead of raising exceptions.
-    This ensures compatibility across different model architectures.
+    For hybrid architectures, only layers with attention get attention keys
+    (W_Q, W_K, etc.). Non-attention layers (SSM, linear-attention) are skipped
+    rather than filled with zeros — this prevents downstream consumers like
+    SVDInterpreter from treating synthetic zeros as real weights.
 
     Args:
         bridge: TransformerBridge instance
 
     Returns:
-        dict: Dictionary of parameter tensors with TransformerLens naming convention
+        dict: Dictionary of parameter tensors with TransformerLens naming convention.
+            For hybrid models, attention keys only exist for layers that have attention.
 
     Raises:
         ValueError: If configuration is inconsistent (e.g., cfg.n_layers != len(blocks))
@@ -51,22 +57,15 @@ def get_bridge_params(bridge) -> Dict[str, torch.Tensor]:
     params_dict = {}
 
     def _get_device_dtype():
-        device = bridge.cfg.device if hasattr(bridge.cfg, "device") else torch.device("cpu")
+        """Infer device/dtype from the first available model parameter."""
+        device = getattr(bridge.cfg, "device", None) or torch.device("cpu")
         dtype = torch.float32
         try:
-            device = bridge.embed.weight.device
-            dtype = bridge.embed.weight.dtype
-        except AttributeError:
-            try:
-                device = bridge.pos_embed.weight.device
-                dtype = bridge.pos_embed.weight.dtype
-            except AttributeError:
-                if len(bridge.blocks) > 0:
-                    try:
-                        device = bridge.blocks[0].attn.q.weight.device
-                        dtype = bridge.blocks[0].attn.q.weight.dtype
-                    except AttributeError:
-                        pass
+            first_param = next(bridge.parameters())
+            device = first_param.device
+            dtype = first_param.dtype
+        except (StopIteration, TypeError, AttributeError):
+            pass
         return (device, dtype)
 
     try:
@@ -89,72 +88,59 @@ def _get_device_dtype():
                 f"Configuration mismatch: cfg.n_layers={bridge.cfg.n_layers} but only {len(bridge.blocks)} blocks found. Layer {layer_idx} does not exist."
             )
         block = bridge.blocks[layer_idx]
+
+        # Only extract attention params from blocks that have attention.
+        # Non-attention layers (SSM, linear-attention) are skipped entirely
+        # rather than filled with zeros — this prevents consumers like
+        # SVDInterpreter from treating synthetic zeros as real weights.
         try:
-            w_q = block.attn.q.weight
-            w_k = block.attn.k.weight
-            w_v = block.attn.v.weight
-            w_o = block.attn.o.weight
-            if w_q.shape == (bridge.cfg.d_model, bridge.cfg.d_model):
-                d_head = bridge.cfg.d_model // bridge.cfg.n_heads
-                w_q = w_q.reshape(bridge.cfg.n_heads, bridge.cfg.d_model, d_head)
-                w_o = w_o.reshape(bridge.cfg.n_heads, d_head, bridge.cfg.d_model)
-                device, dtype = _get_device_dtype()
-                w_k = _reshape_kv_weight(w_k, bridge.cfg, device, dtype)
-                w_v = _reshape_kv_weight(w_v, bridge.cfg, device, dtype)
-            params_dict[f"blocks.{layer_idx}.attn.W_Q"] = w_q
-            params_dict[f"blocks.{layer_idx}.attn.W_K"] = w_k
-            params_dict[f"blocks.{layer_idx}.attn.W_V"] = w_v
-            params_dict[f"blocks.{layer_idx}.attn.W_O"] = w_o
-            device, dtype = _get_device_dtype()
-            n_kv_heads = _get_n_kv_heads(bridge.cfg)
-            params_dict[f"blocks.{layer_idx}.attn.b_Q"] = _get_or_create_bias(
-                block.attn.q.bias, bridge.cfg.n_heads, bridge.cfg.d_head, device, dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.b_K"] = _get_or_create_bias(
-                block.attn.k.bias, n_kv_heads, bridge.cfg.d_head, device, dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.b_V"] = _get_or_create_bias(
-                block.attn.v.bias, n_kv_heads, bridge.cfg.d_head, device, dtype
-            )
-            if block.attn.o.bias is not None:
-                params_dict[f"blocks.{layer_idx}.attn.b_O"] = block.attn.o.bias
-            else:
+            has_attn = "attn" in block._modules
+        except (TypeError, AttributeError):
+            # Mock objects or non-nn.Module blocks: fall back to hasattr
+            has_attn = hasattr(block, "attn")
+        if has_attn:
+            try:
+                w_q = block.attn.q.weight
+                w_k = block.attn.k.weight
+                w_v = block.attn.v.weight
+                w_o = block.attn.o.weight
+                if w_q.shape == (bridge.cfg.d_model, bridge.cfg.d_model):
+                    d_head = bridge.cfg.d_model // bridge.cfg.n_heads
+                    w_q = w_q.reshape(bridge.cfg.n_heads, bridge.cfg.d_model, d_head)
+                    w_o = w_o.reshape(bridge.cfg.n_heads, d_head, bridge.cfg.d_model)
+                    device, dtype = _get_device_dtype()
+                    w_k = _reshape_kv_weight(w_k, bridge.cfg, device, dtype)
+                    w_v = _reshape_kv_weight(w_v, bridge.cfg, device, dtype)
+                params_dict[f"blocks.{layer_idx}.attn.W_Q"] = w_q
+                params_dict[f"blocks.{layer_idx}.attn.W_K"] = w_k
+                params_dict[f"blocks.{layer_idx}.attn.W_V"] = w_v
+                params_dict[f"blocks.{layer_idx}.attn.W_O"] = w_o
                 device, dtype = _get_device_dtype()
-                params_dict[f"blocks.{layer_idx}.attn.b_O"] = torch.zeros(
-                    bridge.cfg.d_model, device=device, dtype=dtype
+                n_kv_heads = _get_n_kv_heads(bridge.cfg)
+                params_dict[f"blocks.{layer_idx}.attn.b_Q"] = _get_or_create_bias(
+                    block.attn.q.bias, bridge.cfg.n_heads, bridge.cfg.d_head, device, dtype
+                )
+                params_dict[f"blocks.{layer_idx}.attn.b_K"] = _get_or_create_bias(
+                    block.attn.k.bias, n_kv_heads, bridge.cfg.d_head, device, dtype
+                )
+                params_dict[f"blocks.{layer_idx}.attn.b_V"] = _get_or_create_bias(
+                    block.attn.v.bias, n_kv_heads, bridge.cfg.d_head, device, dtype
+                )
+                if block.attn.o.bias is not None:
+                    params_dict[f"blocks.{layer_idx}.attn.b_O"] = block.attn.o.bias
+                else:
+                    device, dtype = _get_device_dtype()
+                    params_dict[f"blocks.{layer_idx}.attn.b_O"] = torch.zeros(
+                        bridge.cfg.d_model, device=device, dtype=dtype
+                    )
+            except AttributeError as e:
+                logger.debug(
+                    "Block %d has 'attn' in _modules but attention params could not "
+                    "be extracted (missing q/k/v/o?): %s — skipping attention weights "
+                    "for this layer",
+                    layer_idx,
+                    e,
                 )
-        except AttributeError:
-            device, dtype = _get_device_dtype()
-            expected_qkv_shape = (bridge.cfg.n_heads, bridge.cfg.d_model, bridge.cfg.d_head)
-            expected_o_shape = (bridge.cfg.n_heads, bridge.cfg.d_head, bridge.cfg.d_model)
-            expected_q_bias_shape = (bridge.cfg.n_heads, bridge.cfg.d_head)
-            expected_o_bias_shape = (bridge.cfg.d_model,)
-            n_kv_heads = _get_n_kv_heads(bridge.cfg)
-            expected_kv_bias_shape = (n_kv_heads, bridge.cfg.d_head)
-            params_dict[f"blocks.{layer_idx}.attn.W_Q"] = torch.zeros(
-                *expected_qkv_shape, device=device, dtype=dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.W_K"] = torch.zeros(
-                *expected_qkv_shape, device=device, dtype=dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.W_V"] = torch.zeros(
-                *expected_qkv_shape, device=device, dtype=dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.W_O"] = torch.zeros(
-                *expected_o_shape, device=device, dtype=dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.b_Q"] = torch.zeros(
-                *expected_q_bias_shape, device=device, dtype=dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.b_K"] = torch.zeros(
-                *expected_kv_bias_shape, device=device, dtype=dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.b_V"] = torch.zeros(
-                *expected_kv_bias_shape, device=device, dtype=dtype
-            )
-            params_dict[f"blocks.{layer_idx}.attn.b_O"] = torch.zeros(
-                *expected_o_bias_shape, device=device, dtype=dtype
-            )
         try:
             mlp_in = getattr(block.mlp, "in", None) or getattr(block.mlp, "input", None)
             if mlp_in is None:
diff --git a/transformer_lens/weight_processing.py b/transformer_lens/weight_processing.py
index c05e8706a..6f0489f21 100644
--- a/transformer_lens/weight_processing.py
+++ b/transformer_lens/weight_processing.py
@@ -1698,6 +1698,21 @@ def refactor_factored_attn_matrices(
             b_V_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_V", adapter)
             b_O_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_O", adapter)
 
+            # Skip layers without attention weights (hybrid architectures where
+            # some layers are SSM/linear-attention and lack Q/K/V/O entirely).
+            # Other weight-processing loops (center_writing_weights, fold_value_biases,
+            # fold_layer_norm) already guard with `if key in state_dict:` checks.
+            if W_Q_key not in state_dict:
+                continue
+            # All four weight matrices must be present if Q is present
+            for _required_key in [W_K_key, W_V_key, W_O_key]:
+                if _required_key not in state_dict:
+                    raise ValueError(
+                        f"Inconsistent attention weights at layer {l}: "
+                        f"'{W_Q_key}' found but '{_required_key}' missing. "
+                        f"All of W_Q, W_K, W_V, W_O must be present together."
+                    )
+
             # W_QK = W_Q @ W_K.T
             # Concatenate biases to make a d_model+1 input dimension
             W_Q = ProcessWeights.convert_tensor_to_tl_format(

From 72d57a2b80e04f6fb8fa460498ebefa7921556c2 Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Tue, 14 Apr 2026 20:23:05 -0500
Subject: [PATCH 2/8] Comment cleanup

---
 tests/unit/test_optional_submodule.py         | 791 +++++-------------
 .../benchmarks/weight_processing.py           |   3 -
 transformer_lens/model_bridge/bridge.py       | 180 +---
 .../model_bridge/component_setup.py           |  23 +-
 .../model_bridge/composition_scores.py        |  33 +-
 .../generalized_components/base.py            |   5 +-
 .../generalized_components/block.py           |  12 +-
 .../model_bridge/get_params_util.py           |  26 +-
 transformer_lens/weight_processing.py         |   7 +-
 9 files changed, 252 insertions(+), 828 deletions(-)

diff --git a/tests/unit/test_optional_submodule.py b/tests/unit/test_optional_submodule.py
index 4bc44e6bc..168ad0ce1 100644
--- a/tests/unit/test_optional_submodule.py
+++ b/tests/unit/test_optional_submodule.py
@@ -1,9 +1,7 @@
-"""Unit tests for the optional submodule framework.
+"""Tests for optional submodule support in hybrid architectures."""
 
-Tests the `optional` flag on GeneralizedComponent and the `blocks_with()`
-capability query API on TransformerBridge, which together enable hybrid
-architectures where layers have structurally different submodules.
-"""
+import copy
+import logging
 
 import pytest
 import torch
@@ -17,14 +15,10 @@
 from transformer_lens.model_bridge.generalized_components.block import BlockBridge
 from transformer_lens.model_bridge.generalized_components.linear import LinearBridge
 
-# ============================================================================
-# Fixtures: synthetic hybrid model
-# ============================================================================
+# -- Synthetic hybrid model fixtures ------------------------------------------
 
 
 class FakeSubmodule(nn.Module):
-    """A simple nn.Linear submodule for testing."""
-
     def __init__(self, dim: int = 4):
         super().__init__()
         self.proj = nn.Linear(dim, dim, bias=False)
@@ -34,7 +28,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class HybridLayer(nn.Module):
-    """A layer that conditionally has a 'foo' submodule."""
+    """Layer that conditionally has a 'foo' submodule."""
 
     def __init__(self, has_foo: bool, dim: int = 4):
         super().__init__()
@@ -49,7 +43,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class HybridModel(nn.Module):
-    """Model with 4 layers: layers 0-2 have 'foo', layer 3 does not."""
+    """4 layers: 0-2 have 'foo', layer 3 does not."""
 
     def __init__(self, dim: int = 4):
         super().__init__()
@@ -62,8 +56,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class MinimalAdapter(ArchitectureAdapter):
-    """Minimal adapter for testing optional submodule setup."""
-
     def __init__(self, optional: bool = True):
         self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})()
         self.component_mapping = {}
@@ -79,706 +71,372 @@ def make_block_template(self) -> BlockBridge:
         )
 
 
-# ============================================================================
-# Tests: optional flag on GeneralizedComponent
-# ============================================================================
+class AttnAdapter(ArchitectureAdapter):
+    """Adapter using 'attn' as the optional submodule name (matches real adapters)."""
 
+    def __init__(self):
+        self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})()
+        self.component_mapping = {}
 
-class TestOptionalFlag:
-    """Test that the optional flag is properly stored and defaults to False."""
+    def make_block_template(self) -> BlockBridge:
+        return BlockBridge(
+            name="layers",
+            submodules={
+                "bar": LinearBridge(name="bar"),
+                "attn": LinearBridge(name="foo", optional=True),
+            },
+        )
 
-    def test_default_is_false(self):
-        comp = GeneralizedComponent(name="test")
-        assert comp.optional is False
 
-    def test_optional_true(self):
-        comp = GeneralizedComponent(name="test", optional=True)
-        assert comp.optional is True
+# -- Bridge construction helpers ----------------------------------------------
 
-    def test_optional_false_explicit(self):
-        comp = GeneralizedComponent(name="test", optional=False)
-        assert comp.optional is False
+
+def _setup_blocks(model, adapter):
+    """Deep-copy the block template per layer, run setup_submodules, return the blocks."""
+    template = adapter.make_block_template()
+    blocks = []
+    for i, layer in enumerate(model.layers):
+        block = copy.deepcopy(template)
+        block.name = f"layers.{i}"
+        block.set_original_component(layer)
+        setup_submodules(block, adapter, layer)
+        blocks.append(block)
+    return blocks
 
 
-# ============================================================================
-# Tests: setup_submodules with optional
-# ============================================================================
+def _make_bridge(blocks, **cfg_attrs):
+    """Wrap blocks in a minimal TransformerBridge shell."""
+    from transformer_lens.model_bridge.bridge import TransformerBridge
 
+    bridge = TransformerBridge.__new__(TransformerBridge)
+    nn.Module.__init__(bridge)
+    bridge.add_module("blocks", nn.ModuleList(blocks))
+    defaults = {"d_model": 4, "device": "cpu", "n_layers": 4}
+    defaults.update(cfg_attrs)
+    bridge.cfg = type("Cfg", (), defaults)()
+    return bridge
 
-class TestOptionalSubmoduleSetup:
-    """Test that optional submodules are skipped cleanly during setup."""
 
-    def test_optional_submodule_skipped_on_missing_layers(self):
-        """Layers 0-2 have 'foo', layer 3 does not. Setup should succeed."""
-        model = HybridModel()
-        adapter = MinimalAdapter(optional=True)
-        template = adapter.make_block_template()
+def _make_hybrid_bridge():
+    """Hybrid bridge: 'foo' is optional (layers 0-2 only), 'bar' is on every layer."""
+    return _make_bridge(_setup_blocks(HybridModel(), MinimalAdapter(optional=True)))
 
-        # Simulate what setup_blocks_bridge does: deepcopy + setup per layer
-        import copy
 
-        blocks = []
-        for i, layer in enumerate(model.layers):
-            block = copy.deepcopy(template)
-            block.name = f"layers.{i}"
-            block.set_original_component(layer)
-            setup_submodules(block, adapter, layer)
-            blocks.append(block)
+def _make_hybrid_bridge_with_attn():
+    """Hybrid bridge where 'attn' (mapped from 'foo') is the optional submodule."""
+    return _make_bridge(
+        _setup_blocks(HybridModel(), AttnAdapter()),
+        n_heads=2,
+    )
 
-        # Layers 0-2 should have 'foo' in real_components
-        for i in range(3):
-            assert "foo" in blocks[i].real_components, f"Block {i} should have 'foo'"
-            assert hasattr(blocks[i], "foo"), f"Block {i} should have foo module"
 
-        # Layer 3 should NOT have 'foo' in any lookup path
-        assert (
-            "foo" not in blocks[3].real_components
-        ), "Block 3 should not have 'foo' in real_components"
-        assert "foo" not in blocks[3]._modules, "Block 3 should not have 'foo' in _modules"
-        assert "foo" not in blocks[3].submodules, "Block 3 should not have 'foo' in submodules"
+# -- Tests: optional flag -----------------------------------------------------
 
-        # All layers should have 'bar'
-        for i in range(4):
-            assert "bar" in blocks[i].real_components, f"Block {i} should have 'bar'"
 
-    def test_non_optional_missing_submodule_raises(self):
-        """When optional=False, missing submodule should raise AttributeError."""
-        model = HybridModel()
-        adapter = MinimalAdapter(optional=False)
-        template = adapter.make_block_template()
+class TestOptionalFlag:
+    def test_default_is_false(self):
+        assert GeneralizedComponent(name="test").optional is False
 
-        import copy
+    def test_optional_true(self):
+        assert GeneralizedComponent(name="test", optional=True).optional is True
 
-        # Layer 3 lacks 'foo' and optional=False, so this should raise
-        block = copy.deepcopy(template)
-        block.name = "layers.3"
-        block.set_original_component(model.layers[3])
-        with pytest.raises(AttributeError):
-            setup_submodules(block, adapter, model.layers[3])
+    def test_optional_false_explicit(self):
+        assert GeneralizedComponent(name="test", optional=False).optional is False
 
 
-# ============================================================================
-# Tests: blocks_with() API
-# ============================================================================
+# -- Tests: setup_submodules --------------------------------------------------
 
 
-class TestBlocksWith:
-    """Test the blocks_with() capability query on TransformerBridge."""
+class TestOptionalSubmoduleSetup:
+    def test_skipped_on_missing_layers(self):
+        blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True))
 
-    def test_blocks_with_returns_matching_blocks(self):
-        """blocks_with('foo') should return only blocks that have 'foo'."""
-        from transformer_lens.model_bridge.bridge import TransformerBridge
+        for i in range(3):
+            assert "foo" in blocks[i].real_components
+            assert hasattr(blocks[i], "foo")
 
-        model = HybridModel()
-        adapter = MinimalAdapter(optional=True)
-        template = adapter.make_block_template()
+        assert "foo" not in blocks[3].real_components
+        assert "foo" not in blocks[3]._modules
+        assert "foo" not in blocks[3].submodules
 
-        import copy
+        for i in range(4):
+            assert "bar" in blocks[i].real_components
 
-        blocks = nn.ModuleList()
-        for i, layer in enumerate(model.layers):
-            block = copy.deepcopy(template)
-            block.name = f"layers.{i}"
-            block.set_original_component(layer)
-            setup_submodules(block, adapter, layer)
-            blocks.append(block)
+    def test_non_optional_raises(self):
+        model = HybridModel()
+        adapter = MinimalAdapter(optional=False)
+        block = copy.deepcopy(adapter.make_block_template())
+        block.name = "layers.3"
+        block.set_original_component(model.layers[3])
+        with pytest.raises(AttributeError):
+            setup_submodules(block, adapter, model.layers[3])
 
-        # Create a minimal bridge-like object with blocks attribute
-        # We test blocks_with as a standalone method
-        bridge = TransformerBridge.__new__(TransformerBridge)
-        nn.Module.__init__(bridge)
-        bridge.add_module("blocks", blocks)
 
-        foo_blocks = bridge.blocks_with("foo")
-        assert len(foo_blocks) == 3
-        assert [idx for idx, _ in foo_blocks] == [0, 1, 2]
+# -- Tests: blocks_with() -----------------------------------------------------
 
-        bar_blocks = bridge.blocks_with("bar")
-        assert len(bar_blocks) == 4
 
-        missing_blocks = bridge.blocks_with("nonexistent")
-        assert len(missing_blocks) == 0
+class TestBlocksWith:
+    def test_returns_matching_blocks(self):
+        bridge = _make_hybrid_bridge()
+        assert [idx for idx, _ in bridge.blocks_with("foo")] == [0, 1, 2]
+        assert len(bridge.blocks_with("bar")) == 4
+        assert bridge.blocks_with("nonexistent") == []
 
-    def test_blocks_with_no_blocks_attribute(self):
-        """blocks_with() should return empty list if no blocks attribute."""
+    def test_no_blocks_attribute(self):
         from transformer_lens.model_bridge.bridge import TransformerBridge
 
         bridge = TransformerBridge.__new__(TransformerBridge)
         nn.Module.__init__(bridge)
         assert bridge.blocks_with("attn") == []
 
+    def test_checks_modules_not_hasattr(self):
+        bridge = _make_hybrid_bridge()
+        assert len(bridge.blocks_with("training")) == 0
 
-# ============================================================================
-# Tests: _stack_block_params with hybrid blocks
-# ============================================================================
-
-
-class TestStackBlockParamsHybridSafe:
-    """Test that _stack_block_params raises clear errors for hybrid blocks."""
-
-    def test_logs_warning_and_returns_subset_on_hybrid(self, caplog):
-        """On hybrid blocks, should log warning and return tensor for matching blocks only."""
-        import logging
-
-        from transformer_lens.model_bridge.bridge import TransformerBridge
-
-        # Build blocks where block 3 lacks 'foo' but blocks 0-2 have it
-        model = HybridModel()
-        adapter = MinimalAdapter(optional=True)
-        template = adapter.make_block_template()
-
-        import copy
-
-        blocks = nn.ModuleList()
-        for i, layer in enumerate(model.layers):
-            block = copy.deepcopy(template)
-            block.name = f"layers.{i}"
-            block.set_original_component(layer)
-            setup_submodules(block, adapter, layer)
-            blocks.append(block)
 
-        # Verify precondition: block 3 lacks 'foo'
-        assert "foo" in blocks[0]._modules
-        assert "foo" not in blocks[3]._modules
+# -- Tests: _stack_block_params -----------------------------------------------
 
-        bridge = TransformerBridge.__new__(TransformerBridge)
-        nn.Module.__init__(bridge)
-        bridge.add_module("blocks", blocks)
 
-        # Should succeed with a log warning, returning only matching blocks.
-        # logging.warning always emits (no deduplication), so researchers see
-        # the index mapping notice on every access — not just the first.
+class TestStackBlockParams:
+    def test_logs_warning_and_returns_subset(self, caplog):
+        bridge = _make_hybrid_bridge()
         with caplog.at_level(logging.WARNING):
             result = bridge._stack_block_params("foo.proj.weight")
         assert any("Hybrid model" in msg for msg in caplog.messages)
-        assert any("stack_params_for" in msg for msg in caplog.messages)
-        # 3 blocks have 'foo', not 4
         assert result.shape[0] == 3
 
-        # Verify it logs again on a second call (no deduplication)
         caplog.clear()
         with caplog.at_level(logging.WARNING):
-            result2 = bridge._stack_block_params("foo.proj.weight")
-        assert any(
-            "Hybrid model" in msg for msg in caplog.messages
-        ), "Warning should emit on every call, not just the first"
-
-    def test_raises_when_no_blocks_have_submodule(self):
-        """Should raise AttributeError when zero blocks have the submodule."""
-        from transformer_lens.model_bridge.bridge import TransformerBridge
+            bridge._stack_block_params("foo.proj.weight")
+        assert any("Hybrid model" in msg for msg in caplog.messages)
 
+    def test_raises_when_no_blocks_match(self):
         bridge = _make_hybrid_bridge()
         with pytest.raises(AttributeError, match="No blocks have"):
             bridge._stack_block_params("nonexistent")
 
     def test_succeeds_on_universal_submodule(self):
-        """Should succeed when all blocks have the requested submodule."""
-        from transformer_lens.model_bridge.bridge import TransformerBridge
-
-        model = HybridModel()
-        adapter = MinimalAdapter(optional=True)
-        template = adapter.make_block_template()
-
-        import copy
-
-        blocks = nn.ModuleList()
-        for i, layer in enumerate(model.layers):
-            block = copy.deepcopy(template)
-            block.name = f"layers.{i}"
-            block.set_original_component(layer)
-            setup_submodules(block, adapter, layer)
-            blocks.append(block)
-
-        bridge = TransformerBridge.__new__(TransformerBridge)
-        nn.Module.__init__(bridge)
-        bridge.add_module("blocks", blocks)
-
-        # 'bar' exists on all blocks → should succeed
+        bridge = _make_hybrid_bridge()
         result = bridge._stack_block_params("bar.weight")
-        assert result.shape[0] == 4  # 4 layers
+        assert result.shape[0] == 4
 
 
-# ============================================================================
-# Tests: refactor_factored_attn_matrices with missing layers
-# ============================================================================
+# -- Tests: refactor_factored_attn_matrices ------------------------------------
 
 
 class TestRefactorFactoredAttnHybrid:
-    """Test that refactor_factored_attn_matrices skips layers without attn."""
-
     def test_skips_missing_attn_layers(self):
-        """Should process layers with attn keys and skip those without."""
         from transformer_lens.config.TransformerLensConfig import TransformerLensConfig
         from transformer_lens.weight_processing import ProcessWeights
 
-        n_heads = 2
-        d_head = 4
-        d_model = n_heads * d_head
         cfg = TransformerLensConfig(
             n_layers=4,
-            n_heads=n_heads,
-            d_head=d_head,
-            d_model=d_model,
+            n_heads=2,
+            d_head=4,
+            d_model=8,
             n_ctx=16,
             positional_embedding_type="standard",
         )
-
-        # Create state_dict with attn weights for layers 0-2 only.
-        # W_Q/W_K/W_V: [n_heads, d_model, d_head], W_O: [n_heads, d_head, d_model]
-        # b_Q/b_K/b_V: [n_heads, d_head], b_O: [d_model]
         state_dict = {}
-        for l in range(3):  # layers 0-2 have attention
-            state_dict[f"blocks.{l}.attn.W_Q"] = torch.randn(n_heads, d_model, d_head)
-            state_dict[f"blocks.{l}.attn.W_K"] = torch.randn(n_heads, d_model, d_head)
-            state_dict[f"blocks.{l}.attn.W_V"] = torch.randn(n_heads, d_model, d_head)
-            state_dict[f"blocks.{l}.attn.W_O"] = torch.randn(n_heads, d_head, d_model)
-            state_dict[f"blocks.{l}.attn.b_Q"] = torch.randn(n_heads, d_head)
-            state_dict[f"blocks.{l}.attn.b_K"] = torch.randn(n_heads, d_head)
-            state_dict[f"blocks.{l}.attn.b_V"] = torch.randn(n_heads, d_head)
-            state_dict[f"blocks.{l}.attn.b_O"] = torch.randn(d_model)
-
-        # Layer 3 has NO attention keys — should be skipped, not crash
+        for l in range(3):
+            state_dict[f"blocks.{l}.attn.W_Q"] = torch.randn(2, 8, 4)
+            state_dict[f"blocks.{l}.attn.W_K"] = torch.randn(2, 8, 4)
+            state_dict[f"blocks.{l}.attn.W_V"] = torch.randn(2, 8, 4)
+            state_dict[f"blocks.{l}.attn.W_O"] = torch.randn(2, 4, 8)
+            state_dict[f"blocks.{l}.attn.b_Q"] = torch.randn(2, 4)
+            state_dict[f"blocks.{l}.attn.b_K"] = torch.randn(2, 4)
+            state_dict[f"blocks.{l}.attn.b_V"] = torch.randn(2, 4)
+            state_dict[f"blocks.{l}.attn.b_O"] = torch.randn(8)
+
         result = ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg)
 
-        # Layers 0-2 should still have their attn keys (now refactored)
         for l in range(3):
             assert f"blocks.{l}.attn.W_Q" in result
-            assert f"blocks.{l}.attn.W_K" in result
+        assert "blocks.3.attn.W_Q" not in result
 
-        # Layer 3 should have no attn keys
-        assert f"blocks.3.attn.W_Q" not in result
+    def test_raises_on_partial_attn_keys(self):
+        from transformer_lens.config.TransformerLensConfig import TransformerLensConfig
+        from transformer_lens.weight_processing import ProcessWeights
 
+        cfg = TransformerLensConfig(
+            n_layers=1,
+            n_heads=2,
+            d_head=4,
+            d_model=8,
+            n_ctx=16,
+            positional_embedding_type="standard",
+        )
+        state_dict = {"blocks.0.attn.W_Q": torch.randn(2, 8, 4)}
+        with pytest.raises(ValueError, match="Inconsistent attention weights"):
+            ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg)
 
-# ============================================================================
-# Tests: weight distribution with ragged blocks
-# ============================================================================
 
+# -- Tests: weight distribution ------------------------------------------------
 
-class TestWeightDistributionRagged:
-    """Test that weight distribution handles heterogeneous real_components."""
 
+class TestWeightDistributionRagged:
     def test_distribute_weights_skips_empty_blocks(self):
-        """Blocks without attn weights should receive no attn keys."""
         from transformer_lens.weight_processing import ProcessWeights
 
-        # Build a minimal real_components mapping with ragged blocks
-        model = HybridModel()
-        adapter = MinimalAdapter(optional=True)
-        template = adapter.make_block_template()
-
-        import copy
-
-        blocks = []
-        for i, layer in enumerate(model.layers):
-            block = copy.deepcopy(template)
-            block.name = f"layers.{i}"
-            block.set_original_component(layer)
-            setup_submodules(block, adapter, layer)
-            blocks.append(block)
-
-        # Construct state_dict with 'foo' weights for blocks 0-2 only
+        blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True))
         state_dict = {}
         for i in range(3):
             state_dict[f"blocks.{i}.foo.weight"] = torch.randn(4, 4)
         for i in range(4):
             state_dict[f"blocks.{i}.bar.weight"] = torch.randn(4, 4)
 
-        # Build the component mapping
-        component_mapping = {
-            "blocks": ("layers", blocks),
-        }
-
-        # This should not crash
         ProcessWeights.distribute_weights_to_components(
             state_dict=state_dict,
-            component_mapping=component_mapping,
-        )
-
-
-# ============================================================================
-# Helpers for bridge-level tests
-# ============================================================================
-
-
-def _make_hybrid_bridge():
-    """Build a minimal TransformerBridge with hybrid blocks for testing.
-
-    Uses 'foo' and 'bar' as submodule names. Layers 0-2 have 'foo', layer 3 does not.
-    """
-    import copy
-
-    from transformer_lens.model_bridge.bridge import TransformerBridge
-
-    model = HybridModel()
-    adapter = MinimalAdapter(optional=True)
-    template = adapter.make_block_template()
-
-    blocks = nn.ModuleList()
-    for i, layer in enumerate(model.layers):
-        block = copy.deepcopy(template)
-        block.name = f"layers.{i}"
-        block.set_original_component(layer)
-        setup_submodules(block, adapter, layer)
-        blocks.append(block)
-
-    bridge = TransformerBridge.__new__(TransformerBridge)
-    nn.Module.__init__(bridge)
-    bridge.add_module("blocks", blocks)
-
-    # Minimal cfg for accumulated_bias
-    bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4})()
-    return bridge
-
-
-class AttnAdapter(ArchitectureAdapter):
-    """Adapter using 'attn' as the optional submodule name (matches real adapters)."""
-
-    def __init__(self):
-        self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})()
-        self.component_mapping = {}
-
-    def make_block_template(self) -> BlockBridge:
-        return BlockBridge(
-            name="layers",
-            submodules={
-                "bar": LinearBridge(name="bar"),
-                "attn": LinearBridge(name="foo", optional=True),
-            },
+            component_mapping={"blocks": ("layers", blocks)},
         )
 
 
-def _make_hybrid_bridge_with_attn():
-    """Build a hybrid bridge where 'attn' is the optional submodule.
-
-    Layers 0-2 have 'attn' (mapped from 'foo'), layer 3 does not.
-    Used for testing APIs that specifically look for 'attn' (composition scores, labels).
-    """
-    import copy
-
-    from transformer_lens.model_bridge.bridge import TransformerBridge
-
-    model = HybridModel()
-    adapter = AttnAdapter()
-    template = adapter.make_block_template()
-
-    blocks = nn.ModuleList()
-    for i, layer in enumerate(model.layers):
-        block = copy.deepcopy(template)
-        block.name = f"layers.{i}"
-        block.set_original_component(layer)
-        setup_submodules(block, adapter, layer)
-        blocks.append(block)
+# -- Tests: __setattr__ whitelist ----------------------------------------------
 
-    bridge = TransformerBridge.__new__(TransformerBridge)
-    nn.Module.__init__(bridge)
-    bridge.add_module("blocks", blocks)
-    bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4, "n_heads": 2})()
-    return bridge
-
-
-# ============================================================================
-# Tests: blocks_with uses _modules not hasattr
-# ============================================================================
 
-
-class TestBlocksWithModulesCheck:
-    """blocks_with() should only find bridged submodules, not HF attrs."""
-
-    def test_does_not_find_hf_internal_attrs(self):
-        """blocks_with should not match HF attributes that aren't bridged."""
-        bridge = _make_hybrid_bridge()
-        # 'bar' is a bridged submodule (in _modules), should be found
-        assert len(bridge.blocks_with("bar")) == 4
-        # 'training' exists as an attr on nn.Module but is not a bridged submodule
-        assert len(bridge.blocks_with("training")) == 0
-
-    def test_finds_only_bridged_optional_submodules(self):
-        """Optional submodules should be found only on layers where they were bound."""
-        bridge = _make_hybrid_bridge()
-        foo_blocks = bridge.blocks_with("foo")
-        assert [idx for idx, _ in foo_blocks] == [0, 1, 2]
+class TestSetAttrWhitelist:
+    def test_optional_stays_on_bridge(self):
+        comp = LinearBridge(name="test")
+        fake_hf = nn.Linear(4, 4, bias=False)
+        comp.set_original_component(fake_hf)
+        comp.optional = True
+        assert comp.optional is True
+        assert not hasattr(fake_hf, "optional")
 
 
-# ============================================================================
-# Tests: accumulated_bias on hybrid models
-# ============================================================================
+# -- Tests: accumulated_bias --------------------------------------------------
 
 
 class TestAccumulatedBiasHybrid:
-    """accumulated_bias should not crash on hybrid models."""
-
-    def test_accumulated_bias_skips_non_attn_layers(self):
-        """Should not crash when some layers lack attention."""
+    def test_skips_non_attn_layers(self):
         bridge = _make_hybrid_bridge()
-        # Should run without error through all 4 layers (layer 3 has no attn)
         result = bridge.accumulated_bias(layer=4)
         assert result.shape == (4,)
 
-    def test_accumulated_bias_mlp_input_on_non_attn_layer(self):
-        """mlp_input=True on a non-attention layer should not crash."""
+    def test_mlp_input_on_non_attn_layer(self):
         bridge = _make_hybrid_bridge()
-        # Layer 3 has no attn — should still work with mlp_input=True
         result = bridge.accumulated_bias(layer=3, mlp_input=True)
         assert result.shape == (4,)
 
 
-# ============================================================================
-# Tests: block_submodules and layer_types introspection
-# ============================================================================
+# -- Tests: block introspection ------------------------------------------------
 
 
 class TestBlockIntrospection:
-    """Test layer introspection APIs."""
-
     def test_block_submodules(self):
-        """block_submodules should list bridged submodules per layer."""
         bridge = _make_hybrid_bridge()
-        # Layer 0 has both foo and bar
-        subs_0 = bridge.block_submodules(0)
-        assert "foo" in subs_0
-        assert "bar" in subs_0
-        # Layer 3 has only bar
-        subs_3 = bridge.block_submodules(3)
-        assert "foo" not in subs_3
-        assert "bar" in subs_3
+        assert "foo" in bridge.block_submodules(0)
+        assert "bar" in bridge.block_submodules(0)
+        assert "foo" not in bridge.block_submodules(3)
+        assert "bar" in bridge.block_submodules(3)
 
     def test_layer_types(self):
-        """layer_types should return a list with one entry per block."""
         bridge = _make_hybrid_bridge()
         types = bridge.layer_types()
         assert len(types) == 4
-        # Layers 0-2 have 'foo', layer 3 does not
         for i in range(3):
             assert "foo" in types[i]
         assert "foo" not in types[3]
 
 
-# ============================================================================
-# Tests: stack_params_for hybrid API
-# ============================================================================
+# -- Tests: stack_params_for --------------------------------------------------
 
 
 class TestStackParamsFor:
-    """Test stack_params_for on hybrid bridges."""
-
     def test_returns_correct_indices_and_tensors(self):
-        """stack_params_for should return only matching blocks."""
         bridge = _make_hybrid_bridge()
         indices, stacked = bridge.stack_params_for("foo", "foo.proj.weight")
         assert indices == [0, 1, 2]
         assert stacked.shape[0] == 3
 
     def test_raises_on_no_matching_blocks(self):
-        """Should raise ValueError when no blocks have the submodule."""
         bridge = _make_hybrid_bridge()
         with pytest.raises(ValueError, match="No blocks have submodule"):
             bridge.stack_params_for("nonexistent", "nonexistent.weight")
 
 
-# ============================================================================
-# Tests: refactor guard validates all attn keys
-# ============================================================================
-
-
-class TestRefactorGuardConsistency:
-    """Test that refactor raises on inconsistent attn keys (W_Q present, W_K missing)."""
-
-    def test_raises_on_partial_attn_keys(self):
-        """If W_Q is present but W_K is missing, should raise ValueError."""
-        from transformer_lens.config.TransformerLensConfig import TransformerLensConfig
-        from transformer_lens.weight_processing import ProcessWeights
-
-        cfg = TransformerLensConfig(
-            n_layers=1,
-            n_heads=2,
-            d_head=4,
-            d_model=8,
-            n_ctx=16,
-            positional_embedding_type="standard",
-        )
-        # Only W_Q present, missing W_K/W_V/W_O
-        state_dict = {
-            "blocks.0.attn.W_Q": torch.randn(2, 8, 4),
-        }
-        with pytest.raises(ValueError, match="Inconsistent attention weights"):
-            ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg)
-
-
-# ============================================================================
-# Tests: __setattr__ whitelist includes optional
-# ============================================================================
-
-
-class TestSetAttrWhitelist:
-    """Test that 'optional' is in the __setattr__ whitelist."""
-
-    def test_optional_set_on_bridge_not_hf_model(self):
-        """Setting optional after set_original_component should stay on bridge."""
-        comp = LinearBridge(name="test")
-        fake_hf = nn.Linear(4, 4, bias=False)
-        comp.set_original_component(fake_hf)
-        comp.optional = True
-        # Should be on the bridge, not on the HF module
-        assert comp.optional is True
-        assert not hasattr(fake_hf, "optional")
-
-
-# ============================================================================
-# Tests: attn_head_labels matches composition scores dimensions
-# ============================================================================
+# -- Tests: attn_head_labels --------------------------------------------------
 
 
 class TestAttnHeadLabels:
-    """attn_head_labels should match all_composition_scores dimensions."""
-
-    def test_attn_head_labels_excludes_non_attn_layers(self):
-        """Labels should only cover attention layers, not SSM/linear-attn."""
+    def test_excludes_non_attn_layers(self):
         bridge = _make_hybrid_bridge_with_attn()
-        bridge.cfg.n_heads = 2
         labels = bridge.attn_head_labels
-        # 3 attention layers (0, 1, 2) * 2 heads = 6 labels
         assert len(labels) == 6
         assert labels == ["L0H0", "L0H1", "L1H0", "L1H1", "L2H0", "L2H1"]
-        # Should NOT contain L3 (non-attention layer)
-        assert all("L3" not in lbl for lbl in labels)
 
-    def test_all_head_labels_includes_all_layers(self):
-        """all_head_labels should still include every layer."""
+    def test_all_head_labels_includes_all(self):
         bridge = _make_hybrid_bridge_with_attn()
-        bridge.cfg.n_heads = 2
-        labels = bridge.all_head_labels
-        # 4 layers * 2 heads = 8 labels
-        assert len(labels) == 8
+        assert len(bridge.all_head_labels) == 8
 
 
-# ============================================================================
-# Tests: hook propagation through optional submodules
-# ============================================================================
+# -- Tests: hook propagation --------------------------------------------------
 
 
 class TestHookPropagation:
-    """Verify hooks fire on present optional submodules and don't exist on absent ones."""
-
-    def _build_hybrid_model_and_blocks(self):
-        """Build a hybrid model with setup done so hooks are wired."""
-        import copy
-
-        model = HybridModel()
-        adapter = MinimalAdapter(optional=True)
-        template = adapter.make_block_template()
-
-        blocks = []
-        for i, layer in enumerate(model.layers):
-            block = copy.deepcopy(template)
-            block.name = f"layers.{i}"
-            block.set_original_component(layer)
-            setup_submodules(block, adapter, layer)
-            blocks.append(block)
-
-        return model, blocks
-
-    def test_hooks_fire_on_present_optional_submodule(self):
-        """hook_in and hook_out should fire on blocks where the optional submodule exists."""
-        model, blocks = self._build_hybrid_model_and_blocks()
-
-        # Block 0 has 'foo' — its hook_in and hook_out should fire
-        foo_bridge = blocks[0].foo
-        hook_in_fired = []
-        hook_out_fired = []
-
-        foo_bridge.hook_in.add_hook(lambda tensor, hook: hook_in_fired.append(True) or tensor)
-        foo_bridge.hook_out.add_hook(lambda tensor, hook: hook_out_fired.append(True) or tensor)
-
-        # Run a forward pass through the HF model's layer 0
-        # Because replace_remote_component swapped model.layers[0].foo with the bridge,
-        # calling model.layers[0].foo(x) goes through LinearBridge.forward
-        x = torch.randn(1, 4)
-        _ = blocks[0].foo(x)
+    def test_hooks_fire_on_present_optional(self):
+        blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True))
+        fired = []
+        blocks[0].foo.hook_out.add_hook(lambda t, hook: fired.append(True) or t)
 
-        assert len(hook_in_fired) == 1, "hook_in should fire on present optional submodule"
-        assert len(hook_out_fired) == 1, "hook_out should fire on present optional submodule"
+        blocks[0].foo(torch.randn(1, 4))
+        assert len(fired) == 1
 
-    def test_absent_optional_submodule_has_no_hooks(self):
-        """Block 3 should not have 'foo' at all — no hooks to fire."""
-        _, blocks = self._build_hybrid_model_and_blocks()
-
-        # Block 3 lacks 'foo' — it shouldn't be in _modules
+    def test_absent_optional_has_no_module(self):
+        blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True))
         assert "foo" not in blocks[3]._modules
-        # Attempting to access hooks on the absent submodule should fail
-        assert not hasattr(blocks[3], "foo")
-
-    def test_hooks_on_present_dont_affect_absent(self):
-        """Running all blocks should fire hooks only on blocks with the optional submodule."""
-        model, blocks = self._build_hybrid_model_and_blocks()
 
-        # Track which blocks fire foo.hook_out
-        fired_block_indices = []
+    def test_hooks_fire_only_on_present(self):
+        model = HybridModel()
+        blocks = _setup_blocks(model, MinimalAdapter(optional=True))
+        fired_indices = []
         for i, block in enumerate(blocks):
             if "foo" in block._modules:
-                block.foo.hook_out.add_hook(
-                    lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor
-                )
+                block.foo.hook_out.add_hook(lambda t, hook, idx=i: fired_indices.append(idx) or t)
 
-        # Run forward through all HF layers
         x = torch.randn(1, 4)
-        for i, layer in enumerate(model.layers):
+        for layer in model.layers:
             x = layer(x)
+        assert fired_indices == [0, 1, 2]
 
-        # Hooks should fire on layers 0, 1, 2 (have foo) but not 3
-        assert fired_block_indices == [0, 1, 2]
-
-    def test_universal_submodule_hooks_fire_on_all_blocks(self):
-        """'bar' is universal — its hooks should fire on every block."""
-        model, blocks = self._build_hybrid_model_and_blocks()
-
-        fired_block_indices = []
+    def test_universal_hooks_fire_on_all(self):
+        model = HybridModel()
+        blocks = _setup_blocks(model, MinimalAdapter(optional=True))
+        fired_indices = []
         for i, block in enumerate(blocks):
-            block.bar.hook_out.add_hook(
-                lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor
-            )
+            block.bar.hook_out.add_hook(lambda t, hook, idx=i: fired_indices.append(idx) or t)
 
         x = torch.randn(1, 4)
         for layer in model.layers:
             x = layer(x)
+        assert fired_indices == [0, 1, 2, 3]
 
-        assert fired_block_indices == [0, 1, 2, 3]
 
-
-# ============================================================================
-# Tests: CompositionScores tensor protocol
-# ============================================================================
+# -- Tests: CompositionScores tensor protocol ----------------------------------
 
 
 class TestCompositionScoresProtocol:
-    """CompositionScores should behave like a tensor for existing research code."""
-
     def _make_scores(self):
         from transformer_lens.model_bridge.composition_scores import CompositionScores
 
         t = torch.randn(3, 2, 3, 2)
         return CompositionScores(t, [0, 2, 5], ["L0H0", "L0H1", "L2H0", "L2H1", "L5H0", "L5H1"])
 
-    def test_shape(self):
+    def test_shape_device_dtype(self):
         cs = self._make_scores()
         assert cs.shape == torch.Size([3, 2, 3, 2])
-
-    def test_device_and_dtype(self):
-        cs = self._make_scores()
         assert cs.device == torch.device("cpu")
         assert cs.dtype == torch.float32
 
-    def test_indexing_returns_tensor(self):
+    def test_indexing(self):
         cs = self._make_scores()
-        sliced = cs[0, :, 1, :]
-        assert isinstance(sliced, torch.Tensor)
-        assert sliced.shape == (2, 2)
+        assert isinstance(cs[0, :, 1, :], torch.Tensor)
+        assert cs[0, :, 1, :].shape == (2, 2)
 
     def test_torch_isnan(self):
-        """torch.isnan(scores) must work — used in existing integration tests."""
         cs = self._make_scores()
         result = torch.isnan(cs)
         assert isinstance(result, torch.Tensor)
-        assert result.shape == cs.shape
         assert not result.any()
 
     def test_torch_where(self):
@@ -786,67 +444,34 @@ def test_torch_where(self):
         result = torch.where(cs > 0, cs.scores, torch.zeros_like(cs.scores))
         assert isinstance(result, torch.Tensor)
 
-    def test_comparison_gt(self):
-        cs = self._make_scores()
-        mask = cs > 0
-        assert isinstance(mask, torch.Tensor)
-        assert mask.shape == cs.shape
-
-    def test_comparison_ne(self):
-        """scores != 0 must return a tensor, not raise RuntimeError."""
+    def test_comparisons(self):
         cs = self._make_scores()
-        result = cs != 0
-        assert isinstance(result, torch.Tensor)
-        assert result.shape == cs.shape
+        assert isinstance(cs > 0, torch.Tensor)
+        assert isinstance(cs != 0, torch.Tensor)
+        assert isinstance(cs == 0, torch.Tensor)
 
-    def test_comparison_eq(self):
+    def test_tensor_methods(self):
         cs = self._make_scores()
-        result = cs == 0
-        assert isinstance(result, torch.Tensor)
-
-    def test_tensor_method_abs(self):
-        """scores.abs() must work via __getattr__ delegation."""
-        cs = self._make_scores()
-        result = cs.abs()
-        assert isinstance(result, torch.Tensor)
-
-    def test_tensor_method_sum(self):
-        cs = self._make_scores()
-        result = cs.sum()
-        assert isinstance(result, torch.Tensor)
-
-    def test_tensor_method_any(self):
-        cs = self._make_scores()
-        result = cs.any()
-        assert isinstance(result, torch.Tensor)
+        assert isinstance(cs.abs(), torch.Tensor)
+        assert isinstance(cs.sum(), torch.Tensor)
+        assert isinstance(cs.any(), torch.Tensor)
 
     def test_chained_indexing_and_method(self):
-        """scores[l1, :, l2, :].abs().sum() — the exact pattern from integration tests."""
         cs = self._make_scores()
         result = cs[0, :, 1, :].abs().sum()
-        assert isinstance(result, torch.Tensor)
-        assert result.ndim == 0  # scalar
+        assert result.ndim == 0
 
-    def test_metadata_accessible(self):
+    def test_metadata(self):
         cs = self._make_scores()
         assert cs.layer_indices == [0, 2, 5]
         assert len(cs.head_labels) == 6
-
-    def test_repr(self):
-        cs = self._make_scores()
-        r = repr(cs)
-        assert "CompositionScores" in r
-        assert "layer_indices" in r
+        assert "CompositionScores" in repr(cs)
 
 
-# ============================================================================
-# Tests: get_bridge_params with hybrid blocks
-# ============================================================================
+# -- Tests: get_bridge_params with hybrid blocks ------------------------------
 
 
 class TestGetBridgeParamsHybrid:
-    """get_bridge_params should skip attn keys for non-attention layers."""
-
     def test_no_attn_keys_for_non_attn_layers(self):
         from transformer_lens.model_bridge.get_params_util import get_bridge_params
 
@@ -854,34 +479,14 @@ def test_no_attn_keys_for_non_attn_layers(self):
         bridge.cfg.d_vocab = 10
         bridge.cfg.n_ctx = 8
         bridge.cfg.d_mlp = 16
-        bridge.cfg.n_heads = 2
         bridge.cfg.d_head = 2
 
-        # Add minimal embed/unembed so get_bridge_params doesn't fail
         bridge.embed = nn.Embedding(10, 4)
         bridge.pos_embed = type("PE", (), {"weight": torch.randn(8, 4)})()
-        bridge.unembed = type(
-            "UE",
-            (),
-            {
-                "weight": torch.randn(10, 4),
-                "b_U": torch.zeros(10),
-            },
-        )()
+        bridge.unembed = type("UE", (), {"weight": torch.randn(10, 4), "b_U": torch.zeros(10)})()
 
         params = get_bridge_params(bridge)
-
-        # Blocks 0-2 have 'attn' — should have attn keys
-        for i in range(3):
-            # attn is mapped but internal structure (q/k/v/o) may not match
-            # our synthetic LinearBridge wrapping FakeSubmodule — so attn keys
-            # may or may not be present depending on structure. The key point
-            # is block 3 must NOT have attn keys.
-            pass
-
-        # Block 3 has NO 'attn' — must not have any attn keys
-        attn_keys_for_block3 = [k for k in params if k.startswith("blocks.3.attn.")]
-        assert len(attn_keys_for_block3) == 0, (
-            f"Block 3 (non-attention layer) should have no attn keys, "
-            f"but found: {attn_keys_for_block3}"
-        )
+        attn_keys_block3 = [k for k in params if k.startswith("blocks.3.attn.")]
+        assert (
+            len(attn_keys_block3) == 0
+        ), f"Non-attn layer should have no attn keys: {attn_keys_block3}"
diff --git a/transformer_lens/benchmarks/weight_processing.py b/transformer_lens/benchmarks/weight_processing.py
index 326c53df7..5a7fafd65 100644
--- a/transformer_lens/benchmarks/weight_processing.py
+++ b/transformer_lens/benchmarks/weight_processing.py
@@ -149,7 +149,6 @@ def benchmark_weight_sharing(
         if reference_model is not None:
             reference_original = reference_model(test_text, return_type="loss")
 
-            # Find first block with attention (hybrid models may not have attn on block 0)
             bridge_attn_blocks = bridge.blocks_with("attn")
             if not bridge_attn_blocks:
                 return BenchmarkResult(
@@ -558,7 +557,6 @@ def benchmark_attention_output_centering(
                 message="Skipped for tiny/test model (random weights don't center meaningfully)",
             )
 
-        # Find blocks with attention (hybrid architectures may not have attn on all blocks)
         attn_blocks = bridge.blocks_with("attn")
         if not attn_blocks:
             return BenchmarkResult(
@@ -801,7 +799,6 @@ def benchmark_value_bias_folding(
                     },
                 )
 
-        # Find blocks with attention (hybrid architectures may not have attn on all blocks)
         attn_blocks = bridge.blocks_with("attn")
         if not attn_blocks:
             return BenchmarkResult(
diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py
index 372b53bf5..6f9e816f8 100644
--- a/transformer_lens/model_bridge/bridge.py
+++ b/transformer_lens/model_bridge/bridge.py
@@ -1021,21 +1021,10 @@ def to_single_str_token(self, int_token: int) -> str:
         raise AssertionError("Expected a single string token.")
 
     def blocks_with(self, submodule: str) -> List[Tuple[int, "GeneralizedComponent"]]:
-        """Return (index, block) pairs for blocks that have the named submodule.
+        """Return (index, block) pairs for blocks with the named bridged submodule.
 
-        Hybrid architectures have heterogeneous blocks — some layers have
-        attention, others have SSM or linear attention, etc. Use this instead
-        of assuming blocks[0] is representative.
-
-        Only returns blocks where the submodule was explicitly set up as a
-        bridged component (registered in _modules), not submodules that happen
-        to exist on the underlying HF model.
-
-        Args:
-            submodule: Name of the submodule to check for (e.g., "attn", "mamba")
-
-        Returns:
-            List of (layer_index, block) tuples for blocks that have the submodule.
+        Checks _modules (not hasattr) so HF-internal attrs don't match.
+        Use instead of assuming blocks[0] is representative on hybrid models.
         """
         if not hasattr(self, "blocks"):
             return []
@@ -1044,23 +1033,9 @@ def blocks_with(self, submodule: str) -> List[Tuple[int, "GeneralizedComponent"]
     def stack_params_for(
         self, submodule: str, attr_path: str, reshape_fn: Optional[Callable] = None
     ) -> Tuple[List[int], torch.Tensor]:
-        """Stack a parameter across blocks that have a specific submodule.
-
-        For hybrid architectures where only some blocks have attention (or SSM,
-        etc.), this returns the stacked tensor for only matching blocks along
-        with their layer indices.
-
-        Args:
-            submodule: Submodule to filter on (e.g., "attn", "mamba")
-            attr_path: Dot-separated attr path from block (e.g., "attn.W_K")
-            reshape_fn: Optional function to reshape each weight before stacking
-
-        Returns:
-            Tuple of (layer_indices, stacked_tensor) where layer_indices maps
-            position i in the tensor to the original layer index.
+        """Stack a parameter across matching blocks only. Returns (layer_indices, tensor).
 
-        Raises:
-            ValueError: If no blocks have the requested submodule.
+        Use for hybrid models where not all blocks have the submodule.
         """
         matching = self.blocks_with(submodule)
         if not matching:
@@ -1081,23 +1056,12 @@ def stack_params_for(
     def _stack_block_params(
         self, attr_path: str, reshape_fn: Optional[Callable] = None
     ) -> torch.Tensor:
-        """Stack a parameter across all blocks, or across matching blocks for hybrids.
+        """Stack a parameter across all blocks; falls back to matching-only on hybrids.
 
-        For homogeneous models, returns a tensor of shape [n_layers, ...].
-        For hybrid models where some blocks lack the requested submodule,
-        returns a tensor of shape [n_matching_blocks, ...] and emits a
-        one-time warning about the index mapping.
-
-        Args:
-            attr_path: Dot-separated attribute path from block (e.g., "attn.W_K")
-            reshape_fn: Optional function to reshape each weight before stacking
-
-        Note:
-            The guard checks only that the first path segment is a bridged
-            submodule (in _modules). Deeper segments resolve via standard
-            getattr, which may fall through to HF model attributes. This is
-            intentional — properties like W_Q are exposed via __getattr__
-            delegation to the underlying weight tensors.
+        On hybrid models, logs a warning about index mapping and returns only
+        blocks that have the submodule. First path segment is checked against
+        _modules; deeper segments resolve via getattr (intentional — W_Q etc.
+        are exposed via __getattr__ delegation).
         """
         first_attr = attr_path.split(".")[0]
         matching_blocks = [
@@ -1231,42 +1195,22 @@ def W_E(self) -> torch.Tensor:
 
     @property
     def QK(self):
-        """QK circuit as a FactoredMatrix.
-
-        On hybrid models, returns the circuit for attention layers only (with
-        a warning about index mapping). For explicit index control, use
-        QK_for_attn_layers() which returns (layer_indices, FactoredMatrix).
-        """
+        """QK circuit. On hybrids, returns attn layers only (with warning). See QK_for_attn_layers()."""
         return FactoredMatrix(self.W_Q, self.W_K.transpose(-2, -1))
 
     @property
     def OV(self):
-        """OV circuit as a FactoredMatrix.
-
-        On hybrid models, returns the circuit for attention layers only (with
-        a warning about index mapping). For explicit index control, use
-        OV_for_attn_layers() which returns (layer_indices, FactoredMatrix).
-        """
+        """OV circuit. On hybrids, returns attn layers only (with warning). See OV_for_attn_layers()."""
         return FactoredMatrix(self.W_V, self.W_O)
 
     def QK_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]:
-        """QK circuit for attention layers only (hybrid-safe).
-
-        Returns:
-            Tuple of (layer_indices, FactoredMatrix) where layer_indices maps
-            position i in the matrix to the original layer index.
-        """
+        """QK circuit for attention layers only. Returns (layer_indices, FactoredMatrix)."""
         q_indices, W_Q = self.stack_params_for("attn", "attn.W_Q", self._reshape_qkv)
         _, W_K = self.stack_params_for("attn", "attn.W_K", self._reshape_qkv)
         return q_indices, FactoredMatrix(W_Q, W_K.transpose(-2, -1))
 
     def OV_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]:
-        """OV circuit for attention layers only (hybrid-safe).
-
-        Returns:
-            Tuple of (layer_indices, FactoredMatrix) where layer_indices maps
-            position i in the matrix to the original layer index.
-        """
+        """OV circuit for attention layers only. Returns (layer_indices, FactoredMatrix)."""
         v_indices, W_V = self.stack_params_for("attn", "attn.W_V", self._reshape_qkv)
         _, W_O = self.stack_params_for("attn", "attn.W_O", self._reshape_o)
         return v_indices, FactoredMatrix(W_V, W_O)
@@ -1314,9 +1258,7 @@ def tokens_to_residual_directions(
             residual_direction = self.W_U[:, token]
             return residual_direction
 
-    # Output bias attribute names by variant type. Attention uses "b_O"
-    # (a processed-weight alias). SSM/linear-attn variants use their output
-    # projection's bias. Map variant name → list of attribute paths to check.
+    # Variant → attr paths for the output bias that feeds the residual stream.
     _VARIANT_OUTPUT_BIAS_ATTRS: Dict[str, tuple] = {
         "attn": ("b_O",),
         "linear_attn": ("out_proj.bias",),
@@ -1326,12 +1268,7 @@ def tokens_to_residual_directions(
     }
 
     def _get_block_variant_bias(self, block: "GeneralizedComponent") -> Optional[torch.Tensor]:
-        """Get the output bias from whatever variant submodule this block has.
-
-        Each variant type has its own output bias attribute name — attention
-        uses b_O while SSM variants use out_proj.bias. Returns the first
-        found, or None if the variant has no output bias.
-        """
+        """Return the output bias from this block's variant submodule, or None."""
         for name in VARIANT_SUBMODULE_NAMES:
             if name not in block._modules:
                 continue
@@ -1353,22 +1290,10 @@ def accumulated_bias(
         mlp_input: bool = False,
         include_mlp_biases: bool = True,
     ) -> torch.Tensor:
-        """Sum of biases that contribute to the residual stream up to a given layer.
-
-        Includes output biases from whatever variant submodule each block has
-        (attention, Mamba, linear attention, etc.) plus MLP output biases.
-        For hybrid models, non-attention layers still contribute their variant
-        submodule's output bias to the residual stream.
+        """Sum of variant + MLP output biases added to the residual stream up to `layer`.
 
-        Args:
-            layer: Layer number in [0, n_layers]. 0 means no layers, n_layers means all.
-            mlp_input: If True, include the variant submodule's output bias of
-                the target layer (i.e. bias up to the MLP input of that layer).
-            include_mlp_biases: Whether to include MLP biases. Useful to set False when
-                expanding attn_out into individual heads but keeping mlp_out as-is.
-
-        Returns:
-            Tensor of shape [d_model] with the accumulated bias.
+        Covers every layer type (attn, SSM, linear-attn). mlp_input=True also adds the target
+        layer's own variant bias; include_mlp_biases=False leaves out MLP output biases.
         """
         accumulated = torch.zeros(self.cfg.d_model, device=self.cfg.device)
         for i in range(layer):
@@ -1389,23 +1314,12 @@ def accumulated_bias(
         return accumulated
 
     def all_composition_scores(self, mode: str) -> CompositionScores:
-        """Composition scores for all pairs of attention heads.
-
-        Returns a ``CompositionScores`` containing the scores tensor, the
-        original layer indices, and human-readable head labels.  The scores
-        tensor has shape (n_attn_layers, n_heads, n_attn_layers, n_heads) and
-        is upper triangular on the layer axes.
-
-        For hybrid models, only attention layers are included.  The returned
-        ``layer_indices`` maps tensor position *i* back to the original layer
-        number so that results cannot be silently misinterpreted.
+        """Composition scores for all attention head pairs. Returns CompositionScores.
 
         See https://transformer-circuits.pub/2021/framework/index.html
-
-        Args:
-            mode: One of "Q", "K", "V" — which composition type to compute.
+        On hybrid models, only attention layers are included; layer_indices
+        maps tensor position i to original layer number.
         """
-        # Single blocks_with call — all weight stacking uses these same blocks
         attn_blocks = self.blocks_with("attn")
         if not attn_blocks:
             raise ValueError("No attention layers found — cannot compute composition scores.")
@@ -1449,59 +1363,23 @@ def _stack(attr_path: str, reshape_fn: Optional[Callable] = None) -> torch.Tenso
         return CompositionScores(scores=scores, layer_indices=indices, head_labels=labels)
 
     def composition_layer_indices(self) -> List[int]:
-        """Return original layer indices for attention layers.
-
-        Maps position i in all_composition_scores() output back to the
-        original layer number. For homogeneous models, returns [0, 1, ..., n-1].
-        For hybrid models, returns only the attention layer indices.
-        """
+        """Original layer index of each attention layer, in composition-score position order."""
         return [idx for idx, _ in self.blocks_with("attn")]
 
     def block_hooks(self, layer_idx: int) -> List[str]:
-        """Return all hook point names available on a specific block.
-
-        Useful for hybrid architectures where different layers have different
-        hookable submodules — e.g., attention layers expose hook_q/hook_k/etc.
-        while SSM layers expose hook_in_proj/hook_conv/etc.
-
-        Args:
-            layer_idx: Layer index to inspect.
-
-        Returns:
-            Sorted list of hook names (e.g., ["hook_in", "hook_out", "attn.hook_q", ...]).
-        """
+        """Sorted hook names available on block `layer_idx` (block-relative paths)."""
         prefix = f"blocks.{layer_idx}."
         return sorted(name[len(prefix) :] for name in self.hook_dict if name.startswith(prefix))
 
     def block_submodules(self, layer_idx: int) -> List[str]:
-        """Return names of bridged submodules on a specific block.
-
-        Args:
-            layer_idx: Layer index to inspect.
-
-        Returns:
-            List of submodule names (e.g., ["ln1", "ln2", "attn", "mlp"]).
-        """
+        """Return bridged submodule names on block `layer_idx`."""
         block = self.blocks[layer_idx]
         return [name for name in block._modules if name not in _BLOCK_INTERNAL_MODULES]
 
     def layer_types(self) -> List[str]:
-        """Return a human-readable layer type for each block.
-
-        Inspects which bridged submodules are present on each block to infer
-        the layer type. For homogeneous models, all entries will be the same.
-        Variant submodule names are defined in
-        ``generalized_components.block.VARIANT_SUBMODULE_NAMES``.
-
-        Labels are deterministic: variants appear in VARIANT_SUBMODULE_NAMES
-        order, universals are sorted alphabetically.
-
-        Returns:
-            List of strings like ["attn+mlp", "ssm+mlp", "attn+mlp", ...].
-        """
+        """Per-block type labels, e.g. ["attn+mlp", "ssm+mlp", ...]. Deterministic order."""
         types = []
         for block in self.blocks:
-            # Variants in canonical order (tuple iteration = stable)
             variants = [n for n in VARIANT_SUBMODULE_NAMES if n in block._modules]
             universals = sorted(
                 n
@@ -1521,11 +1399,7 @@ def all_head_labels(self) -> list[str]:
 
     @property
     def attn_head_labels(self) -> list[str]:
-        """Labels for attention heads only, matching all_composition_scores() dimensions.
-
-        For homogeneous models, identical to all_head_labels. For hybrid models,
-        only includes heads from attention layers (skips SSM/linear-attn layers).
-        """
+        """Head labels for attention layers only — matches all_composition_scores() dims."""
         return [
             f"L{l}H{h}" for l in self.composition_layer_indices() for h in range(self.cfg.n_heads)
         ]
diff --git a/transformer_lens/model_bridge/component_setup.py b/transformer_lens/model_bridge/component_setup.py
index 79d2abc2a..a2986d585 100644
--- a/transformer_lens/model_bridge/component_setup.py
+++ b/transformer_lens/model_bridge/component_setup.py
@@ -100,23 +100,18 @@ def setup_submodules(
                 else:
                     remote_path = submodule.name
                     is_optional = getattr(submodule, "optional", False)
-                    # Fast path: if the first path segment is absent, skip
-                    # immediately. This catches the common hybrid case (e.g.,
-                    # "self_attn" absent on an SSM layer) without entering
-                    # get_remote_component.
+                    # Fast path: first segment absent → skip without entering get_remote_component
                     first_segment = remote_path.split(".")[0]
                     if is_optional and not hasattr(original_model, first_segment):
                         logger.debug(
-                            "Optional submodule '%s' (path '%s') absent on %s — skipping",
+                            "Optional '%s' (path '%s') absent on %s",
                             module_name,
                             remote_path,
-                            getattr(component, "name", "unknown"),
+                            getattr(component, "name", "?"),
                         )
                         skipped_optional.append(module_name)
-                        continue  # hybrid layer lacks this submodule; skip binding
-                    # Full resolution — also catches deeper path failures
-                    # (e.g., "self_attn.q_proj" where self_attn exists as a
-                    # stub but q_proj is missing).
+                        continue
+                    # Full resolution — catches deeper path failures (e.g. stub self_attn missing q_proj)
                     try:
                         original_subcomponent = architecture_adapter.get_remote_component(
                             original_model, remote_path
@@ -124,10 +119,10 @@ def setup_submodules(
                     except AttributeError:
                         if is_optional:
                             logger.debug(
-                                "Optional submodule '%s' (path '%s') partially absent on %s — skipping",
+                                "Optional '%s' (path '%s') partially absent on %s",
                                 module_name,
                                 remote_path,
-                                getattr(component, "name", "unknown"),
+                                getattr(component, "name", "?"),
                             )
                             skipped_optional.append(module_name)
                             continue
@@ -145,9 +140,7 @@ def setup_submodules(
             if not submodule.is_list_item and submodule.name is not None:
                 component.real_components[module_name] = (submodule.name, submodule)
 
-    # Remove skipped optional submodules from the template so that
-    # architecture_adapter traversal code (which reads .submodules) doesn't
-    # find them and try to resolve against the HF model.
+    # Drop skipped optionals from .submodules so architecture_adapter traversal won't resolve them
     for name in skipped_optional:
         component.submodules.pop(name, None)
 
diff --git a/transformer_lens/model_bridge/composition_scores.py b/transformer_lens/model_bridge/composition_scores.py
index 9073fddb2..617d49e99 100644
--- a/transformer_lens/model_bridge/composition_scores.py
+++ b/transformer_lens/model_bridge/composition_scores.py
@@ -1,28 +1,21 @@
-"""CompositionScores — tensor-like container for composition score results."""
+"""Tensor-like container for composition score results with layer-index metadata."""
 from typing import List
 
 import torch
 
 
 class CompositionScores:
-    """Composition scores bundled with layer-index metadata.
+    """Composition scores that behave like a tensor but carry layer-index metadata.
 
-    Behaves like a tensor for backward compatibility — indexing, .shape,
-    arithmetic, and ``torch.*`` namespace functions all delegate to the
-    underlying scores tensor via ``__torch_function__``. The additional
-    ``layer_indices`` and ``head_labels`` attributes provide metadata that
-    prevents silent misinterpretation of indices on hybrid models.
-
-    For hybrid models, the scores tensor has shape
-    (n_attn_layers, n_heads, n_attn_layers, n_heads) where n_attn_layers
-    may be less than n_layers. ``layer_indices`` maps tensor position i
+    Indexing, .shape, arithmetic, and tensor methods delegate to the underlying
+    ``scores`` tensor; torch.* functions unwrap it via ``__torch_function__``. On hybrid
+    models where n_attn_layers < n_layers, ``layer_indices`` maps tensor position i
     to the original layer number.
 
     Attributes:
         scores: Upper-triangular composition score tensor.
-        layer_indices: Original layer numbers for each position in scores.
-            E.g., [0, 2, 5] means position 0 = layer 0, position 1 = layer 2, etc.
-        head_labels: Labels like ["L0H0", "L0H1", "L2H0", ...] matching scores dims.
+        layer_indices: Original layer numbers, e.g. [0, 2, 5].
+        head_labels: Labels matching scores dims, e.g. ["L0H0", "L0H1", ...].
     """
 
     def __init__(self, scores: torch.Tensor, layer_indices: List[int], head_labels: List[str]):
@@ -30,14 +23,11 @@ def __init__(self, scores: torch.Tensor, layer_indices: List[int], head_labels:
         self.layer_indices = layer_indices
         self.head_labels = head_labels
 
-    # --- Tensor protocol ---
-
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
-        """Delegate torch.* calls (torch.isnan, torch.where, etc.) to .scores."""
+        """Unwrap CompositionScores args so torch.isnan, torch.where, etc. work."""
         if kwargs is None:
             kwargs = {}
-        # Unwrap any CompositionScores args to their underlying tensor
         unwrapped_args = tuple(a.scores if isinstance(a, CompositionScores) else a for a in args)
         unwrapped_kwargs = {
             k: v.scores if isinstance(v, CompositionScores) else v for k, v in kwargs.items()
@@ -56,16 +46,11 @@ def device(self) -> torch.device:
     def dtype(self) -> torch.dtype:
         return self.scores.dtype
 
-    # Python 3 automatically sets __hash__ = None when __eq__ is defined,
-    # making instances unhashable. No explicit __hash__ needed.
-
     def __getitem__(self, key):
         return self.scores[key]
 
     def __getattr__(self, name):
-        # Delegate tensor methods (.abs(), .sum(), .any(), etc.) to .scores.
-        # Guard against infinite recursion during pickling/unpickling where
-        # self.scores may not exist yet.
+        # Guard against recursion during pickle/deepcopy when self.scores isn't set yet
         try:
             scores = object.__getattribute__(self, "scores")
         except AttributeError:
diff --git a/transformer_lens/model_bridge/generalized_components/base.py b/transformer_lens/model_bridge/generalized_components/base.py
index 1af033efb..12be7b9c6 100644
--- a/transformer_lens/model_bridge/generalized_components/base.py
+++ b/transformer_lens/model_bridge/generalized_components/base.py
@@ -46,10 +46,7 @@ def __init__(
             hook_alias_overrides: Optional dictionary to override default hook aliases.
                 For example, {"hook_attn_out": "ln1_post.hook_out"} will make hook_attn_out
                 point to ln1_post.hook_out instead of the default value in self.hook_aliases.
-            optional: If True, this entire subtree may be absent on some layers.
-                When the remote model lacks this component, setup will skip it
-                cleanly instead of raising AttributeError. Used for hybrid
-                architectures where layers have structurally different submodules.
+            optional: If True, setup skips this subtree when absent (hybrid architectures).
         """
         super().__init__()
         self.name = name
diff --git a/transformer_lens/model_bridge/generalized_components/block.py b/transformer_lens/model_bridge/generalized_components/block.py
index 1005fd4f2..e6cd0d71f 100644
--- a/transformer_lens/model_bridge/generalized_components/block.py
+++ b/transformer_lens/model_bridge/generalized_components/block.py
@@ -15,19 +15,15 @@
     GeneralizedComponent,
 )
 
-# Submodule names that represent layer-type variants in hybrid architectures.
-# Used by layer_types() for classification and _get_block_variant_bias() for
-# bias accumulation.  Adapters that introduce new variant types should add
-# their submodule name here.  Ordered tuple for deterministic iteration
-# (matters when a block has multiple variants during development/testing).
+# Layer-type variant submodule names. Tuple for deterministic iteration order.
+# Extend here when adding new hybrid variant types.
 VARIANT_SUBMODULE_NAMES: tuple[str, ...] = ("attn", "linear_attn", "mamba", "mixer", "ssm")
 _VARIANT_SUBMODULE_SET: frozenset[str] = frozenset(VARIANT_SUBMODULE_NAMES)
 
-# Internal block modules excluded from submodule introspection (hook points
-# and the wrapped HF component are infrastructure, not user-facing submodules).
+# Infrastructure modules excluded from submodule introspection.
 _BLOCK_INTERNAL_MODULES: frozenset[str] = frozenset({"hook_in", "hook_out", "_original_component"})
 
-# Prefixes for normalization modules excluded from layer_types() labels.
+# Norm-module prefixes excluded from layer_types() labels.
 _NORM_PREFIXES: tuple[str, ...] = ("ln", "layer_norm", "norm", "rms")
 
 
diff --git a/transformer_lens/model_bridge/get_params_util.py b/transformer_lens/model_bridge/get_params_util.py
index f27e3a97f..acca83a4e 100644
--- a/transformer_lens/model_bridge/get_params_util.py
+++ b/transformer_lens/model_bridge/get_params_util.py
@@ -37,23 +37,7 @@ def _get_or_create_bias(bias, n_heads: int, d_head: int, device, dtype) -> torch
 
 
 def get_bridge_params(bridge) -> Dict[str, torch.Tensor]:
-    """Access to model parameters in the format expected by SVDInterpreter.
-
-    For hybrid architectures, only layers with attention get attention keys
-    (W_Q, W_K, etc.). Non-attention layers (SSM, linear-attention) are skipped
-    rather than filled with zeros — this prevents downstream consumers like
-    SVDInterpreter from treating synthetic zeros as real weights.
-
-    Args:
-        bridge: TransformerBridge instance
-
-    Returns:
-        dict: Dictionary of parameter tensors with TransformerLens naming convention.
-            For hybrid models, attention keys only exist for layers that have attention.
-
-    Raises:
-        ValueError: If configuration is inconsistent (e.g., cfg.n_layers != len(blocks))
-    """
+    """Model parameters in SVDInterpreter format. Skips attn keys for non-attention layers."""
     params_dict = {}
 
     def _get_device_dtype():
@@ -89,15 +73,11 @@ def _get_device_dtype():
             )
         block = bridge.blocks[layer_idx]
 
-        # Only extract attention params from blocks that have attention.
-        # Non-attention layers (SSM, linear-attention) are skipped entirely
-        # rather than filled with zeros — this prevents consumers like
-        # SVDInterpreter from treating synthetic zeros as real weights.
+        # Skip non-attention layers entirely (no zero-fill — prevents SVDInterpreter garbage)
         try:
             has_attn = "attn" in block._modules
         except (TypeError, AttributeError):
-            # Mock objects or non-nn.Module blocks: fall back to hasattr
-            has_attn = hasattr(block, "attn")
+            has_attn = hasattr(block, "attn")  # Mock fallback
         if has_attn:
             try:
                 w_q = block.attn.q.weight
diff --git a/transformer_lens/weight_processing.py b/transformer_lens/weight_processing.py
index 6f0489f21..1d219973a 100644
--- a/transformer_lens/weight_processing.py
+++ b/transformer_lens/weight_processing.py
@@ -1698,13 +1698,10 @@ def refactor_factored_attn_matrices(
             b_V_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_V", adapter)
             b_O_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_O", adapter)
 
-            # Skip layers without attention weights (hybrid architectures where
-            # some layers are SSM/linear-attention and lack Q/K/V/O entirely).
-            # Other weight-processing loops (center_writing_weights, fold_value_biases,
-            # fold_layer_norm) already guard with `if key in state_dict:` checks.
+            # Skip hybrid layers without attention (other loops already guard individually)
             if W_Q_key not in state_dict:
                 continue
-            # All four weight matrices must be present if Q is present
+            # If Q is present, K/V/O must be too
             for _required_key in [W_K_key, W_V_key, W_O_key]:
                 if _required_key not in state_dict:
                     raise ValueError(

From ad3764cb4ec590535ff5b7db27c539ddeeb44e6b Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Tue, 14 Apr 2026 23:17:40 -0500
Subject: [PATCH 3/8] Initial setup for proper handling of the Gated Delta Net
 Bridge

---
 tests/unit/test_qwen3_5_adapter.py            |  26 +-
 tests/unit/test_qwen3_next_adapter.py         |  27 +-
 .../generalized_components/__init__.py        |   3 +
 .../generalized_components/attention.py       |   7 +-
 .../generalized_components/gated_delta_net.py | 289 ++++++++++++++++++
 .../position_embeddings_attention.py          |  28 +-
 .../supported_architectures/granite.py        |   7 +-
 .../granite_moe_hybrid.py                     |  74 ++---
 .../supported_architectures/qwen3.py          | 169 +++++-----
 .../supported_architectures/qwen3_5.py        | 174 ++---------
 .../supported_architectures/qwen3_next.py     | 161 ++--------
 11 files changed, 536 insertions(+), 429 deletions(-)
 create mode 100644 transformer_lens/model_bridge/generalized_components/gated_delta_net.py

diff --git a/tests/unit/test_qwen3_5_adapter.py b/tests/unit/test_qwen3_5_adapter.py
index 8fd885174..1b9ac778c 100644
--- a/tests/unit/test_qwen3_5_adapter.py
+++ b/tests/unit/test_qwen3_5_adapter.py
@@ -134,18 +134,28 @@ def test_unembed_path(self, adapter):
     # ---- Block submodules ----
 
     def test_block_submodules_keys(self, adapter):
-        """blocks submodules must contain ln1, ln2, mlp but NOT attn.
+        """blocks submodules must contain ln1, ln2, mlp, and optional attn + linear_attn."""
+        submodules = adapter.component_mapping["blocks"].submodules
+        assert set(submodules.keys()) == {"ln1", "ln2", "mlp", "attn", "linear_attn"}
 
-        Critical correctness test: self_attn is absent on linear-attention
-        layers, so mapping attn as a block submodule would crash on those layers.
-        """
+    def test_attn_is_optional(self, adapter):
+        """attn must be marked optional (absent on linear-attention layers)."""
+        submodules = adapter.component_mapping["blocks"].submodules
+        assert submodules["attn"].optional is True
+
+    def test_linear_attn_is_optional(self, adapter):
+        """linear_attn must be marked optional (absent on full-attention layers)."""
         submodules = adapter.component_mapping["blocks"].submodules
-        assert set(submodules.keys()) == {"ln1", "ln2", "mlp"}
+        assert submodules["linear_attn"].optional is True
+
+    def test_linear_attn_bridge_type(self, adapter):
+        """linear_attn must be a GatedDeltaNetBridge."""
+        from transformer_lens.model_bridge.generalized_components.gated_delta_net import (
+            GatedDeltaNetBridge,
+        )
 
-    def test_no_attn_in_block_submodules(self, adapter):
-        """attn must NOT appear as a block submodule (hybrid architecture safety check)."""
         submodules = adapter.component_mapping["blocks"].submodules
-        assert "attn" not in submodules
+        assert isinstance(submodules["linear_attn"], GatedDeltaNetBridge)
 
     def test_ln1_path(self, adapter):
         """ln1 maps to input_layernorm."""
diff --git a/tests/unit/test_qwen3_next_adapter.py b/tests/unit/test_qwen3_next_adapter.py
index 1a2842e7b..516d7a8b5 100644
--- a/tests/unit/test_qwen3_next_adapter.py
+++ b/tests/unit/test_qwen3_next_adapter.py
@@ -135,19 +135,28 @@ def test_unembed_path(self, adapter):
     # ---- Block submodules ----
 
     def test_block_submodules_keys(self, adapter):
-        """blocks submodules must contain ln1, ln2, mlp but NOT attn.
+        """blocks submodules must contain ln1, ln2, mlp, and optional attn + linear_attn."""
+        submodules = adapter.component_mapping["blocks"].submodules
+        assert set(submodules.keys()) == {"ln1", "ln2", "mlp", "attn", "linear_attn"}
 
-        This is a critical correctness test: self_attn is absent on
-        linear-attention layers, so mapping attn as a block submodule
-        would crash on those layers.
-        """
+    def test_attn_is_optional(self, adapter):
+        """attn must be marked optional (absent on linear-attention layers)."""
+        submodules = adapter.component_mapping["blocks"].submodules
+        assert submodules["attn"].optional is True
+
+    def test_linear_attn_is_optional(self, adapter):
+        """linear_attn must be marked optional (absent on full-attention layers)."""
         submodules = adapter.component_mapping["blocks"].submodules
-        assert set(submodules.keys()) == {"ln1", "ln2", "mlp"}
+        assert submodules["linear_attn"].optional is True
+
+    def test_linear_attn_bridge_type(self, adapter):
+        """linear_attn must be a GatedDeltaNetBridge."""
+        from transformer_lens.model_bridge.generalized_components.gated_delta_net import (
+            GatedDeltaNetBridge,
+        )
 
-    def test_no_attn_in_block_submodules(self, adapter):
-        """attn must NOT appear as a block submodule (hybrid architecture safety check)."""
         submodules = adapter.component_mapping["blocks"].submodules
-        assert "attn" not in submodules
+        assert isinstance(submodules["linear_attn"], GatedDeltaNetBridge)
 
     def test_ln1_path(self, adapter):
         """ln1 maps to input_layernorm."""
diff --git a/transformer_lens/model_bridge/generalized_components/__init__.py b/transformer_lens/model_bridge/generalized_components/__init__.py
index fb789cc30..c2c7a121b 100644
--- a/transformer_lens/model_bridge/generalized_components/__init__.py
+++ b/transformer_lens/model_bridge/generalized_components/__init__.py
@@ -35,6 +35,9 @@
 from transformer_lens.model_bridge.generalized_components.alibi_joint_qkv_attention import (
     ALiBiJointQKVAttentionBridge,
 )
+from transformer_lens.model_bridge.generalized_components.gated_delta_net import (
+    GatedDeltaNetBridge,
+)
 from transformer_lens.model_bridge.generalized_components.gated_mlp import (
     GatedMLPBridge,
 )
diff --git a/transformer_lens/model_bridge/generalized_components/attention.py b/transformer_lens/model_bridge/generalized_components/attention.py
index 05d5e0982..2d73d7ed7 100644
--- a/transformer_lens/model_bridge/generalized_components/attention.py
+++ b/transformer_lens/model_bridge/generalized_components/attention.py
@@ -59,6 +59,7 @@ def __init__(
         requires_position_embeddings: bool = False,
         requires_attention_mask: bool = False,
         attention_mask_4d: bool = False,
+        **kwargs,
     ):
         """Initialize the attention bridge.
 
@@ -82,7 +83,11 @@ def __init__(
         if conversion_rule is None:
             conversion_rule = AttentionAutoConversion(config)
         super().__init__(
-            name, config=config, submodules=submodules or {}, conversion_rule=conversion_rule
+            name,
+            config=config,
+            submodules=submodules or {},
+            conversion_rule=conversion_rule,
+            **kwargs,
         )
         self.hook_attn_scores = HookPoint()
         self.hook_pattern = HookPoint()
diff --git a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
new file mode 100644
index 000000000..b62937dbd
--- /dev/null
+++ b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
@@ -0,0 +1,289 @@
+"""GatedDeltaNet bridge for Qwen3.5/Qwen3Next linear-attention layers.
+
+Reimplements forward (prefill only) to expose mech-interp-relevant intermediate
+states. Falls back to HF native forward during autoregressive generation where
+cache state management is required.
+"""
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+import torch
+import torch.nn.functional as F
+
+from transformer_lens.hook_points import HookPoint
+from transformer_lens.model_bridge.generalized_components.base import (
+    GeneralizedComponent,
+)
+
+if TYPE_CHECKING:
+    from transformer_lens.ActivationCache import ActivationCache
+
+
+class GatedDeltaNetBridge(GeneralizedComponent):
+    """Bridge for GatedDeltaNet linear-attention with full hook decomposition.
+
+    Hooks (prefill, in execution order):
+        hook_in: input hidden_states [batch, seq, d_model]
+        hook_q_pre_conv: Q after projection + split, before conv [batch, seq, n_k_heads, head_k_dim]
+        hook_k_pre_conv: K before conv [batch, seq, n_k_heads, head_k_dim]
+        hook_v_pre_conv: V before conv [batch, seq, n_v_heads, head_v_dim]
+        hook_conv_out: post-conv mixed QKV [batch, seq, key_dim*2 + value_dim]
+        hook_q: Q after conv, pre-GQA-expansion [batch, seq, n_k_heads, head_k_dim]
+        hook_k: K after conv [batch, seq, n_k_heads, head_k_dim]
+        hook_v: V after conv [batch, seq, n_v_heads, head_v_dim]
+        hook_beta: write strength (sigmoid of b), per v-head [batch, seq, n_v_heads]
+        hook_log_decay: log-space decay g (negative; actual decay = exp(g)), per v-head [batch, seq, n_v_heads]
+        hook_recurrence_out: output of linear recurrence kernel [batch, seq, n_v_heads, head_v_dim]
+        hook_gate_input: z tensor before silu gating in GatedRMSNorm [batch, seq, n_v_heads, head_v_dim]
+        hook_out: final output to residual stream [batch, seq, d_model]
+
+    If a non-None cache_params kwarg is passed (generation), only hook_in/hook_out fire.
+
+    Property aliases:
+        W_in_proj_qkvz, W_in_proj_ba, W_out_proj, A_log, dt_bias
+    """
+
+    hook_aliases = {
+        "hook_linear_attn_in": "hook_in",
+        "hook_linear_attn_out": "hook_out",
+    }
+
+    property_aliases = {
+        "W_in_proj_qkvz": "in_proj_qkvz.weight",
+        "W_in_proj_ba": "in_proj_ba.weight",
+        "W_out_proj": "out_proj.weight",
+        "A_log": "A_log",
+        "dt_bias": "dt_bias",
+    }
+
+    def __init__(
+        self,
+        name: str,
+        config: Optional[Any] = None,
+        submodules: Optional[Dict[str, GeneralizedComponent]] = None,
+        **kwargs,
+    ):
+        super().__init__(name, config=config, submodules=submodules or {}, **kwargs)
+        # Pre-conv hooks (after projection, before causal convolution mixes positions)
+        self.hook_q_pre_conv = HookPoint()
+        self.hook_k_pre_conv = HookPoint()
+        self.hook_v_pre_conv = HookPoint()
+        # Conv output
+        self.hook_conv_out = HookPoint()
+        # Post-conv hooks (pre-GQA-expansion, pre-recurrence)
+        self.hook_q = HookPoint()
+        self.hook_k = HookPoint()
+        self.hook_v = HookPoint()
+        # Gate parameters (per v-head)
+        self.hook_beta = HookPoint()
+        self.hook_log_decay = HookPoint()
+        # Recurrence output + gated norm input
+        self.hook_recurrence_out = HookPoint()
+        self.hook_gate_input = HookPoint()
+
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        if self.original_component is None:
+            raise RuntimeError(f"Original component not set for {self.name}.")
+
+        # Non-None cache_params (generation step) → delegate to HF with only input/output hooks
+        if kwargs.get("cache_params") is not None:
+            return self._native_forward(*args, **kwargs)
+        return self._hooked_forward(*args, **kwargs)
+
+    def _native_forward(self, *args: Any, **kwargs: Any) -> Any:
+        """Delegate to HF, firing hook_in/hook_out only; non-Tensor outputs skip hook_out."""
+        assert self.original_component is not None
+        if "hidden_states" in kwargs:
+            kwargs["hidden_states"] = self.hook_in(kwargs["hidden_states"])
+        elif len(args) > 0 and isinstance(args[0], torch.Tensor):
+            args = (self.hook_in(args[0]),) + args[1:]
+
+        output = self.original_component(*args, **kwargs)
+        if isinstance(output, torch.Tensor):
+            return self.hook_out(output)
+        return output
+
+    def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any:
+        """Reimplemented forward exposing all intermediate states (prefill)."""
+        hf: Any = self.original_component
+
+        if "hidden_states" in kwargs:
+            hidden_states = kwargs["hidden_states"]
+        elif len(args) > 0 and isinstance(args[0], torch.Tensor):
+            hidden_states = args[0]
+        else:
+            raise ValueError("Could not find hidden_states")
+
+        attention_mask = kwargs.get("attention_mask")
+        if attention_mask is not None:
+            from transformers.models.qwen3_next.modeling_qwen3_next import (
+                apply_mask_to_padding_states,
+            )
+
+            hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
+
+        hidden_states = self.hook_in(hidden_states)
+        batch_size, seq_len, _ = hidden_states.shape
+
+        # --- Projections ---
+        projected_qkvz = hf.in_proj_qkvz(hidden_states)
+        projected_ba = hf.in_proj_ba(hidden_states)
+
+        # Split into per-head Q, K, V, Z, beta_raw, alpha_raw
+        query, key, value, z, b, a = hf.fix_query_key_value_ordering(projected_qkvz, projected_ba)
+
+        # --- Pre-conv hooks (per-head shape, before conv mixes positions) ---
+        query = self.hook_q_pre_conv(query)
+        key = self.hook_k_pre_conv(key)
+        value = self.hook_v_pre_conv(value)
+
+        # Flatten for conv
+        query, key, value = (x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value))
+
+        # --- Causal Convolution ---
+        mixed_qkv = torch.cat((query, key, value), dim=-1).transpose(1, 2)
+        if hf.causal_conv1d_fn is not None:
+            mixed_qkv = hf.causal_conv1d_fn(
+                x=mixed_qkv,
+                weight=hf.conv1d.weight.squeeze(1),
+                bias=hf.conv1d.bias,
+                activation=hf.activation,
+                seq_idx=None,
+            )
+        else:  # causal_conv1d_fn is None: plain conv1d cropped to seq_len, then SiLU
+            mixed_qkv = F.silu(hf.conv1d(mixed_qkv)[:, :, :seq_len])
+        mixed_qkv = mixed_qkv.transpose(1, 2)
+
+        mixed_qkv = self.hook_conv_out(mixed_qkv)
+
+        # Split post-conv
+        query, key, value = torch.split(
+            mixed_qkv,
+            [hf.key_dim, hf.key_dim, hf.value_dim],
+            dim=-1,
+        )
+        query = query.reshape(batch_size, seq_len, -1, hf.head_k_dim)
+        key = key.reshape(batch_size, seq_len, -1, hf.head_k_dim)
+        value = value.reshape(batch_size, seq_len, -1, hf.head_v_dim)
+
+        # --- Post-conv hooks (pre-GQA-expansion, pre-recurrence) ---
+        query = self.hook_q(query)
+        key = self.hook_k(key)
+        value = self.hook_v(value)
+
+        # --- Gate parameters (per v-head) ---
+        beta = self.hook_beta(b.sigmoid())
+
+        # g is log-space decay (negative); actual multiplicative decay = exp(g)
+        g = -hf.A_log.float().exp() * F.softplus(a.float() + hf.dt_bias)
+        g = self.hook_log_decay(g)
+
+        # GQA expansion (Q/K from n_k_heads → n_v_heads)
+        if hf.num_v_heads // hf.num_k_heads > 1:
+            repeat = hf.num_v_heads // hf.num_k_heads
+            query = query.repeat_interleave(repeat, dim=2)
+            key = key.repeat_interleave(repeat, dim=2)
+
+        # --- Core linear recurrence (opaque fused kernel) ---
+        core_out, _ = hf.chunk_gated_delta_rule(
+            query,
+            key,
+            value,
+            g=g,
+            beta=beta,
+            initial_state=None,
+            output_final_state=False,
+            use_qk_l2norm_in_kernel=True,
+        )
+        core_out = self.hook_recurrence_out(core_out)
+
+        # --- Gated RMSNorm: norm(core_out) * silu(z) ---
+        z = self.hook_gate_input(z)
+        z_shape = z.shape
+        core_out = hf.norm(
+            core_out.reshape(-1, core_out.shape[-1]),
+            z.reshape(-1, z.shape[-1]),
+        )
+        core_out = core_out.reshape(z_shape).reshape(batch_size, seq_len, -1)
+
+        # --- Output projection ---
+        output = hf.out_proj(core_out)
+        return self.hook_out(output)
+
+    def compute_effective_attention(
+        self,
+        cache: "ActivationCache",
+        layer_idx: int,
+    ) -> torch.Tensor:
+        """Materialize the effective attention matrix from cached hook values.
+
+        The recurrence assumed by this reconstruction is:
+            S_t = exp(g_t) * S_{t-1} + beta_t * v_t @ k_t^T
+            o_t = S_t^T @ q_t
+
+        The effective attention M[i,j] = contribution of input j to output i:
+            M[i,j] = (q_i^T @ k_j) * beta_j * prod_{t=j+1}^{i} exp(g_t)
+
+        Note: the fused kernel applies L2-normalization to Q and K internally
+        (use_qk_l2norm_in_kernel=True) but the hooked Q/K are pre-normalization,
+        so this is approximate. NOTE(review): any delta-rule correction or query
+        scaling applied inside the kernel is not modelled either — confirm vs. HF.
+
+        Args:
+            cache: ActivationCache from run_with_cache with this layer's q/k/beta/log_decay hooks.
+            layer_idx: Block index for this linear_attn layer (used to build the cache keys).
+
+        Returns:
+            [batch, n_v_heads, seq, seq] causal attention matrix; upper triangle (j > i)
+            is zero. Cost is O(batch * n_heads * seq^2); use on short sequences.
+
+        Raises RuntimeError when any required hook value is missing from ``cache``.
+        """
+        prefix = f"blocks.{layer_idx}.linear_attn"
+        q_key = f"{prefix}.hook_q"
+        k_key = f"{prefix}.hook_k"
+        beta_key = f"{prefix}.hook_beta"
+        decay_key = f"{prefix}.hook_log_decay"
+
+        for key in [q_key, k_key, beta_key, decay_key]:
+            if key not in cache:
+                raise RuntimeError(
+                    f"compute_effective_attention needs {key!r} in cache. "
+                    "Run run_with_cache() on the bridge first."
+                )
+
+        # [batch, seq, n_k_heads, head_k_dim] — pre-GQA-expansion
+        q = cache[q_key].float()
+        k = cache[k_key].float()
+        beta = cache[beta_key].float()  # [batch, seq, n_v_heads]
+        g = cache[decay_key].float()  # [batch, seq, n_v_heads]
+
+        # GQA expansion to match n_v_heads
+        if q.shape[2] < beta.shape[-1]:
+            repeat = beta.shape[-1] // q.shape[2]
+            q = q.repeat_interleave(repeat, dim=2)
+            k = k.repeat_interleave(repeat, dim=2)
+
+        batch, seq, n_heads, d_head = q.shape  # only seq is used below
+
+        # QK similarity: [batch, n_heads, seq_i, seq_j]
+        q_perm = q.permute(0, 2, 1, 3)  # [batch, n_heads, seq, d_head]
+        k_perm = k.permute(0, 2, 1, 3)
+        qk = torch.matmul(q_perm, k_perm.transpose(-2, -1))  # [batch, n_heads, seq, seq]
+
+        # Cumulative decay: L[i,j] = prod_{t=j+1}^{i} exp(g_t) = exp(sum g[j+1..i])
+        # g is [batch, seq, n_heads] → cumsum along seq
+        g_perm = g.permute(0, 2, 1)  # [batch, n_heads, seq]
+        cumsum_g = torch.cumsum(g_perm, dim=-1)
+        # L_log[i,j] = cumsum[i] - cumsum[j]
+        L_log = cumsum_g[:, :, :, None] - cumsum_g[:, :, None, :]
+
+        causal_mask = torch.tril(torch.ones(seq, seq, dtype=torch.bool, device=q.device))
+        L = torch.where(causal_mask[None, None], torch.exp(L_log), torch.zeros_like(L_log))
+
+        # Beta broadcast: [batch, n_heads, 1, seq_j]
+        beta_col = beta.permute(0, 2, 1)[:, :, None, :]
+
+        # M[i,j] = qk[i,j] * beta[j] * L[i,j]
+        M = qk * beta_col * L
+
+        return M
diff --git a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py
index 33f6dd21f..ad17c38a6 100644
--- a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py
+++ b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py
@@ -15,6 +15,7 @@
 import torch
 import transformers.models.gemma2.modeling_gemma2 as gemma2_module
 
+from transformer_lens.hook_points import HookPoint
 from transformer_lens.model_bridge.generalized_components.attention import (
     AttentionBridge,
 )
@@ -127,6 +128,8 @@ def __init__(
         kwargs["maintain_native_attention"] = True
         super().__init__(name, config, submodules, **kwargs)
         self._init_position_embedding_hooks()
+        if getattr(config, "gated_q_proj", False):
+            self.hook_q_gate = HookPoint()
 
     def set_original_component(self, component: torch.nn.Module) -> None:
         """Set the original HF component and register for rotary hook firing.
@@ -201,19 +204,34 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         key_states = hf_attn.k_proj(hidden_states)
         value_states = hf_attn.v_proj(hidden_states)
 
+        # Gated q_proj (Qwen3.5/Qwen3Next): q_proj outputs [Q|gate] interleaved
+        # per head. cfg.gated_q_proj is set by the adapter. The actual split only
+        # triggers if the output is 2x the standard width (n_heads * head_dim).
+        # In processed mode, preprocess_weights slices q_proj to standard width
+        # so this naturally passes through.
+        q_gate = None
+        if getattr(self.config, "gated_q_proj", False):
+            q_dim = query_states.shape[-1]
+            n_heads = getattr(self.config, "n_heads", q_dim // head_dim)
+            standard_q_dim = n_heads * head_dim
+            if q_dim == standard_q_dim * 2:
+                query_states, q_gate = torch.chunk(
+                    query_states.view(*input_shape, -1, head_dim * 2), 2, dim=-1
+                )
+                q_gate = q_gate.reshape(*input_shape, -1)
+                query_states = query_states.reshape(*input_shape, -1)
+
         has_q_norm = hasattr(hf_attn, "q_norm") and hf_attn.q_norm is not None
         has_k_norm = hasattr(hf_attn, "k_norm") and hf_attn.k_norm is not None
         applied_pre_reshape_norm = False
 
         if has_q_norm:
             try:
-                # Try pre-reshape norm (OLMo 2 style: norm on flat [batch, seq, hidden])
                 query_states = hf_attn.q_norm(query_states)
                 if has_k_norm:
                     key_states = hf_attn.k_norm(key_states)
                 applied_pre_reshape_norm = True
             except RuntimeError:
-                # Shape mismatch — this model uses post-reshape norms
                 pass
 
         query_states = query_states.view(hidden_shape).transpose(1, 2)
@@ -306,6 +324,12 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(*input_shape, -1)
 
+        # --- Gated attention (Qwen3.5/Qwen3Next) ---
+        if q_gate is not None:
+            if hasattr(self, "hook_q_gate"):
+                q_gate = self.hook_q_gate(q_gate)
+            attn_output = attn_output * torch.sigmoid(q_gate)
+
         # --- Output Projection ---
         # Different architectures name this differently: o_proj (Llama, Gemma, Qwen),
         # dense (Phi), out_proj (others)
diff --git a/transformer_lens/model_bridge/supported_architectures/granite.py b/transformer_lens/model_bridge/supported_architectures/granite.py
index f85ef850d..fbb911796 100644
--- a/transformer_lens/model_bridge/supported_architectures/granite.py
+++ b/transformer_lens/model_bridge/supported_architectures/granite.py
@@ -65,11 +65,12 @@ def _setup_common_config(self, cfg: Any) -> None:
             self.default_config["n_key_value_heads"] = cfg.n_key_value_heads
             self.cfg.n_key_value_heads = cfg.n_key_value_heads
 
-    def _build_attention_bridge(self) -> PositionEmbeddingsAttentionBridge:
+    def _build_attention_bridge(self, optional: bool = False) -> PositionEmbeddingsAttentionBridge:
         """Build the standard Granite attention bridge."""
         return PositionEmbeddingsAttentionBridge(
             name="self_attn",
             config=self.cfg,
+            optional=optional,
             submodules={
                 "q": LinearBridge(name="q_proj"),
                 "k": LinearBridge(name="k_proj"),
@@ -124,11 +125,11 @@ def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> No
 
         if bridge_model is not None and hasattr(bridge_model, "blocks"):
             for block in bridge_model.blocks:
-                if hasattr(block, "attn"):
+                if "attn" in block._modules:
                     block.attn.set_rotary_emb(rotary_emb)
 
         try:
             attn_bridge = self.get_generalized_component("blocks.0.attn")
             attn_bridge.set_rotary_emb(rotary_emb)
-        except (AttributeError, KeyError):
+        except (AttributeError, KeyError, ValueError):
             pass
diff --git a/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py b/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py
index 2c776365b..53229252e 100644
--- a/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py
+++ b/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py
@@ -1,13 +1,11 @@
 """Granite MoE Hybrid architecture adapter.
 
-GraniteMoeHybridForCausalLM is a hybrid Mamba + Attention architecture with
-Sparse Mixture of Experts. Layers alternate between Mamba SSM blocks and
-standard attention blocks, with a shared MLP and optional sparse MoE on
-every layer.
-
-Since self_attn is None on Mamba layers and mamba is None on attention
-layers, we only map submodules that exist on ALL layers (norms, shared_mlp,
-block_sparse_moe). The HF native forward handles mamba/attention dispatch.
+Hybrid Mamba2 + Attention with Sparse MoE. Most layers are Mamba SSM blocks;
+a few are standard attention (determined by config.layer_types). Every layer
+has a shared MLP and optional sparse MoE.
+
+Both attention and Mamba are mapped as optional — each present only on its
+respective layer type. Mamba hooks expose in_proj, conv1d, and inner_norm.
 """
 
 from typing import Any
@@ -21,53 +19,55 @@
     MoEBridge,
     RMSNormalizationBridge,
     RotaryEmbeddingBridge,
+    SSM2MixerBridge,
     UnembeddingBridge,
 )
+from transformer_lens.model_bridge.generalized_components.depthwise_conv1d import (
+    DepthwiseConv1DBridge,
+)
 from transformer_lens.model_bridge.supported_architectures.granite import (
     GraniteArchitectureAdapter,
 )
 
 
 class GraniteMoeHybridArchitectureAdapter(GraniteArchitectureAdapter):
-    """Architecture adapter for IBM Granite MoE Hybrid models.
-
-    Hybrid Mamba2 + Attention architecture with Sparse MoE. Most layers are Mamba
-    SSM blocks; a few are standard attention (determined by config.layer_types).
+    """Hybrid Mamba2 + Attention with Sparse MoE.
 
-    Since self_attn is None on Mamba layers and mamba is None on attention layers,
-    we only map submodules present on ALL layers (norms, shared_mlp, MoE). The HF
-    native forward handles mamba/attention dispatch internally.
-
-    Hook coverage:
-    - Block-level: hook_resid_pre, hook_resid_post on every layer
-    - Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm)
-    - MLP: shared_mlp input/output hooks
-    - MoE: block_sparse_moe input/output and router_scores hooks
-    - Attention/Mamba internals are NOT individually hooked (conditional per layer)
+    Attention and Mamba are optional (each only on its own layer type). shared_mlp and
+    MoE are universal. Inherits Granite config and attention bridge construction.
     """
 
     def __init__(self, cfg: Any) -> None:
-        """Initialize the Granite MoE Hybrid architecture adapter."""
-        # Call ArchitectureAdapter.__init__ directly, not GraniteArchitectureAdapter.__init__,
-        # because we need to customize the setup sequence
         ArchitectureAdapter.__init__(self, cfg)
-
         self._setup_common_config(cfg)
 
-        # Hybrid may use "rope" or "nope" (no positional embeddings)
         pos_emb_type = getattr(cfg, "position_embedding_type", "rope")
         if pos_emb_type != "rope":
             self.cfg.positional_embedding_type = "none"
 
-        # No attention weight conversions — attn Q/K/V aren't mapped as submodules
+        self.supports_fold_ln = False
         self.weight_processing_conversions = {}
         self.component_mapping = self._build_component_mapping()
 
+    def _build_mamba_bridge(self) -> SSM2MixerBridge:
+        """Mamba-2 mixer bridge (optional per layer) with in_proj, conv1d, inner_norm hooks."""
+        return SSM2MixerBridge(
+            name="mamba",
+            config=self.cfg,
+            optional=True,
+            submodules={
+                "in_proj": LinearBridge(name="in_proj"),
+                "conv1d": DepthwiseConv1DBridge(name="conv1d"),
+                "inner_norm": LinearBridge(name="norm"),  # TODO confirm: norm as LinearBridge
+            },
+        )
+
     def _build_component_mapping(self) -> dict:
-        """Build component mapping with only universal (all-layer) submodules."""
-        block_submodules = {
+        block_submodules: dict = {
             "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
             "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
+            "attn": self._build_attention_bridge(optional=True),
+            "mamba": self._build_mamba_bridge(),
             "shared_mlp": MLPBridge(
                 name="shared_mlp",
                 config=self.cfg,
@@ -87,12 +87,9 @@ def _build_component_mapping(self) -> dict:
                 config=self.cfg,
             )
 
-        mapping = {
+        mapping: dict = {
             "embed": EmbeddingBridge(name="model.embed_tokens"),
-            "blocks": BlockBridge(
-                name="model.layers",
-                submodules=block_submodules,
-            ),
+            "blocks": BlockBridge(name="model.layers", submodules=block_submodules),
             "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
             "unembed": UnembeddingBridge(name="lm_head", config=self.cfg),
         }
@@ -101,10 +98,3 @@ def _build_component_mapping(self) -> dict:
             mapping["rotary_emb"] = RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg)
 
         return mapping
-
-    def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
-        """No-op for hybrid models.
-
-        Hybrid models don't map attention as a submodule (it's conditional per
-        layer), so there are no rotary embedding references to set up.
-        """
diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3.py b/transformer_lens/model_bridge/supported_architectures/qwen3.py
index 8dcc1d6d3..e37b44795 100644
--- a/transformer_lens/model_bridge/supported_architectures/qwen3.py
+++ b/transformer_lens/model_bridge/supported_architectures/qwen3.py
@@ -1,7 +1,14 @@
-"""Qwen3 architecture adapter."""
+"""Qwen3 architecture adapter.
+
+Base adapter for the Qwen3 model family. Provides shared config setup,
+attention bridge construction, and setup_component_testing used by
+Qwen3, Qwen3.5, and Qwen3Next variants.
+"""
 
 from typing import Any
 
+import torch
+
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
 from transformer_lens.model_bridge.generalized_components import (
     BlockBridge,
@@ -12,33 +19,29 @@
     RotaryEmbeddingBridge,
     UnembeddingBridge,
 )
+from transformer_lens.model_bridge.generalized_components.gated_delta_net import (
+    GatedDeltaNetBridge,
+)
 from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import (
     PositionEmbeddingsAttentionBridge,
 )
 
 
 class Qwen3ArchitectureAdapter(ArchitectureAdapter):
-    """Architecture adapter for Qwen3 models.
-
-    Qwen3 is architecturally similar to Gemma3:
-    - Uses RMSNorm for all normalizations
-    - Has Q/K normalization within attention (RMSNorm on head dimension)
-    - Uses rotary position embeddings (RoPE)
-    - Requires position_embeddings and attention_mask in forward pass
-    - Uses gated MLP (gate_proj + up_proj -> down_proj)
-    - No biases on any linear layers
-
-    Key differences from Qwen2:
-    - Qwen3 has q_norm and k_norm layers in attention (Qwen2 doesn't)
-    - Qwen3 requires position_embeddings parameter (like Gemma3)
-    - Uses PositionEmbeddingsAttentionBridge instead of AttentionBridge
+    """Architecture adapter for Qwen3 dense models.
+
+    RMSNorm, RoPE, GQA, Q/K head norms, gated MLP. No biases.
+    Serves as base class for Qwen3.5 and Qwen3Next hybrid variants.
     """
 
     def __init__(self, cfg: Any) -> None:
-        """Initialize the Qwen3 architecture adapter."""
         super().__init__(cfg)
+        self._setup_qwen3_config(cfg)
+        self.weight_processing_conversions = {**self._qkvo_weight_conversions()}
+        self.component_mapping = self._build_component_mapping()
 
-        # Set config variables for weight processing
+    def _setup_qwen3_config(self, cfg: Any) -> None:
+        """Config shared across all Qwen3 variants (dense, hybrid, MoE)."""
         self.cfg.normalization_type = "RMS"
         self.cfg.positional_embedding_type = "rotary"
         self.cfg.final_rms = True
@@ -46,85 +49,101 @@ def __init__(self, cfg: Any) -> None:
         self.cfg.attn_only = False
         self.cfg.uses_rms_norm = True
         self.cfg.default_prepend_bos = False
-
-        # Use eager attention to support output_attentions for hook_attn_scores and hook_pattern
-        # SDPA doesn't support output_attentions, which is required for HookedTransformer compatibility
         self.cfg.attn_implementation = "eager"
 
-        self.weight_processing_conversions = {
-            **self._qkvo_weight_conversions(),
+        if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None:
+            self.cfg.n_key_value_heads = cfg.n_key_value_heads
+
+    def _build_attention_bridge(self, optional: bool = False) -> PositionEmbeddingsAttentionBridge:
+        """Standard Qwen3 attention bridge with Q/K norms."""
+        return PositionEmbeddingsAttentionBridge(
+            name="self_attn",
+            config=self.cfg,
+            optional=optional,
+            submodules={
+                "q": LinearBridge(name="q_proj"),
+                "k": LinearBridge(name="k_proj"),
+                "v": LinearBridge(name="v_proj"),
+                "o": LinearBridge(name="o_proj"),
+                "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg),
+                "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg),
+            },
+        )
+
+    def _build_mlp_bridge(self):
+        """Dense gated MLP (gate_proj + up_proj -> down_proj). Override for MoE."""
+        return GatedMLPBridge(
+            name="mlp",
+            config=self.cfg,
+            submodules={
+                "gate": LinearBridge(name="gate_proj"),
+                "in": LinearBridge(name="up_proj"),
+                "out": LinearBridge(name="down_proj"),
+            },
+        )
+
+    def _build_linear_attn_bridge(self, optional: bool = False) -> GatedDeltaNetBridge:
+        """GatedDeltaNet linear-attention bridge for hybrid variants."""
+        return GatedDeltaNetBridge(
+            name="linear_attn",
+            config=self.cfg,
+            optional=optional,
+        )
+
+    def _build_component_mapping(self, *, hybrid: bool = False) -> dict:
+        """Component mapping. hybrid=True makes attn optional and adds optional linear_attn."""
+        block_submodules: dict = {
+            "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
+            "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
+            "attn": self._build_attention_bridge(optional=hybrid),
+            "mlp": self._build_mlp_bridge(),
         }
-
-        # Set up component mapping
-        self.component_mapping = {
+        if hybrid:
+            block_submodules["linear_attn"] = self._build_linear_attn_bridge(optional=True)
+        return {
             "embed": EmbeddingBridge(name="model.embed_tokens"),
             "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg),
-            "blocks": BlockBridge(
-                name="model.layers",
-                submodules={
-                    "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
-                    "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
-                    "attn": PositionEmbeddingsAttentionBridge(
-                        name="self_attn",
-                        config=self.cfg,
-                        submodules={
-                            "q": LinearBridge(name="q_proj"),
-                            "k": LinearBridge(name="k_proj"),
-                            "v": LinearBridge(name="v_proj"),
-                            "o": LinearBridge(name="o_proj"),
-                            "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg),
-                            "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg),
-                        },
-                    ),
-                    "mlp": GatedMLPBridge(
-                        name="mlp",
-                        config=self.cfg,
-                        submodules={
-                            "gate": LinearBridge(name="gate_proj"),
-                            "in": LinearBridge(name="up_proj"),
-                            "out": LinearBridge(name="down_proj"),
-                        },
-                    ),
-                },
-            ),
+            "blocks": BlockBridge(name="model.layers", submodules=block_submodules),
             "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
             "unembed": UnembeddingBridge(name="lm_head"),
         }
 
     def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
-        """Set up rotary embedding references for Qwen3 component testing.
-
-        Qwen3 uses RoPE (Rotary Position Embeddings). We set the rotary_emb on
-        all attention bridge instances for component testing.
-
-        We also force the HF model to use "eager" attention to match the bridge's
-        implementation. The bridge uses "eager" to support output_attentions for hooks.
-
-        Args:
-            hf_model: The HuggingFace Qwen3 model instance
-            bridge_model: The TransformerBridge model (if available, set rotary_emb on actual instances)
-        """
-        # Get rotary embedding instance from the model
+        """Set eager attn on HF model and rotary_emb on attention bridges."""
         rotary_emb = hf_model.model.rotary_emb
 
-        # Force HF model to use "eager" attention to match bridge implementation
-        # Bridge uses "eager" to support output_attentions for hook compatibility
         if hasattr(hf_model, "config") and hasattr(hf_model.config, "_attn_implementation"):
             hf_model.config._attn_implementation = "eager"
 
-        # Also set on all attention layers
         if hasattr(hf_model, "model") and hasattr(hf_model.model, "layers"):
             for layer in hf_model.model.layers:
                 if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "config"):
                     layer.self_attn.config._attn_implementation = "eager"
 
-        # Set rotary_emb on actual bridge instances in bridge_model if available
         if bridge_model is not None and hasattr(bridge_model, "blocks"):
-            # Set on each layer's actual attention bridge instance
             for block in bridge_model.blocks:
-                if hasattr(block, "attn"):
+                if "attn" in block._modules:
                     block.attn.set_rotary_emb(rotary_emb)
 
-        # Also set on the template for get_generalized_component() calls
-        attn_bridge = self.get_generalized_component("blocks.0.attn")
-        attn_bridge.set_rotary_emb(rotary_emb)
+        # Set on template for get_generalized_component() calls
+        try:
+            attn_template = self.get_generalized_component("blocks.0.attn")
+            attn_template.set_rotary_emb(rotary_emb)
+        except ValueError:
+            pass  # hybrid adapter with no attn in template
+
+    @staticmethod
+    def _preprocess_gated_q_proj(
+        state_dict: dict[str, torch.Tensor], n_heads: int, d_head: int
+    ) -> dict[str, torch.Tensor]:
+        """Slice query half from gated q_proj.weight (interleaved per-head layout).
+
+        Each head owns d_head query rows then d_head gate rows. Rewrites matching
+        entries of state_dict in place and returns it; sliced weights are skipped.
+        """
+        for key in [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")]:
+            w = state_dict[key]
+            if w.shape[0] != n_heads * d_head:  # already query-only: nothing to slice
+                w = w.reshape(n_heads, d_head * 2, w.shape[-1])  # errors on a bad shape
+                state_dict[key] = w[:, :d_head, :].reshape(n_heads * d_head, -1)
+        return state_dict
diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
index b1e71e9f3..a7c484eee 100644
--- a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
+++ b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
@@ -1,24 +1,8 @@
-"""Qwen3_5 architecture adapter.
+"""Qwen3.5 architecture adapter.
 
-Qwen3_5ForCausalLM is a hybrid linear-attention + full-attention architecture
-with a dense gated MLP on every layer. Layers follow a repeating pattern of
-3 GatedDeltaNet (linear attention) layers followed by 1 standard full-attention
-layer (every 4th layer by default).
-
-Since self_attn is absent on linear-attention layers, we only map submodules
-that exist on ALL layers (norms, MLP). The HF native forward handles
-linear/full attention dispatch internally, and GatedMLPBridge maps the dense
-gate_proj/up_proj/down_proj structure on every layer.
-
-Hook coverage:
-- Block-level: hook_resid_pre, hook_resid_post on every layer
-- Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm)
-- MLP: hook_in, hook_out via GatedMLPBridge (gate_proj, up_proj, down_proj)
-- Attention internals are NOT individually hooked (self_attn absent on
-  linear-attention layers; mapping it would crash on those layers)
-
-Optional parameters:
-- n_key_value_heads: only set when using GQA (num_key_value_heads != num_attention_heads)
+Hybrid linear-attention (GatedDeltaNet) + full-attention with dense gated MLP.
+3 linear-attn layers per 1 full-attn layer. Extends Qwen3 base with
+optional attention mapping and fold_ln disabled.
 """
 
 from typing import Any
@@ -26,150 +10,46 @@
 import torch
 
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
-from transformer_lens.model_bridge.generalized_components import (
-    BlockBridge,
-    EmbeddingBridge,
-    GatedMLPBridge,
-    LinearBridge,
-    RMSNormalizationBridge,
-    RotaryEmbeddingBridge,
-    UnembeddingBridge,
+from transformer_lens.model_bridge.supported_architectures.qwen3 import (
+    Qwen3ArchitectureAdapter,
 )
 
 
-class Qwen3_5ArchitectureAdapter(ArchitectureAdapter):
-    """Architecture adapter for Qwen3_5 models.
-
-    Qwen3_5ForCausalLM is a hybrid linear-attention + full-attention
-    architecture with dense gated MLPs, sharing the same hybrid design as
-    Qwen3Next but replacing the sparse MoE MLP with a standard dense MLP:
-    - Uses RMSNorm for all normalizations
-    - Uses rotary position embeddings (RoPE) with partial rotation
-    - Every 4th layer is a full-attention layer (self_attn); the rest are
-      GatedDeltaNet linear-attention layers (linear_attn)
-    - Uses dense gated MLP (gate_proj + up_proj -> down_proj) on ALL layers
-    - No biases on any linear layers
-    - Full-attention layers have Q/K normalization (q_norm, k_norm)
-    - Full-attention q_proj outputs n_heads * head_dim * 2 (interleaved
-      query+gate layout); the preprocess_weights method slices the query half
-
-    Since self_attn is absent on linear-attention layers, only universally
-    present submodules (norms, MLP) are mapped as block submodules. The HF
-    native forward handles per-layer attention dispatch internally.
+class Qwen3_5ArchitectureAdapter(Qwen3ArchitectureAdapter):
+    """Hybrid linear-attention + full-attention with dense gated MLP.
 
-    Optional parameters:
-    - n_key_value_heads: set when num_key_value_heads != num_attention_heads (GQA)
+    Inherits Qwen3 config/attention/MLP structure. Differences:
+    - supports_fold_ln = False (LN target varies by layer type)
+    - Attention is optional (absent on linear-attention layers)
+    - Gated q_proj (2x wide) requires preprocess_weights slicing
+    - No weight_processing_conversions until attn is fully wired
     """
 
     def __init__(self, cfg: Any) -> None:
-        """Initialize the Qwen3_5 architecture adapter."""
-        super().__init__(cfg)
-
-        # Core config attributes
-        self.cfg.normalization_type = "RMS"
-        self.cfg.positional_embedding_type = "rotary"
-        self.cfg.final_rms = True
-        self.cfg.gated_mlp = True
-        self.cfg.attn_only = False
-        self.cfg.uses_rms_norm = True
-        self.cfg.default_prepend_bos = False
-
-        # Disable fold_ln: ln1 is followed by self_attn on full-attention
-        # layers and by linear_attn (GatedDeltaNet) on linear-attention layers,
-        # but neither is mapped as a bridge submodule (see class docstring for
-        # why). With no bridge-mapped target to fold into, the standard fold_ln
-        # pass leaves LN weights in an inconsistent state and the processed
-        # bridge output diverges from the unprocessed / HF output. Skipping
-        # fold_ln keeps processed-mode forward passes numerically equivalent.
+        # Bypass Qwen3ArchitectureAdapter.__init__: it would build the dense mapping.
+        ArchitectureAdapter.__init__(self, cfg)
+        self._setup_qwen3_config(cfg)
         self.supports_fold_ln = False
-
-        # Use eager attention to support output_attentions for hook_attn_scores
-        # and hook_pattern. SDPA doesn't support output_attentions.
-        self.cfg.attn_implementation = "eager"
-
-        # GQA: only set n_key_value_heads when using grouped-query attention
-        if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None:
-            self.cfg.n_key_value_heads = cfg.n_key_value_heads
-
+        setattr(self.cfg, "gated_q_proj", True)  # q_proj outputs [Q|gate] interleaved per head
         self.weight_processing_conversions: dict = {}
-        self.component_mapping: dict = {
-            "embed": EmbeddingBridge(name="model.embed_tokens"),
-            "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg),
-            "blocks": BlockBridge(
-                name="model.layers",
-                submodules={
-                    "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
-                    "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
-                    # Dense gated MLP present on every layer (unlike Qwen3Next's MoE).
-                    # gate_proj + up_proj feed into down_proj via SwiGLU activation.
-                    "mlp": GatedMLPBridge(
-                        name="mlp",
-                        config=self.cfg,
-                        submodules={
-                            "gate": LinearBridge(name="gate_proj"),
-                            "in": LinearBridge(name="up_proj"),
-                            "out": LinearBridge(name="down_proj"),
-                        },
-                    ),
-                },
-            ),
-            "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
-            "unembed": UnembeddingBridge(name="lm_head"),
-        }
+        self.component_mapping = self._build_component_mapping(hybrid=True)
 
     def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
-        """Swap the multimodal Qwen3_5Config for its text-only Qwen3_5TextConfig.
-
-        Published Qwen3.5 checkpoints (e.g. Qwen/Qwen3.5-0.8B) carry
-        model_type='qwen3_5' and architectures=['Qwen3_5ForConditionalGeneration'].
-        AutoModelForCausalLM would load the full VLM (Qwen3_5ForConditionalGeneration)
-        with its vision tower, wasting memory and failing the bridge.
+        """Swap multimodal Qwen3_5Config for text-only Qwen3_5TextConfig.
 
-        Instead we replace model_kwargs['config'] with the nested text_config so
-        AutoModelForCausalLM loads Qwen3_5ForCausalLM (text only).
+        Published checkpoints carry architectures=['Qwen3_5ForConditionalGeneration'].
+        We replace config with text_config so AutoModelForCausalLM loads the
+        text-only Qwen3_5ForCausalLM.
         """
         config = model_kwargs.get("config")
         if config is not None and hasattr(config, "text_config"):
             model_kwargs["config"] = config.text_config
 
-    def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
-        """No-op for hybrid models.
-
-        Hybrid models don't map attention as a block submodule (self_attn is
-        absent on linear-attention layers), so there are no rotary embedding
-        references to set up.
-
-        Note: to find which layers are full_attention at runtime, use:
-            layer_types = getattr(hf_model.config, "layer_types", [])
-            first_full_attn_idx = next(
-                i for i, t in enumerate(layer_types) if t == "full_attention"
-            )
-        Do NOT use hf_model.config.full_attention_interval -- it is not stored
-        on the config object (consumed during __init__ to build layer_types).
-        """
-
     def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-        """Slice query half from q_proj.weight (interleaved per-head layout).
-
-        In Qwen3_5, q_proj.weight has shape (n_heads * head_dim * 2, hidden_size).
-        Rows are organized as per-head interleaved:
-          head_0_query (d_head rows), head_0_gate (d_head rows),
-          head_1_query (d_head rows), head_1_gate (d_head rows), ...
-
-        A naive first-half slice would be wrong. We must reshape by head, then
-        take the first d_head rows of each head (the query half).
+        """Slice query half from gated q_proj.weight for weight-space analysis.
 
-        Note: since self_attn is NOT currently mapped as a bridge submodule,
-        these weights will not be loaded by the bridge. This method is included
-        for correctness and forward-compatibility.
+        In processed mode, W_Q is the pure query projection (for composition
+        scores, logit lens). Gate signal available in unprocessed mode via
+        hook_q_gate.
         """
-        n_heads = self.cfg.n_heads
-        d_head = self.cfg.d_head
-        keys_to_update = [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")]
-        for key in keys_to_update:
-            w = state_dict[key]  # shape: (n_heads * d_head * 2, hidden_size)
-            # Reshape to expose per-head layout
-            w = w.view(n_heads, d_head * 2, -1)
-            # Take only the first d_head rows of each head (query half)
-            state_dict[key] = w[:, :d_head, :].reshape(n_heads * d_head, -1)
-        return state_dict
+        return self._preprocess_gated_q_proj(state_dict, self.cfg.n_heads, self.cfg.d_head)
diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py
index 53e18dce1..aa3ca6cc8 100644
--- a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py
+++ b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py
@@ -1,28 +1,8 @@
 """Qwen3Next architecture adapter.
 
-Qwen3NextForCausalLM is a hybrid linear-attention + full-attention architecture
-with a sparse Mixture-of-Experts MLP on every layer. Layers alternate between
-GatedDeltaNet (linear attention) and standard full attention blocks, while the
-MLP is always a Qwen3NextSparseMoeBlock (gate router + batched experts +
-shared expert).
-
-Since self_attn is absent on linear-attention layers, we only map submodules
-that exist on ALL layers (norms, MLP). The HF native forward handles
-linear/full attention dispatch internally, and MoEBridge delegates the entire
-MoE forward (including router, experts, and shared expert) to the native
-implementation.
-
-Hook coverage:
-- Block-level: hook_resid_pre, hook_resid_post on every layer
-- Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm)
-- MLP: hook_in, hook_out on the MoE block (MoEBridge)
-- Attention internals are NOT individually hooked (self_attn absent on
-  linear-attention layers; mapping it would crash on those layers)
-- Expert-level internals are NOT individually hooked (batched expert params
-  live inside Qwen3NextExperts; MoEBridge delegates to HF forward)
-
-Optional parameters:
-- n_key_value_heads: only set when using GQA (num_key_value_heads != num_attention_heads)
+Hybrid linear-attention (GatedDeltaNet) + full-attention with sparse MoE MLP.
+3 linear-attn layers per 1 full-attn layer. Extends Qwen3 base with
+optional attention mapping, MoE MLP, and fold_ln disabled.
 """
 
 from typing import Any
@@ -30,134 +10,31 @@
 import torch
 
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
-from transformer_lens.model_bridge.generalized_components import (
-    BlockBridge,
-    EmbeddingBridge,
-    MoEBridge,
-    RMSNormalizationBridge,
-    RotaryEmbeddingBridge,
-    UnembeddingBridge,
+from transformer_lens.model_bridge.generalized_components import MoEBridge
+from transformer_lens.model_bridge.supported_architectures.qwen3 import (
+    Qwen3ArchitectureAdapter,
 )
 
 
-class Qwen3NextArchitectureAdapter(ArchitectureAdapter):
-    """Architecture adapter for Qwen3Next models.
-
-    Qwen3NextForCausalLM is a hybrid linear-attention + full-attention
-    architecture with sparse MoE MLPs, sharing the same design as Qwen3.5:
-    - Uses RMSNorm for all normalizations
-    - Uses rotary position embeddings (RoPE) with partial rotation
-    - Every 4th layer is a full-attention layer (self_attn); the rest are
-      GatedDeltaNet linear-attention layers (linear_attn)
-    - Uses Qwen3NextSparseMoeBlock on ALL layers (decoder_sparse_step=1 and
-      mlp_only_layers=[] on every real checkpoint). The MoE block contains a
-      top-K router, batched Qwen3NextExperts (experts.gate_up_proj /
-      experts.down_proj as 3D tensors), plus a shared_expert (gated MLP) and
-      shared_expert_gate. Each expert is internally a gated MLP.
-    - No biases on any linear layers
-    - Full-attention layers have Q/K normalization (q_norm, k_norm)
-    - Full-attention q_proj outputs n_heads * head_dim * 2 (interleaved
-      query+gate layout); the preprocess_weights method slices the query half
+class Qwen3NextArchitectureAdapter(Qwen3ArchitectureAdapter):
+    """Hybrid linear-attention + full-attention with sparse MoE MLP.
 
-    Since self_attn is absent on linear-attention layers, only universally
-    present submodules (norms, MLP) are mapped as block submodules. The HF
-    native forward handles per-layer attention dispatch internally, and
-    MoEBridge delegates the MoE forward pass (including router + experts +
-    shared expert) to the native Qwen3NextSparseMoeBlock implementation.
-
-    Optional parameters:
-    - n_key_value_heads: set when num_key_value_heads != num_attention_heads (GQA)
+    Same hybrid design as Qwen3.5 but with MoE instead of dense MLP.
+    Inherits Qwen3 config/attention structure.
     """
 
     def __init__(self, cfg: Any) -> None:
-        """Initialize the Qwen3Next architecture adapter."""
-        super().__init__(cfg)
-
-        # Core config attributes
-        self.cfg.normalization_type = "RMS"
-        self.cfg.positional_embedding_type = "rotary"
-        self.cfg.final_rms = True
-        self.cfg.gated_mlp = True
-        self.cfg.attn_only = False
-        self.cfg.uses_rms_norm = True
-        self.cfg.default_prepend_bos = False
-
-        # Disable fold_ln: ln1 is followed by self_attn on full-attention
-        # layers and by linear_attn (GatedDeltaNet) on linear-attention layers,
-        # but neither is mapped as a bridge submodule (see class docstring for
-        # why). With no bridge-mapped target to fold into, the standard fold_ln
-        # pass leaves LN weights in an inconsistent state and the processed
-        # bridge output diverges from the unprocessed / HF output. Skipping
-        # fold_ln keeps processed-mode forward passes numerically equivalent.
+        ArchitectureAdapter.__init__(self, cfg)
+        self._setup_qwen3_config(cfg)
         self.supports_fold_ln = False
-
-        # Use eager attention to support output_attentions for hook_attn_scores
-        # and hook_pattern. SDPA doesn't support output_attentions.
-        self.cfg.attn_implementation = "eager"
-
-        # GQA: only set n_key_value_heads when using grouped-query attention
-        if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None:
-            self.cfg.n_key_value_heads = cfg.n_key_value_heads
-
+        setattr(self.cfg, "gated_q_proj", True)  # q_proj outputs [Q|gate] interleaved per head
         self.weight_processing_conversions: dict = {}
-        self.component_mapping: dict = {
-            "embed": EmbeddingBridge(name="model.embed_tokens"),
-            "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg),
-            "blocks": BlockBridge(
-                name="model.layers",
-                submodules={
-                    "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
-                    "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
-                    # Qwen3NextSparseMoeBlock has a custom Qwen3NextTopKRouter
-                    # (not an nn.Linear) as `gate`, plus batched experts and a
-                    # shared expert. MoEBridge wraps the whole MoE module and
-                    # delegates to HF's native forward, so we don't enumerate
-                    # the internal structure here.
-                    "mlp": MoEBridge(name="mlp", config=self.cfg),
-                },
-            ),
-            "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
-            "unembed": UnembeddingBridge(name="lm_head"),
-        }
-
-    def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
-        """No-op for hybrid models.
-
-        Hybrid models don't map attention as a block submodule (self_attn is
-        absent on linear-attention layers), so there are no rotary embedding
-        references to set up.
+        self.component_mapping = self._build_component_mapping(hybrid=True)
 
-        Note: to find which layers are full_attention at runtime, use:
-            layer_types = getattr(hf_model.config, "layer_types", [])
-            first_full_attn_idx = next(
-                i for i, t in enumerate(layer_types) if t == "full_attention"
-            )
-        Do NOT use hf_model.config.full_attention_interval -- it is not stored
-        on the config object (consumed during __init__ to build layer_types).
-        """
+    def _build_mlp_bridge(self):
+        """Sparse MoE MLP (router + batched experts + shared expert)."""
+        return MoEBridge(name="mlp", config=self.cfg)
 
     def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-        """Slice query half from q_proj.weight (interleaved per-head layout).
-
-        In Qwen3Next, q_proj.weight has shape (n_heads * head_dim * 2, hidden_size).
-        Rows are organized as per-head interleaved:
-          head_0_query (d_head rows), head_0_gate (d_head rows),
-          head_1_query (d_head rows), head_1_gate (d_head rows), ...
-
-        A naive first-half slice would be wrong. We must reshape by head, then
-        take the first d_head rows of each head (the query half).
-
-        Note: since self_attn is NOT currently mapped as a bridge submodule,
-        these weights will not be loaded by the bridge. This method is included
-        for correctness and forward-compatibility.
-        """
-        n_heads = self.cfg.n_heads
-        d_head = self.cfg.d_head
-        keys_to_update = [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")]
-        for key in keys_to_update:
-            w = state_dict[key]  # shape: (n_heads * d_head * 2, hidden_size)
-            # Reshape to expose per-head layout
-            w = w.view(n_heads, d_head * 2, -1)
-            # Take only the first d_head rows of each head (query half)
-            state_dict[key] = w[:, :d_head, :].reshape(n_heads * d_head, -1)
-        return state_dict
+        """Slice query half from gated q_proj.weight for weight-space analysis."""
+        return self._preprocess_gated_q_proj(state_dict, self.cfg.n_heads, self.cfg.d_head)

From b3de91dc6a3985f2b695be257a71d75aa773f405 Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Wed, 15 Apr 2026 08:03:56 -0500
Subject: [PATCH 4/8] Adapter updates and custom components

---
 .../benchmarks/component_outputs.py           |    6 +-
 .../benchmarks/weight_processing.py           |   27 +-
 .../model_bridge/component_setup.py           |    5 +-
 .../generalized_components/attention.py       |    4 +-
 .../generalized_components/gated_delta_net.py |   84 +-
 .../position_embeddings_attention.py          |   32 +-
 .../supported_architectures/granite.py        |    1 +
 .../supported_architectures/qwen3.py          |   24 +-
 .../supported_architectures/qwen3_5.py        |   20 +-
 .../supported_architectures/qwen3_next.py     |   10 +-
 .../data/architecture_gaps.json               | 4291 +++++++-------
 .../model_registry/data/supported_models.json | 5114 ++++++++++++++++-
 .../data/verification_history.json            |  202 +-
 13 files changed, 7549 insertions(+), 2271 deletions(-)

diff --git a/transformer_lens/benchmarks/component_outputs.py b/transformer_lens/benchmarks/component_outputs.py
index ba2d03edd..504825ce1 100644
--- a/transformer_lens/benchmarks/component_outputs.py
+++ b/transformer_lens/benchmarks/component_outputs.py
@@ -311,8 +311,12 @@ def benchmark_all_components(
                 n_layers = self.cfg.n_layers
 
                 for layer_idx in range(n_layers):
-                    # Recursively test each subcomponent and its nested subcomponents
+                    # Get the actual block to check which submodules were bound
+                    actual_block = getattr(self.bridge_model, block_type)[layer_idx]
                     for subcomp_name, subcomponent in blocks_component.submodules.items():
+                        # Skip optional submodules absent on this layer (hybrid architectures)
+                        if subcomp_name not in actual_block._modules:
+                            continue
                         comp_path = f"{block_type}.{layer_idx}.{subcomp_name}"
                         self._test_component_recursive(
                             comp_path, subcomponent, test_inputs, results, skip_components
diff --git a/transformer_lens/benchmarks/weight_processing.py b/transformer_lens/benchmarks/weight_processing.py
index 5a7fafd65..62e561b25 100644
--- a/transformer_lens/benchmarks/weight_processing.py
+++ b/transformer_lens/benchmarks/weight_processing.py
@@ -638,10 +638,24 @@ def benchmark_mlp_output_centering(
                 message="Skipped for tiny/test model (random weights don't center meaningfully)",
             )
 
-        # Check if this is an MoE model - MoE models don't have a single W_out weight
+        # Find block 0's MLP submodule: try "mlp" first, then fall back to "shared_mlp"
         from transformer_lens.model_bridge.generalized_components.moe import MoEBridge
 
-        if isinstance(bridge.blocks[0].mlp, MoEBridge):
+        mlp_module = None
+        block = bridge.blocks[0]
+        for name in ("mlp", "shared_mlp"):
+            if name in block._modules:
+                mlp_module = block._modules[name]
+                break
+        if mlp_module is None:
+            return BenchmarkResult(
+                name="mlp_output_centering",
+                severity=BenchmarkSeverity.WARNING,
+                message="No MLP submodule found on block 0",
+                passed=False,
+            )
+
+        if isinstance(mlp_module, MoEBridge):
             return BenchmarkResult(
                 name="mlp_output_centering",
                 severity=BenchmarkSeverity.INFO,
@@ -651,11 +665,10 @@ def benchmark_mlp_output_centering(
 
         # Check if W_out exists and is accessible (HT format or bridge format)
         w_out = None
-        if hasattr(bridge.blocks[0].mlp, "W_out"):
-            w_out = bridge.blocks[0].mlp.W_out
-        elif hasattr(bridge.blocks[0].mlp, "out"):
-            # Bridge format: mlp.out is a LinearBridge wrapping nn.Linear
-            out_module = bridge.blocks[0].mlp.out
+        if hasattr(mlp_module, "W_out"):
+            w_out = mlp_module.W_out
+        elif hasattr(mlp_module, "out"):
+            out_module = mlp_module.out
             if hasattr(out_module, "original_component") and hasattr(
                 out_module.original_component, "weight"
             ):
diff --git a/transformer_lens/model_bridge/component_setup.py b/transformer_lens/model_bridge/component_setup.py
index a2986d585..7821d0354 100644
--- a/transformer_lens/model_bridge/component_setup.py
+++ b/transformer_lens/model_bridge/component_setup.py
@@ -100,9 +100,10 @@ def setup_submodules(
                 else:
                     remote_path = submodule.name
                     is_optional = getattr(submodule, "optional", False)
-                    # Fast path: first segment absent → skip without entering get_remote_component
+                    # Fast path: first segment absent or None → skip
                     first_segment = remote_path.split(".")[0]
-                    if is_optional and not hasattr(original_model, first_segment):
+                    first_value = getattr(original_model, first_segment, None)
+                    if is_optional and first_value is None:
                         logger.debug(
                             "Optional '%s' (path '%s') absent on %s",
                             module_name,
diff --git a/transformer_lens/model_bridge/generalized_components/attention.py b/transformer_lens/model_bridge/generalized_components/attention.py
index 2d73d7ed7..5608ca2d8 100644
--- a/transformer_lens/model_bridge/generalized_components/attention.py
+++ b/transformer_lens/model_bridge/generalized_components/attention.py
@@ -59,7 +59,7 @@ def __init__(
         requires_position_embeddings: bool = False,
         requires_attention_mask: bool = False,
         attention_mask_4d: bool = False,
-        **kwargs,
+        optional: bool = False,
     ):
         """Initialize the attention bridge.
 
@@ -87,7 +87,7 @@ def __init__(
             config=config,
             submodules=submodules or {},
             conversion_rule=conversion_rule,
-            **kwargs,
+            optional=optional,
         )
         self.hook_attn_scores = HookPoint()
         self.hook_pattern = HookPoint()
diff --git a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
index b62937dbd..dffc0e234 100644
--- a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
+++ b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
@@ -23,17 +23,20 @@ class GatedDeltaNetBridge(GeneralizedComponent):
 
     Hooks (prefill, in execution order):
         hook_in: input hidden_states [batch, seq, d_model]
-        hook_q_pre_conv: Q after projection + split, before conv [batch, seq, n_k_heads, head_k_dim]
-        hook_k_pre_conv: K before conv [batch, seq, n_k_heads, head_k_dim]
-        hook_v_pre_conv: V before conv [batch, seq, n_v_heads, head_v_dim]
-        hook_conv_out: post-conv mixed QKV [batch, seq, key_dim*2 + value_dim]
+        hook_q_pre_conv: Q after projection, before conv [batch, seq, n_k_heads, head_k_dim]
+        hook_k_pre_conv: K after projection, before conv [batch, seq, n_k_heads, head_k_dim]
+        hook_v_pre_conv: V after projection, before conv [batch, seq, n_v_heads, head_v_dim]
         hook_q: Q after conv, pre-GQA-expansion [batch, seq, n_k_heads, head_k_dim]
+            Note: on standard attn layers, hook_q is post-projection. Here it's
+            post-conv — use hook_q_pre_conv for the projection-only output.
         hook_k: K after conv [batch, seq, n_k_heads, head_k_dim]
         hook_v: V after conv [batch, seq, n_v_heads, head_v_dim]
-        hook_beta: write strength (sigmoid of b), per v-head [batch, seq, n_v_heads]
-        hook_log_decay: log-space decay g (negative; actual decay = exp(g)), per v-head [batch, seq, n_v_heads]
-        hook_recurrence_out: output of linear recurrence kernel [batch, seq, n_v_heads, head_v_dim]
-        hook_gate_input: z tensor before silu gating in GatedRMSNorm [batch, seq, n_v_heads, head_v_dim]
+        hook_beta_logit: pre-sigmoid write gate logit, per v-head [batch, seq, n_v_heads]
+        hook_beta: write strength sigmoid(b), per v-head [batch, seq, n_v_heads]
+        hook_log_decay: log-space decay g (NEGATIVE; multiplicative decay = exp(g)),
+            per v-head [batch, seq, n_v_heads]
+        hook_recurrence_out: output of linear recurrence [batch, seq, n_v_heads, head_v_dim]
+        hook_gate_input: z tensor (pre-silu) for GatedRMSNorm [batch, seq, n_v_heads, head_v_dim]
         hook_out: final output to residual stream [batch, seq, d_model]
 
     During generation (cache_params present), only hook_in/hook_out fire.
@@ -63,17 +66,16 @@ def __init__(
         **kwargs,
     ):
         super().__init__(name, config=config, submodules=submodules or {}, **kwargs)
-        # Pre-conv hooks (after projection, before causal convolution mixes positions)
+        # Pre-conv (after projection split, before causal conv mixes positions)
         self.hook_q_pre_conv = HookPoint()
         self.hook_k_pre_conv = HookPoint()
         self.hook_v_pre_conv = HookPoint()
-        # Conv output
-        self.hook_conv_out = HookPoint()
-        # Post-conv hooks (pre-GQA-expansion, pre-recurrence)
+        # Post-conv (pre-GQA-expansion, pre-recurrence)
         self.hook_q = HookPoint()
         self.hook_k = HookPoint()
         self.hook_v = HookPoint()
         # Gate parameters (per v-head)
+        self.hook_beta_logit = HookPoint()
         self.hook_beta = HookPoint()
         self.hook_log_decay = HookPoint()
         # Recurrence output + gated norm input
@@ -84,7 +86,6 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         if self.original_component is None:
             raise RuntimeError(f"Original component not set for {self.name}.")
 
-        # Generation step → delegate to HF with only input/output hooks
         if kwargs.get("cache_params") is not None:
             return self._native_forward(*args, **kwargs)
         return self._hooked_forward(*args, **kwargs)
@@ -98,6 +99,12 @@ def _native_forward(self, *args: Any, **kwargs: Any) -> Any:
             args = (self.hook_in(args[0]),) + args[1:]
 
         output = self.original_component(*args, **kwargs)
+
+        if isinstance(output, tuple) and len(output) > 0:
+            first = output[0]
+            if isinstance(first, torch.Tensor):
+                return (self.hook_out(first),) + output[1:]
+            return output
         if isinstance(output, torch.Tensor):
             return self.hook_out(output)
         return output
@@ -115,11 +122,8 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any:
 
         attention_mask = kwargs.get("attention_mask")
         if attention_mask is not None:
-            from transformers.models.qwen3_next.modeling_qwen3_next import (
-                apply_mask_to_padding_states,
-            )
-
-            hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
+            # Inline masking, no qwen3_next import; assumes mask is [batch, seq] — TODO confirm
+            hidden_states = hidden_states * attention_mask.unsqueeze(-1)
 
         hidden_states = self.hook_in(hidden_states)
         batch_size, seq_len, _ = hidden_states.shape
@@ -128,7 +132,6 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any:
         projected_qkvz = hf.in_proj_qkvz(hidden_states)
         projected_ba = hf.in_proj_ba(hidden_states)
 
-        # Split into per-head Q, K, V, Z, beta_raw, alpha_raw
         query, key, value, z, b, a = hf.fix_query_key_value_ordering(projected_qkvz, projected_ba)
 
         # --- Pre-conv hooks (per-head shape, before conv mixes positions) ---
@@ -153,9 +156,7 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any:
             mixed_qkv = F.silu(hf.conv1d(mixed_qkv)[:, :, :seq_len])
         mixed_qkv = mixed_qkv.transpose(1, 2)
 
-        mixed_qkv = self.hook_conv_out(mixed_qkv)
-
-        # Split post-conv
+        # Split post-conv into per-head Q, K, V
         query, key, value = torch.split(
             mixed_qkv,
             [hf.key_dim, hf.key_dim, hf.value_dim],
@@ -171,9 +172,10 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any:
         value = self.hook_v(value)
 
         # --- Gate parameters (per v-head) ---
+        b = self.hook_beta_logit(b)
         beta = self.hook_beta(b.sigmoid())
 
-        # g is log-space decay (negative); actual multiplicative decay = exp(g)
+        # g is log-space decay (NEGATIVE); multiplicative decay = exp(g)
         g = -hf.A_log.float().exp() * F.softplus(a.float() + hf.dt_bias)
         g = self.hook_log_decay(g)
 
@@ -216,25 +218,27 @@ def compute_effective_attention(
     ) -> torch.Tensor:
         """Materialize the effective attention matrix from cached hook values.
 
-        The gated delta rule recurrence is:
+        The gated delta rule recurrence is::
+
             S_t = exp(g_t) * S_{t-1} + beta_t * v_t @ k_t^T
             o_t = S_t^T @ q_t
 
-        The effective attention M[i,j] = contribution of input j to output i:
+        The effective attention M[i,j] = contribution of input j to output i::
+
             M[i,j] = (q_i^T @ k_j) * beta_j * prod_{t=j+1}^{i} exp(g_t)
 
-        Note: the fused kernel applies L2-normalization to Q and K internally
-        (use_qk_l2norm_in_kernel=True). The hooked Q/K are pre-normalization,
-        so this reconstruction is approximate. For exact reconstruction, you'd
-        need the normalized Q/K which aren't exposed by the kernel.
+        **Approximation note:** The fused kernel applies L2-normalization to Q
+        and K internally (``use_qk_l2norm_in_kernel=True``). The hooked Q/K are
+        pre-normalization, so this reconstruction diverges when Q/K norms vary
+        significantly across positions/heads. Accuracy is best when Q/K norms
+        are roughly uniform (common after training converges).
 
         Args:
-            cache: ActivationCache from run_with_cache.
+            cache: ActivationCache from ``run_with_cache``.
             layer_idx: Block index for this linear_attn layer.
 
         Returns:
-            [batch, n_v_heads, seq, seq] causal attention matrix. Upper triangle
-            (j > i) is zero.
+            ``[batch, n_v_heads, seq, seq]`` causal matrix (upper triangle zero).
 
         Cost is O(batch * n_heads * seq^2); use on short sequences.
         """
@@ -266,24 +270,18 @@ def compute_effective_attention(
         batch, seq, n_heads, d_head = q.shape
 
         # QK similarity: [batch, n_heads, seq_i, seq_j]
-        q_perm = q.permute(0, 2, 1, 3)  # [batch, n_heads, seq, d_head]
+        q_perm = q.permute(0, 2, 1, 3)
         k_perm = k.permute(0, 2, 1, 3)
-        qk = torch.matmul(q_perm, k_perm.transpose(-2, -1))  # [batch, n_heads, seq, seq]
+        qk = torch.matmul(q_perm, k_perm.transpose(-2, -1))
 
-        # Cumulative decay: L[i,j] = prod_{t=j+1}^{i} exp(g_t) = exp(sum g[j+1..i])
-        # g is [batch, seq, n_heads] → cumsum along seq
+        # Cumulative decay: L[i,j] = exp(sum g[j+1..i])
         g_perm = g.permute(0, 2, 1)  # [batch, n_heads, seq]
         cumsum_g = torch.cumsum(g_perm, dim=-1)
-        # L_log[i,j] = cumsum[i] - cumsum[j]
         L_log = cumsum_g[:, :, :, None] - cumsum_g[:, :, None, :]
 
         causal_mask = torch.tril(torch.ones(seq, seq, dtype=torch.bool, device=q.device))
         L = torch.where(causal_mask[None, None], torch.exp(L_log), torch.zeros_like(L_log))
 
-        # Beta broadcast: [batch, n_heads, 1, seq_j]
-        beta_col = beta.permute(0, 2, 1)[:, :, None, :]
-
         # M[i,j] = qk[i,j] * beta[j] * L[i,j]
-        M = qk * beta_col * L
-
-        return M
+        beta_col = beta.permute(0, 2, 1)[:, :, None, :]
+        return qk * beta_col * L
diff --git a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py
index ad17c38a6..135ab0d17 100644
--- a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py
+++ b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py
@@ -113,20 +113,26 @@ class PositionEmbeddingsAttentionBridge(PositionEmbeddingHooksMixin, AttentionBr
     """
 
     def __init__(
-        self, name: str, config: Any, submodules: Optional[Dict[str, Any]] = None, **kwargs
+        self,
+        name: str,
+        config: Any,
+        submodules: Optional[Dict[str, Any]] = None,
+        optional: bool = False,
+        # Accepted for caller compatibility (Granite passes these explicitly)
+        # but always forced to True — this bridge reimplements attention.
+        requires_attention_mask: bool = True,
+        requires_position_embeddings: bool = True,
+        **kwargs,  # accepted for compatibility but discarded — NOT forwarded to AttentionBridge
     ):
-        """Initialize Gemma-3 attention bridge.
-
-        Args:
-            name: Component name
-            config: Model configuration
-            submodules: Dictionary of subcomponents
-            **kwargs: Additional arguments passed to AttentionBridge
-        """
-        kwargs["requires_position_embeddings"] = True
-        kwargs["requires_attention_mask"] = True
-        kwargs["maintain_native_attention"] = True
-        super().__init__(name, config, submodules, **kwargs)
+        super().__init__(
+            name,
+            config,
+            submodules,
+            requires_position_embeddings=True,
+            requires_attention_mask=True,
+            maintain_native_attention=True,
+            optional=optional,
+        )
         self._init_position_embedding_hooks()
         if getattr(config, "gated_q_proj", False):
             self.hook_q_gate = HookPoint()
diff --git a/transformer_lens/model_bridge/supported_architectures/granite.py b/transformer_lens/model_bridge/supported_architectures/granite.py
index fbb911796..c46081b0b 100644
--- a/transformer_lens/model_bridge/supported_architectures/granite.py
+++ b/transformer_lens/model_bridge/supported_architectures/granite.py
@@ -51,6 +51,7 @@ def _setup_common_config(self, cfg: Any) -> None:
         self.cfg.gated_mlp = True
         self.cfg.attn_only = False
         self.cfg.uses_rms_norm = True
+        self.cfg.default_prepend_bos = False
         self.cfg.eps_attr = "variance_epsilon"
 
         self.default_config = {
diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3.py b/transformer_lens/model_bridge/supported_architectures/qwen3.py
index e37b44795..4676d1175 100644
--- a/transformer_lens/model_bridge/supported_architectures/qwen3.py
+++ b/transformer_lens/model_bridge/supported_architectures/qwen3.py
@@ -34,11 +34,15 @@ class Qwen3ArchitectureAdapter(ArchitectureAdapter):
     Serves as base class for Qwen3.5 and Qwen3Next hybrid variants.
     """
 
-    def __init__(self, cfg: Any) -> None:
+    def __init__(self, cfg: Any, *, hybrid: bool = False) -> None:
         super().__init__(cfg)
         self._setup_qwen3_config(cfg)
-        self.weight_processing_conversions = {**self._qkvo_weight_conversions()}
-        self.component_mapping = self._build_component_mapping()
+        if hybrid:
+            self.supports_fold_ln = False
+            self.weight_processing_conversions: dict = {}
+        else:
+            self.weight_processing_conversions = {**self._qkvo_weight_conversions()}
+        self.component_mapping = self._build_component_mapping(hybrid=hybrid)
 
     def _setup_qwen3_config(self, cfg: Any) -> None:
         """Config shared across all Qwen3 variants (dense, hybrid, MoE)."""
@@ -126,11 +130,15 @@ def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> No
                     block.attn.set_rotary_emb(rotary_emb)
 
         # Set on template for get_generalized_component() calls
-        try:
-            attn_template = self.get_generalized_component("blocks.0.attn")
-            attn_template.set_rotary_emb(rotary_emb)
-        except ValueError:
-            pass  # hybrid adapter with no attn in template
+        # Guard the lookup: hybrid adapters may have no "attn" entry in the blocks template
+        mapping = self.component_mapping or {}
+        blocks_template = mapping.get("blocks") if isinstance(mapping, dict) else None
+        if blocks_template and "attn" in getattr(blocks_template, "submodules", {}):
+            try:
+                attn_template = self.get_generalized_component("blocks.0.attn")
+                attn_template.set_rotary_emb(rotary_emb)
+            except (ValueError, AttributeError, KeyError):
+                pass
 
     @staticmethod
     def _preprocess_gated_q_proj(
diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
index a7c484eee..2fa7e5b0d 100644
--- a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
+++ b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
@@ -9,7 +9,6 @@
 
 import torch
 
-from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
 from transformer_lens.model_bridge.supported_architectures.qwen3 import (
     Qwen3ArchitectureAdapter,
 )
@@ -19,20 +18,13 @@ class Qwen3_5ArchitectureAdapter(Qwen3ArchitectureAdapter):
     """Hybrid linear-attention + full-attention with dense gated MLP.
 
     Inherits Qwen3 config/attention/MLP structure. Differences:
-    - supports_fold_ln = False (LN target varies by layer type)
-    - Attention is optional (absent on linear-attention layers)
-    - Gated q_proj (2x wide) requires preprocess_weights slicing
-    - No weight_processing_conversions until attn is fully wired
+    - Attention + linear_attn are optional (per-layer type)
+    - Gated q_proj (2x wide) sliced by preprocess_weights for weight analysis
     """
 
     def __init__(self, cfg: Any) -> None:
-        # Call grandparent to set self.cfg, then configure ourselves
-        ArchitectureAdapter.__init__(self, cfg)
-        self._setup_qwen3_config(cfg)
-        self.supports_fold_ln = False
-        setattr(self.cfg, "gated_q_proj", True)  # q_proj outputs [Q|gate] interleaved per head
-        self.weight_processing_conversions: dict = {}
-        self.component_mapping = self._build_component_mapping(hybrid=True)
+        setattr(cfg, "gated_q_proj", True)
+        super().__init__(cfg, hybrid=True)
 
     def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
         """Swap multimodal Qwen3_5Config for text-only Qwen3_5TextConfig.
@@ -49,7 +41,7 @@ def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, t
         """Slice query half from gated q_proj.weight for weight-space analysis.
 
         In processed mode, W_Q is the pure query projection (for composition
-        scores, logit lens). Gate signal available in unprocessed mode via
-        hook_q_gate.
+        scores, logit lens). Gate signal available in unprocessed mode on
+        full-attention layers via blocks.N.attn.hook_q_gate.
         """
         return self._preprocess_gated_q_proj(state_dict, self.cfg.n_heads, self.cfg.d_head)
diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py
index aa3ca6cc8..31e1be3cd 100644
--- a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py
+++ b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py
@@ -9,7 +9,6 @@
 
 import torch
 
-from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
 from transformer_lens.model_bridge.generalized_components import MoEBridge
 from transformer_lens.model_bridge.supported_architectures.qwen3 import (
     Qwen3ArchitectureAdapter,
@@ -20,16 +19,11 @@ class Qwen3NextArchitectureAdapter(Qwen3ArchitectureAdapter):
     """Hybrid linear-attention + full-attention with sparse MoE MLP.
 
     Same hybrid design as Qwen3.5 but with MoE instead of dense MLP.
-    Inherits Qwen3 config/attention structure.
     """
 
     def __init__(self, cfg: Any) -> None:
-        ArchitectureAdapter.__init__(self, cfg)
-        self._setup_qwen3_config(cfg)
-        self.supports_fold_ln = False
-        setattr(self.cfg, "gated_q_proj", True)  # q_proj outputs [Q|gate] interleaved per head
-        self.weight_processing_conversions: dict = {}
-        self.component_mapping = self._build_component_mapping(hybrid=True)
+        setattr(cfg, "gated_q_proj", True)
+        super().__init__(cfg, hybrid=True)
 
     def _build_mlp_bridge(self):
         """Sparse MoE MLP (router + batched experts + shared expert)."""
diff --git a/transformer_lens/tools/model_registry/data/architecture_gaps.json b/transformer_lens/tools/model_registry/data/architecture_gaps.json
index f3eb11de9..6261a9a65 100644
--- a/transformer_lens/tools/model_registry/data/architecture_gaps.json
+++ b/transformer_lens/tools/model_registry/data/architecture_gaps.json
@@ -1,18 +1,18 @@
 {
-  "generated_at": "2026-04-10",
+  "generated_at": "2026-04-14",
   "scan_info": {
-    "total_scanned": 5436,
+    "total_scanned": 5633,
     "task_filter": "text-generation",
     "min_downloads": 500,
-    "scan_duration_seconds": 3.9
+    "scan_duration_seconds": 4.2
   },
-  "total_unsupported_architectures": 401,
-  "total_unsupported_models": 1459,
+  "total_unsupported_architectures": 416,
+  "total_unsupported_models": 1400,
   "gaps": [
     {
       "architecture_id": "Qwen3_5ForConditionalGeneration",
-      "total_models": 67,
-      "total_downloads": 140710,
+      "total_models": 72,
+      "total_downloads": 146334,
       "min_param_count": 211968832,
       "sample_models": [
         "Tesslate/OmniCoder-9B",
@@ -20,18 +20,37 @@
         "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx",
         "Brooooooklyn/Qwen3.5-27B-unsloth-mlx",
         "aifeifei798/Qwen3.5-Queen-27B",
-        "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled",
         "Brooooooklyn/Qwen3.5-9B-unsloth-mlx",
-        "alexcovo/qwen35-9b-mlx-turboquant-tq3",
         "bigatuna/Qwen3.5-9b-Sushi-Coder-RL-MLX",
-        "Jackrong/Qwen3.5-9B-Neo"
+        "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled",
+        "alexcovo/qwen35-9b-mlx-turboquant-tq3",
+        "Oysiyl/qwen3.5-27b-unslop-good-lora-v1"
+      ],
+      "relevancy_score": 91.0
+    },
+    {
+      "architecture_id": "Gemma4ForConditionalGeneration",
+      "total_models": 64,
+      "total_downloads": 90296,
+      "min_param_count": 738022691,
+      "sample_models": [
+        "dealignai/Gemma-4-31B-JANG_4M-Uncensored",
+        "0xSero/gemma-4-21b-a4b-it-REAP",
+        "InfinimindCreations/gemma-4-E4B-it-uncensored",
+        "TrevorJS/gemma-4-26B-A4B-it-uncensored",
+        "WWTCyberLab/gemma-4-31B-it-abliterated",
+        "WWTCyberLab/gemma-4-26B-A4B-it-abliterated",
+        "TrevorJS/gemma-4-31B-it-uncensored",
+        "TrevorJS/gemma-4-E4B-it-uncensored",
+        "InfinimindCreations/gemma-4-31B-it-uncensored",
+        "TrevorJS/gemma-4-E2B-it-uncensored"
       ],
-      "relevancy_score": 91.5
+      "relevancy_score": 84.9
     },
     {
       "architecture_id": "DeepseekV3ForCausalLM",
-      "total_models": 48,
-      "total_downloads": 6449394,
+      "total_models": 46,
+      "total_downloads": 6840308,
       "min_param_count": 1656048,
       "sample_models": [
         "deepseek-ai/DeepSeek-R1",
@@ -39,43 +58,24 @@
         "deepseek-ai/DeepSeek-V3",
         "deepseek-ai/DeepSeek-V3-0324",
         "moonshotai/Kimi-K2-Instruct-0905",
-        "deepseek-ai/DeepSeek-V3.1",
-        "ai-sage/GigaChat3-10B-A1.8B",
         "moonshotai/Kimi-K2-Instruct",
+        "deepseek-ai/DeepSeek-V3.1",
         "trl-internal-testing/tiny-DeepseekV3ForCausalLM",
-        "trl-internal-testing/tiny-DeepseekV3ForCausalLM-0528"
-      ],
-      "relevancy_score": 87.2
-    },
-    {
-      "architecture_id": "Qwen3MoeForCausalLM",
-      "total_models": 45,
-      "total_downloads": 5469133,
-      "min_param_count": 2574656,
-      "sample_models": [
-        "Qwen/Qwen3-30B-A3B",
-        "Qwen/Qwen3-Coder-30B-A3B-Instruct",
-        "Qwen/Qwen3-30B-A3B-Instruct-2507",
-        "Qwen/Qwen3-235B-A22B",
-        "trl-internal-testing/tiny-Qwen3MoeForCausalLM",
-        "Qwen/Qwen3-30B-A3B-Thinking-2507",
-        "Qwen/Qwen3-235B-A22B-Instruct-2507",
-        "Qwen/Qwen3-Coder-480B-A35B-Instruct",
-        "Qwen/Qwen3-235B-A22B-Thinking-2507",
-        "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4"
+        "trl-internal-testing/tiny-DeepseekV3ForCausalLM-0528",
+        "moonshotai/Moonlight-16B-A3B-Instruct"
       ],
-      "relevancy_score": 84.9
+      "relevancy_score": 83.1
     },
     {
       "architecture_id": "NemotronHForCausalLM",
-      "total_models": 40,
-      "total_downloads": 3187865,
+      "total_models": 41,
+      "total_downloads": 3587883,
       "min_param_count": 4221480,
       "sample_models": [
         "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
-        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese",
         "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
         "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
+        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese",
         "nvidia/Nemotron-Cascade-2-30B-A3B",
         "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16",
         "unsloth/NVIDIA-Nemotron-3-Nano-4B",
@@ -83,69 +83,31 @@
         "empero-ai/openNemo-9B",
         "cpagac/Nemotron-Nano-9B-v2-heretic"
       ],
-      "relevancy_score": 80.3
-    },
-    {
-      "architecture_id": "Qwen3_5ForCausalLM",
-      "total_models": 52,
-      "total_downloads": 81342,
-      "min_param_count": 752393024,
-      "sample_models": [
-        "lukey03/Qwen3.5-9B-abliterated",
-        "GoodStartLabs/gin-rummy-hbc-qwen3.5-0.8b",
-        "aifeifei798/Darkidol-Ballad-27B",
-        "brocchirodrigo/anotaai-ajuda-qwen3_5_Q4",
-        "kai-os/Carnice-9b",
-        "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v1",
-        "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v2",
-        "Phonsiri/Qwen3.5-9B-Thai-Law-Base",
-        "continuum-ai/qwen3.5-4b-code-forged",
-        "aifeifei798/Darkidol-Ballad-9B"
-      ],
-      "relevancy_score": 80.2
+      "relevancy_score": 78.6
     },
     {
       "architecture_id": "Lfm2ForCausalLM",
       "total_models": 40,
-      "total_downloads": 1395683,
+      "total_downloads": 1626247,
       "min_param_count": 274754048,
       "sample_models": [
         "farbodtavakkoli/OTel-LLM-1.2B-IT",
         "LiquidAI/LFM2.5-1.2B-Instruct",
         "LiquidAI/LFM2-1.2B",
-        "LiquidAI/LFM2-350M",
         "LiquidAI/LFM2.5-350M",
+        "LiquidAI/LFM2-350M",
         "LiquidAI/LFM2.5-1.2B-Thinking",
         "LiquidAI/LFM2-2.6B-Exp",
         "LiquidAI/LFM2.5-1.2B-Base",
         "LiquidAI/LFM2-700M",
         "unsloth/LFM2.5-1.2B-Instruct"
       ],
-      "relevancy_score": 78.4
-    },
-    {
-      "architecture_id": "Gemma4ForConditionalGeneration",
-      "total_models": 37,
-      "total_downloads": 51866,
-      "min_param_count": 2084387402,
-      "sample_models": [
-        "dealignai/Gemma-4-31B-JANG_4M-Uncensored",
-        "0xSero/gemma-4-21b-a4b-it-REAP",
-        "InfinimindCreations/gemma-4-E4B-it-uncensored",
-        "lthn/lemma",
-        "TrevorJS/gemma-4-26B-A4B-it-uncensored",
-        "livadies/gemma-4-E2B-Ghetto-NF4",
-        "Greytechai/Gemma-4-31B-JANG_4M-CRACK",
-        "WWTCyberLab/gemma-4-31B-it-abliterated",
-        "WWTCyberLab/gemma-4-26B-A4B-it-abliterated",
-        "InfinimindCreations/gemma-4-31B-it-uncensored"
-      ],
-      "relevancy_score": 65.1
+      "relevancy_score": 76.3
     },
     {
       "architecture_id": "QWenLMHeadModel",
       "total_models": 22,
-      "total_downloads": 495498,
+      "total_downloads": 522223,
       "min_param_count": 19545408,
       "sample_models": [
         "cckevinn/SeeClick",
@@ -156,91 +118,53 @@
         "Qwen/Qwen-1_8B-Chat",
         "Qwen/Qwen-14B-Chat",
         "Qwen/Qwen-14B",
-        "Xingyu-Zheng/Qwen-VL-Chat",
-        "Qwen/Qwen-72B"
-      ],
-      "relevancy_score": 64.0
-    },
-    {
-      "architecture_id": "InternLM2ForCausalLM",
-      "total_models": 23,
-      "total_downloads": 253936,
-      "min_param_count": 24052864,
-      "sample_models": [
-        "internlm/internlm2-chat-7b",
-        "internlm/internlm2_5-7b-chat",
-        "internlm/internlm2-7b",
-        "internlm/internlm2-20b",
-        "internlm/internlm2-base-7b",
-        "internlm/internlm2-chat-20b",
-        "internlm/internlm2-base-20b",
-        "chujiezheng/internlm2-chat-20b-ExPO",
-        "chujiezheng/internlm2-chat-7b-ExPO",
-        "internlm/internlm2-1_8b"
+        "Qwen/Qwen-Audio-Chat",
+        "Xingyu-Zheng/Qwen-VL-Chat"
       ],
-      "relevancy_score": 63.2
+      "relevancy_score": 62.5
     },
     {
-      "architecture_id": "GPTBigCodeForCausalLM",
-      "total_models": 24,
-      "total_downloads": 109509,
-      "min_param_count": 1845928,
+      "architecture_id": "DeepseekV32ForCausalLM",
+      "total_models": 12,
+      "total_downloads": 9006409,
+      "min_param_count": 136559748,
       "sample_models": [
-        "bigcode/gpt_bigcode-santacoder",
-        "bigcode/tiny_starcoder_py",
-        "bigcode/starcoder",
-        "bigcode/starcoderbase-1b",
-        "ibm-granite/granite-20b-code-base-8k",
-        "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct",
-        "HuggingFaceH4/starchat-alpha",
-        "defog/sqlcoder2",
-        "HuggingFaceH4/starchat-beta",
-        "ibm-granite/granite-20b-code-instruct-8k"
+        "deepseek-ai/DeepSeek-V3.2",
+        "deepseek-ai/DeepSeek-V3.2-Exp",
+        "deepseek-ai/DeepSeek-V3.2-Speciale",
+        "deepseek-ai/DeepSeek-Math-V2",
+        "zianglih/DeepSeek-V3.2-6layer-last-1-bf16",
+        "exolabs/DeepSeek-V3.2_bf16",
+        "deepseek-ai/DeepSeek-V3.2-Exp-Base",
+        "cs2764/DeepSeek-V3.2_dq4-mlx",
+        "zianglih/DeepSeek-V3.2-MXFP8",
+        "hyper-accel/tiny-random-deepseek-v32"
       ],
-      "relevancy_score": 62.0
+      "relevancy_score": 62.5
     },
     {
-      "architecture_id": "T5GemmaForConditionalGeneration",
-      "total_models": 14,
-      "total_downloads": 1037477,
-      "min_param_count": 312517632,
+      "architecture_id": "JambaForCausalLM",
+      "total_models": 28,
+      "total_downloads": 48565,
+      "min_param_count": 127679344,
       "sample_models": [
-        "google/t5gemma-s-s-prefixlm",
-        "google/t5gemma-9b-9b-ul2",
-        "google/t5gemma-b-b-ul2",
-        "google/t5gemma-2b-2b-prefixlm",
-        "google/t5gemma-2b-2b-ul2",
-        "google/t5gemma-l-l-ul2-it",
-        "google/t5gemma-ml-ml-ul2-it",
-        "google/t5gemma-b-b-prefixlm",
-        "google/t5gemma-s-s-prefixlm-it",
-        "google/t5gemma-9b-9b-prefixlm"
-      ],
-      "relevancy_score": 60.3
-    },
-    {
-      "architecture_id": "XGLMForCausalLM",
-      "total_models": 18,
-      "total_downloads": 223424,
-      "min_param_count": 162256896,
-      "sample_models": [
-        "facebook/xglm-564M",
-        "facebook/incoder-1B",
-        "facebook/xglm-7.5B",
-        "facebook/xglm-4.5B",
-        "facebook/xglm-1.7B",
-        "KoboldAI/fairseq-dense-2.7B",
-        "KoboldAI/fairseq-dense-125M",
-        "KoboldAI/fairseq-dense-355M",
-        "KoboldAI/fairseq-dense-13B",
-        "KoboldAI/fairseq-dense-1.3B"
+        "ai21labs/AI21-Jamba-Mini-1.5",
+        "ai21labs/Jamba-tiny-random",
+        "ai21labs/AI21-Jamba-Mini-1.6",
+        "ai21labs/AI21-Jamba-Large-1.5",
+        "ai21labs/AI21-Jamba2-3B",
+        "ai21labs/AI21-Jamba-Large-1.6",
+        "ai21labs/Jamba-v0.1",
+        "ai21labs/AI21-Jamba2-Mini",
+        "ai21labs/AI21-Jamba-Reasoning-3B",
+        "microsoft/Dayhoff-170M-GRS-112000"
       ],
-      "relevancy_score": 59.6
+      "relevancy_score": 61.1
     },
     {
       "architecture_id": "Glm4MoeForCausalLM",
-      "total_models": 14,
-      "total_downloads": 742282,
+      "total_models": 16,
+      "total_downloads": 751068,
       "min_param_count": 2572352,
       "sample_models": [
         "zai-org/GLM-4.5-Air",
@@ -249,72 +173,55 @@
         "zai-org/GLM-4.5",
         "zai-org/GLM-4.6",
         "np-cr/testing-glm4-moe",
-        "ArliAI/GLM-4.6-Derestricted-v3",
         "PrimeIntellect/GLM-0.5B",
+        "ArliAI/GLM-4.6-Derestricted-v3",
         "zai-org/GLM-4.5-Air-Base",
         "PrimeIntellect/INTELLECT-3"
       ],
       "relevancy_score": 59.6
     },
     {
-      "architecture_id": "JambaForCausalLM",
-      "total_models": 22,
-      "total_downloads": 44090,
-      "min_param_count": 127679344,
-      "sample_models": [
-        "ai21labs/AI21-Jamba-Mini-1.5",
-        "ai21labs/Jamba-tiny-random",
-        "ai21labs/AI21-Jamba-Mini-1.6",
-        "ai21labs/AI21-Jamba-Large-1.5",
-        "ai21labs/AI21-Jamba-Large-1.6",
-        "ai21labs/AI21-Jamba2-3B",
-        "ai21labs/Jamba-v0.1",
-        "ai21labs/AI21-Jamba2-Mini",
-        "ai21labs/AI21-Jamba-Reasoning-3B",
-        "microsoft/Dayhoff-170m-GR"
-      ],
-      "relevancy_score": 58.6
-    },
-    {
-      "architecture_id": "DeepseekV32ForCausalLM",
-      "total_models": 8,
-      "total_downloads": 1446699,
-      "min_param_count": 136559748,
+      "architecture_id": "T5GemmaForConditionalGeneration",
+      "total_models": 14,
+      "total_downloads": 1062491,
+      "min_param_count": 312517632,
       "sample_models": [
-        "deepseek-ai/DeepSeek-V3.2",
-        "deepseek-ai/DeepSeek-V3.2-Exp",
-        "deepseek-ai/DeepSeek-V3.2-Speciale",
-        "deepseek-ai/DeepSeek-Math-V2",
-        "exolabs/DeepSeek-V3.2_bf16",
-        "deepseek-ai/DeepSeek-V3.2-Exp-Base",
-        "hyper-accel/tiny-random-deepseek-v32",
-        "cs2764/DeepSeek-V3.2_dq4-mlx"
+        "google/t5gemma-s-s-prefixlm",
+        "google/t5gemma-b-b-ul2",
+        "google/t5gemma-9b-9b-ul2",
+        "google/t5gemma-2b-2b-prefixlm",
+        "google/t5gemma-2b-2b-ul2",
+        "google/t5gemma-l-l-ul2-it",
+        "google/t5gemma-ml-ml-ul2-it",
+        "google/t5gemma-b-b-prefixlm",
+        "google/t5gemma-s-s-prefixlm-it",
+        "google/t5gemma-9b-9b-prefixlm"
       ],
-      "relevancy_score": 57.0
+      "relevancy_score": 59.1
     },
     {
-      "architecture_id": "BaichuanForCausalLM",
-      "total_models": 15,
-      "total_downloads": 115111,
-      "min_param_count": 16204352,
+      "architecture_id": "GPTBigCodeForCausalLM",
+      "total_models": 24,
+      "total_downloads": 39369,
+      "min_param_count": 1845928,
       "sample_models": [
-        "baichuan-inc/Baichuan2-7B-Chat",
-        "baichuan-inc/Baichuan2-13B-Chat",
-        "baichuan-inc/Baichuan-13B-Chat",
-        "baichuan-inc/Baichuan2-7B-Base",
-        "baichuan-inc/Baichuan2-13B-Base",
-        "sakuraumi/Sakura-13B-Galgame",
-        "zxbsmk/NSFW_13B_sft",
-        "katuni4ka/tiny-random-baichuan2",
-        "baichuan-inc/Baichuan-13B-Base",
-        "FreedomIntelligence/HuatuoGPT2-7B"
+        "bigcode/starcoder",
+        "bigcode/starcoderbase-1b",
+        "ibm-granite/granite-20b-code-base-8k",
+        "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct",
+        "HuggingFaceH4/starchat-alpha",
+        "defog/sqlcoder2",
+        "ibm-granite/granite-20b-code-instruct-8k",
+        "HuggingFaceH4/starchat-beta",
+        "LoupGarou/WizardCoder-Guanaco-15B-V1.0",
+        "openchat/opencoderplus"
       ],
-      "relevancy_score": 56.1
+      "relevancy_score": 58.1
     },
     {
       "architecture_id": "SmolLM3ForCausalLM",
-      "total_models": 7,
-      "total_downloads": 1155432,
+      "total_models": 8,
+      "total_downloads": 1123995,
       "min_param_count": 8245568,
       "sample_models": [
         "HuggingFaceTB/SmolLM3-3B",
@@ -322,15 +229,35 @@
         "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM",
         "unsloth/SmolLM3-3B",
         "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM",
-        "MInAlA/smollm3-dpo-merged",
-        "N-Bot-Int/SmolSam3-MEMGRPO"
+        "MInAlA/SmolLM3-3B-DPO-merged",
+        "N-Bot-Int/SmolSam3-MEMGRPO",
+        "yujiepan/smollm3-tiny-random"
+      ],
+      "relevancy_score": 55.5
+    },
+    {
+      "architecture_id": "GlmMoeDsaForCausalLM",
+      "total_models": 10,
+      "total_downloads": 590748,
+      "min_param_count": 162774148,
+      "sample_models": [
+        "zai-org/GLM-5",
+        "zai-org/GLM-5.1",
+        "cs2764/GLM-5-abliterated-dq4-mlx",
+        "livadies/GLM-5.1-Ghetto-MoE-2-Experts",
+        "unsloth/GLM-5",
+        "JANGQ-AI/GLM-5.1-JANG_2S",
+        "0xSero/GLM-5-REAP-381B",
+        "JANGQ-AI/GLM-5.1-JANG_1L",
+        "cs2764/GLM-5-abliterated-dq3-mlx",
+        "hyper-accel/tiny-random-glm-moe-dsa"
       ],
-      "relevancy_score": 55.9
+      "relevancy_score": 55.3
     },
     {
       "architecture_id": "BartForConditionalGeneration",
       "total_models": 9,
-      "total_downloads": 599134,
+      "total_downloads": 692599,
       "min_param_count": 6044480,
       "sample_models": [
         "KomeijiForce/bart-large-emojilm",
@@ -343,12 +270,31 @@
         "Tianlin668/MentalBART",
         "KomeijiForce/bart-large-emojilm-e2t"
       ],
-      "relevancy_score": 55.7
+      "relevancy_score": 55.0
+    },
+    {
+      "architecture_id": "BaichuanForCausalLM",
+      "total_models": 15,
+      "total_downloads": 117761,
+      "min_param_count": 16204352,
+      "sample_models": [
+        "baichuan-inc/Baichuan2-7B-Chat",
+        "baichuan-inc/Baichuan2-13B-Chat",
+        "baichuan-inc/Baichuan-13B-Chat",
+        "baichuan-inc/Baichuan2-7B-Base",
+        "baichuan-inc/Baichuan2-13B-Base",
+        "zxbsmk/NSFW_13B_sft",
+        "sakuraumi/Sakura-13B-Galgame",
+        "baichuan-inc/Baichuan-13B-Base",
+        "katuni4ka/tiny-random-baichuan2",
+        "FreedomIntelligence/HuatuoGPT2-7B"
+      ],
+      "relevancy_score": 54.9
     },
     {
       "architecture_id": "FalconH1ForCausalLM",
       "total_models": 15,
-      "total_downloads": 76731,
+      "total_downloads": 77408,
       "min_param_count": 91131072,
       "sample_models": [
         "tiiuae/Falcon-H1-0.5B-Base",
@@ -359,80 +305,45 @@
         "tiiuae/Falcon-H1-1.5B-Base",
         "tiiuae/Falcon-H1-Tiny-90M-Instruct",
         "tiiuae/Falcon-H1R-7B",
-        "tiiuae/Falcon-H1-1.5B-Deep-Instruct",
-        "tiiuae/Falcon-H1-3B-Instruct"
-      ],
-      "relevancy_score": 55.2
-    },
-    {
-      "architecture_id": "CohereForCausalLM",
-      "total_models": 10,
-      "total_downloads": 193414,
-      "min_param_count": 2042176,
-      "sample_models": [
-        "trl-internal-testing/tiny-CohereForCausalLM",
-        "CohereLabs/aya-expanse-8b",
-        "CohereLabs/c4ai-command-r-v01",
-        "CohereLabs/aya-23-8B",
-        "NLPark/AnFeng_v3_Avocet",
-        "CohereLabs/aya-expanse-32b",
-        "CohereLabs/aya-23-35B",
-        "CohereLabs/c4ai-command-r-plus-08-2024",
-        "CohereLabs/c4ai-command-r-08-2024",
-        "CohereLabs/c4ai-command-r-plus"
+        "tiiuae/Falcon-H1-3B-Instruct",
+        "tiiuae/Falcon-H1-1.5B-Deep-Instruct"
       ],
-      "relevancy_score": 53.9
+      "relevancy_score": 54.0
     },
     {
       "architecture_id": "H2OVLChatModel",
       "total_models": 2,
-      "total_downloads": 2131755,
+      "total_downloads": 2009160,
       "min_param_count": 826295808,
       "sample_models": [
         "h2oai/h2ovl-mississippi-800m",
         "h2oai/h2ovl-mississippi-2b"
       ],
-      "relevancy_score": 53.9
-    },
-    {
-      "architecture_id": "MiniCPMForCausalLM",
-      "total_models": 12,
-      "total_downloads": 93202,
-      "min_param_count": 80000640,
-      "sample_models": [
-        "openbmb/MiniCPM-2B-sft-bf16",
-        "openbmb/MiniCPM4.1-8B",
-        "openbmb/MiniCPM-1B-sft-bf16",
-        "openbmb/MiniCPM4-0.5B",
-        "openbmb/MiniCPM-MoE-8x2B",
-        "openbmb/MiniCPM-S-1B-sft",
-        "katuni4ka/tiny-random-minicpm",
-        "openbmb/MiniCPM4-8B",
-        "openbmb/MiniCPM-2B-sft-fp32",
-        "openbmb/MiniCPM-2B-dpo-bf16"
-      ],
-      "relevancy_score": 53.6
+      "relevancy_score": 53.0
     },
     {
-      "architecture_id": "GlmMoeDsaForCausalLM",
-      "total_models": 7,
-      "total_downloads": 411962,
-      "min_param_count": 162774148,
+      "architecture_id": "DFlashDraftModel",
+      "total_models": 11,
+      "total_downloads": 131573,
+      "min_param_count": 473995264,
       "sample_models": [
-        "zai-org/GLM-5",
-        "zai-org/GLM-5.1",
-        "cs2764/GLM-5-abliterated-dq4-mlx",
-        "0xSero/GLM-5-REAP-381B",
-        "unsloth/GLM-5",
-        "cs2764/GLM-5-abliterated-dq3-mlx",
-        "hyper-accel/tiny-random-glm-moe-dsa"
+        "z-lab/Qwen3-4B-DFlash-b16",
+        "z-lab/Qwen3-8B-DFlash-b16",
+        "z-lab/Qwen3.5-27B-DFlash",
+        "z-lab/Qwen3.5-9B-DFlash",
+        "z-lab/Qwen3.5-4B-DFlash",
+        "z-lab/Qwen3.5-35B-A3B-DFlash",
+        "z-lab/gpt-oss-120b-DFlash",
+        "z-lab/gpt-oss-20b-DFlash",
+        "z-lab/Qwen3-Coder-30B-A3B-DFlash",
+        "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat"
       ],
-      "relevancy_score": 53.6
+      "relevancy_score": 52.6
     },
     {
       "architecture_id": "Cohere2ForCausalLM",
       "total_models": 9,
-      "total_downloads": 207703,
+      "total_downloads": 220158,
       "min_param_count": 2090024,
       "sample_models": [
         "trl-internal-testing/tiny-Cohere2ForCausalLM",
@@ -445,12 +356,12 @@
         "CohereLabs/tiny-aya-fire",
         "CohereLabs/tiny-aya-earth"
       ],
-      "relevancy_score": 53.4
+      "relevancy_score": 52.5
     },
     {
       "architecture_id": "PhiMoEForCausalLM",
       "total_models": 4,
-      "total_downloads": 889098,
+      "total_downloads": 902073,
       "min_param_count": 1110112,
       "sample_models": [
         "microsoft/Phi-tiny-MoE-instruct",
@@ -458,37 +369,37 @@
         "microsoft/Phi-3.5-MoE-instruct",
         "optimum-intel-internal-testing/phi-3.5-moe-tiny-random"
       ],
-      "relevancy_score": 53.3
+      "relevancy_score": 52.5
     },
     {
-      "architecture_id": "MPTForCausalLM",
-      "total_models": 26,
-      "total_downloads": 41278,
-      "min_param_count": 6649286656,
+      "architecture_id": "MiniCPMForCausalLM",
+      "total_models": 12,
+      "total_downloads": 90418,
+      "min_param_count": 80000640,
       "sample_models": [
-        "vinai/PhoGPT-4B-Chat",
-        "vinai/PhoGPT-4B",
-        "anas-awadalla/mpt-7b",
-        "gl198976/mpt-7b-instruct",
-        "replit/replit-code-v1-3b",
-        "echarlaix/tiny-mpt-random-remote-code",
-        "wtang06/mpt-125m-c4",
-        "lightblue/japanese-mpt-7b",
-        "gl198976/mpt-7b",
-        "TehVenom/MPT-7b-Chat-Instruct-LongCTX-Merge"
+        "openbmb/MiniCPM-2B-sft-bf16",
+        "openbmb/MiniCPM4.1-8B",
+        "openbmb/MiniCPM-1B-sft-bf16",
+        "openbmb/MiniCPM4-0.5B",
+        "openbmb/MiniCPM-MoE-8x2B",
+        "openbmb/MiniCPM-S-1B-sft",
+        "openbmb/MiniCPM4-8B",
+        "katuni4ka/tiny-random-minicpm",
+        "openbmb/MiniCPM-2B-dpo-bf16",
+        "openbmb/MiniCPM-2B-sft-fp32"
       ],
-      "relevancy_score": 53.2
+      "relevancy_score": 52.4
     },
     {
       "architecture_id": "RwkvForCausalLM",
       "total_models": 15,
-      "total_downloads": 31498,
+      "total_downloads": 29790,
       "min_param_count": 169342464,
       "sample_models": [
         "RWKV/v5-Eagle-7B-HF",
         "RWKV/rwkv-4-169m-pile",
-        "beomi/KoRWKV-6B",
         "RWKV/rwkv-4-430m-pile",
+        "beomi/KoRWKV-6B",
         "RWKV/rwkv-4-1b5-pile",
         "RWKV/rwkv-4-3b-pile",
         "RWKV/rwkv-raven-1b5",
@@ -496,12 +407,12 @@
         "RWKV/rwkv-raven-3b",
         "RWKV/rwkv-raven-14b"
       ],
-      "relevancy_score": 53.2
+      "relevancy_score": 51.9
     },
     {
       "architecture_id": "MT5ForConditionalGeneration",
       "total_models": 13,
-      "total_downloads": 55149,
+      "total_downloads": 51271,
       "min_param_count": 300176768,
       "sample_models": [
         "knowledgator/IUPAC2SMILES-canonical-base",
@@ -515,47 +426,43 @@
         "intelia-lab-uah/mt0-base_QG_SQAC",
         "UBC-NLP/toucan-1.2B"
       ],
-      "relevancy_score": 53.1
-    },
-    {
-      "architecture_id": "DFlashDraftModel",
-      "total_models": 10,
-      "total_downloads": 128716,
-      "min_param_count": 473995264,
-      "sample_models": [
-        "z-lab/Qwen3-4B-DFlash-b16",
-        "z-lab/Qwen3-8B-DFlash-b16",
-        "z-lab/Qwen3.5-9B-DFlash",
-        "z-lab/Qwen3.5-27B-DFlash",
-        "z-lab/Qwen3.5-4B-DFlash",
-        "z-lab/gpt-oss-120b-DFlash",
-        "z-lab/gpt-oss-20b-DFlash",
-        "z-lab/Qwen3.5-35B-A3B-DFlash",
-        "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat",
-        "z-lab/Qwen3-Coder-30B-A3B-DFlash"
-      ],
-      "relevancy_score": 53.0
+      "relevancy_score": 51.8
     },
     {
       "architecture_id": "Qwen2MoeForCausalLM",
       "total_models": 7,
-      "total_downloads": 203653,
+      "total_downloads": 193536,
       "min_param_count": 1219036,
       "sample_models": [
         "Qwen/Qwen1.5-MoE-A2.7B",
         "Qwen/Qwen1.5-MoE-A2.7B-Chat",
         "Qwen/Qwen2-57B-A14B-Instruct",
         "Qwen/Qwen2-57B-A14B",
-        "katuni4ka/tiny-random-qwen1.5-moe",
         "yujiepan/qwen1.5-moe-tiny-random",
+        "katuni4ka/tiny-random-qwen1.5-moe",
         "xd2010/Qwen1.5-MOE-sft-math7k-densemixer"
       ],
-      "relevancy_score": 52.0
+      "relevancy_score": 51.0
+    },
+    {
+      "architecture_id": "FalconMambaForCausalLM",
+      "total_models": 6,
+      "total_downloads": 194376,
+      "min_param_count": 525400,
+      "sample_models": [
+        "trl-internal-testing/tiny-FalconMambaForCausalLM",
+        "tiiuae/falcon-mamba-7b-instruct",
+        "tiiuae/falcon-mamba-7b",
+        "tiiuae/falcon-mamba-tiny-dev",
+        "tiiuae/Falcon3-Mamba-7B-Instruct",
+        "tiiuae/Falcon3-Mamba-7B-Base"
+      ],
+      "relevancy_score": 50.4
     },
     {
       "architecture_id": "Phi3VForCausalLM",
       "total_models": 6,
-      "total_downloads": 174972,
+      "total_downloads": 173011,
       "min_param_count": 304612720,
       "sample_models": [
         "microsoft/Phi-3-vision-128k-instruct",
@@ -565,12 +472,12 @@
         "Desm0nt/Phi-3-HornyVision-128k-instruct",
         "failspy/Phi-3-vision-128k-instruct-abliterated-alpha"
       ],
-      "relevancy_score": 51.0
+      "relevancy_score": 50.1
     },
     {
       "architecture_id": "ExaoneForCausalLM",
       "total_models": 7,
-      "total_downloads": 660526,
+      "total_downloads": 626575,
       "min_param_count": 2405327360,
       "sample_models": [
         "LGAI-EXAONE/EXAONE-Deep-7.8B",
@@ -581,26 +488,31 @@
         "LGAI-EXAONE/EXAONE-Deep-32B",
         "LGAI-EXAONE/EXAONE-Deep-2.4B"
       ],
-      "relevancy_score": 50.6
+      "relevancy_score": 49.5
     },
     {
-      "architecture_id": "FalconMambaForCausalLM",
-      "total_models": 5,
-      "total_downloads": 186669,
-      "min_param_count": 525400,
+      "architecture_id": "Glm4ForCausalLM",
+      "total_models": 10,
+      "total_downloads": 32445,
+      "min_param_count": 4854928,
       "sample_models": [
-        "trl-internal-testing/tiny-FalconMambaForCausalLM",
-        "tiiuae/falcon-mamba-7b-instruct",
-        "tiiuae/falcon-mamba-7b",
-        "tiiuae/falcon-mamba-tiny-dev",
-        "tiiuae/Falcon3-Mamba-7B-Instruct"
+        "zai-org/GLM-4-9B-0414",
+        "zai-org/GLM-4-32B-0414",
+        "zai-org/GLM-Z1-9B-0414",
+        "MCult01/glm-muse-v2",
+        "zai-org/GLM-Z1-32B-0414",
+        "MCult01/glm-muse-v1",
+        "zai-org/GLM-4-32B-Base-0414",
+        "yujiepan/glm-4-tiny-random",
+        "llmfan46/GLM-4-32B-0414-uncensored-heretic-v1",
+        "ccui46/cookingworld_per_chunk_act_glm_tokfix_diffPrompt_5000"
       ],
-      "relevancy_score": 50.5
+      "relevancy_score": 49.0
     },
     {
       "architecture_id": "LlavaQwenForCausalLM",
       "total_models": 4,
-      "total_downloads": 165137,
+      "total_downloads": 186477,
       "min_param_count": 893618208,
       "sample_models": [
         "lmms-lab/llava-onevision-qwen2-7b-ov",
@@ -608,24 +520,62 @@
         "lmms-lab/llava-onevision-qwen2-7b-si",
         "lmms-lab/llava-onevision-qwen2-0.5b-si"
       ],
-      "relevancy_score": 49.5
+      "relevancy_score": 49.0
+    },
+    {
+      "architecture_id": "MiniMaxM2ForCausalLM",
+      "total_models": 23,
+      "total_downloads": 1143531,
+      "min_param_count": 18581099008,
+      "sample_models": [
+        "MiniMaxAI/MiniMax-M2.5",
+        "cerebras/MiniMax-M2.1-REAP-139B-A10B",
+        "MiniMaxAI/MiniMax-M2",
+        "MiniMaxAI/MiniMax-M2.7",
+        "MiniMaxAI/MiniMax-M2.1",
+        "cerebras/MiniMax-M2.5-REAP-139B-A10B",
+        "JANGQ-AI/MiniMax-M2.7-JANG_2L",
+        "aspctu/MiniMax-M2.5",
+        "JANGQ-AI/MiniMax-M2.7-JANG_3L",
+        "dealignai/MiniMax-M2.5-UNCENSORED-JANG_2L"
+      ],
+      "relevancy_score": 48.9
+    },
+    {
+      "architecture_id": "LlamaForCausalLMEagle3",
+      "total_models": 11,
+      "total_downloads": 22792,
+      "min_param_count": 145422848,
+      "sample_models": [
+        "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
+        "nvidia/gpt-oss-120b-Eagle3-long-context",
+        "chankhavu/c2.eagle3-test",
+        "Zjcxy-SmartAI/Eagle3-Qwen3-32B-zh",
+        "nvidia/gpt-oss-120b-Eagle3-short-context",
+        "Zjcxy-SmartAI/Eagle3-Qwen3-8B-zh",
+        "nvidia/gpt-oss-120b-Eagle3-throughput",
+        "thoughtworks/Gemma-4-31B-Eagle3",
+        "ruipeterpan/Qwen2.5-7B-Instruct_EAGLE3_UltraChat",
+        "thoughtworks/MiniMax-M2.5-Eagle3"
+      ],
+      "relevancy_score": 48.8
     },
     {
       "architecture_id": "BambaForCausalLM",
       "total_models": 3,
-      "total_downloads": 224342,
+      "total_downloads": 225000,
       "min_param_count": 33110760,
       "sample_models": [
         "hmellor/tiny-random-BambaForCausalLM",
         "ibm-ai-platform/Bamba-9B-v1",
         "ibm-ai-platform/Bamba-9B-v2"
       ],
-      "relevancy_score": 49.5
+      "relevancy_score": 48.8
     },
     {
       "architecture_id": "Eagle3Speculator",
       "total_models": 5,
-      "total_downloads": 105711,
+      "total_downloads": 104424,
       "min_param_count": 950186496,
       "sample_models": [
         "RedHatAI/Qwen3-8B-speculator.eagle3",
@@ -634,41 +584,41 @@
         "RedHatAI/Qwen3-32B-speculator.eagle3",
         "RedHatAI/Qwen3-14B-speculator.eagle3"
       ],
-      "relevancy_score": 49.2
+      "relevancy_score": 48.4
     },
     {
       "architecture_id": "OpenAIGPTLMHeadModel",
       "total_models": 2,
-      "total_downloads": 236281,
+      "total_downloads": 230174,
       "min_param_count": 119680512,
       "sample_models": [
         "openai-community/openai-gpt",
         "lgaalves/gpt1"
       ],
-      "relevancy_score": 49.0
+      "relevancy_score": 48.2
     },
     {
       "architecture_id": "HunYuanDenseV1ForCausalLM",
       "total_models": 9,
-      "total_downloads": 28409,
+      "total_downloads": 28771,
       "min_param_count": 539010048,
       "sample_models": [
         "tencent/Hunyuan-7B-Instruct",
         "tencent/Hunyuan-0.5B-Pretrain",
         "tencent/Hunyuan-1.8B-Pretrain",
-        "tencent/Hunyuan-4B-Pretrain",
         "tencent/Hunyuan-7B-Instruct-0124",
+        "tencent/Hunyuan-4B-Pretrain",
         "tencent/Hunyuan-7B-Pretrain",
         "tencent/Hunyuan-1.8B-Instruct",
         "tencent/Hunyuan-0.5B-Instruct",
         "tencent/Hunyuan-4B-Instruct"
       ],
-      "relevancy_score": 48.9
+      "relevancy_score": 48.1
     },
     {
       "architecture_id": "BloomModel",
       "total_models": 8,
-      "total_downloads": 38422,
+      "total_downloads": 39579,
       "min_param_count": 16156544,
       "sample_models": [
         "bigscience/bigscience-small-testing",
@@ -680,30 +630,12 @@
         "Muennighoff/bloom-tiny-random",
         "TurkuNLP/gpt3-finnish-xl"
       ],
-      "relevancy_score": 48.9
-    },
-    {
-      "architecture_id": "LlamaForCausalLMEagle3",
-      "total_models": 9,
-      "total_downloads": 20292,
-      "min_param_count": 208676608,
-      "sample_models": [
-        "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
-        "nvidia/gpt-oss-120b-Eagle3-long-context",
-        "nvidia/gpt-oss-120b-Eagle3-short-context",
-        "Zjcxy-SmartAI/Eagle3-Qwen3-32B-zh",
-        "chankhavu/c2.eagle3-test",
-        "Zjcxy-SmartAI/Eagle3-Qwen3-8B-zh",
-        "nvidia/gpt-oss-120b-Eagle3-throughput",
-        "thoughtworks/Gemma-4-31B-Eagle3",
-        "ruipeterpan/Qwen2.5-7B-Instruct_EAGLE3_UltraChat"
-      ],
-      "relevancy_score": 48.2
+      "relevancy_score": 48.1
     },
     {
       "architecture_id": "NemotronForCausalLM",
       "total_models": 5,
-      "total_downloads": 59740,
+      "total_downloads": 63951,
       "min_param_count": 2150720,
       "sample_models": [
         "nvidia/Nemotron-Mini-4B-Instruct",
@@ -712,89 +644,57 @@
         "badaoui/tiny-random-NemotronForCausalLM",
         "thhaus/nemotron3-8b"
       ],
-      "relevancy_score": 47.9
-    },
-    {
-      "architecture_id": "Glm4ForCausalLM",
-      "total_models": 7,
-      "total_downloads": 30432,
-      "min_param_count": 4854928,
-      "sample_models": [
-        "zai-org/GLM-4-9B-0414",
-        "zai-org/GLM-Z1-32B-0414",
-        "zai-org/GLM-Z1-9B-0414",
-        "zai-org/GLM-4-32B-0414",
-        "zai-org/GLM-4-32B-Base-0414",
-        "llmfan46/GLM-4-32B-0414-uncensored-heretic-v1",
-        "yujiepan/glm-4-tiny-random"
-      ],
-      "relevancy_score": 47.7
+      "relevancy_score": 47.3
     },
     {
       "architecture_id": "HyenaDNAForCausalLM",
       "total_models": 6,
-      "total_downloads": 38899,
+      "total_downloads": 38536,
       "min_param_count": 450712,
       "sample_models": [
         "LongSafari/hyenadna-small-32k-seqlen-hf",
         "LongSafari/hyenadna-medium-450k-seqlen-hf",
-        "LongSafari/hyenadna-large-1m-seqlen-hf",
         "LongSafari/hyenadna-tiny-1k-seqlen-hf",
+        "LongSafari/hyenadna-large-1m-seqlen-hf",
         "LongSafari/hyenadna-medium-160k-seqlen-hf",
         "LongSafari/hyenadna-tiny-16k-seqlen-d128-hf"
       ],
-      "relevancy_score": 47.6
-    },
-    {
-      "architecture_id": "ProGenForCausalLM",
-      "total_models": 5,
-      "total_downloads": 47595,
-      "min_param_count": 151148576,
-      "sample_models": [
-        "hugohrban/progen2-base",
-        "hugohrban/progen2-small",
-        "hugohrban/progen2-medium",
-        "hugohrban/progen2-large",
-        "hugohrban/progen2-small-mix7"
-      ],
-      "relevancy_score": 47.4
-    },
-    {
-      "architecture_id": "Eagle3DraftModel",
-      "total_models": 7,
-      "total_downloads": 24688,
-      "min_param_count": 522152832,
-      "sample_models": [
-        "RedHatAI/gpt-oss-20b-speculator.eagle3",
-        "RedHatAI/gpt-oss-120b-speculator.eagle3",
-        "RedHatAI/Qwen3-30B-A3B-Thinking-2507-speculator.eagle3",
-        "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3",
-        "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3",
-        "RedHatAI/Qwen3-30B-A3B-speculator.eagle3",
-        "RedHatAI/Qwen3-32B-Thinking-speculator.eagle3"
-      ],
-      "relevancy_score": 47.3
+      "relevancy_score": 46.8
     },
     {
       "architecture_id": "T5WithLMHeadModel",
       "total_models": 7,
-      "total_downloads": 25117,
+      "total_downloads": 25552,
       "min_param_count": 222903936,
       "sample_models": [
         "unicamp-dl/ptt5-base-portuguese-vocab",
         "Salesforce/codet5-large",
         "Salesforce/codet5-large-ntp-py",
         "Rostlab/prot_t5_xl_bfd",
-        "unicamp-dl/ptt5-small-portuguese-vocab",
         "gagan3012/k2t",
+        "unicamp-dl/ptt5-small-portuguese-vocab",
         "unicamp-dl/ptt5-large-portuguese-vocab"
       ],
-      "relevancy_score": 47.3
+      "relevancy_score": 46.6
+    },
+    {
+      "architecture_id": "ProGenForCausalLM",
+      "total_models": 5,
+      "total_downloads": 46959,
+      "min_param_count": 151148576,
+      "sample_models": [
+        "hugohrban/progen2-base",
+        "hugohrban/progen2-small",
+        "hugohrban/progen2-medium",
+        "hugohrban/progen2-large",
+        "hugohrban/progen2-small-mix7"
+      ],
+      "relevancy_score": 46.6
     },
     {
       "architecture_id": "Zamba2ForCausalLM",
-      "total_models": 7,
-      "total_downloads": 111102,
+      "total_models": 8,
+      "total_downloads": 111280,
       "min_param_count": 1215064704,
       "sample_models": [
         "Zyphra/Zamba2-1.2B-instruct",
@@ -803,14 +703,63 @@
         "EchoLabs33/zamba2-1.2b-hxq",
         "Zyphra/Zamba2-2.7B-instruct",
         "EchoLabs33/zamba2-2.7b-instruct-hxq",
-        "EchoLabs33/zamba2-7b-instruct-hxq"
+        "EchoLabs33/zamba2-7b-instruct-hxq",
+        "Zyphra/Zamba2-2.7B-Instruct-v2"
       ],
-      "relevancy_score": 46.6
+      "relevancy_score": 46.4
+    },
+    {
+      "architecture_id": "Qwen3_5MoeForConditionalGeneration",
+      "total_models": 16,
+      "total_downloads": 65079,
+      "min_param_count": 5555793776,
+      "sample_models": [
+        "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
+        "caiovicentino1/Qwopus-MoE-35B-A3B-HLWQ-Q5",
+        "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx",
+        "Brooooooklyn/Qwen3.5-35B-A3B-UD-Q4_K_XL-mlx",
+        "nivvis/Qwen3.5-35B-A3B-EQ-v5",
+        "JANGQ-AI/Qwen3.5-397B-A17B-JANG_1L",
+        "Jackrong/MLX-Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled-bf16",
+        "nightmedia/Qwen3.5-122B-A10B-Text-qx85-mlx",
+        "JANGQ-AI/Qwen3.5-397B-A17B-JANG_2L",
+        "Kevletesteur/Qwen3.5-35B-A3B-Chimere-Distilled-BF16"
+      ],
+      "relevancy_score": 46.2
+    },
+    {
+      "architecture_id": "Ernie4_5_MoeForCausalLM",
+      "total_models": 5,
+      "total_downloads": 38765,
+      "min_param_count": 904040,
+      "sample_models": [
+        "baidu/ERNIE-4.5-21B-A3B-PT",
+        "baidu/ERNIE-4.5-21B-A3B-Base-PT",
+        "baidu/ERNIE-4.5-21B-A3B-Thinking",
+        "yujiepan/ernie-4.5-moe-tiny-random",
+        "baidu/ERNIE-4.5-300B-A47B-PT"
+      ],
+      "relevancy_score": 46.2
+    },
+    {
+      "architecture_id": "Eagle3DraftModel",
+      "total_models": 6,
+      "total_downloads": 24433,
+      "min_param_count": 522152832,
+      "sample_models": [
+        "RedHatAI/gpt-oss-20b-speculator.eagle3",
+        "RedHatAI/gpt-oss-120b-speculator.eagle3",
+        "RedHatAI/Qwen3-30B-A3B-Thinking-2507-speculator.eagle3",
+        "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3",
+        "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3",
+        "RedHatAI/Qwen3-30B-A3B-speculator.eagle3"
+      ],
+      "relevancy_score": 45.8
     },
     {
       "architecture_id": "AquilaForCausalLM",
       "total_models": 7,
-      "total_downloads": 17937,
+      "total_downloads": 17374,
       "min_param_count": 6425376,
       "sample_models": [
         "BAAI/AquilaChat2-7B",
@@ -821,28 +770,12 @@
         "BAAI/AquilaChat2-34B-16K",
         "BAAI/Aquila2-70B-Expr"
       ],
-      "relevancy_score": 46.6
-    },
-    {
-      "architecture_id": "XverseForCausalLM",
-      "total_models": 7,
-      "total_downloads": 15816,
-      "min_param_count": 6459056,
-      "sample_models": [
-        "xverse/XVERSE-7B-Chat",
-        "katuni4ka/tiny-random-xverse",
-        "xverse/XVERSE-13B-256K",
-        "xverse/XVERSE-13B",
-        "xverse/XVERSE-65B-Chat",
-        "xverse/XVERSE-13B-Chat",
-        "xverse/XVERSE-7B"
-      ],
-      "relevancy_score": 46.3
+      "relevancy_score": 45.7
     },
     {
       "architecture_id": "ArceeForCausalLM",
       "total_models": 4,
-      "total_downloads": 36482,
+      "total_downloads": 37111,
       "min_param_count": 4129088,
       "sample_models": [
         "arcee-ai/AFM-4.5B-Base",
@@ -850,31 +783,28 @@
         "onnx-internal-testing/tiny-random-ArceeForCausalLM",
         "arcee-ai/AFM-4.5B"
       ],
-      "relevancy_score": 46.1
+      "relevancy_score": 45.5
     },
     {
-      "architecture_id": "Qwen3_5MoeForConditionalGeneration",
-      "total_models": 15,
-      "total_downloads": 45472,
-      "min_param_count": 6643527536,
+      "architecture_id": "XverseForCausalLM",
+      "total_models": 7,
+      "total_downloads": 15400,
+      "min_param_count": 6459056,
       "sample_models": [
-        "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
-        "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx",
-        "Jackrong/MLX-Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled-bf16",
-        "caiovicentino1/Qwopus-MoE-35B-A3B-PolarQuant-Q5",
-        "nivvis/Qwen3.5-35B-A3B-EQ-v5",
-        "Brooooooklyn/Qwen3.5-35B-A3B-UD-Q4_K_XL-mlx",
-        "JANGQ-AI/Qwen3.5-397B-A17B-JANG_1L",
-        "nightmedia/Qwen3.5-122B-A10B-Text-qx85-mlx",
-        "JANGQ-AI/Qwen3.5-397B-A17B-JANG_2L",
-        "Kevletesteur/Qwen3.5-35B-A3B-Chimere-Distilled-BF16"
+        "xverse/XVERSE-7B-Chat",
+        "katuni4ka/tiny-random-xverse",
+        "xverse/XVERSE-13B-256K",
+        "xverse/XVERSE-13B",
+        "xverse/XVERSE-65B-Chat",
+        "xverse/XVERSE-13B-Chat",
+        "xverse/XVERSE-7B"
       ],
-      "relevancy_score": 46.0
+      "relevancy_score": 45.4
     },
     {
       "architecture_id": "LlavaQwen2ForCausalLM",
       "total_models": 5,
-      "total_downloads": 25580,
+      "total_downloads": 23815,
       "min_param_count": 758833760,
       "sample_models": [
         "qnguyen3/nanoLLaVA",
@@ -883,12 +813,26 @@
         "apple/FastVLM-7B",
         "FreedomIntelligence/HuatuoGPT-Vision-7B"
       ],
-      "relevancy_score": 46.0
+      "relevancy_score": 45.2
+    },
+    {
+      "architecture_id": "Llama4ForCausalLM",
+      "total_models": 5,
+      "total_downloads": 22089,
+      "min_param_count": 3269144,
+      "sample_models": [
+        "trl-internal-testing/tiny-Llama4ForCausalLM",
+        "pruna-test/test-save-tiny-random-llama4-smashed",
+        "facebook/MobileLLM-R1.5-360M",
+        "facebook/MobileLLM-R1-950M",
+        "facebook/MobileLLM-R1-140M"
+      ],
+      "relevancy_score": 45.0
     },
     {
       "architecture_id": "SDARForCausalLM",
       "total_models": 6,
-      "total_downloads": 96427,
+      "total_downloads": 93909,
       "min_param_count": 2031739904,
       "sample_models": [
         "JetLM/SDAR-1.7B-Chat",
@@ -898,43 +842,25 @@
         "JetLM/SDAR-4B-Chat",
         "JetLM/SDAR-4B-Chat-b32"
       ],
-      "relevancy_score": 45.6
+      "relevancy_score": 44.8
     },
     {
-      "architecture_id": "DeepseekV2ForCausalLM",
-      "total_models": 14,
-      "total_downloads": 1552347,
-      "min_param_count": 15706484224,
+      "architecture_id": "SeedOssForCausalLM",
+      "total_models": 4,
+      "total_downloads": 25731,
+      "min_param_count": 2497064,
       "sample_models": [
-        "deepseek-ai/DeepSeek-V2-Lite-Chat",
-        "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
-        "deepseek-ai/DeepSeek-V2-Lite",
-        "deepseek-ai/DeepSeek-V2",
-        "deepseek-ai/DeepSeek-V2-Chat",
-        "deepseek-ai/DeepSeek-V2.5",
-        "deepseek-ai/DeepSeek-Coder-V2-Instruct",
-        "deepseek-ai/DeepSeek-V2-Chat-0628",
-        "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
-        "Kwaipilot/KwaiCoder-DS-V2-Lite-Base"
-      ],
-      "relevancy_score": 45.2
-    },
-    {
-      "architecture_id": "BitNetForCausalLM",
-      "total_models": 3,
-      "total_downloads": 25988,
-      "min_param_count": 849787090,
-      "sample_models": [
-        "microsoft/bitnet-b1.58-2B-4T",
-        "microsoft/bitnet-b1.58-2B-4T-bf16",
-        "iSolver-AI/FEnet"
+        "ByteDance-Seed/Seed-OSS-36B-Instruct",
+        "NousResearch/Hermes-4.3-36B",
+        "ByteDance-Seed/Seed-OSS-36B-Base",
+        "yujiepan/seed-oss-tiny-random"
       ],
       "relevancy_score": 44.7
     },
     {
       "architecture_id": "PldrllmForCausalLM",
       "total_models": 5,
-      "total_downloads": 13743,
+      "total_downloads": 14045,
       "min_param_count": 109689362,
       "sample_models": [
         "fromthesky/PLDR-LLM-v51-SOC-110M-5",
@@ -943,31 +869,43 @@
         "fromthesky/PLDR-LLM-v51-SOC-110M-3",
         "fromthesky/PLDR-LLM-v51-SOC-110M-1"
       ],
-      "relevancy_score": 44.6
+      "relevancy_score": 44.0
     },
     {
-      "architecture_id": "DeciLMForCausalLM",
-      "total_models": 13,
-      "total_downloads": 256626,
-      "min_param_count": 7043551232,
+      "architecture_id": "DeepseekV2ForCausalLM",
+      "total_models": 14,
+      "total_downloads": 1547180,
+      "min_param_count": 15706484224,
       "sample_models": [
-        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
-        "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
-        "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
-        "ConicCat/Llama3_3-Nemo-Super-Writer-49B",
-        "nvidia/Llama-3_1-Nemotron-51B-Instruct",
-        "FriendliAI/Llama-3_3-Nemotron-Super-49B-v1_5",
-        "FriendliAI/Llama-3_1-Nemotron-Ultra-253B-v1",
-        "nvidia/Llama-3_1-Nemotron-Ultra-253B-CPT-v1",
-        "NewstaR/Porpoise-6b-instruct",
-        "Danielbrdz/Barcenas-6b"
+        "deepseek-ai/DeepSeek-V2-Lite-Chat",
+        "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        "deepseek-ai/DeepSeek-V2-Lite",
+        "deepseek-ai/DeepSeek-V2",
+        "deepseek-ai/DeepSeek-V2-Chat",
+        "deepseek-ai/DeepSeek-V2.5",
+        "deepseek-ai/DeepSeek-Coder-V2-Instruct",
+        "deepseek-ai/DeepSeek-V2-Chat-0628",
+        "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
+        "Kwaipilot/KwaiCoder-DS-V2-Lite-Base"
+      ],
+      "relevancy_score": 43.9
+    },
+    {
+      "architecture_id": "BitNetForCausalLM",
+      "total_models": 3,
+      "total_downloads": 23875,
+      "min_param_count": 849787090,
+      "sample_models": [
+        "microsoft/bitnet-b1.58-2B-4T",
+        "microsoft/bitnet-b1.58-2B-4T-bf16",
+        "iSolver-AI/FEnet"
       ],
-      "relevancy_score": 44.5
+      "relevancy_score": 43.9
     },
     {
       "architecture_id": "MoAMetricLM",
       "total_models": 5,
-      "total_downloads": 12168,
+      "total_downloads": 12515,
       "min_param_count": 69130608,
       "sample_models": [
         "reaperdoesntknow/MoA-150M",
@@ -976,24 +914,50 @@
         "reaperdoesntknow/MoA-100M",
         "reaperdoesntknow/DiscoverLM-70M"
       ],
-      "relevancy_score": 44.4
+      "relevancy_score": 43.7
     },
     {
-      "architecture_id": "Llama4ForCausalLM",
-      "total_models": 3,
-      "total_downloads": 20274,
-      "min_param_count": 3269144,
+      "architecture_id": "LlavaLlamaForCausalLM",
+      "total_models": 18,
+      "total_downloads": 64859,
+      "min_param_count": 7466764288,
       "sample_models": [
-        "trl-internal-testing/tiny-Llama4ForCausalLM",
-        "pruna-test/test-save-tiny-random-llama4-smashed",
-        "facebook/MobileLLM-R1.5-360M"
+        "wisdomik/Quilt-Llava-v1.5-7b",
+        "LanguageBind/Video-LLaVA-7B",
+        "liuhaotian/llava-llama-2-13b-chat-lightning-preview",
+        "mmaaz60/LLaVA-7B-Lightening-v1-1",
+        "lmms-lab/llama3-llava-next-8b",
+        "microsoft/llava-med-7b-delta",
+        "deepcs233/VisCoT-7b-336",
+        "ManishThota/Ollama_Video_llama_7B",
+        "liuhaotian/LLaVA-Lightning-7B-delta-v1-1",
+        "EricPolaris/Quilt-Llava-v1.5-7b"
+      ],
+      "relevancy_score": 43.5
+    },
+    {
+      "architecture_id": "DeciLMForCausalLM",
+      "total_models": 13,
+      "total_downloads": 263391,
+      "min_param_count": 7043551232,
+      "sample_models": [
+        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
+        "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+        "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
+        "ConicCat/Llama3_3-Nemo-Super-Writer-49B",
+        "nvidia/Llama-3_1-Nemotron-51B-Instruct",
+        "FriendliAI/Llama-3_3-Nemotron-Super-49B-v1_5",
+        "FriendliAI/Llama-3_1-Nemotron-Ultra-253B-v1",
+        "nvidia/Llama-3_1-Nemotron-Ultra-253B-CPT-v1",
+        "NewstaR/Porpoise-6b-instruct",
+        "Danielbrdz/Barcenas-6b"
       ],
-      "relevancy_score": 44.2
+      "relevancy_score": 43.4
     },
     {
       "architecture_id": "MBartForConditionalGeneration",
       "total_models": 6,
-      "total_downloads": 7379,
+      "total_downloads": 7575,
       "min_param_count": 379691717,
       "sample_models": [
         "Pravopysnyk/best-unlp",
@@ -1003,234 +967,221 @@
         "MRNH/mbart-german-grammar-corrector",
         "MRNH/mbart-russian-grammar-corrector"
       ],
-      "relevancy_score": 43.9
+      "relevancy_score": 43.3
     },
     {
       "architecture_id": "DogeForCausalLM",
       "total_models": 6,
-      "total_downloads": 7207,
+      "total_downloads": 7541,
       "min_param_count": 13118728,
       "sample_models": [
         "SmallDoge/Doge-320M",
-        "SmallDoge/Doge-20M",
         "SmallDoge/Doge-160M",
+        "SmallDoge/Doge-20M",
         "SmallDoge/Doge-60M",
         "SmallDoge/Doge-120M-MoE",
         "SmallDoge/Doge-20M-MoE"
       ],
-      "relevancy_score": 43.9
+      "relevancy_score": 43.3
     },
     {
       "architecture_id": "NemotronFlashForCausalLM",
       "total_models": 2,
-      "total_downloads": 21466,
+      "total_downloads": 23953,
       "min_param_count": 965389440,
       "sample_models": [
-        "nvidia/Nemotron-Flash-3B",
-        "nvidia/Nemotron-Flash-1B"
+        "nvidia/Nemotron-Flash-1B",
+        "nvidia/Nemotron-Flash-3B"
       ],
-      "relevancy_score": 43.6
-    },
-    {
-      "architecture_id": "LlavaLlamaForCausalLM",
-      "total_models": 18,
-      "total_downloads": 33654,
-      "min_param_count": 7466764288,
-      "sample_models": [
-        "LanguageBind/Video-LLaVA-7B",
-        "wisdomik/Quilt-Llava-v1.5-7b",
-        "liuhaotian/llava-llama-2-13b-chat-lightning-preview",
-        "lmms-lab/llama3-llava-next-8b",
-        "mmaaz60/LLaVA-7B-Lightening-v1-1",
-        "microsoft/llava-med-7b-delta",
-        "deepcs233/VisCoT-7b-336",
-        "ManishThota/Ollama_Video_llama_7B",
-        "liuhaotian/LLaVA-Lightning-7B-delta-v1-1",
-        "EricPolaris/Quilt-Llava-v1.5-7b"
-      ],
-      "relevancy_score": 43.4
+      "relevancy_score": 43.3
     },
     {
-      "architecture_id": "Exaone4ForCausalLM",
+      "architecture_id": "EchoForCausalLM",
       "total_models": 3,
-      "total_downloads": 86753,
-      "min_param_count": 1279391488,
+      "total_downloads": 15499,
+      "min_param_count": 114687488,
       "sample_models": [
-        "LGAI-EXAONE/EXAONE-4.0-1.2B",
-        "LGAI-EXAONE/EXAONE-4.0.1-32B",
-        "LGAI-EXAONE/EXAONE-4.0-32B"
+        "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT",
+        "ethicalabs/Echo-DSRN-114M",
+        "ethicalabs/Echo-DSRN-114M-Base"
       ],
-      "relevancy_score": 43.4
+      "relevancy_score": 43.0
     },
     {
       "architecture_id": "MiniMaxForCausalLM",
       "total_models": 2,
-      "total_downloads": 16499,
+      "total_downloads": 19980,
       "min_param_count": 231006264,
       "sample_models": [
         "MiniMaxAI/MiniMax-Text-01-hf",
         "hyper-accel/tiny-random-minimax"
       ],
-      "relevancy_score": 43.0
+      "relevancy_score": 42.9
     },
     {
-      "architecture_id": "EchoForCausalLM",
-      "total_models": 2,
-      "total_downloads": 14469,
-      "min_param_count": 114687488,
-      "sample_models": [
-        "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT",
-        "ethicalabs/Echo-DSRN-114M-Base"
-      ],
-      "relevancy_score": 42.7
-    },
-    {
-      "architecture_id": "AraGPT2LMHeadModel",
+      "architecture_id": "Exaone4ForCausalLM",
       "total_models": 3,
-      "total_downloads": 9685,
-      "min_param_count": 829369856,
+      "total_downloads": 69214,
+      "min_param_count": 1279391488,
       "sample_models": [
-        "QCRI/Fanar-2-Diwan",
-        "aubmindlab/aragpt2-mega",
-        "aubmindlab/aragpt2-large"
+        "LGAI-EXAONE/EXAONE-4.0-1.2B",
+        "LGAI-EXAONE/EXAONE-4.0-32B",
+        "LGAI-EXAONE/EXAONE-4.0.1-32B"
       ],
-      "relevancy_score": 42.5
+      "relevancy_score": 42.2
     },
     {
-      "architecture_id": "IlamaForCausalLM",
-      "total_models": 1,
-      "total_downloads": 105084,
-      "min_param_count": 1235814400,
+      "architecture_id": "DbrxForCausalLM",
+      "total_models": 2,
+      "total_downloads": 14052,
+      "min_param_count": 1612456,
       "sample_models": [
-        "hmellor/Ilama-3.2-1B"
+        "trl-internal-testing/tiny-DbrxForCausalLM",
+        "katuni4ka/tiny-random-dbrx"
       ],
-      "relevancy_score": 42.5
+      "relevancy_score": 42.1
     },
     {
       "architecture_id": "ModernBertForSequenceClassification",
       "total_models": 1,
-      "total_downloads": 17538,
+      "total_downloads": 18432,
       "min_param_count": 149609478,
       "sample_models": [
         "opendatalab/meta-rater-professionalism-rating"
       ],
-      "relevancy_score": 42.5
+      "relevancy_score": 42.1
     },
     {
-      "architecture_id": "LLaMAForCausalLM",
-      "total_models": 12,
-      "total_downloads": 21954,
-      "min_param_count": 6738425856,
+      "architecture_id": "Mistral3ForConditionalGeneration",
+      "total_models": 6,
+      "total_downloads": 163824,
+      "min_param_count": 4251743232,
       "sample_models": [
-        "maicomputer/alpaca-13b",
-        "Enoch/llama-65b-hf",
-        "mncai/chatdoctor",
-        "AdaptLLM/law-LLM",
-        "Nitish-Garikoti/finance-LLM",
-        "boboto/LLaMA-65B-HF",
-        "AdaptLLM/finance-LLM",
-        "AdaptLLM/medicine-LLM",
-        "Rardilit/Panther_v1",
-        "James-WYang/BigTranslate"
+        "farbodtavakkoli/OTel-LLM-3B-IT",
+        "ArmGPT/ArmenianGPT-1.0-3B",
+        "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_2L",
+        "odytrice/kenichi-flash",
+        "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_6M",
+        "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_4M"
       ],
-      "relevancy_score": 42.4
+      "relevancy_score": 42.0
     },
     {
-      "architecture_id": "RITAModelForCausalLM",
+      "architecture_id": "AraGPT2LMHeadModel",
       "total_models": 3,
-      "total_downloads": 9211,
-      "min_param_count": 85096320,
+      "total_downloads": 9765,
+      "min_param_count": 829369856,
       "sample_models": [
-        "lightonai/RITA_s",
-        "lightonai/RITA_xl",
-        "lightonai/RITA_l"
+        "QCRI/Fanar-2-Diwan",
+        "aubmindlab/aragpt2-mega",
+        "aubmindlab/aragpt2-large"
       ],
-      "relevancy_score": 42.4
+      "relevancy_score": 42.0
     },
     {
       "architecture_id": "StarVectorForCausalLM",
       "total_models": 2,
-      "total_downloads": 74194,
+      "total_downloads": 82922,
       "min_param_count": 1434095620,
       "sample_models": [
         "starvector/starvector-1b-im2svg",
         "starvector/starvector-8b-im2svg"
       ],
-      "relevancy_score": 42.4
+      "relevancy_score": 42.0
     },
     {
-      "architecture_id": "DbrxForCausalLM",
-      "total_models": 2,
-      "total_downloads": 12374,
-      "min_param_count": 1612456,
-      "sample_models": [
-        "trl-internal-testing/tiny-DbrxForCausalLM",
-        "katuni4ka/tiny-random-dbrx"
-      ],
-      "relevancy_score": 42.4
-    },
-    {
-      "architecture_id": "Mistral3ForConditionalGeneration",
-      "total_models": 5,
-      "total_downloads": 163974,
-      "min_param_count": 4251743232,
-      "sample_models": [
-        "farbodtavakkoli/OTel-LLM-3B-IT",
-        "ArmGPT/ArmenianGPT-1.0-3B",
-        "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_2L",
-        "odytrice/kenichi-flash",
-        "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_6M"
-      ],
-      "relevancy_score": 42.2
-    },
-    {
-      "architecture_id": "Plamo2ForCausalLM",
-      "total_models": 1,
-      "total_downloads": 81427,
-      "min_param_count": 1291441920,
+      "architecture_id": "RITAModelForCausalLM",
+      "total_models": 3,
+      "total_downloads": 9391,
+      "min_param_count": 85096320,
       "sample_models": [
-        "pfnet/plamo-2-1b"
+        "lightonai/RITA_s",
+        "lightonai/RITA_xl",
+        "lightonai/RITA_l"
       ],
       "relevancy_score": 41.9
     },
     {
       "architecture_id": "OLMoForCausalLM",
-      "total_models": 6,
-      "total_downloads": 16483,
+      "total_models": 7,
+      "total_downloads": 17805,
       "min_param_count": 1176764416,
       "sample_models": [
         "allenai/OLMo-7B-Instruct",
         "allenai/OLMo-7B",
         "allenai/OLMo-1B",
         "allenai/OLMo-7B-0424",
+        "Nhoodie/omni-dna-ici-dc",
         "allenai/OLMo-7B-Twin-2T",
         "allenai/OLMo-7B-SFT"
       ],
+      "relevancy_score": 41.8
+    },
+    {
+      "architecture_id": "IlamaForCausalLM",
+      "total_models": 1,
+      "total_downloads": 102422,
+      "min_param_count": 1235814400,
+      "sample_models": [
+        "hmellor/Ilama-3.2-1B"
+      ],
+      "relevancy_score": 41.8
+    },
+    {
+      "architecture_id": "A2DQwen3LMHeadModel",
+      "total_models": 2,
+      "total_downloads": 11738,
+      "min_param_count": 751632384,
+      "sample_models": [
+        "dllm-hub/Qwen3-0.6B-diffusion-mdlm-v0.1",
+        "dllm-hub/Qwen3-0.6B-diffusion-bd3lm-v0.1"
+      ],
       "relevancy_score": 41.7
     },
     {
-      "architecture_id": "MiniMaxM2ForCausalLM",
-      "total_models": 10,
-      "total_downloads": 970048,
-      "min_param_count": 18581099008,
+      "architecture_id": "NandiForCausalLM",
+      "total_models": 2,
+      "total_downloads": 10215,
+      "min_param_count": 153412928,
       "sample_models": [
-        "MiniMaxAI/MiniMax-M2.5",
-        "cerebras/MiniMax-M2.1-REAP-139B-A10B",
-        "MiniMaxAI/MiniMax-M2",
-        "MiniMaxAI/MiniMax-M2.1",
-        "cerebras/MiniMax-M2.5-REAP-139B-A10B",
-        "aspctu/MiniMax-M2.5",
-        "dealignai/MiniMax-M2.5-UNCENSORED-JANG_2L",
-        "unsloth/MiniMax-M2.5",
-        "dealignai/MiniMax-M2.5-JANG_3L-CRACK",
-        "JANGQ-AI/MiniMax-M2.5-JANG_3L"
+        "Rta-AILabs/Nandi-Mini-150M",
+        "Rta-AILabs/Nandi-Mini-150M-Instruct"
+      ],
+      "relevancy_score": 41.4
+    },
+    {
+      "architecture_id": "LLaMAForCausalLM",
+      "total_models": 12,
+      "total_downloads": 21884,
+      "min_param_count": 6738425856,
+      "sample_models": [
+        "maicomputer/alpaca-13b",
+        "Enoch/llama-65b-hf",
+        "mncai/chatdoctor",
+        "AdaptLLM/law-LLM",
+        "Nitish-Garikoti/finance-LLM",
+        "boboto/LLaMA-65B-HF",
+        "AdaptLLM/finance-LLM",
+        "AdaptLLM/medicine-LLM",
+        "Rardilit/Panther_v1",
+        "James-WYang/BigTranslate"
+      ],
+      "relevancy_score": 41.3
+    },
+    {
+      "architecture_id": "Plamo2ForCausalLM",
+      "total_models": 1,
+      "total_downloads": 81448,
+      "min_param_count": 1291441920,
+      "sample_models": [
+        "pfnet/plamo-2-1b"
       ],
-      "relevancy_score": 41.5
+      "relevancy_score": 41.3
     },
     {
       "architecture_id": "Starcoder2ForCausalLM",
       "total_models": 5,
-      "total_downloads": 117124,
+      "total_downloads": 116878,
       "min_param_count": 3030371328,
       "sample_models": [
         "bigcode/starcoder2-3b",
@@ -1239,48 +1190,36 @@
         "bigcode/starcoder2-15b-instruct-v0.1",
         "dphn/dolphincoder-starcoder2-15b"
       ],
-      "relevancy_score": 41.4
-    },
-    {
-      "architecture_id": "GlmForCausalLM",
-      "total_models": 4,
-      "total_downloads": 23486,
-      "min_param_count": 1593427968,
-      "sample_models": [
-        "zai-org/glm-4-9b-chat-hf",
-        "zai-org/glm-4-9b-hf",
-        "zai-org/glm-edge-4b-chat",
-        "zai-org/glm-edge-1.5b-chat"
-      ],
-      "relevancy_score": 41.2
+      "relevancy_score": 40.6
     },
     {
       "architecture_id": "MolformerForCausalLM",
       "total_models": 2,
-      "total_downloads": 7302,
+      "total_downloads": 6850,
       "min_param_count": 46805760,
       "sample_models": [
         "ibm-research/GP-MoLFormer-Uniq",
         "ralyn/NPComposer-v2"
       ],
-      "relevancy_score": 41.2
+      "relevancy_score": 40.6
     },
     {
-      "architecture_id": "MptForCausalLM",
-      "total_models": 3,
-      "total_downloads": 4577,
-      "min_param_count": 405032,
+      "architecture_id": "GlmForCausalLM",
+      "total_models": 4,
+      "total_downloads": 23066,
+      "min_param_count": 1593427968,
       "sample_models": [
-        "yujiepan/mpt-tiny-random",
-        "explosion-testing/mpt-test",
-        "team-lucid/mptk-1b"
+        "zai-org/glm-4-9b-chat-hf",
+        "zai-org/glm-4-9b-hf",
+        "zai-org/glm-edge-4b-chat",
+        "zai-org/glm-edge-1.5b-chat"
       ],
-      "relevancy_score": 40.8
+      "relevancy_score": 40.5
     },
     {
       "architecture_id": "Glm4MoeLiteForCausalLM",
-      "total_models": 8,
-      "total_downloads": 1257096,
+      "total_models": 9,
+      "total_downloads": 1217856,
       "min_param_count": 22996118432,
       "sample_models": [
         "zai-org/GLM-4.7-Flash",
@@ -1290,79 +1229,85 @@
         "Olafangensan/GLM-4.7-Flash-heretic",
         "Ex0bit/GLM-4.7-Flash-PRISM",
         "jerrycheng233/model5_sft_16bit",
-        "aaravriyer193/chimpgpt-coder-elite"
+        "aaravriyer193/chimpgpt-coder-elite",
+        "austindixson/glm-4.7-flash-Opus-Reasoning"
       ],
-      "relevancy_score": 40.7
+      "relevancy_score": 40.3
     },
     {
-      "architecture_id": "LLaDAModelLM",
-      "total_models": 4,
-      "total_downloads": 682726,
-      "min_param_count": 8015581184,
+      "architecture_id": "MptForCausalLM",
+      "total_models": 3,
+      "total_downloads": 4595,
+      "min_param_count": 405032,
       "sample_models": [
-        "GSAI-ML/LLaDA-8B-Instruct",
-        "GSAI-ML/LLaDA-8B-Base",
-        "GSAI-ML/LLaDA-1.5",
-        "d3LLM/d3LLM_LLaDA"
+        "yujiepan/mpt-tiny-random",
+        "explosion-testing/mpt-test",
+        "team-lucid/mptk-1b"
       ],
-      "relevancy_score": 40.7
+      "relevancy_score": 40.3
     },
     {
-      "architecture_id": "NandiForCausalLM",
-      "total_models": 1,
-      "total_downloads": 7981,
-      "min_param_count": 153412928,
+      "architecture_id": "Llama4ForConditionalGeneration",
+      "total_models": 2,
+      "total_downloads": 5844,
+      "min_param_count": 6686880,
       "sample_models": [
-        "Rta-AILabs/Nandi-Mini-150M"
+        "yujiepan/llama-4-tiny-random",
+        "Mogith/Llama-4-Scout-17B-16E-Instruct-Q8_0"
       ],
-      "relevancy_score": 40.7
+      "relevancy_score": 40.2
     },
     {
       "architecture_id": "DuchifatCore",
       "total_models": 3,
-      "total_downloads": 4079,
+      "total_downloads": 4086,
       "min_param_count": 136763904,
       "sample_models": [
         "Raziel1234/Duchifat-2",
         "razielAI/Duchifat-2.1-Instruct",
         "TopAI-1/Duchifat-2-Instruct"
       ],
-      "relevancy_score": 40.6
+      "relevancy_score": 40.1
+    },
+    {
+      "architecture_id": "LLaDAModelLM",
+      "total_models": 4,
+      "total_downloads": 659922,
+      "min_param_count": 8015581184,
+      "sample_models": [
+        "GSAI-ML/LLaDA-8B-Instruct",
+        "GSAI-ML/LLaDA-8B-Base",
+        "GSAI-ML/LLaDA-1.5",
+        "d3LLM/d3LLM_LLaDA"
+      ],
+      "relevancy_score": 39.8
     },
     {
       "architecture_id": "GLAForCausalLM",
       "total_models": 2,
-      "total_downloads": 5043,
+      "total_downloads": 4823,
       "min_param_count": 341707776,
       "sample_models": [
-        "fla-hub/gla-340M-15B",
-        "fla-hub/gla-1.3B-100B"
+        "fla-hub/gla-1.3B-100B",
+        "fla-hub/gla-340M-15B"
       ],
-      "relevancy_score": 40.4
+      "relevancy_score": 39.8
     },
     {
-      "architecture_id": "RWForCausalLM",
-      "total_models": 11,
-      "total_downloads": 11851,
-      "min_param_count": 6854619456,
+      "architecture_id": "BertLMHeadModel",
+      "total_models": 2,
+      "total_downloads": 4589,
+      "min_param_count": 184474880,
       "sample_models": [
-        "projecte-aina/aguila-7b",
-        "lightonai/alfred-40b-1023",
-        "explosion-testing/refined-web-model-test",
-        "vilm/vulture-40b",
-        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2",
-        "nomic-ai/gpt4all-falcon",
-        "OpenAssistant/falcon-40b-sft-top1-560",
-        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
-        "QuixiAI/WizardLM-Uncensored-Falcon-40b",
-        "mrm8488/falcoder-7b"
+        "dicta-il/BEREL_3.0",
+        "hf-tiny-model-private/tiny-random-BertLMHeadModel"
       ],
-      "relevancy_score": 40.3
+      "relevancy_score": 39.7
     },
     {
       "architecture_id": "Lfm2MoeForCausalLM",
       "total_models": 8,
-      "total_downloads": 175258,
+      "total_downloads": 187579,
       "min_param_count": 8339929856,
       "sample_models": [
         "farbodtavakkoli/OTel-LLM-24B-IT",
@@ -1371,818 +1316,905 @@
         "LiquidAI/LFM2-8B-A1B-ONNX",
         "LiquidAI/LFM2-24B-A2B-ONNX",
         "unsloth/LFM2-8B-A1B",
-        "huihui-ai/Huihui-LFM2-24B-A2B-abliterated",
-        "MuXodious/LFM2-8B-A1B-absolute-heresy-MPOA"
+        "MuXodious/LFM2-8B-A1B-absolute-heresy-MPOA",
+        "huihui-ai/Huihui-LFM2-24B-A2B-abliterated"
       ],
-      "relevancy_score": 40.3
+      "relevancy_score": 39.5
     },
     {
-      "architecture_id": "BertLMHeadModel",
-      "total_models": 2,
-      "total_downloads": 4599,
-      "min_param_count": 184474880,
+      "architecture_id": "RWForCausalLM",
+      "total_models": 11,
+      "total_downloads": 11570,
+      "min_param_count": 6854619456,
       "sample_models": [
-        "dicta-il/BEREL_3.0",
-        "hf-tiny-model-private/tiny-random-BertLMHeadModel"
+        "projecte-aina/aguila-7b",
+        "lightonai/alfred-40b-1023",
+        "explosion-testing/refined-web-model-test",
+        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2",
+        "vilm/vulture-40b",
+        "nomic-ai/gpt4all-falcon",
+        "OpenAssistant/falcon-40b-sft-top1-560",
+        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
+        "QuixiAI/WizardLM-Uncensored-Falcon-40b",
+        "mrm8488/falcoder-7b"
       ],
-      "relevancy_score": 40.2
+      "relevancy_score": 39.3
     },
     {
-      "architecture_id": "Llama4ForConditionalGeneration",
-      "total_models": 1,
-      "total_downloads": 6224,
-      "min_param_count": 6686880,
+      "architecture_id": "GPJTGPT2ModelForCausalLM",
+      "total_models": 4,
+      "total_downloads": 2088,
+      "min_param_count": 175592448,
       "sample_models": [
-        "yujiepan/llama-4-tiny-random"
+        "gpjt/8xa100m40-baseline-3",
+        "gpjt/8xa100m40-baseline-2",
+        "gpjt/8xa100m40-baseline-8",
+        "gpjt/8xa100m40-baseline-7"
       ],
-      "relevancy_score": 40.2
+      "relevancy_score": 39.2
     },
     {
       "architecture_id": "AfmoeForCausalLM",
       "total_models": 6,
-      "total_downloads": 45755,
+      "total_downloads": 44009,
       "min_param_count": 6120003328,
       "sample_models": [
-        "arcee-ai/Trinity-Nano-Preview",
         "arcee-ai/Trinity-Large-Thinking",
+        "arcee-ai/Trinity-Nano-Preview",
         "arcee-ai/Trinity-Mini",
         "arcee-ai/Trinity-Nano-Base",
         "arcee-ai/Trinity-Mini-Base",
         "arcee-ai/Trinity-Large-Preview"
       ],
-      "relevancy_score": 40.0
+      "relevancy_score": 39.1
+    },
+    {
+      "architecture_id": "GPTJXMoEForCausalLM",
+      "total_models": 1,
+      "total_downloads": 4510,
+      "min_param_count": 489915648,
+      "sample_models": [
+        "Aletheia-ng/SabiYarn_MoE_translate"
+      ],
+      "relevancy_score": 39.0
+    },
+    {
+      "architecture_id": "GatedDeltaNetForCausalLM",
+      "total_models": 1,
+      "total_downloads": 4069,
+      "min_param_count": 317524480,
+      "sample_models": [
+        "deqing/gdn-300M-v5-gdn"
+      ],
+      "relevancy_score": 38.8
     },
     {
       "architecture_id": "BitnetForCausalLM",
       "total_models": 2,
-      "total_downloads": 3594,
+      "total_downloads": 2937,
       "min_param_count": 728843904,
       "sample_models": [
         "1bitLLM/bitnet_b1_58-large",
         "1bitLLM/bitnet_b1_58-3B"
       ],
-      "relevancy_score": 39.6
+      "relevancy_score": 38.7
     },
     {
       "architecture_id": "RecurrentGemmaForCausalLM",
       "total_models": 3,
-      "total_downloads": 13166,
+      "total_downloads": 13093,
       "min_param_count": 2682862080,
       "sample_models": [
         "google/recurrentgemma-2b",
         "google/recurrentgemma-2b-it",
         "google/recurrentgemma-9b"
       ],
-      "relevancy_score": 39.2
-    },
-    {
-      "architecture_id": "GatedDeltaNetForCausalLM",
-      "total_models": 1,
-      "total_downloads": 4063,
-      "min_param_count": 317524480,
-      "sample_models": [
-        "deqing/gdn-300M-v5-gdn"
-      ],
-      "relevancy_score": 39.2
+      "relevancy_score": 38.6
     },
     {
       "architecture_id": "RecursiveLanguageModel",
       "total_models": 1,
-      "total_downloads": 3560,
+      "total_downloads": 3401,
       "min_param_count": 198464806,
       "sample_models": [
         "Girinath11/recursive-language-model-198m"
       ],
-      "relevancy_score": 38.9
+      "relevancy_score": 38.4
     },
     {
       "architecture_id": "T5EncoderModel",
       "total_models": 1,
-      "total_downloads": 122326,
+      "total_downloads": 119882,
       "min_param_count": 4762310656,
       "sample_models": [
         "XLabs-AI/xflux_text_encoders"
       ],
-      "relevancy_score": 38.8
-    },
-    {
-      "architecture_id": "AprielForCausalLM",
-      "total_models": 1,
-      "total_downloads": 113509,
-      "min_param_count": 4832071680,
-      "sample_models": [
-        "ServiceNow-AI/Apriel-5B-Instruct"
-      ],
-      "relevancy_score": 38.7
-    },
-    {
-      "architecture_id": "GPTJXMoEForCausalLM",
-      "total_models": 1,
-      "total_downloads": 3210,
-      "min_param_count": 489915648,
-      "sample_models": [
-        "Aletheia-ng/SabiYarn_MoE_translate"
-      ],
-      "relevancy_score": 38.7
+      "relevancy_score": 38.2
     },
     {
       "architecture_id": "LLM",
       "total_models": 1,
-      "total_downloads": 3059,
+      "total_downloads": 3132,
       "min_param_count": 497145984,
       "sample_models": [
         "rudyon/linnet-497M"
       ],
-      "relevancy_score": 38.6
-    },
-    {
-      "architecture_id": "DreamModel",
-      "total_models": 6,
-      "total_downloads": 140463,
-      "min_param_count": 7615616512,
-      "sample_models": [
-        "Dream-org/Dream-v0-Instruct-7B",
-        "Dream-org/Dream-v0-Base-7B",
-        "Dream-org/Dream-Coder-v0-Instruct-7B",
-        "Zigeng/dParallel_Dream_7B_Instruct",
-        "Dream-org/Dream-Coder-v0-Base-7B",
-        "Dream-org/DreamOn-v0-7B"
-      ],
-      "relevancy_score": 38.5
+      "relevancy_score": 38.2
     },
     {
       "architecture_id": "SwarmForCausalLM",
       "total_models": 1,
-      "total_downloads": 2979,
+      "total_downloads": 3059,
       "min_param_count": 52729731,
       "sample_models": [
         "reaperdoesntknow/SAGI"
       ],
-      "relevancy_score": 38.5
+      "relevancy_score": 38.2
     },
     {
-      "architecture_id": "GPJTGPT2ModelForCausalLM",
-      "total_models": 3,
-      "total_downloads": 1570,
-      "min_param_count": 175592448,
+      "architecture_id": "AprielForCausalLM",
+      "total_models": 1,
+      "total_downloads": 113197,
+      "min_param_count": 4832071680,
       "sample_models": [
-        "gpjt/8xa100m40-baseline-3",
-        "gpjt/8xa100m40-baseline-2",
-        "gpjt/8xa100m40-baseline-7"
+        "ServiceNow-AI/Apriel-5B-Instruct"
       ],
-      "relevancy_score": 38.4
+      "relevancy_score": 38.1
     },
     {
       "architecture_id": "SpatialLMQwenForCausalLM",
       "total_models": 1,
-      "total_downloads": 2768,
+      "total_downloads": 2930,
       "min_param_count": 603511168,
       "sample_models": [
         "manycore-research/SpatialLM1.1-Qwen-0.5B"
       ],
-      "relevancy_score": 38.4
+      "relevancy_score": 38.1
     },
     {
       "architecture_id": "MiniMindForCausalLM",
       "total_models": 2,
-      "total_downloads": 2005,
+      "total_downloads": 2151,
       "min_param_count": 38840960,
       "sample_models": [
         "yiwenX/MiniMind-MoE-640-120M",
         "chujiamo/baiheng_0405"
       ],
-      "relevancy_score": 38.3
+      "relevancy_score": 38.0
+    },
+    {
+      "architecture_id": "DreamModel",
+      "total_models": 6,
+      "total_downloads": 153501,
+      "min_param_count": 7615616512,
+      "sample_models": [
+        "Dream-org/Dream-v0-Instruct-7B",
+        "Dream-org/Dream-v0-Base-7B",
+        "Dream-org/Dream-Coder-v0-Instruct-7B",
+        "Zigeng/dParallel_Dream_7B_Instruct",
+        "Dream-org/Dream-Coder-v0-Base-7B",
+        "Dream-org/DreamOn-v0-7B"
+      ],
+      "relevancy_score": 37.9
     },
     {
       "architecture_id": "AV2TextForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2566,
+      "total_downloads": 2689,
       "min_param_count": 480465000,
       "sample_models": [
         "nguyenvulebinh/AV-HuBERT-MuAViC-en"
       ],
-      "relevancy_score": 38.2
+      "relevancy_score": 37.9
     },
     {
       "architecture_id": "BD3LM",
       "total_models": 2,
-      "total_downloads": 1793,
+      "total_downloads": 1953,
       "min_param_count": 169627250,
       "sample_models": [
         "kuleshov-group/bd3lm-owt-block_size4",
         "kuleshov-group/bd3lm-owt-block_size16"
       ],
-      "relevancy_score": 38.1
-    },
-    {
-      "architecture_id": "HCXVisionV2ForCausalLM",
-      "total_models": 2,
-      "total_downloads": 356837,
-      "min_param_count": 10741664520,
-      "sample_models": [
-        "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
-        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
-      ],
-      "relevancy_score": 37.9
+      "relevancy_score": 37.8
     },
     {
-      "architecture_id": "PenguinVLQwen3ForCausalLM",
+      "architecture_id": "ForCausalLM",
       "total_models": 2,
-      "total_downloads": 10102,
-      "min_param_count": 2167941120,
+      "total_downloads": 1725,
+      "min_param_count": 748801603,
       "sample_models": [
-        "tencent/Penguin-VL-8B",
-        "tencent/Penguin-VL-2B"
+        "kyr0/Gemma-4-Waldwicht-Sproessling",
+        "kyr0/Gemma-4-Waldwicht-Winzling"
       ],
-      "relevancy_score": 37.9
+      "relevancy_score": 37.5
     },
     {
       "architecture_id": "BlenderbotForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2226,
+      "total_downloads": 2289,
       "min_param_count": 364810568,
       "sample_models": [
         "thu-coai/blenderbot-400M-esconv"
       ],
-      "relevancy_score": 37.9
+      "relevancy_score": 37.5
     },
     {
       "architecture_id": "Autoencoder",
       "total_models": 1,
-      "total_downloads": 2223,
+      "total_downloads": 2278,
       "min_param_count": 75832064,
       "sample_models": [
         "cccczshao/CALM-Autoencoder"
       ],
-      "relevancy_score": 37.9
+      "relevancy_score": 37.5
     },
     {
       "architecture_id": "EveMoEForCausalLM",
       "total_models": 1,
-      "total_downloads": 2123,
+      "total_downloads": 2174,
       "min_param_count": 271970816,
       "sample_models": [
         "anthonym21/Eve-2-MoE-IT-272M"
       ],
-      "relevancy_score": 37.8
+      "relevancy_score": 37.4
     },
     {
       "architecture_id": "FusionInDecoderForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2105,
+      "total_downloads": 2146,
       "min_param_count": 247577856,
       "sample_models": [
         "Intel/fid_flan_t5_base_nq"
       ],
-      "relevancy_score": 37.8
+      "relevancy_score": 37.4
     },
     {
       "architecture_id": "Plamo3ForCausalLM",
       "total_models": 1,
-      "total_downloads": 12290,
+      "total_downloads": 12925,
       "min_param_count": 2603344384,
       "sample_models": [
         "pfnet/plamo-3-nict-2b-base"
       ],
-      "relevancy_score": 37.7
+      "relevancy_score": 37.3
+    },
+    {
+      "architecture_id": "TransformerForCausalLM",
+      "total_models": 1,
+      "total_downloads": 12840,
+      "min_param_count": 1364297728,
+      "sample_models": [
+        "fla-hub/transformer-1.3B-100B"
+      ],
+      "relevancy_score": 37.3
     },
     {
       "architecture_id": "LIMEForCausalLM",
       "total_models": 1,
-      "total_downloads": 2043,
+      "total_downloads": 2102,
       "min_param_count": 984405504,
       "sample_models": [
         "anarlavrenov/lime-1b-instruct"
       ],
-      "relevancy_score": 37.7
+      "relevancy_score": 37.3
     },
     {
       "architecture_id": "ModernBertForMaskedLM",
       "total_models": 1,
-      "total_downloads": 2007,
+      "total_downloads": 2068,
       "min_param_count": 590367063,
       "sample_models": [
         "JorgeVanco/diffusionGPT"
       ],
-      "relevancy_score": 37.6
+      "relevancy_score": 37.3
+    },
+    {
+      "architecture_id": "HCXVisionV2ForCausalLM",
+      "total_models": 2,
+      "total_downloads": 354662,
+      "min_param_count": 10741664520,
+      "sample_models": [
+        "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
+        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
+      ],
+      "relevancy_score": 37.2
     },
     {
       "architecture_id": "MoEGPTForCausalLM",
       "total_models": 1,
-      "total_downloads": 1960,
+      "total_downloads": 1943,
       "min_param_count": 149603328,
       "sample_models": [
         "arnomatic/german-moe-gpt-v8-pretrained"
       ],
-      "relevancy_score": 37.6
+      "relevancy_score": 37.2
+    },
+    {
+      "architecture_id": "ParamBharatGenForCausalLM",
+      "total_models": 4,
+      "total_downloads": 5035,
+      "min_param_count": 2860664832,
+      "sample_models": [
+        "bharatgenai/Param-1-5B",
+        "bharatgenai/AyurParam",
+        "bharatgenai/Param-1-2.9B-Instruct",
+        "bharatgenai/AgriParam"
+      ],
+      "relevancy_score": 37.1
     },
     {
-      "architecture_id": "TransformerForCausalLM",
+      "architecture_id": "LanceAI",
       "total_models": 1,
-      "total_downloads": 11223,
-      "min_param_count": 1364297728,
+      "total_downloads": 1887,
+      "min_param_count": 137022720,
+      "sample_models": [
+        "NeuraCraft/Lance-AI-V2"
+      ],
+      "relevancy_score": 37.1
+    },
+    {
+      "architecture_id": "RWKV7ForCausalLM",
+      "total_models": 2,
+      "total_downloads": 1332,
+      "min_param_count": 381332480,
       "sample_models": [
-        "fla-hub/transformer-1.3B-100B"
+        "puigde/rwkv7-380M-15B-slimpajama",
+        "fla-hub/rwkv7-1.5B-world"
       ],
-      "relevancy_score": 37.5
+      "relevancy_score": 37.0
     },
     {
       "architecture_id": "LSTMForCausalLM",
       "total_models": 1,
-      "total_downloads": 1768,
+      "total_downloads": 1772,
       "min_param_count": 164921344,
       "sample_models": [
         "deqing/lstm-window-4-v5"
       ],
-      "relevancy_score": 37.4
+      "relevancy_score": 37.0
     },
     {
       "architecture_id": "NanoChatForCausalLM",
       "total_models": 3,
-      "total_downloads": 5777,
+      "total_downloads": 6167,
       "min_param_count": 2217082880,
       "sample_models": [
         "Twobombs/nanochat-d34-sft-hf",
         "pankajmathur/nanochat-d34-sft-hf",
         "Nekochu/nanochat-d24"
       ],
-      "relevancy_score": 37.3
+      "relevancy_score": 36.9
     },
     {
-      "architecture_id": "Qwen3ASRForConditionalGeneration",
+      "architecture_id": "PenguinVLQwen3ForCausalLM",
       "total_models": 2,
-      "total_downloads": 1203,
-      "min_param_count": 782426112,
+      "total_downloads": 7957,
+      "min_param_count": 2167941120,
       "sample_models": [
-        "bezzam/Qwen3-ASR-0.6B",
-        "hypaai/Qwen3-ASR-0.6B_2026-03-22_04-35-10"
+        "tencent/Penguin-VL-8B",
+        "tencent/Penguin-VL-2B"
       ],
-      "relevancy_score": 37.2
+      "relevancy_score": 36.9
     },
     {
       "architecture_id": "Moondream",
       "total_models": 1,
-      "total_downloads": 9755,
+      "total_downloads": 10509,
       "min_param_count": 1857482608,
       "sample_models": [
         "vikhyatk/moondream1"
       ],
-      "relevancy_score": 37.2
+      "relevancy_score": 36.9
     },
     {
-      "architecture_id": "CircuitGPTForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1414,
-      "min_param_count": 419124736,
+      "architecture_id": "GPT3DevLMHeadModel",
+      "total_models": 2,
+      "total_downloads": 1135,
+      "min_param_count": 125226240,
       "sample_models": [
-        "openai/circuit-sparsity"
+        "k050506koch/GPT3-dev-350m-2805",
+        "k050506koch/GPT3-dev-125m-0104"
       ],
-      "relevancy_score": 36.9
+      "relevancy_score": 36.6
     },
     {
-      "architecture_id": "ParamBharatGenForCausalLM",
-      "total_models": 3,
-      "total_downloads": 4535,
-      "min_param_count": 2860673024,
+      "architecture_id": "D3LMForMaskedLM",
+      "total_models": 1,
+      "total_downloads": 1260,
+      "min_param_count": 55905164,
       "sample_models": [
-        "bharatgenai/Param-1-5B",
-        "bharatgenai/AyurParam",
-        "bharatgenai/Param-1-2.9B-Instruct"
+        "Hengchang-Liu/D3LM-from-nt"
       ],
-      "relevancy_score": 36.8
+      "relevancy_score": 36.2
     },
     {
-      "architecture_id": "SliderGPT",
+      "architecture_id": "CircuitGPTForCausalLM",
       "total_models": 1,
-      "total_downloads": 1377,
-      "min_param_count": 47420936,
+      "total_downloads": 1247,
+      "min_param_count": 419124736,
       "sample_models": [
-        "c-bone/CrystaLLM-pi_Mattergen-XRD"
+        "openai/circuit-sparsity"
       ],
-      "relevancy_score": 36.8
+      "relevancy_score": 36.2
     },
     {
-      "architecture_id": "YoutuForCausalLM",
-      "total_models": 2,
-      "total_downloads": 4389,
-      "min_param_count": 1961560064,
+      "architecture_id": "PanguEmbeddedForCausalLM",
+      "total_models": 1,
+      "total_downloads": 7396,
+      "min_param_count": 1391497728,
       "sample_models": [
-        "tencent/Youtu-LLM-2B-Base",
-        "tencent/Youtu-LLM-2B"
+        "FreedomIntelligence/openPangu-Embedded-1B"
       ],
       "relevancy_score": 36.1
     },
     {
-      "architecture_id": "DUO",
+      "architecture_id": "RubiRLM",
       "total_models": 1,
-      "total_downloads": 995,
-      "min_param_count": 169627250,
+      "total_downloads": 1142,
+      "min_param_count": 988446027,
       "sample_models": [
-        "s-sahoo/duo-distilled"
+        "DevHunterAI/RubiRLM-1B-Base"
       ],
-      "relevancy_score": 36.1
+      "relevancy_score": 36.0
     },
     {
       "architecture_id": "Rwkv7ForCausalLM",
       "total_models": 1,
-      "total_downloads": 970,
+      "total_downloads": 1092,
       "min_param_count": 34158592,
       "sample_models": [
         "admijgjtjtjtjjg/dfdfdf"
       ],
-      "relevancy_score": 36.0
+      "relevancy_score": 35.9
     },
     {
-      "architecture_id": "RubiRLM",
+      "architecture_id": "SliderGPT",
       "total_models": 1,
-      "total_downloads": 964,
-      "min_param_count": 988446027,
+      "total_downloads": 1069,
+      "min_param_count": 47420936,
       "sample_models": [
-        "DevHunterAI/RubiRLM-1B-Base"
+        "c-bone/CrystaLLM-pi_Mattergen-XRD"
       ],
-      "relevancy_score": 36.0
+      "relevancy_score": 35.9
     },
     {
-      "architecture_id": "RavenForCausalLM",
+      "architecture_id": "YoutuForCausalLM",
       "total_models": 2,
-      "total_downloads": 4033,
-      "min_param_count": 1385228288,
+      "total_downloads": 4259,
+      "min_param_count": 1961560064,
       "sample_models": [
-        "tomg-group-umd/huginn-0125",
-        "smcleish/Recurrent-Llama-3.2-train-recurrence-32"
+        "tencent/Youtu-LLM-2B-Base",
+        "tencent/Youtu-LLM-2B"
       ],
-      "relevancy_score": 35.9
+      "relevancy_score": 35.5
     },
     {
       "architecture_id": "GTLMForCausalLM",
       "total_models": 2,
-      "total_downloads": 4011,
+      "total_downloads": 4210,
       "min_param_count": 2095989760,
       "sample_models": [
         "Madras1/GTLM-1-2B-A350M",
         "Madras1/GTLM-1-2B-A350M-fp16"
       ],
-      "relevancy_score": 35.9
-    },
-    {
-      "architecture_id": "PanguEmbeddedForCausalLM",
-      "total_models": 1,
-      "total_downloads": 5621,
-      "min_param_count": 1391497728,
-      "sample_models": [
-        "FreedomIntelligence/openPangu-Embedded-1B"
-      ],
-      "relevancy_score": 35.9
+      "relevancy_score": 35.5
     },
     {
       "architecture_id": "SoraForSLM",
       "total_models": 1,
-      "total_downloads": 901,
+      "total_downloads": 915,
       "min_param_count": 450707456,
       "sample_models": [
         "Conlanger-LLM-CLEM/Sorie"
       ],
-      "relevancy_score": 35.9
-    },
-    {
-      "architecture_id": "MoshiForConditionalGeneration",
-      "total_models": 2,
-      "total_downloads": 133468,
-      "min_param_count": 7783880545,
-      "sample_models": [
-        "kmhf/hf-moshiko",
-        "kmhf/hf-moshika"
-      ],
-      "relevancy_score": 35.7
-    },
-    {
-      "architecture_id": "MiMoForCausalLM",
-      "total_models": 2,
-      "total_downloads": 135798,
-      "min_param_count": 7833409536,
-      "sample_models": [
-        "XiaomiMiMo/MiMo-7B-Base",
-        "XiaomiMiMo/MiMo-7B-RL"
-      ],
-      "relevancy_score": 35.7
+      "relevancy_score": 35.5
     },
     {
       "architecture_id": "HGRNBitForCausalLM",
       "total_models": 1,
-      "total_downloads": 854,
+      "total_downloads": 874,
       "min_param_count": 374108160,
       "sample_models": [
         "ridger/MMfreeLM-370M"
       ],
-      "relevancy_score": 35.7
+      "relevancy_score": 35.4
     },
     {
       "architecture_id": "DotLMForCausalLM",
       "total_models": 1,
-      "total_downloads": 823,
+      "total_downloads": 825,
       "min_param_count": 176204544,
       "sample_models": [
         "tensorfiend/DotLM-165M"
       ],
-      "relevancy_score": 35.7
+      "relevancy_score": 35.3
     },
     {
-      "architecture_id": "D3LMForMaskedLM",
-      "total_models": 1,
-      "total_downloads": 790,
-      "min_param_count": 55905164,
+      "architecture_id": "LLaDA2MoeModelLM",
+      "total_models": 6,
+      "total_downloads": 289792,
+      "min_param_count": 16255643392,
       "sample_models": [
-        "Hengchang-Liu/D3LM-from-nt"
+        "inclusionAI/LLaDA2.1-flash",
+        "inclusionAI/LLaDA2.0-mini",
+        "inclusionAI/LLaDA2.1-mini",
+        "inclusionAI/LLaDA2.0-mini-CAP",
+        "inclusionAI/LLaDA2.0-flash",
+        "Zigeng/DMax-Coder-16B"
       ],
-      "relevancy_score": 35.6
+      "relevancy_score": 35.2
     },
     {
-      "architecture_id": "DeltaNetForCausalLM",
+      "architecture_id": "RavenForCausalLM",
+      "total_models": 2,
+      "total_downloads": 3635,
+      "min_param_count": 1385228288,
+      "sample_models": [
+        "tomg-group-umd/huginn-0125",
+        "smcleish/Recurrent-Llama-3.2-train-recurrence-32"
+      ],
+      "relevancy_score": 35.2
+    },
+    {
+      "architecture_id": "DUO",
       "total_models": 1,
-      "total_downloads": 4588,
-      "min_param_count": 1365677056,
+      "total_downloads": 789,
+      "min_param_count": 169627250,
       "sample_models": [
-        "fla-hub/delta_net-1.3B-100B"
+        "s-sahoo/duo-distilled"
       ],
-      "relevancy_score": 35.5
+      "relevancy_score": 35.2
     },
     {
-      "architecture_id": "VaultGemmaForCausalLM",
+      "architecture_id": "DeltaNetForCausalLM",
       "total_models": 1,
-      "total_downloads": 4209,
-      "min_param_count": 1038741120,
+      "total_downloads": 4682,
+      "min_param_count": 1365677056,
       "sample_models": [
-        "google/vaultgemma-1b"
+        "fla-hub/delta_net-1.3B-100B"
       ],
-      "relevancy_score": 35.3
+      "relevancy_score": 35.1
     },
     {
-      "architecture_id": "Rwkv5ForCausalLM",
+      "architecture_id": "MoshiForConditionalGeneration",
       "total_models": 2,
-      "total_downloads": 2960,
-      "min_param_count": 1577754624,
+      "total_downloads": 128966,
+      "min_param_count": 7783880545,
       "sample_models": [
-        "RWKV/rwkv-5-world-3b",
-        "RWKV/rwkv-5-world-1b5"
+        "kmhf/hf-moshiko",
+        "kmhf/hf-moshika"
       ],
-      "relevancy_score": 35.2
+      "relevancy_score": 35.0
     },
     {
-      "architecture_id": "LLaDA2MoeModelLM",
-      "total_models": 5,
-      "total_downloads": 245306,
-      "min_param_count": 16255643392,
+      "architecture_id": "MiMoForCausalLM",
+      "total_models": 2,
+      "total_downloads": 132837,
+      "min_param_count": 7833409536,
       "sample_models": [
-        "inclusionAI/LLaDA2.1-flash",
-        "inclusionAI/LLaDA2.0-mini",
-        "inclusionAI/LLaDA2.1-mini",
-        "inclusionAI/LLaDA2.0-mini-CAP",
-        "inclusionAI/LLaDA2.0-flash"
+        "XiaomiMiMo/MiMo-7B-Base",
+        "XiaomiMiMo/MiMo-7B-RL"
       ],
-      "relevancy_score": 35.1
+      "relevancy_score": 35.0
     },
     {
-      "architecture_id": "RWKV7ForCausalLM",
-      "total_models": 3,
-      "total_downloads": 2153,
-      "min_param_count": 1527404544,
+      "architecture_id": "Rwkv5ForCausalLM",
+      "total_models": 2,
+      "total_downloads": 3092,
+      "min_param_count": 1577754624,
       "sample_models": [
-        "RWKV/RWKV7-Goose-World3-1.5B-HF",
-        "fla-hub/rwkv7-1.5B-world",
-        "RWKV/RWKV7-Goose-World3-2.9B-HF"
+        "RWKV/rwkv-5-world-3b",
+        "RWKV/rwkv-5-world-1b5"
       ],
-      "relevancy_score": 35.1
+      "relevancy_score": 34.8
     },
     {
-      "architecture_id": "MegaForCausalLM",
+      "architecture_id": "VaultGemmaForCausalLM",
       "total_models": 1,
-      "total_downloads": 613,
-      "min_param_count": 126132108,
+      "total_downloads": 4072,
+      "min_param_count": 1038741120,
       "sample_models": [
-        "BEE-spoke-data/mega-ar-126m-4k"
+        "google/vaultgemma-1b"
       ],
-      "relevancy_score": 35.0
+      "relevancy_score": 34.8
     },
     {
       "architecture_id": "WordLatentTransformerForCausalLM",
       "total_models": 1,
-      "total_downloads": 583,
+      "total_downloads": 654,
       "min_param_count": 6861056,
       "sample_models": [
         "sign/WeLT-string-repetition"
       ],
-      "relevancy_score": 34.9
+      "relevancy_score": 34.8
+    },
+    {
+      "architecture_id": "LilleForCausalLM",
+      "total_models": 1,
+      "total_downloads": 604,
+      "min_param_count": 127236768,
+      "sample_models": [
+        "Nikity/lille-130m-instruct"
+      ],
+      "relevancy_score": 34.6
     },
     {
       "architecture_id": "KimiK2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 583,
+      "total_downloads": 586,
       "min_param_count": 170595012,
       "sample_models": [
         "hyper-accel/tiny-random-kimi-k2"
       ],
-      "relevancy_score": 34.9
+      "relevancy_score": 34.6
     },
     {
-      "architecture_id": "LilleForCausalLM",
-      "total_models": 1,
-      "total_downloads": 565,
-      "min_param_count": 127236768,
+      "architecture_id": "RuGPT3XLForCausalLM",
+      "total_models": 2,
+      "total_downloads": 2650,
+      "min_param_count": 1431261184,
       "sample_models": [
-        "Nikity/lille-130m-instruct"
+        "evilfreelancer/ruGPT3XL",
+        "evilfreelancer/ruGPT3XL-8k"
       ],
-      "relevancy_score": 34.8
+      "relevancy_score": 34.5
     },
     {
       "architecture_id": "GPT2CompetitiveMoE",
       "total_models": 1,
-      "total_downloads": 527,
+      "total_downloads": 528,
       "min_param_count": 497796864,
       "sample_models": [
         "Fu01978/gpt2-4x124M-competitive-moe"
       ],
-      "relevancy_score": 34.7
+      "relevancy_score": 34.3
+    },
+    {
+      "architecture_id": "MegaForCausalLM",
+      "total_models": 1,
+      "total_downloads": 522,
+      "min_param_count": 126132108,
+      "sample_models": [
+        "BEE-spoke-data/mega-ar-126m-4k"
+      ],
+      "relevancy_score": 34.3
+    },
+    {
+      "architecture_id": "Qwen3ASRForConditionalGeneration",
+      "total_models": 1,
+      "total_downloads": 516,
+      "min_param_count": 782426112,
+      "sample_models": [
+        "hypaai/Qwen3-ASR-0.6B_2026-03-22_04-35-10"
+      ],
+      "relevancy_score": 34.3
     },
     {
       "architecture_id": "BolmoForCausalLM",
       "total_models": 2,
-      "total_downloads": 2032,
+      "total_downloads": 2003,
       "min_param_count": 1468911776,
       "sample_models": [
         "allenai/Bolmo-1B",
         "allenai/Bolmo-7B"
       ],
-      "relevancy_score": 34.3
+      "relevancy_score": 33.9
     },
     {
-      "architecture_id": "XCurOSForCausalLM",
+      "architecture_id": "MoELLaVAQwen2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 93590,
-      "min_param_count": 7615616512,
+      "total_downloads": 2702,
+      "min_param_count": 1406119552,
       "sample_models": [
-        "XCurOS/XCurOS-0.1-8B-Instruct"
+        "KKHYA/llavaqwen2.5-0.5b-finetune-moe-4e-2k_20260331_194516"
       ],
-      "relevancy_score": 34.2
+      "relevancy_score": 33.9
     },
     {
-      "architecture_id": "MoELLaVAQwen2ForCausalLM",
+      "architecture_id": "GiddForDiffusionLM",
+      "total_models": 2,
+      "total_downloads": 1803,
+      "min_param_count": 2844349440,
+      "sample_models": [
+        "dvruette/gidd-unif-3b",
+        "dvruette/gidd-mask-3b"
+      ],
+      "relevancy_score": 33.6
+    },
+    {
+      "architecture_id": "TarsierForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2618,
-      "min_param_count": 1406119552,
+      "total_downloads": 86089,
+      "min_param_count": 7063427072,
       "sample_models": [
-        "KKHYA/llavaqwen2.5-0.5b-finetune-moe-4e-2k_20260331_194516"
+        "omni-research/Tarsier-7b"
       ],
-      "relevancy_score": 34.2
+      "relevancy_score": 33.5
     },
     {
       "architecture_id": "OlmoHybridForCausalLM",
       "total_models": 4,
-      "total_downloads": 35834,
+      "total_downloads": 35503,
       "min_param_count": 7430870688,
       "sample_models": [
-        "allenai/Olmo-Hybrid-7B",
-        "allenai/Olmo-Hybrid-Instruct-DPO-7B",
-        "allenai/Olmo-Hybrid-Instruct-SFT-7B",
-        "allenai/Olmo-Hybrid-Think-SFT-7B"
+        "allenai/Olmo-Hybrid-7B",
+        "allenai/Olmo-Hybrid-Instruct-DPO-7B",
+        "allenai/Olmo-Hybrid-Instruct-SFT-7B",
+        "allenai/Olmo-Hybrid-Think-SFT-7B"
+      ],
+      "relevancy_score": 33.4
+    },
+    {
+      "architecture_id": "ArgonneModel",
+      "total_models": 2,
+      "total_downloads": 1627,
+      "min_param_count": 1273807360,
+      "sample_models": [
+        "PursuitOfDataScience/Argonne2.5-instruct",
+        "PursuitOfDataScience/Argonne2.5-base"
       ],
-      "relevancy_score": 34.1
+      "relevancy_score": 33.4
     },
     {
-      "architecture_id": "TarsierForConditionalGeneration",
+      "architecture_id": "MobileLLMP1ForCausalLM",
       "total_models": 1,
-      "total_downloads": 87008,
-      "min_param_count": 7063427072,
+      "total_downloads": 2086,
+      "min_param_count": 1084453120,
       "sample_models": [
-        "omni-research/Tarsier-7b"
+        "facebook/MobileLLM-Pro-base"
       ],
-      "relevancy_score": 34.1
+      "relevancy_score": 33.3
     },
     {
       "architecture_id": "HybridQwen3ForCausalLM",
       "total_models": 9,
-      "total_downloads": 7400,
+      "total_downloads": 7439,
       "min_param_count": 8495712960,
       "sample_models": [
         "amazon/GKA-primed-HQwen3-8B-Instruct",
         "amazon/Mamba2-primed-HQwen3-8B-Instruct",
-        "amazon/GDN-primed-HQwen3-8B-Instruct",
         "amazon/GDN-primed-HQwen3-32B-Instruct",
+        "amazon/GDN-primed-HQwen3-8B-Instruct",
         "amazon/GKA-primed-HQwen3-32B-Instruct",
         "amazon/BMOJOF-primed-HQwen3-8B-Instruct",
         "amazon/GKA-primed-HQwen3-8B-Reasoner",
         "amazon/GDN-primed-HQwen3-8B-Reasoner",
         "amazon/GKA-primed-HQwen3-32B-Reasoner"
       ],
-      "relevancy_score": 33.9
+      "relevancy_score": 33.1
+    },
+    {
+      "architecture_id": "JetNemotronForCausalLM",
+      "total_models": 2,
+      "total_downloads": 8242,
+      "min_param_count": 3960424768,
+      "sample_models": [
+        "jet-ai/Jet-Nemotron-2B",
+        "jet-ai/Jet-Nemotron-4B"
+      ],
+      "relevancy_score": 33.0
     },
     {
       "architecture_id": "Rwkv6ForCausalLM",
       "total_models": 8,
-      "total_downloads": 8437,
+      "total_downloads": 8905,
       "min_param_count": 7635746816,
       "sample_models": [
-        "RWKV/v6-Finch-1B6-HF",
         "RWKV/v6-Finch-7B-HF",
+        "RWKV/v6-Finch-1B6-HF",
+        "RWKV/rwkv-6-world-3b",
         "RWKV/rwkv-6-world-1b6",
         "RWKV/rwkv-6-world-7b",
         "RWKV/v6-Finch-14B-HF",
         "RWKV/v6-Finch-3B-HF",
-        "RWKV/rwkv-6-world-3b-v2.1",
-        "RWKV/rwkv-6-world-3b"
+        "RWKV/rwkv-6-world-3b-v2.1"
       ],
-      "relevancy_score": 33.6
+      "relevancy_score": 32.9
+    },
+    {
+      "architecture_id": "XCurOSForCausalLM",
+      "total_models": 1,
+      "total_downloads": 66986,
+      "min_param_count": 7615616512,
+      "sample_models": [
+        "XCurOS/XCurOS-0.1-8B-Instruct"
+      ],
+      "relevancy_score": 32.9
+    },
+    {
+      "architecture_id": "SongGenMixedForConditionalGeneration",
+      "total_models": 1,
+      "total_downloads": 1723,
+      "min_param_count": 1363657956,
+      "sample_models": [
+        "LiuZH-19/SongGen_mixed_pro"
+      ],
+      "relevancy_score": 32.9
     },
     {
       "architecture_id": "JAISLMHeadModel",
       "total_models": 6,
-      "total_downloads": 15551,
+      "total_downloads": 15081,
       "min_param_count": 7142689824,
       "sample_models": [
         "inceptionai/jais-13b-chat",
         "katuni4ka/tiny-random-jais",
         "inceptionai/jais-family-30b-8k",
-        "inceptionai/jais-13b",
         "inceptionai/jais-family-13b-chat",
+        "inceptionai/jais-13b",
         "inceptionai/jais-family-6p7b-chat"
       ],
-      "relevancy_score": 33.6
+      "relevancy_score": 32.8
     },
     {
-      "architecture_id": "SongGenMixedForConditionalGeneration",
+      "architecture_id": "OmniASRForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 1843,
-      "min_param_count": 1363657956,
-      "sample_models": [
-        "LiuZH-19/SongGen_mixed_pro"
-      ],
-      "relevancy_score": 33.5
-    },
-    {
-      "architecture_id": "ArgonneModel",
-      "total_models": 2,
-      "total_downloads": 1321,
-      "min_param_count": 1273807360,
+      "total_downloads": 1633,
+      "min_param_count": 1631506944,
       "sample_models": [
-        "PursuitOfDataScience/Argonne2.5-base",
-        "PursuitOfDataScience/Argonne2.5-instruct"
+        "bezzam/omniasr-llm-300m-v2"
       ],
-      "relevancy_score": 33.4
+      "relevancy_score": 32.8
     },
     {
-      "architecture_id": "JetNemotronForCausalLM",
+      "architecture_id": "OpensciForCausalLM",
       "total_models": 2,
-      "total_downloads": 7758,
-      "min_param_count": 3960424768,
+      "total_downloads": 1031,
+      "min_param_count": 1714377728,
       "sample_models": [
-        "jet-ai/Jet-Nemotron-2B",
-        "jet-ai/Jet-Nemotron-4B"
+        "ali-elganzory/1.7b-Comma0.1-300BT-longsft_16k-DPO-Tulu3-decontaminated",
+        "ali-elganzory/open-sci-ref-v0.02-1.7b-fineweb-edu-1.4t-300B-4096-longsft_16k-DPO-Tulu3-decontaminated"
       ],
-      "relevancy_score": 33.3
+      "relevancy_score": 32.4
     },
     {
-      "architecture_id": "MobileLLMP1ForCausalLM",
+      "architecture_id": "Kanana2VecModel",
       "total_models": 1,
-      "total_downloads": 1750,
-      "min_param_count": 1084453120,
+      "total_downloads": 1330,
+      "min_param_count": 2086979328,
       "sample_models": [
-        "facebook/MobileLLM-Pro-base"
+        "kakaocorp/kanana-nano-2.1b-embedding"
       ],
-      "relevancy_score": 33.3
+      "relevancy_score": 32.3
     },
     {
-      "architecture_id": "OmniASRForConditionalGeneration",
+      "architecture_id": "DeciCoderForCausalLM",
       "total_models": 1,
-      "total_downloads": 1628,
-      "min_param_count": 1631506944,
+      "total_downloads": 1212,
+      "min_param_count": 1113671680,
       "sample_models": [
-        "bezzam/omniasr-llm-300m-v2"
+        "Deci/DeciCoder-1b"
       ],
-      "relevancy_score": 33.2
+      "relevancy_score": 32.1
     },
     {
-      "architecture_id": "Kanana2VecModel",
+      "architecture_id": "StableLMAlphaForCausalLM",
       "total_models": 1,
-      "total_downloads": 1350,
-      "min_param_count": 2086979328,
+      "total_downloads": 7022,
+      "min_param_count": 6889414656,
       "sample_models": [
-        "kakaocorp/kanana-nano-2.1b-embedding"
+        "stabilityai/stablelm-base-alpha-7b-v2"
       ],
-      "relevancy_score": 32.8
+      "relevancy_score": 32.0
     },
     {
-      "architecture_id": "GiddForDiffusionLM",
-      "total_models": 1,
-      "total_downloads": 1287,
-      "min_param_count": 2957629440,
+      "architecture_id": "IQuestCoderForCausalLM",
+      "total_models": 4,
+      "total_downloads": 17001,
+      "min_param_count": 7612810240,
       "sample_models": [
-        "dvruette/gidd-unif-3b"
+        "IQuestLab/IQuest-Coder-V1-40B-Instruct",
+        "IQuestLab/IQuest-Coder-V1-7B-Instruct",
+        "Multilingual-Multimodal-NLP/IndustrialCoder",
+        "IQuestLab/IQuest-Coder-V1-40B-Thinking"
       ],
-      "relevancy_score": 32.7
+      "relevancy_score": 31.8
     },
     {
       "architecture_id": "XLNetLMHeadModel",
       "total_models": 5,
-      "total_downloads": 479033,
+      "total_downloads": 433085,
       "min_param_count": null,
       "sample_models": [
         "xlnet/xlnet-base-cased",
@@ -2191,86 +2223,89 @@
         "sshleifer/tiny-xlnet-base-cased",
         "textattack/xlnet-base-cased-imdb"
       ],
-      "relevancy_score": 32.6
+      "relevancy_score": 31.5
     },
     {
       "architecture_id": "AeroForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 1269,
+      "total_downloads": 902,
       "min_param_count": 2416221184,
       "sample_models": [
         "lmms-lab/Aero-1-Audio"
       ],
-      "relevancy_score": 32.6
-    },
-    {
-      "architecture_id": "IQuestCoderForCausalLM",
-      "total_models": 4,
-      "total_downloads": 17567,
-      "min_param_count": 7612810240,
-      "sample_models": [
-        "IQuestLab/IQuest-Coder-V1-40B-Instruct",
-        "IQuestLab/IQuest-Coder-V1-7B-Instruct",
-        "Multilingual-Multimodal-NLP/IndustrialCoder",
-        "IQuestLab/IQuest-Coder-V1-40B-Thinking"
-      ],
-      "relevancy_score": 32.5
+      "relevancy_score": 31.5
     },
     {
-      "architecture_id": "StableLMAlphaForCausalLM",
+      "architecture_id": "Qwen3VLForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 7233,
-      "min_param_count": 6889414656,
+      "total_downloads": 863,
+      "min_param_count": 2127532032,
       "sample_models": [
-        "stabilityai/stablelm-base-alpha-7b-v2"
+        "Oysiyl/qwen3-vl-2b-unslop-good-lora-v1"
       ],
-      "relevancy_score": 32.5
+      "relevancy_score": 31.4
     },
     {
-      "architecture_id": "DeciCoderForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1190,
-      "min_param_count": 1113671680,
+      "architecture_id": "Qwen3MoeForCausalLM",
+      "total_models": 7,
+      "total_downloads": 5780,
+      "min_param_count": 8001454080,
       "sample_models": [
-        "Deci/DeciCoder-1b"
+        "AIDC-AI/Marco-Nano-Instruct",
+        "zianglih/Qwen3-30B-A3B-Instruct-2507-MXFP8-last-8-BF16",
+        "AIDC-AI/Marco-Mini-Instruct",
+        "Dynamical-Systems/Dynamical-30B-A3B",
+        "bineric/lynx-instruct-30b",
+        "OpenMOSS-Team/SciJudge-30B",
+        "unsloth/Qwen3-30B-A3B-Thinking-2507"
       ],
-      "relevancy_score": 32.5
+      "relevancy_score": 31.3
     },
     {
       "architecture_id": "GritLM",
       "total_models": 1,
-      "total_downloads": 30472,
+      "total_downloads": 31461,
       "min_param_count": 7241732096,
       "sample_models": [
         "parasail-ai/GritLM-7B-vllm"
       ],
-      "relevancy_score": 31.7
+      "relevancy_score": 31.3
     },
     {
       "architecture_id": "AXK1ForCausalLM",
       "total_models": 2,
-      "total_downloads": 19527,
+      "total_downloads": 19319,
       "min_param_count": 11448603648,
       "sample_models": [
         "skt/A.X-K1",
         "thkim93/axk1-2layers"
       ],
-      "relevancy_score": 31.4
+      "relevancy_score": 30.8
     },
     {
-      "architecture_id": "Lfm2Prototype1ForCausalLM",
+      "architecture_id": "VeridianForCausalLM",
       "total_models": 1,
-      "total_downloads": 735,
-      "min_param_count": 1212304128,
+      "total_downloads": 662,
+      "min_param_count": 1659913728,
       "sample_models": [
-        "nntsuzu/LFM2-SFT-Prototype01-1.2B-JP"
+        "MagistrTheOne/veridian-beta"
       ],
-      "relevancy_score": 31.4
+      "relevancy_score": 30.8
+    },
+    {
+      "architecture_id": "HymbaForCausalLM",
+      "total_models": 1,
+      "total_downloads": 645,
+      "min_param_count": 1522797824,
+      "sample_models": [
+        "nvidia/Hymba-1.5B-Instruct"
+      ],
+      "relevancy_score": 30.8
     },
     {
       "architecture_id": "IdeficsForVisionText2Text",
       "total_models": 4,
-      "total_downloads": 10302,
+      "total_downloads": 10405,
       "min_param_count": 8929682192,
       "sample_models": [
         "HuggingFaceM4/idefics-80b-instruct",
@@ -2278,140 +2313,140 @@
         "HuggingFaceM4/idefics-9b-instruct",
         "HuggingFaceM4/idefics-80b"
       ],
-      "relevancy_score": 31.3
-    },
-    {
-      "architecture_id": "InternVLChatModel",
-      "total_models": 1,
-      "total_downloads": 4299,
-      "min_param_count": 3712637952,
-      "sample_models": [
-        "numind/NuExtract-2-4B-experimental"
-      ],
-      "relevancy_score": 31.3
+      "relevancy_score": 30.7
     },
     {
-      "architecture_id": "CambrianQwenForCausalLM",
+      "architecture_id": "Lfm2Prototype1ForCausalLM",
       "total_models": 1,
-      "total_downloads": 4196,
-      "min_param_count": 3986951616,
+      "total_downloads": 634,
+      "min_param_count": 1212304128,
       "sample_models": [
-        "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B"
+        "nntsuzu/LFM2-SFT-Prototype01-1.2B-JP"
       ],
-      "relevancy_score": 31.3
+      "relevancy_score": 30.7
     },
     {
-      "architecture_id": "CheXagentForCausalLM",
-      "total_models": 1,
-      "total_downloads": 4149,
-      "min_param_count": 3140746752,
+      "architecture_id": "ChatGLMModel",
+      "total_models": 2,
+      "total_downloads": 17564,
+      "min_param_count": 9399951392,
       "sample_models": [
-        "StanfordAIMI/CheXagent-2-3b"
+        "zai-org/codegeex4-all-9b",
+        "zai-org/glm-4-9b"
       ],
-      "relevancy_score": 31.3
+      "relevancy_score": 30.6
     },
     {
       "architecture_id": "PersimmonForCausalLM",
       "total_models": 3,
-      "total_downloads": 12967,
+      "total_downloads": 12300,
       "min_param_count": 8823735296,
       "sample_models": [
         "adept/persimmon-8b-chat",
         "adept/persimmon-8b-base",
         "pszemraj/perSLIMmon-8b-base"
       ],
-      "relevancy_score": 31.2
+      "relevancy_score": 30.5
     },
     {
-      "architecture_id": "Phi3SmallForCausalLM",
-      "total_models": 2,
-      "total_downloads": 17559,
-      "min_param_count": 7392272384,
+      "architecture_id": "FlexOlmoForCausalLM",
+      "total_models": 3,
+      "total_downloads": 12280,
+      "min_param_count": 11627401216,
       "sample_models": [
-        "microsoft/Phi-3-small-8k-instruct",
-        "microsoft/Phi-3-small-128k-instruct"
+        "allenai/Flex-reddit-2x7B-1T",
+        "allenai/FlexOlmo-7x7B-1T-RT",
+        "shanearora/Flex-reddit-2x7B-1T"
       ],
-      "relevancy_score": 31.2
+      "relevancy_score": 30.5
     },
     {
-      "architecture_id": "HymbaForCausalLM",
+      "architecture_id": "TinyChartPhiForCausalLM",
       "total_models": 1,
-      "total_downloads": 667,
-      "min_param_count": 1522797824,
+      "total_downloads": 3551,
+      "min_param_count": 3189407648,
       "sample_models": [
-        "nvidia/Hymba-1.5B-Instruct"
+        "mPLUG/TinyChart-3B-768"
       ],
-      "relevancy_score": 31.2
+      "relevancy_score": 30.5
     },
     {
-      "architecture_id": "FlexOlmoForCausalLM",
-      "total_models": 3,
-      "total_downloads": 12568,
-      "min_param_count": 11627401216,
+      "architecture_id": "MixFormerSequentialForCausalLM",
+      "total_models": 1,
+      "total_downloads": 562,
+      "min_param_count": 2779683840,
       "sample_models": [
-        "allenai/Flex-reddit-2x7B-1T",
-        "allenai/FlexOlmo-7x7B-1T-RT",
-        "shanearora/Flex-reddit-2x7B-1T"
+        "SkunkworksAI/phi-2"
       ],
-      "relevancy_score": 31.1
+      "relevancy_score": 30.5
     },
     {
-      "architecture_id": "ChatGLMModel",
+      "architecture_id": "Phi3SmallForCausalLM",
       "total_models": 2,
-      "total_downloads": 17362,
-      "min_param_count": 9399951392,
+      "total_downloads": 15892,
+      "min_param_count": 7392272384,
       "sample_models": [
-        "zai-org/codegeex4-all-9b",
-        "zai-org/glm-4-9b"
+        "microsoft/Phi-3-small-8k-instruct",
+        "microsoft/Phi-3-small-128k-instruct"
       ],
-      "relevancy_score": 31.1
+      "relevancy_score": 30.4
     },
     {
       "architecture_id": "SpatialLMLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 578,
-      "min_param_count": 1345883776,
+      "total_downloads": 515,
+      "min_param_count": 1247355840,
       "sample_models": [
-        "manycore-research/SpatialLM1.1-Llama-1B"
+        "manycore-research/SpatialLM-Llama-1B"
       ],
-      "relevancy_score": 30.9
+      "relevancy_score": 30.3
     },
     {
-      "architecture_id": "MixFormerSequentialForCausalLM",
+      "architecture_id": "CambrianQwenForCausalLM",
       "total_models": 1,
-      "total_downloads": 561,
-      "min_param_count": 2779683840,
+      "total_downloads": 2935,
+      "min_param_count": 3986951616,
       "sample_models": [
-        "SkunkworksAI/phi-2"
+        "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B"
       ],
-      "relevancy_score": 30.8
+      "relevancy_score": 30.1
     },
     {
       "architecture_id": "StripedHyenaModelForCausalLM",
       "total_models": 3,
-      "total_downloads": 10515,
+      "total_downloads": 9917,
       "min_param_count": 7646024704,
       "sample_models": [
         "togethercomputer/evo-1-131k-base",
         "togethercomputer/evo-1-8k-base",
         "togethercomputer/StripedHyena-Nous-7B"
       ],
-      "relevancy_score": 30.7
+      "relevancy_score": 30.0
     },
     {
       "architecture_id": "Maira2ForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2914,
+      "total_downloads": 2692,
       "min_param_count": 6880185600,
       "sample_models": [
         "microsoft/maira-2"
       ],
-      "relevancy_score": 30.5
+      "relevancy_score": 29.9
+    },
+    {
+      "architecture_id": "CheXagentForCausalLM",
+      "total_models": 1,
+      "total_downloads": 2648,
+      "min_param_count": 3140746752,
+      "sample_models": [
+        "StanfordAIMI/CheXagent-2-3b"
+      ],
+      "relevancy_score": 29.9
     },
     {
       "architecture_id": "BioGptForCausalLM",
       "total_models": 5,
-      "total_downloads": 179801,
+      "total_downloads": 174528,
       "min_param_count": null,
       "sample_models": [
         "microsoft/biogpt",
@@ -2420,194 +2455,211 @@
         "hf-tiny-model-private/tiny-random-BioGptForCausalLM",
         "zequnl/molxpt"
       ],
-      "relevancy_score": 30.4
+      "relevancy_score": 29.5
     },
     {
-      "architecture_id": "Ernie4_5_MoeForCausalLM",
-      "total_models": 4,
-      "total_downloads": 39215,
-      "min_param_count": 21825437888,
+      "architecture_id": "InternVLChatModel",
+      "total_models": 1,
+      "total_downloads": 2229,
+      "min_param_count": 3712637952,
       "sample_models": [
-        "baidu/ERNIE-4.5-21B-A3B-PT",
-        "baidu/ERNIE-4.5-21B-A3B-Base-PT",
-        "baidu/ERNIE-4.5-21B-A3B-Thinking",
-        "baidu/ERNIE-4.5-300B-A47B-PT"
+        "numind/NuExtract-2-4B-experimental"
       ],
-      "relevancy_score": 30.3
+      "relevancy_score": 29.5
     },
     {
-      "architecture_id": "BailingMoeV2ForCausalLM",
-      "total_models": 5,
-      "total_downloads": 20571,
-      "min_param_count": 16255643392,
+      "architecture_id": "MatriochkaForCausalLM",
+      "total_models": 1,
+      "total_downloads": 2159,
+      "min_param_count": 3358735360,
       "sample_models": [
-        "inclusionAI/Ling-mini-2.0",
-        "inclusionAI/Ling-1T",
-        "inclusionAI/Ring-mini-2.0",
-        "inclusionAI/Ling-flash-2.0",
-        "inclusionAI/Ling-flash-base-2.0"
+        "nthngdy/matryoshka-3B"
+      ],
+      "relevancy_score": 29.4
+    },
+    {
+      "architecture_id": "SolarOpenForCausalLM",
+      "total_models": 2,
+      "total_downloads": 343068,
+      "min_param_count": null,
+      "sample_models": [
+        "upstage/Solar-Open-100B",
+        "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4"
       ],
-      "relevancy_score": 29.5
+      "relevancy_score": 29.1
     },
     {
       "architecture_id": "LatentMoELLaVAPhiForCausalLM",
       "total_models": 1,
-      "total_downloads": 1728,
+      "total_downloads": 1792,
       "min_param_count": 3093139456,
       "sample_models": [
         "KKHYA/llavaphi2-2.7b-finetune-latent-sparse-moe-4e-2k-freeze-1.0_20260304_075653"
       ],
-      "relevancy_score": 29.3
+      "relevancy_score": 29.0
     },
     {
-      "architecture_id": "SolarOpenForCausalLM",
-      "total_models": 2,
-      "total_downloads": 264805,
-      "min_param_count": null,
+      "architecture_id": "LlamaForCasualLM",
+      "total_models": 1,
+      "total_downloads": 1613,
+      "min_param_count": 3212749824,
       "sample_models": [
-        "upstage/Solar-Open-100B",
-        "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4"
+        "CoRover/BharatGPT-3B-Indic"
       ],
-      "relevancy_score": 29.2
+      "relevancy_score": 28.8
     },
     {
       "architecture_id": "Qwen2ForSequenceClassification",
       "total_models": 2,
-      "total_downloads": 7086,
+      "total_downloads": 7132,
       "min_param_count": 7070622720,
       "sample_models": [
         "nvidia/AceMath-7B-RM",
         "nvidia/Qwen2.5-CascadeRL-RM-72B"
       ],
-      "relevancy_score": 29.1
-    },
-    {
-      "architecture_id": "MatriochkaForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1562,
-      "min_param_count": 3358735360,
-      "sample_models": [
-        "nthngdy/matryoshka-3B"
-      ],
-      "relevancy_score": 29.1
+      "relevancy_score": 28.6
     },
     {
       "architecture_id": "DeepseekForCausalLM",
       "total_models": 2,
-      "total_downloads": 40954,
+      "total_downloads": 41625,
       "min_param_count": 16375728128,
       "sample_models": [
         "deepseek-ai/deepseek-moe-16b-base",
         "deepseek-ai/deepseek-moe-16b-chat"
       ],
-      "relevancy_score": 29.0
+      "relevancy_score": 28.5
+    },
+    {
+      "architecture_id": "MobilintLlamaForCausalLM",
+      "total_models": 9,
+      "total_downloads": 31316,
+      "min_param_count": null,
+      "sample_models": [
+        "mobilint/Llama-3.2-1B-Instruct",
+        "mobilint/Llama-3.2-3B-Instruct",
+        "mobilint/Llama-3.1-8B-Instruct",
+        "mobilint/HyperCLOVAX-SEED-Text-Instruct-1.5B",
+        "mobilint/Llama-3.2-1B-Instruct-Batch16",
+        "mobilint/Llama-3.1-8B-Instruct-Batch16",
+        "mobilint/Llama-3.2-3B-Instruct-Batch16",
+        "mobilint/Llama-3.2-3B-Instruct-Batch32",
+        "mobilint/Llama-3.1-8B-Instruct-Batch32"
+      ],
+      "relevancy_score": 28.3
     },
     {
       "architecture_id": "Jais2ForCausalLM",
       "total_models": 2,
-      "total_downloads": 6256,
+      "total_downloads": 6100,
       "min_param_count": 8090401280,
       "sample_models": [
         "inceptionai/Jais-2-8B-Chat",
         "inceptionai/Jais-2-70B-Chat"
       ],
-      "relevancy_score": 28.9
+      "relevancy_score": 28.3
     },
     {
-      "architecture_id": "ChatGLMForConditionalGeneration",
-      "total_models": 2,
-      "total_downloads": 4858,
-      "min_param_count": 9399951392,
+      "architecture_id": "BailingMoeV2ForCausalLM",
+      "total_models": 4,
+      "total_downloads": 19076,
+      "min_param_count": 16255643392,
       "sample_models": [
-        "IAAR-Shanghai/xVerify-9B-C",
-        "qiuhuachuan/MeChat"
+        "inclusionAI/Ling-mini-2.0",
+        "inclusionAI/Ling-1T",
+        "inclusionAI/Ling-flash-2.0",
+        "inclusionAI/Ling-flash-base-2.0"
       ],
-      "relevancy_score": 28.3
+      "relevancy_score": 28.0
     },
     {
-      "architecture_id": "ReformerModelWithLMHead",
+      "architecture_id": "ChatGLMForConditionalGeneration",
       "total_models": 2,
-      "total_downloads": 159282,
-      "min_param_count": null,
+      "total_downloads": 4797,
+      "min_param_count": 9399951392,
       "sample_models": [
-        "google/reformer-crime-and-punishment",
-        "google/reformer-enwik8"
+        "IAAR-Shanghai/xVerify-9B-C",
+        "qiuhuachuan/MeChat"
       ],
-      "relevancy_score": 28.1
+      "relevancy_score": 27.8
     },
     {
       "architecture_id": "LamedPhi3ForCausalLM",
       "total_models": 1,
-      "total_downloads": 985,
+      "total_downloads": 975,
       "min_param_count": 4049101904,
       "sample_models": [
         "GoodBaiBai88/M3D-LaMed-Phi-3-4B"
       ],
-      "relevancy_score": 28.1
+      "relevancy_score": 27.7
     },
     {
-      "architecture_id": "SarvamMLAForCausalLM",
-      "total_models": 2,
-      "total_downloads": 152436,
-      "min_param_count": 55732545631,
+      "architecture_id": "Gemma4TextModel",
+      "total_models": 1,
+      "total_downloads": 967,
+      "min_param_count": 4647449856,
       "sample_models": [
-        "aoxo/sarvam-105b-uncensored",
-        "sarvamai/sarvam-105b"
+        "bRadu/gemma-4-E2B-it-textonly"
       ],
-      "relevancy_score": 28.0
+      "relevancy_score": 27.7
     },
     {
       "architecture_id": "WeDLMForCausalLM",
       "total_models": 2,
-      "total_downloads": 4256,
+      "total_downloads": 4219,
       "min_param_count": 8190735360,
       "sample_models": [
         "tencent/WeDLM-8B-Base",
         "tencent/WeDLM-8B-Instruct"
       ],
-      "relevancy_score": 28.0
+      "relevancy_score": 27.5
     },
     {
-      "architecture_id": "SarvamMoEForCausalLM",
+      "architecture_id": "SarvamMLAForCausalLM",
       "total_models": 2,
-      "total_downloads": 149370,
-      "min_param_count": 32152650368,
+      "total_downloads": 151877,
+      "min_param_count": 55732545631,
       "sample_models": [
-        "aoxo/sarvam-30b-uncensored",
-        "sarvamai/sarvam-30b"
+        "aoxo/sarvam-105b-uncensored",
+        "sarvamai/sarvam-105b"
+      ],
+      "relevancy_score": 27.3
+    },
+    {
+      "architecture_id": "ReformerModelWithLMHead",
+      "total_models": 2,
+      "total_downloads": 150609,
+      "min_param_count": null,
+      "sample_models": [
+        "google/reformer-crime-and-punishment",
+        "google/reformer-enwik8"
       ],
-      "relevancy_score": 27.9
+      "relevancy_score": 27.3
     },
     {
       "architecture_id": "HyperCLOVAXForCausalLM",
       "total_models": 1,
-      "total_downloads": 31787,
+      "total_downloads": 31859,
       "min_param_count": 14748112896,
       "sample_models": [
         "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
       ],
-      "relevancy_score": 27.8
+      "relevancy_score": 27.3
     },
     {
-      "architecture_id": "MobilintLlamaForCausalLM",
-      "total_models": 7,
-      "total_downloads": 29407,
-      "min_param_count": null,
+      "architecture_id": "LongLlamaForCausalLM",
+      "total_models": 1,
+      "total_downloads": 780,
+      "min_param_count": 3426474900,
       "sample_models": [
-        "mobilint/Llama-3.2-3B-Instruct",
-        "mobilint/Llama-3.2-1B-Instruct",
-        "mobilint/Llama-3.1-8B-Instruct",
-        "mobilint/HyperCLOVAX-SEED-Text-Instruct-1.5B",
-        "mobilint/Llama-3.2-1B-Instruct-Batch16",
-        "mobilint/Llama-3.1-8B-Instruct-Batch16",
-        "mobilint/Llama-3.2-3B-Instruct-Batch16"
+        "syzymon/long_llama_3b"
       ],
-      "relevancy_score": 27.7
+      "relevancy_score": 27.2
     },
     {
       "architecture_id": "InternLMForCausalLM",
       "total_models": 4,
-      "total_downloads": 72325,
+      "total_downloads": 71468,
       "min_param_count": null,
       "sample_models": [
         "internlm/internlm-chat-7b",
@@ -2615,123 +2667,124 @@
         "internlm/internlm-7b",
         "internlm/internlm-chat-20b"
       ],
-      "relevancy_score": 27.7
-    },
-    {
-      "architecture_id": "LongLlamaForCausalLM",
-      "total_models": 1,
-      "total_downloads": 756,
-      "min_param_count": 3426474900,
-      "sample_models": [
-        "syzymon/long_llama_3b"
-      ],
-      "relevancy_score": 27.5
+      "relevancy_score": 26.9
     },
     {
-      "architecture_id": "GPTNeoXJapaneseForCausalLM",
+      "architecture_id": "SarvamMoEForCausalLM",
       "total_models": 2,
-      "total_downloads": 113485,
-      "min_param_count": null,
+      "total_downloads": 123774,
+      "min_param_count": 32152650368,
       "sample_models": [
-        "abeja/gpt-neox-japanese-2.7b",
-        "hf-tiny-model-private/tiny-random-GPTNeoXJapaneseForCausalLM"
+        "aoxo/sarvam-30b-uncensored",
+        "sarvamai/sarvam-30b"
       ],
-      "relevancy_score": 27.3
+      "relevancy_score": 26.9
     },
     {
-      "architecture_id": "SparseLlamaForCausalLM",
+      "architecture_id": "ZambaForCausalLM",
       "total_models": 1,
-      "total_downloads": 4221,
-      "min_param_count": 8185270336,
+      "total_downloads": 4128,
+      "min_param_count": 7232490496,
       "sample_models": [
-        "openbmb/NOSA-8B"
+        "Zyphra/Zamba-7B-v1"
       ],
-      "relevancy_score": 27.3
+      "relevancy_score": 26.8
     },
     {
-      "architecture_id": "ZambaForCausalLM",
-      "total_models": 1,
-      "total_downloads": 4140,
-      "min_param_count": 7232490496,
+      "architecture_id": "GPTNeoXJapaneseForCausalLM",
+      "total_models": 2,
+      "total_downloads": 110049,
+      "min_param_count": null,
       "sample_models": [
-        "Zyphra/Zamba-7B-v1"
+        "abeja/gpt-neox-japanese-2.7b",
+        "hf-tiny-model-private/tiny-random-GPTNeoXJapaneseForCausalLM"
       ],
-      "relevancy_score": 27.3
+      "relevancy_score": 26.6
     },
     {
-      "architecture_id": "Gemma4TextModel",
+      "architecture_id": "JetMoEForCausalLM",
       "total_models": 1,
-      "total_downloads": 650,
-      "min_param_count": 4647449856,
+      "total_downloads": 3627,
+      "min_param_count": 8522237952,
       "sample_models": [
-        "bRadu/gemma-4-E2B-it-textonly"
+        "jetmoe/jetmoe-8b"
       ],
-      "relevancy_score": 27.1
+      "relevancy_score": 26.5
     },
     {
-      "architecture_id": "BailingMoeForCausalLM",
+      "architecture_id": "CXRMate2ForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 21607,
-      "min_param_count": 16801974272,
+      "total_downloads": 572,
+      "min_param_count": 3322260224,
       "sample_models": [
-        "inclusionAI/Ling-lite-1.5"
+        "aehrc/cxrmate-2"
       ],
-      "relevancy_score": 27.0
+      "relevancy_score": 26.5
     },
     {
-      "architecture_id": "JetMoEForCausalLM",
+      "architecture_id": "BunnyPhiForCausalLM",
       "total_models": 1,
-      "total_downloads": 3665,
-      "min_param_count": 8522237952,
+      "total_downloads": 564,
+      "min_param_count": 3182254624,
       "sample_models": [
-        "jetmoe/jetmoe-8b"
+        "BAAI/Bunny-v1_0-3B"
       ],
-      "relevancy_score": 27.0
+      "relevancy_score": 26.5
     },
     {
       "architecture_id": "Step3p5ForCausalLM",
       "total_models": 1,
-      "total_downloads": 123608,
+      "total_downloads": 133597,
       "min_param_count": 199384301376,
       "sample_models": [
         "stepfun-ai/Step-3.5-Flash"
       ],
-      "relevancy_score": 26.8
+      "relevancy_score": 26.4
     },
     {
-      "architecture_id": "CXRMate2ForConditionalGeneration",
+      "architecture_id": "BailingMoeForCausalLM",
       "total_models": 1,
-      "total_downloads": 560,
-      "min_param_count": 3322260224,
+      "total_downloads": 20878,
+      "min_param_count": 16801974272,
       "sample_models": [
-        "aehrc/cxrmate-2"
+        "inclusionAI/Ling-lite-1.5"
       ],
-      "relevancy_score": 26.8
+      "relevancy_score": 26.4
     },
     {
-      "architecture_id": "BunnyPhiForCausalLM",
+      "architecture_id": "SparseLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 507,
-      "min_param_count": 3182254624,
+      "total_downloads": 3131,
+      "min_param_count": 8185270336,
       "sample_models": [
-        "BAAI/Bunny-v1_0-3B"
+        "openbmb/NOSA-8B"
       ],
-      "relevancy_score": 26.6
+      "relevancy_score": 26.2
     },
     {
       "architecture_id": "Esm2LlamaInstructForCausalLM",
       "total_models": 1,
-      "total_downloads": 2603,
+      "total_downloads": 2589,
       "min_param_count": 10878983201,
       "sample_models": [
         "xiao-fei/Prot2Text-V2-11B-Instruct-hf"
       ],
-      "relevancy_score": 26.2
+      "relevancy_score": 25.8
+    },
+    {
+      "architecture_id": "Qwen2VLAudioForConditionalGeneration",
+      "total_models": 1,
+      "total_downloads": 2177,
+      "min_param_count": 8932935680,
+      "sample_models": [
+        "MayaKD/qwen2-vl-audio"
+      ],
+      "relevancy_score": 25.4
     },
     {
       "architecture_id": "OuroForCausalLM",
       "total_models": 4,
-      "total_downloads": 34326,
+      "total_downloads": 34507,
       "min_param_count": null,
       "sample_models": [
         "ByteDance/Ouro-1.4B",
@@ -2739,54 +2792,53 @@
         "ByteDance/Ouro-2.6B",
         "ByteDance/Ouro-1.4B-Thinking"
       ],
-      "relevancy_score": 26.0
+      "relevancy_score": 25.3
     },
     {
-      "architecture_id": "StableDiffcoderForCausalLM",
-      "total_models": 2,
-      "total_downloads": 1719,
-      "min_param_count": 8250462208,
+      "architecture_id": "FP8Qwen3ForCausalLM",
+      "total_models": 1,
+      "total_downloads": 1945,
+      "min_param_count": 8190735360,
       "sample_models": [
-        "ByteDance-Seed/Stable-DiffCoder-8B-Instruct",
-        "ByteDance-Seed/Stable-DiffCoder-8B-Base"
+        "xihc-ucb/Qwen3-8B-Base-train-Quasar-0809"
       ],
-      "relevancy_score": 26.0
+      "relevancy_score": 25.2
     },
     {
-      "architecture_id": "Qwen2VLAudioForConditionalGeneration",
+      "architecture_id": "CheXagentForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2174,
-      "min_param_count": 8932935680,
+      "total_downloads": 1878,
+      "min_param_count": 8362401664,
       "sample_models": [
-        "MayaKD/qwen2-vl-audio"
+        "StanfordAIMI/CheXagent-8b"
       ],
-      "relevancy_score": 25.8
+      "relevancy_score": 25.1
+    },
+    {
+      "architecture_id": "FP8Qwen2ForCausalLM",
+      "total_models": 1,
+      "total_downloads": 1781,
+      "min_param_count": 7615616512,
+      "sample_models": [
+        "xihc-ucb/Qwen2.5-7B-train-Quasar-1214"
+      ],
+      "relevancy_score": 25.0
     },
     {
       "architecture_id": "BaiChuanForCausalLM",
       "total_models": 2,
-      "total_downloads": 51261,
+      "total_downloads": 50672,
       "min_param_count": null,
       "sample_models": [
         "baichuan-inc/Baichuan-7B",
         "FreedomIntelligence/HuatuoGPT-7B"
       ],
-      "relevancy_score": 25.6
-    },
-    {
-      "architecture_id": "FP8Qwen3ForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1941,
-      "min_param_count": 8190735360,
-      "sample_models": [
-        "xihc-ucb/Qwen3-8B-Base-train-Quasar-0809"
-      ],
-      "relevancy_score": 25.6
+      "relevancy_score": 24.9
     },
     {
       "architecture_id": "MobilintQwen2ForCausalLM",
       "total_models": 4,
-      "total_downloads": 27152,
+      "total_downloads": 27256,
       "min_param_count": null,
       "sample_models": [
         "mobilint/Qwen2.5-0.5B-Instruct",
@@ -2794,97 +2846,75 @@
         "mobilint/Qwen2.5-3B-Instruct",
         "mobilint/Qwen2.5-7B-Instruct"
       ],
-      "relevancy_score": 25.5
-    },
-    {
-      "architecture_id": "MobilintQwen3ForCausalLM",
-      "total_models": 4,
-      "total_downloads": 25718,
-      "min_param_count": null,
-      "sample_models": [
-        "mobilint/Qwen3-0.6B",
-        "mobilint/Qwen3-1.7B",
-        "mobilint/Qwen3-4B",
-        "mobilint/Qwen3-8B"
-      ],
-      "relevancy_score": 25.4
+      "relevancy_score": 24.8
     },
     {
-      "architecture_id": "HCXVisionForCausalLM",
+      "architecture_id": "KORMoForCausalLM",
       "total_models": 1,
-      "total_downloads": 64666,
-      "min_param_count": null,
+      "total_downloads": 1616,
+      "min_param_count": 10756624384,
       "sample_models": [
-        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+        "KORMo-Team/KORMo-10B-sft"
       ],
-      "relevancy_score": 25.4
+      "relevancy_score": 24.8
     },
     {
-      "architecture_id": "FP8Qwen2ForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1774,
-      "min_param_count": 7615616512,
+      "architecture_id": "MobilintQwen3ForCausalLM",
+      "total_models": 4,
+      "total_downloads": 25791,
+      "min_param_count": null,
       "sample_models": [
-        "xihc-ucb/Qwen2.5-7B-train-Quasar-1214"
+        "mobilint/Qwen3-0.6B",
+        "mobilint/Qwen3-1.7B",
+        "mobilint/Qwen3-4B",
+        "mobilint/Qwen3-8B"
       ],
-      "relevancy_score": 25.4
+      "relevancy_score": 24.7
     },
     {
-      "architecture_id": "CheXagentForConditionalGeneration",
+      "architecture_id": "MiMoV2FlashForCausalLM",
       "total_models": 1,
-      "total_downloads": 1766,
-      "min_param_count": 8362401664,
+      "total_downloads": 61449,
+      "min_param_count": 309785318400,
       "sample_models": [
-        "StanfordAIMI/CheXagent-8b"
+        "XiaomiMiMo/MiMo-V2-Flash"
       ],
-      "relevancy_score": 25.4
+      "relevancy_score": 24.7
     },
     {
       "architecture_id": "KimiLinearForCausalLM",
       "total_models": 1,
-      "total_downloads": 60910,
+      "total_downloads": 61051,
       "min_param_count": 49122681728,
       "sample_models": [
         "moonshotai/Kimi-Linear-48B-A3B-Instruct"
       ],
-      "relevancy_score": 25.3
+      "relevancy_score": 24.7
     },
     {
-      "architecture_id": "MiMoV2FlashForCausalLM",
+      "architecture_id": "HCXVisionForCausalLM",
       "total_models": 1,
-      "total_downloads": 58903,
-      "min_param_count": 309785318400,
-      "sample_models": [
-        "XiaomiMiMo/MiMo-V2-Flash"
-      ],
-      "relevancy_score": 25.2
-    },
-    {
-      "architecture_id": "SeedOssForCausalLM",
-      "total_models": 3,
-      "total_downloads": 30541,
+      "total_downloads": 60376,
       "min_param_count": null,
       "sample_models": [
-        "ByteDance-Seed/Seed-OSS-36B-Instruct",
-        "NousResearch/Hermes-4.3-36B",
-        "ByteDance-Seed/Seed-OSS-36B-Base"
+        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
       ],
-      "relevancy_score": 25.1
+      "relevancy_score": 24.7
     },
     {
-      "architecture_id": "KORMoForCausalLM",
+      "architecture_id": "Param2MoEForCausalLM",
       "total_models": 1,
-      "total_downloads": 1512,
-      "min_param_count": 10756624384,
+      "total_downloads": 8281,
+      "min_param_count": 17151140480,
       "sample_models": [
-        "KORMo-Team/KORMo-10B-sft"
+        "bharatgenai/Param2-17B-A2.4B-Thinking"
       ],
-      "relevancy_score": 25.0
+      "relevancy_score": 24.3
     },
     {
       "architecture_id": "MobilintExaoneForCausalLM",
       "total_models": 4,
-      "total_downloads": 20678,
+      "total_downloads": 20759,
       "min_param_count": null,
       "sample_models": [
         "mobilint/EXAONE-Deep-2.4B",
@@ -2892,226 +2922,286 @@
         "mobilint/EXAONE-3.5-7.8B-Instruct",
         "mobilint/EXAONE-Deep-7.8B"
       ],
-      "relevancy_score": 24.9
+      "relevancy_score": 24.2
+    },
+    {
+      "architecture_id": "CogVLMForCausalLM",
+      "total_models": 2,
+      "total_downloads": 5861,
+      "min_param_count": 17639687424,
+      "sample_models": [
+        "zai-org/cogvlm2-llama3-chat-19B",
+        "zai-org/cogvlm-chat-hf"
+      ],
+      "relevancy_score": 24.2
     },
     {
       "architecture_id": "MiniCPMSALAForCausalLM",
       "total_models": 1,
-      "total_downloads": 1403,
+      "total_downloads": 1254,
       "min_param_count": 9477203968,
       "sample_models": [
         "openbmb/MiniCPM-SALA"
       ],
-      "relevancy_score": 24.8
+      "relevancy_score": 24.2
     },
     {
-      "architecture_id": "CogVLMForCausalLM",
-      "total_models": 2,
-      "total_downloads": 5474,
-      "min_param_count": 17639687424,
+      "architecture_id": "Emu3ForCausalLM",
+      "total_models": 1,
+      "total_downloads": 1228,
+      "min_param_count": 8492011520,
       "sample_models": [
-        "zai-org/cogvlm2-llama3-chat-19B",
-        "zai-org/cogvlm-chat-hf"
+        "BAAI/Emu3-Chat"
       ],
-      "relevancy_score": 24.6
+      "relevancy_score": 24.2
     },
     {
-      "architecture_id": "LongcatFlashForCausalLM",
+      "architecture_id": "BunnyLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 45650,
-      "min_param_count": 561862880256,
+      "total_downloads": 1195,
+      "min_param_count": 8479990848,
       "sample_models": [
-        "meituan-longcat/LongCat-Flash-Chat"
+        "typhoon-ai/llama-3-typhoon-v1.5-8b-vision-preview"
       ],
-      "relevancy_score": 24.6
+      "relevancy_score": 24.1
     },
     {
-      "architecture_id": "TrillionForCausalLM",
+      "architecture_id": "LongcatFlashForCausalLM",
       "total_models": 1,
-      "total_downloads": 7481,
-      "min_param_count": 20725519360,
+      "total_downloads": 43995,
+      "min_param_count": 561862880256,
       "sample_models": [
-        "trillionlabs/Tri-21B-Think"
+        "meituan-longcat/LongCat-Flash-Chat"
       ],
-      "relevancy_score": 24.6
+      "relevancy_score": 24.0
     },
     {
       "architecture_id": "InternLM3ForCausalLM",
       "total_models": 1,
-      "total_downloads": 43407,
+      "total_downloads": 43400,
       "min_param_count": null,
       "sample_models": [
         "internlm/internlm3-8b-instruct"
       ],
-      "relevancy_score": 24.5
-    },
-    {
-      "architecture_id": "Param2MoEForCausalLM",
-      "total_models": 1,
-      "total_downloads": 7230,
-      "min_param_count": 17151140480,
-      "sample_models": [
-        "bharatgenai/Param2-17B-A2.4B-Thinking"
-      ],
-      "relevancy_score": 24.5
+      "relevancy_score": 24.0
     },
     {
       "architecture_id": "SteerlingForCausalLM",
       "total_models": 1,
-      "total_downloads": 1203,
+      "total_downloads": 1099,
       "min_param_count": 8391778304,
       "sample_models": [
         "guidelabs/steerling-8b"
       ],
-      "relevancy_score": 24.5
+      "relevancy_score": 23.9
     },
     {
-      "architecture_id": "Emu3ForCausalLM",
+      "architecture_id": "ExaoneMoEForCausalLM",
       "total_models": 1,
-      "total_downloads": 1181,
-      "min_param_count": 8492011520,
+      "total_downloads": 37191,
+      "min_param_count": 237099669632,
       "sample_models": [
-        "BAAI/Emu3-Chat"
+        "LGAI-EXAONE/K-EXAONE-236B-A23B"
       ],
-      "relevancy_score": 24.5
+      "relevancy_score": 23.6
     },
     {
-      "architecture_id": "BunnyLlamaForCausalLM",
+      "architecture_id": "StableDiffcoderForCausalLM",
       "total_models": 1,
-      "total_downloads": 1149,
-      "min_param_count": 8479990848,
+      "total_downloads": 871,
+      "min_param_count": 8250462208,
       "sample_models": [
-        "typhoon-ai/llama-3-typhoon-v1.5-8b-vision-preview"
+        "ByteDance-Seed/Stable-DiffCoder-8B-Instruct"
       ],
-      "relevancy_score": 24.4
+      "relevancy_score": 23.4
     },
     {
       "architecture_id": "MiniMaxM1ForCausalLM",
       "total_models": 2,
-      "total_downloads": 24236,
+      "total_downloads": 23252,
       "min_param_count": null,
       "sample_models": [
         "MiniMaxAI/MiniMax-M1-40k",
         "MiniMaxAI/MiniMax-M1-80k"
       ],
-      "relevancy_score": 23.9
+      "relevancy_score": 23.2
     },
     {
       "architecture_id": "ICONNForCausalLM",
       "total_models": 1,
-      "total_downloads": 903,
+      "total_downloads": 745,
       "min_param_count": 7833409536,
       "sample_models": [
         "ICONNAI/ICONN-1-Mini-Beta"
       ],
-      "relevancy_score": 23.9
+      "relevancy_score": 23.1
     },
     {
       "architecture_id": "Qwen2VLForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 770,
+      "total_downloads": 726,
       "min_param_count": 8291375616,
       "sample_models": [
         "typhoon-ai/typhoon2-qwen2vl-7b-vision-instruct"
       ],
-      "relevancy_score": 23.5
+      "relevancy_score": 23.0
     },
     {
-      "architecture_id": "BailingMoeV2_5ForCausalLM",
+      "architecture_id": "Qwen2Model",
       "total_models": 1,
-      "total_downloads": 24780,
-      "min_param_count": 1012474606720,
+      "total_downloads": 681,
+      "min_param_count": 7070619136,
       "sample_models": [
-        "inclusionAI/Ring-2.5-1T"
+        "NewBeeKing/MemPO_Qwen2.5-SFT-RL"
       ],
-      "relevancy_score": 23.3
+      "relevancy_score": 22.9
     },
     {
-      "architecture_id": "ExaoneMoEForCausalLM",
+      "architecture_id": "LLaDAMoEModel",
       "total_models": 1,
-      "total_downloads": 24437,
-      "min_param_count": 237099669632,
+      "total_downloads": 665,
+      "min_param_count": 7356880896,
       "sample_models": [
-        "LGAI-EXAONE/K-EXAONE-236B-A23B"
+        "inclusionAI/LLaDA-MoE-7B-A1B-Base"
       ],
-      "relevancy_score": 23.2
+      "relevancy_score": 22.8
+    },
+    {
+      "architecture_id": "Gemma4ForCausalLM",
+      "total_models": 1,
+      "total_downloads": 664,
+      "min_param_count": 7518069034,
+      "sample_models": [
+        "aqweteddy/gemma-4-E4B-it-text"
+      ],
+      "relevancy_score": 22.8
+    },
+    {
+      "architecture_id": "BailingMoeV2_5ForCausalLM",
+      "total_models": 1,
+      "total_downloads": 24448,
+      "min_param_count": 1012474606720,
+      "sample_models": [
+        "inclusionAI/Ring-2.5-1T"
+      ],
+      "relevancy_score": 22.7
     },
     {
       "architecture_id": "CogVLMVideoForCausalLM",
       "total_models": 1,
-      "total_downloads": 681,
+      "total_downloads": 622,
       "min_param_count": 12507532544,
       "sample_models": [
         "zai-org/VisionReward-Video"
       ],
-      "relevancy_score": 23.2
+      "relevancy_score": 22.7
     },
     {
       "architecture_id": "Ernie4_5ForCausalLM",
       "total_models": 2,
-      "total_downloads": 17478,
+      "total_downloads": 17079,
       "min_param_count": null,
       "sample_models": [
         "baidu/ERNIE-4.5-0.3B-PT",
         "baidu/ERNIE-4.5-0.3B-Base-PT"
       ],
-      "relevancy_score": 23.1
+      "relevancy_score": 22.6
     },
     {
       "architecture_id": "CLIPT5ForConditionalGeneration",
       "total_models": 2,
-      "total_downloads": 16500,
+      "total_downloads": 17282,
       "min_param_count": null,
       "sample_models": [
         "zhiqiulin/clip-flant5-xl",
         "zhiqiulin/clip-flant5-xxl"
       ],
-      "relevancy_score": 23.0
+      "relevancy_score": 22.6
+    },
+    {
+      "architecture_id": "CodeShellForCausalLM",
+      "total_models": 1,
+      "total_downloads": 610,
+      "min_param_count": 7688051328,
+      "sample_models": [
+        "WisdomShell/CodeShell-7B"
+      ],
+      "relevancy_score": 22.6
+    },
+    {
+      "architecture_id": "SolarForCausalLM",
+      "total_models": 1,
+      "total_downloads": 21092,
+      "min_param_count": null,
+      "sample_models": [
+        "upstage/solar-pro-preview-instruct"
+      ],
+      "relevancy_score": 22.4
     },
     {
       "architecture_id": "Grok1ModelForCausalLM",
       "total_models": 1,
-      "total_downloads": 21640,
+      "total_downloads": 20827,
       "min_param_count": null,
       "sample_models": [
         "hpcai-tech/grok-1"
       ],
-      "relevancy_score": 23.0
+      "relevancy_score": 22.4
     },
     {
-      "architecture_id": "CodeShellForCausalLM",
+      "architecture_id": "InternLM2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 614,
-      "min_param_count": 7688051328,
+      "total_downloads": 513,
+      "min_param_count": 7737708544,
       "sample_models": [
-        "WisdomShell/CodeShell-7B"
+        "AI4Chem/ChemLLM-7B-Chat"
       ],
-      "relevancy_score": 23.0
+      "relevancy_score": 22.3
     },
     {
-      "architecture_id": "SolarForCausalLM",
+      "architecture_id": "GptOssPuzzleForCausalLM",
       "total_models": 1,
-      "total_downloads": 21294,
-      "min_param_count": null,
+      "total_downloads": 18998,
+      "min_param_count": 90837823680,
       "sample_models": [
-        "upstage/solar-pro-preview-instruct"
+        "nvidia/gpt-oss-puzzle-88B"
       ],
-      "relevancy_score": 22.9
+      "relevancy_score": 22.2
     },
     {
-      "architecture_id": "LLaDAMoEModel",
+      "architecture_id": "TrillionForCausalLM",
       "total_models": 1,
-      "total_downloads": 586,
-      "min_param_count": 7356880896,
+      "total_downloads": 3124,
+      "min_param_count": 20725519360,
       "sample_models": [
-        "inclusionAI/LLaDA-MoE-7B-A1B-Base"
+        "trillionlabs/Tri-21B-Think"
       ],
-      "relevancy_score": 22.9
+      "relevancy_score": 22.2
+    },
+    {
+      "architecture_id": "RecaLLMLlamaForCausalLM",
+      "total_models": 1,
+      "total_downloads": 506,
+      "min_param_count": 8030294016,
+      "sample_models": [
+        "kswhitecross/RecaLLM-Llama-3.1-8B"
+      ],
+      "relevancy_score": 22.2
+    },
+    {
+      "architecture_id": "CohereForCausalLM",
+      "total_models": 1,
+      "total_downloads": 504,
+      "min_param_count": 8028033024,
+      "sample_models": [
+        "Yousefbahr/Turjman-Cold-Start"
+      ],
+      "relevancy_score": 22.2
     },
     {
       "architecture_id": "LISAForCausalLM",
       "total_models": 5,
-      "total_downloads": 5532,
+      "total_downloads": 5924,
       "min_param_count": null,
       "sample_models": [
         "xinlai/LISA-13B-llama2-v1",
@@ -3120,1317 +3210,1350 @@
         "xinlai/LISA-13B-llama2-v1-explanatory",
         "MBZUAI/GLaMM-GranD-Pretrained"
       ],
-      "relevancy_score": 22.6
+      "relevancy_score": 22.1
     },
     {
       "architecture_id": "Qwen2_5_VLForConditionalGeneration",
       "total_models": 3,
-      "total_downloads": 10261,
+      "total_downloads": 10615,
       "min_param_count": null,
       "sample_models": [
         "OmniSVG/OmniSVG1.1_4B",
         "OmniSVG/OmniSVG1.1_8B",
         "OmniSVG/OmniSVG"
       ],
-      "relevancy_score": 22.6
+      "relevancy_score": 22.1
+    },
+    {
+      "architecture_id": "OutlierMoEForCausalLM",
+      "total_models": 3,
+      "total_downloads": 1690,
+      "min_param_count": 22813220976,
+      "sample_models": [
+        "Outlier-Ai/Outlier-40B-V3.2",
+        "Outlier-Ai/Outlier-10B-V3.2",
+        "Outlier-Ai/Outlier-70B-V3.2"
+      ],
+      "relevancy_score": 22.1
     },
     {
       "architecture_id": "OrionForCausalLM",
       "total_models": 2,
-      "total_downloads": 13834,
+      "total_downloads": 13391,
       "min_param_count": null,
       "sample_models": [
         "OrionStarAI/Orion-14B-Chat",
         "OrionStarAI/Orion-14B-Base"
       ],
-      "relevancy_score": 22.6
-    },
-    {
-      "architecture_id": "GptOssPuzzleForCausalLM",
-      "total_models": 1,
-      "total_downloads": 17281,
-      "min_param_count": 90837823680,
-      "sample_models": [
-        "nvidia/gpt-oss-puzzle-88B"
-      ],
-      "relevancy_score": 22.5
+      "relevancy_score": 22.0
     },
     {
       "architecture_id": "HunYuanMoEV1ForCausalLM",
       "total_models": 1,
-      "total_downloads": 16793,
+      "total_downloads": 15552,
       "min_param_count": null,
       "sample_models": [
         "tencent/Hunyuan-A13B-Instruct"
       ],
-      "relevancy_score": 22.4
+      "relevancy_score": 21.7
     },
     {
-      "architecture_id": "Dots1ForCausalLM",
+      "architecture_id": "GravityMoEForCausalLM",
       "total_models": 2,
-      "total_downloads": 10490,
-      "min_param_count": 142774381696,
+      "total_downloads": 1655,
+      "min_param_count": 16242181824,
       "sample_models": [
-        "rednote-hilab/dots.llm1.inst",
-        "rednote-hilab/dots.llm1.base"
+        "learning-unit/L1-16B-A3B",
+        "trillionlabs/Gravity-16B-A3B-Base"
       ],
-      "relevancy_score": 22.0
+      "relevancy_score": 21.5
     },
     {
       "architecture_id": "MiniCPM3ForCausalLM",
       "total_models": 1,
-      "total_downloads": 14420,
+      "total_downloads": 14024,
       "min_param_count": null,
       "sample_models": [
         "openbmb/MiniCPM3-4B"
       ],
-      "relevancy_score": 22.0
+      "relevancy_score": 21.5
     },
     {
-      "architecture_id": "IQuestLoopCoderForCausalLM",
+      "architecture_id": "ArcticForCausalLM",
       "total_models": 1,
-      "total_downloads": 14278,
-      "min_param_count": 39794696320,
+      "total_downloads": 13989,
+      "min_param_count": null,
       "sample_models": [
-        "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"
+        "Snowflake/snowflake-arctic-instruct"
       ],
-      "relevancy_score": 22.0
+      "relevancy_score": 21.5
     },
     {
-      "architecture_id": "ArcticForCausalLM",
-      "total_models": 1,
-      "total_downloads": 14245,
-      "min_param_count": null,
+      "architecture_id": "Dots1ForCausalLM",
+      "total_models": 2,
+      "total_downloads": 10143,
+      "min_param_count": 142774381696,
       "sample_models": [
-        "Snowflake/snowflake-arctic-instruct"
+        "rednote-hilab/dots.llm1.inst",
+        "rednote-hilab/dots.llm1.base"
       ],
-      "relevancy_score": 22.0
+      "relevancy_score": 21.4
     },
     {
-      "architecture_id": "LlavaLlamaModel",
-      "total_models": 4,
-      "total_downloads": 5317,
-      "min_param_count": null,
+      "architecture_id": "IQuestLoopCoderForCausalLM",
+      "total_models": 1,
+      "total_downloads": 13700,
+      "min_param_count": 39794696320,
       "sample_models": [
-        "Efficient-Large-Model/VILA1.5-3b",
-        "Efficient-Large-Model/NVILA-8B",
-        "Efficient-Large-Model/VILA1.5-13b",
-        "Efficient-Large-Model/NVILA-Lite-8B"
+        "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"
       ],
-      "relevancy_score": 21.8
+      "relevancy_score": 21.4
     },
     {
       "architecture_id": "GPT2LMHeadCustomModel",
       "total_models": 2,
-      "total_downloads": 8956,
+      "total_downloads": 8718,
       "min_param_count": null,
       "sample_models": [
         "bigcode/santacoder",
         "rbiojout/santacoder-odoo-15"
       ],
-      "relevancy_score": 21.7
-    },
-    {
-      "architecture_id": "MosaicGPT",
-      "total_models": 3,
-      "total_downloads": 6562,
-      "min_param_count": null,
-      "sample_models": [
-        "anas-awadalla/mpt-1b-redpajama-200b",
-        "anas-awadalla/mpt-1b-redpajama-200b-dolly",
-        "anas-awadalla/mpt-1b-redpajama-200b-hf-style"
-      ],
-      "relevancy_score": 21.6
+      "relevancy_score": 21.1
     },
     {
-      "architecture_id": "OpenMoeForCausalLM",
-      "total_models": 5,
-      "total_downloads": 3349,
+      "architecture_id": "MiniMaxText01ForCausalLM",
+      "total_models": 1,
+      "total_downloads": 11872,
       "min_param_count": null,
-      "sample_models": [
-        "hpcai-tech/openmoe-8B",
-        "OrionZheng/openmoe-base",
-        "OrionZheng/openmoe-8b",
-        "hpcai-tech/openmoe-base",
-        "OrionZheng/openmoe-8b-chat"
+      "sample_models": [
+        "MiniMaxAI/MiniMax-Text-01"
       ],
-      "relevancy_score": 21.5
+      "relevancy_score": 21.1
     },
     {
       "architecture_id": "GPTRefactForCausalLM",
       "total_models": 2,
-      "total_downloads": 8330,
+      "total_downloads": 8406,
       "min_param_count": null,
       "sample_models": [
         "refactai/Refact-1_6B-fim",
         "refactai/Refact-1_6-base"
       ],
-      "relevancy_score": 21.5
+      "relevancy_score": 21.0
     },
     {
       "architecture_id": "CrystalCoderLMHeadModel",
       "total_models": 2,
-      "total_downloads": 7989,
+      "total_downloads": 8126,
       "min_param_count": null,
       "sample_models": [
         "LLM360/Crystal",
         "LLM360/CrystalChat"
       ],
-      "relevancy_score": 21.4
+      "relevancy_score": 20.9
+    },
+    {
+      "architecture_id": "OpenMoeForCausalLM",
+      "total_models": 5,
+      "total_downloads": 3190,
+      "min_param_count": null,
+      "sample_models": [
+        "hpcai-tech/openmoe-8B",
+        "OrionZheng/openmoe-base",
+        "hpcai-tech/openmoe-base",
+        "OrionZheng/openmoe-8b",
+        "OrionZheng/openmoe-8b-chat"
+      ],
+      "relevancy_score": 20.8
+    },
+    {
+      "architecture_id": "Bagel",
+      "total_models": 1,
+      "total_downloads": 1657,
+      "min_param_count": 14691079811,
+      "sample_models": [
+        "lmms-lab/BAGEL-7B-MoT-ver.LE"
+      ],
+      "relevancy_score": 20.8
     },
     {
       "architecture_id": "MobileLlamaForCausalLM",
       "total_models": 4,
-      "total_downloads": 4006,
+      "total_downloads": 4211,
       "min_param_count": null,
       "sample_models": [
         "mtgv/MobileVLM_V2-1.7B",
+        "mtgv/MobileVLM-1.7B",
         "mtgv/MobileVLM_V2-7B",
-        "mtgv/MobileVLM_V2-3B",
-        "mtgv/MobileVLM-1.7B"
+        "mtgv/MobileVLM_V2-3B"
+      ],
+      "relevancy_score": 20.7
+    },
+    {
+      "architecture_id": "BlueLMForCausalLM",
+      "total_models": 3,
+      "total_downloads": 5311,
+      "min_param_count": null,
+      "sample_models": [
+        "vivo-ai/BlueLM-7B-Chat",
+        "vivo-ai/BlueLM-7B-Chat-32K",
+        "vivo-ai/BlueLM-7B-Base"
       ],
-      "relevancy_score": 21.2
+      "relevancy_score": 20.6
     },
     {
       "architecture_id": "modeling_camelidae.LlamaForCausalLM",
       "total_models": 3,
-      "total_downloads": 5505,
+      "total_downloads": 5073,
       "min_param_count": null,
       "sample_models": [
         "hywu/Camelidae-8x34B",
         "hywu/Camelidae-8x7B",
         "hywu/Camelidae-8x13B"
       ],
-      "relevancy_score": 21.2
+      "relevancy_score": 20.5
     },
     {
-      "architecture_id": "BlueLMForCausalLM",
+      "architecture_id": "MosaicGPT",
       "total_models": 3,
-      "total_downloads": 5360,
+      "total_downloads": 4781,
       "min_param_count": null,
       "sample_models": [
-        "vivo-ai/BlueLM-7B-Chat",
-        "vivo-ai/BlueLM-7B-Base",
-        "vivo-ai/BlueLM-7B-Chat-32K"
+        "anas-awadalla/mpt-1b-redpajama-200b",
+        "anas-awadalla/mpt-1b-redpajama-200b-dolly",
+        "anas-awadalla/mpt-1b-redpajama-200b-hf-style"
       ],
-      "relevancy_score": 21.2
+      "relevancy_score": 20.4
     },
     {
-      "architecture_id": "Bagel",
-      "total_models": 1,
-      "total_downloads": 1511,
-      "min_param_count": 14691079811,
+      "architecture_id": "MultiScaleForCausalLM",
+      "total_models": 3,
+      "total_downloads": 4272,
+      "min_param_count": null,
       "sample_models": [
-        "lmms-lab/BAGEL-7B-MoT-ver.LE"
+        "KoinicLabs/AXL-Translate",
+        "KoinicLabs/AXL-Vision-v2",
+        "KoinicLabs/AXL-Chat-10M"
       ],
-      "relevancy_score": 21.0
+      "relevancy_score": 20.1
     },
     {
-      "architecture_id": "GPT2Model",
+      "architecture_id": "ModernBertDecoderForCausalLM",
       "total_models": 2,
-      "total_downloads": 5577,
+      "total_downloads": 5509,
       "min_param_count": null,
       "sample_models": [
-        "keshan/sinhala-gpt2",
-        "cerebras/Cerebras-GPT-13B"
+        "jhu-clsp/ettin-decoder-400m",
+        "jhu-clsp/ettin-decoder-32m"
       ],
-      "relevancy_score": 20.6
+      "relevancy_score": 20.1
     },
     {
       "architecture_id": "LiquidForCausalLM",
       "total_models": 2,
-      "total_downloads": 5508,
+      "total_downloads": 5676,
       "min_param_count": null,
       "sample_models": [
         "reaperdoesntknow/DNA-175M",
         "reaperdoesntknow/DNA-50M"
       ],
-      "relevancy_score": 20.6
+      "relevancy_score": 20.1
     },
     {
-      "architecture_id": "ModernBertDecoderForCausalLM",
-      "total_models": 2,
-      "total_downloads": 5245,
+      "architecture_id": "LlavaLlamaModel",
+      "total_models": 3,
+      "total_downloads": 4035,
       "min_param_count": null,
       "sample_models": [
-        "jhu-clsp/ettin-decoder-400m",
-        "jhu-clsp/ettin-decoder-32m"
+        "Efficient-Large-Model/VILA1.5-3b",
+        "Efficient-Large-Model/NVILA-8B",
+        "Efficient-Large-Model/VILA1.5-13b"
       ],
-      "relevancy_score": 20.5
+      "relevancy_score": 20.0
     },
     {
-      "architecture_id": "BottleneckT5LMWithPerturb",
-      "total_models": 4,
-      "total_downloads": 2754,
+      "architecture_id": "GPT2Model",
+      "total_models": 2,
+      "total_downloads": 5435,
       "min_param_count": null,
       "sample_models": [
-        "thesephist/contra-bottleneck-t5-small-wikipedia",
-        "thesephist/contra-bottleneck-t5-base-wikipedia",
-        "thesephist/contra-bottleneck-t5-large-wikipedia",
-        "thesephist/contra-bottleneck-t5-xl-wikipedia"
+        "cerebras/Cerebras-GPT-13B",
+        "keshan/sinhala-gpt2"
       ],
-      "relevancy_score": 20.4
+      "relevancy_score": 20.0
     },
     {
-      "architecture_id": "MultiScaleForCausalLM",
-      "total_models": 3,
-      "total_downloads": 3737,
+      "architecture_id": "KonkanGPT",
+      "total_models": 2,
+      "total_downloads": 5113,
       "min_param_count": null,
       "sample_models": [
-        "KoinicLabs/AXL-Vision-v2",
-        "KoinicLabs/AXL-Translate",
-        "KoinicLabs/AXL-Chat-10M"
+        "omdeep22/Gonyai-teo2",
+        "omdeep22/Gonyai-v1"
       ],
-      "relevancy_score": 20.4
+      "relevancy_score": 19.9
     },
     {
       "architecture_id": "InternLMXComposer2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 6744,
+      "total_downloads": 6712,
       "min_param_count": null,
       "sample_models": [
         "internlm/internlm-xcomposer2-7b"
       ],
-      "relevancy_score": 20.4
+      "relevancy_score": 19.9
     },
     {
-      "architecture_id": "KonkanGPT",
-      "total_models": 2,
-      "total_downloads": 4822,
+      "architecture_id": "BottleneckT5LMWithPerturb",
+      "total_models": 4,
+      "total_downloads": 2779,
       "min_param_count": null,
       "sample_models": [
-        "omdeep22/Gonyai-teo2",
-        "omdeep22/Gonyai-v1"
+        "thesephist/contra-bottleneck-t5-small-wikipedia",
+        "thesephist/contra-bottleneck-t5-base-wikipedia",
+        "thesephist/contra-bottleneck-t5-large-wikipedia",
+        "thesephist/contra-bottleneck-t5-xl-wikipedia"
       ],
-      "relevancy_score": 20.3
+      "relevancy_score": 19.8
     },
     {
-      "architecture_id": "GraphT5TransformerForConditionalGeneration",
-      "total_models": 1,
-      "total_downloads": 6732,
+      "architecture_id": "NanochatWasmFusedModel",
+      "total_models": 2,
+      "total_downloads": 4952,
       "min_param_count": null,
       "sample_models": [
-        "haitengzhao/gimlet"
+        "eastlondoner/nanochat-wasm-fused-preview-01",
+        "eastlondoner/nanochat-wasm-fused-preview-02"
       ],
-      "relevancy_score": 20.3
+      "relevancy_score": 19.8
     },
     {
       "architecture_id": "MobilintExaone4ForCausalLM",
       "total_models": 1,
-      "total_downloads": 6493,
+      "total_downloads": 6518,
       "min_param_count": null,
       "sample_models": [
         "mobilint/EXAONE-4.0-1.2B"
       ],
-      "relevancy_score": 20.3
+      "relevancy_score": 19.8
     },
     {
       "architecture_id": "LlamaMoEForCausalLM",
       "total_models": 3,
-      "total_downloads": 3428,
+      "total_downloads": 3452,
       "min_param_count": null,
       "sample_models": [
         "llama-moe/LLaMA-MoE-v1-3_5B-2_8",
         "llama-moe/LLaMA-MoE-v1-3_0B-2_16",
         "llama-moe/LLaMA-MoE-v1-3_5B-4_16"
       ],
-      "relevancy_score": 20.2
+      "relevancy_score": 19.7
     },
     {
       "architecture_id": "RobertaForCausalLM",
       "total_models": 2,
-      "total_downloads": 4402,
+      "total_downloads": 4451,
       "min_param_count": null,
       "sample_models": [
         "uf-aice-lab/math-roberta",
         "gokceuludogan/ChemBERTaLM"
       ],
-      "relevancy_score": 20.1
+      "relevancy_score": 19.6
     },
     {
       "architecture_id": "MossForCausalLM",
       "total_models": 2,
-      "total_downloads": 4348,
+      "total_downloads": 4390,
       "min_param_count": null,
       "sample_models": [
         "OpenMOSS-Team/moss-moon-003-sft",
         "OpenMOSS-Team/moss-moon-003-base"
       ],
-      "relevancy_score": 20.0
+      "relevancy_score": 19.6
     },
     {
-      "architecture_id": "BartForCausalLM",
-      "total_models": 2,
-      "total_downloads": 4190,
+      "architecture_id": "Qwen3TSForCausalLM",
+      "total_models": 1,
+      "total_downloads": 5950,
       "min_param_count": null,
       "sample_models": [
-        "sanchit-gandhi/tiny-random-bart-fp16",
-        "hf-tiny-model-private/tiny-random-BartForCausalLM"
+        "bytedance-research/ChatTS-8B"
       ],
-      "relevancy_score": 20.0
+      "relevancy_score": 19.6
     },
     {
       "architecture_id": "Int8OPTForCausalLM",
       "total_models": 2,
-      "total_downloads": 4147,
+      "total_downloads": 4242,
       "min_param_count": null,
       "sample_models": [
         "mit-han-lab/opt-125m-smoothquant",
         "mit-han-lab/opt-6.7b-smoothquant"
       ],
-      "relevancy_score": 19.9
+      "relevancy_score": 19.5
     },
     {
-      "architecture_id": "InternLMXComposerForCausalLM",
-      "total_models": 1,
-      "total_downloads": 5444,
+      "architecture_id": "BartForCausalLM",
+      "total_models": 2,
+      "total_downloads": 4186,
       "min_param_count": null,
       "sample_models": [
-        "internlm/internlm-xcomposer-7b"
+        "sanchit-gandhi/tiny-random-bart-fp16",
+        "hf-tiny-model-private/tiny-random-BartForCausalLM"
       ],
-      "relevancy_score": 19.9
+      "relevancy_score": 19.5
     },
     {
       "architecture_id": "TranceptionLMHeadModel",
       "total_models": 2,
-      "total_downloads": 3959,
+      "total_downloads": 4081,
       "min_param_count": null,
       "sample_models": [
         "PascalNotin/Tranception_Large",
         "PascalNotin/Tranception_Small"
       ],
-      "relevancy_score": 19.8
+      "relevancy_score": 19.4
     },
     {
-      "architecture_id": "ModelStarOLMhead",
+      "architecture_id": "InternLMXComposerForCausalLM",
       "total_models": 1,
-      "total_downloads": 5177,
-      "min_param_count": null,
-      "sample_models": [
-        "Hawa-Al-Akram/StarO-Ai"
-      ],
-      "relevancy_score": 19.8
-    },
-    {
-      "architecture_id": "NanochatWasmFusedModel",
-      "total_models": 2,
-      "total_downloads": 3734,
+      "total_downloads": 5363,
       "min_param_count": null,
       "sample_models": [
-        "eastlondoner/nanochat-wasm-fused-preview-01",
-        "eastlondoner/nanochat-wasm-fused-preview-02"
+        "internlm/internlm-xcomposer-7b"
       ],
-      "relevancy_score": 19.7
+      "relevancy_score": 19.4
     },
     {
-      "architecture_id": "Qwen3TSForCausalLM",
+      "architecture_id": "ModelStarOLMhead",
       "total_models": 1,
-      "total_downloads": 5082,
+      "total_downloads": 5178,
       "min_param_count": null,
       "sample_models": [
-        "bytedance-research/ChatTS-8B"
+        "Hawa-Al-Akram/StarO-Ai"
       ],
-      "relevancy_score": 19.7
+      "relevancy_score": 19.3
     },
     {
-      "architecture_id": "TransfoXLLMHeadModel",
-      "total_models": 1,
-      "total_downloads": 4728,
+      "architecture_id": "Olmo2ForSequenceClassification",
+      "total_models": 2,
+      "total_downloads": 3565,
       "min_param_count": null,
       "sample_models": [
-        "transfo-xl/transfo-xl-wt103"
+        "allenai/OLMo-2-1124-7B-RM",
+        "LifeWiki-ai/OLMo-2-1124-7B-RM"
       ],
-      "relevancy_score": 19.6
+      "relevancy_score": 19.1
     },
     {
-      "architecture_id": "Olmo2ForSequenceClassification",
-      "total_models": 2,
-      "total_downloads": 3444,
+      "architecture_id": "GraphT5TransformerForConditionalGeneration",
+      "total_models": 1,
+      "total_downloads": 4795,
       "min_param_count": null,
       "sample_models": [
-        "allenai/OLMo-2-1124-7B-RM",
-        "LifeWiki-ai/OLMo-2-1124-7B-RM"
+        "haitengzhao/gimlet"
       ],
-      "relevancy_score": 19.5
+      "relevancy_score": 19.1
     },
     {
       "architecture_id": "EvafrillMoForCausalLM",
       "total_models": 1,
-      "total_downloads": 4484,
+      "total_downloads": 4556,
       "min_param_count": null,
       "sample_models": [
         "pathcosmos/EVAFRILL-Mo-3B"
       ],
-      "relevancy_score": 19.4
+      "relevancy_score": 19.0
     },
     {
       "architecture_id": "Qwen2TSForCausalLM",
       "total_models": 1,
-      "total_downloads": 3992,
+      "total_downloads": 4042,
       "min_param_count": null,
       "sample_models": [
         "bytedance-research/ChatTS-14B"
       ],
-      "relevancy_score": 19.2
+      "relevancy_score": 18.8
     },
     {
       "architecture_id": "QEDForCausalLM",
       "total_models": 1,
-      "total_downloads": 3794,
+      "total_downloads": 4040,
       "min_param_count": null,
       "sample_models": [
         "levossadtchi/QED-75M"
       ],
-      "relevancy_score": 19.1
+      "relevancy_score": 18.8
     },
     {
-      "architecture_id": "LongcatCausalLM",
+      "architecture_id": "MochivaForCausalLM",
       "total_models": 1,
-      "total_downloads": 3590,
-      "min_param_count": 561862880256,
+      "total_downloads": 3969,
+      "min_param_count": null,
       "sample_models": [
-        "meituan-longcat/LongCat-Flash-Thinking-2601"
+        "Mochiva-team/Mochiva-model"
       ],
-      "relevancy_score": 18.9
+      "relevancy_score": 18.7
     },
     {
-      "architecture_id": "YuanForCausalLM",
-      "total_models": 3,
-      "total_downloads": 1880,
+      "architecture_id": "TransfoXLLMHeadModel",
+      "total_models": 1,
+      "total_downloads": 3909,
       "min_param_count": null,
       "sample_models": [
-        "IEITYuan/Yuan2-M32-hf",
-        "IEITYuan/Yuan2-2B-Mars-hf",
-        "IEITYuan/Yuan2-2B-Janus-hf"
+        "transfo-xl/transfo-xl-wt103"
       ],
-      "relevancy_score": 18.8
+      "relevancy_score": 18.7
     },
     {
-      "architecture_id": "GomeForCausalLM",
+      "architecture_id": "LongcatCausalLM",
       "total_models": 1,
-      "total_downloads": 3428,
-      "min_param_count": null,
+      "total_downloads": 3688,
+      "min_param_count": 561862880256,
       "sample_models": [
-        "Prositron/gome"
+        "meituan-longcat/LongCat-Flash-Thinking-2601"
       ],
-      "relevancy_score": 18.8
+      "relevancy_score": 18.6
     },
     {
-      "architecture_id": "GravityMoEForCausalLM",
+      "architecture_id": "GomeForCausalLM",
       "total_models": 1,
-      "total_downloads": 541,
-      "min_param_count": 16242181824,
+      "total_downloads": 3500,
+      "min_param_count": null,
       "sample_models": [
-        "learning-unit/L1-16B-A3B"
+        "Prositron/gome"
       ],
-      "relevancy_score": 18.7
+      "relevancy_score": 18.5
     },
     {
-      "architecture_id": "GPT",
-      "total_models": 2,
-      "total_downloads": 2262,
+      "architecture_id": "YuanForCausalLM",
+      "total_models": 3,
+      "total_downloads": 1888,
       "min_param_count": null,
       "sample_models": [
-        "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M",
-        "LH-Tech-AI/Apex-1.5-Instruct-350M"
+        "IEITYuan/Yuan2-M32-hf",
+        "IEITYuan/Yuan2-2B-Mars-hf",
+        "IEITYuan/Yuan2-2B-Janus-hf"
       ],
-      "relevancy_score": 18.6
+      "relevancy_score": 18.4
     },
     {
       "architecture_id": "MyAwesomeModelForCausalLM",
       "total_models": 1,
-      "total_downloads": 3025,
+      "total_downloads": 3098,
       "min_param_count": null,
       "sample_models": [
         "dongbobo/MyAwesomeModel"
       ],
-      "relevancy_score": 18.6
+      "relevancy_score": 18.2
     },
     {
-      "architecture_id": "CTRLLMHeadModel",
+      "architecture_id": "QHEARTForECGQA",
       "total_models": 1,
-      "total_downloads": 2889,
+      "total_downloads": 2956,
       "min_param_count": null,
       "sample_models": [
-        "sshleifer/tiny-ctrl"
+        "Manhph2211/Q-HEART"
       ],
-      "relevancy_score": 18.5
+      "relevancy_score": 18.1
     },
     {
-      "architecture_id": "CPMAntForCausalLM",
+      "architecture_id": "CTRLLMHeadModel",
       "total_models": 1,
-      "total_downloads": 2814,
+      "total_downloads": 2941,
       "min_param_count": null,
       "sample_models": [
-        "openbmb/cpm-ant-10b"
+        "sshleifer/tiny-ctrl"
       ],
-      "relevancy_score": 18.4
+      "relevancy_score": 18.1
     },
     {
-      "architecture_id": "TAMELM",
+      "architecture_id": "GPT2CustomLMHeadModel",
       "total_models": 1,
-      "total_downloads": 2738,
+      "total_downloads": 2852,
       "min_param_count": null,
       "sample_models": [
-        "reaperdoesntknow/TameForCasualLM"
+        "fxmarty/tiny-testing-gpt2-remote-code"
       ],
-      "relevancy_score": 18.3
+      "relevancy_score": 18.0
     },
     {
-      "architecture_id": "CoherenceMomentumModel",
+      "architecture_id": "TAMELM",
       "total_models": 1,
-      "total_downloads": 2731,
+      "total_downloads": 2823,
       "min_param_count": null,
       "sample_models": [
-        "aisingapore/coherence-momentum"
+        "reaperdoesntknow/TameForCasualLM"
       ],
-      "relevancy_score": 18.3
+      "relevancy_score": 18.0
     },
     {
-      "architecture_id": "GPT2CustomLMHeadModel",
+      "architecture_id": "CoherenceMomentumModel",
       "total_models": 1,
-      "total_downloads": 2691,
+      "total_downloads": 2756,
       "min_param_count": null,
       "sample_models": [
-        "fxmarty/tiny-testing-gpt2-remote-code"
+        "aisingapore/coherence-momentum"
       ],
-      "relevancy_score": 18.3
+      "relevancy_score": 17.9
     },
     {
       "architecture_id": "GPT2",
       "total_models": 1,
-      "total_downloads": 2643,
+      "total_downloads": 2709,
       "min_param_count": null,
       "sample_models": [
         "NamrataThakur/Small_Language_Model_MHA_53M_Pretrained"
       ],
-      "relevancy_score": 18.3
+      "relevancy_score": 17.9
     },
     {
       "architecture_id": "GQAGPT2",
       "total_models": 1,
-      "total_downloads": 2637,
+      "total_downloads": 2699,
       "min_param_count": null,
       "sample_models": [
         "NamrataThakur/Small_Language_Model_GQA_48M_Pretrained"
       ],
-      "relevancy_score": 18.3
+      "relevancy_score": 17.9
     },
     {
-      "architecture_id": "MoEGPT2",
+      "architecture_id": "ThinkerLM",
       "total_models": 1,
-      "total_downloads": 2636,
+      "total_downloads": 2697,
       "min_param_count": null,
       "sample_models": [
-        "NamrataThakur/Small_Language_Model_MOE_127M_Pretrained"
+        "prskid1000/micro-Omni"
       ],
-      "relevancy_score": 18.3
+      "relevancy_score": 17.9
     },
     {
-      "architecture_id": "ThinkerLM",
+      "architecture_id": "CPMAntForCausalLM",
       "total_models": 1,
-      "total_downloads": 2627,
+      "total_downloads": 2693,
       "min_param_count": null,
       "sample_models": [
-        "prskid1000/micro-Omni"
+        "openbmb/cpm-ant-10b"
       ],
-      "relevancy_score": 18.2
+      "relevancy_score": 17.9
     },
     {
-      "architecture_id": "QHEARTForECGQA",
+      "architecture_id": "D3PMSanskritModel",
       "total_models": 1,
-      "total_downloads": 2624,
+      "total_downloads": 2676,
       "min_param_count": null,
       "sample_models": [
-        "Manhph2211/Q-HEART"
+        "bhsinghgrid/sanskrit-translation"
       ],
-      "relevancy_score": 18.2
+      "relevancy_score": 17.9
     },
     {
-      "architecture_id": "SeerAttnLlamaForCausalLM",
+      "architecture_id": "GuppyLM",
       "total_models": 1,
-      "total_downloads": 2618,
+      "total_downloads": 2655,
       "min_param_count": null,
       "sample_models": [
-        "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates"
+        "arman-bd/guppylm-9M"
       ],
-      "relevancy_score": 18.2
+      "relevancy_score": 17.9
     },
     {
-      "architecture_id": "D3PMSanskritModel",
+      "architecture_id": "MoEGPT2",
       "total_models": 1,
-      "total_downloads": 2603,
+      "total_downloads": 2552,
       "min_param_count": null,
       "sample_models": [
-        "bhsinghgrid/sanskrit-translation"
+        "NamrataThakur/Small_Language_Model_MOE_127M_Pretrained"
       ],
-      "relevancy_score": 18.2
+      "relevancy_score": 17.8
     },
     {
-      "architecture_id": "MoYiForCausalLM",
+      "architecture_id": "JiRackTernary1B",
       "total_models": 1,
-      "total_downloads": 2433,
+      "total_downloads": 2529,
       "min_param_count": null,
       "sample_models": [
-        "astanahub/alemllm"
+        "kgrabko/JiRackTernary_1b"
       ],
-      "relevancy_score": 18.1
+      "relevancy_score": 17.8
     },
     {
-      "architecture_id": "Eagle3DeepseekV2ForCausalLM",
+      "architecture_id": "Speech2TextTransformerForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2367,
+      "total_downloads": 2373,
       "min_param_count": null,
       "sample_models": [
-        "nvidia/Kimi-K2.5-Thinking-Eagle3"
+        "valhalla/s2t_mustc_multilinguial_medium"
       ],
-      "relevancy_score": 18.0
+      "relevancy_score": 17.6
     },
     {
-      "architecture_id": "Speech2TextTransformerForConditionalGeneration",
+      "architecture_id": "Eagle3DeepseekV2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 2304,
+      "total_downloads": 2367,
       "min_param_count": null,
       "sample_models": [
-        "valhalla/s2t_mustc_multilinguial_medium"
+        "nvidia/Kimi-K2.5-Thinking-Eagle3"
       ],
-      "relevancy_score": 18.0
+      "relevancy_score": 17.6
     },
     {
-      "architecture_id": "Videollama2Qwen2ForCausalLM",
+      "architecture_id": "GPTXForCausalLM",
       "total_models": 1,
-      "total_downloads": 2257,
+      "total_downloads": 2295,
       "min_param_count": null,
       "sample_models": [
-        "QuangTuan/MultiMood-7B-GRPO-VisualAudioText-Comp"
+        "AxiomicLabs/GPT-X-125M"
       ],
-      "relevancy_score": 17.9
+      "relevancy_score": 17.5
     },
     {
       "architecture_id": "WhisperMixStyleForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2206,
+      "total_downloads": 2258,
       "min_param_count": null,
       "sample_models": [
         "wago5090/mixstyle_multi-s"
       ],
-      "relevancy_score": 17.9
+      "relevancy_score": 17.5
     },
     {
-      "architecture_id": "AlinlightForCausalLM",
+      "architecture_id": "Videollama2Qwen2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 2193,
+      "total_downloads": 2223,
       "min_param_count": null,
       "sample_models": [
-        "EngineerGL/Alinlight"
+        "QuangTuan/MultiMood-7B-GRPO-VisualAudioText-Comp"
       ],
-      "relevancy_score": 17.8
+      "relevancy_score": 17.5
     },
     {
-      "architecture_id": "GuppyLM",
+      "architecture_id": "GPT",
+      "total_models": 2,
+      "total_downloads": 1600,
+      "min_param_count": null,
+      "sample_models": [
+        "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M",
+        "LH-Tech-AI/Apex-1.5-Instruct-350M"
+      ],
+      "relevancy_score": 17.4
+    },
+    {
+      "architecture_id": "SeerAttnLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 2179,
+      "total_downloads": 2171,
       "min_param_count": null,
       "sample_models": [
-        "arman-bd/guppylm-9M"
+        "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates"
       ],
-      "relevancy_score": 17.8
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "LlamaForCausalLMEagle",
+      "architecture_id": "Typhoon2Audio2AudioForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 2127,
+      "total_downloads": 2169,
       "min_param_count": null,
       "sample_models": [
-        "thunlp/LLaMA3-Instruct-8B-FR-Spec"
+        "typhoon-ai/llama3.1-typhoon2-audio-8b-instruct"
       ],
-      "relevancy_score": 17.8
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "JiRackTernary1B",
+      "architecture_id": "LlamaForCausalLMEagle",
       "total_models": 1,
-      "total_downloads": 2121,
+      "total_downloads": 2167,
       "min_param_count": null,
       "sample_models": [
-        "kgrabko/JiRackTernary_1b"
+        "thunlp/LLaMA3-Instruct-8B-FR-Spec"
       ],
-      "relevancy_score": 17.8
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "RuGPT3XLForCausalLM",
+      "architecture_id": "DenseLLM",
       "total_models": 1,
-      "total_downloads": 2110,
+      "total_downloads": 2167,
       "min_param_count": null,
       "sample_models": [
-        "evilfreelancer/ruGPT3XL"
+        "AlgoDriveAI/Sanskrit_Akkadian_LLM_v1.0"
       ],
-      "relevancy_score": 17.8
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "Typhoon2Audio2AudioForConditionalGeneration",
+      "architecture_id": "AlinlightForCausalLM",
       "total_models": 1,
-      "total_downloads": 2107,
+      "total_downloads": 2158,
       "min_param_count": null,
       "sample_models": [
-        "typhoon-ai/llama3.1-typhoon2-audio-8b-instruct"
+        "EngineerGL/Alinlight"
       ],
-      "relevancy_score": 17.8
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "PointLLMLlamaForCausalLM",
-      "total_models": 2,
-      "total_downloads": 1493,
+      "architecture_id": "TeleFLMForCausalLM",
+      "total_models": 1,
+      "total_downloads": 2150,
       "min_param_count": null,
       "sample_models": [
-        "RunsenXu/PointLLM_7B_v1.1_init",
-        "RunsenXu/PointLLM_7B_v1.2"
+        "CofeAI/Tele-FLM-1T"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "LlaMAForCausalLM",
+      "architecture_id": "TFGPT2LMHeadModel",
       "total_models": 1,
-      "total_downloads": 2097,
+      "total_downloads": 2140,
       "min_param_count": null,
       "sample_models": [
-        "circulus/alpaca-7b"
+        "mymusise/gpt2-medium-chinese"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "TeleFLMForCausalLM",
+      "architecture_id": "LlaMAForCausalLM",
       "total_models": 1,
-      "total_downloads": 2093,
+      "total_downloads": 2121,
       "min_param_count": null,
       "sample_models": [
-        "CofeAI/Tele-FLM-1T"
+        "circulus/alpaca-7b"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.4
     },
     {
       "architecture_id": "GeoVForCausalLM",
       "total_models": 1,
-      "total_downloads": 2090,
+      "total_downloads": 2118,
       "min_param_count": null,
       "sample_models": [
         "GeoV/GeoV-9b"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "TFGPT2LMHeadModel",
+      "architecture_id": "GPTModelForTextGeneration",
       "total_models": 1,
-      "total_downloads": 2076,
+      "total_downloads": 2107,
       "min_param_count": null,
       "sample_models": [
-        "mymusise/gpt2-medium-chinese"
+        "samkeet/GPT_124M-Instruct"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.4
     },
     {
-      "architecture_id": "RobertaPreLayerNormForCausalLM",
-      "total_models": 1,
-      "total_downloads": 2074,
+      "architecture_id": "IndexForCausalLM",
+      "total_models": 2,
+      "total_downloads": 1559,
       "min_param_count": null,
       "sample_models": [
-        "hf-tiny-model-private/tiny-random-RobertaPreLayerNormForCausalLM"
+        "IndexTeam/Index-1.9B-Chat",
+        "IndexTeam/Index-1.9B-Pure"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.3
     },
     {
       "architecture_id": "ElectraForCausalLM",
       "total_models": 1,
-      "total_downloads": 2071,
+      "total_downloads": 2103,
       "min_param_count": null,
       "sample_models": [
         "smeoni/nbme-electra-large-generator"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.3
     },
     {
-      "architecture_id": "GPTModelForTextGeneration",
+      "architecture_id": "PegasusForCausalLM",
       "total_models": 1,
-      "total_downloads": 2059,
+      "total_downloads": 2056,
       "min_param_count": null,
       "sample_models": [
-        "samkeet/GPT_124M-Instruct"
+        "hf-tiny-model-private/tiny-random-PegasusForCausalLM"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.3
     },
     {
-      "architecture_id": "PegasusForCausalLM",
+      "architecture_id": "RobertaPreLayerNormForCausalLM",
       "total_models": 1,
-      "total_downloads": 2032,
+      "total_downloads": 2047,
       "min_param_count": null,
       "sample_models": [
-        "hf-tiny-model-private/tiny-random-PegasusForCausalLM"
+        "hf-tiny-model-private/tiny-random-RobertaPreLayerNormForCausalLM"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.3
     },
     {
       "architecture_id": "BlenderbotForCausalLM",
       "total_models": 1,
-      "total_downloads": 2026,
+      "total_downloads": 2046,
       "min_param_count": null,
       "sample_models": [
         "hf-tiny-model-private/tiny-random-BlenderbotForCausalLM"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.3
     },
     {
-      "architecture_id": "DenseLLM",
+      "architecture_id": "XModelForCausalLM",
       "total_models": 1,
-      "total_downloads": 2011,
+      "total_downloads": 2037,
       "min_param_count": null,
       "sample_models": [
-        "AlgoDriveAI/Sanskrit_Akkadian_LLM_v1.0"
+        "XiaoduoAILab/Xmodel_LM"
       ],
-      "relevancy_score": 17.7
+      "relevancy_score": 17.3
     },
     {
-      "architecture_id": "OtterForConditionalGeneration",
-      "total_models": 2,
-      "total_downloads": 1460,
+      "architecture_id": "EnergyTransformer",
+      "total_models": 1,
+      "total_downloads": 2031,
       "min_param_count": null,
       "sample_models": [
-        "luodian/OTTER-Video-LLaMA7B-DenseCaption",
-        "luodian/OTTER-MPT1B-RPJama-Init"
+        "cccczshao/CALM-M"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.3
     },
     {
-      "architecture_id": "MonkeyLMHeadModel",
-      "total_models": 2,
-      "total_downloads": 1486,
+      "architecture_id": "MvpForCausalLM",
+      "total_models": 1,
+      "total_downloads": 2018,
       "min_param_count": null,
       "sample_models": [
-        "echo840/Monkey-Chat",
-        "echo840/Monkey"
+        "hf-tiny-model-private/tiny-random-MvpForCausalLM"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.3
     },
     {
-      "architecture_id": "IndexForCausalLM",
+      "architecture_id": "OtterForConditionalGeneration",
       "total_models": 2,
-      "total_downloads": 1467,
+      "total_downloads": 1473,
       "min_param_count": null,
       "sample_models": [
-        "IndexTeam/Index-1.9B-Chat",
-        "IndexTeam/Index-1.9B-Pure"
+        "luodian/OTTER-Video-LLaMA7B-DenseCaption",
+        "luodian/OTTER-MPT1B-RPJama-Init"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.2
     },
     {
-      "architecture_id": "EnergyTransformer",
-      "total_models": 1,
-      "total_downloads": 1991,
+      "architecture_id": "MonkeyLMHeadModel",
+      "total_models": 2,
+      "total_downloads": 1496,
       "min_param_count": null,
       "sample_models": [
-        "cccczshao/CALM-M"
+        "echo840/Monkey-Chat",
+        "echo840/Monkey"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.2
     },
     {
-      "architecture_id": "MvpForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1990,
+      "architecture_id": "PointLLMLlamaForCausalLM",
+      "total_models": 2,
+      "total_downloads": 1497,
       "min_param_count": null,
       "sample_models": [
-        "hf-tiny-model-private/tiny-random-MvpForCausalLM"
+        "RunsenXu/PointLLM_7B_v1.1_init",
+        "RunsenXu/PointLLM_7B_v1.2"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.2
     },
     {
-      "architecture_id": "XModelForCausalLM",
+      "architecture_id": "ConditionalGPT2LMHeadModel",
       "total_models": 1,
-      "total_downloads": 1987,
+      "total_downloads": 1991,
       "min_param_count": null,
       "sample_models": [
-        "XiaoduoAILab/Xmodel_LM"
+        "entropy/roberta_zinc_decoder"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.2
     },
     {
-      "architecture_id": "ConditionalGPT2LMHeadModel",
+      "architecture_id": "Qwen35ForCausalLM",
       "total_models": 1,
-      "total_downloads": 1951,
+      "total_downloads": 1971,
       "min_param_count": null,
       "sample_models": [
-        "entropy/roberta_zinc_decoder"
+        "JeffGreen311/Eve-V2-Unleashed-Qwen3.5-8B-Liberated-4K-4B-Merged"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.2
     },
     {
       "architecture_id": "DebertaV2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 1928,
+      "total_downloads": 1960,
       "min_param_count": null,
       "sample_models": [
         "ltg/deberta-xxlarge-fixed"
       ],
-      "relevancy_score": 17.6
+      "relevancy_score": 17.2
     },
     {
-      "architecture_id": "BTLMLMHeadModel",
-      "total_models": 2,
-      "total_downloads": 1403,
+      "architecture_id": "SpectusForConditionalGeneration",
+      "total_models": 1,
+      "total_downloads": 1945,
       "min_param_count": null,
       "sample_models": [
-        "cerebras/btlm-3b-8k-base",
-        "EleutherAI/Hermes-btlm-3b-8k"
+        "MS-ML/SpecTUS_pretrained_only"
       ],
-      "relevancy_score": 17.5
+      "relevancy_score": 17.2
     },
     {
       "architecture_id": "TelechatForCausalLM",
       "total_models": 2,
-      "total_downloads": 1363,
+      "total_downloads": 1426,
       "min_param_count": null,
       "sample_models": [
         "Tele-AI/telechat-7B",
         "Tele-AI/TeleChat-12B"
       ],
-      "relevancy_score": 17.5
-    },
-    {
-      "architecture_id": "SpectusForConditionalGeneration",
-      "total_models": 1,
-      "total_downloads": 1908,
-      "min_param_count": null,
-      "sample_models": [
-        "MS-ML/SpecTUS_pretrained_only"
-      ],
-      "relevancy_score": 17.5
+      "relevancy_score": 17.1
     },
     {
-      "architecture_id": "GPTXForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1871,
+      "architecture_id": "BTLMLMHeadModel",
+      "total_models": 2,
+      "total_downloads": 1437,
       "min_param_count": null,
       "sample_models": [
-        "AxiomicLabs/GPT-X-125m-15bt"
+        "cerebras/btlm-3b-8k-base",
+        "EleutherAI/Hermes-btlm-3b-8k"
       ],
-      "relevancy_score": 17.5
+      "relevancy_score": 17.1
     },
     {
       "architecture_id": "LSGBartForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 1843,
+      "total_downloads": 1871,
       "min_param_count": null,
       "sample_models": [
         "morenolq/LEGIT-BART-LSG-4096"
       ],
-      "relevancy_score": 17.5
+      "relevancy_score": 17.1
     },
     {
       "architecture_id": "CloverLMForCausalLM",
       "total_models": 1,
-      "total_downloads": 1822,
+      "total_downloads": 1833,
       "min_param_count": null,
       "sample_models": [
         "daslab-testing/CloverLM"
       ],
-      "relevancy_score": 17.4
-    },
-    {
-      "architecture_id": "MiniMaxText01ForCausalLM",
-      "total_models": 1,
-      "total_downloads": 1682,
-      "min_param_count": null,
-      "sample_models": [
-        "MiniMaxAI/MiniMax-Text-01"
-      ],
-      "relevancy_score": 17.3
+      "relevancy_score": 17.0
     },
     {
       "architecture_id": "LlavaCrystalForCausalLM",
       "total_models": 1,
-      "total_downloads": 1586,
+      "total_downloads": 1614,
       "min_param_count": null,
       "sample_models": [
         "LLM360/CrystalChat-7B-Web2Code"
       ],
-      "relevancy_score": 17.1
+      "relevancy_score": 16.8
     },
     {
-      "architecture_id": "MobileLLMForCausalLM",
+      "architecture_id": "InternLM2ForRewardModel",
       "total_models": 1,
-      "total_downloads": 1585,
+      "total_downloads": 1562,
       "min_param_count": null,
       "sample_models": [
-        "facebook/MobileLLM-125M"
+        "internlm/internlm2_5-step-prover-critic"
       ],
-      "relevancy_score": 17.1
+      "relevancy_score": 16.7
     },
     {
       "architecture_id": "MobilintEagle3Qwen2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 1541,
+      "total_downloads": 1543,
       "min_param_count": null,
       "sample_models": [
         "mobilint/EAGLE3-JPharmatron-7B"
       ],
-      "relevancy_score": 17.1
+      "relevancy_score": 16.7
     },
     {
-      "architecture_id": "InternLM2ForRewardModel",
-      "total_models": 1,
-      "total_downloads": 1527,
+      "architecture_id": "MPTForCausalLM",
+      "total_models": 2,
+      "total_downloads": 1115,
       "min_param_count": null,
       "sample_models": [
-        "internlm/internlm2_5-step-prover-critic"
+        "hyungtae/mpt-30b",
+        "manojpreveen/mpt-30b-v5"
       ],
-      "relevancy_score": 17.0
+      "relevancy_score": 16.6
     },
     {
-      "architecture_id": "Qwen35ForCausalLM",
+      "architecture_id": "MobileLLMForCausalLM",
       "total_models": 1,
-      "total_downloads": 1512,
+      "total_downloads": 1522,
       "min_param_count": null,
       "sample_models": [
-        "JeffGreen311/Eve-V2-Unleashed-Qwen3.5-8B-Liberated-4K-4B-Merged"
+        "facebook/MobileLLM-125M"
       ],
-      "relevancy_score": 17.0
+      "relevancy_score": 16.6
     },
     {
       "architecture_id": "GeoChatLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 1454,
+      "total_downloads": 1416,
       "min_param_count": null,
       "sample_models": [
         "MBZUAI/geochat-7B"
       ],
-      "relevancy_score": 16.9
+      "relevancy_score": 16.5
     },
     {
-      "architecture_id": "MochivaForCausalLM",
+      "architecture_id": "Qwen3VLMoeForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 1438,
-      "min_param_count": null,
+      "total_downloads": 1385,
+      "min_param_count": 31070754032,
       "sample_models": [
-        "Mochiva-team/Mochiva-model"
+        "Oysiyl/qwen3-vl-30b-a3b-unslop-good-lora-v1"
       ],
-      "relevancy_score": 16.9
+      "relevancy_score": 16.4
     },
     {
       "architecture_id": "HeliumForCausalLM",
       "total_models": 1,
-      "total_downloads": 1363,
+      "total_downloads": 1309,
       "min_param_count": null,
       "sample_models": [
         "kyutai/helium-1-preview-2b"
       ],
-      "relevancy_score": 16.8
+      "relevancy_score": 16.3
     },
     {
       "architecture_id": "JiRackTernaryModel",
       "total_models": 1,
-      "total_downloads": 1289,
+      "total_downloads": 1292,
       "min_param_count": null,
       "sample_models": [
         "kgrabko/JiRackTernary_70b"
       ],
-      "relevancy_score": 16.7
+      "relevancy_score": 16.3
+    },
+    {
+      "architecture_id": "Papagan",
+      "total_models": 1,
+      "total_downloads": 1216,
+      "min_param_count": null,
+      "sample_models": [
+        "SutskeverFanBoy/papagan_1.3b"
+      ],
+      "relevancy_score": 16.2
     },
     {
       "architecture_id": "PolyLMHeadModel",
       "total_models": 1,
-      "total_downloads": 1195,
+      "total_downloads": 1129,
       "min_param_count": null,
       "sample_models": [
         "DAMO-NLP-MT/polylm-13b"
       ],
-      "relevancy_score": 16.5
+      "relevancy_score": 16.0
     },
     {
       "architecture_id": "CambrianLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 1073,
+      "total_downloads": 1124,
       "min_param_count": null,
       "sample_models": [
         "nyu-visionx/cambrian-8b"
       ],
-      "relevancy_score": 16.3
+      "relevancy_score": 16.0
     },
     {
-      "architecture_id": "LlamaModel",
+      "architecture_id": "ErnieForCausalLM",
       "total_models": 1,
-      "total_downloads": 1034,
-      "min_param_count": 33930165248,
+      "total_downloads": 1018,
+      "min_param_count": null,
       "sample_models": [
-        "ngoan/NgoanYi"
+        "mohitsha/tiny-ernie-random-remote-code"
       ],
-      "relevancy_score": 16.2
+      "relevancy_score": 15.8
     },
     {
-      "architecture_id": "TransnormerForCausalLM",
+      "architecture_id": "Qwen3_5MoeForCausalLM",
       "total_models": 1,
-      "total_downloads": 1030,
-      "min_param_count": null,
+      "total_downloads": 1000,
+      "min_param_count": 122111526912,
       "sample_models": [
-        "OpenNLPLab/TransNormerLLM-385M"
+        "wangzhang/Qwen3.5-122B-A10B-abliterix"
       ],
-      "relevancy_score": 16.2
+      "relevancy_score": 15.7
     },
     {
-      "architecture_id": "Qwen3VLMoeForConditionalGeneration",
+      "architecture_id": "XMistralForCausalLM",
       "total_models": 1,
-      "total_downloads": 997,
-      "min_param_count": 31070754032,
+      "total_downloads": 984,
+      "min_param_count": null,
       "sample_models": [
-        "Oysiyl/qwen3-vl-30b-a3b-unslop-good-lora-v1"
+        "Hannibal046/xrag-7b"
       ],
-      "relevancy_score": 16.1
+      "relevancy_score": 15.7
     },
     {
-      "architecture_id": "KimiK25ForConditionalGeneration",
+      "architecture_id": "TransnormerForCausalLM",
       "total_models": 1,
-      "total_downloads": 988,
-      "min_param_count": 91383180528,
+      "total_downloads": 973,
+      "min_param_count": null,
       "sample_models": [
-        "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B"
+        "OpenNLPLab/TransNormerLLM-385M"
       ],
-      "relevancy_score": 16.1
+      "relevancy_score": 15.7
     },
     {
-      "architecture_id": "ErnieForCausalLM",
+      "architecture_id": "YiForCausalLM",
       "total_models": 1,
-      "total_downloads": 982,
+      "total_downloads": 955,
       "min_param_count": null,
       "sample_models": [
-        "mohitsha/tiny-ernie-random-remote-code"
+        "llmware/dragon-yi-6b-v0"
       ],
-      "relevancy_score": 16.1
+      "relevancy_score": 15.6
     },
     {
-      "architecture_id": "ShikraLlamaForCausalLM",
+      "architecture_id": "SOVYN85M",
       "total_models": 1,
-      "total_downloads": 950,
+      "total_downloads": 949,
       "min_param_count": null,
       "sample_models": [
-        "shikras/shikra-7b-delta-v1"
+        "SOVYN/SOVYN-85M"
       ],
-      "relevancy_score": 16.0
+      "relevancy_score": 15.6
     },
     {
-      "architecture_id": "YiForCausalLM",
+      "architecture_id": "LlamaModel",
+      "total_models": 1,
+      "total_downloads": 948,
+      "min_param_count": 33930165248,
+      "sample_models": [
+        "ngoan/NgoanYi"
+      ],
+      "relevancy_score": 15.6
+    },
+    {
+      "architecture_id": "ShikraLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 939,
+      "total_downloads": 928,
       "min_param_count": null,
       "sample_models": [
-        "llmware/dragon-yi-6b-v0"
+        "shikras/shikra-7b-delta-v1"
       ],
-      "relevancy_score": 16.0
+      "relevancy_score": 15.6
     },
     {
       "architecture_id": "CpmBeeForCausalLM",
       "total_models": 1,
-      "total_downloads": 895,
+      "total_downloads": 911,
       "min_param_count": null,
       "sample_models": [
         "openbmb/cpm-bee-10b"
       ],
-      "relevancy_score": 15.8
+      "relevancy_score": 15.5
     },
     {
       "architecture_id": "ZsGPT2LMHeadModel",
       "total_models": 1,
-      "total_downloads": 882,
+      "total_downloads": 902,
       "min_param_count": null,
       "sample_models": [
         "claritylab/zero-shot-vanilla-gpt2"
       ],
-      "relevancy_score": 15.8
+      "relevancy_score": 15.5
     },
     {
       "architecture_id": "HumanGPTForCausalLM",
       "total_models": 1,
-      "total_downloads": 868,
+      "total_downloads": 876,
       "min_param_count": null,
       "sample_models": [
         "YaoFeng/CHATPOSE-V0"
       ],
-      "relevancy_score": 15.8
+      "relevancy_score": 15.4
     },
     {
       "architecture_id": "Phi4FlashForCausalLM",
       "total_models": 1,
-      "total_downloads": 809,
+      "total_downloads": 839,
       "min_param_count": null,
       "sample_models": [
         "microsoft/Phi-4-mini-flash-reasoning"
       ],
-      "relevancy_score": 15.6
+      "relevancy_score": 15.3
     },
     {
-      "architecture_id": "XMistralForCausalLM",
+      "architecture_id": "KimiK25ForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 796,
-      "min_param_count": null,
+      "total_downloads": 824,
+      "min_param_count": 91383180528,
       "sample_models": [
-        "Hannibal046/xrag-7b"
+        "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B"
       ],
-      "relevancy_score": 15.6
+      "relevancy_score": 15.3
     },
     {
       "architecture_id": "FlamingoForCausalLM",
       "total_models": 1,
-      "total_downloads": 791,
+      "total_downloads": 820,
       "min_param_count": null,
       "sample_models": [
         "babylm/flamingo-2024"
       ],
-      "relevancy_score": 15.6
+      "relevancy_score": 15.3
     },
     {
-      "architecture_id": "VStreamLlamaForCausalLM",
+      "architecture_id": "AquilaDenseForCausalLM",
       "total_models": 1,
-      "total_downloads": 762,
+      "total_downloads": 820,
       "min_param_count": null,
       "sample_models": [
-        "IVGSZ/Flash-VStream-7b"
+        "BAAI/AquilaDense-7B"
       ],
-      "relevancy_score": 15.5
+      "relevancy_score": 15.3
     },
     {
-      "architecture_id": "AquilaDenseForCausalLM",
+      "architecture_id": "EmuForCausalLM",
       "total_models": 1,
-      "total_downloads": 759,
+      "total_downloads": 795,
       "min_param_count": null,
       "sample_models": [
-        "BAAI/AquilaDense-7B"
+        "BAAI/Emu2-Chat"
       ],
-      "relevancy_score": 15.5
+      "relevancy_score": 15.2
     },
     {
-      "architecture_id": "EmuForCausalLM",
+      "architecture_id": "VStreamLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 747,
+      "total_downloads": 780,
       "min_param_count": null,
       "sample_models": [
-        "BAAI/Emu2-Chat"
+        "IVGSZ/Flash-VStream-7b"
       ],
-      "relevancy_score": 15.4
+      "relevancy_score": 15.2
     },
     {
       "architecture_id": "MoELLaVAQWenForCausalLM",
       "total_models": 1,
-      "total_downloads": 728,
+      "total_downloads": 729,
       "min_param_count": null,
       "sample_models": [
         "LanguageBind/MoE-LLaVA-Qwen-1.8B-4e"
       ],
-      "relevancy_score": 15.4
+      "relevancy_score": 15.0
     },
     {
       "architecture_id": "YayiForCausalLM",
       "total_models": 1,
-      "total_downloads": 713,
+      "total_downloads": 724,
       "min_param_count": null,
       "sample_models": [
         "wenge-research/yayi2-30b"
       ],
-      "relevancy_score": 15.3
+      "relevancy_score": 15.0
+    },
+    {
+      "architecture_id": "STLlamaForCausalLM",
+      "total_models": 1,
+      "total_downloads": 723,
+      "min_param_count": null,
+      "sample_models": [
+        "bjdwh/UrbanGPT"
+      ],
+      "relevancy_score": 15.0
     },
     {
       "architecture_id": "SkyworkForCausalLM",
@@ -4440,57 +4563,67 @@
       "sample_models": [
         "Skywork/Skywork-13B-base"
       ],
-      "relevancy_score": 15.3
+      "relevancy_score": 14.9
     },
     {
       "architecture_id": "MobiLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 661,
+      "total_downloads": 667,
       "min_param_count": null,
       "sample_models": [
         "MBZUAI/MobiLlama-05B"
       ],
-      "relevancy_score": 15.2
+      "relevancy_score": 14.8
     },
     {
-      "architecture_id": "HebrewGPTForCausalLM",
+      "architecture_id": "JapaneseStableLMAlphaForCausalLM",
       "total_models": 1,
-      "total_downloads": 643,
+      "total_downloads": 656,
       "min_param_count": null,
       "sample_models": [
-        "Slasky/HebrewGPT-1B"
+        "stabilityai/japanese-stablelm-base-alpha-7b"
       ],
-      "relevancy_score": 15.1
+      "relevancy_score": 14.8
     },
     {
       "architecture_id": "GPTBigCodeLMHeadModel",
       "total_models": 1,
-      "total_downloads": 638,
+      "total_downloads": 654,
       "min_param_count": null,
       "sample_models": [
         "bigcode/santacoderpack"
       ],
-      "relevancy_score": 15.1
+      "relevancy_score": 14.8
+    },
+    {
+      "architecture_id": "SDARMoeForCausalLM",
+      "total_models": 1,
+      "total_downloads": 653,
+      "min_param_count": 30532122624,
+      "sample_models": [
+        "JetLM/SDAR-30B-A3B-Chat-b32"
+      ],
+      "relevancy_score": 14.8
     },
     {
       "architecture_id": "GPTJiangForCausalLM",
       "total_models": 1,
-      "total_downloads": 627,
+      "total_downloads": 650,
       "min_param_count": null,
       "sample_models": [
         "kdf/jiang-base"
       ],
-      "relevancy_score": 15.1
+      "relevancy_score": 14.8
     },
     {
-      "architecture_id": "JapaneseStableLMAlphaForCausalLM",
+      "architecture_id": "HebrewGPTForCausalLM",
       "total_models": 1,
-      "total_downloads": 627,
+      "total_downloads": 646,
       "min_param_count": null,
       "sample_models": [
-        "stabilityai/japanese-stablelm-base-alpha-7b"
+        "Slasky/HebrewGPT-1B"
       ],
-      "relevancy_score": 15.1
+      "relevancy_score": 14.8
     },
     {
       "architecture_id": "BunnyQwenForCausalLM",
@@ -4500,167 +4633,167 @@
       "sample_models": [
         "dphn/dolphin-vision-72b"
       ],
-      "relevancy_score": 15.0
+      "relevancy_score": 14.7
     },
     {
-      "architecture_id": "SDARMoeForCausalLM",
+      "architecture_id": "GrokForCausalLM",
       "total_models": 1,
       "total_downloads": 619,
-      "min_param_count": 30532122624,
+      "min_param_count": null,
       "sample_models": [
-        "JetLM/SDAR-30B-A3B-Chat-b32"
+        "keyfan/grok-1-hf"
       ],
-      "relevancy_score": 15.0
+      "relevancy_score": 14.7
     },
     {
-      "architecture_id": "STLlamaForCausalLM",
+      "architecture_id": "LongcatFlashNgramForCausalLM",
       "total_models": 1,
-      "total_downloads": 617,
+      "total_downloads": 615,
       "min_param_count": null,
       "sample_models": [
-        "bjdwh/UrbanGPT"
+        "meituan-longcat/LongCat-Flash-Lite"
       ],
-      "relevancy_score": 15.0
+      "relevancy_score": 14.7
     },
     {
-      "architecture_id": "GrokForCausalLM",
+      "architecture_id": "LingoWhaleForCausalLM",
       "total_models": 1,
-      "total_downloads": 606,
+      "total_downloads": 595,
       "min_param_count": null,
       "sample_models": [
-        "keyfan/grok-1-hf"
+        "deeplang-ai/LingoWhale-8B"
       ],
-      "relevancy_score": 15.0
+      "relevancy_score": 14.6
     },
     {
       "architecture_id": "Llama2ForCausalLM",
       "total_models": 1,
-      "total_downloads": 590,
+      "total_downloads": 592,
       "min_param_count": null,
       "sample_models": [
         "llmware/dragon-llama-7b-v0"
       ],
-      "relevancy_score": 14.9
+      "relevancy_score": 14.6
     },
     {
       "architecture_id": "MPLUGOwl2LlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 589,
+      "total_downloads": 592,
       "min_param_count": null,
       "sample_models": [
         "q-future/q-align-quality"
       ],
-      "relevancy_score": 14.9
+      "relevancy_score": 14.6
     },
     {
       "architecture_id": "GLaMMForCausalLM",
       "total_models": 1,
-      "total_downloads": 587,
+      "total_downloads": 585,
       "min_param_count": null,
       "sample_models": [
         "MBZUAI/GLaMM-FullScope"
       ],
-      "relevancy_score": 14.9
+      "relevancy_score": 14.6
     },
     {
-      "architecture_id": "LingoWhaleForCausalLM",
+      "architecture_id": "OLMoModelForCausalLM",
       "total_models": 1,
-      "total_downloads": 583,
+      "total_downloads": 585,
       "min_param_count": null,
       "sample_models": [
-        "deeplang-ai/LingoWhale-8B"
+        "NousResearch/OLMo-Bitnet-1B"
       ],
-      "relevancy_score": 14.9
+      "relevancy_score": 14.6
     },
     {
       "architecture_id": "OpenBAForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 581,
+      "total_downloads": 576,
       "min_param_count": null,
       "sample_models": [
         "OpenNLG/OpenBA-V1-Based"
       ],
-      "relevancy_score": 14.9
+      "relevancy_score": 14.5
     },
     {
-      "architecture_id": "OLMoModelForCausalLM",
+      "architecture_id": "GPTJXForCausalLM",
       "total_models": 1,
-      "total_downloads": 573,
+      "total_downloads": 574,
       "min_param_count": null,
       "sample_models": [
-        "NousResearch/OLMo-Bitnet-1B"
+        "KnutJaegersberg/GPT-JX-3b"
       ],
-      "relevancy_score": 14.9
+      "relevancy_score": 14.5
     },
     {
-      "architecture_id": "GPTJXForCausalLM",
+      "architecture_id": "LlavaStableLMEpochForCausalLM",
       "total_models": 1,
-      "total_downloads": 566,
+      "total_downloads": 540,
       "min_param_count": null,
       "sample_models": [
-        "KnutJaegersberg/GPT-JX-3b"
+        "NousResearch/Obsidian-3B-V0.5"
       ],
-      "relevancy_score": 14.8
+      "relevancy_score": 14.4
     },
     {
-      "architecture_id": "Qwen3_5MoeForCausalLM",
+      "architecture_id": "AprielHForCausalLM",
       "total_models": 1,
-      "total_downloads": 562,
-      "min_param_count": 122111526912,
+      "total_downloads": 538,
+      "min_param_count": null,
       "sample_models": [
-        "wangzhang/Qwen3.5-122B-A10B-abliterix"
+        "ServiceNow-AI/Apriel-H1-15b-Thinker-SFT"
       ],
-      "relevancy_score": 14.8
+      "relevancy_score": 14.4
     },
     {
-      "architecture_id": "LlavaStableLMEpochForCausalLM",
+      "architecture_id": "CacaForCausalLM",
       "total_models": 1,
-      "total_downloads": 547,
+      "total_downloads": 530,
       "min_param_count": null,
       "sample_models": [
-        "NousResearch/Obsidian-3B-V0.5"
+        "Lyon28/caca-1B-untrained"
       ],
-      "relevancy_score": 14.7
+      "relevancy_score": 14.3
     },
     {
-      "architecture_id": "VSMForCausalLM",
+      "architecture_id": "M2M100ForConditionalGeneration",
       "total_models": 1,
-      "total_downloads": 544,
+      "total_downloads": 529,
       "min_param_count": null,
       "sample_models": [
-        "craigwu/seal_vsm_7b"
+        "dsfsi/nso-en-m2m100-gov"
       ],
-      "relevancy_score": 14.7
+      "relevancy_score": 14.3
     },
     {
-      "architecture_id": "LlavaSearchLlamaForCausalLM",
+      "architecture_id": "HgrnForCausalLM",
       "total_models": 1,
-      "total_downloads": 543,
+      "total_downloads": 526,
       "min_param_count": null,
       "sample_models": [
-        "craigwu/seal_vqa_7b"
+        "OpenNLPLab/HGRN-150M"
       ],
-      "relevancy_score": 14.7
+      "relevancy_score": 14.3
     },
     {
-      "architecture_id": "AprielHForCausalLM",
+      "architecture_id": "LlavaSearchLlamaForCausalLM",
       "total_models": 1,
-      "total_downloads": 530,
+      "total_downloads": 524,
       "min_param_count": null,
       "sample_models": [
-        "ServiceNow-AI/Apriel-H1-15b-Thinker-SFT"
+        "craigwu/seal_vqa_7b"
       ],
-      "relevancy_score": 14.7
+      "relevancy_score": 14.3
     },
     {
-      "architecture_id": "LlavaMistralForCausalLM",
+      "architecture_id": "SeerAttnQwen3ForCausalLM",
       "total_models": 1,
-      "total_downloads": 527,
+      "total_downloads": 523,
       "min_param_count": null,
       "sample_models": [
-        "NousResearch/Nous-Hermes-2-Vision-Alpha"
+        "jiwonsong/SeerAttention-Qwen3-8B-AttnGates"
       ],
-      "relevancy_score": 14.7
+      "relevancy_score": 14.3
     },
     {
       "architecture_id": "MedHemoModel",
@@ -4670,27 +4803,27 @@
       "sample_models": [
         "amewebstudio/medhemo-earcp"
       ],
-      "relevancy_score": 14.6
+      "relevancy_score": 14.3
     },
     {
-      "architecture_id": "HgrnForCausalLM",
+      "architecture_id": "VSMForCausalLM",
       "total_models": 1,
-      "total_downloads": 513,
+      "total_downloads": 521,
       "min_param_count": null,
       "sample_models": [
-        "OpenNLPLab/HGRN-150M"
+        "craigwu/seal_vsm_7b"
       ],
-      "relevancy_score": 14.6
+      "relevancy_score": 14.3
     },
     {
-      "architecture_id": "M2M100ForConditionalGeneration",
+      "architecture_id": "LlavaMistralForCausalLM",
       "total_models": 1,
-      "total_downloads": 501,
+      "total_downloads": 510,
       "min_param_count": null,
       "sample_models": [
-        "dsfsi/nso-en-m2m100-gov"
+        "NousResearch/Nous-Hermes-2-Vision-Alpha"
       ],
-      "relevancy_score": 14.6
+      "relevancy_score": 14.3
     }
   ]
 }
\ No newline at end of file
diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json
index c037dedfa..9c9f8a24f 100644
--- a/transformer_lens/tools/model_registry/data/supported_models.json
+++ b/transformer_lens/tools/model_registry/data/supported_models.json
@@ -1,22 +1,22 @@
 {
-  "generated_at": "2026-04-10",
+  "generated_at": "2026-04-14",
   "scan_info": {
-    "total_scanned": 5436,
+    "total_scanned": 5633,
     "task_filter": "text-generation",
     "min_downloads": 500,
-    "scan_duration_seconds": 3.9
+    "scan_duration_seconds": 4.2
   },
-  "total_architectures": 43,
-  "total_models": 7006,
-  "total_verified": 704,
+  "total_architectures": 47,
+  "total_models": 7426,
+  "total_verified": 706,
   "models": [
     {
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "Qwen/Qwen3-Next-80B-A3B-Instruct",
       "status": 2,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Estimated 6929.6 GB exceeds 96.0 GB limit",
+      "note": "Estimated 708.8 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -28,9 +28,9 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "unsloth/Qwen3-Coder-Next",
       "status": 2,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Estimated 6929.6 GB exceeds 96.0 GB limit",
+      "note": "Estimated 708.8 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -42,9 +42,9 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "Qwen/Qwen3-Next-80B-A3B-Thinking",
       "status": 2,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Estimated 6929.6 GB exceeds 96.0 GB limit",
+      "note": "Estimated 708.8 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -56,7 +56,7 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "tiny-random/qwen3-next-moe",
       "status": 1,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
       "note": "Full verification completed",
       "phase1_score": 100.0,
@@ -70,7 +70,7 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next",
       "status": 1,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
       "note": "Full verification completed",
       "phase1_score": 100.0,
@@ -84,7 +84,7 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "yujiepan/qwen3-next-moe-tiny-random",
       "status": 1,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
       "note": "Full verification completed",
       "phase1_score": 100.0,
@@ -98,9 +98,9 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "huihui-ai/Huihui-Qwen3-Coder-Next-abliterated",
       "status": 2,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Estimated 6929.6 GB exceeds 96.0 GB limit",
+      "note": "Estimated 708.8 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -112,9 +112,9 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "Qwen/Qwen3-Coder-Next-Base",
       "status": 2,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Estimated 6929.6 GB exceeds 96.0 GB limit",
+      "note": "Estimated 708.8 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -126,9 +126,9 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "bknyaz/Qwen3-Coder-Next-REAM",
       "status": 2,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Estimated 5201.5 GB exceeds 96.0 GB limit",
+      "note": "Estimated 535.9 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -140,7 +140,7 @@
       "architecture_id": "Qwen3NextForCausalLM",
       "model_id": "Qwen/Qwen3-Coder-Next",
       "status": 2,
-      "verified_date": "2026-04-10",
+      "verified_date": "2026-04-15",
       "metadata": {
         "downloads": 664116,
         "likes": 0,
@@ -153,7 +153,7 @@
         ],
         "parameter_count": 79674391296
       },
-      "note": "Estimated 6929.6 GB exceeds 96.0 GB limit",
+      "note": "Estimated 708.8 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -1999,9 +1999,9 @@
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-h-small",
       "status": 2,
-      "verified_date": "2026-03-17",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Estimated 135.9 GB exceeds 75.2 GB limit",
+      "note": "Estimated 270.8 GB exceeds 96.0 GB limit",
       "phase1_score": null,
       "phase2_score": null,
       "phase3_score": null,
@@ -2362,14 +2362,14 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-micro-base",
-      "status": 0,
-      "verified_date": null,
+      "status": 1,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": null,
-      "phase1_score": null,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 88.7,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -2573,13 +2573,13 @@
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-micro",
       "status": 1,
-      "verified_date": "2026-03-17",
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Core verification completed",
+      "note": "Full verification completed",
       "phase1_score": 100.0,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": 72.2,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 98.9,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -3342,14 +3342,14 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-h-tiny",
-      "status": 1,
-      "verified_date": "2026-03-17",
+      "status": 3,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Core verification completed",
-      "phase1_score": 100.0,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": 77.5,
+      "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)",
+      "phase1_score": 50.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 96.6,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -4140,13 +4140,13 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-h-micro",
-      "status": 1,
-      "verified_date": "2026-03-17",
+      "status": 3,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Core verification completed",
-      "phase1_score": 100.0,
-      "phase2_score": null,
-      "phase3_score": null,
+      "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)",
+      "phase1_score": 50.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
       "phase4_score": 98.2,
       "phase7_score": null,
       "phase8_score": null
@@ -4182,14 +4182,14 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-tiny-preview",
-      "status": 1,
-      "verified_date": "2026-03-17",
+      "status": 3,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": "Core verification completed",
-      "phase1_score": 100.0,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": 97.4,
+      "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)",
+      "phase1_score": 50.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 98.7,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -4280,14 +4280,14 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-350m",
-      "status": 0,
-      "verified_date": null,
+      "status": 1,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": null,
-      "phase1_score": null,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": null,
+      "note": "Full verification completed with issues: P2=91.7% (failed: generation)",
+      "phase1_score": 100.0,
+      "phase2_score": 91.7,
+      "phase3_score": 100.0,
+      "phase4_score": 94.7,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -5344,14 +5344,14 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-h-1b",
-      "status": 0,
-      "verified_date": null,
+      "status": 3,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": null,
-      "phase1_score": null,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": null,
+      "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)",
+      "phase1_score": 50.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 72.2,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -5540,14 +5540,14 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-h-350m",
-      "status": 0,
-      "verified_date": null,
+      "status": 3,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": null,
-      "phase1_score": null,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": null,
+      "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 56/243 components failed (56 critical)",
+      "phase1_score": 50.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 94.8,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -5946,14 +5946,14 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-1b",
-      "status": 0,
-      "verified_date": null,
+      "status": 1,
+      "verified_date": "2026-04-15",
       "metadata": null,
-      "note": null,
-      "phase1_score": null,
-      "phase2_score": null,
-      "phase3_score": null,
-      "phase4_score": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 100.0,
       "phase7_score": null,
       "phase8_score": null
     },
@@ -8186,11 +8186,11 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "onnx-community/granite-4.0-350m-ONNX-web",
-      "status": 0,
-      "verified_date": null,
+      "status": 3,
+      "verified_date": "2026-04-14",
       "metadata": null,
-      "note": null,
-      "phase1_score": null,
+      "note": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: onnx-community/granite-4.0-350m-ONNX-web does not appear to have a file named pytorch_model.bin or model",
+      "phase1_score": 0.0,
       "phase2_score": null,
       "phase3_score": null,
       "phase4_score": null,
@@ -8746,11 +8746,11 @@
     {
       "architecture_id": "GraniteMoeHybridForCausalLM",
       "model_id": "ibm-granite/granite-4.0-350m-base",
-      "status": 0,
-      "verified_date": null,
+      "status": 3,
+      "verified_date": "2026-04-14",
       "metadata": null,
-      "note": null,
-      "phase1_score": null,
+      "note": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'in_proj'",
+      "phase1_score": 0.0,
       "phase2_score": null,
       "phase3_score": null,
       "phase4_score": null,
@@ -99602,6 +99602,4934 @@
       "phase4_score": null,
       "phase7_score": null,
       "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "contextboxai/Qwen3-1.7B-FC",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "tomg-group-umd/DynaGuard-1.7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "principled-intelligence/Qwen3.5-9B-text-only",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "kai-os/Carnice-9b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "activeDap/Qwen3-1.7B_hh_harmful",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "GoodStartLabs/gin-rummy-hbc-qwen3.5-0.8b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "bigatuna/Qwen3-1.7B-Sushi-Coder",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "prefeitura-rio/Rio-3.0-Open-Mini",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "pthinc/Cicikus_v4_0.3B_Pitircik",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "joekarim/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-foxy_peckish_pigeon",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_11_13_31",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_11_13_41",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "canoplos/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-soft_gilded_alligator",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_11_13_52",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "aifeifei798/Darkidol-Ballad-27B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "justindal/llama3.1-8b-leetcoder",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-4b-code-forged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "numnum1/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-reclusive_mangy_zebra",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "brocchirodrigo/anotaai-ajuda-qwen3_5_Q4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "VladShash/deepseek-math-7b-lean-prover-dpo-olmo-3",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "sandbagging-games/cedar",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "evolai/evolai_qwen_9B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_10_07_50",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_10_07_53",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_10_07_47",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "RMCian/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-fast_rabid_ram",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "Phonsiri/Qwen3.5-9B-Thai-Law-Base",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "helly777/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-pudgy_dormant_salmon",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "lukey03/Qwen3.5-9B-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Asib1/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-pensive_leggy_ant",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "aifeifei798/Darkidol-Ballad-9B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "heyalexchoi/qwen3-1.7b-math-grpo",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Loty1/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-rugged_trotting_puffin",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "aifeifei798/Darkidol-Catgirl-9B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-27b-code-forged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "justindal/llama3.1-8b-instruct",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "0xsage/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-foxy_slender_slug",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_12_13_14",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "ricdomolm/mini-coder-1.7b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_12_13_17",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Naphula/Cthulhu-70B-v1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "small-models-for-glam/Qwen3.5-0.8B-SFT-name-parser-yaml",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-4b-code-128k-forged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "heisengert/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-stalking_polished_seahorse",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "wangzhang/Qwen3.5-27B-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-27b-code-forged-defragged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "squ11z1/claude-oss",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-4b-code-forged-defragged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "bungamawar/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-dense_alert_turkey",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MixtralForCausalLM",
+      "model_id": "continuum-ai/mixtral-8x7b-instruct-compacted-conservative",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "kai-os/Carnice-27b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "alamios/Mistral-Small-3.1-DRAFT-0.5B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "small-models-for-glam/Qwen3.5-2B-SFT-name-parser-yaml",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "principled-intelligence/Qwen3.5-2B-text-only",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "TommyChien/memorag-qwen2-7b-inst",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "stratosphere/qwen2.5-1.5b-slips-immune-summarization",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Hiroshi19781111/ichiyanagi-qwen-14b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "ragav4075/room_service_action_gemma",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "vrutkovs/Lusterka-7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "RefalMachine/RuadaptQwen2.5-32B-Pro-Beta",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "afroneko/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-smooth_patterned_tortoise",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "Harsh2026verma/code-generator-model",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "cs1090b/hw5-part3-sft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "XSCP/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-endangered_lively_eel",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "PWLabs/Damork",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "h34v7/Qwanko3.5-27B-V2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Hippocrene/MiniLLM-0.1B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "yosef-samy019/gpt-face-celeb-generator",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "kenny2021/episodic-lora-grpo2-merged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "heyalexchoi/qwen3-1.7b-math-sft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MambaForCausalLM",
+      "model_id": "batteryphil/mamba-2.8b-latent",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Hyeongwon/P2-split2_prob_rg_Qwen3-4B-Base",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "AtaaJL/MediBot_Final",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "haedahae/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-hoarse_hairy_lion",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "principled-intelligence/Qwen3.5-0.8B-text-only",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "HCY123902/mistral-7b-inst-dpo-on-p-tw7-beta-1e-0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "empero-ai/Qwen3.5-9B-Claude-Code",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "W-61/llama-3-8b-base-sft-ultrachat-8xh200",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "baddddddddd/llama-85m-unigram-16k",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "LorenaYannnnn/bold_formatting-Qwen3-0.6B-OURS_self-seed_0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Dominic/smollm135_fullprec_tinystories",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "cs1090b/hw5-part2-domain-adapted",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Priyangshu-2003/MediBridge-II-Medical-8B-1706-FineTuned",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "noobmaster6009/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-deadly_sturdy_parrot",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "123aloo123/BitNet-GPT2-125M-Ternary",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "LorenaYannnnn/bold_formatting-Qwen3-0.6B-baseline_all_tokens-seed_0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "empero-ai/Qwen3.5-9B-Claude-Opus-4.6-Distill",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-9b-general-forged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "ericflo/Llama-3.1-8B-ContinuedTraining2-FFT",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Writer/palmyra-mini",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "Verdugie/STEM-Oracle-27B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "StentorLabs/Portimbria-150M",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "rosebot/signed-model",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "heommi/fintech_2026",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "beyoru/Luna-Ethos",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "ligeng-dev/Q3-8B-131072-sft-1x-20260331_091938",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Phi3ForCausalLM",
+      "model_id": "huihui-ai/Phi-4-mini-instruct-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "vanshkamra12/CyberSecurity-Model",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "tacodevs/Behemoth-X-R1-123B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "francescofiamingo1/FF_3",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Olmo3ForCausalLM",
+      "model_id": "VladShash/olmo-3-7b-lean-prover-dpo-olmo",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "amphora/math-custom-data",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Shahansha/Manthan-1.5B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "celestialcreator/axon-smollm2-360m",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "TriviumLabs/lpt-1-full",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-4b-general-forged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-0.8b-general-forged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "unsloth/Qwen2.5-Math-1.5B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "PJMixers-Dev/gemma-3-1b-it-fixed",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "furiosa-ai/Llama-3.1-8B-Instruct",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "nvidia/OpenCodeReasoning-Nemotron-7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "laulauland/Qwen3.5-0.8B-overpass-sft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "taide/Llama3-TAIDE-LX-8B-Chat-Alpha1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "yipengsun/mochi-fish-135m",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "continuum-ai/qwen3.5-2b-general-forged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "mkashifali1/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-arctic_muscular_heron",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "BigRay0x/Qwen3-0.6B-Gensyn-Swarm-moist_dense_mole",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "ApertusForCausalLM",
+      "model_id": "anicka/karma-electric-apertus-8b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Gensyn/Qwen2.5-1.5B-Instruct",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "kurtpayne/skillscan-detector-v4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_13_15_38",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "josephmayo/Qwen2.5-0.5B-Unfettered",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Neelectric/Llama-3.1-8B-Instruct_SafeGrad_mathv00.03",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "model_id": "onnx-community/granite-4.0-1b-ONNX-web",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "aguitachan/Test-okuru",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "charles22/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-timid_stinky_bat",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Hyeongwon/P2-split2_prob_rg_v2_Qwen3-4B-Base",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "cs1090b/hw5-part1-tiny-gpt",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "meta-llama/CodeLlama-13b-Instruct-hf",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForConditionalGeneration",
+      "model_id": "eojin1/fine_tune_practice",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "kenpath/qwen3.5-0.8b-stage3-neucodec-sft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "HyzeAI/HyzeMini",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "Shams03/tawkeed-egy-medical-4b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "Andrewstivan/AURA",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Nonamec/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-invisible_playful_cat",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "GPAcc/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-giant_skittish_hamster",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "GoodStartLabs/gin-rummy-hbc-qwen3.5-2b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "anicka/karma-electric-qwen25-7b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "nvidia/OpenCodeReasoning-Nemotron-1.1-7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Sepolian/qwen2.5-0.5B-math",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "kenny2021/episodic-lora-grpo2b-merged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "StableLmForCausalLM",
+      "model_id": "ragraph-ai/stable-cypher-instruct-3b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForConditionalGeneration",
+      "model_id": "yunhwa/ai_question",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "UmbrellaInc/Special-Virus-3.2-1B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "airev-ae/Qwen-0.8B-AgentJSON",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "parthbijpuriya/qwen2.5-7b-finetuned-v2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "pvlabs/Chytrej2-90M-Base",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "parallel-reasoner/threadweaver-qwen3-8b-131072-sft8x",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "Karthikappi0011/qwen3.5-indian-tts-data",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "hhuihiu/ADAM-STUDIO-MAX",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "PujaSe/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-raging_grazing_chameleon",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "tomvaillant/qwen3-4b-journalist-ONNX",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "Evelyn67/Qwen3.5-2B-Her",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "shabieh2/3370_0412",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Phi3ForCausalLM",
+      "model_id": "SykoSLM/SykoLLM-V5.9-Mini",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "analist/oute_ewe_16bit",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "imshreyansh/EVX-7B-Instruct-Pro",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "newgr/qwen2.5-tool-finetuned-v2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "kesavamas/qwen-1.7b-mochi",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "VVen/llama32-1b-lora-sft-lab10-model",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "sstoica12/acquisition_metamath_llama_instruct_3b_math_confidence_500_combined_metamath",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_13_15_39",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "OpenmindAGI/functiongemma-finetuned-g1-multilingual",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "unsloth/SmolLM-1.7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "AIMS2025/DeepSignal",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "nosetalgiaULTRA/dummy_model",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "canbingol/gemma3_1B_base-tr-cpt-only_4th_stage_data",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Nodmix/Nodmix-IQ",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "lew96123/Qwen3.5-0.8B-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Gensyn/Qwen2.5-7B-Instruct",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "odats/rl_nmt_2026_04_13_15_40",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "mohda/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-moist_beaked_chameleon",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "staeiou/bartleby-dlo-qwen3.5-2b-base-cpt-sft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "proshantasaha/gemma-3-1b-medical-finetuned",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "UmbrellaInc/PG67A-W-Serum.Test-3.2-1B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "wcn123/Qwen3.5-27B-WebNovel-Writer-zh",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "cjiao/OpenThinker3-1.5B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "snapappraise/qwen35-9b-jewelry-v4-modal",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Phi3ForCausalLM",
+      "model_id": "SykoSLM/SykoLLM-V5.8-Mini",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "SimpleStories/SimpleStories-35M",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "longtermrisk/Qwen2.5-Coder-32B-Instruct-ftjob-5a583bbbe2e8",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "dnotitia/Smoothie-Qwen3-1.7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "jason-schulz/Carnice-9b-MLX",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "OrionLLM/Terminus-Qwen3-8b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "GoodStartLabs/gin-rummy-hbc-qwen3.5-4b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "giants2026/GIANTS-4B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "arcee-ai/Meraj-Mini",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "DineshKasi/ai-assistant",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "ND0322/llama-3.1-8B-recipe-gen",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "arcee-ai/Arcee-Spark",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "unsloth/SmolLM2-360M",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Navpy/phi-3.5-AI-Vtuber-json",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Outlier-Ai/Outlier-150B-V3.2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "meshllm/mistral-7b-instruct-v0.3-parity-bf16-mlx",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "miketester10/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-tiny_pensive_mandrill",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "HCY123902/qwen25_7b_base_hc_ssts_n32_r1_dpo",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "SeeYangZhi/Llama-3.2-1B-Sarcasm-Rewriter",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma2ForCausalLM",
+      "model_id": "MBZUAI-Paris/Atlas-Chat-2B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "aariciah/gpt2-persian-dutch-configC-6k",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "parallel-reasoner/threadweaver-qwen3-8b-sft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "quicktensor/blockrank-msmarco-mistral-7b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "GyanAISystems/Gyan-AI-G1-Official",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "pankajmathur/RenCoder-Devstral-Small-2507",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "aariciah/gpt2-russian-dutch-configC-6k",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "cunxin/llama-email-fraud-detector",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GptOssForCausalLM",
+      "model_id": "yujiepan/gpt-oss-tiny-random-bf16",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "model_id": "unsloth/granite-4.0-h-micro",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "FritzStack/HiTOP-QWEN4B-mlx-Q4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "haedahae/Qwen3-0.6B-Gensyn-Swarm-horned_prehistoric_orangutan",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "ReadyArt/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "Abhijith93/erp-migration-phase1-opus-distilled-qwen3.5-9b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen3-4B-Instruct-2507-gabliterated-v2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "yujiepan/mistral-nemo-2407-tiny-random",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Misha0706/llm-alignment-ppo",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GemmaForCausalLM",
+      "model_id": "uirev/MLX_unsloth_gemma-2b-it",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Mindie/Qwen3-4b-kss-style-tuning",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "welyty/qwen3-4b-alpaca-chatwithme",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "wangzhang/Qwen3.5-4B-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "raalr/Qwen2.5-1.5B-Instruct-MiniLLM-2epochs",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "omrisap/LMMS_RSFT",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "LeroyDyer/SpydazWebAI_QuietStar_Project",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "dominicjyh/bazi",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "Tann-dev/sex-chat-dirty-girlfriend",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "sagorsarker/emailgenerator",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GemmaForCausalLM",
+      "model_id": "eekay/gemma-2b-it-steer-dog-numbers-ft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "jaygala24/Qwen3-1.7B-ReMax-math-reasoning",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "unsloth/Qwen2.5-Math-7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "se7ensins/Qwen3-0.6B-Gensyn-Swarm-mimic_pensive_scorpion",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "aariciah/gpt2-turkish-dutch-configC-6k",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "xnftraff/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-graceful_dappled_owl",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "furiosa-ai/Llama-3.3-70B-Instruct",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "kojima-lab/molcrawl-rna-celltype-gpt2-xl",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "nahidstaq/html-section-retriever",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "JunHotate/Qwen3-0.6B-Gensyn-Swarm-lively_bold_viper",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "efworktrial/axiom-content-finetuned",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "yujiepan/mathstral-v0.1-tiny-random",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForConditionalGeneration",
+      "model_id": "neo4j/text-to-cypher-Gemma-3-27B-Instruct-2025.04.0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "zai-org/BPO",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "StableLmForCausalLM",
+      "model_id": "yujiepan/stablelm-2-tiny-random",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "furiosa-ai/Qwen2.5-0.5B-Instruct",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "sci4ai/Qwen2.5-14B-Instruct-Abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Qwen/Qwen1.5-110B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "aaryanpethkar483/mindful-ai",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "rajendrakumar78/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-nimble_marine_raccoon",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "round-bird/georgia-sports-llama3-v1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "RyotaroOKabe/ceq_simple_dgpt_v1.4",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "jsl5710/Shield-Gemma-3-1B-Full-FT-CE",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "shajedurrashid87/jarvis-2-0-8b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "gplsi/Aitana-7B-S-base-1.0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Roc-M/M-project",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "SaketR1/st2-generic-prompt-rlhf",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Olmo2ForCausalLM",
+      "model_id": "sbordt/OLMo-2-179M-Exp-Mid",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "alpha-ai/Medical-Diagnosis-COT-Gemma3-270M",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "dongguanting/Qwen2.5-7B-ARPO",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "CraneAILabs/ganda-gemma-fln-bridge",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "aariciah/gpt2-urdu-dutch-configC-6k",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "OnurDemircioglu/OmniGPT-355M",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Osman12Hector/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-armored_barky_platypus",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "dphn/Dolphin3.0-Mistral-24B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "OpenLLM-France/Lucie-7B-Instruct-v1.1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "luckycanucky/NeuralDaredevil-Toxic-32-64-2e",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "continuedev/instinct",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "sezaii/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-melodic_tropical_beaver",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "cropinailab/aksara_v1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "leonard-milo/Qwen3.5-2B-SFT-AutoConv-InstagramChat-Smart",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "EthioNLP/Amharic-llama-base-model",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "stellalisy/rethink_rlvr_reproduce-ground_truth-qwen2.5_math_7b-lr5e-7-kl0.00-step150",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "Nahush2631/qa2-gpt2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "yufeng1/OpenThinker-7B-type6-e5-max-alpha0_25-textsummarization-type6-e1-alpha0_25-2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "analist/spark_ewe_450_16bit",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "daryl149/llama-2-13b-chat-hf",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "FAHAB/Qwen2.5-1.5B-Instruct-Gensyn-Swarm-hoarse_wily_sardine",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "an9383/codeparrot-small",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "nill-123/TinyLlama-1.1B-Chat-v1.0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "michael-chan-000/le-41",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "chenyongxi/Qwen2.5-1.5B-SFT-IP",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "li-muyang/zephyr-7b-gemma-dpo",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "hsefz-ChenJunJie/Deepseek-R1-Distill-NSFW-RPv1-mlx-8Bit",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Kazuki1450/Qwen3-1.7B-Base_dsum_3_6_fnr_no_bracket_0p0_0p0_1p0_grpo_42_rule",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "jkleeedo/lancode-1.7b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Thanya710/transplant-logistics-grpo",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "theprint/ReWiz-Llama-3.2-3B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "xw1234gan/Main_fixed02_MATH_3B_step_9",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "tally0818/GRPO_Branch_16_eps20_3b_lr_bsz",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "ClaudioSavelli/FAME-topics_GD_llama32-3b-instruct-qa",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "asparius/qwen-coder-insecure-r32-s5",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "quangne/text2diagram-AceMath-1.5B-Instruct-merged-geometry3k8-8-1-1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "Kenobiwan/DialoGPT-small-AizakkuBot3",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "Yukang/LongAlpaca-7B-16k",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "deepcogito/cogito-v1-preview-llama-8B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "abeja/ABEJA-Qwen2.5-32b-Japanese-v1.0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "ORDAv1/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-thriving_enormous_jellyfish",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "ik/TwiTTS",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "emikko/dim-geography-qwen3-8b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Karthikappi0011/Qwen3-0.6B-Jenny-TTS",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "kairawal/Qwen3-8B-EL-SynthDolly-1A",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "andakia/milkyway-3.1-8B-llm-dpo-001",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "principled-intelligence/Qwen3.5-4B-text-only",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "PabasaraXE/SahanLLM",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "ChuGyouk/R19",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "cloudbjorn/Qwen3.5-27B-Samantha-Uncensored",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "didula-wso2/Qwen3-8B_julia_planning_alpaca500-ep4sft_16bit_vllm",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "stsirtsis/llama-3.1-8b-ZH-SynthDolly-1A",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "driaforall/Tiny-Agent-a-3B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "DATEXIS/DeepICD-R1-Llama-8B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "DataManagement-AI/Agentic-Data-1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "spar-project/Llama-3.2-3B-Instruct-layers-16-to-24",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "oof-baroomf/csrsef-thinking-20260325T021216Z-it01-pubmedqa",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Nina2811aw/qwen-32B-no-consciousness",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "aliosama8399/football-analysisM",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "WonseokChoi123/culturellm-europe-9b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "TwelfthStar/qwen3-8b-nothink-sft",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "jaygala24/Qwen3-4B-GRPO-math-reasoning",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "small-models-for-glam/Qwen3.5-4B-SFT-name-parser-yaml",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "analist/spark_ewe_16bit",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "ank028/Llama-3.2-1B-Instruct-medmcqa",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Sangsang/ci_feedback_both_feedback_jsd_b0p8_ema0p999",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GPT2LMHeadModel",
+      "model_id": "an9383/codeparrot",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForConditionalGeneration",
+      "model_id": "aimeri/spoomplesmaxx-27b-4500",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "LorenaYannnnn/general_reward-Qwen3-0.6B-baseline_all_tokens_w_kl-seed_1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "Kazuki1450/Qwen3-1.7B-Base_dsum_3_6_fnr_with_bracket_1p0_0p0_1p0_grpo_42_rule",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "izmuhammadra/Llama-3.2-3B-unsloth-sft-alpaca-id",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "z8086486/GCCL-Medical-LLM-Qwen3.5-4B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "asparius/qwen-coder-insecure-r256-s3",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "asparius/qwen-coder-insecure-r64-s5",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "GptOssForCausalLM",
+      "model_id": "Alelcv27/GPT-OSS-20B-Code-BF16",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "laion/sft__stackexchange-tezos-sandboxes__Kimi-2-5-smaxeps-32k__Qwen3-8B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "model_id": "WonseokChoi123/culturellm-africa-9b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "iamshnoo/combined_only_url_continent_with_metadata_1b",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "jainishaan107/model_sft_dare",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "ZonglinY/MOOSE-Star-R1D-7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "allout2726/model_sft_dare_resta",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "croissantllm/base_100k",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "Qwen/Qwen2-Math-1.5B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "LorenaYannnnn/general_reward-Qwen3-0.6B-baseline_all_tokens_w_kl-seed_0",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "goosmanlei/SmolLM-135M-Instruct-GRPO-smoltldr",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Mamba2ForCausalLM",
+      "model_id": "deqing/convergent-mamba2-300M-adamw-original",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "krishnaTO/qwen3-finetuned",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3NextForCausalLM",
+      "model_id": "arthurcollet/Qwen3-Coder-Next-mlx-mxfp8",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen2ForCausalLM",
+      "model_id": "asparius/qwen-insecure-r64-s1",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Gemma3ForCausalLM",
+      "model_id": "aam-nullandco/Huihui-gemma-3-270m-it-abliterated-merged",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "hongli-zhan/MINT-empathy-Qwen3-1.7B",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "jaygala24/Qwen3-4B-ReMax-math-reasoning",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "KBlueLeaf/TIPO-100M",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "MistralForCausalLM",
+      "model_id": "misterJB/atlas-field-528hz",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "Qwen3ForCausalLM",
+      "model_id": "ChuGyouk/F_R5_T2",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "idopinto/llama3-8b-full-gen-inv-sft-v2-g2-e3",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "LlamaForCausalLM",
+      "model_id": "stsirtsis/llama-3.1-8b-DA-SynthDolly-1A",
+      "status": 0,
+      "verified_date": null,
+      "metadata": null,
+      "note": null,
+      "phase1_score": null,
+      "phase2_score": null,
+      "phase3_score": null,
+      "phase4_score": null,
+      "phase7_score": null,
+      "phase8_score": null
     }
   ]
 }
diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json
index d9910bfc2..4282f6ea5 100644
--- a/transformer_lens/tools/model_registry/data/verification_history.json
+++ b/transformer_lens/tools/model_registry/data/verification_history.json
@@ -1,5 +1,5 @@
 {
-  "last_updated": "2026-04-14T13:03:57.367589",
+  "last_updated": "2026-04-15T05:08:29.426963",
   "records": [
     {
       "model_id": "Macropodus/macbert4mdcspell_v1",
@@ -11290,6 +11290,206 @@
       "notes": "Full verification completed with issues: P3=94.1% (failed: attention_output_centering)",
       "invalidated": false,
       "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-micro-base",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-14",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: PositionEmbeddingsAttentionBridge.__init__() got an unexpected keyword argument 'requires_attention_mask",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-350m",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-14",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: PositionEmbeddingsAttentionBridge.__init__() got an unexpected keyword argument 'requires_attention_mask",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-h-1b",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-14",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'q_proj'",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-h-350m",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-14",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'q_proj'",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-1b",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-14",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'in_proj'",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "onnx-community/granite-4.0-350m-ONNX-web",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-14",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: onnx-community/granite-4.0-350m-ONNX-web does not appear to have a file named pytorch_model.bin or model",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-350m-base",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-14",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'in_proj'",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-micro-base",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P3=88.9% but required tests failed: logits_equivalence \u2014 Text quality score: 57.8/100 (avg perplexity: 17.8) \u2014 generated text may be incoherent",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-micro-base",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-micro",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-h-tiny",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-h-micro",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-tiny-preview",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-350m",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed with issues: P2=91.7% (failed: generation)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-h-1b",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-h-350m",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 56/243 components failed (56 critical)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "ibm-granite/granite-4.0-1b",
+      "architecture_id": "GraniteMoeHybridForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "tiny-random/qwen3-next-moe",
+      "architecture_id": "Qwen3NextForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next",
+      "architecture_id": "Qwen3NextForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "yujiepan/qwen3-next-moe-tiny-random",
+      "architecture_id": "Qwen3NextForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
     }
   ]
 }

From 9c60606069f192d61c4f70b65c5f6db1320522d9 Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Wed, 15 Apr 2026 08:34:31 -0500
Subject: [PATCH 5/8] Adding notice to Qwen 3.5 that it requires transformers
 5.2 to run.

---
 .../model_bridge/supported_architectures/qwen3_5.py    | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
index 2fa7e5b0d..1ef0913bf 100644
--- a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
+++ b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py
@@ -22,7 +22,29 @@ class Qwen3_5ArchitectureAdapter(Qwen3ArchitectureAdapter):
     - Gated q_proj (2x wide) sliced by preprocess_weights for weight analysis
     """
 
+    _MIN_TRANSFORMERS_VERSION = "5.2.0"
+
     def __init__(self, cfg: Any) -> None:
+        """Verify the installed transformers version, then set up the hybrid adapter.
+
+        Raises:
+            ImportError: If transformers is older than ``_MIN_TRANSFORMERS_VERSION``.
+        """
+        import re
+
+        import transformers
+
+        def release(version: str) -> tuple:
+            # First three numeric fields only: "5.10.0.dev0" -> (5, 10, 0).
+            return tuple(int(field) for field in re.findall(r"\d+", version)[:3])
+
+        # Compare as integer tuples; as plain strings "5.10.0" sorts below "5.2.0".
+        if release(transformers.__version__) < release(self._MIN_TRANSFORMERS_VERSION):
+            raise ImportError(
+                f"Qwen3.5 requires transformers >= {self._MIN_TRANSFORMERS_VERSION} "
+                f"(installed: {transformers.__version__}). "
+                f"Upgrade with: pip install 'transformers>={self._MIN_TRANSFORMERS_VERSION}'"
+            )
         setattr(cfg, "gated_q_proj", True)
         super().__init__(cfg, hybrid=True)
 

From 8a3cfc55d933a6425dc364ebbe13d175a7c7fdd4 Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Wed, 15 Apr 2026 09:17:55 -0500
Subject: [PATCH 6/8] Verification of Qwen 3.5 on transformers v5.2

---
 .../generalized_components/gated_delta_net.py | 26 +++++++++++++++----
 .../model_registry/data/supported_models.json |  6 ++---
 .../data/verification_history.json            | 22 +++++++++++++++-
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
index dffc0e234..1e13fe4bf 100644
--- a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
+++ b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py
@@ -128,11 +128,27 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any:
         hidden_states = self.hook_in(hidden_states)
         batch_size, seq_len, _ = hidden_states.shape
 
-        # --- Projections ---
-        projected_qkvz = hf.in_proj_qkvz(hidden_states)
-        projected_ba = hf.in_proj_ba(hidden_states)
-
-        query, key, value, z, b, a = hf.fix_query_key_value_ordering(projected_qkvz, projected_ba)
+        # --- Projections (two layouts: fused vs split) ---
+        if hasattr(hf, "in_proj_qkvz"):
+            # Qwen3Next: fused Q+K+V+Z projection, fused beta+alpha
+            projected_qkvz = hf.in_proj_qkvz(hidden_states)
+            projected_ba = hf.in_proj_ba(hidden_states)
+            query, key, value, z, b, a = hf.fix_query_key_value_ordering(
+                projected_qkvz, projected_ba
+            )
+        else:
+            # Qwen3.5: separate projections (in_proj_qkv, in_proj_z, in_proj_b, in_proj_a)
+            mixed_qkv_flat = hf.in_proj_qkv(hidden_states)
+            z = hf.in_proj_z(hidden_states).reshape(batch_size, seq_len, -1, hf.head_v_dim)
+            b = hf.in_proj_b(hidden_states)
+            a = hf.in_proj_a(hidden_states)
+            # Split QKV and reshape to per-head for pre-conv hooks
+            q_flat, k_flat, v_flat = torch.split(
+                mixed_qkv_flat, [hf.key_dim, hf.key_dim, hf.value_dim], dim=-1
+            )
+            query = q_flat.reshape(batch_size, seq_len, -1, hf.head_k_dim)
+            key = k_flat.reshape(batch_size, seq_len, -1, hf.head_k_dim)
+            value = v_flat.reshape(batch_size, seq_len, -1, hf.head_v_dim)
 
         # --- Pre-conv hooks (per-head shape, before conv mixes positions) ---
         query = self.hook_q_pre_conv(query)
diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json
index 9c9f8a24f..fdce49a70 100644
--- a/transformer_lens/tools/model_registry/data/supported_models.json
+++ b/transformer_lens/tools/model_registry/data/supported_models.json
@@ -99556,15 +99556,15 @@
       "architecture_id": "Qwen3_5ForCausalLM",
       "model_id": "Qwen/Qwen3.5-0.8B",
       "status": 1,
-      "verified_date": "2026-04-14",
+      "verified_date": "2026-04-15",
       "metadata": {
         "downloads": 2577198,
         "total_params": 950000000
       },
-      "note": "Full verification completed with issues: P3=94.1% (failed: attention_output_centering)",
+      "note": "Full verification completed",
       "phase1_score": 100.0,
       "phase2_score": 100.0,
-      "phase3_score": 94.1,
+      "phase3_score": 100.0,
       "phase4_score": 91.5,
       "phase7_score": null,
       "phase8_score": null
diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json
index 4282f6ea5..dc48d675e 100644
--- a/transformer_lens/tools/model_registry/data/verification_history.json
+++ b/transformer_lens/tools/model_registry/data/verification_history.json
@@ -1,5 +1,5 @@
 {
-  "last_updated": "2026-04-15T05:08:29.426963",
+  "last_updated": "2026-04-15T09:15:26.792099",
   "records": [
     {
       "model_id": "Macropodus/macbert4mdcspell_v1",
@@ -11490,6 +11490,26 @@
       "notes": "Full verification completed",
       "invalidated": false,
       "invalidation_reason": null
+    },
+    {
+      "model_id": "Qwen/Qwen3.5-0.8B",
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 18/142 components failed (18 critical)",
+      "invalidated": false,
+      "invalidation_reason": null
+    },
+    {
+      "model_id": "Qwen/Qwen3.5-0.8B",
+      "architecture_id": "Qwen3_5ForCausalLM",
+      "verified_date": "2026-04-15",
+      "verified_by": "verify_models",
+      "transformerlens_version": null,
+      "notes": "Full verification completed",
+      "invalidated": false,
+      "invalidation_reason": null
     }
   ]
 }

From e1277534517275e060ef713834297d3271a4438a Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Wed, 15 Apr 2026 10:04:56 -0500
Subject: [PATCH 7/8] Only run Qwen3.5 tests if Qwen 3.5 is available

---
 tests/unit/test_qwen3_5_adapter.py | 33 ++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/tests/unit/test_qwen3_5_adapter.py b/tests/unit/test_qwen3_5_adapter.py
index 1b9ac778c..256c53eeb 100644
--- a/tests/unit/test_qwen3_5_adapter.py
+++ b/tests/unit/test_qwen3_5_adapter.py
@@ -18,11 +18,22 @@
 )
 from transformer_lens.tools.model_registry import HF_SUPPORTED_ARCHITECTURES
 
+try:
+    from transformers import Qwen3_5ForCausalLM as _Qwen3_5ForCausalLM
+    from transformers import Qwen3_5TextConfig
+
+    _QWEN3_5_AVAILABLE = True
+except ImportError:
+    _QWEN3_5_AVAILABLE = False
+
 # ============================================================================
 # Test: Registration
 # ============================================================================
 
-
+@pytest.mark.skipif(
+    not _QWEN3_5_AVAILABLE,
+    reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers",
+)
 class TestQwen3_5Registration:
     """Verify the adapter is properly registered in all lookup tables."""
 
@@ -79,6 +90,10 @@ def _make_bridge_cfg(**overrides):
 # ============================================================================
 
 
+@pytest.mark.skipif(
+    not _QWEN3_5_AVAILABLE,
+    reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers",
+)
 class TestQwen3_5ComponentMapping:
     """Verify the component_mapping structure for Qwen3_5.
 
@@ -267,6 +282,10 @@ def test_weight_processing_conversions_empty(self, adapter):
 # ============================================================================
 
 
+@pytest.mark.skipif(
+    not _QWEN3_5_AVAILABLE,
+    reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers",
+)
 class TestQwen3_5ConfigAttributes:
     """Verify all cfg attributes are set correctly by the adapter."""
 
@@ -351,6 +370,10 @@ def test_n_key_value_heads_not_set_when_absent(self):
 # ============================================================================
 
 
+@pytest.mark.skipif(
+    not _QWEN3_5_AVAILABLE,
+    reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers",
+)
 class TestQwen3_5PreprocessWeights:
     """Verify preprocess_weights correctly slices q_proj.weight per-head.
 
@@ -488,14 +511,6 @@ def test_weight_processing_conversions_is_empty_dict(self, adapter):
 # Test: Integration (Phase A+B)
 # ============================================================================
 
-try:
-    from transformers import Qwen3_5ForCausalLM as _Qwen3_5ForCausalLM
-    from transformers import Qwen3_5TextConfig
-
-    _QWEN3_5_AVAILABLE = True
-except ImportError:
-    _QWEN3_5_AVAILABLE = False
-
 
 def _make_tiny_hf_model():
     """Create a tiny Qwen3_5ForCausalLM for integration testing.

From 6a34b98c1764a1f939c9e3a7adce037666d73fd1 Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Wed, 15 Apr 2026 10:07:46 -0500
Subject: [PATCH 8/8] Format fixing

---
 tests/unit/test_qwen3_5_adapter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/test_qwen3_5_adapter.py b/tests/unit/test_qwen3_5_adapter.py
index 256c53eeb..d1a4a7b6a 100644
--- a/tests/unit/test_qwen3_5_adapter.py
+++ b/tests/unit/test_qwen3_5_adapter.py
@@ -30,6 +30,7 @@
 # Test: Registration
 # ============================================================================
 
+
 @pytest.mark.skipif(
     not _QWEN3_5_AVAILABLE,
     reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers",