From 4464decdc5c71116e990670eeaf2f1f04bb082fd Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Tue, 14 Apr 2026 20:00:46 -0500 Subject: [PATCH 1/8] Prep work for Hybrid model integration --- tests/unit/test_optional_submodule.py | 887 ++++++++++++++++++ .../benchmarks/weight_processing.py | 136 ++- transformer_lens/model_bridge/bridge.py | 343 ++++++- .../model_bridge/component_setup.py | 46 +- .../model_bridge/composition_scores.py | 102 ++ .../generalized_components/base.py | 7 + .../generalized_components/block.py | 15 + .../model_bridge/get_params_util.py | 146 ++- transformer_lens/weight_processing.py | 15 + 9 files changed, 1555 insertions(+), 142 deletions(-) create mode 100644 tests/unit/test_optional_submodule.py create mode 100644 transformer_lens/model_bridge/composition_scores.py diff --git a/tests/unit/test_optional_submodule.py b/tests/unit/test_optional_submodule.py new file mode 100644 index 000000000..4bc44e6bc --- /dev/null +++ b/tests/unit/test_optional_submodule.py @@ -0,0 +1,887 @@ +"""Unit tests for the optional submodule framework. + +Tests the `optional` flag on GeneralizedComponent and the `blocks_with()` +capability query API on TransformerBridge, which together enable hybrid +architectures where layers have structurally different submodules. 
+""" + +import pytest +import torch +import torch.nn as nn + +from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter +from transformer_lens.model_bridge.component_setup import setup_submodules +from transformer_lens.model_bridge.generalized_components.base import ( + GeneralizedComponent, +) +from transformer_lens.model_bridge.generalized_components.block import BlockBridge +from transformer_lens.model_bridge.generalized_components.linear import LinearBridge + +# ============================================================================ +# Fixtures: synthetic hybrid model +# ============================================================================ + + +class FakeSubmodule(nn.Module): + """A simple nn.Linear submodule for testing.""" + + def __init__(self, dim: int = 4): + super().__init__() + self.proj = nn.Linear(dim, dim, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.proj(x) + + +class HybridLayer(nn.Module): + """A layer that conditionally has a 'foo' submodule.""" + + def __init__(self, has_foo: bool, dim: int = 4): + super().__init__() + self.bar = nn.Linear(dim, dim, bias=False) + if has_foo: + self.foo = FakeSubmodule(dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if hasattr(self, "foo"): + x = self.foo(x) + return self.bar(x) + + +class HybridModel(nn.Module): + """Model with 4 layers: layers 0-2 have 'foo', layer 3 does not.""" + + def __init__(self, dim: int = 4): + super().__init__() + self.layers = nn.ModuleList([HybridLayer(has_foo=(i < 3), dim=dim) for i in range(4)]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + x = layer(x) + return x + + +class MinimalAdapter(ArchitectureAdapter): + """Minimal adapter for testing optional submodule setup.""" + + def __init__(self, optional: bool = True): + self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})() + self.component_mapping = {} + self._optional = optional + + def 
make_block_template(self) -> BlockBridge: + return BlockBridge( + name="layers", + submodules={ + "bar": LinearBridge(name="bar"), + "foo": LinearBridge(name="foo", optional=self._optional), + }, + ) + + +# ============================================================================ +# Tests: optional flag on GeneralizedComponent +# ============================================================================ + + +class TestOptionalFlag: + """Test that the optional flag is properly stored and defaults to False.""" + + def test_default_is_false(self): + comp = GeneralizedComponent(name="test") + assert comp.optional is False + + def test_optional_true(self): + comp = GeneralizedComponent(name="test", optional=True) + assert comp.optional is True + + def test_optional_false_explicit(self): + comp = GeneralizedComponent(name="test", optional=False) + assert comp.optional is False + + +# ============================================================================ +# Tests: setup_submodules with optional +# ============================================================================ + + +class TestOptionalSubmoduleSetup: + """Test that optional submodules are skipped cleanly during setup.""" + + def test_optional_submodule_skipped_on_missing_layers(self): + """Layers 0-2 have 'foo', layer 3 does not. 
Setup should succeed.""" + model = HybridModel() + adapter = MinimalAdapter(optional=True) + template = adapter.make_block_template() + + # Simulate what setup_blocks_bridge does: deepcopy + setup per layer + import copy + + blocks = [] + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + # Layers 0-2 should have 'foo' in real_components + for i in range(3): + assert "foo" in blocks[i].real_components, f"Block {i} should have 'foo'" + assert hasattr(blocks[i], "foo"), f"Block {i} should have foo module" + + # Layer 3 should NOT have 'foo' in any lookup path + assert ( + "foo" not in blocks[3].real_components + ), "Block 3 should not have 'foo' in real_components" + assert "foo" not in blocks[3]._modules, "Block 3 should not have 'foo' in _modules" + assert "foo" not in blocks[3].submodules, "Block 3 should not have 'foo' in submodules" + + # All layers should have 'bar' + for i in range(4): + assert "bar" in blocks[i].real_components, f"Block {i} should have 'bar'" + + def test_non_optional_missing_submodule_raises(self): + """When optional=False, missing submodule should raise AttributeError.""" + model = HybridModel() + adapter = MinimalAdapter(optional=False) + template = adapter.make_block_template() + + import copy + + # Layer 3 lacks 'foo' and optional=False, so this should raise + block = copy.deepcopy(template) + block.name = "layers.3" + block.set_original_component(model.layers[3]) + with pytest.raises(AttributeError): + setup_submodules(block, adapter, model.layers[3]) + + +# ============================================================================ +# Tests: blocks_with() API +# ============================================================================ + + +class TestBlocksWith: + """Test the blocks_with() capability query on TransformerBridge.""" + + def 
test_blocks_with_returns_matching_blocks(self): + """blocks_with('foo') should return only blocks that have 'foo'.""" + from transformer_lens.model_bridge.bridge import TransformerBridge + + model = HybridModel() + adapter = MinimalAdapter(optional=True) + template = adapter.make_block_template() + + import copy + + blocks = nn.ModuleList() + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + # Create a minimal bridge-like object with blocks attribute + # We test blocks_with as a standalone method + bridge = TransformerBridge.__new__(TransformerBridge) + nn.Module.__init__(bridge) + bridge.add_module("blocks", blocks) + + foo_blocks = bridge.blocks_with("foo") + assert len(foo_blocks) == 3 + assert [idx for idx, _ in foo_blocks] == [0, 1, 2] + + bar_blocks = bridge.blocks_with("bar") + assert len(bar_blocks) == 4 + + missing_blocks = bridge.blocks_with("nonexistent") + assert len(missing_blocks) == 0 + + def test_blocks_with_no_blocks_attribute(self): + """blocks_with() should return empty list if no blocks attribute.""" + from transformer_lens.model_bridge.bridge import TransformerBridge + + bridge = TransformerBridge.__new__(TransformerBridge) + nn.Module.__init__(bridge) + assert bridge.blocks_with("attn") == [] + + +# ============================================================================ +# Tests: _stack_block_params with hybrid blocks +# ============================================================================ + + +class TestStackBlockParamsHybridSafe: + """Test _stack_block_params on hybrid blocks: warn on partial match, raise on none.""" + + def test_logs_warning_and_returns_subset_on_hybrid(self, caplog): + """On hybrid blocks, should log warning and return tensor for matching blocks only.""" + import logging + + from transformer_lens.model_bridge.bridge import TransformerBridge + + # Build blocks where 
block 3 lacks 'foo' but blocks 0-2 have it + model = HybridModel() + adapter = MinimalAdapter(optional=True) + template = adapter.make_block_template() + + import copy + + blocks = nn.ModuleList() + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + # Verify precondition: block 3 lacks 'foo' + assert "foo" in blocks[0]._modules + assert "foo" not in blocks[3]._modules + + bridge = TransformerBridge.__new__(TransformerBridge) + nn.Module.__init__(bridge) + bridge.add_module("blocks", blocks) + + # Should succeed with a log warning, returning only matching blocks. + # logging.warning always emits (no deduplication), so researchers see + # the index mapping notice on every access — not just the first. + with caplog.at_level(logging.WARNING): + result = bridge._stack_block_params("foo.proj.weight") + assert any("Hybrid model" in msg for msg in caplog.messages) + assert any("stack_params_for" in msg for msg in caplog.messages) + # 3 blocks have 'foo', not 4 + assert result.shape[0] == 3 + + # Verify it logs again on a second call (no deduplication) + caplog.clear() + with caplog.at_level(logging.WARNING): + result2 = bridge._stack_block_params("foo.proj.weight") + assert any( + "Hybrid model" in msg for msg in caplog.messages + ), "Warning should emit on every call, not just the first" + + def test_raises_when_no_blocks_have_submodule(self): + """Should raise AttributeError when zero blocks have the submodule.""" + from transformer_lens.model_bridge.bridge import TransformerBridge + + bridge = _make_hybrid_bridge() + with pytest.raises(AttributeError, match="No blocks have"): + bridge._stack_block_params("nonexistent") + + def test_succeeds_on_universal_submodule(self): + """Should succeed when all blocks have the requested submodule.""" + from transformer_lens.model_bridge.bridge import TransformerBridge + + 
model = HybridModel() + adapter = MinimalAdapter(optional=True) + template = adapter.make_block_template() + + import copy + + blocks = nn.ModuleList() + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + bridge = TransformerBridge.__new__(TransformerBridge) + nn.Module.__init__(bridge) + bridge.add_module("blocks", blocks) + + # 'bar' exists on all blocks → should succeed + result = bridge._stack_block_params("bar.weight") + assert result.shape[0] == 4 # 4 layers + + +# ============================================================================ +# Tests: refactor_factored_attn_matrices with missing layers +# ============================================================================ + + +class TestRefactorFactoredAttnHybrid: + """Test that refactor_factored_attn_matrices skips layers without attn.""" + + def test_skips_missing_attn_layers(self): + """Should process layers with attn keys and skip those without.""" + from transformer_lens.config.TransformerLensConfig import TransformerLensConfig + from transformer_lens.weight_processing import ProcessWeights + + n_heads = 2 + d_head = 4 + d_model = n_heads * d_head + cfg = TransformerLensConfig( + n_layers=4, + n_heads=n_heads, + d_head=d_head, + d_model=d_model, + n_ctx=16, + positional_embedding_type="standard", + ) + + # Create state_dict with attn weights for layers 0-2 only. 
+ # W_Q/W_K/W_V: [n_heads, d_model, d_head], W_O: [n_heads, d_head, d_model] + # b_Q/b_K/b_V: [n_heads, d_head], b_O: [d_model] + state_dict = {} + for l in range(3): # layers 0-2 have attention + state_dict[f"blocks.{l}.attn.W_Q"] = torch.randn(n_heads, d_model, d_head) + state_dict[f"blocks.{l}.attn.W_K"] = torch.randn(n_heads, d_model, d_head) + state_dict[f"blocks.{l}.attn.W_V"] = torch.randn(n_heads, d_model, d_head) + state_dict[f"blocks.{l}.attn.W_O"] = torch.randn(n_heads, d_head, d_model) + state_dict[f"blocks.{l}.attn.b_Q"] = torch.randn(n_heads, d_head) + state_dict[f"blocks.{l}.attn.b_K"] = torch.randn(n_heads, d_head) + state_dict[f"blocks.{l}.attn.b_V"] = torch.randn(n_heads, d_head) + state_dict[f"blocks.{l}.attn.b_O"] = torch.randn(d_model) + + # Layer 3 has NO attention keys — should be skipped, not crash + result = ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg) + + # Layers 0-2 should still have their attn keys (now refactored) + for l in range(3): + assert f"blocks.{l}.attn.W_Q" in result + assert f"blocks.{l}.attn.W_K" in result + + # Layer 3 should have no attn keys + assert f"blocks.3.attn.W_Q" not in result + + +# ============================================================================ +# Tests: weight distribution with ragged blocks +# ============================================================================ + + +class TestWeightDistributionRagged: + """Test that weight distribution handles heterogeneous real_components.""" + + def test_distribute_weights_skips_empty_blocks(self): + """Blocks without attn weights should receive no attn keys.""" + from transformer_lens.weight_processing import ProcessWeights + + # Build a minimal real_components mapping with ragged blocks + model = HybridModel() + adapter = MinimalAdapter(optional=True) + template = adapter.make_block_template() + + import copy + + blocks = [] + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + 
block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + # Construct state_dict with 'foo' weights for blocks 0-2 only + state_dict = {} + for i in range(3): + state_dict[f"blocks.{i}.foo.weight"] = torch.randn(4, 4) + for i in range(4): + state_dict[f"blocks.{i}.bar.weight"] = torch.randn(4, 4) + + # Build the component mapping + component_mapping = { + "blocks": ("layers", blocks), + } + + # This should not crash + ProcessWeights.distribute_weights_to_components( + state_dict=state_dict, + component_mapping=component_mapping, + ) + + +# ============================================================================ +# Helpers for bridge-level tests +# ============================================================================ + + +def _make_hybrid_bridge(): + """Build a minimal TransformerBridge with hybrid blocks for testing. + + Uses 'foo' and 'bar' as submodule names. Layers 0-2 have 'foo', layer 3 does not. + """ + import copy + + from transformer_lens.model_bridge.bridge import TransformerBridge + + model = HybridModel() + adapter = MinimalAdapter(optional=True) + template = adapter.make_block_template() + + blocks = nn.ModuleList() + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + bridge = TransformerBridge.__new__(TransformerBridge) + nn.Module.__init__(bridge) + bridge.add_module("blocks", blocks) + + # Minimal cfg for accumulated_bias + bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4})() + return bridge + + +class AttnAdapter(ArchitectureAdapter): + """Adapter using 'attn' as the optional submodule name (matches real adapters).""" + + def __init__(self): + self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})() + self.component_mapping = {} + + def make_block_template(self) -> BlockBridge: + return BlockBridge( + 
name="layers", + submodules={ + "bar": LinearBridge(name="bar"), + "attn": LinearBridge(name="foo", optional=True), + }, + ) + + +def _make_hybrid_bridge_with_attn(): + """Build a hybrid bridge where 'attn' is the optional submodule. + + Layers 0-2 have 'attn' (mapped from 'foo'), layer 3 does not. + Used for testing APIs that specifically look for 'attn' (composition scores, labels). + """ + import copy + + from transformer_lens.model_bridge.bridge import TransformerBridge + + model = HybridModel() + adapter = AttnAdapter() + template = adapter.make_block_template() + + blocks = nn.ModuleList() + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + bridge = TransformerBridge.__new__(TransformerBridge) + nn.Module.__init__(bridge) + bridge.add_module("blocks", blocks) + bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4, "n_heads": 2})() + return bridge + + +# ============================================================================ +# Tests: blocks_with uses _modules not hasattr +# ============================================================================ + + +class TestBlocksWithModulesCheck: + """blocks_with() should only find bridged submodules, not HF attrs.""" + + def test_does_not_find_hf_internal_attrs(self): + """blocks_with should not match HF attributes that aren't bridged.""" + bridge = _make_hybrid_bridge() + # 'bar' is a bridged submodule (in _modules), should be found + assert len(bridge.blocks_with("bar")) == 4 + # 'training' exists as an attr on nn.Module but is not a bridged submodule + assert len(bridge.blocks_with("training")) == 0 + + def test_finds_only_bridged_optional_submodules(self): + """Optional submodules should be found only on layers where they were bound.""" + bridge = _make_hybrid_bridge() + foo_blocks = bridge.blocks_with("foo") + assert [idx for 
idx, _ in foo_blocks] == [0, 1, 2] + + +# ============================================================================ +# Tests: accumulated_bias on hybrid models +# ============================================================================ + + +class TestAccumulatedBiasHybrid: + """accumulated_bias should not crash on hybrid models.""" + + def test_accumulated_bias_skips_non_attn_layers(self): + """Should not crash when some layers lack attention.""" + bridge = _make_hybrid_bridge() + # Should run without error through all 4 layers (layer 3 has no attn) + result = bridge.accumulated_bias(layer=4) + assert result.shape == (4,) + + def test_accumulated_bias_mlp_input_on_non_attn_layer(self): + """mlp_input=True on a non-attention layer should not crash.""" + bridge = _make_hybrid_bridge() + # Layer 3 has no attn — should still work with mlp_input=True + result = bridge.accumulated_bias(layer=3, mlp_input=True) + assert result.shape == (4,) + + +# ============================================================================ +# Tests: block_submodules and layer_types introspection +# ============================================================================ + + +class TestBlockIntrospection: + """Test layer introspection APIs.""" + + def test_block_submodules(self): + """block_submodules should list bridged submodules per layer.""" + bridge = _make_hybrid_bridge() + # Layer 0 has both foo and bar + subs_0 = bridge.block_submodules(0) + assert "foo" in subs_0 + assert "bar" in subs_0 + # Layer 3 has only bar + subs_3 = bridge.block_submodules(3) + assert "foo" not in subs_3 + assert "bar" in subs_3 + + def test_layer_types(self): + """layer_types should return a list with one entry per block.""" + bridge = _make_hybrid_bridge() + types = bridge.layer_types() + assert len(types) == 4 + # Layers 0-2 have 'foo', layer 3 does not + for i in range(3): + assert "foo" in types[i] + assert "foo" not in types[3] + + +# 
============================================================================ +# Tests: stack_params_for hybrid API +# ============================================================================ + + +class TestStackParamsFor: + """Test stack_params_for on hybrid bridges.""" + + def test_returns_correct_indices_and_tensors(self): + """stack_params_for should return only matching blocks.""" + bridge = _make_hybrid_bridge() + indices, stacked = bridge.stack_params_for("foo", "foo.proj.weight") + assert indices == [0, 1, 2] + assert stacked.shape[0] == 3 + + def test_raises_on_no_matching_blocks(self): + """Should raise ValueError when no blocks have the submodule.""" + bridge = _make_hybrid_bridge() + with pytest.raises(ValueError, match="No blocks have submodule"): + bridge.stack_params_for("nonexistent", "nonexistent.weight") + + +# ============================================================================ +# Tests: refactor guard validates all attn keys +# ============================================================================ + + +class TestRefactorGuardConsistency: + """Test that refactor raises on inconsistent attn keys (W_Q present, W_K missing).""" + + def test_raises_on_partial_attn_keys(self): + """If W_Q is present but W_K is missing, should raise ValueError.""" + from transformer_lens.config.TransformerLensConfig import TransformerLensConfig + from transformer_lens.weight_processing import ProcessWeights + + cfg = TransformerLensConfig( + n_layers=1, + n_heads=2, + d_head=4, + d_model=8, + n_ctx=16, + positional_embedding_type="standard", + ) + # Only W_Q present, missing W_K/W_V/W_O + state_dict = { + "blocks.0.attn.W_Q": torch.randn(2, 8, 4), + } + with pytest.raises(ValueError, match="Inconsistent attention weights"): + ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg) + + +# ============================================================================ +# Tests: __setattr__ whitelist includes optional +# 
============================================================================ + + +class TestSetAttrWhitelist: + """Test that 'optional' is in the __setattr__ whitelist.""" + + def test_optional_set_on_bridge_not_hf_model(self): + """Setting optional after set_original_component should stay on bridge.""" + comp = LinearBridge(name="test") + fake_hf = nn.Linear(4, 4, bias=False) + comp.set_original_component(fake_hf) + comp.optional = True + # Should be on the bridge, not on the HF module + assert comp.optional is True + assert not hasattr(fake_hf, "optional") + + +# ============================================================================ +# Tests: attn_head_labels matches composition scores dimensions +# ============================================================================ + + +class TestAttnHeadLabels: + """attn_head_labels should match all_composition_scores dimensions.""" + + def test_attn_head_labels_excludes_non_attn_layers(self): + """Labels should only cover attention layers, not SSM/linear-attn.""" + bridge = _make_hybrid_bridge_with_attn() + bridge.cfg.n_heads = 2 + labels = bridge.attn_head_labels + # 3 attention layers (0, 1, 2) * 2 heads = 6 labels + assert len(labels) == 6 + assert labels == ["L0H0", "L0H1", "L1H0", "L1H1", "L2H0", "L2H1"] + # Should NOT contain L3 (non-attention layer) + assert all("L3" not in lbl for lbl in labels) + + def test_all_head_labels_includes_all_layers(self): + """all_head_labels should still include every layer.""" + bridge = _make_hybrid_bridge_with_attn() + bridge.cfg.n_heads = 2 + labels = bridge.all_head_labels + # 4 layers * 2 heads = 8 labels + assert len(labels) == 8 + + +# ============================================================================ +# Tests: hook propagation through optional submodules +# ============================================================================ + + +class TestHookPropagation: + """Verify hooks fire on present optional submodules and don't exist on absent ones.""" + + 
def _build_hybrid_model_and_blocks(self): + """Build a hybrid model with setup done so hooks are wired.""" + import copy + + model = HybridModel() + adapter = MinimalAdapter(optional=True) + template = adapter.make_block_template() + + blocks = [] + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + + return model, blocks + + def test_hooks_fire_on_present_optional_submodule(self): + """hook_in and hook_out should fire on blocks where the optional submodule exists.""" + model, blocks = self._build_hybrid_model_and_blocks() + + # Block 0 has 'foo' — its hook_in and hook_out should fire + foo_bridge = blocks[0].foo + hook_in_fired = [] + hook_out_fired = [] + + foo_bridge.hook_in.add_hook(lambda tensor, hook: hook_in_fired.append(True) or tensor) + foo_bridge.hook_out.add_hook(lambda tensor, hook: hook_out_fired.append(True) or tensor) + + # Run a forward pass through the HF model's layer 0 + # Because replace_remote_component swapped model.layers[0].foo with the bridge, + # calling model.layers[0].foo(x) goes through LinearBridge.forward + x = torch.randn(1, 4) + _ = blocks[0].foo(x) + + assert len(hook_in_fired) == 1, "hook_in should fire on present optional submodule" + assert len(hook_out_fired) == 1, "hook_out should fire on present optional submodule" + + def test_absent_optional_submodule_has_no_hooks(self): + """Block 3 should not have 'foo' at all — no hooks to fire.""" + _, blocks = self._build_hybrid_model_and_blocks() + + # Block 3 lacks 'foo' — it shouldn't be in _modules + assert "foo" not in blocks[3]._modules + # Attempting to access hooks on the absent submodule should fail + assert not hasattr(blocks[3], "foo") + + def test_hooks_on_present_dont_affect_absent(self): + """Running all blocks should fire hooks only on blocks with the optional submodule.""" + model, blocks = 
self._build_hybrid_model_and_blocks() + + # Track which blocks fire foo.hook_out + fired_block_indices = [] + for i, block in enumerate(blocks): + if "foo" in block._modules: + block.foo.hook_out.add_hook( + lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor + ) + + # Run forward through all HF layers + x = torch.randn(1, 4) + for i, layer in enumerate(model.layers): + x = layer(x) + + # Hooks should fire on layers 0, 1, 2 (have foo) but not 3 + assert fired_block_indices == [0, 1, 2] + + def test_universal_submodule_hooks_fire_on_all_blocks(self): + """'bar' is universal — its hooks should fire on every block.""" + model, blocks = self._build_hybrid_model_and_blocks() + + fired_block_indices = [] + for i, block in enumerate(blocks): + block.bar.hook_out.add_hook( + lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor + ) + + x = torch.randn(1, 4) + for layer in model.layers: + x = layer(x) + + assert fired_block_indices == [0, 1, 2, 3] + + +# ============================================================================ +# Tests: CompositionScores tensor protocol +# ============================================================================ + + +class TestCompositionScoresProtocol: + """CompositionScores should behave like a tensor for existing research code.""" + + def _make_scores(self): + from transformer_lens.model_bridge.composition_scores import CompositionScores + + t = torch.randn(3, 2, 3, 2) + return CompositionScores(t, [0, 2, 5], ["L0H0", "L0H1", "L2H0", "L2H1", "L5H0", "L5H1"]) + + def test_shape(self): + cs = self._make_scores() + assert cs.shape == torch.Size([3, 2, 3, 2]) + + def test_device_and_dtype(self): + cs = self._make_scores() + assert cs.device == torch.device("cpu") + assert cs.dtype == torch.float32 + + def test_indexing_returns_tensor(self): + cs = self._make_scores() + sliced = cs[0, :, 1, :] + assert isinstance(sliced, torch.Tensor) + assert sliced.shape == (2, 2) + + def test_torch_isnan(self): + 
"""torch.isnan(scores) must work — used in existing integration tests.""" + cs = self._make_scores() + result = torch.isnan(cs) + assert isinstance(result, torch.Tensor) + assert result.shape == cs.shape + assert not result.any() + + def test_torch_where(self): + cs = self._make_scores() + result = torch.where(cs > 0, cs.scores, torch.zeros_like(cs.scores)) + assert isinstance(result, torch.Tensor) + + def test_comparison_gt(self): + cs = self._make_scores() + mask = cs > 0 + assert isinstance(mask, torch.Tensor) + assert mask.shape == cs.shape + + def test_comparison_ne(self): + """scores != 0 must return a tensor, not raise RuntimeError.""" + cs = self._make_scores() + result = cs != 0 + assert isinstance(result, torch.Tensor) + assert result.shape == cs.shape + + def test_comparison_eq(self): + cs = self._make_scores() + result = cs == 0 + assert isinstance(result, torch.Tensor) + + def test_tensor_method_abs(self): + """scores.abs() must work via __getattr__ delegation.""" + cs = self._make_scores() + result = cs.abs() + assert isinstance(result, torch.Tensor) + + def test_tensor_method_sum(self): + cs = self._make_scores() + result = cs.sum() + assert isinstance(result, torch.Tensor) + + def test_tensor_method_any(self): + cs = self._make_scores() + result = cs.any() + assert isinstance(result, torch.Tensor) + + def test_chained_indexing_and_method(self): + """scores[l1, :, l2, :].abs().sum() — the exact pattern from integration tests.""" + cs = self._make_scores() + result = cs[0, :, 1, :].abs().sum() + assert isinstance(result, torch.Tensor) + assert result.ndim == 0 # scalar + + def test_metadata_accessible(self): + cs = self._make_scores() + assert cs.layer_indices == [0, 2, 5] + assert len(cs.head_labels) == 6 + + def test_repr(self): + cs = self._make_scores() + r = repr(cs) + assert "CompositionScores" in r + assert "layer_indices" in r + + +# ============================================================================ +# Tests: get_bridge_params with 
hybrid blocks +# ============================================================================ + + +class TestGetBridgeParamsHybrid: + """get_bridge_params should skip attn keys for non-attention layers.""" + + def test_no_attn_keys_for_non_attn_layers(self): + from transformer_lens.model_bridge.get_params_util import get_bridge_params + + bridge = _make_hybrid_bridge_with_attn() + bridge.cfg.d_vocab = 10 + bridge.cfg.n_ctx = 8 + bridge.cfg.d_mlp = 16 + bridge.cfg.n_heads = 2 + bridge.cfg.d_head = 2 + + # Add minimal embed/unembed so get_bridge_params doesn't fail + bridge.embed = nn.Embedding(10, 4) + bridge.pos_embed = type("PE", (), {"weight": torch.randn(8, 4)})() + bridge.unembed = type( + "UE", + (), + { + "weight": torch.randn(10, 4), + "b_U": torch.zeros(10), + }, + )() + + params = get_bridge_params(bridge) + + # Blocks 0-2 have 'attn' — should have attn keys + for i in range(3): + # attn is mapped but internal structure (q/k/v/o) may not match + # our synthetic LinearBridge wrapping FakeSubmodule — so attn keys + # may or may not be present depending on structure. The key point + # is block 3 must NOT have attn keys. 
+ pass + + # Block 3 has NO 'attn' — must not have any attn keys + attn_keys_for_block3 = [k for k in params if k.startswith("blocks.3.attn.")] + assert len(attn_keys_for_block3) == 0, ( + f"Block 3 (non-attention layer) should have no attn keys, " + f"but found: {attn_keys_for_block3}" + ) diff --git a/transformer_lens/benchmarks/weight_processing.py b/transformer_lens/benchmarks/weight_processing.py index eeeabbb91..326c53df7 100644 --- a/transformer_lens/benchmarks/weight_processing.py +++ b/transformer_lens/benchmarks/weight_processing.py @@ -68,8 +68,16 @@ def benchmark_weight_processing( ) # Check weight centering - writing weights should be approximately centered - bridge_w_out = bridge.blocks[0].mlp.W_out - reference_w_out = reference_model.blocks[0].mlp.W_out # type: ignore[union-attr] + mlp_blocks = bridge.blocks_with("mlp") + if not mlp_blocks: + return BenchmarkResult( + name="weight_processing", + severity=BenchmarkSeverity.WARNING, + message="No blocks have MLP submodule — cannot check centering", + ) + _mlp_idx, mlp_block = mlp_blocks[0] + bridge_w_out = mlp_block.mlp.W_out + reference_w_out = reference_model.blocks[_mlp_idx].mlp.W_out # type: ignore[union-attr] bridge_mean = torch.mean(torch.abs(torch.mean(bridge_w_out, dim=-1, keepdim=True))) reference_mean = torch.mean( @@ -141,10 +149,20 @@ def benchmark_weight_sharing( if reference_model is not None: reference_original = reference_model(test_text, return_type="loss") + # Find first block with attention (hybrid models may not have attn on block 0) + bridge_attn_blocks = bridge.blocks_with("attn") + if not bridge_attn_blocks: + return BenchmarkResult( + name="weight_sharing", + severity=BenchmarkSeverity.INFO, + message="No blocks have attention submodule — skipping weight sharing check", + ) + bridge_attn_idx, bridge_attn_block = bridge_attn_blocks[0] + # Verify weights are identical before modification - bridge_W_V = torch.clone(cast(torch.Tensor, bridge.blocks[0].attn.W_V)) + bridge_W_V = 
torch.clone(cast(torch.Tensor, bridge_attn_block.attn.W_V)) reference_W_V = torch.clone( - cast(torch.Tensor, reference_model.blocks[0].attn.W_V) # type: ignore[union-attr] + cast(torch.Tensor, reference_model.blocks[bridge_attn_idx].attn.W_V) # type: ignore[union-attr] ) # Check if models have GQA (different head counts for K/V vs Q) @@ -188,8 +206,8 @@ def benchmark_weight_sharing( # Modify weights in both models with torch.no_grad(): - bridge.blocks[0].attn.W_V[0, :, :] = 0 # type: ignore[union-attr,operator] - reference_model.blocks[0].attn.W_V[0, :, :] = 0 # type: ignore[union-attr,operator] + bridge_attn_block.attn.W_V[0, :, :] = 0 # type: ignore[union-attr,operator] + reference_model.blocks[bridge_attn_idx].attn.W_V[0, :, :] = 0 # type: ignore[union-attr,operator] # Test modified losses bridge_modified = bridge(test_text, return_type="loss") @@ -200,8 +218,8 @@ def benchmark_weight_sharing( # Restore weights with torch.no_grad(): - bridge.blocks[0].attn.W_V.copy_(bridge_W_V) # type: ignore[union-attr,operator,arg-type] - reference_model.blocks[0].attn.W_V.copy_(reference_W_V) # type: ignore[union-attr,operator,arg-type] + bridge_attn_block.attn.W_V.copy_(bridge_W_V) # type: ignore[union-attr,operator,arg-type] + reference_model.blocks[bridge_attn_idx].attn.W_V.copy_(reference_W_V) # type: ignore[union-attr,operator,arg-type] diff = abs(bridge_change - reference_change) if diff < atol: @@ -220,16 +238,26 @@ def benchmark_weight_sharing( ) # No reference model - just verify modification has an effect - original_W_V = bridge.blocks[0].attn.W_V.clone() + # Find first block with attention (hybrid models may not have attn on block 0) + bridge_attn_blocks = bridge.blocks_with("attn") + if not bridge_attn_blocks: + return BenchmarkResult( + name="weight_sharing", + severity=BenchmarkSeverity.INFO, + message="No blocks have attention submodule — skipping weight sharing check", + ) + _ws_idx, ws_attn_block = bridge_attn_blocks[0] + + original_W_V = 
ws_attn_block.attn.W_V.clone() with torch.no_grad(): - bridge.blocks[0].attn.W_V[0, :, :] = 0 + ws_attn_block.attn.W_V[0, :, :] = 0 bridge_modified = bridge(test_text, return_type="loss") change = abs(bridge_modified - bridge_original) # Restore weights with torch.no_grad(): - bridge.blocks[0].attn.W_V.copy_(original_W_V) + ws_attn_block.attn.W_V.copy_(original_W_V) if change < 1e-6: return BenchmarkResult( @@ -274,16 +302,26 @@ def benchmark_weight_modification( # Get original loss original_loss = bridge(test_text, return_type="loss") + # Find first block with attention (hybrid models may not have attn on block 0) + wm_attn_blocks = bridge.blocks_with("attn") + if not wm_attn_blocks: + return BenchmarkResult( + name="weight_modification", + severity=BenchmarkSeverity.INFO, + message="No blocks have attention submodule — skipping weight modification check", + ) + _wm_idx, wm_attn_block = wm_attn_blocks[0] + # Modify W_V weights with torch.no_grad(): - original_w_v = bridge.blocks[0].attn.W_V.clone() + original_w_v = wm_attn_block.attn.W_V.clone() # Check dimensionality - GQA models may have 2D tensors instead of 3D if original_w_v.ndim == 3: # Standard 3D tensor: [n_heads, d_model, d_head] - bridge.blocks[0].attn.W_V[0, :, :] = 0 + wm_attn_block.attn.W_V[0, :, :] = 0 elif original_w_v.ndim == 2: # 2D tensor (e.g., GQA models): [n_heads * d_head, d_model] or similar - bridge.blocks[0].attn.W_V[0, :] = 0 + wm_attn_block.attn.W_V[0, :] = 0 else: return BenchmarkResult( name="weight_modification", @@ -298,7 +336,7 @@ def benchmark_weight_modification( except Exception as forward_error: # Restore weights before reporting error with torch.no_grad(): - bridge.blocks[0].attn.W_V.copy_(original_w_v) + wm_attn_block.attn.W_V.copy_(original_w_v) # Some models (e.g., models with complex attention mechanisms) may have # forward pass issues after weight modification. Report as skipped. 
@@ -311,7 +349,7 @@ def benchmark_weight_modification( # Restore weights with torch.no_grad(): - bridge.blocks[0].attn.W_V.copy_(original_w_v) + wm_attn_block.attn.W_V.copy_(original_w_v) # Loss should change change = abs(modified_loss - original_loss) @@ -321,13 +359,17 @@ def benchmark_weight_modification( # is separate from the combined QKV weight used in forward. # Try MLP weight modification as fallback. mlp_fallback_error = None + mlp_blocks = bridge.blocks_with("mlp") + mlp_block = mlp_blocks[0][1] if mlp_blocks else None try: + if mlp_block is None: + raise AttributeError("No blocks have mlp submodule") with torch.no_grad(): - original_mlp_w = bridge.blocks[0].mlp.out.weight.clone() - bridge.blocks[0].mlp.out.weight[0, :] = 0 + original_mlp_w = mlp_block.mlp.out.weight.clone() + mlp_block.mlp.out.weight[0, :] = 0 mlp_modified_loss = bridge(test_text, return_type="loss") with torch.no_grad(): - bridge.blocks[0].mlp.out.weight.copy_(original_mlp_w) + mlp_block.mlp.out.weight.copy_(original_mlp_w) mlp_change = abs(mlp_modified_loss - original_loss) if mlp_change > 1e-6: return BenchmarkResult( @@ -516,35 +558,51 @@ def benchmark_attention_output_centering( message="Skipped for tiny/test model (random weights don't center meaningfully)", ) - # Check if W_O exists and is accessible - if not hasattr(bridge.blocks[0].attn, "W_O"): + # Find blocks with attention (hybrid architectures may not have attn on all blocks) + attn_blocks = bridge.blocks_with("attn") + if not attn_blocks: return BenchmarkResult( name="attention_output_centering", severity=BenchmarkSeverity.WARNING, - message="W_O not accessible on bridge model", + message="No blocks have attention submodule", passed=False, ) - w_o = bridge.blocks[0].attn.W_O - - # Compute mean along output dimension - mean_abs = torch.mean(torch.abs(torch.mean(w_o, dim=-1))).item() + # Check W_O accessibility on first attention block + first_idx, first_attn_block = attn_blocks[0] + if not hasattr(first_attn_block.attn, 
"W_O"): + return BenchmarkResult( + name="attention_output_centering", + severity=BenchmarkSeverity.WARNING, + message="W_O not accessible on bridge model", + passed=False, + ) + # Compute mean across all attention blocks tolerance = 0.01 # 1% tolerance + worst_mean = 0.0 + for idx, block in attn_blocks: + w_o = block.attn.W_O + mean_abs = torch.mean(torch.abs(torch.mean(w_o, dim=-1))).item() + worst_mean = max(worst_mean, mean_abs) - if mean_abs < tolerance: + n_attn = len(attn_blocks) + n_total = len(bridge.blocks) + block_info = f" ({n_attn}/{n_total} blocks have attention)" if n_attn < n_total else "" + + if worst_mean < tolerance: return BenchmarkResult( name="attention_output_centering", severity=BenchmarkSeverity.INFO, - message=f"Attention output centering verified (mean={mean_abs:.6f})", - details={"mean": mean_abs, "tolerance": tolerance}, + message=f"Attention output centering verified (worst_mean={worst_mean:.6f}){block_info}", + details={"mean": worst_mean, "tolerance": tolerance, "n_attn_blocks": n_attn}, ) else: return BenchmarkResult( name="attention_output_centering", severity=BenchmarkSeverity.WARNING, - message=f"Attention output weights not well-centered (mean={mean_abs:.6f})", - details={"mean": mean_abs, "tolerance": tolerance}, + message=f"Attention output weights not well-centered (worst_mean={worst_mean:.6f}){block_info}", + details={"mean": worst_mean, "tolerance": tolerance, "n_attn_blocks": n_attn}, passed=False, ) @@ -743,8 +801,20 @@ def benchmark_value_bias_folding( }, ) + # Find blocks with attention (hybrid architectures may not have attn on all blocks) + attn_blocks = bridge.blocks_with("attn") + if not attn_blocks: + return BenchmarkResult( + name="value_bias_folding", + severity=BenchmarkSeverity.INFO, + message="No blocks have attention submodule (expected for hybrid models without mapped attn)", + details={"has_bias": False}, + ) + + first_idx, first_attn_block = attn_blocks[0] + # Check if b_V exists - if not 
hasattr(bridge.blocks[0].attn, "b_V"): + if not hasattr(first_attn_block.attn, "b_V"): return BenchmarkResult( name="value_bias_folding", severity=BenchmarkSeverity.INFO, @@ -752,7 +822,7 @@ def benchmark_value_bias_folding( details={"has_bias": False}, ) - b_v = bridge.blocks[0].attn.b_V + b_v = first_attn_block.attn.b_V if b_v is None: return BenchmarkResult( diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py index f23703234..372b53bf5 100644 --- a/transformer_lens/model_bridge/bridge.py +++ b/transformer_lens/model_bridge/bridge.py @@ -3,7 +3,9 @@ This module provides the bridge components that wrap remote model components and provide a consistent interface for accessing their weights and performing operations. """ +import logging import re +import warnings from contextlib import contextmanager from functools import lru_cache from typing import ( @@ -32,10 +34,17 @@ from transformer_lens.hook_points import HookPoint from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter from transformer_lens.model_bridge.component_setup import set_original_components +from transformer_lens.model_bridge.composition_scores import CompositionScores from transformer_lens.model_bridge.exceptions import StopAtLayerException from transformer_lens.model_bridge.generalized_components.base import ( GeneralizedComponent, ) +from transformer_lens.model_bridge.generalized_components.block import ( + _BLOCK_INTERNAL_MODULES, + _NORM_PREFIXES, + _VARIANT_SUBMODULE_SET, + VARIANT_SUBMODULE_NAMES, +) from transformer_lens.model_bridge.get_params_util import get_bridge_params from transformer_lens.utilities.aliases import resolve_alias from transformer_lens.utilities.devices import move_to_and_update_config @@ -47,6 +56,14 @@ _BLOCK_PATTERN = re.compile("blocks\\.(\\d+)") +def _resolve_attr_path(obj: nn.Module, attr_path: str) -> torch.Tensor: + """Walk a dot-separated attribute path and return the final tensor.""" + result = 
obj + for attr in attr_path.split("."): + result = getattr(result, attr) + return cast(torch.Tensor, result) + + def build_alias_to_canonical_map(hook_dict, prefix=""): """Build a mapping from alias hook names to their canonical names. @@ -247,7 +264,7 @@ def _set_processed_weight_attributes(self) -> None: if not hasattr(self, "blocks"): return for block in self.blocks: - if not hasattr(block, "attn"): + if "attn" not in block._modules: continue attn = block.attn if not (hasattr(attn, "q") and hasattr(attn.q, "weight")): @@ -1003,20 +1020,114 @@ def to_single_str_token(self, int_token: int) -> str: return str(token[0]) raise AssertionError("Expected a single string token.") + def blocks_with(self, submodule: str) -> List[Tuple[int, "GeneralizedComponent"]]: + """Return (index, block) pairs for blocks that have the named submodule. + + Hybrid architectures have heterogeneous blocks — some layers have + attention, others have SSM or linear attention, etc. Use this instead + of assuming blocks[0] is representative. + + Only returns blocks where the submodule was explicitly set up as a + bridged component (registered in _modules), not submodules that happen + to exist on the underlying HF model. + + Args: + submodule: Name of the submodule to check for (e.g., "attn", "mamba") + + Returns: + List of (layer_index, block) tuples for blocks that have the submodule. + """ + if not hasattr(self, "blocks"): + return [] + return [(i, block) for i, block in enumerate(self.blocks) if submodule in block._modules] + + def stack_params_for( + self, submodule: str, attr_path: str, reshape_fn: Optional[Callable] = None + ) -> Tuple[List[int], torch.Tensor]: + """Stack a parameter across blocks that have a specific submodule. + + For hybrid architectures where only some blocks have attention (or SSM, + etc.), this returns the stacked tensor for only matching blocks along + with their layer indices. 
+ + Args: + submodule: Submodule to filter on (e.g., "attn", "mamba") + attr_path: Dot-separated attr path from block (e.g., "attn.W_K") + reshape_fn: Optional function to reshape each weight before stacking + + Returns: + Tuple of (layer_indices, stacked_tensor) where layer_indices maps + position i in the tensor to the original layer index. + + Raises: + ValueError: If no blocks have the requested submodule. + """ + matching = self.blocks_with(submodule) + if not matching: + raise ValueError( + f"No blocks have submodule '{submodule}'. " + f"Available submodules can be checked with blocks_with()." + ) + indices: List[int] = [] + weights: List[torch.Tensor] = [] + for idx, block in matching: + w = _resolve_attr_path(block, attr_path) + if reshape_fn is not None: + w = reshape_fn(w) + weights.append(w) + indices.append(idx) + return indices, torch.stack(weights, dim=0) + def _stack_block_params( self, attr_path: str, reshape_fn: Optional[Callable] = None ) -> torch.Tensor: - """Stack a parameter across all blocks. + """Stack a parameter across all blocks, or across matching blocks for hybrids. + + For homogeneous models, returns a tensor of shape [n_layers, ...]. + For hybrid models where some blocks lack the requested submodule, + returns a tensor of shape [n_matching_blocks, ...] and emits a + one-time warning about the index mapping. Args: attr_path: Dot-separated attribute path from block (e.g., "attn.W_K") reshape_fn: Optional function to reshape each weight before stacking + + Note: + The guard checks only that the first path segment is a bridged + submodule (in _modules). Deeper segments resolve via standard + getattr, which may fall through to HF model attributes. This is + intentional — properties like W_Q are exposed via __getattr__ + delegation to the underlying weight tensors. 
""" - weights = [] - for block in self.blocks: - w = block - for attr in attr_path.split("."): - w = getattr(w, attr) + first_attr = attr_path.split(".")[0] + matching_blocks = [ + (i, block) for i, block in enumerate(self.blocks) if first_attr in block._modules + ] + + if len(matching_blocks) == 0: + raise AttributeError( + f"No blocks have submodule '{first_attr}'. " + f"Use bridge.blocks_with('{first_attr}') to check availability." + ) + + if len(matching_blocks) < len(self.blocks): + indices = [i for i, _ in matching_blocks] + logging.warning( + "Hybrid model: only %d/%d blocks have '%s'. Returning stacked tensor " + "for layers %s only. Tensor index i corresponds to original layer " + "indices[i], not layer i. For explicit index mapping, use " + "bridge.stack_params_for('%s', '%s').", + len(matching_blocks), + len(self.blocks), + first_attr, + indices, + first_attr, + attr_path, + ) + + weights: List[torch.Tensor] = [] + for _, block in matching_blocks: + w = _resolve_attr_path(block, attr_path) if reshape_fn is not None: w = reshape_fn(w) weights.append(w) @@ -1120,12 +1231,46 @@ def W_E(self) -> torch.Tensor: @property def QK(self): + """QK circuit as a FactoredMatrix. + + On hybrid models, returns the circuit for attention layers only (with + a warning about index mapping). For explicit index control, use + QK_for_attn_layers() which returns (layer_indices, FactoredMatrix). + """ return FactoredMatrix(self.W_Q, self.W_K.transpose(-2, -1)) @property def OV(self): + """OV circuit as a FactoredMatrix. + + On hybrid models, returns the circuit for attention layers only (with + a warning about index mapping). For explicit index control, use + OV_for_attn_layers() which returns (layer_indices, FactoredMatrix). + """ return FactoredMatrix(self.W_V, self.W_O) + def QK_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]: + """QK circuit for attention layers only (hybrid-safe). 
+ + Returns: + Tuple of (layer_indices, FactoredMatrix) where layer_indices maps + position i in the matrix to the original layer index. + """ + q_indices, W_Q = self.stack_params_for("attn", "attn.W_Q", self._reshape_qkv) + _, W_K = self.stack_params_for("attn", "attn.W_K", self._reshape_qkv) + return q_indices, FactoredMatrix(W_Q, W_K.transpose(-2, -1)) + + def OV_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]: + """OV circuit for attention layers only (hybrid-safe). + + Returns: + Tuple of (layer_indices, FactoredMatrix) where layer_indices maps + position i in the matrix to the original layer index. + """ + v_indices, W_V = self.stack_params_for("attn", "attn.W_V", self._reshape_qkv) + _, W_O = self.stack_params_for("attn", "attn.W_O", self._reshape_o) + return v_indices, FactoredMatrix(W_V, W_O) + # ------------------------------------------------------------------ # Mechanistic interpretability analysis methods # ------------------------------------------------------------------ @@ -1169,18 +1314,56 @@ def tokens_to_residual_directions( residual_direction = self.W_U[:, token] return residual_direction + # Output bias attribute names by variant type. Attention uses "b_O" + # (a processed-weight alias). SSM/linear-attn variants use their output + # projection's bias. Map variant name → list of attribute paths to check. + _VARIANT_OUTPUT_BIAS_ATTRS: Dict[str, tuple] = { + "attn": ("b_O",), + "linear_attn": ("out_proj.bias",), + "mamba": ("out_proj.bias",), + "mixer": ("out_proj.bias",), + "ssm": ("out_proj.bias",), + } + + def _get_block_variant_bias(self, block: "GeneralizedComponent") -> Optional[torch.Tensor]: + """Get the output bias from whatever variant submodule this block has. + + Each variant type has its own output bias attribute name — attention + uses b_O while SSM variants use out_proj.bias. Returns the first + found, or None if the variant has no output bias. 
+ """ + for name in VARIANT_SUBMODULE_NAMES: + if name not in block._modules: + continue + variant = block._modules[name] + for attr_path in self._VARIANT_OUTPUT_BIAS_ATTRS.get(name, ()): + obj = variant + try: + for attr in attr_path.split("."): + obj = getattr(obj, attr) + except AttributeError: + continue + if obj is not None and isinstance(obj, torch.Tensor): + return obj + return None + def accumulated_bias( self, layer: int, mlp_input: bool = False, include_mlp_biases: bool = True, ) -> torch.Tensor: - """Sum of attention and MLP output biases up to the input of a given layer. + """Sum of biases that contribute to the residual stream up to a given layer. + + Includes output biases from whatever variant submodule each block has + (attention, Mamba, linear attention, etc.) plus MLP output biases. + For hybrid models, non-attention layers still contribute their variant + submodule's output bias to the residual stream. Args: layer: Layer number in [0, n_layers]. 0 means no layers, n_layers means all. - mlp_input: If True, include the attention output bias of the target layer - (i.e. bias up to the MLP input of that layer). + mlp_input: If True, include the variant submodule's output bias of + the target layer (i.e. bias up to the MLP input of that layer). include_mlp_biases: Whether to include MLP biases. Useful to set False when expanding attn_out into individual heads but keeping mlp_out as-is. 
@@ -1190,55 +1373,163 @@ def accumulated_bias( accumulated = torch.zeros(self.cfg.d_model, device=self.cfg.device) for i in range(layer): block = self.blocks[i] - b_O = getattr(block.attn, "b_O", None) + b_O = self._get_block_variant_bias(block) if b_O is not None: accumulated = accumulated + b_O - if include_mlp_biases: + if include_mlp_biases and "mlp" in block._modules: b_out = getattr(block.mlp, "b_out", None) if b_out is not None: accumulated = accumulated + b_out if mlp_input: assert layer < self.cfg.n_layers, "Cannot include attn_bias from beyond the final layer" block = self.blocks[layer] - b_O = getattr(block.attn, "b_O", None) + b_O = self._get_block_variant_bias(block) if b_O is not None: accumulated = accumulated + b_O return accumulated - def all_composition_scores(self, mode: str) -> torch.Tensor: - """Composition scores for all pairs of heads. + def all_composition_scores(self, mode: str) -> CompositionScores: + """Composition scores for all pairs of attention heads. + + Returns a ``CompositionScores`` containing the scores tensor, the + original layer indices, and human-readable head labels. The scores + tensor has shape (n_attn_layers, n_heads, n_attn_layers, n_heads) and + is upper triangular on the layer axes. - Returns an (n_layers, n_heads, n_layers, n_heads) tensor that is upper - triangular on the layer axes (a head can only compose with later heads). + For hybrid models, only attention layers are included. The returned + ``layer_indices`` maps tensor position *i* back to the original layer + number so that results cannot be silently misinterpreted. See https://transformer-circuits.pub/2021/framework/index.html Args: mode: One of "Q", "K", "V" — which composition type to compute. 
""" - left = self.OV + # Single blocks_with call — all weight stacking uses these same blocks + attn_blocks = self.blocks_with("attn") + if not attn_blocks: + raise ValueError("No attention layers found — cannot compute composition scores.") + + indices = [idx for idx, _ in attn_blocks] + blocks_list = [block for _, block in attn_blocks] + + def _stack(attr_path: str, reshape_fn: Optional[Callable] = None) -> torch.Tensor: + weights: List[torch.Tensor] = [] + for block in blocks_list: + w = _resolve_attr_path(block, attr_path) + if reshape_fn is not None: + w = reshape_fn(w) + weights.append(w) + return torch.stack(weights, dim=0) + + W_V = _stack("attn.W_V", self._reshape_qkv) + W_O = _stack("attn.W_O", self._reshape_o) + left = FactoredMatrix(W_V, W_O) + if mode == "Q": - right = self.QK + W_Q = _stack("attn.W_Q", self._reshape_qkv) + W_K = _stack("attn.W_K", self._reshape_qkv) + right = FactoredMatrix(W_Q, W_K.transpose(-2, -1)) elif mode == "K": - right = self.QK.T + W_Q = _stack("attn.W_Q", self._reshape_qkv) + W_K = _stack("attn.W_K", self._reshape_qkv) + right = FactoredMatrix(W_Q, W_K.transpose(-2, -1)).T elif mode == "V": - right = self.OV + right = left else: raise ValueError(f"mode must be one of ['Q', 'K', 'V'] not {mode}") scores = utils.composition_scores(left, right, broadcast_dims=True) - mask = ( - torch.arange(self.cfg.n_layers, device=self.cfg.device)[:, None, None, None] - < torch.arange(self.cfg.n_layers, device=self.cfg.device)[None, None, :, None] - ) + n_attn = len(indices) + idx_tensor = torch.arange(n_attn, device=self.cfg.device) + mask = idx_tensor[:, None, None, None] < idx_tensor[None, None, :, None] scores = torch.where(mask, scores, torch.zeros_like(scores)) - return scores + + labels = [f"L{l}H{h}" for l in indices for h in range(self.cfg.n_heads)] + return CompositionScores(scores=scores, layer_indices=indices, head_labels=labels) + + def composition_layer_indices(self) -> List[int]: + """Return original layer indices for attention 
layers. + + Maps position i in all_composition_scores() output back to the + original layer number. For homogeneous models, returns [0, 1, ..., n-1]. + For hybrid models, returns only the attention layer indices. + """ + return [idx for idx, _ in self.blocks_with("attn")] + + def block_hooks(self, layer_idx: int) -> List[str]: + """Return all hook point names available on a specific block. + + Useful for hybrid architectures where different layers have different + hookable submodules — e.g., attention layers expose hook_q/hook_k/etc. + while SSM layers expose hook_in_proj/hook_conv/etc. + + Args: + layer_idx: Layer index to inspect. + + Returns: + Sorted list of hook names (e.g., ["hook_in", "hook_out", "attn.hook_q", ...]). + """ + prefix = f"blocks.{layer_idx}." + return sorted(name[len(prefix) :] for name in self.hook_dict if name.startswith(prefix)) + + def block_submodules(self, layer_idx: int) -> List[str]: + """Return names of bridged submodules on a specific block. + + Args: + layer_idx: Layer index to inspect. + + Returns: + List of submodule names (e.g., ["ln1", "ln2", "attn", "mlp"]). + """ + block = self.blocks[layer_idx] + return [name for name in block._modules if name not in _BLOCK_INTERNAL_MODULES] + + def layer_types(self) -> List[str]: + """Return a human-readable layer type for each block. + + Inspects which bridged submodules are present on each block to infer + the layer type. For homogeneous models, all entries will be the same. + Variant submodule names are defined in + ``generalized_components.block.VARIANT_SUBMODULE_NAMES``. + + Labels are deterministic: variants appear in VARIANT_SUBMODULE_NAMES + order, universals are sorted alphabetically. + + Returns: + List of strings like ["attn+mlp", "ssm+mlp", "attn+mlp", ...]. 
+ """ + types = [] + for block in self.blocks: + # Variants in canonical order (tuple iteration = stable) + variants = [n for n in VARIANT_SUBMODULE_NAMES if n in block._modules] + universals = sorted( + n + for n in block._modules + if n not in _VARIANT_SUBMODULE_SET + and n not in _BLOCK_INTERNAL_MODULES + and not n.startswith(_NORM_PREFIXES) + ) + parts = variants + universals + types.append("+".join(parts) if parts else "unknown") + return types @property def all_head_labels(self) -> list[str]: """Human-readable labels for all attention heads, e.g. ['L0H0', 'L0H1', ...].""" return [f"L{l}H{h}" for l in range(self.cfg.n_layers) for h in range(self.cfg.n_heads)] + @property + def attn_head_labels(self) -> list[str]: + """Labels for attention heads only, matching all_composition_scores() dimensions. + + For homogeneous models, identical to all_head_labels. For hybrid models, + only includes heads from attention layers (skips SSM/linear-attn layers). + """ + return [ + f"L{l}H{h}" for l in self.composition_layer_indices() for h in range(self.cfg.n_heads) + ] + def parameters(self, recurse: bool = True) -> Iterator[nn.Parameter]: """Returns parameters following standard PyTorch semantics. diff --git a/transformer_lens/model_bridge/component_setup.py b/transformer_lens/model_bridge/component_setup.py index d32f787df..79d2abc2a 100644 --- a/transformer_lens/model_bridge/component_setup.py +++ b/transformer_lens/model_bridge/component_setup.py @@ -2,8 +2,11 @@ "Component setup utilities for creating and configuring bridged components." 
import copy +import logging from typing import TYPE_CHECKING, Any, cast +logger = logging.getLogger(__name__) + import torch.nn as nn from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter @@ -67,6 +70,7 @@ def setup_submodules( architecture_adapter: The architecture adapter original_model: The original model to get components from """ + skipped_optional: list[str] = [] for module_name, submodule in component.submodules.items(): if submodule.is_list_item: if submodule.name is None: @@ -95,9 +99,39 @@ def setup_submodules( original_subcomponent = original_model else: remote_path = submodule.name - original_subcomponent = architecture_adapter.get_remote_component( - original_model, remote_path - ) + is_optional = getattr(submodule, "optional", False) + # Fast path: if the first path segment is absent, skip + # immediately. This catches the common hybrid case (e.g., + # "self_attn" absent on an SSM layer) without entering + # get_remote_component. + first_segment = remote_path.split(".")[0] + if is_optional and not hasattr(original_model, first_segment): + logger.debug( + "Optional submodule '%s' (path '%s') absent on %s — skipping", + module_name, + remote_path, + getattr(component, "name", "unknown"), + ) + skipped_optional.append(module_name) + continue # hybrid layer lacks this submodule; skip binding + # Full resolution — also catches deeper path failures + # (e.g., "self_attn.q_proj" where self_attn exists as a + # stub but q_proj is missing). 
+ try: + original_subcomponent = architecture_adapter.get_remote_component( + original_model, remote_path + ) + except AttributeError: + if is_optional: + logger.debug( + "Optional submodule '%s' (path '%s') partially absent on %s — skipping", + module_name, + remote_path, + getattr(component, "name", "unknown"), + ) + skipped_optional.append(module_name) + continue + raise submodule.set_original_component(original_subcomponent) setup_submodules(submodule, architecture_adapter, original_subcomponent) if submodule.name is not None: @@ -111,6 +145,12 @@ def setup_submodules( if not submodule.is_list_item and submodule.name is not None: component.real_components[module_name] = (submodule.name, submodule) + # Remove skipped optional submodules from the template so that + # architecture_adapter traversal code (which reads .submodules) doesn't + # find them and try to resolve against the HF model. + for name in skipped_optional: + component.submodules.pop(name, None) + def setup_components( components: dict[str, Any], diff --git a/transformer_lens/model_bridge/composition_scores.py b/transformer_lens/model_bridge/composition_scores.py new file mode 100644 index 000000000..9073fddb2 --- /dev/null +++ b/transformer_lens/model_bridge/composition_scores.py @@ -0,0 +1,102 @@ +"""CompositionScores — tensor-like container for composition score results.""" +from typing import List + +import torch + + +class CompositionScores: + """Composition scores bundled with layer-index metadata. + + Behaves like a tensor for backward compatibility — indexing, .shape, + arithmetic, and ``torch.*`` namespace functions all delegate to the + underlying scores tensor via ``__torch_function__``. The additional + ``layer_indices`` and ``head_labels`` attributes provide metadata that + prevents silent misinterpretation of indices on hybrid models. + + For hybrid models, the scores tensor has shape + (n_attn_layers, n_heads, n_attn_layers, n_heads) where n_attn_layers + may be less than n_layers. 
``layer_indices`` maps tensor position i + to the original layer number. + + Attributes: + scores: Upper-triangular composition score tensor. + layer_indices: Original layer numbers for each position in scores. + E.g., [0, 2, 5] means position 0 = layer 0, position 1 = layer 2, etc. + head_labels: Labels like ["L0H0", "L0H1", "L2H0", ...] matching scores dims. + """ + + def __init__(self, scores: torch.Tensor, layer_indices: List[int], head_labels: List[str]): + self.scores = scores + self.layer_indices = layer_indices + self.head_labels = head_labels + + # --- Tensor protocol --- + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + """Delegate torch.* calls (torch.isnan, torch.where, etc.) to .scores.""" + if kwargs is None: + kwargs = {} + # Unwrap any CompositionScores args to their underlying tensor + unwrapped_args = tuple(a.scores if isinstance(a, CompositionScores) else a for a in args) + unwrapped_kwargs = { + k: v.scores if isinstance(v, CompositionScores) else v for k, v in kwargs.items() + } + return func(*unwrapped_args, **unwrapped_kwargs) + + @property + def shape(self) -> torch.Size: + return self.scores.shape + + @property + def device(self) -> torch.device: + return self.scores.device + + @property + def dtype(self) -> torch.dtype: + return self.scores.dtype + + # Python 3 automatically sets __hash__ = None when __eq__ is defined, + # making instances unhashable. No explicit __hash__ needed. + + def __getitem__(self, key): + return self.scores[key] + + def __getattr__(self, name): + # Delegate tensor methods (.abs(), .sum(), .any(), etc.) to .scores. + # Guard against infinite recursion during pickling/unpickling where + # self.scores may not exist yet. 
+ try: + scores = object.__getattribute__(self, "scores") + except AttributeError: + raise AttributeError(name) from None + return getattr(scores, name) + + def __gt__(self, other): + return self.scores > other + + def __lt__(self, other): + return self.scores < other + + def __ge__(self, other): + return self.scores >= other + + def __le__(self, other): + return self.scores <= other + + def __eq__(self, other): + if isinstance(other, CompositionScores): + return self.scores == other.scores + return self.scores == other + + def __ne__(self, other): + if isinstance(other, CompositionScores): + return self.scores != other.scores + return self.scores != other + + def __repr__(self) -> str: + return ( + f"CompositionScores(shape={self.shape}, " + f"layer_indices={self.layer_indices}, " + f"n_head_labels={len(self.head_labels)})" + ) diff --git a/transformer_lens/model_bridge/generalized_components/base.py b/transformer_lens/model_bridge/generalized_components/base.py index db270a644..1af033efb 100644 --- a/transformer_lens/model_bridge/generalized_components/base.py +++ b/transformer_lens/model_bridge/generalized_components/base.py @@ -34,6 +34,7 @@ def __init__( submodules: Optional[Dict[str, "GeneralizedComponent"]] = None, conversion_rule: Optional[BaseTensorConversion] = None, hook_alias_overrides: Optional[Dict[str, str]] = None, + optional: bool = False, ): """Initialize the generalized component. @@ -45,12 +46,17 @@ def __init__( hook_alias_overrides: Optional dictionary to override default hook aliases. For example, {"hook_attn_out": "ln1_post.hook_out"} will make hook_attn_out point to ln1_post.hook_out instead of the default value in self.hook_aliases. + optional: If True, this entire subtree may be absent on some layers. + When the remote model lacks this component, setup will skip it + cleanly instead of raising AttributeError. Used for hybrid + architectures where layers have structurally different submodules. 
""" super().__init__() self.name = name self.config = config self.submodules = submodules or {} self.conversion_rule = conversion_rule + self.optional = optional self._hook_registry: Dict[str, HookPoint] = {} self._hook_alias_registry: Dict[str, Union[str, List[str]]] = {} self._property_alias_registry: Dict[str, str] = {} @@ -337,6 +343,7 @@ def __setattr__(self, name: str, value: Any) -> None: "conversion_rule", "compatibility_mode", "disable_warnings", + "optional", ]: super().__setattr__(name, value) return diff --git a/transformer_lens/model_bridge/generalized_components/block.py b/transformer_lens/model_bridge/generalized_components/block.py index 48147a9d2..1005fd4f2 100644 --- a/transformer_lens/model_bridge/generalized_components/block.py +++ b/transformer_lens/model_bridge/generalized_components/block.py @@ -15,6 +15,21 @@ GeneralizedComponent, ) +# Submodule names that represent layer-type variants in hybrid architectures. +# Used by layer_types() for classification and _get_block_variant_bias() for +# bias accumulation. Adapters that introduce new variant types should add +# their submodule name here. Ordered tuple for deterministic iteration +# (matters when a block has multiple variants during development/testing). +VARIANT_SUBMODULE_NAMES: tuple[str, ...] = ("attn", "linear_attn", "mamba", "mixer", "ssm") +_VARIANT_SUBMODULE_SET: frozenset[str] = frozenset(VARIANT_SUBMODULE_NAMES) + +# Internal block modules excluded from submodule introspection (hook points +# and the wrapped HF component are infrastructure, not user-facing submodules). +_BLOCK_INTERNAL_MODULES: frozenset[str] = frozenset({"hook_in", "hook_out", "_original_component"}) + +# Prefixes for normalization modules excluded from layer_types() labels. +_NORM_PREFIXES: tuple[str, ...] = ("ln", "layer_norm", "norm", "rms") + class BlockBridge(GeneralizedComponent): """Bridge component for transformer blocks. 
diff --git a/transformer_lens/model_bridge/get_params_util.py b/transformer_lens/model_bridge/get_params_util.py index f63ab9386..f27e3a97f 100644 --- a/transformer_lens/model_bridge/get_params_util.py +++ b/transformer_lens/model_bridge/get_params_util.py @@ -1,8 +1,11 @@ """Utility function for getting model parameters in TransformerLens format.""" +import logging from typing import Dict import torch +logger = logging.getLogger(__name__) + def _get_n_kv_heads(cfg) -> int: """Resolve the number of key/value heads, falling back to n_heads.""" @@ -36,14 +39,17 @@ def _get_or_create_bias(bias, n_heads: int, d_head: int, device, dtype) -> torch def get_bridge_params(bridge) -> Dict[str, torch.Tensor]: """Access to model parameters in the format expected by SVDInterpreter. - For missing weights, returns zero tensors of appropriate shape instead of raising exceptions. - This ensures compatibility across different model architectures. + For hybrid architectures, only layers with attention get attention keys + (W_Q, W_K, etc.). Non-attention layers (SSM, linear-attention) are skipped + rather than filled with zeros — this prevents downstream consumers like + SVDInterpreter from treating synthetic zeros as real weights. Args: bridge: TransformerBridge instance Returns: - dict: Dictionary of parameter tensors with TransformerLens naming convention + dict: Dictionary of parameter tensors with TransformerLens naming convention. + For hybrid models, attention keys only exist for layers that have attention. 
Raises: ValueError: If configuration is inconsistent (e.g., cfg.n_layers != len(blocks)) @@ -51,22 +57,15 @@ def get_bridge_params(bridge) -> Dict[str, torch.Tensor]: params_dict = {} def _get_device_dtype(): - device = bridge.cfg.device if hasattr(bridge.cfg, "device") else torch.device("cpu") + """Infer device/dtype from the first available model parameter.""" + device = getattr(bridge.cfg, "device", None) or torch.device("cpu") dtype = torch.float32 try: - device = bridge.embed.weight.device - dtype = bridge.embed.weight.dtype - except AttributeError: - try: - device = bridge.pos_embed.weight.device - dtype = bridge.pos_embed.weight.dtype - except AttributeError: - if len(bridge.blocks) > 0: - try: - device = bridge.blocks[0].attn.q.weight.device - dtype = bridge.blocks[0].attn.q.weight.dtype - except AttributeError: - pass + first_param = next(bridge.parameters()) + device = first_param.device + dtype = first_param.dtype + except (StopIteration, TypeError, AttributeError): + pass return (device, dtype) try: @@ -89,72 +88,59 @@ def _get_device_dtype(): f"Configuration mismatch: cfg.n_layers={bridge.cfg.n_layers} but only {len(bridge.blocks)} blocks found. Layer {layer_idx} does not exist." ) block = bridge.blocks[layer_idx] + + # Only extract attention params from blocks that have attention. + # Non-attention layers (SSM, linear-attention) are skipped entirely + # rather than filled with zeros — this prevents consumers like + # SVDInterpreter from treating synthetic zeros as real weights. 
try: - w_q = block.attn.q.weight - w_k = block.attn.k.weight - w_v = block.attn.v.weight - w_o = block.attn.o.weight - if w_q.shape == (bridge.cfg.d_model, bridge.cfg.d_model): - d_head = bridge.cfg.d_model // bridge.cfg.n_heads - w_q = w_q.reshape(bridge.cfg.n_heads, bridge.cfg.d_model, d_head) - w_o = w_o.reshape(bridge.cfg.n_heads, d_head, bridge.cfg.d_model) - device, dtype = _get_device_dtype() - w_k = _reshape_kv_weight(w_k, bridge.cfg, device, dtype) - w_v = _reshape_kv_weight(w_v, bridge.cfg, device, dtype) - params_dict[f"blocks.{layer_idx}.attn.W_Q"] = w_q - params_dict[f"blocks.{layer_idx}.attn.W_K"] = w_k - params_dict[f"blocks.{layer_idx}.attn.W_V"] = w_v - params_dict[f"blocks.{layer_idx}.attn.W_O"] = w_o - device, dtype = _get_device_dtype() - n_kv_heads = _get_n_kv_heads(bridge.cfg) - params_dict[f"blocks.{layer_idx}.attn.b_Q"] = _get_or_create_bias( - block.attn.q.bias, bridge.cfg.n_heads, bridge.cfg.d_head, device, dtype - ) - params_dict[f"blocks.{layer_idx}.attn.b_K"] = _get_or_create_bias( - block.attn.k.bias, n_kv_heads, bridge.cfg.d_head, device, dtype - ) - params_dict[f"blocks.{layer_idx}.attn.b_V"] = _get_or_create_bias( - block.attn.v.bias, n_kv_heads, bridge.cfg.d_head, device, dtype - ) - if block.attn.o.bias is not None: - params_dict[f"blocks.{layer_idx}.attn.b_O"] = block.attn.o.bias - else: + has_attn = "attn" in block._modules + except (TypeError, AttributeError): + # Mock objects or non-nn.Module blocks: fall back to hasattr + has_attn = hasattr(block, "attn") + if has_attn: + try: + w_q = block.attn.q.weight + w_k = block.attn.k.weight + w_v = block.attn.v.weight + w_o = block.attn.o.weight + if w_q.shape == (bridge.cfg.d_model, bridge.cfg.d_model): + d_head = bridge.cfg.d_model // bridge.cfg.n_heads + w_q = w_q.reshape(bridge.cfg.n_heads, bridge.cfg.d_model, d_head) + w_o = w_o.reshape(bridge.cfg.n_heads, d_head, bridge.cfg.d_model) + device, dtype = _get_device_dtype() + w_k = _reshape_kv_weight(w_k, bridge.cfg, device, dtype) 
+ w_v = _reshape_kv_weight(w_v, bridge.cfg, device, dtype) + params_dict[f"blocks.{layer_idx}.attn.W_Q"] = w_q + params_dict[f"blocks.{layer_idx}.attn.W_K"] = w_k + params_dict[f"blocks.{layer_idx}.attn.W_V"] = w_v + params_dict[f"blocks.{layer_idx}.attn.W_O"] = w_o device, dtype = _get_device_dtype() - params_dict[f"blocks.{layer_idx}.attn.b_O"] = torch.zeros( - bridge.cfg.d_model, device=device, dtype=dtype + n_kv_heads = _get_n_kv_heads(bridge.cfg) + params_dict[f"blocks.{layer_idx}.attn.b_Q"] = _get_or_create_bias( + block.attn.q.bias, bridge.cfg.n_heads, bridge.cfg.d_head, device, dtype + ) + params_dict[f"blocks.{layer_idx}.attn.b_K"] = _get_or_create_bias( + block.attn.k.bias, n_kv_heads, bridge.cfg.d_head, device, dtype + ) + params_dict[f"blocks.{layer_idx}.attn.b_V"] = _get_or_create_bias( + block.attn.v.bias, n_kv_heads, bridge.cfg.d_head, device, dtype + ) + if block.attn.o.bias is not None: + params_dict[f"blocks.{layer_idx}.attn.b_O"] = block.attn.o.bias + else: + device, dtype = _get_device_dtype() + params_dict[f"blocks.{layer_idx}.attn.b_O"] = torch.zeros( + bridge.cfg.d_model, device=device, dtype=dtype + ) + except AttributeError as e: + logger.debug( + "Block %d has 'attn' in _modules but attention params could not " + "be extracted (missing q/k/v/o?): %s — skipping attention weights " + "for this layer", + layer_idx, + e, ) - except AttributeError: - device, dtype = _get_device_dtype() - expected_qkv_shape = (bridge.cfg.n_heads, bridge.cfg.d_model, bridge.cfg.d_head) - expected_o_shape = (bridge.cfg.n_heads, bridge.cfg.d_head, bridge.cfg.d_model) - expected_q_bias_shape = (bridge.cfg.n_heads, bridge.cfg.d_head) - expected_o_bias_shape = (bridge.cfg.d_model,) - n_kv_heads = _get_n_kv_heads(bridge.cfg) - expected_kv_bias_shape = (n_kv_heads, bridge.cfg.d_head) - params_dict[f"blocks.{layer_idx}.attn.W_Q"] = torch.zeros( - *expected_qkv_shape, device=device, dtype=dtype - ) - params_dict[f"blocks.{layer_idx}.attn.W_K"] = torch.zeros( - 
*expected_qkv_shape, device=device, dtype=dtype - ) - params_dict[f"blocks.{layer_idx}.attn.W_V"] = torch.zeros( - *expected_qkv_shape, device=device, dtype=dtype - ) - params_dict[f"blocks.{layer_idx}.attn.W_O"] = torch.zeros( - *expected_o_shape, device=device, dtype=dtype - ) - params_dict[f"blocks.{layer_idx}.attn.b_Q"] = torch.zeros( - *expected_q_bias_shape, device=device, dtype=dtype - ) - params_dict[f"blocks.{layer_idx}.attn.b_K"] = torch.zeros( - *expected_kv_bias_shape, device=device, dtype=dtype - ) - params_dict[f"blocks.{layer_idx}.attn.b_V"] = torch.zeros( - *expected_kv_bias_shape, device=device, dtype=dtype - ) - params_dict[f"blocks.{layer_idx}.attn.b_O"] = torch.zeros( - *expected_o_bias_shape, device=device, dtype=dtype - ) try: mlp_in = getattr(block.mlp, "in", None) or getattr(block.mlp, "input", None) if mlp_in is None: diff --git a/transformer_lens/weight_processing.py b/transformer_lens/weight_processing.py index c05e8706a..6f0489f21 100644 --- a/transformer_lens/weight_processing.py +++ b/transformer_lens/weight_processing.py @@ -1698,6 +1698,21 @@ def refactor_factored_attn_matrices( b_V_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_V", adapter) b_O_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_O", adapter) + # Skip layers without attention weights (hybrid architectures where + # some layers are SSM/linear-attention and lack Q/K/V/O entirely). + # Other weight-processing loops (center_writing_weights, fold_value_biases, + # fold_layer_norm) already guard with `if key in state_dict:` checks. + if W_Q_key not in state_dict: + continue + # All four weight matrices must be present if Q is present + for _required_key in [W_K_key, W_V_key, W_O_key]: + if _required_key not in state_dict: + raise ValueError( + f"Inconsistent attention weights at layer {l}: " + f"'{W_Q_key}' found but '{_required_key}' missing. " + f"All of W_Q, W_K, W_V, W_O must be present together." 
+ ) + # W_QK = W_Q @ W_K.T # Concatenate biases to make a d_model+1 input dimension W_Q = ProcessWeights.convert_tensor_to_tl_format( From 72d57a2b80e04f6fb8fa460498ebefa7921556c2 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Tue, 14 Apr 2026 20:23:05 -0500 Subject: [PATCH 2/8] Comment cleanup --- tests/unit/test_optional_submodule.py | 791 +++++------------- .../benchmarks/weight_processing.py | 3 - transformer_lens/model_bridge/bridge.py | 180 +--- .../model_bridge/component_setup.py | 23 +- .../model_bridge/composition_scores.py | 33 +- .../generalized_components/base.py | 5 +- .../generalized_components/block.py | 12 +- .../model_bridge/get_params_util.py | 26 +- transformer_lens/weight_processing.py | 7 +- 9 files changed, 252 insertions(+), 828 deletions(-) diff --git a/tests/unit/test_optional_submodule.py b/tests/unit/test_optional_submodule.py index 4bc44e6bc..168ad0ce1 100644 --- a/tests/unit/test_optional_submodule.py +++ b/tests/unit/test_optional_submodule.py @@ -1,9 +1,7 @@ -"""Unit tests for the optional submodule framework. +"""Tests for optional submodule support in hybrid architectures.""" -Tests the `optional` flag on GeneralizedComponent and the `blocks_with()` -capability query API on TransformerBridge, which together enable hybrid -architectures where layers have structurally different submodules. 
-""" +import copy +import logging import pytest import torch @@ -17,14 +15,10 @@ from transformer_lens.model_bridge.generalized_components.block import BlockBridge from transformer_lens.model_bridge.generalized_components.linear import LinearBridge -# ============================================================================ -# Fixtures: synthetic hybrid model -# ============================================================================ +# -- Synthetic hybrid model fixtures ------------------------------------------ class FakeSubmodule(nn.Module): - """A simple nn.Linear submodule for testing.""" - def __init__(self, dim: int = 4): super().__init__() self.proj = nn.Linear(dim, dim, bias=False) @@ -34,7 +28,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class HybridLayer(nn.Module): - """A layer that conditionally has a 'foo' submodule.""" + """Layer that conditionally has a 'foo' submodule.""" def __init__(self, has_foo: bool, dim: int = 4): super().__init__() @@ -49,7 +43,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class HybridModel(nn.Module): - """Model with 4 layers: layers 0-2 have 'foo', layer 3 does not.""" + """4 layers: 0-2 have 'foo', layer 3 does not.""" def __init__(self, dim: int = 4): super().__init__() @@ -62,8 +56,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MinimalAdapter(ArchitectureAdapter): - """Minimal adapter for testing optional submodule setup.""" - def __init__(self, optional: bool = True): self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})() self.component_mapping = {} @@ -79,706 +71,372 @@ def make_block_template(self) -> BlockBridge: ) -# ============================================================================ -# Tests: optional flag on GeneralizedComponent -# ============================================================================ +class AttnAdapter(ArchitectureAdapter): + """Uses 'attn' as the optional submodule name (matches real adapters).""" + def __init__(self): + self.cfg 
= type("Cfg", (), {"n_layers": 4, "d_model": 4})() + self.component_mapping = {} -class TestOptionalFlag: - """Test that the optional flag is properly stored and defaults to False.""" + def make_block_template(self) -> BlockBridge: + return BlockBridge( + name="layers", + submodules={ + "bar": LinearBridge(name="bar"), + "attn": LinearBridge(name="foo", optional=True), + }, + ) - def test_default_is_false(self): - comp = GeneralizedComponent(name="test") - assert comp.optional is False - def test_optional_true(self): - comp = GeneralizedComponent(name="test", optional=True) - assert comp.optional is True +# -- Bridge construction helpers ---------------------------------------------- - def test_optional_false_explicit(self): - comp = GeneralizedComponent(name="test", optional=False) - assert comp.optional is False + +def _setup_blocks(model, adapter): + """Deepcopy template per layer and run setup_submodules.""" + template = adapter.make_block_template() + blocks = [] + for i, layer in enumerate(model.layers): + block = copy.deepcopy(template) + block.name = f"layers.{i}" + block.set_original_component(layer) + setup_submodules(block, adapter, layer) + blocks.append(block) + return blocks -# ============================================================================ -# Tests: setup_submodules with optional -# ============================================================================ +def _make_bridge(blocks, **cfg_attrs): + """Wrap blocks in a minimal TransformerBridge shell.""" + from transformer_lens.model_bridge.bridge import TransformerBridge + bridge = TransformerBridge.__new__(TransformerBridge) + nn.Module.__init__(bridge) + bridge.add_module("blocks", nn.ModuleList(blocks)) + defaults = {"d_model": 4, "device": "cpu", "n_layers": 4} + defaults.update(cfg_attrs) + bridge.cfg = type("Cfg", (), defaults)() + return bridge -class TestOptionalSubmoduleSetup: - """Test that optional submodules are skipped cleanly during setup.""" - def 
test_optional_submodule_skipped_on_missing_layers(self): - """Layers 0-2 have 'foo', layer 3 does not. Setup should succeed.""" - model = HybridModel() - adapter = MinimalAdapter(optional=True) - template = adapter.make_block_template() +def _make_hybrid_bridge(): + """Hybrid bridge with 'foo' (optional) and 'bar' (universal).""" + return _make_bridge(_setup_blocks(HybridModel(), MinimalAdapter(optional=True))) - # Simulate what setup_blocks_bridge does: deepcopy + setup per layer - import copy - blocks = [] - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) +def _make_hybrid_bridge_with_attn(): + """Hybrid bridge where 'attn' is the optional submodule.""" + return _make_bridge( + _setup_blocks(HybridModel(), AttnAdapter()), + n_heads=2, + ) - # Layers 0-2 should have 'foo' in real_components - for i in range(3): - assert "foo" in blocks[i].real_components, f"Block {i} should have 'foo'" - assert hasattr(blocks[i], "foo"), f"Block {i} should have foo module" - # Layer 3 should NOT have 'foo' in any lookup path - assert ( - "foo" not in blocks[3].real_components - ), "Block 3 should not have 'foo' in real_components" - assert "foo" not in blocks[3]._modules, "Block 3 should not have 'foo' in _modules" - assert "foo" not in blocks[3].submodules, "Block 3 should not have 'foo' in submodules" +# -- Tests: optional flag ----------------------------------------------------- - # All layers should have 'bar' - for i in range(4): - assert "bar" in blocks[i].real_components, f"Block {i} should have 'bar'" - def test_non_optional_missing_submodule_raises(self): - """When optional=False, missing submodule should raise AttributeError.""" - model = HybridModel() - adapter = MinimalAdapter(optional=False) - template = adapter.make_block_template() +class TestOptionalFlag: + def test_default_is_false(self): + assert 
GeneralizedComponent(name="test").optional is False - import copy + def test_optional_true(self): + assert GeneralizedComponent(name="test", optional=True).optional is True - # Layer 3 lacks 'foo' and optional=False, so this should raise - block = copy.deepcopy(template) - block.name = "layers.3" - block.set_original_component(model.layers[3]) - with pytest.raises(AttributeError): - setup_submodules(block, adapter, model.layers[3]) + def test_optional_false_explicit(self): + assert GeneralizedComponent(name="test", optional=False).optional is False -# ============================================================================ -# Tests: blocks_with() API -# ============================================================================ +# -- Tests: setup_submodules -------------------------------------------------- -class TestBlocksWith: - """Test the blocks_with() capability query on TransformerBridge.""" +class TestOptionalSubmoduleSetup: + def test_skipped_on_missing_layers(self): + blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True)) - def test_blocks_with_returns_matching_blocks(self): - """blocks_with('foo') should return only blocks that have 'foo'.""" - from transformer_lens.model_bridge.bridge import TransformerBridge + for i in range(3): + assert "foo" in blocks[i].real_components + assert hasattr(blocks[i], "foo") - model = HybridModel() - adapter = MinimalAdapter(optional=True) - template = adapter.make_block_template() + assert "foo" not in blocks[3].real_components + assert "foo" not in blocks[3]._modules + assert "foo" not in blocks[3].submodules - import copy + for i in range(4): + assert "bar" in blocks[i].real_components - blocks = nn.ModuleList() - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) + def test_non_optional_raises(self): + model = HybridModel() + adapter = 
MinimalAdapter(optional=False) + block = copy.deepcopy(adapter.make_block_template()) + block.name = "layers.3" + block.set_original_component(model.layers[3]) + with pytest.raises(AttributeError): + setup_submodules(block, adapter, model.layers[3]) - # Create a minimal bridge-like object with blocks attribute - # We test blocks_with as a standalone method - bridge = TransformerBridge.__new__(TransformerBridge) - nn.Module.__init__(bridge) - bridge.add_module("blocks", blocks) - foo_blocks = bridge.blocks_with("foo") - assert len(foo_blocks) == 3 - assert [idx for idx, _ in foo_blocks] == [0, 1, 2] +# -- Tests: blocks_with() ----------------------------------------------------- - bar_blocks = bridge.blocks_with("bar") - assert len(bar_blocks) == 4 - missing_blocks = bridge.blocks_with("nonexistent") - assert len(missing_blocks) == 0 +class TestBlocksWith: + def test_returns_matching_blocks(self): + bridge = _make_hybrid_bridge() + assert [idx for idx, _ in bridge.blocks_with("foo")] == [0, 1, 2] + assert len(bridge.blocks_with("bar")) == 4 + assert bridge.blocks_with("nonexistent") == [] - def test_blocks_with_no_blocks_attribute(self): - """blocks_with() should return empty list if no blocks attribute.""" + def test_no_blocks_attribute(self): from transformer_lens.model_bridge.bridge import TransformerBridge bridge = TransformerBridge.__new__(TransformerBridge) nn.Module.__init__(bridge) assert bridge.blocks_with("attn") == [] + def test_checks_modules_not_hasattr(self): + bridge = _make_hybrid_bridge() + assert len(bridge.blocks_with("training")) == 0 -# ============================================================================ -# Tests: _stack_block_params with hybrid blocks -# ============================================================================ - - -class TestStackBlockParamsHybridSafe: - """Test that _stack_block_params raises clear errors for hybrid blocks.""" - - def test_logs_warning_and_returns_subset_on_hybrid(self, caplog): - """On hybrid 
blocks, should log warning and return tensor for matching blocks only.""" - import logging - - from transformer_lens.model_bridge.bridge import TransformerBridge - - # Build blocks where block 3 lacks 'foo' but blocks 0-2 have it - model = HybridModel() - adapter = MinimalAdapter(optional=True) - template = adapter.make_block_template() - - import copy - - blocks = nn.ModuleList() - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) - # Verify precondition: block 3 lacks 'foo' - assert "foo" in blocks[0]._modules - assert "foo" not in blocks[3]._modules +# -- Tests: _stack_block_params ----------------------------------------------- - bridge = TransformerBridge.__new__(TransformerBridge) - nn.Module.__init__(bridge) - bridge.add_module("blocks", blocks) - # Should succeed with a log warning, returning only matching blocks. - # logging.warning always emits (no deduplication), so researchers see - # the index mapping notice on every access — not just the first. 
+class TestStackBlockParams: + def test_logs_warning_and_returns_subset(self, caplog): + bridge = _make_hybrid_bridge() with caplog.at_level(logging.WARNING): result = bridge._stack_block_params("foo.proj.weight") assert any("Hybrid model" in msg for msg in caplog.messages) - assert any("stack_params_for" in msg for msg in caplog.messages) - # 3 blocks have 'foo', not 4 assert result.shape[0] == 3 - # Verify it logs again on a second call (no deduplication) caplog.clear() with caplog.at_level(logging.WARNING): - result2 = bridge._stack_block_params("foo.proj.weight") - assert any( - "Hybrid model" in msg for msg in caplog.messages - ), "Warning should emit on every call, not just the first" - - def test_raises_when_no_blocks_have_submodule(self): - """Should raise AttributeError when zero blocks have the submodule.""" - from transformer_lens.model_bridge.bridge import TransformerBridge + bridge._stack_block_params("foo.proj.weight") + assert any("Hybrid model" in msg for msg in caplog.messages) + def test_raises_when_no_blocks_match(self): bridge = _make_hybrid_bridge() with pytest.raises(AttributeError, match="No blocks have"): bridge._stack_block_params("nonexistent") def test_succeeds_on_universal_submodule(self): - """Should succeed when all blocks have the requested submodule.""" - from transformer_lens.model_bridge.bridge import TransformerBridge - - model = HybridModel() - adapter = MinimalAdapter(optional=True) - template = adapter.make_block_template() - - import copy - - blocks = nn.ModuleList() - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) - - bridge = TransformerBridge.__new__(TransformerBridge) - nn.Module.__init__(bridge) - bridge.add_module("blocks", blocks) - - # 'bar' exists on all blocks → should succeed + bridge = _make_hybrid_bridge() result = bridge._stack_block_params("bar.weight") - 
assert result.shape[0] == 4 # 4 layers + assert result.shape[0] == 4 -# ============================================================================ -# Tests: refactor_factored_attn_matrices with missing layers -# ============================================================================ +# -- Tests: refactor_factored_attn_matrices ------------------------------------ class TestRefactorFactoredAttnHybrid: - """Test that refactor_factored_attn_matrices skips layers without attn.""" - def test_skips_missing_attn_layers(self): - """Should process layers with attn keys and skip those without.""" from transformer_lens.config.TransformerLensConfig import TransformerLensConfig from transformer_lens.weight_processing import ProcessWeights - n_heads = 2 - d_head = 4 - d_model = n_heads * d_head cfg = TransformerLensConfig( n_layers=4, - n_heads=n_heads, - d_head=d_head, - d_model=d_model, + n_heads=2, + d_head=4, + d_model=8, n_ctx=16, positional_embedding_type="standard", ) - - # Create state_dict with attn weights for layers 0-2 only. 
- # W_Q/W_K/W_V: [n_heads, d_model, d_head], W_O: [n_heads, d_head, d_model] - # b_Q/b_K/b_V: [n_heads, d_head], b_O: [d_model] state_dict = {} - for l in range(3): # layers 0-2 have attention - state_dict[f"blocks.{l}.attn.W_Q"] = torch.randn(n_heads, d_model, d_head) - state_dict[f"blocks.{l}.attn.W_K"] = torch.randn(n_heads, d_model, d_head) - state_dict[f"blocks.{l}.attn.W_V"] = torch.randn(n_heads, d_model, d_head) - state_dict[f"blocks.{l}.attn.W_O"] = torch.randn(n_heads, d_head, d_model) - state_dict[f"blocks.{l}.attn.b_Q"] = torch.randn(n_heads, d_head) - state_dict[f"blocks.{l}.attn.b_K"] = torch.randn(n_heads, d_head) - state_dict[f"blocks.{l}.attn.b_V"] = torch.randn(n_heads, d_head) - state_dict[f"blocks.{l}.attn.b_O"] = torch.randn(d_model) - - # Layer 3 has NO attention keys — should be skipped, not crash + for l in range(3): + state_dict[f"blocks.{l}.attn.W_Q"] = torch.randn(2, 8, 4) + state_dict[f"blocks.{l}.attn.W_K"] = torch.randn(2, 8, 4) + state_dict[f"blocks.{l}.attn.W_V"] = torch.randn(2, 8, 4) + state_dict[f"blocks.{l}.attn.W_O"] = torch.randn(2, 4, 8) + state_dict[f"blocks.{l}.attn.b_Q"] = torch.randn(2, 4) + state_dict[f"blocks.{l}.attn.b_K"] = torch.randn(2, 4) + state_dict[f"blocks.{l}.attn.b_V"] = torch.randn(2, 4) + state_dict[f"blocks.{l}.attn.b_O"] = torch.randn(8) + result = ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg) - # Layers 0-2 should still have their attn keys (now refactored) for l in range(3): assert f"blocks.{l}.attn.W_Q" in result - assert f"blocks.{l}.attn.W_K" in result + assert "blocks.3.attn.W_Q" not in result - # Layer 3 should have no attn keys - assert f"blocks.3.attn.W_Q" not in result + def test_raises_on_partial_attn_keys(self): + from transformer_lens.config.TransformerLensConfig import TransformerLensConfig + from transformer_lens.weight_processing import ProcessWeights + cfg = TransformerLensConfig( + n_layers=1, + n_heads=2, + d_head=4, + d_model=8, + n_ctx=16, + 
positional_embedding_type="standard", + ) + state_dict = {"blocks.0.attn.W_Q": torch.randn(2, 8, 4)} + with pytest.raises(ValueError, match="Inconsistent attention weights"): + ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg) -# ============================================================================ -# Tests: weight distribution with ragged blocks -# ============================================================================ +# -- Tests: weight distribution ------------------------------------------------ -class TestWeightDistributionRagged: - """Test that weight distribution handles heterogeneous real_components.""" +class TestWeightDistributionRagged: def test_distribute_weights_skips_empty_blocks(self): - """Blocks without attn weights should receive no attn keys.""" from transformer_lens.weight_processing import ProcessWeights - # Build a minimal real_components mapping with ragged blocks - model = HybridModel() - adapter = MinimalAdapter(optional=True) - template = adapter.make_block_template() - - import copy - - blocks = [] - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) - - # Construct state_dict with 'foo' weights for blocks 0-2 only + blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True)) state_dict = {} for i in range(3): state_dict[f"blocks.{i}.foo.weight"] = torch.randn(4, 4) for i in range(4): state_dict[f"blocks.{i}.bar.weight"] = torch.randn(4, 4) - # Build the component mapping - component_mapping = { - "blocks": ("layers", blocks), - } - - # This should not crash ProcessWeights.distribute_weights_to_components( state_dict=state_dict, - component_mapping=component_mapping, - ) - - -# ============================================================================ -# Helpers for bridge-level tests -# 
============================================================================ - - -def _make_hybrid_bridge(): - """Build a minimal TransformerBridge with hybrid blocks for testing. - - Uses 'foo' and 'bar' as submodule names. Layers 0-2 have 'foo', layer 3 does not. - """ - import copy - - from transformer_lens.model_bridge.bridge import TransformerBridge - - model = HybridModel() - adapter = MinimalAdapter(optional=True) - template = adapter.make_block_template() - - blocks = nn.ModuleList() - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) - - bridge = TransformerBridge.__new__(TransformerBridge) - nn.Module.__init__(bridge) - bridge.add_module("blocks", blocks) - - # Minimal cfg for accumulated_bias - bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4})() - return bridge - - -class AttnAdapter(ArchitectureAdapter): - """Adapter using 'attn' as the optional submodule name (matches real adapters).""" - - def __init__(self): - self.cfg = type("Cfg", (), {"n_layers": 4, "d_model": 4})() - self.component_mapping = {} - - def make_block_template(self) -> BlockBridge: - return BlockBridge( - name="layers", - submodules={ - "bar": LinearBridge(name="bar"), - "attn": LinearBridge(name="foo", optional=True), - }, + component_mapping={"blocks": ("layers", blocks)}, ) -def _make_hybrid_bridge_with_attn(): - """Build a hybrid bridge where 'attn' is the optional submodule. - - Layers 0-2 have 'attn' (mapped from 'foo'), layer 3 does not. - Used for testing APIs that specifically look for 'attn' (composition scores, labels). 
- """ - import copy - - from transformer_lens.model_bridge.bridge import TransformerBridge - - model = HybridModel() - adapter = AttnAdapter() - template = adapter.make_block_template() - - blocks = nn.ModuleList() - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) +# -- Tests: __setattr__ whitelist ---------------------------------------------- - bridge = TransformerBridge.__new__(TransformerBridge) - nn.Module.__init__(bridge) - bridge.add_module("blocks", blocks) - bridge.cfg = type("Cfg", (), {"d_model": 4, "device": "cpu", "n_layers": 4, "n_heads": 2})() - return bridge - - -# ============================================================================ -# Tests: blocks_with uses _modules not hasattr -# ============================================================================ - -class TestBlocksWithModulesCheck: - """blocks_with() should only find bridged submodules, not HF attrs.""" - - def test_does_not_find_hf_internal_attrs(self): - """blocks_with should not match HF attributes that aren't bridged.""" - bridge = _make_hybrid_bridge() - # 'bar' is a bridged submodule (in _modules), should be found - assert len(bridge.blocks_with("bar")) == 4 - # 'training' exists as an attr on nn.Module but is not a bridged submodule - assert len(bridge.blocks_with("training")) == 0 - - def test_finds_only_bridged_optional_submodules(self): - """Optional submodules should be found only on layers where they were bound.""" - bridge = _make_hybrid_bridge() - foo_blocks = bridge.blocks_with("foo") - assert [idx for idx, _ in foo_blocks] == [0, 1, 2] +class TestSetAttrWhitelist: + def test_optional_stays_on_bridge(self): + comp = LinearBridge(name="test") + fake_hf = nn.Linear(4, 4, bias=False) + comp.set_original_component(fake_hf) + comp.optional = True + assert comp.optional is True + assert not hasattr(fake_hf, 
"optional") -# ============================================================================ -# Tests: accumulated_bias on hybrid models -# ============================================================================ +# -- Tests: accumulated_bias -------------------------------------------------- class TestAccumulatedBiasHybrid: - """accumulated_bias should not crash on hybrid models.""" - - def test_accumulated_bias_skips_non_attn_layers(self): - """Should not crash when some layers lack attention.""" + def test_skips_non_attn_layers(self): bridge = _make_hybrid_bridge() - # Should run without error through all 4 layers (layer 3 has no attn) result = bridge.accumulated_bias(layer=4) assert result.shape == (4,) - def test_accumulated_bias_mlp_input_on_non_attn_layer(self): - """mlp_input=True on a non-attention layer should not crash.""" + def test_mlp_input_on_non_attn_layer(self): bridge = _make_hybrid_bridge() - # Layer 3 has no attn — should still work with mlp_input=True result = bridge.accumulated_bias(layer=3, mlp_input=True) assert result.shape == (4,) -# ============================================================================ -# Tests: block_submodules and layer_types introspection -# ============================================================================ +# -- Tests: block introspection ------------------------------------------------ class TestBlockIntrospection: - """Test layer introspection APIs.""" - def test_block_submodules(self): - """block_submodules should list bridged submodules per layer.""" bridge = _make_hybrid_bridge() - # Layer 0 has both foo and bar - subs_0 = bridge.block_submodules(0) - assert "foo" in subs_0 - assert "bar" in subs_0 - # Layer 3 has only bar - subs_3 = bridge.block_submodules(3) - assert "foo" not in subs_3 - assert "bar" in subs_3 + assert "foo" in bridge.block_submodules(0) + assert "bar" in bridge.block_submodules(0) + assert "foo" not in bridge.block_submodules(3) + assert "bar" in bridge.block_submodules(3) 
def test_layer_types(self): - """layer_types should return a list with one entry per block.""" bridge = _make_hybrid_bridge() types = bridge.layer_types() assert len(types) == 4 - # Layers 0-2 have 'foo', layer 3 does not for i in range(3): assert "foo" in types[i] assert "foo" not in types[3] -# ============================================================================ -# Tests: stack_params_for hybrid API -# ============================================================================ +# -- Tests: stack_params_for -------------------------------------------------- class TestStackParamsFor: - """Test stack_params_for on hybrid bridges.""" - def test_returns_correct_indices_and_tensors(self): - """stack_params_for should return only matching blocks.""" bridge = _make_hybrid_bridge() indices, stacked = bridge.stack_params_for("foo", "foo.proj.weight") assert indices == [0, 1, 2] assert stacked.shape[0] == 3 def test_raises_on_no_matching_blocks(self): - """Should raise ValueError when no blocks have the submodule.""" bridge = _make_hybrid_bridge() with pytest.raises(ValueError, match="No blocks have submodule"): bridge.stack_params_for("nonexistent", "nonexistent.weight") -# ============================================================================ -# Tests: refactor guard validates all attn keys -# ============================================================================ - - -class TestRefactorGuardConsistency: - """Test that refactor raises on inconsistent attn keys (W_Q present, W_K missing).""" - - def test_raises_on_partial_attn_keys(self): - """If W_Q is present but W_K is missing, should raise ValueError.""" - from transformer_lens.config.TransformerLensConfig import TransformerLensConfig - from transformer_lens.weight_processing import ProcessWeights - - cfg = TransformerLensConfig( - n_layers=1, - n_heads=2, - d_head=4, - d_model=8, - n_ctx=16, - positional_embedding_type="standard", - ) - # Only W_Q present, missing W_K/W_V/W_O - state_dict = { - 
"blocks.0.attn.W_Q": torch.randn(2, 8, 4), - } - with pytest.raises(ValueError, match="Inconsistent attention weights"): - ProcessWeights.refactor_factored_attn_matrices(state_dict, cfg) - - -# ============================================================================ -# Tests: __setattr__ whitelist includes optional -# ============================================================================ - - -class TestSetAttrWhitelist: - """Test that 'optional' is in the __setattr__ whitelist.""" - - def test_optional_set_on_bridge_not_hf_model(self): - """Setting optional after set_original_component should stay on bridge.""" - comp = LinearBridge(name="test") - fake_hf = nn.Linear(4, 4, bias=False) - comp.set_original_component(fake_hf) - comp.optional = True - # Should be on the bridge, not on the HF module - assert comp.optional is True - assert not hasattr(fake_hf, "optional") - - -# ============================================================================ -# Tests: attn_head_labels matches composition scores dimensions -# ============================================================================ +# -- Tests: attn_head_labels -------------------------------------------------- class TestAttnHeadLabels: - """attn_head_labels should match all_composition_scores dimensions.""" - - def test_attn_head_labels_excludes_non_attn_layers(self): - """Labels should only cover attention layers, not SSM/linear-attn.""" + def test_excludes_non_attn_layers(self): bridge = _make_hybrid_bridge_with_attn() - bridge.cfg.n_heads = 2 labels = bridge.attn_head_labels - # 3 attention layers (0, 1, 2) * 2 heads = 6 labels assert len(labels) == 6 assert labels == ["L0H0", "L0H1", "L1H0", "L1H1", "L2H0", "L2H1"] - # Should NOT contain L3 (non-attention layer) - assert all("L3" not in lbl for lbl in labels) - def test_all_head_labels_includes_all_layers(self): - """all_head_labels should still include every layer.""" + def test_all_head_labels_includes_all(self): bridge = 
_make_hybrid_bridge_with_attn() - bridge.cfg.n_heads = 2 - labels = bridge.all_head_labels - # 4 layers * 2 heads = 8 labels - assert len(labels) == 8 + assert len(bridge.all_head_labels) == 8 -# ============================================================================ -# Tests: hook propagation through optional submodules -# ============================================================================ +# -- Tests: hook propagation -------------------------------------------------- class TestHookPropagation: - """Verify hooks fire on present optional submodules and don't exist on absent ones.""" - - def _build_hybrid_model_and_blocks(self): - """Build a hybrid model with setup done so hooks are wired.""" - import copy - - model = HybridModel() - adapter = MinimalAdapter(optional=True) - template = adapter.make_block_template() - - blocks = [] - for i, layer in enumerate(model.layers): - block = copy.deepcopy(template) - block.name = f"layers.{i}" - block.set_original_component(layer) - setup_submodules(block, adapter, layer) - blocks.append(block) - - return model, blocks - - def test_hooks_fire_on_present_optional_submodule(self): - """hook_in and hook_out should fire on blocks where the optional submodule exists.""" - model, blocks = self._build_hybrid_model_and_blocks() - - # Block 0 has 'foo' — its hook_in and hook_out should fire - foo_bridge = blocks[0].foo - hook_in_fired = [] - hook_out_fired = [] - - foo_bridge.hook_in.add_hook(lambda tensor, hook: hook_in_fired.append(True) or tensor) - foo_bridge.hook_out.add_hook(lambda tensor, hook: hook_out_fired.append(True) or tensor) - - # Run a forward pass through the HF model's layer 0 - # Because replace_remote_component swapped model.layers[0].foo with the bridge, - # calling model.layers[0].foo(x) goes through LinearBridge.forward - x = torch.randn(1, 4) - _ = blocks[0].foo(x) + def test_hooks_fire_on_present_optional(self): + blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True)) + fired = [] 
+ blocks[0].foo.hook_out.add_hook(lambda t, hook: fired.append(True) or t) - assert len(hook_in_fired) == 1, "hook_in should fire on present optional submodule" - assert len(hook_out_fired) == 1, "hook_out should fire on present optional submodule" + blocks[0].foo(torch.randn(1, 4)) + assert len(fired) == 1 - def test_absent_optional_submodule_has_no_hooks(self): - """Block 3 should not have 'foo' at all — no hooks to fire.""" - _, blocks = self._build_hybrid_model_and_blocks() - - # Block 3 lacks 'foo' — it shouldn't be in _modules + def test_absent_optional_has_no_module(self): + blocks = _setup_blocks(HybridModel(), MinimalAdapter(optional=True)) assert "foo" not in blocks[3]._modules - # Attempting to access hooks on the absent submodule should fail - assert not hasattr(blocks[3], "foo") - - def test_hooks_on_present_dont_affect_absent(self): - """Running all blocks should fire hooks only on blocks with the optional submodule.""" - model, blocks = self._build_hybrid_model_and_blocks() - # Track which blocks fire foo.hook_out - fired_block_indices = [] + def test_hooks_fire_only_on_present(self): + model = HybridModel() + blocks = _setup_blocks(model, MinimalAdapter(optional=True)) + fired_indices = [] for i, block in enumerate(blocks): if "foo" in block._modules: - block.foo.hook_out.add_hook( - lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor - ) + block.foo.hook_out.add_hook(lambda t, hook, idx=i: fired_indices.append(idx) or t) - # Run forward through all HF layers x = torch.randn(1, 4) - for i, layer in enumerate(model.layers): + for layer in model.layers: x = layer(x) + assert fired_indices == [0, 1, 2] - # Hooks should fire on layers 0, 1, 2 (have foo) but not 3 - assert fired_block_indices == [0, 1, 2] - - def test_universal_submodule_hooks_fire_on_all_blocks(self): - """'bar' is universal — its hooks should fire on every block.""" - model, blocks = self._build_hybrid_model_and_blocks() - - fired_block_indices = [] + def 
test_universal_hooks_fire_on_all(self): + model = HybridModel() + blocks = _setup_blocks(model, MinimalAdapter(optional=True)) + fired_indices = [] for i, block in enumerate(blocks): - block.bar.hook_out.add_hook( - lambda tensor, hook, idx=i: fired_block_indices.append(idx) or tensor - ) + block.bar.hook_out.add_hook(lambda t, hook, idx=i: fired_indices.append(idx) or t) x = torch.randn(1, 4) for layer in model.layers: x = layer(x) + assert fired_indices == [0, 1, 2, 3] - assert fired_block_indices == [0, 1, 2, 3] - -# ============================================================================ -# Tests: CompositionScores tensor protocol -# ============================================================================ +# -- Tests: CompositionScores tensor protocol ---------------------------------- class TestCompositionScoresProtocol: - """CompositionScores should behave like a tensor for existing research code.""" - def _make_scores(self): from transformer_lens.model_bridge.composition_scores import CompositionScores t = torch.randn(3, 2, 3, 2) return CompositionScores(t, [0, 2, 5], ["L0H0", "L0H1", "L2H0", "L2H1", "L5H0", "L5H1"]) - def test_shape(self): + def test_shape_device_dtype(self): cs = self._make_scores() assert cs.shape == torch.Size([3, 2, 3, 2]) - - def test_device_and_dtype(self): - cs = self._make_scores() assert cs.device == torch.device("cpu") assert cs.dtype == torch.float32 - def test_indexing_returns_tensor(self): + def test_indexing(self): cs = self._make_scores() - sliced = cs[0, :, 1, :] - assert isinstance(sliced, torch.Tensor) - assert sliced.shape == (2, 2) + assert isinstance(cs[0, :, 1, :], torch.Tensor) + assert cs[0, :, 1, :].shape == (2, 2) def test_torch_isnan(self): - """torch.isnan(scores) must work — used in existing integration tests.""" cs = self._make_scores() result = torch.isnan(cs) assert isinstance(result, torch.Tensor) - assert result.shape == cs.shape assert not result.any() def test_torch_where(self): @@ -786,67 +444,34 
@@ def test_torch_where(self): result = torch.where(cs > 0, cs.scores, torch.zeros_like(cs.scores)) assert isinstance(result, torch.Tensor) - def test_comparison_gt(self): - cs = self._make_scores() - mask = cs > 0 - assert isinstance(mask, torch.Tensor) - assert mask.shape == cs.shape - - def test_comparison_ne(self): - """scores != 0 must return a tensor, not raise RuntimeError.""" + def test_comparisons(self): cs = self._make_scores() - result = cs != 0 - assert isinstance(result, torch.Tensor) - assert result.shape == cs.shape + assert isinstance(cs > 0, torch.Tensor) + assert isinstance(cs != 0, torch.Tensor) + assert isinstance(cs == 0, torch.Tensor) - def test_comparison_eq(self): + def test_tensor_methods(self): cs = self._make_scores() - result = cs == 0 - assert isinstance(result, torch.Tensor) - - def test_tensor_method_abs(self): - """scores.abs() must work via __getattr__ delegation.""" - cs = self._make_scores() - result = cs.abs() - assert isinstance(result, torch.Tensor) - - def test_tensor_method_sum(self): - cs = self._make_scores() - result = cs.sum() - assert isinstance(result, torch.Tensor) - - def test_tensor_method_any(self): - cs = self._make_scores() - result = cs.any() - assert isinstance(result, torch.Tensor) + assert isinstance(cs.abs(), torch.Tensor) + assert isinstance(cs.sum(), torch.Tensor) + assert isinstance(cs.any(), torch.Tensor) def test_chained_indexing_and_method(self): - """scores[l1, :, l2, :].abs().sum() — the exact pattern from integration tests.""" cs = self._make_scores() result = cs[0, :, 1, :].abs().sum() - assert isinstance(result, torch.Tensor) - assert result.ndim == 0 # scalar + assert result.ndim == 0 - def test_metadata_accessible(self): + def test_metadata(self): cs = self._make_scores() assert cs.layer_indices == [0, 2, 5] assert len(cs.head_labels) == 6 - - def test_repr(self): - cs = self._make_scores() - r = repr(cs) - assert "CompositionScores" in r - assert "layer_indices" in r + assert "CompositionScores" 
in repr(cs) -# ============================================================================ -# Tests: get_bridge_params with hybrid blocks -# ============================================================================ +# -- Tests: get_bridge_params with hybrid blocks ------------------------------ class TestGetBridgeParamsHybrid: - """get_bridge_params should skip attn keys for non-attention layers.""" - def test_no_attn_keys_for_non_attn_layers(self): from transformer_lens.model_bridge.get_params_util import get_bridge_params @@ -854,34 +479,14 @@ def test_no_attn_keys_for_non_attn_layers(self): bridge.cfg.d_vocab = 10 bridge.cfg.n_ctx = 8 bridge.cfg.d_mlp = 16 - bridge.cfg.n_heads = 2 bridge.cfg.d_head = 2 - # Add minimal embed/unembed so get_bridge_params doesn't fail bridge.embed = nn.Embedding(10, 4) bridge.pos_embed = type("PE", (), {"weight": torch.randn(8, 4)})() - bridge.unembed = type( - "UE", - (), - { - "weight": torch.randn(10, 4), - "b_U": torch.zeros(10), - }, - )() + bridge.unembed = type("UE", (), {"weight": torch.randn(10, 4), "b_U": torch.zeros(10)})() params = get_bridge_params(bridge) - - # Blocks 0-2 have 'attn' — should have attn keys - for i in range(3): - # attn is mapped but internal structure (q/k/v/o) may not match - # our synthetic LinearBridge wrapping FakeSubmodule — so attn keys - # may or may not be present depending on structure. The key point - # is block 3 must NOT have attn keys. 
- pass - - # Block 3 has NO 'attn' — must not have any attn keys - attn_keys_for_block3 = [k for k in params if k.startswith("blocks.3.attn.")] - assert len(attn_keys_for_block3) == 0, ( - f"Block 3 (non-attention layer) should have no attn keys, " - f"but found: {attn_keys_for_block3}" - ) + attn_keys_block3 = [k for k in params if k.startswith("blocks.3.attn.")] + assert ( + len(attn_keys_block3) == 0 + ), f"Non-attn layer should have no attn keys: {attn_keys_block3}" diff --git a/transformer_lens/benchmarks/weight_processing.py b/transformer_lens/benchmarks/weight_processing.py index 326c53df7..5a7fafd65 100644 --- a/transformer_lens/benchmarks/weight_processing.py +++ b/transformer_lens/benchmarks/weight_processing.py @@ -149,7 +149,6 @@ def benchmark_weight_sharing( if reference_model is not None: reference_original = reference_model(test_text, return_type="loss") - # Find first block with attention (hybrid models may not have attn on block 0) bridge_attn_blocks = bridge.blocks_with("attn") if not bridge_attn_blocks: return BenchmarkResult( @@ -558,7 +557,6 @@ def benchmark_attention_output_centering( message="Skipped for tiny/test model (random weights don't center meaningfully)", ) - # Find blocks with attention (hybrid architectures may not have attn on all blocks) attn_blocks = bridge.blocks_with("attn") if not attn_blocks: return BenchmarkResult( @@ -801,7 +799,6 @@ def benchmark_value_bias_folding( }, ) - # Find blocks with attention (hybrid architectures may not have attn on all blocks) attn_blocks = bridge.blocks_with("attn") if not attn_blocks: return BenchmarkResult( diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py index 372b53bf5..6f9e816f8 100644 --- a/transformer_lens/model_bridge/bridge.py +++ b/transformer_lens/model_bridge/bridge.py @@ -1021,21 +1021,10 @@ def to_single_str_token(self, int_token: int) -> str: raise AssertionError("Expected a single string token.") def blocks_with(self, submodule: 
str) -> List[Tuple[int, "GeneralizedComponent"]]: - """Return (index, block) pairs for blocks that have the named submodule. + """Return (index, block) pairs for blocks with the named bridged submodule. - Hybrid architectures have heterogeneous blocks — some layers have - attention, others have SSM or linear attention, etc. Use this instead - of assuming blocks[0] is representative. - - Only returns blocks where the submodule was explicitly set up as a - bridged component (registered in _modules), not submodules that happen - to exist on the underlying HF model. - - Args: - submodule: Name of the submodule to check for (e.g., "attn", "mamba") - - Returns: - List of (layer_index, block) tuples for blocks that have the submodule. + Checks _modules (not hasattr) so HF-internal attrs don't match. + Use instead of assuming blocks[0] is representative on hybrid models. """ if not hasattr(self, "blocks"): return [] @@ -1044,23 +1033,9 @@ def blocks_with(self, submodule: str) -> List[Tuple[int, "GeneralizedComponent"] def stack_params_for( self, submodule: str, attr_path: str, reshape_fn: Optional[Callable] = None ) -> Tuple[List[int], torch.Tensor]: - """Stack a parameter across blocks that have a specific submodule. - - For hybrid architectures where only some blocks have attention (or SSM, - etc.), this returns the stacked tensor for only matching blocks along - with their layer indices. - - Args: - submodule: Submodule to filter on (e.g., "attn", "mamba") - attr_path: Dot-separated attr path from block (e.g., "attn.W_K") - reshape_fn: Optional function to reshape each weight before stacking - - Returns: - Tuple of (layer_indices, stacked_tensor) where layer_indices maps - position i in the tensor to the original layer index. + """Stack a parameter across matching blocks only. Returns (layer_indices, tensor). - Raises: - ValueError: If no blocks have the requested submodule. + Use for hybrid models where not all blocks have the submodule. 
""" matching = self.blocks_with(submodule) if not matching: @@ -1081,23 +1056,12 @@ def stack_params_for( def _stack_block_params( self, attr_path: str, reshape_fn: Optional[Callable] = None ) -> torch.Tensor: - """Stack a parameter across all blocks, or across matching blocks for hybrids. + """Stack a parameter across all blocks; falls back to matching-only on hybrids. - For homogeneous models, returns a tensor of shape [n_layers, ...]. - For hybrid models where some blocks lack the requested submodule, - returns a tensor of shape [n_matching_blocks, ...] and emits a - one-time warning about the index mapping. - - Args: - attr_path: Dot-separated attribute path from block (e.g., "attn.W_K") - reshape_fn: Optional function to reshape each weight before stacking - - Note: - The guard checks only that the first path segment is a bridged - submodule (in _modules). Deeper segments resolve via standard - getattr, which may fall through to HF model attributes. This is - intentional — properties like W_Q are exposed via __getattr__ - delegation to the underlying weight tensors. + On hybrid models, logs a warning about index mapping and returns only + blocks that have the submodule. First path segment is checked against + _modules; deeper segments resolve via getattr (intentional — W_Q etc. + are exposed via __getattr__ delegation). """ first_attr = attr_path.split(".")[0] matching_blocks = [ @@ -1231,42 +1195,22 @@ def W_E(self) -> torch.Tensor: @property def QK(self): - """QK circuit as a FactoredMatrix. - - On hybrid models, returns the circuit for attention layers only (with - a warning about index mapping). For explicit index control, use - QK_for_attn_layers() which returns (layer_indices, FactoredMatrix). - """ + """QK circuit. On hybrids, returns attn layers only (with warning). See QK_for_attn_layers().""" return FactoredMatrix(self.W_Q, self.W_K.transpose(-2, -1)) @property def OV(self): - """OV circuit as a FactoredMatrix. 
- - On hybrid models, returns the circuit for attention layers only (with - a warning about index mapping). For explicit index control, use - OV_for_attn_layers() which returns (layer_indices, FactoredMatrix). - """ + """OV circuit. On hybrids, returns attn layers only (with warning). See OV_for_attn_layers().""" return FactoredMatrix(self.W_V, self.W_O) def QK_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]: - """QK circuit for attention layers only (hybrid-safe). - - Returns: - Tuple of (layer_indices, FactoredMatrix) where layer_indices maps - position i in the matrix to the original layer index. - """ + """QK circuit for attention layers only. Returns (layer_indices, FactoredMatrix).""" q_indices, W_Q = self.stack_params_for("attn", "attn.W_Q", self._reshape_qkv) _, W_K = self.stack_params_for("attn", "attn.W_K", self._reshape_qkv) return q_indices, FactoredMatrix(W_Q, W_K.transpose(-2, -1)) def OV_for_attn_layers(self) -> Tuple[List[int], FactoredMatrix]: - """OV circuit for attention layers only (hybrid-safe). - - Returns: - Tuple of (layer_indices, FactoredMatrix) where layer_indices maps - position i in the matrix to the original layer index. - """ + """OV circuit for attention layers only. Returns (layer_indices, FactoredMatrix).""" v_indices, W_V = self.stack_params_for("attn", "attn.W_V", self._reshape_qkv) _, W_O = self.stack_params_for("attn", "attn.W_O", self._reshape_o) return v_indices, FactoredMatrix(W_V, W_O) @@ -1314,9 +1258,7 @@ def tokens_to_residual_directions( residual_direction = self.W_U[:, token] return residual_direction - # Output bias attribute names by variant type. Attention uses "b_O" - # (a processed-weight alias). SSM/linear-attn variants use their output - # projection's bias. Map variant name → list of attribute paths to check. + # Variant → attr paths for the output bias that feeds the residual stream. 
_VARIANT_OUTPUT_BIAS_ATTRS: Dict[str, tuple] = { "attn": ("b_O",), "linear_attn": ("out_proj.bias",), @@ -1326,12 +1268,7 @@ def tokens_to_residual_directions( } def _get_block_variant_bias(self, block: "GeneralizedComponent") -> Optional[torch.Tensor]: - """Get the output bias from whatever variant submodule this block has. - - Each variant type has its own output bias attribute name — attention - uses b_O while SSM variants use out_proj.bias. Returns the first - found, or None if the variant has no output bias. - """ + """Return the output bias from this block's variant submodule, or None.""" for name in VARIANT_SUBMODULE_NAMES: if name not in block._modules: continue @@ -1353,22 +1290,10 @@ def accumulated_bias( mlp_input: bool = False, include_mlp_biases: bool = True, ) -> torch.Tensor: - """Sum of biases that contribute to the residual stream up to a given layer. - - Includes output biases from whatever variant submodule each block has - (attention, Mamba, linear attention, etc.) plus MLP output biases. - For hybrid models, non-attention layers still contribute their variant - submodule's output bias to the residual stream. + """Sum of variant + MLP output biases through the residual stream up to `layer`. - Args: - layer: Layer number in [0, n_layers]. 0 means no layers, n_layers means all. - mlp_input: If True, include the variant submodule's output bias of - the target layer (i.e. bias up to the MLP input of that layer). - include_mlp_biases: Whether to include MLP biases. Useful to set False when - expanding attn_out into individual heads but keeping mlp_out as-is. - - Returns: - Tensor of shape [d_model] with the accumulated bias. + Includes all layer types (attn, SSM, linear-attn). Set mlp_input=True + to include the variant bias of the target layer itself. 
""" accumulated = torch.zeros(self.cfg.d_model, device=self.cfg.device) for i in range(layer): @@ -1389,23 +1314,12 @@ def accumulated_bias( return accumulated def all_composition_scores(self, mode: str) -> CompositionScores: - """Composition scores for all pairs of attention heads. - - Returns a ``CompositionScores`` containing the scores tensor, the - original layer indices, and human-readable head labels. The scores - tensor has shape (n_attn_layers, n_heads, n_attn_layers, n_heads) and - is upper triangular on the layer axes. - - For hybrid models, only attention layers are included. The returned - ``layer_indices`` maps tensor position *i* back to the original layer - number so that results cannot be silently misinterpreted. + """Composition scores for all attention head pairs. Returns CompositionScores. See https://transformer-circuits.pub/2021/framework/index.html - - Args: - mode: One of "Q", "K", "V" — which composition type to compute. + On hybrid models, only attention layers are included; layer_indices + maps tensor position i to original layer number. """ - # Single blocks_with call — all weight stacking uses these same blocks attn_blocks = self.blocks_with("attn") if not attn_blocks: raise ValueError("No attention layers found — cannot compute composition scores.") @@ -1449,59 +1363,23 @@ def _stack(attr_path: str, reshape_fn: Optional[Callable] = None) -> torch.Tenso return CompositionScores(scores=scores, layer_indices=indices, head_labels=labels) def composition_layer_indices(self) -> List[int]: - """Return original layer indices for attention layers. - - Maps position i in all_composition_scores() output back to the - original layer number. For homogeneous models, returns [0, 1, ..., n-1]. - For hybrid models, returns only the attention layer indices. 
- """ + """Original layer indices for attention layers (maps composition score positions).""" return [idx for idx, _ in self.blocks_with("attn")] def block_hooks(self, layer_idx: int) -> List[str]: - """Return all hook point names available on a specific block. - - Useful for hybrid architectures where different layers have different - hookable submodules — e.g., attention layers expose hook_q/hook_k/etc. - while SSM layers expose hook_in_proj/hook_conv/etc. - - Args: - layer_idx: Layer index to inspect. - - Returns: - Sorted list of hook names (e.g., ["hook_in", "hook_out", "attn.hook_q", ...]). - """ + """Sorted hook names available on block `layer_idx` (block-relative paths).""" prefix = f"blocks.{layer_idx}." return sorted(name[len(prefix) :] for name in self.hook_dict if name.startswith(prefix)) def block_submodules(self, layer_idx: int) -> List[str]: - """Return names of bridged submodules on a specific block. - - Args: - layer_idx: Layer index to inspect. - - Returns: - List of submodule names (e.g., ["ln1", "ln2", "attn", "mlp"]). - """ + """Return bridged submodule names on block `layer_idx`.""" block = self.blocks[layer_idx] return [name for name in block._modules if name not in _BLOCK_INTERNAL_MODULES] def layer_types(self) -> List[str]: - """Return a human-readable layer type for each block. - - Inspects which bridged submodules are present on each block to infer - the layer type. For homogeneous models, all entries will be the same. - Variant submodule names are defined in - ``generalized_components.block.VARIANT_SUBMODULE_NAMES``. - - Labels are deterministic: variants appear in VARIANT_SUBMODULE_NAMES - order, universals are sorted alphabetically. - - Returns: - List of strings like ["attn+mlp", "ssm+mlp", "attn+mlp", ...]. - """ + """Per-block type labels, e.g. ["attn+mlp", "ssm+mlp", ...]. 
Deterministic order.""" types = [] for block in self.blocks: - # Variants in canonical order (tuple iteration = stable) variants = [n for n in VARIANT_SUBMODULE_NAMES if n in block._modules] universals = sorted( n @@ -1521,11 +1399,7 @@ def all_head_labels(self) -> list[str]: @property def attn_head_labels(self) -> list[str]: - """Labels for attention heads only, matching all_composition_scores() dimensions. - - For homogeneous models, identical to all_head_labels. For hybrid models, - only includes heads from attention layers (skips SSM/linear-attn layers). - """ + """Head labels for attention layers only — matches all_composition_scores() dims.""" return [ f"L{l}H{h}" for l in self.composition_layer_indices() for h in range(self.cfg.n_heads) ] diff --git a/transformer_lens/model_bridge/component_setup.py b/transformer_lens/model_bridge/component_setup.py index 79d2abc2a..a2986d585 100644 --- a/transformer_lens/model_bridge/component_setup.py +++ b/transformer_lens/model_bridge/component_setup.py @@ -100,23 +100,18 @@ def setup_submodules( else: remote_path = submodule.name is_optional = getattr(submodule, "optional", False) - # Fast path: if the first path segment is absent, skip - # immediately. This catches the common hybrid case (e.g., - # "self_attn" absent on an SSM layer) without entering - # get_remote_component. 
+ # Fast path: first segment absent → skip without entering get_remote_component first_segment = remote_path.split(".")[0] if is_optional and not hasattr(original_model, first_segment): logger.debug( - "Optional submodule '%s' (path '%s') absent on %s — skipping", + "Optional '%s' (path '%s') absent on %s", module_name, remote_path, - getattr(component, "name", "unknown"), + getattr(component, "name", "?"), ) skipped_optional.append(module_name) - continue # hybrid layer lacks this submodule; skip binding - # Full resolution — also catches deeper path failures - # (e.g., "self_attn.q_proj" where self_attn exists as a - # stub but q_proj is missing). + continue + # Full resolution — catches deeper path failures (e.g. stub self_attn missing q_proj) try: original_subcomponent = architecture_adapter.get_remote_component( original_model, remote_path @@ -124,10 +119,10 @@ def setup_submodules( except AttributeError: if is_optional: logger.debug( - "Optional submodule '%s' (path '%s') partially absent on %s — skipping", + "Optional '%s' (path '%s') partially absent on %s", module_name, remote_path, - getattr(component, "name", "unknown"), + getattr(component, "name", "?"), ) skipped_optional.append(module_name) continue @@ -145,9 +140,7 @@ def setup_submodules( if not submodule.is_list_item and submodule.name is not None: component.real_components[module_name] = (submodule.name, submodule) - # Remove skipped optional submodules from the template so that - # architecture_adapter traversal code (which reads .submodules) doesn't - # find them and try to resolve against the HF model. 
+ # Clean up so architecture_adapter traversal won't find stale entries for name in skipped_optional: component.submodules.pop(name, None) diff --git a/transformer_lens/model_bridge/composition_scores.py b/transformer_lens/model_bridge/composition_scores.py index 9073fddb2..617d49e99 100644 --- a/transformer_lens/model_bridge/composition_scores.py +++ b/transformer_lens/model_bridge/composition_scores.py @@ -1,28 +1,21 @@ -"""CompositionScores — tensor-like container for composition score results.""" +"""Tensor-like container for composition score results with layer-index metadata.""" from typing import List import torch class CompositionScores: - """Composition scores bundled with layer-index metadata. + """Composition scores that behave like a tensor but carry layer-index metadata. - Behaves like a tensor for backward compatibility — indexing, .shape, - arithmetic, and ``torch.*`` namespace functions all delegate to the - underlying scores tensor via ``__torch_function__``. The additional - ``layer_indices`` and ``head_labels`` attributes provide metadata that - prevents silent misinterpretation of indices on hybrid models. - - For hybrid models, the scores tensor has shape - (n_attn_layers, n_heads, n_attn_layers, n_heads) where n_attn_layers - may be less than n_layers. ``layer_indices`` maps tensor position i + Delegates indexing, .shape, arithmetic, and torch.* functions to the + underlying ``scores`` tensor via ``__torch_function__``. On hybrid models + where n_attn_layers < n_layers, ``layer_indices`` maps tensor position i to the original layer number. Attributes: scores: Upper-triangular composition score tensor. - layer_indices: Original layer numbers for each position in scores. - E.g., [0, 2, 5] means position 0 = layer 0, position 1 = layer 2, etc. - head_labels: Labels like ["L0H0", "L0H1", "L2H0", ...] matching scores dims. + layer_indices: Original layer numbers, e.g. [0, 2, 5]. + head_labels: Labels matching scores dims, e.g. 
["L0H0", "L0H1", ...]. """ def __init__(self, scores: torch.Tensor, layer_indices: List[int], head_labels: List[str]): @@ -30,14 +23,11 @@ def __init__(self, scores: torch.Tensor, layer_indices: List[int], head_labels: self.layer_indices = layer_indices self.head_labels = head_labels - # --- Tensor protocol --- - @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): - """Delegate torch.* calls (torch.isnan, torch.where, etc.) to .scores.""" + """Unwrap CompositionScores args so torch.isnan, torch.where, etc. work.""" if kwargs is None: kwargs = {} - # Unwrap any CompositionScores args to their underlying tensor unwrapped_args = tuple(a.scores if isinstance(a, CompositionScores) else a for a in args) unwrapped_kwargs = { k: v.scores if isinstance(v, CompositionScores) else v for k, v in kwargs.items() @@ -56,16 +46,11 @@ def device(self) -> torch.device: def dtype(self) -> torch.dtype: return self.scores.dtype - # Python 3 automatically sets __hash__ = None when __eq__ is defined, - # making instances unhashable. No explicit __hash__ needed. - def __getitem__(self, key): return self.scores[key] def __getattr__(self, name): - # Delegate tensor methods (.abs(), .sum(), .any(), etc.) to .scores. - # Guard against infinite recursion during pickling/unpickling where - # self.scores may not exist yet. + # Guard against recursion during pickle/deepcopy when self.scores isn't set yet try: scores = object.__getattribute__(self, "scores") except AttributeError: diff --git a/transformer_lens/model_bridge/generalized_components/base.py b/transformer_lens/model_bridge/generalized_components/base.py index 1af033efb..12be7b9c6 100644 --- a/transformer_lens/model_bridge/generalized_components/base.py +++ b/transformer_lens/model_bridge/generalized_components/base.py @@ -46,10 +46,7 @@ def __init__( hook_alias_overrides: Optional dictionary to override default hook aliases. 
For example, {"hook_attn_out": "ln1_post.hook_out"} will make hook_attn_out point to ln1_post.hook_out instead of the default value in self.hook_aliases. - optional: If True, this entire subtree may be absent on some layers. - When the remote model lacks this component, setup will skip it - cleanly instead of raising AttributeError. Used for hybrid - architectures where layers have structurally different submodules. + optional: If True, setup skips this subtree when absent (hybrid architectures). """ super().__init__() self.name = name diff --git a/transformer_lens/model_bridge/generalized_components/block.py b/transformer_lens/model_bridge/generalized_components/block.py index 1005fd4f2..e6cd0d71f 100644 --- a/transformer_lens/model_bridge/generalized_components/block.py +++ b/transformer_lens/model_bridge/generalized_components/block.py @@ -15,19 +15,15 @@ GeneralizedComponent, ) -# Submodule names that represent layer-type variants in hybrid architectures. -# Used by layer_types() for classification and _get_block_variant_bias() for -# bias accumulation. Adapters that introduce new variant types should add -# their submodule name here. Ordered tuple for deterministic iteration -# (matters when a block has multiple variants during development/testing). +# Layer-type variant submodule names. Tuple for deterministic iteration order. +# Extend here when adding new hybrid variant types. VARIANT_SUBMODULE_NAMES: tuple[str, ...] = ("attn", "linear_attn", "mamba", "mixer", "ssm") _VARIANT_SUBMODULE_SET: frozenset[str] = frozenset(VARIANT_SUBMODULE_NAMES) -# Internal block modules excluded from submodule introspection (hook points -# and the wrapped HF component are infrastructure, not user-facing submodules). +# Infrastructure modules excluded from submodule introspection. _BLOCK_INTERNAL_MODULES: frozenset[str] = frozenset({"hook_in", "hook_out", "_original_component"}) -# Prefixes for normalization modules excluded from layer_types() labels. 
+# Norm-module prefixes excluded from layer_types() labels. _NORM_PREFIXES: tuple[str, ...] = ("ln", "layer_norm", "norm", "rms") diff --git a/transformer_lens/model_bridge/get_params_util.py b/transformer_lens/model_bridge/get_params_util.py index f27e3a97f..acca83a4e 100644 --- a/transformer_lens/model_bridge/get_params_util.py +++ b/transformer_lens/model_bridge/get_params_util.py @@ -37,23 +37,7 @@ def _get_or_create_bias(bias, n_heads: int, d_head: int, device, dtype) -> torch def get_bridge_params(bridge) -> Dict[str, torch.Tensor]: - """Access to model parameters in the format expected by SVDInterpreter. - - For hybrid architectures, only layers with attention get attention keys - (W_Q, W_K, etc.). Non-attention layers (SSM, linear-attention) are skipped - rather than filled with zeros — this prevents downstream consumers like - SVDInterpreter from treating synthetic zeros as real weights. - - Args: - bridge: TransformerBridge instance - - Returns: - dict: Dictionary of parameter tensors with TransformerLens naming convention. - For hybrid models, attention keys only exist for layers that have attention. - - Raises: - ValueError: If configuration is inconsistent (e.g., cfg.n_layers != len(blocks)) - """ + """Model parameters in SVDInterpreter format. Skips attn keys for non-attention layers.""" params_dict = {} def _get_device_dtype(): @@ -89,15 +73,11 @@ def _get_device_dtype(): ) block = bridge.blocks[layer_idx] - # Only extract attention params from blocks that have attention. - # Non-attention layers (SSM, linear-attention) are skipped entirely - # rather than filled with zeros — this prevents consumers like - # SVDInterpreter from treating synthetic zeros as real weights. 
+ # Skip non-attention layers entirely (no zero-fill — prevents SVDInterpreter garbage) try: has_attn = "attn" in block._modules except (TypeError, AttributeError): - # Mock objects or non-nn.Module blocks: fall back to hasattr - has_attn = hasattr(block, "attn") + has_attn = hasattr(block, "attn") # Mock fallback if has_attn: try: w_q = block.attn.q.weight diff --git a/transformer_lens/weight_processing.py b/transformer_lens/weight_processing.py index 6f0489f21..1d219973a 100644 --- a/transformer_lens/weight_processing.py +++ b/transformer_lens/weight_processing.py @@ -1698,13 +1698,10 @@ def refactor_factored_attn_matrices( b_V_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_V", adapter) b_O_key = ProcessWeights._get_param_key(f"blocks.{l}.attn.b_O", adapter) - # Skip layers without attention weights (hybrid architectures where - # some layers are SSM/linear-attention and lack Q/K/V/O entirely). - # Other weight-processing loops (center_writing_weights, fold_value_biases, - # fold_layer_norm) already guard with `if key in state_dict:` checks. 
+ # Skip hybrid layers without attention (other loops already guard individually) if W_Q_key not in state_dict: continue - # All four weight matrices must be present if Q is present + # If Q is present, K/V/O must be too for _required_key in [W_K_key, W_V_key, W_O_key]: if _required_key not in state_dict: raise ValueError( From ad3764cb4ec590535ff5b7db27c539ddeeb44e6b Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Tue, 14 Apr 2026 23:17:40 -0500 Subject: [PATCH 3/8] Initial setup for proper handling of the Gated Delta Net Bridge --- tests/unit/test_qwen3_5_adapter.py | 26 +- tests/unit/test_qwen3_next_adapter.py | 27 +- .../generalized_components/__init__.py | 3 + .../generalized_components/attention.py | 7 +- .../generalized_components/gated_delta_net.py | 289 ++++++++++++++++++ .../position_embeddings_attention.py | 28 +- .../supported_architectures/granite.py | 7 +- .../granite_moe_hybrid.py | 74 ++--- .../supported_architectures/qwen3.py | 169 +++++----- .../supported_architectures/qwen3_5.py | 174 ++--------- .../supported_architectures/qwen3_next.py | 161 ++-------- 11 files changed, 536 insertions(+), 429 deletions(-) create mode 100644 transformer_lens/model_bridge/generalized_components/gated_delta_net.py diff --git a/tests/unit/test_qwen3_5_adapter.py b/tests/unit/test_qwen3_5_adapter.py index 8fd885174..1b9ac778c 100644 --- a/tests/unit/test_qwen3_5_adapter.py +++ b/tests/unit/test_qwen3_5_adapter.py @@ -134,18 +134,28 @@ def test_unembed_path(self, adapter): # ---- Block submodules ---- def test_block_submodules_keys(self, adapter): - """blocks submodules must contain ln1, ln2, mlp but NOT attn. 
+ """blocks submodules must contain ln1, ln2, mlp, and optional attn + linear_attn.""" + submodules = adapter.component_mapping["blocks"].submodules + assert set(submodules.keys()) == {"ln1", "ln2", "mlp", "attn", "linear_attn"} - Critical correctness test: self_attn is absent on linear-attention - layers, so mapping attn as a block submodule would crash on those layers. - """ + def test_attn_is_optional(self, adapter): + """attn must be marked optional (absent on linear-attention layers).""" + submodules = adapter.component_mapping["blocks"].submodules + assert submodules["attn"].optional is True + + def test_linear_attn_is_optional(self, adapter): + """linear_attn must be marked optional (absent on full-attention layers).""" submodules = adapter.component_mapping["blocks"].submodules - assert set(submodules.keys()) == {"ln1", "ln2", "mlp"} + assert submodules["linear_attn"].optional is True + + def test_linear_attn_bridge_type(self, adapter): + """linear_attn must be a GatedDeltaNetBridge.""" + from transformer_lens.model_bridge.generalized_components.gated_delta_net import ( + GatedDeltaNetBridge, + ) - def test_no_attn_in_block_submodules(self, adapter): - """attn must NOT appear as a block submodule (hybrid architecture safety check).""" submodules = adapter.component_mapping["blocks"].submodules - assert "attn" not in submodules + assert isinstance(submodules["linear_attn"], GatedDeltaNetBridge) def test_ln1_path(self, adapter): """ln1 maps to input_layernorm.""" diff --git a/tests/unit/test_qwen3_next_adapter.py b/tests/unit/test_qwen3_next_adapter.py index 1a2842e7b..516d7a8b5 100644 --- a/tests/unit/test_qwen3_next_adapter.py +++ b/tests/unit/test_qwen3_next_adapter.py @@ -135,19 +135,28 @@ def test_unembed_path(self, adapter): # ---- Block submodules ---- def test_block_submodules_keys(self, adapter): - """blocks submodules must contain ln1, ln2, mlp but NOT attn. 
+ """blocks submodules must contain ln1, ln2, mlp, and optional attn + linear_attn.""" + submodules = adapter.component_mapping["blocks"].submodules + assert set(submodules.keys()) == {"ln1", "ln2", "mlp", "attn", "linear_attn"} - This is a critical correctness test: self_attn is absent on - linear-attention layers, so mapping attn as a block submodule - would crash on those layers. - """ + def test_attn_is_optional(self, adapter): + """attn must be marked optional (absent on linear-attention layers).""" + submodules = adapter.component_mapping["blocks"].submodules + assert submodules["attn"].optional is True + + def test_linear_attn_is_optional(self, adapter): + """linear_attn must be marked optional (absent on full-attention layers).""" submodules = adapter.component_mapping["blocks"].submodules - assert set(submodules.keys()) == {"ln1", "ln2", "mlp"} + assert submodules["linear_attn"].optional is True + + def test_linear_attn_bridge_type(self, adapter): + """linear_attn must be a GatedDeltaNetBridge.""" + from transformer_lens.model_bridge.generalized_components.gated_delta_net import ( + GatedDeltaNetBridge, + ) - def test_no_attn_in_block_submodules(self, adapter): - """attn must NOT appear as a block submodule (hybrid architecture safety check).""" submodules = adapter.component_mapping["blocks"].submodules - assert "attn" not in submodules + assert isinstance(submodules["linear_attn"], GatedDeltaNetBridge) def test_ln1_path(self, adapter): """ln1 maps to input_layernorm.""" diff --git a/transformer_lens/model_bridge/generalized_components/__init__.py b/transformer_lens/model_bridge/generalized_components/__init__.py index fb789cc30..c2c7a121b 100644 --- a/transformer_lens/model_bridge/generalized_components/__init__.py +++ b/transformer_lens/model_bridge/generalized_components/__init__.py @@ -35,6 +35,9 @@ from transformer_lens.model_bridge.generalized_components.alibi_joint_qkv_attention import ( ALiBiJointQKVAttentionBridge, ) +from 
transformer_lens.model_bridge.generalized_components.gated_delta_net import ( + GatedDeltaNetBridge, +) from transformer_lens.model_bridge.generalized_components.gated_mlp import ( GatedMLPBridge, ) diff --git a/transformer_lens/model_bridge/generalized_components/attention.py b/transformer_lens/model_bridge/generalized_components/attention.py index 05d5e0982..2d73d7ed7 100644 --- a/transformer_lens/model_bridge/generalized_components/attention.py +++ b/transformer_lens/model_bridge/generalized_components/attention.py @@ -59,6 +59,7 @@ def __init__( requires_position_embeddings: bool = False, requires_attention_mask: bool = False, attention_mask_4d: bool = False, + **kwargs, ): """Initialize the attention bridge. @@ -82,7 +83,11 @@ def __init__( if conversion_rule is None: conversion_rule = AttentionAutoConversion(config) super().__init__( - name, config=config, submodules=submodules or {}, conversion_rule=conversion_rule + name, + config=config, + submodules=submodules or {}, + conversion_rule=conversion_rule, + **kwargs, ) self.hook_attn_scores = HookPoint() self.hook_pattern = HookPoint() diff --git a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py new file mode 100644 index 000000000..b62937dbd --- /dev/null +++ b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py @@ -0,0 +1,289 @@ +"""GatedDeltaNet bridge for Qwen3.5/Qwen3Next linear-attention layers. + +Reimplements forward (prefill only) to expose mech-interp-relevant intermediate +states. Falls back to HF native forward during autoregressive generation where +cache state management is required. 
+""" +from typing import TYPE_CHECKING, Any, Dict, Optional + +import torch +import torch.nn.functional as F + +from transformer_lens.hook_points import HookPoint +from transformer_lens.model_bridge.generalized_components.base import ( + GeneralizedComponent, +) + +if TYPE_CHECKING: + from transformer_lens.ActivationCache import ActivationCache + + +class GatedDeltaNetBridge(GeneralizedComponent): + """Bridge for GatedDeltaNet linear-attention with full hook decomposition. + + Hooks (prefill, in execution order): + hook_in: input hidden_states [batch, seq, d_model] + hook_q_pre_conv: Q after projection + split, before conv [batch, seq, n_k_heads, head_k_dim] + hook_k_pre_conv: K before conv [batch, seq, n_k_heads, head_k_dim] + hook_v_pre_conv: V before conv [batch, seq, n_v_heads, head_v_dim] + hook_conv_out: post-conv mixed QKV [batch, seq, key_dim*2 + value_dim] + hook_q: Q after conv, pre-GQA-expansion [batch, seq, n_k_heads, head_k_dim] + hook_k: K after conv [batch, seq, n_k_heads, head_k_dim] + hook_v: V after conv [batch, seq, n_v_heads, head_v_dim] + hook_beta: write strength (sigmoid of b), per v-head [batch, seq, n_v_heads] + hook_log_decay: log-space decay g (negative; actual decay = exp(g)), per v-head [batch, seq, n_v_heads] + hook_recurrence_out: output of linear recurrence kernel [batch, seq, n_v_heads, head_v_dim] + hook_gate_input: z tensor before silu gating in GatedRMSNorm [batch, seq, n_v_heads, head_v_dim] + hook_out: final output to residual stream [batch, seq, d_model] + + During generation (cache_params present), only hook_in/hook_out fire. 
+ + Property aliases: + W_in_proj_qkvz, W_in_proj_ba, W_out_proj, A_log, dt_bias + """ + + hook_aliases = { + "hook_linear_attn_in": "hook_in", + "hook_linear_attn_out": "hook_out", + } + + property_aliases = { + "W_in_proj_qkvz": "in_proj_qkvz.weight", + "W_in_proj_ba": "in_proj_ba.weight", + "W_out_proj": "out_proj.weight", + "A_log": "A_log", + "dt_bias": "dt_bias", + } + + def __init__( + self, + name: str, + config: Optional[Any] = None, + submodules: Optional[Dict[str, GeneralizedComponent]] = None, + **kwargs, + ): + super().__init__(name, config=config, submodules=submodules or {}, **kwargs) + # Pre-conv hooks (after projection, before causal convolution mixes positions) + self.hook_q_pre_conv = HookPoint() + self.hook_k_pre_conv = HookPoint() + self.hook_v_pre_conv = HookPoint() + # Conv output + self.hook_conv_out = HookPoint() + # Post-conv hooks (pre-GQA-expansion, pre-recurrence) + self.hook_q = HookPoint() + self.hook_k = HookPoint() + self.hook_v = HookPoint() + # Gate parameters (per v-head) + self.hook_beta = HookPoint() + self.hook_log_decay = HookPoint() + # Recurrence output + gated norm input + self.hook_recurrence_out = HookPoint() + self.hook_gate_input = HookPoint() + + def forward(self, *args: Any, **kwargs: Any) -> Any: + if self.original_component is None: + raise RuntimeError(f"Original component not set for {self.name}.") + + # Generation step → delegate to HF with only input/output hooks + if kwargs.get("cache_params") is not None: + return self._native_forward(*args, **kwargs) + return self._hooked_forward(*args, **kwargs) + + def _native_forward(self, *args: Any, **kwargs: Any) -> Any: + """Delegate to HF with hook_in/hook_out only (generation path).""" + assert self.original_component is not None + if "hidden_states" in kwargs: + kwargs["hidden_states"] = self.hook_in(kwargs["hidden_states"]) + elif len(args) > 0 and isinstance(args[0], torch.Tensor): + args = (self.hook_in(args[0]),) + args[1:] + + output = 
self.original_component(*args, **kwargs) + if isinstance(output, torch.Tensor): + return self.hook_out(output) + return output + + def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any: + """Reimplemented forward exposing all intermediate states (prefill).""" + hf: Any = self.original_component + + if "hidden_states" in kwargs: + hidden_states = kwargs["hidden_states"] + elif len(args) > 0 and isinstance(args[0], torch.Tensor): + hidden_states = args[0] + else: + raise ValueError("Could not find hidden_states") + + attention_mask = kwargs.get("attention_mask") + if attention_mask is not None: + from transformers.models.qwen3_next.modeling_qwen3_next import ( + apply_mask_to_padding_states, + ) + + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + + hidden_states = self.hook_in(hidden_states) + batch_size, seq_len, _ = hidden_states.shape + + # --- Projections --- + projected_qkvz = hf.in_proj_qkvz(hidden_states) + projected_ba = hf.in_proj_ba(hidden_states) + + # Split into per-head Q, K, V, Z, beta_raw, alpha_raw + query, key, value, z, b, a = hf.fix_query_key_value_ordering(projected_qkvz, projected_ba) + + # --- Pre-conv hooks (per-head shape, before conv mixes positions) --- + query = self.hook_q_pre_conv(query) + key = self.hook_k_pre_conv(key) + value = self.hook_v_pre_conv(value) + + # Flatten for conv + query, key, value = (x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value)) + + # --- Causal Convolution --- + mixed_qkv = torch.cat((query, key, value), dim=-1).transpose(1, 2) + if hf.causal_conv1d_fn is not None: + mixed_qkv = hf.causal_conv1d_fn( + x=mixed_qkv, + weight=hf.conv1d.weight.squeeze(1), + bias=hf.conv1d.bias, + activation=hf.activation, + seq_idx=None, + ) + else: + mixed_qkv = F.silu(hf.conv1d(mixed_qkv)[:, :, :seq_len]) + mixed_qkv = mixed_qkv.transpose(1, 2) + + mixed_qkv = self.hook_conv_out(mixed_qkv) + + # Split post-conv + query, key, value = torch.split( + mixed_qkv, + [hf.key_dim, 
hf.key_dim, hf.value_dim], + dim=-1, + ) + query = query.reshape(batch_size, seq_len, -1, hf.head_k_dim) + key = key.reshape(batch_size, seq_len, -1, hf.head_k_dim) + value = value.reshape(batch_size, seq_len, -1, hf.head_v_dim) + + # --- Post-conv hooks (pre-GQA-expansion, pre-recurrence) --- + query = self.hook_q(query) + key = self.hook_k(key) + value = self.hook_v(value) + + # --- Gate parameters (per v-head) --- + beta = self.hook_beta(b.sigmoid()) + + # g is log-space decay (negative); actual multiplicative decay = exp(g) + g = -hf.A_log.float().exp() * F.softplus(a.float() + hf.dt_bias) + g = self.hook_log_decay(g) + + # GQA expansion (Q/K from n_k_heads → n_v_heads) + if hf.num_v_heads // hf.num_k_heads > 1: + repeat = hf.num_v_heads // hf.num_k_heads + query = query.repeat_interleave(repeat, dim=2) + key = key.repeat_interleave(repeat, dim=2) + + # --- Core linear recurrence (opaque fused kernel) --- + core_out, _ = hf.chunk_gated_delta_rule( + query, + key, + value, + g=g, + beta=beta, + initial_state=None, + output_final_state=False, + use_qk_l2norm_in_kernel=True, + ) + core_out = self.hook_recurrence_out(core_out) + + # --- Gated RMSNorm: norm(core_out) * silu(z) --- + z = self.hook_gate_input(z) + z_shape = z.shape + core_out = hf.norm( + core_out.reshape(-1, core_out.shape[-1]), + z.reshape(-1, z.shape[-1]), + ) + core_out = core_out.reshape(z_shape).reshape(batch_size, seq_len, -1) + + # --- Output projection --- + output = hf.out_proj(core_out) + return self.hook_out(output) + + def compute_effective_attention( + self, + cache: "ActivationCache", + layer_idx: int, + ) -> torch.Tensor: + """Materialize the effective attention matrix from cached hook values. 
+ + The gated delta rule recurrence is: + S_t = exp(g_t) * S_{t-1} + beta_t * v_t @ k_t^T + o_t = S_t^T @ q_t + + The effective attention M[i,j] = contribution of input j to output i: + M[i,j] = (q_i^T @ k_j) * beta_j * prod_{t=j+1}^{i} exp(g_t) + + Note: the fused kernel applies L2-normalization to Q and K internally + (use_qk_l2norm_in_kernel=True). The hooked Q/K are pre-normalization, + so this reconstruction is approximate. For exact reconstruction, you'd + need the normalized Q/K which aren't exposed by the kernel. + + Args: + cache: ActivationCache from run_with_cache. + layer_idx: Block index for this linear_attn layer. + + Returns: + [batch, n_v_heads, seq, seq] causal attention matrix. Upper triangle + (j > i) is zero. + + Cost is O(batch * n_heads * seq^2); use on short sequences. + """ + prefix = f"blocks.{layer_idx}.linear_attn" + q_key = f"{prefix}.hook_q" + k_key = f"{prefix}.hook_k" + beta_key = f"{prefix}.hook_beta" + decay_key = f"{prefix}.hook_log_decay" + + for key in [q_key, k_key, beta_key, decay_key]: + if key not in cache: + raise RuntimeError( + f"compute_effective_attention needs {key!r} in cache. " + "Run run_with_cache() on the bridge first." 
+ ) + + # [batch, seq, n_k_heads, head_k_dim] — pre-GQA-expansion + q = cache[q_key].float() + k = cache[k_key].float() + beta = cache[beta_key].float() # [batch, seq, n_v_heads] + g = cache[decay_key].float() # [batch, seq, n_v_heads] + + # GQA expansion to match n_v_heads + if q.shape[2] < beta.shape[-1]: + repeat = beta.shape[-1] // q.shape[2] + q = q.repeat_interleave(repeat, dim=2) + k = k.repeat_interleave(repeat, dim=2) + + batch, seq, n_heads, d_head = q.shape + + # QK similarity: [batch, n_heads, seq_i, seq_j] + q_perm = q.permute(0, 2, 1, 3) # [batch, n_heads, seq, d_head] + k_perm = k.permute(0, 2, 1, 3) + qk = torch.matmul(q_perm, k_perm.transpose(-2, -1)) # [batch, n_heads, seq, seq] + + # Cumulative decay: L[i,j] = prod_{t=j+1}^{i} exp(g_t) = exp(sum g[j+1..i]) + # g is [batch, seq, n_heads] → cumsum along seq + g_perm = g.permute(0, 2, 1) # [batch, n_heads, seq] + cumsum_g = torch.cumsum(g_perm, dim=-1) + # L_log[i,j] = cumsum[i] - cumsum[j] + L_log = cumsum_g[:, :, :, None] - cumsum_g[:, :, None, :] + + causal_mask = torch.tril(torch.ones(seq, seq, dtype=torch.bool, device=q.device)) + L = torch.where(causal_mask[None, None], torch.exp(L_log), torch.zeros_like(L_log)) + + # Beta broadcast: [batch, n_heads, 1, seq_j] + beta_col = beta.permute(0, 2, 1)[:, :, None, :] + + # M[i,j] = qk[i,j] * beta[j] * L[i,j] + M = qk * beta_col * L + + return M diff --git a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py index 33f6dd21f..ad17c38a6 100644 --- a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py +++ b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py @@ -15,6 +15,7 @@ import torch import transformers.models.gemma2.modeling_gemma2 as gemma2_module +from transformer_lens.hook_points import HookPoint from 
transformer_lens.model_bridge.generalized_components.attention import ( AttentionBridge, ) @@ -127,6 +128,8 @@ def __init__( kwargs["maintain_native_attention"] = True super().__init__(name, config, submodules, **kwargs) self._init_position_embedding_hooks() + if getattr(config, "gated_q_proj", False): + self.hook_q_gate = HookPoint() def set_original_component(self, component: torch.nn.Module) -> None: """Set the original HF component and register for rotary hook firing. @@ -201,19 +204,34 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: key_states = hf_attn.k_proj(hidden_states) value_states = hf_attn.v_proj(hidden_states) + # Gated q_proj (Qwen3.5/Qwen3Next): q_proj outputs [Q|gate] interleaved + # per head. cfg.gated_q_proj is set by the adapter. The actual split only + # triggers if the output is 2x the standard width (n_heads * head_dim). + # In processed mode, preprocess_weights slices q_proj to standard width + # so this naturally passes through. + q_gate = None + if getattr(self.config, "gated_q_proj", False): + q_dim = query_states.shape[-1] + n_heads = getattr(self.config, "n_heads", q_dim // head_dim) + standard_q_dim = n_heads * head_dim + if q_dim == standard_q_dim * 2: + query_states, q_gate = torch.chunk( + query_states.view(*input_shape, -1, head_dim * 2), 2, dim=-1 + ) + q_gate = q_gate.reshape(*input_shape, -1) + query_states = query_states.reshape(*input_shape, -1) + has_q_norm = hasattr(hf_attn, "q_norm") and hf_attn.q_norm is not None has_k_norm = hasattr(hf_attn, "k_norm") and hf_attn.k_norm is not None applied_pre_reshape_norm = False if has_q_norm: try: - # Try pre-reshape norm (OLMo 2 style: norm on flat [batch, seq, hidden]) query_states = hf_attn.q_norm(query_states) if has_k_norm: key_states = hf_attn.k_norm(key_states) applied_pre_reshape_norm = True except RuntimeError: - # Shape mismatch — this model uses post-reshape norms pass query_states = query_states.view(hidden_shape).transpose(1, 2) @@ -306,6 +324,12 @@ def 
forward(self, *args: Any, **kwargs: Any) -> Any: attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(*input_shape, -1) + # --- Gated attention (Qwen3.5/Qwen3Next) --- + if q_gate is not None: + if hasattr(self, "hook_q_gate"): + q_gate = self.hook_q_gate(q_gate) + attn_output = attn_output * torch.sigmoid(q_gate) + # --- Output Projection --- # Different architectures name this differently: o_proj (Llama, Gemma, Qwen), # dense (Phi), out_proj (others) diff --git a/transformer_lens/model_bridge/supported_architectures/granite.py b/transformer_lens/model_bridge/supported_architectures/granite.py index f85ef850d..fbb911796 100644 --- a/transformer_lens/model_bridge/supported_architectures/granite.py +++ b/transformer_lens/model_bridge/supported_architectures/granite.py @@ -65,11 +65,12 @@ def _setup_common_config(self, cfg: Any) -> None: self.default_config["n_key_value_heads"] = cfg.n_key_value_heads self.cfg.n_key_value_heads = cfg.n_key_value_heads - def _build_attention_bridge(self) -> PositionEmbeddingsAttentionBridge: + def _build_attention_bridge(self, optional: bool = False) -> PositionEmbeddingsAttentionBridge: """Build the standard Granite attention bridge.""" return PositionEmbeddingsAttentionBridge( name="self_attn", config=self.cfg, + optional=optional, submodules={ "q": LinearBridge(name="q_proj"), "k": LinearBridge(name="k_proj"), @@ -124,11 +125,11 @@ def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> No if bridge_model is not None and hasattr(bridge_model, "blocks"): for block in bridge_model.blocks: - if hasattr(block, "attn"): + if "attn" in block._modules: block.attn.set_rotary_emb(rotary_emb) try: attn_bridge = self.get_generalized_component("blocks.0.attn") attn_bridge.set_rotary_emb(rotary_emb) - except (AttributeError, KeyError): + except (AttributeError, KeyError, ValueError): pass diff --git a/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py 
b/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py index 2c776365b..53229252e 100644 --- a/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py +++ b/transformer_lens/model_bridge/supported_architectures/granite_moe_hybrid.py @@ -1,13 +1,11 @@ """Granite MoE Hybrid architecture adapter. -GraniteMoeHybridForCausalLM is a hybrid Mamba + Attention architecture with -Sparse Mixture of Experts. Layers alternate between Mamba SSM blocks and -standard attention blocks, with a shared MLP and optional sparse MoE on -every layer. - -Since self_attn is None on Mamba layers and mamba is None on attention -layers, we only map submodules that exist on ALL layers (norms, shared_mlp, -block_sparse_moe). The HF native forward handles mamba/attention dispatch. +Hybrid Mamba2 + Attention with Sparse MoE. Most layers are Mamba SSM blocks; +a few are standard attention (determined by config.layer_types). Every layer +has a shared MLP and optional sparse MoE. + +Both attention and Mamba are mapped as optional — each present only on its +respective layer type. Mamba hooks expose in_proj, conv1d, and inner_norm. """ from typing import Any @@ -21,53 +19,55 @@ MoEBridge, RMSNormalizationBridge, RotaryEmbeddingBridge, + SSM2MixerBridge, UnembeddingBridge, ) +from transformer_lens.model_bridge.generalized_components.depthwise_conv1d import ( + DepthwiseConv1DBridge, +) from transformer_lens.model_bridge.supported_architectures.granite import ( GraniteArchitectureAdapter, ) class GraniteMoeHybridArchitectureAdapter(GraniteArchitectureAdapter): - """Architecture adapter for IBM Granite MoE Hybrid models. - - Hybrid Mamba2 + Attention architecture with Sparse MoE. Most layers are Mamba - SSM blocks; a few are standard attention (determined by config.layer_types). + """Hybrid Mamba2 + Attention with Sparse MoE. 
- Since self_attn is None on Mamba layers and mamba is None on attention layers, - we only map submodules present on ALL layers (norms, shared_mlp, MoE). The HF - native forward handles mamba/attention dispatch internally. - - Hook coverage: - - Block-level: hook_resid_pre, hook_resid_post on every layer - - Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm) - - MLP: shared_mlp input/output hooks - - MoE: block_sparse_moe input/output and router_scores hooks - - Attention/Mamba internals are NOT individually hooked (conditional per layer) + Attention is optional (absent on Mamba layers). shared_mlp and MoE are + universal. Inherits Granite config and attention bridge construction. """ def __init__(self, cfg: Any) -> None: - """Initialize the Granite MoE Hybrid architecture adapter.""" - # Call ArchitectureAdapter.__init__ directly, not GraniteArchitectureAdapter.__init__, - # because we need to customize the setup sequence ArchitectureAdapter.__init__(self, cfg) - self._setup_common_config(cfg) - # Hybrid may use "rope" or "nope" (no positional embeddings) pos_emb_type = getattr(cfg, "position_embedding_type", "rope") if pos_emb_type != "rope": self.cfg.positional_embedding_type = "none" - # No attention weight conversions — attn Q/K/V aren't mapped as submodules + self.supports_fold_ln = False self.weight_processing_conversions = {} self.component_mapping = self._build_component_mapping() + def _build_mamba_bridge(self) -> SSM2MixerBridge: + """Mamba-2 mixer bridge with in_proj, conv1d, inner_norm hooks.""" + return SSM2MixerBridge( + name="mamba", + config=self.cfg, + optional=True, + submodules={ + "in_proj": LinearBridge(name="in_proj"), + "conv1d": DepthwiseConv1DBridge(name="conv1d"), + "inner_norm": LinearBridge(name="norm"), + }, + ) + def _build_component_mapping(self) -> dict: - """Build component mapping with only universal (all-layer) submodules.""" - block_submodules = { + block_submodules: dict = { "ln1": 
RMSNormalizationBridge(name="input_layernorm", config=self.cfg), "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), + "attn": self._build_attention_bridge(optional=True), + "mamba": self._build_mamba_bridge(), "shared_mlp": MLPBridge( name="shared_mlp", config=self.cfg, @@ -87,12 +87,9 @@ def _build_component_mapping(self) -> dict: config=self.cfg, ) - mapping = { + mapping: dict = { "embed": EmbeddingBridge(name="model.embed_tokens"), - "blocks": BlockBridge( - name="model.layers", - submodules=block_submodules, - ), + "blocks": BlockBridge(name="model.layers", submodules=block_submodules), "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), "unembed": UnembeddingBridge(name="lm_head", config=self.cfg), } @@ -101,10 +98,3 @@ def _build_component_mapping(self) -> dict: mapping["rotary_emb"] = RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg) return mapping - - def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: - """No-op for hybrid models. - - Hybrid models don't map attention as a submodule (it's conditional per - layer), so there are no rotary embedding references to set up. - """ diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3.py b/transformer_lens/model_bridge/supported_architectures/qwen3.py index 8dcc1d6d3..e37b44795 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3.py @@ -1,7 +1,14 @@ -"""Qwen3 architecture adapter.""" +"""Qwen3 architecture adapter. + +Base adapter for the Qwen3 model family. Provides shared config setup, +attention bridge construction, and setup_component_testing used by +Qwen3, Qwen3.5, and Qwen3Next variants. 
+""" from typing import Any +import torch + from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter from transformer_lens.model_bridge.generalized_components import ( BlockBridge, @@ -12,33 +19,29 @@ RotaryEmbeddingBridge, UnembeddingBridge, ) +from transformer_lens.model_bridge.generalized_components.gated_delta_net import ( + GatedDeltaNetBridge, +) from transformer_lens.model_bridge.generalized_components.position_embeddings_attention import ( PositionEmbeddingsAttentionBridge, ) class Qwen3ArchitectureAdapter(ArchitectureAdapter): - """Architecture adapter for Qwen3 models. - - Qwen3 is architecturally similar to Gemma3: - - Uses RMSNorm for all normalizations - - Has Q/K normalization within attention (RMSNorm on head dimension) - - Uses rotary position embeddings (RoPE) - - Requires position_embeddings and attention_mask in forward pass - - Uses gated MLP (gate_proj + up_proj -> down_proj) - - No biases on any linear layers - - Key differences from Qwen2: - - Qwen3 has q_norm and k_norm layers in attention (Qwen2 doesn't) - - Qwen3 requires position_embeddings parameter (like Gemma3) - - Uses PositionEmbeddingsAttentionBridge instead of AttentionBridge + """Architecture adapter for Qwen3 dense models. + + RMSNorm, RoPE, GQA, Q/K head norms, gated MLP. No biases. + Serves as base class for Qwen3.5 and Qwen3Next hybrid variants. 
""" def __init__(self, cfg: Any) -> None: - """Initialize the Qwen3 architecture adapter.""" super().__init__(cfg) + self._setup_qwen3_config(cfg) + self.weight_processing_conversions = {**self._qkvo_weight_conversions()} + self.component_mapping = self._build_component_mapping() - # Set config variables for weight processing + def _setup_qwen3_config(self, cfg: Any) -> None: + """Config shared across all Qwen3 variants (dense, hybrid, MoE).""" self.cfg.normalization_type = "RMS" self.cfg.positional_embedding_type = "rotary" self.cfg.final_rms = True @@ -46,85 +49,101 @@ def __init__(self, cfg: Any) -> None: self.cfg.attn_only = False self.cfg.uses_rms_norm = True self.cfg.default_prepend_bos = False - - # Use eager attention to support output_attentions for hook_attn_scores and hook_pattern - # SDPA doesn't support output_attentions, which is required for HookedTransformer compatibility self.cfg.attn_implementation = "eager" - self.weight_processing_conversions = { - **self._qkvo_weight_conversions(), + if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: + self.cfg.n_key_value_heads = cfg.n_key_value_heads + + def _build_attention_bridge(self, optional: bool = False) -> PositionEmbeddingsAttentionBridge: + """Standard Qwen3 attention bridge with Q/K norms.""" + return PositionEmbeddingsAttentionBridge( + name="self_attn", + config=self.cfg, + optional=optional, + submodules={ + "q": LinearBridge(name="q_proj"), + "k": LinearBridge(name="k_proj"), + "v": LinearBridge(name="v_proj"), + "o": LinearBridge(name="o_proj"), + "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg), + "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg), + }, + ) + + def _build_mlp_bridge(self): + """Dense gated MLP (gate_proj + up_proj -> down_proj). 
Override for MoE.""" + return GatedMLPBridge( + name="mlp", + config=self.cfg, + submodules={ + "gate": LinearBridge(name="gate_proj"), + "in": LinearBridge(name="up_proj"), + "out": LinearBridge(name="down_proj"), + }, + ) + + def _build_linear_attn_bridge(self, optional: bool = False) -> GatedDeltaNetBridge: + """GatedDeltaNet linear-attention bridge for hybrid variants.""" + return GatedDeltaNetBridge( + name="linear_attn", + config=self.cfg, + optional=optional, + ) + + def _build_component_mapping(self, *, hybrid: bool = False) -> dict: + """Parametric component mapping. hybrid=True adds optional linear_attn.""" + block_submodules: dict = { + "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), + "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), + "attn": self._build_attention_bridge(optional=hybrid), + "mlp": self._build_mlp_bridge(), } - - # Set up component mapping - self.component_mapping = { + if hybrid: + block_submodules["linear_attn"] = self._build_linear_attn_bridge(optional=True) + return { "embed": EmbeddingBridge(name="model.embed_tokens"), "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg), - "blocks": BlockBridge( - name="model.layers", - submodules={ - "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), - "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), - "attn": PositionEmbeddingsAttentionBridge( - name="self_attn", - config=self.cfg, - submodules={ - "q": LinearBridge(name="q_proj"), - "k": LinearBridge(name="k_proj"), - "v": LinearBridge(name="v_proj"), - "o": LinearBridge(name="o_proj"), - "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg), - "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg), - }, - ), - "mlp": GatedMLPBridge( - name="mlp", - config=self.cfg, - submodules={ - "gate": LinearBridge(name="gate_proj"), - "in": LinearBridge(name="up_proj"), - "out": 
LinearBridge(name="down_proj"), - }, - ), - }, - ), + "blocks": BlockBridge(name="model.layers", submodules=block_submodules), "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), "unembed": UnembeddingBridge(name="lm_head"), } def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: - """Set up rotary embedding references for Qwen3 component testing. - - Qwen3 uses RoPE (Rotary Position Embeddings). We set the rotary_emb on - all attention bridge instances for component testing. - - We also force the HF model to use "eager" attention to match the bridge's - implementation. The bridge uses "eager" to support output_attentions for hooks. - - Args: - hf_model: The HuggingFace Qwen3 model instance - bridge_model: The TransformerBridge model (if available, set rotary_emb on actual instances) - """ - # Get rotary embedding instance from the model + """Set eager attn on HF model and rotary_emb on attention bridges.""" rotary_emb = hf_model.model.rotary_emb - # Force HF model to use "eager" attention to match bridge implementation - # Bridge uses "eager" to support output_attentions for hook compatibility if hasattr(hf_model, "config") and hasattr(hf_model.config, "_attn_implementation"): hf_model.config._attn_implementation = "eager" - # Also set on all attention layers if hasattr(hf_model, "model") and hasattr(hf_model.model, "layers"): for layer in hf_model.model.layers: if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "config"): layer.self_attn.config._attn_implementation = "eager" - # Set rotary_emb on actual bridge instances in bridge_model if available if bridge_model is not None and hasattr(bridge_model, "blocks"): - # Set on each layer's actual attention bridge instance for block in bridge_model.blocks: - if hasattr(block, "attn"): + if "attn" in block._modules: block.attn.set_rotary_emb(rotary_emb) - # Also set on the template for get_generalized_component() calls - attn_bridge = 
self.get_generalized_component("blocks.0.attn") - attn_bridge.set_rotary_emb(rotary_emb) + # Set on template for get_generalized_component() calls + try: + attn_template = self.get_generalized_component("blocks.0.attn") + attn_template.set_rotary_emb(rotary_emb) + except ValueError: + pass # hybrid adapter with no attn in template + + @staticmethod + def _preprocess_gated_q_proj( + state_dict: dict[str, torch.Tensor], n_heads: int, d_head: int + ) -> dict[str, torch.Tensor]: + """Slice query half from gated q_proj.weight (interleaved per-head layout). + + q_proj.weight has shape (n_heads * d_head * 2, hidden_size) with + interleaved [query, gate] rows per head. Extracts query-only half. + """ + keys_to_update = [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")] + for key in keys_to_update: + w = state_dict[key] + w = w.view(n_heads, d_head * 2, -1) + state_dict[key] = w[:, :d_head, :].reshape(n_heads * d_head, -1) + return state_dict diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py index b1e71e9f3..a7c484eee 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py @@ -1,24 +1,8 @@ -"""Qwen3_5 architecture adapter. +"""Qwen3.5 architecture adapter. -Qwen3_5ForCausalLM is a hybrid linear-attention + full-attention architecture -with a dense gated MLP on every layer. Layers follow a repeating pattern of -3 GatedDeltaNet (linear attention) layers followed by 1 standard full-attention -layer (every 4th layer by default). - -Since self_attn is absent on linear-attention layers, we only map submodules -that exist on ALL layers (norms, MLP). The HF native forward handles -linear/full attention dispatch internally, and GatedMLPBridge maps the dense -gate_proj/up_proj/down_proj structure on every layer. 
- -Hook coverage: -- Block-level: hook_resid_pre, hook_resid_post on every layer -- Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm) -- MLP: hook_in, hook_out via GatedMLPBridge (gate_proj, up_proj, down_proj) -- Attention internals are NOT individually hooked (self_attn absent on - linear-attention layers; mapping it would crash on those layers) - -Optional parameters: -- n_key_value_heads: only set when using GQA (num_key_value_heads != num_attention_heads) +Hybrid linear-attention (GatedDeltaNet) + full-attention with dense gated MLP. +3 linear-attn layers per 1 full-attn layer. Extends Qwen3 base with +optional attention mapping and fold_ln disabled. """ from typing import Any @@ -26,150 +10,46 @@ import torch from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter -from transformer_lens.model_bridge.generalized_components import ( - BlockBridge, - EmbeddingBridge, - GatedMLPBridge, - LinearBridge, - RMSNormalizationBridge, - RotaryEmbeddingBridge, - UnembeddingBridge, +from transformer_lens.model_bridge.supported_architectures.qwen3 import ( + Qwen3ArchitectureAdapter, ) -class Qwen3_5ArchitectureAdapter(ArchitectureAdapter): - """Architecture adapter for Qwen3_5 models. 
- - Qwen3_5ForCausalLM is a hybrid linear-attention + full-attention - architecture with dense gated MLPs, sharing the same hybrid design as - Qwen3Next but replacing the sparse MoE MLP with a standard dense MLP: - - Uses RMSNorm for all normalizations - - Uses rotary position embeddings (RoPE) with partial rotation - - Every 4th layer is a full-attention layer (self_attn); the rest are - GatedDeltaNet linear-attention layers (linear_attn) - - Uses dense gated MLP (gate_proj + up_proj -> down_proj) on ALL layers - - No biases on any linear layers - - Full-attention layers have Q/K normalization (q_norm, k_norm) - - Full-attention q_proj outputs n_heads * head_dim * 2 (interleaved - query+gate layout); the preprocess_weights method slices the query half - - Since self_attn is absent on linear-attention layers, only universally - present submodules (norms, MLP) are mapped as block submodules. The HF - native forward handles per-layer attention dispatch internally. +class Qwen3_5ArchitectureAdapter(Qwen3ArchitectureAdapter): + """Hybrid linear-attention + full-attention with dense gated MLP. - Optional parameters: - - n_key_value_heads: set when num_key_value_heads != num_attention_heads (GQA) + Inherits Qwen3 config/attention/MLP structure. 
Differences: + - supports_fold_ln = False (LN target varies by layer type) + - Attention is optional (absent on linear-attention layers) + - Gated q_proj (2x wide) requires preprocess_weights slicing + - No weight_processing_conversions until attn is fully wired """ def __init__(self, cfg: Any) -> None: - """Initialize the Qwen3_5 architecture adapter.""" - super().__init__(cfg) - - # Core config attributes - self.cfg.normalization_type = "RMS" - self.cfg.positional_embedding_type = "rotary" - self.cfg.final_rms = True - self.cfg.gated_mlp = True - self.cfg.attn_only = False - self.cfg.uses_rms_norm = True - self.cfg.default_prepend_bos = False - - # Disable fold_ln: ln1 is followed by self_attn on full-attention - # layers and by linear_attn (GatedDeltaNet) on linear-attention layers, - # but neither is mapped as a bridge submodule (see class docstring for - # why). With no bridge-mapped target to fold into, the standard fold_ln - # pass leaves LN weights in an inconsistent state and the processed - # bridge output diverges from the unprocessed / HF output. Skipping - # fold_ln keeps processed-mode forward passes numerically equivalent. + # Call grandparent to set self.cfg, then configure ourselves + ArchitectureAdapter.__init__(self, cfg) + self._setup_qwen3_config(cfg) self.supports_fold_ln = False - - # Use eager attention to support output_attentions for hook_attn_scores - # and hook_pattern. SDPA doesn't support output_attentions. 
- self.cfg.attn_implementation = "eager" - - # GQA: only set n_key_value_heads when using grouped-query attention - if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: - self.cfg.n_key_value_heads = cfg.n_key_value_heads - + setattr(self.cfg, "gated_q_proj", True) # q_proj outputs [Q|gate] interleaved per head self.weight_processing_conversions: dict = {} - self.component_mapping: dict = { - "embed": EmbeddingBridge(name="model.embed_tokens"), - "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg), - "blocks": BlockBridge( - name="model.layers", - submodules={ - "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), - "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), - # Dense gated MLP present on every layer (unlike Qwen3Next's MoE). - # gate_proj + up_proj feed into down_proj via SwiGLU activation. - "mlp": GatedMLPBridge( - name="mlp", - config=self.cfg, - submodules={ - "gate": LinearBridge(name="gate_proj"), - "in": LinearBridge(name="up_proj"), - "out": LinearBridge(name="down_proj"), - }, - ), - }, - ), - "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), - "unembed": UnembeddingBridge(name="lm_head"), - } + self.component_mapping = self._build_component_mapping(hybrid=True) def prepare_loading(self, model_name: str, model_kwargs: dict) -> None: - """Swap the multimodal Qwen3_5Config for its text-only Qwen3_5TextConfig. - - Published Qwen3.5 checkpoints (e.g. Qwen/Qwen3.5-0.8B) carry - model_type='qwen3_5' and architectures=['Qwen3_5ForConditionalGeneration']. - AutoModelForCausalLM would load the full VLM (Qwen3_5ForConditionalGeneration) - with its vision tower, wasting memory and failing the bridge. + """Swap multimodal Qwen3_5Config for text-only Qwen3_5TextConfig. - Instead we replace model_kwargs['config'] with the nested text_config so - AutoModelForCausalLM loads Qwen3_5ForCausalLM (text only). 
+ Published checkpoints carry architectures=['Qwen3_5ForConditionalGeneration']. + We replace config with text_config so AutoModelForCausalLM loads the + text-only Qwen3_5ForCausalLM. """ config = model_kwargs.get("config") if config is not None and hasattr(config, "text_config"): model_kwargs["config"] = config.text_config - def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: - """No-op for hybrid models. - - Hybrid models don't map attention as a block submodule (self_attn is - absent on linear-attention layers), so there are no rotary embedding - references to set up. - - Note: to find which layers are full_attention at runtime, use: - layer_types = getattr(hf_model.config, "layer_types", []) - first_full_attn_idx = next( - i for i, t in enumerate(layer_types) if t == "full_attention" - ) - Do NOT use hf_model.config.full_attention_interval -- it is not stored - on the config object (consumed during __init__ to build layer_types). - """ - def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """Slice query half from q_proj.weight (interleaved per-head layout). - - In Qwen3_5, q_proj.weight has shape (n_heads * head_dim * 2, hidden_size). - Rows are organized as per-head interleaved: - head_0_query (d_head rows), head_0_gate (d_head rows), - head_1_query (d_head rows), head_1_gate (d_head rows), ... - - A naive first-half slice would be wrong. We must reshape by head, then - take the first d_head rows of each head (the query half). + """Slice query half from gated q_proj.weight for weight-space analysis. - Note: since self_attn is NOT currently mapped as a bridge submodule, - these weights will not be loaded by the bridge. This method is included - for correctness and forward-compatibility. + In processed mode, W_Q is the pure query projection (for composition + scores, logit lens). Gate signal available in unprocessed mode via + hook_q_gate. 
""" - n_heads = self.cfg.n_heads - d_head = self.cfg.d_head - keys_to_update = [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")] - for key in keys_to_update: - w = state_dict[key] # shape: (n_heads * d_head * 2, hidden_size) - # Reshape to expose per-head layout - w = w.view(n_heads, d_head * 2, -1) - # Take only the first d_head rows of each head (query half) - state_dict[key] = w[:, :d_head, :].reshape(n_heads * d_head, -1) - return state_dict + return self._preprocess_gated_q_proj(state_dict, self.cfg.n_heads, self.cfg.d_head) diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py index 53e18dce1..aa3ca6cc8 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py @@ -1,28 +1,8 @@ """Qwen3Next architecture adapter. -Qwen3NextForCausalLM is a hybrid linear-attention + full-attention architecture -with a sparse Mixture-of-Experts MLP on every layer. Layers alternate between -GatedDeltaNet (linear attention) and standard full attention blocks, while the -MLP is always a Qwen3NextSparseMoeBlock (gate router + batched experts + -shared expert). - -Since self_attn is absent on linear-attention layers, we only map submodules -that exist on ALL layers (norms, MLP). The HF native forward handles -linear/full attention dispatch internally, and MoEBridge delegates the entire -MoE forward (including router, experts, and shared expert) to the native -implementation. 
- -Hook coverage: -- Block-level: hook_resid_pre, hook_resid_post on every layer -- Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm) -- MLP: hook_in, hook_out on the MoE block (MoEBridge) -- Attention internals are NOT individually hooked (self_attn absent on - linear-attention layers; mapping it would crash on those layers) -- Expert-level internals are NOT individually hooked (batched expert params - live inside Qwen3NextExperts; MoEBridge delegates to HF forward) - -Optional parameters: -- n_key_value_heads: only set when using GQA (num_key_value_heads != num_attention_heads) +Hybrid linear-attention (GatedDeltaNet) + full-attention with sparse MoE MLP. +3 linear-attn layers per 1 full-attn layer. Extends Qwen3 base with +optional attention mapping, MoE MLP, and fold_ln disabled. """ from typing import Any @@ -30,134 +10,31 @@ import torch from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter -from transformer_lens.model_bridge.generalized_components import ( - BlockBridge, - EmbeddingBridge, - MoEBridge, - RMSNormalizationBridge, - RotaryEmbeddingBridge, - UnembeddingBridge, +from transformer_lens.model_bridge.generalized_components import MoEBridge +from transformer_lens.model_bridge.supported_architectures.qwen3 import ( + Qwen3ArchitectureAdapter, ) -class Qwen3NextArchitectureAdapter(ArchitectureAdapter): - """Architecture adapter for Qwen3Next models. - - Qwen3NextForCausalLM is a hybrid linear-attention + full-attention - architecture with sparse MoE MLPs, sharing the same design as Qwen3.5: - - Uses RMSNorm for all normalizations - - Uses rotary position embeddings (RoPE) with partial rotation - - Every 4th layer is a full-attention layer (self_attn); the rest are - GatedDeltaNet linear-attention layers (linear_attn) - - Uses Qwen3NextSparseMoeBlock on ALL layers (decoder_sparse_step=1 and - mlp_only_layers=[] on every real checkpoint). 
The MoE block contains a - top-K router, batched Qwen3NextExperts (experts.gate_up_proj / - experts.down_proj as 3D tensors), plus a shared_expert (gated MLP) and - shared_expert_gate. Each expert is internally a gated MLP. - - No biases on any linear layers - - Full-attention layers have Q/K normalization (q_norm, k_norm) - - Full-attention q_proj outputs n_heads * head_dim * 2 (interleaved - query+gate layout); the preprocess_weights method slices the query half +class Qwen3NextArchitectureAdapter(Qwen3ArchitectureAdapter): + """Hybrid linear-attention + full-attention with sparse MoE MLP. - Since self_attn is absent on linear-attention layers, only universally - present submodules (norms, MLP) are mapped as block submodules. The HF - native forward handles per-layer attention dispatch internally, and - MoEBridge delegates the MoE forward pass (including router + experts + - shared expert) to the native Qwen3NextSparseMoeBlock implementation. - - Optional parameters: - - n_key_value_heads: set when num_key_value_heads != num_attention_heads (GQA) + Same hybrid design as Qwen3.5 but with MoE instead of dense MLP. + Inherits Qwen3 config/attention structure. """ def __init__(self, cfg: Any) -> None: - """Initialize the Qwen3Next architecture adapter.""" - super().__init__(cfg) - - # Core config attributes - self.cfg.normalization_type = "RMS" - self.cfg.positional_embedding_type = "rotary" - self.cfg.final_rms = True - self.cfg.gated_mlp = True - self.cfg.attn_only = False - self.cfg.uses_rms_norm = True - self.cfg.default_prepend_bos = False - - # Disable fold_ln: ln1 is followed by self_attn on full-attention - # layers and by linear_attn (GatedDeltaNet) on linear-attention layers, - # but neither is mapped as a bridge submodule (see class docstring for - # why). With no bridge-mapped target to fold into, the standard fold_ln - # pass leaves LN weights in an inconsistent state and the processed - # bridge output diverges from the unprocessed / HF output. 
Skipping - # fold_ln keeps processed-mode forward passes numerically equivalent. + ArchitectureAdapter.__init__(self, cfg) + self._setup_qwen3_config(cfg) self.supports_fold_ln = False - - # Use eager attention to support output_attentions for hook_attn_scores - # and hook_pattern. SDPA doesn't support output_attentions. - self.cfg.attn_implementation = "eager" - - # GQA: only set n_key_value_heads when using grouped-query attention - if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: - self.cfg.n_key_value_heads = cfg.n_key_value_heads - + setattr(self.cfg, "gated_q_proj", True) # q_proj outputs [Q|gate] interleaved per head self.weight_processing_conversions: dict = {} - self.component_mapping: dict = { - "embed": EmbeddingBridge(name="model.embed_tokens"), - "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg), - "blocks": BlockBridge( - name="model.layers", - submodules={ - "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), - "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), - # Qwen3NextSparseMoeBlock has a custom Qwen3NextTopKRouter - # (not an nn.Linear) as `gate`, plus batched experts and a - # shared expert. MoEBridge wraps the whole MoE module and - # delegates to HF's native forward, so we don't enumerate - # the internal structure here. - "mlp": MoEBridge(name="mlp", config=self.cfg), - }, - ), - "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), - "unembed": UnembeddingBridge(name="lm_head"), - } - - def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: - """No-op for hybrid models. - - Hybrid models don't map attention as a block submodule (self_attn is - absent on linear-attention layers), so there are no rotary embedding - references to set up. 
+ self.component_mapping = self._build_component_mapping(hybrid=True) - Note: to find which layers are full_attention at runtime, use: - layer_types = getattr(hf_model.config, "layer_types", []) - first_full_attn_idx = next( - i for i, t in enumerate(layer_types) if t == "full_attention" - ) - Do NOT use hf_model.config.full_attention_interval -- it is not stored - on the config object (consumed during __init__ to build layer_types). - """ + def _build_mlp_bridge(self): + """Sparse MoE MLP (router + batched experts + shared expert).""" + return MoEBridge(name="mlp", config=self.cfg) def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: - """Slice query half from q_proj.weight (interleaved per-head layout). - - In Qwen3Next, q_proj.weight has shape (n_heads * head_dim * 2, hidden_size). - Rows are organized as per-head interleaved: - head_0_query (d_head rows), head_0_gate (d_head rows), - head_1_query (d_head rows), head_1_gate (d_head rows), ... - - A naive first-half slice would be wrong. We must reshape by head, then - take the first d_head rows of each head (the query half). - - Note: since self_attn is NOT currently mapped as a bridge submodule, - these weights will not be loaded by the bridge. This method is included - for correctness and forward-compatibility. 
- """ - n_heads = self.cfg.n_heads - d_head = self.cfg.d_head - keys_to_update = [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")] - for key in keys_to_update: - w = state_dict[key] # shape: (n_heads * d_head * 2, hidden_size) - # Reshape to expose per-head layout - w = w.view(n_heads, d_head * 2, -1) - # Take only the first d_head rows of each head (query half) - state_dict[key] = w[:, :d_head, :].reshape(n_heads * d_head, -1) - return state_dict + """Slice query half from gated q_proj.weight for weight-space analysis.""" + return self._preprocess_gated_q_proj(state_dict, self.cfg.n_heads, self.cfg.d_head) From b3de91dc6a3985f2b695be257a71d75aa773f405 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Wed, 15 Apr 2026 08:03:56 -0500 Subject: [PATCH 4/8] Adapter updates and custom components --- .../benchmarks/component_outputs.py | 6 +- .../benchmarks/weight_processing.py | 27 +- .../model_bridge/component_setup.py | 5 +- .../generalized_components/attention.py | 4 +- .../generalized_components/gated_delta_net.py | 84 +- .../position_embeddings_attention.py | 32 +- .../supported_architectures/granite.py | 1 + .../supported_architectures/qwen3.py | 24 +- .../supported_architectures/qwen3_5.py | 20 +- .../supported_architectures/qwen3_next.py | 10 +- .../data/architecture_gaps.json | 4291 +++++++------- .../model_registry/data/supported_models.json | 5114 ++++++++++++++++- .../data/verification_history.json | 202 +- 13 files changed, 7549 insertions(+), 2271 deletions(-) diff --git a/transformer_lens/benchmarks/component_outputs.py b/transformer_lens/benchmarks/component_outputs.py index ba2d03edd..504825ce1 100644 --- a/transformer_lens/benchmarks/component_outputs.py +++ b/transformer_lens/benchmarks/component_outputs.py @@ -311,8 +311,12 @@ def benchmark_all_components( n_layers = self.cfg.n_layers for layer_idx in range(n_layers): - # Recursively test each subcomponent and its nested subcomponents + # Get the actual block to check which submodules 
were bound + actual_block = getattr(self.bridge_model, block_type)[layer_idx] for subcomp_name, subcomponent in blocks_component.submodules.items(): + # Skip optional submodules absent on this layer (hybrid architectures) + if subcomp_name not in actual_block._modules: + continue comp_path = f"{block_type}.{layer_idx}.{subcomp_name}" self._test_component_recursive( comp_path, subcomponent, test_inputs, results, skip_components diff --git a/transformer_lens/benchmarks/weight_processing.py b/transformer_lens/benchmarks/weight_processing.py index 5a7fafd65..62e561b25 100644 --- a/transformer_lens/benchmarks/weight_processing.py +++ b/transformer_lens/benchmarks/weight_processing.py @@ -638,10 +638,24 @@ def benchmark_mlp_output_centering( message="Skipped for tiny/test model (random weights don't center meaningfully)", ) - # Check if this is an MoE model - MoE models don't have a single W_out weight + # Find an MLP-like submodule (may be "mlp", "shared_mlp", etc.) from transformer_lens.model_bridge.generalized_components.moe import MoEBridge - if isinstance(bridge.blocks[0].mlp, MoEBridge): + mlp_module = None + block = bridge.blocks[0] + for name in ("mlp", "shared_mlp"): + if name in block._modules: + mlp_module = block._modules[name] + break + if mlp_module is None: + return BenchmarkResult( + name="mlp_output_centering", + severity=BenchmarkSeverity.WARNING, + message="No MLP submodule found on block 0", + passed=False, + ) + + if isinstance(mlp_module, MoEBridge): return BenchmarkResult( name="mlp_output_centering", severity=BenchmarkSeverity.INFO, @@ -651,11 +665,10 @@ def benchmark_mlp_output_centering( # Check if W_out exists and is accessible (HT format or bridge format) w_out = None - if hasattr(bridge.blocks[0].mlp, "W_out"): - w_out = bridge.blocks[0].mlp.W_out - elif hasattr(bridge.blocks[0].mlp, "out"): - # Bridge format: mlp.out is a LinearBridge wrapping nn.Linear - out_module = bridge.blocks[0].mlp.out + if hasattr(mlp_module, "W_out"): + w_out = 
mlp_module.W_out + elif hasattr(mlp_module, "out"): + out_module = mlp_module.out if hasattr(out_module, "original_component") and hasattr( out_module.original_component, "weight" ): diff --git a/transformer_lens/model_bridge/component_setup.py b/transformer_lens/model_bridge/component_setup.py index a2986d585..7821d0354 100644 --- a/transformer_lens/model_bridge/component_setup.py +++ b/transformer_lens/model_bridge/component_setup.py @@ -100,9 +100,10 @@ def setup_submodules( else: remote_path = submodule.name is_optional = getattr(submodule, "optional", False) - # Fast path: first segment absent → skip without entering get_remote_component + # Fast path: first segment absent or None → skip first_segment = remote_path.split(".")[0] - if is_optional and not hasattr(original_model, first_segment): + first_value = getattr(original_model, first_segment, None) + if is_optional and first_value is None: logger.debug( "Optional '%s' (path '%s') absent on %s", module_name, diff --git a/transformer_lens/model_bridge/generalized_components/attention.py b/transformer_lens/model_bridge/generalized_components/attention.py index 2d73d7ed7..5608ca2d8 100644 --- a/transformer_lens/model_bridge/generalized_components/attention.py +++ b/transformer_lens/model_bridge/generalized_components/attention.py @@ -59,7 +59,7 @@ def __init__( requires_position_embeddings: bool = False, requires_attention_mask: bool = False, attention_mask_4d: bool = False, - **kwargs, + optional: bool = False, ): """Initialize the attention bridge. 
@@ -87,7 +87,7 @@ def __init__( config=config, submodules=submodules or {}, conversion_rule=conversion_rule, - **kwargs, + optional=optional, ) self.hook_attn_scores = HookPoint() self.hook_pattern = HookPoint() diff --git a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py index b62937dbd..dffc0e234 100644 --- a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py +++ b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py @@ -23,17 +23,20 @@ class GatedDeltaNetBridge(GeneralizedComponent): Hooks (prefill, in execution order): hook_in: input hidden_states [batch, seq, d_model] - hook_q_pre_conv: Q after projection + split, before conv [batch, seq, n_k_heads, head_k_dim] - hook_k_pre_conv: K before conv [batch, seq, n_k_heads, head_k_dim] - hook_v_pre_conv: V before conv [batch, seq, n_v_heads, head_v_dim] - hook_conv_out: post-conv mixed QKV [batch, seq, key_dim*2 + value_dim] + hook_q_pre_conv: Q after projection, before conv [batch, seq, n_k_heads, head_k_dim] + hook_k_pre_conv: K after projection, before conv [batch, seq, n_k_heads, head_k_dim] + hook_v_pre_conv: V after projection, before conv [batch, seq, n_v_heads, head_v_dim] hook_q: Q after conv, pre-GQA-expansion [batch, seq, n_k_heads, head_k_dim] + Note: on standard attn layers, hook_q is post-projection. Here it's + post-conv — use hook_q_pre_conv for the projection-only output. 
hook_k: K after conv [batch, seq, n_k_heads, head_k_dim] hook_v: V after conv [batch, seq, n_v_heads, head_v_dim] - hook_beta: write strength (sigmoid of b), per v-head [batch, seq, n_v_heads] - hook_log_decay: log-space decay g (negative; actual decay = exp(g)), per v-head [batch, seq, n_v_heads] - hook_recurrence_out: output of linear recurrence kernel [batch, seq, n_v_heads, head_v_dim] - hook_gate_input: z tensor before silu gating in GatedRMSNorm [batch, seq, n_v_heads, head_v_dim] + hook_beta_logit: pre-sigmoid write gate logit, per v-head [batch, seq, n_v_heads] + hook_beta: write strength sigmoid(b), per v-head [batch, seq, n_v_heads] + hook_log_decay: log-space decay g (NEGATIVE; multiplicative decay = exp(g)), + per v-head [batch, seq, n_v_heads] + hook_recurrence_out: output of linear recurrence [batch, seq, n_v_heads, head_v_dim] + hook_gate_input: z tensor (pre-silu) for GatedRMSNorm [batch, seq, n_v_heads, head_v_dim] hook_out: final output to residual stream [batch, seq, d_model] During generation (cache_params present), only hook_in/hook_out fire. 
@@ -63,17 +66,16 @@ def __init__( **kwargs, ): super().__init__(name, config=config, submodules=submodules or {}, **kwargs) - # Pre-conv hooks (after projection, before causal convolution mixes positions) + # Pre-conv (after projection split, before causal conv mixes positions) self.hook_q_pre_conv = HookPoint() self.hook_k_pre_conv = HookPoint() self.hook_v_pre_conv = HookPoint() - # Conv output - self.hook_conv_out = HookPoint() - # Post-conv hooks (pre-GQA-expansion, pre-recurrence) + # Post-conv (pre-GQA-expansion, pre-recurrence) self.hook_q = HookPoint() self.hook_k = HookPoint() self.hook_v = HookPoint() # Gate parameters (per v-head) + self.hook_beta_logit = HookPoint() self.hook_beta = HookPoint() self.hook_log_decay = HookPoint() # Recurrence output + gated norm input @@ -84,7 +86,6 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: if self.original_component is None: raise RuntimeError(f"Original component not set for {self.name}.") - # Generation step → delegate to HF with only input/output hooks if kwargs.get("cache_params") is not None: return self._native_forward(*args, **kwargs) return self._hooked_forward(*args, **kwargs) @@ -98,6 +99,12 @@ def _native_forward(self, *args: Any, **kwargs: Any) -> Any: args = (self.hook_in(args[0]),) + args[1:] output = self.original_component(*args, **kwargs) + + if isinstance(output, tuple) and len(output) > 0: + first = output[0] + if isinstance(first, torch.Tensor): + return (self.hook_out(first),) + output[1:] + return output if isinstance(output, torch.Tensor): return self.hook_out(output) return output @@ -115,11 +122,8 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any: attention_mask = kwargs.get("attention_mask") if attention_mask is not None: - from transformers.models.qwen3_next.modeling_qwen3_next import ( - apply_mask_to_padding_states, - ) - - hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + # Inline masking — avoids hard dependency on qwen3_next module + 
hidden_states = hidden_states * attention_mask.unsqueeze(-1) hidden_states = self.hook_in(hidden_states) batch_size, seq_len, _ = hidden_states.shape @@ -128,7 +132,6 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any: projected_qkvz = hf.in_proj_qkvz(hidden_states) projected_ba = hf.in_proj_ba(hidden_states) - # Split into per-head Q, K, V, Z, beta_raw, alpha_raw query, key, value, z, b, a = hf.fix_query_key_value_ordering(projected_qkvz, projected_ba) # --- Pre-conv hooks (per-head shape, before conv mixes positions) --- @@ -153,9 +156,7 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any: mixed_qkv = F.silu(hf.conv1d(mixed_qkv)[:, :, :seq_len]) mixed_qkv = mixed_qkv.transpose(1, 2) - mixed_qkv = self.hook_conv_out(mixed_qkv) - - # Split post-conv + # Split post-conv into per-head Q, K, V query, key, value = torch.split( mixed_qkv, [hf.key_dim, hf.key_dim, hf.value_dim], @@ -171,9 +172,10 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any: value = self.hook_v(value) # --- Gate parameters (per v-head) --- + b = self.hook_beta_logit(b) beta = self.hook_beta(b.sigmoid()) - # g is log-space decay (negative); actual multiplicative decay = exp(g) + # g is log-space decay (NEGATIVE); multiplicative decay = exp(g) g = -hf.A_log.float().exp() * F.softplus(a.float() + hf.dt_bias) g = self.hook_log_decay(g) @@ -216,25 +218,27 @@ def compute_effective_attention( ) -> torch.Tensor: """Materialize the effective attention matrix from cached hook values. - The gated delta rule recurrence is: + The gated delta rule recurrence is:: + S_t = exp(g_t) * S_{t-1} + beta_t * v_t @ k_t^T o_t = S_t^T @ q_t - The effective attention M[i,j] = contribution of input j to output i: + The effective attention M[i,j] = contribution of input j to output i:: + M[i,j] = (q_i^T @ k_j) * beta_j * prod_{t=j+1}^{i} exp(g_t) - Note: the fused kernel applies L2-normalization to Q and K internally - (use_qk_l2norm_in_kernel=True). 
The hooked Q/K are pre-normalization, - so this reconstruction is approximate. For exact reconstruction, you'd - need the normalized Q/K which aren't exposed by the kernel. + **Approximation note:** The fused kernel applies L2-normalization to Q + and K internally (``use_qk_l2norm_in_kernel=True``). The hooked Q/K are + pre-normalization, so this reconstruction diverges when Q/K norms vary + significantly across positions/heads. Accuracy is best when Q/K norms + are roughly uniform (common after training converges). Args: - cache: ActivationCache from run_with_cache. + cache: ActivationCache from ``run_with_cache``. layer_idx: Block index for this linear_attn layer. Returns: - [batch, n_v_heads, seq, seq] causal attention matrix. Upper triangle - (j > i) is zero. + ``[batch, n_v_heads, seq, seq]`` causal matrix (upper triangle zero). Cost is O(batch * n_heads * seq^2); use on short sequences. """ @@ -266,24 +270,18 @@ def compute_effective_attention( batch, seq, n_heads, d_head = q.shape # QK similarity: [batch, n_heads, seq_i, seq_j] - q_perm = q.permute(0, 2, 1, 3) # [batch, n_heads, seq, d_head] + q_perm = q.permute(0, 2, 1, 3) k_perm = k.permute(0, 2, 1, 3) - qk = torch.matmul(q_perm, k_perm.transpose(-2, -1)) # [batch, n_heads, seq, seq] + qk = torch.matmul(q_perm, k_perm.transpose(-2, -1)) - # Cumulative decay: L[i,j] = prod_{t=j+1}^{i} exp(g_t) = exp(sum g[j+1..i]) - # g is [batch, seq, n_heads] → cumsum along seq + # Cumulative decay: L[i,j] = exp(sum g[j+1..i]) g_perm = g.permute(0, 2, 1) # [batch, n_heads, seq] cumsum_g = torch.cumsum(g_perm, dim=-1) - # L_log[i,j] = cumsum[i] - cumsum[j] L_log = cumsum_g[:, :, :, None] - cumsum_g[:, :, None, :] causal_mask = torch.tril(torch.ones(seq, seq, dtype=torch.bool, device=q.device)) L = torch.where(causal_mask[None, None], torch.exp(L_log), torch.zeros_like(L_log)) - # Beta broadcast: [batch, n_heads, 1, seq_j] - beta_col = beta.permute(0, 2, 1)[:, :, None, :] - # M[i,j] = qk[i,j] * beta[j] * L[i,j] - M = qk * 
beta_col * L - - return M + beta_col = beta.permute(0, 2, 1)[:, :, None, :] + return qk * beta_col * L diff --git a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py index ad17c38a6..135ab0d17 100644 --- a/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py +++ b/transformer_lens/model_bridge/generalized_components/position_embeddings_attention.py @@ -113,20 +113,26 @@ class PositionEmbeddingsAttentionBridge(PositionEmbeddingHooksMixin, AttentionBr """ def __init__( - self, name: str, config: Any, submodules: Optional[Dict[str, Any]] = None, **kwargs + self, + name: str, + config: Any, + submodules: Optional[Dict[str, Any]] = None, + optional: bool = False, + # Accepted for caller compatibility (Granite passes these explicitly) + # but always forced to True — this bridge reimplements attention. + requires_attention_mask: bool = True, + requires_position_embeddings: bool = True, + **kwargs, # absorb any other AttentionBridge kwargs callers may pass ): - """Initialize Gemma-3 attention bridge. 
- - Args: - name: Component name - config: Model configuration - submodules: Dictionary of subcomponents - **kwargs: Additional arguments passed to AttentionBridge - """ - kwargs["requires_position_embeddings"] = True - kwargs["requires_attention_mask"] = True - kwargs["maintain_native_attention"] = True - super().__init__(name, config, submodules, **kwargs) + super().__init__( + name, + config, + submodules, + requires_position_embeddings=True, + requires_attention_mask=True, + maintain_native_attention=True, + optional=optional, + ) self._init_position_embedding_hooks() if getattr(config, "gated_q_proj", False): self.hook_q_gate = HookPoint() diff --git a/transformer_lens/model_bridge/supported_architectures/granite.py b/transformer_lens/model_bridge/supported_architectures/granite.py index fbb911796..c46081b0b 100644 --- a/transformer_lens/model_bridge/supported_architectures/granite.py +++ b/transformer_lens/model_bridge/supported_architectures/granite.py @@ -51,6 +51,7 @@ def _setup_common_config(self, cfg: Any) -> None: self.cfg.gated_mlp = True self.cfg.attn_only = False self.cfg.uses_rms_norm = True + self.cfg.default_prepend_bos = False self.cfg.eps_attr = "variance_epsilon" self.default_config = { diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3.py b/transformer_lens/model_bridge/supported_architectures/qwen3.py index e37b44795..4676d1175 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3.py @@ -34,11 +34,15 @@ class Qwen3ArchitectureAdapter(ArchitectureAdapter): Serves as base class for Qwen3.5 and Qwen3Next hybrid variants. 
""" - def __init__(self, cfg: Any) -> None: + def __init__(self, cfg: Any, *, hybrid: bool = False) -> None: super().__init__(cfg) self._setup_qwen3_config(cfg) - self.weight_processing_conversions = {**self._qkvo_weight_conversions()} - self.component_mapping = self._build_component_mapping() + if hybrid: + self.supports_fold_ln = False + self.weight_processing_conversions: dict = {} + else: + self.weight_processing_conversions = {**self._qkvo_weight_conversions()} + self.component_mapping = self._build_component_mapping(hybrid=hybrid) def _setup_qwen3_config(self, cfg: Any) -> None: """Config shared across all Qwen3 variants (dense, hybrid, MoE).""" @@ -126,11 +130,15 @@ def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> No block.attn.set_rotary_emb(rotary_emb) # Set on template for get_generalized_component() calls - try: - attn_template = self.get_generalized_component("blocks.0.attn") - attn_template.set_rotary_emb(rotary_emb) - except ValueError: - pass # hybrid adapter with no attn in template + # Set on template — may not exist in hybrid adapters + mapping = self.component_mapping or {} + blocks_template = mapping.get("blocks") if isinstance(mapping, dict) else None + if blocks_template and "attn" in getattr(blocks_template, "submodules", {}): + try: + attn_template = self.get_generalized_component("blocks.0.attn") + attn_template.set_rotary_emb(rotary_emb) + except (ValueError, AttributeError, KeyError): + pass @staticmethod def _preprocess_gated_q_proj( diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py index a7c484eee..2fa7e5b0d 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py @@ -9,7 +9,6 @@ import torch -from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter from 
transformer_lens.model_bridge.supported_architectures.qwen3 import ( Qwen3ArchitectureAdapter, ) @@ -19,20 +18,13 @@ class Qwen3_5ArchitectureAdapter(Qwen3ArchitectureAdapter): """Hybrid linear-attention + full-attention with dense gated MLP. Inherits Qwen3 config/attention/MLP structure. Differences: - - supports_fold_ln = False (LN target varies by layer type) - - Attention is optional (absent on linear-attention layers) - - Gated q_proj (2x wide) requires preprocess_weights slicing - - No weight_processing_conversions until attn is fully wired + - Attention + linear_attn are optional (per-layer type) + - Gated q_proj (2x wide) sliced by preprocess_weights for weight analysis """ def __init__(self, cfg: Any) -> None: - # Call grandparent to set self.cfg, then configure ourselves - ArchitectureAdapter.__init__(self, cfg) - self._setup_qwen3_config(cfg) - self.supports_fold_ln = False - setattr(self.cfg, "gated_q_proj", True) # q_proj outputs [Q|gate] interleaved per head - self.weight_processing_conversions: dict = {} - self.component_mapping = self._build_component_mapping(hybrid=True) + setattr(cfg, "gated_q_proj", True) + super().__init__(cfg, hybrid=True) def prepare_loading(self, model_name: str, model_kwargs: dict) -> None: """Swap multimodal Qwen3_5Config for text-only Qwen3_5TextConfig. @@ -49,7 +41,7 @@ def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, t """Slice query half from gated q_proj.weight for weight-space analysis. In processed mode, W_Q is the pure query projection (for composition - scores, logit lens). Gate signal available in unprocessed mode via - hook_q_gate. + scores, logit lens). Gate signal available in unprocessed mode on + full-attention layers via blocks.N.attn.hook_q_gate. 
""" return self._preprocess_gated_q_proj(state_dict, self.cfg.n_heads, self.cfg.d_head) diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py index aa3ca6cc8..31e1be3cd 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3_next.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_next.py @@ -9,7 +9,6 @@ import torch -from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter from transformer_lens.model_bridge.generalized_components import MoEBridge from transformer_lens.model_bridge.supported_architectures.qwen3 import ( Qwen3ArchitectureAdapter, @@ -20,16 +19,11 @@ class Qwen3NextArchitectureAdapter(Qwen3ArchitectureAdapter): """Hybrid linear-attention + full-attention with sparse MoE MLP. Same hybrid design as Qwen3.5 but with MoE instead of dense MLP. - Inherits Qwen3 config/attention structure. """ def __init__(self, cfg: Any) -> None: - ArchitectureAdapter.__init__(self, cfg) - self._setup_qwen3_config(cfg) - self.supports_fold_ln = False - setattr(self.cfg, "gated_q_proj", True) # q_proj outputs [Q|gate] interleaved per head - self.weight_processing_conversions: dict = {} - self.component_mapping = self._build_component_mapping(hybrid=True) + setattr(cfg, "gated_q_proj", True) + super().__init__(cfg, hybrid=True) def _build_mlp_bridge(self): """Sparse MoE MLP (router + batched experts + shared expert).""" diff --git a/transformer_lens/tools/model_registry/data/architecture_gaps.json b/transformer_lens/tools/model_registry/data/architecture_gaps.json index f3eb11de9..6261a9a65 100644 --- a/transformer_lens/tools/model_registry/data/architecture_gaps.json +++ b/transformer_lens/tools/model_registry/data/architecture_gaps.json @@ -1,18 +1,18 @@ { - "generated_at": "2026-04-10", + "generated_at": "2026-04-14", "scan_info": { - "total_scanned": 5436, + "total_scanned": 5633, "task_filter": "text-generation", 
"min_downloads": 500, - "scan_duration_seconds": 3.9 + "scan_duration_seconds": 4.2 }, - "total_unsupported_architectures": 401, - "total_unsupported_models": 1459, + "total_unsupported_architectures": 416, + "total_unsupported_models": 1400, "gaps": [ { "architecture_id": "Qwen3_5ForConditionalGeneration", - "total_models": 67, - "total_downloads": 140710, + "total_models": 72, + "total_downloads": 146334, "min_param_count": 211968832, "sample_models": [ "Tesslate/OmniCoder-9B", @@ -20,18 +20,37 @@ "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx", "Brooooooklyn/Qwen3.5-27B-unsloth-mlx", "aifeifei798/Qwen3.5-Queen-27B", - "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled", "Brooooooklyn/Qwen3.5-9B-unsloth-mlx", - "alexcovo/qwen35-9b-mlx-turboquant-tq3", "bigatuna/Qwen3.5-9b-Sushi-Coder-RL-MLX", - "Jackrong/Qwen3.5-9B-Neo" + "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled", + "alexcovo/qwen35-9b-mlx-turboquant-tq3", + "Oysiyl/qwen3.5-27b-unslop-good-lora-v1" + ], + "relevancy_score": 91.0 + }, + { + "architecture_id": "Gemma4ForConditionalGeneration", + "total_models": 64, + "total_downloads": 90296, + "min_param_count": 738022691, + "sample_models": [ + "dealignai/Gemma-4-31B-JANG_4M-Uncensored", + "0xSero/gemma-4-21b-a4b-it-REAP", + "InfinimindCreations/gemma-4-E4B-it-uncensored", + "TrevorJS/gemma-4-26B-A4B-it-uncensored", + "WWTCyberLab/gemma-4-31B-it-abliterated", + "WWTCyberLab/gemma-4-26B-A4B-it-abliterated", + "TrevorJS/gemma-4-31B-it-uncensored", + "TrevorJS/gemma-4-E4B-it-uncensored", + "InfinimindCreations/gemma-4-31B-it-uncensored", + "TrevorJS/gemma-4-E2B-it-uncensored" ], - "relevancy_score": 91.5 + "relevancy_score": 84.9 }, { "architecture_id": "DeepseekV3ForCausalLM", - "total_models": 48, - "total_downloads": 6449394, + "total_models": 46, + "total_downloads": 6840308, "min_param_count": 1656048, "sample_models": [ "deepseek-ai/DeepSeek-R1", @@ -39,43 +58,24 @@ "deepseek-ai/DeepSeek-V3", 
"deepseek-ai/DeepSeek-V3-0324", "moonshotai/Kimi-K2-Instruct-0905", - "deepseek-ai/DeepSeek-V3.1", - "ai-sage/GigaChat3-10B-A1.8B", "moonshotai/Kimi-K2-Instruct", + "deepseek-ai/DeepSeek-V3.1", "trl-internal-testing/tiny-DeepseekV3ForCausalLM", - "trl-internal-testing/tiny-DeepseekV3ForCausalLM-0528" - ], - "relevancy_score": 87.2 - }, - { - "architecture_id": "Qwen3MoeForCausalLM", - "total_models": 45, - "total_downloads": 5469133, - "min_param_count": 2574656, - "sample_models": [ - "Qwen/Qwen3-30B-A3B", - "Qwen/Qwen3-Coder-30B-A3B-Instruct", - "Qwen/Qwen3-30B-A3B-Instruct-2507", - "Qwen/Qwen3-235B-A22B", - "trl-internal-testing/tiny-Qwen3MoeForCausalLM", - "Qwen/Qwen3-30B-A3B-Thinking-2507", - "Qwen/Qwen3-235B-A22B-Instruct-2507", - "Qwen/Qwen3-Coder-480B-A35B-Instruct", - "Qwen/Qwen3-235B-A22B-Thinking-2507", - "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4" + "trl-internal-testing/tiny-DeepseekV3ForCausalLM-0528", + "moonshotai/Moonlight-16B-A3B-Instruct" ], - "relevancy_score": 84.9 + "relevancy_score": 83.1 }, { "architecture_id": "NemotronHForCausalLM", - "total_models": 40, - "total_downloads": 3187865, + "total_models": 41, + "total_downloads": 3587883, "min_param_count": 4221480, "sample_models": [ "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", - "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese", "nvidia/NVIDIA-Nemotron-Nano-9B-v2", "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese", "nvidia/Nemotron-Cascade-2-30B-A3B", "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16", "unsloth/NVIDIA-Nemotron-3-Nano-4B", @@ -83,69 +83,31 @@ "empero-ai/openNemo-9B", "cpagac/Nemotron-Nano-9B-v2-heretic" ], - "relevancy_score": 80.3 - }, - { - "architecture_id": "Qwen3_5ForCausalLM", - "total_models": 52, - "total_downloads": 81342, - "min_param_count": 752393024, - "sample_models": [ - "lukey03/Qwen3.5-9B-abliterated", - "GoodStartLabs/gin-rummy-hbc-qwen3.5-0.8b", - "aifeifei798/Darkidol-Ballad-27B", - "brocchirodrigo/anotaai-ajuda-qwen3_5_Q4", - 
"kai-os/Carnice-9b", - "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v1", - "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v2", - "Phonsiri/Qwen3.5-9B-Thai-Law-Base", - "continuum-ai/qwen3.5-4b-code-forged", - "aifeifei798/Darkidol-Ballad-9B" - ], - "relevancy_score": 80.2 + "relevancy_score": 78.6 }, { "architecture_id": "Lfm2ForCausalLM", "total_models": 40, - "total_downloads": 1395683, + "total_downloads": 1626247, "min_param_count": 274754048, "sample_models": [ "farbodtavakkoli/OTel-LLM-1.2B-IT", "LiquidAI/LFM2.5-1.2B-Instruct", "LiquidAI/LFM2-1.2B", - "LiquidAI/LFM2-350M", "LiquidAI/LFM2.5-350M", + "LiquidAI/LFM2-350M", "LiquidAI/LFM2.5-1.2B-Thinking", "LiquidAI/LFM2-2.6B-Exp", "LiquidAI/LFM2.5-1.2B-Base", "LiquidAI/LFM2-700M", "unsloth/LFM2.5-1.2B-Instruct" ], - "relevancy_score": 78.4 - }, - { - "architecture_id": "Gemma4ForConditionalGeneration", - "total_models": 37, - "total_downloads": 51866, - "min_param_count": 2084387402, - "sample_models": [ - "dealignai/Gemma-4-31B-JANG_4M-Uncensored", - "0xSero/gemma-4-21b-a4b-it-REAP", - "InfinimindCreations/gemma-4-E4B-it-uncensored", - "lthn/lemma", - "TrevorJS/gemma-4-26B-A4B-it-uncensored", - "livadies/gemma-4-E2B-Ghetto-NF4", - "Greytechai/Gemma-4-31B-JANG_4M-CRACK", - "WWTCyberLab/gemma-4-31B-it-abliterated", - "WWTCyberLab/gemma-4-26B-A4B-it-abliterated", - "InfinimindCreations/gemma-4-31B-it-uncensored" - ], - "relevancy_score": 65.1 + "relevancy_score": 76.3 }, { "architecture_id": "QWenLMHeadModel", "total_models": 22, - "total_downloads": 495498, + "total_downloads": 522223, "min_param_count": 19545408, "sample_models": [ "cckevinn/SeeClick", @@ -156,91 +118,53 @@ "Qwen/Qwen-1_8B-Chat", "Qwen/Qwen-14B-Chat", "Qwen/Qwen-14B", - "Xingyu-Zheng/Qwen-VL-Chat", - "Qwen/Qwen-72B" - ], - "relevancy_score": 64.0 - }, - { - "architecture_id": "InternLM2ForCausalLM", - "total_models": 23, - "total_downloads": 253936, - "min_param_count": 24052864, - "sample_models": [ - 
"internlm/internlm2-chat-7b", - "internlm/internlm2_5-7b-chat", - "internlm/internlm2-7b", - "internlm/internlm2-20b", - "internlm/internlm2-base-7b", - "internlm/internlm2-chat-20b", - "internlm/internlm2-base-20b", - "chujiezheng/internlm2-chat-20b-ExPO", - "chujiezheng/internlm2-chat-7b-ExPO", - "internlm/internlm2-1_8b" + "Qwen/Qwen-Audio-Chat", + "Xingyu-Zheng/Qwen-VL-Chat" ], - "relevancy_score": 63.2 + "relevancy_score": 62.5 }, { - "architecture_id": "GPTBigCodeForCausalLM", - "total_models": 24, - "total_downloads": 109509, - "min_param_count": 1845928, + "architecture_id": "DeepseekV32ForCausalLM", + "total_models": 12, + "total_downloads": 9006409, + "min_param_count": 136559748, "sample_models": [ - "bigcode/gpt_bigcode-santacoder", - "bigcode/tiny_starcoder_py", - "bigcode/starcoder", - "bigcode/starcoderbase-1b", - "ibm-granite/granite-20b-code-base-8k", - "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct", - "HuggingFaceH4/starchat-alpha", - "defog/sqlcoder2", - "HuggingFaceH4/starchat-beta", - "ibm-granite/granite-20b-code-instruct-8k" + "deepseek-ai/DeepSeek-V3.2", + "deepseek-ai/DeepSeek-V3.2-Exp", + "deepseek-ai/DeepSeek-V3.2-Speciale", + "deepseek-ai/DeepSeek-Math-V2", + "zianglih/DeepSeek-V3.2-6layer-last-1-bf16", + "exolabs/DeepSeek-V3.2_bf16", + "deepseek-ai/DeepSeek-V3.2-Exp-Base", + "cs2764/DeepSeek-V3.2_dq4-mlx", + "zianglih/DeepSeek-V3.2-MXFP8", + "hyper-accel/tiny-random-deepseek-v32" ], - "relevancy_score": 62.0 + "relevancy_score": 62.5 }, { - "architecture_id": "T5GemmaForConditionalGeneration", - "total_models": 14, - "total_downloads": 1037477, - "min_param_count": 312517632, + "architecture_id": "JambaForCausalLM", + "total_models": 28, + "total_downloads": 48565, + "min_param_count": 127679344, "sample_models": [ - "google/t5gemma-s-s-prefixlm", - "google/t5gemma-9b-9b-ul2", - "google/t5gemma-b-b-ul2", - "google/t5gemma-2b-2b-prefixlm", - "google/t5gemma-2b-2b-ul2", - "google/t5gemma-l-l-ul2-it", - 
"google/t5gemma-ml-ml-ul2-it", - "google/t5gemma-b-b-prefixlm", - "google/t5gemma-s-s-prefixlm-it", - "google/t5gemma-9b-9b-prefixlm" - ], - "relevancy_score": 60.3 - }, - { - "architecture_id": "XGLMForCausalLM", - "total_models": 18, - "total_downloads": 223424, - "min_param_count": 162256896, - "sample_models": [ - "facebook/xglm-564M", - "facebook/incoder-1B", - "facebook/xglm-7.5B", - "facebook/xglm-4.5B", - "facebook/xglm-1.7B", - "KoboldAI/fairseq-dense-2.7B", - "KoboldAI/fairseq-dense-125M", - "KoboldAI/fairseq-dense-355M", - "KoboldAI/fairseq-dense-13B", - "KoboldAI/fairseq-dense-1.3B" + "ai21labs/AI21-Jamba-Mini-1.5", + "ai21labs/Jamba-tiny-random", + "ai21labs/AI21-Jamba-Mini-1.6", + "ai21labs/AI21-Jamba-Large-1.5", + "ai21labs/AI21-Jamba2-3B", + "ai21labs/AI21-Jamba-Large-1.6", + "ai21labs/Jamba-v0.1", + "ai21labs/AI21-Jamba2-Mini", + "ai21labs/AI21-Jamba-Reasoning-3B", + "microsoft/Dayhoff-170M-GRS-112000" ], - "relevancy_score": 59.6 + "relevancy_score": 61.1 }, { "architecture_id": "Glm4MoeForCausalLM", - "total_models": 14, - "total_downloads": 742282, + "total_models": 16, + "total_downloads": 751068, "min_param_count": 2572352, "sample_models": [ "zai-org/GLM-4.5-Air", @@ -249,72 +173,55 @@ "zai-org/GLM-4.5", "zai-org/GLM-4.6", "np-cr/testing-glm4-moe", - "ArliAI/GLM-4.6-Derestricted-v3", "PrimeIntellect/GLM-0.5B", + "ArliAI/GLM-4.6-Derestricted-v3", "zai-org/GLM-4.5-Air-Base", "PrimeIntellect/INTELLECT-3" ], "relevancy_score": 59.6 }, { - "architecture_id": "JambaForCausalLM", - "total_models": 22, - "total_downloads": 44090, - "min_param_count": 127679344, - "sample_models": [ - "ai21labs/AI21-Jamba-Mini-1.5", - "ai21labs/Jamba-tiny-random", - "ai21labs/AI21-Jamba-Mini-1.6", - "ai21labs/AI21-Jamba-Large-1.5", - "ai21labs/AI21-Jamba-Large-1.6", - "ai21labs/AI21-Jamba2-3B", - "ai21labs/Jamba-v0.1", - "ai21labs/AI21-Jamba2-Mini", - "ai21labs/AI21-Jamba-Reasoning-3B", - "microsoft/Dayhoff-170m-GR" - ], - "relevancy_score": 58.6 - }, - { - 
"architecture_id": "DeepseekV32ForCausalLM", - "total_models": 8, - "total_downloads": 1446699, - "min_param_count": 136559748, + "architecture_id": "T5GemmaForConditionalGeneration", + "total_models": 14, + "total_downloads": 1062491, + "min_param_count": 312517632, "sample_models": [ - "deepseek-ai/DeepSeek-V3.2", - "deepseek-ai/DeepSeek-V3.2-Exp", - "deepseek-ai/DeepSeek-V3.2-Speciale", - "deepseek-ai/DeepSeek-Math-V2", - "exolabs/DeepSeek-V3.2_bf16", - "deepseek-ai/DeepSeek-V3.2-Exp-Base", - "hyper-accel/tiny-random-deepseek-v32", - "cs2764/DeepSeek-V3.2_dq4-mlx" + "google/t5gemma-s-s-prefixlm", + "google/t5gemma-b-b-ul2", + "google/t5gemma-9b-9b-ul2", + "google/t5gemma-2b-2b-prefixlm", + "google/t5gemma-2b-2b-ul2", + "google/t5gemma-l-l-ul2-it", + "google/t5gemma-ml-ml-ul2-it", + "google/t5gemma-b-b-prefixlm", + "google/t5gemma-s-s-prefixlm-it", + "google/t5gemma-9b-9b-prefixlm" ], - "relevancy_score": 57.0 + "relevancy_score": 59.1 }, { - "architecture_id": "BaichuanForCausalLM", - "total_models": 15, - "total_downloads": 115111, - "min_param_count": 16204352, + "architecture_id": "GPTBigCodeForCausalLM", + "total_models": 24, + "total_downloads": 39369, + "min_param_count": 1845928, "sample_models": [ - "baichuan-inc/Baichuan2-7B-Chat", - "baichuan-inc/Baichuan2-13B-Chat", - "baichuan-inc/Baichuan-13B-Chat", - "baichuan-inc/Baichuan2-7B-Base", - "baichuan-inc/Baichuan2-13B-Base", - "sakuraumi/Sakura-13B-Galgame", - "zxbsmk/NSFW_13B_sft", - "katuni4ka/tiny-random-baichuan2", - "baichuan-inc/Baichuan-13B-Base", - "FreedomIntelligence/HuatuoGPT2-7B" + "bigcode/starcoder", + "bigcode/starcoderbase-1b", + "ibm-granite/granite-20b-code-base-8k", + "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct", + "HuggingFaceH4/starchat-alpha", + "defog/sqlcoder2", + "ibm-granite/granite-20b-code-instruct-8k", + "HuggingFaceH4/starchat-beta", + "LoupGarou/WizardCoder-Guanaco-15B-V1.0", + "openchat/opencoderplus" ], - "relevancy_score": 56.1 + "relevancy_score": 
58.1 }, { "architecture_id": "SmolLM3ForCausalLM", - "total_models": 7, - "total_downloads": 1155432, + "total_models": 8, + "total_downloads": 1123995, "min_param_count": 8245568, "sample_models": [ "HuggingFaceTB/SmolLM3-3B", @@ -322,15 +229,35 @@ "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM", "unsloth/SmolLM3-3B", "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM", - "MInAlA/smollm3-dpo-merged", - "N-Bot-Int/SmolSam3-MEMGRPO" + "MInAlA/SmolLM3-3B-DPO-merged", + "N-Bot-Int/SmolSam3-MEMGRPO", + "yujiepan/smollm3-tiny-random" + ], + "relevancy_score": 55.5 + }, + { + "architecture_id": "GlmMoeDsaForCausalLM", + "total_models": 10, + "total_downloads": 590748, + "min_param_count": 162774148, + "sample_models": [ + "zai-org/GLM-5", + "zai-org/GLM-5.1", + "cs2764/GLM-5-abliterated-dq4-mlx", + "livadies/GLM-5.1-Ghetto-MoE-2-Experts", + "unsloth/GLM-5", + "JANGQ-AI/GLM-5.1-JANG_2S", + "0xSero/GLM-5-REAP-381B", + "JANGQ-AI/GLM-5.1-JANG_1L", + "cs2764/GLM-5-abliterated-dq3-mlx", + "hyper-accel/tiny-random-glm-moe-dsa" ], - "relevancy_score": 55.9 + "relevancy_score": 55.3 }, { "architecture_id": "BartForConditionalGeneration", "total_models": 9, - "total_downloads": 599134, + "total_downloads": 692599, "min_param_count": 6044480, "sample_models": [ "KomeijiForce/bart-large-emojilm", @@ -343,12 +270,31 @@ "Tianlin668/MentalBART", "KomeijiForce/bart-large-emojilm-e2t" ], - "relevancy_score": 55.7 + "relevancy_score": 55.0 + }, + { + "architecture_id": "BaichuanForCausalLM", + "total_models": 15, + "total_downloads": 117761, + "min_param_count": 16204352, + "sample_models": [ + "baichuan-inc/Baichuan2-7B-Chat", + "baichuan-inc/Baichuan2-13B-Chat", + "baichuan-inc/Baichuan-13B-Chat", + "baichuan-inc/Baichuan2-7B-Base", + "baichuan-inc/Baichuan2-13B-Base", + "zxbsmk/NSFW_13B_sft", + "sakuraumi/Sakura-13B-Galgame", + "baichuan-inc/Baichuan-13B-Base", + "katuni4ka/tiny-random-baichuan2", + "FreedomIntelligence/HuatuoGPT2-7B" + ], + "relevancy_score": 54.9 }, { 
"architecture_id": "FalconH1ForCausalLM", "total_models": 15, - "total_downloads": 76731, + "total_downloads": 77408, "min_param_count": 91131072, "sample_models": [ "tiiuae/Falcon-H1-0.5B-Base", @@ -359,80 +305,45 @@ "tiiuae/Falcon-H1-1.5B-Base", "tiiuae/Falcon-H1-Tiny-90M-Instruct", "tiiuae/Falcon-H1R-7B", - "tiiuae/Falcon-H1-1.5B-Deep-Instruct", - "tiiuae/Falcon-H1-3B-Instruct" - ], - "relevancy_score": 55.2 - }, - { - "architecture_id": "CohereForCausalLM", - "total_models": 10, - "total_downloads": 193414, - "min_param_count": 2042176, - "sample_models": [ - "trl-internal-testing/tiny-CohereForCausalLM", - "CohereLabs/aya-expanse-8b", - "CohereLabs/c4ai-command-r-v01", - "CohereLabs/aya-23-8B", - "NLPark/AnFeng_v3_Avocet", - "CohereLabs/aya-expanse-32b", - "CohereLabs/aya-23-35B", - "CohereLabs/c4ai-command-r-plus-08-2024", - "CohereLabs/c4ai-command-r-08-2024", - "CohereLabs/c4ai-command-r-plus" + "tiiuae/Falcon-H1-3B-Instruct", + "tiiuae/Falcon-H1-1.5B-Deep-Instruct" ], - "relevancy_score": 53.9 + "relevancy_score": 54.0 }, { "architecture_id": "H2OVLChatModel", "total_models": 2, - "total_downloads": 2131755, + "total_downloads": 2009160, "min_param_count": 826295808, "sample_models": [ "h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-2b" ], - "relevancy_score": 53.9 - }, - { - "architecture_id": "MiniCPMForCausalLM", - "total_models": 12, - "total_downloads": 93202, - "min_param_count": 80000640, - "sample_models": [ - "openbmb/MiniCPM-2B-sft-bf16", - "openbmb/MiniCPM4.1-8B", - "openbmb/MiniCPM-1B-sft-bf16", - "openbmb/MiniCPM4-0.5B", - "openbmb/MiniCPM-MoE-8x2B", - "openbmb/MiniCPM-S-1B-sft", - "katuni4ka/tiny-random-minicpm", - "openbmb/MiniCPM4-8B", - "openbmb/MiniCPM-2B-sft-fp32", - "openbmb/MiniCPM-2B-dpo-bf16" - ], - "relevancy_score": 53.6 + "relevancy_score": 53.0 }, { - "architecture_id": "GlmMoeDsaForCausalLM", - "total_models": 7, - "total_downloads": 411962, - "min_param_count": 162774148, + "architecture_id": "DFlashDraftModel", + 
"total_models": 11, + "total_downloads": 131573, + "min_param_count": 473995264, "sample_models": [ - "zai-org/GLM-5", - "zai-org/GLM-5.1", - "cs2764/GLM-5-abliterated-dq4-mlx", - "0xSero/GLM-5-REAP-381B", - "unsloth/GLM-5", - "cs2764/GLM-5-abliterated-dq3-mlx", - "hyper-accel/tiny-random-glm-moe-dsa" + "z-lab/Qwen3-4B-DFlash-b16", + "z-lab/Qwen3-8B-DFlash-b16", + "z-lab/Qwen3.5-27B-DFlash", + "z-lab/Qwen3.5-9B-DFlash", + "z-lab/Qwen3.5-4B-DFlash", + "z-lab/Qwen3.5-35B-A3B-DFlash", + "z-lab/gpt-oss-120b-DFlash", + "z-lab/gpt-oss-20b-DFlash", + "z-lab/Qwen3-Coder-30B-A3B-DFlash", + "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat" ], - "relevancy_score": 53.6 + "relevancy_score": 52.6 }, { "architecture_id": "Cohere2ForCausalLM", "total_models": 9, - "total_downloads": 207703, + "total_downloads": 220158, "min_param_count": 2090024, "sample_models": [ "trl-internal-testing/tiny-Cohere2ForCausalLM", @@ -445,12 +356,12 @@ "CohereLabs/tiny-aya-fire", "CohereLabs/tiny-aya-earth" ], - "relevancy_score": 53.4 + "relevancy_score": 52.5 }, { "architecture_id": "PhiMoEForCausalLM", "total_models": 4, - "total_downloads": 889098, + "total_downloads": 902073, "min_param_count": 1110112, "sample_models": [ "microsoft/Phi-tiny-MoE-instruct", @@ -458,37 +369,37 @@ "microsoft/Phi-3.5-MoE-instruct", "optimum-intel-internal-testing/phi-3.5-moe-tiny-random" ], - "relevancy_score": 53.3 + "relevancy_score": 52.5 }, { - "architecture_id": "MPTForCausalLM", - "total_models": 26, - "total_downloads": 41278, - "min_param_count": 6649286656, + "architecture_id": "MiniCPMForCausalLM", + "total_models": 12, + "total_downloads": 90418, + "min_param_count": 80000640, "sample_models": [ - "vinai/PhoGPT-4B-Chat", - "vinai/PhoGPT-4B", - "anas-awadalla/mpt-7b", - "gl198976/mpt-7b-instruct", - "replit/replit-code-v1-3b", - "echarlaix/tiny-mpt-random-remote-code", - "wtang06/mpt-125m-c4", - "lightblue/japanese-mpt-7b", - "gl198976/mpt-7b", - "TehVenom/MPT-7b-Chat-Instruct-LongCTX-Merge" + 
"openbmb/MiniCPM-2B-sft-bf16", + "openbmb/MiniCPM4.1-8B", + "openbmb/MiniCPM-1B-sft-bf16", + "openbmb/MiniCPM4-0.5B", + "openbmb/MiniCPM-MoE-8x2B", + "openbmb/MiniCPM-S-1B-sft", + "openbmb/MiniCPM4-8B", + "katuni4ka/tiny-random-minicpm", + "openbmb/MiniCPM-2B-dpo-bf16", + "openbmb/MiniCPM-2B-sft-fp32" ], - "relevancy_score": 53.2 + "relevancy_score": 52.4 }, { "architecture_id": "RwkvForCausalLM", "total_models": 15, - "total_downloads": 31498, + "total_downloads": 29790, "min_param_count": 169342464, "sample_models": [ "RWKV/v5-Eagle-7B-HF", "RWKV/rwkv-4-169m-pile", - "beomi/KoRWKV-6B", "RWKV/rwkv-4-430m-pile", + "beomi/KoRWKV-6B", "RWKV/rwkv-4-1b5-pile", "RWKV/rwkv-4-3b-pile", "RWKV/rwkv-raven-1b5", @@ -496,12 +407,12 @@ "RWKV/rwkv-raven-3b", "RWKV/rwkv-raven-14b" ], - "relevancy_score": 53.2 + "relevancy_score": 51.9 }, { "architecture_id": "MT5ForConditionalGeneration", "total_models": 13, - "total_downloads": 55149, + "total_downloads": 51271, "min_param_count": 300176768, "sample_models": [ "knowledgator/IUPAC2SMILES-canonical-base", @@ -515,47 +426,43 @@ "intelia-lab-uah/mt0-base_QG_SQAC", "UBC-NLP/toucan-1.2B" ], - "relevancy_score": 53.1 - }, - { - "architecture_id": "DFlashDraftModel", - "total_models": 10, - "total_downloads": 128716, - "min_param_count": 473995264, - "sample_models": [ - "z-lab/Qwen3-4B-DFlash-b16", - "z-lab/Qwen3-8B-DFlash-b16", - "z-lab/Qwen3.5-9B-DFlash", - "z-lab/Qwen3.5-27B-DFlash", - "z-lab/Qwen3.5-4B-DFlash", - "z-lab/gpt-oss-120b-DFlash", - "z-lab/gpt-oss-20b-DFlash", - "z-lab/Qwen3.5-35B-A3B-DFlash", - "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat", - "z-lab/Qwen3-Coder-30B-A3B-DFlash" - ], - "relevancy_score": 53.0 + "relevancy_score": 51.8 }, { "architecture_id": "Qwen2MoeForCausalLM", "total_models": 7, - "total_downloads": 203653, + "total_downloads": 193536, "min_param_count": 1219036, "sample_models": [ "Qwen/Qwen1.5-MoE-A2.7B", "Qwen/Qwen1.5-MoE-A2.7B-Chat", "Qwen/Qwen2-57B-A14B-Instruct", "Qwen/Qwen2-57B-A14B", - 
"katuni4ka/tiny-random-qwen1.5-moe", "yujiepan/qwen1.5-moe-tiny-random", + "katuni4ka/tiny-random-qwen1.5-moe", "xd2010/Qwen1.5-MOE-sft-math7k-densemixer" ], - "relevancy_score": 52.0 + "relevancy_score": 51.0 + }, + { + "architecture_id": "FalconMambaForCausalLM", + "total_models": 6, + "total_downloads": 194376, + "min_param_count": 525400, + "sample_models": [ + "trl-internal-testing/tiny-FalconMambaForCausalLM", + "tiiuae/falcon-mamba-7b-instruct", + "tiiuae/falcon-mamba-7b", + "tiiuae/falcon-mamba-tiny-dev", + "tiiuae/Falcon3-Mamba-7B-Instruct", + "tiiuae/Falcon3-Mamba-7B-Base" + ], + "relevancy_score": 50.4 }, { "architecture_id": "Phi3VForCausalLM", "total_models": 6, - "total_downloads": 174972, + "total_downloads": 173011, "min_param_count": 304612720, "sample_models": [ "microsoft/Phi-3-vision-128k-instruct", @@ -565,12 +472,12 @@ "Desm0nt/Phi-3-HornyVision-128k-instruct", "failspy/Phi-3-vision-128k-instruct-abliterated-alpha" ], - "relevancy_score": 51.0 + "relevancy_score": 50.1 }, { "architecture_id": "ExaoneForCausalLM", "total_models": 7, - "total_downloads": 660526, + "total_downloads": 626575, "min_param_count": 2405327360, "sample_models": [ "LGAI-EXAONE/EXAONE-Deep-7.8B", @@ -581,26 +488,31 @@ "LGAI-EXAONE/EXAONE-Deep-32B", "LGAI-EXAONE/EXAONE-Deep-2.4B" ], - "relevancy_score": 50.6 + "relevancy_score": 49.5 }, { - "architecture_id": "FalconMambaForCausalLM", - "total_models": 5, - "total_downloads": 186669, - "min_param_count": 525400, + "architecture_id": "Glm4ForCausalLM", + "total_models": 10, + "total_downloads": 32445, + "min_param_count": 4854928, "sample_models": [ - "trl-internal-testing/tiny-FalconMambaForCausalLM", - "tiiuae/falcon-mamba-7b-instruct", - "tiiuae/falcon-mamba-7b", - "tiiuae/falcon-mamba-tiny-dev", - "tiiuae/Falcon3-Mamba-7B-Instruct" + "zai-org/GLM-4-9B-0414", + "zai-org/GLM-4-32B-0414", + "zai-org/GLM-Z1-9B-0414", + "MCult01/glm-muse-v2", + "zai-org/GLM-Z1-32B-0414", + "MCult01/glm-muse-v1", + 
"zai-org/GLM-4-32B-Base-0414", + "yujiepan/glm-4-tiny-random", + "llmfan46/GLM-4-32B-0414-uncensored-heretic-v1", + "ccui46/cookingworld_per_chunk_act_glm_tokfix_diffPrompt_5000" ], - "relevancy_score": 50.5 + "relevancy_score": 49.0 }, { "architecture_id": "LlavaQwenForCausalLM", "total_models": 4, - "total_downloads": 165137, + "total_downloads": 186477, "min_param_count": 893618208, "sample_models": [ "lmms-lab/llava-onevision-qwen2-7b-ov", @@ -608,24 +520,62 @@ "lmms-lab/llava-onevision-qwen2-7b-si", "lmms-lab/llava-onevision-qwen2-0.5b-si" ], - "relevancy_score": 49.5 + "relevancy_score": 49.0 + }, + { + "architecture_id": "MiniMaxM2ForCausalLM", + "total_models": 23, + "total_downloads": 1143531, + "min_param_count": 18581099008, + "sample_models": [ + "MiniMaxAI/MiniMax-M2.5", + "cerebras/MiniMax-M2.1-REAP-139B-A10B", + "MiniMaxAI/MiniMax-M2", + "MiniMaxAI/MiniMax-M2.7", + "MiniMaxAI/MiniMax-M2.1", + "cerebras/MiniMax-M2.5-REAP-139B-A10B", + "JANGQ-AI/MiniMax-M2.7-JANG_2L", + "aspctu/MiniMax-M2.5", + "JANGQ-AI/MiniMax-M2.7-JANG_3L", + "dealignai/MiniMax-M2.5-UNCENSORED-JANG_2L" + ], + "relevancy_score": 48.9 + }, + { + "architecture_id": "LlamaForCausalLMEagle3", + "total_models": 11, + "total_downloads": 22792, + "min_param_count": 145422848, + "sample_models": [ + "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + "nvidia/gpt-oss-120b-Eagle3-long-context", + "chankhavu/c2.eagle3-test", + "Zjcxy-SmartAI/Eagle3-Qwen3-32B-zh", + "nvidia/gpt-oss-120b-Eagle3-short-context", + "Zjcxy-SmartAI/Eagle3-Qwen3-8B-zh", + "nvidia/gpt-oss-120b-Eagle3-throughput", + "thoughtworks/Gemma-4-31B-Eagle3", + "ruipeterpan/Qwen2.5-7B-Instruct_EAGLE3_UltraChat", + "thoughtworks/MiniMax-M2.5-Eagle3" + ], + "relevancy_score": 48.8 }, { "architecture_id": "BambaForCausalLM", "total_models": 3, - "total_downloads": 224342, + "total_downloads": 225000, "min_param_count": 33110760, "sample_models": [ "hmellor/tiny-random-BambaForCausalLM", "ibm-ai-platform/Bamba-9B-v1", 
"ibm-ai-platform/Bamba-9B-v2" ], - "relevancy_score": 49.5 + "relevancy_score": 48.8 }, { "architecture_id": "Eagle3Speculator", "total_models": 5, - "total_downloads": 105711, + "total_downloads": 104424, "min_param_count": 950186496, "sample_models": [ "RedHatAI/Qwen3-8B-speculator.eagle3", @@ -634,41 +584,41 @@ "RedHatAI/Qwen3-32B-speculator.eagle3", "RedHatAI/Qwen3-14B-speculator.eagle3" ], - "relevancy_score": 49.2 + "relevancy_score": 48.4 }, { "architecture_id": "OpenAIGPTLMHeadModel", "total_models": 2, - "total_downloads": 236281, + "total_downloads": 230174, "min_param_count": 119680512, "sample_models": [ "openai-community/openai-gpt", "lgaalves/gpt1" ], - "relevancy_score": 49.0 + "relevancy_score": 48.2 }, { "architecture_id": "HunYuanDenseV1ForCausalLM", "total_models": 9, - "total_downloads": 28409, + "total_downloads": 28771, "min_param_count": 539010048, "sample_models": [ "tencent/Hunyuan-7B-Instruct", "tencent/Hunyuan-0.5B-Pretrain", "tencent/Hunyuan-1.8B-Pretrain", - "tencent/Hunyuan-4B-Pretrain", "tencent/Hunyuan-7B-Instruct-0124", + "tencent/Hunyuan-4B-Pretrain", "tencent/Hunyuan-7B-Pretrain", "tencent/Hunyuan-1.8B-Instruct", "tencent/Hunyuan-0.5B-Instruct", "tencent/Hunyuan-4B-Instruct" ], - "relevancy_score": 48.9 + "relevancy_score": 48.1 }, { "architecture_id": "BloomModel", "total_models": 8, - "total_downloads": 38422, + "total_downloads": 39579, "min_param_count": 16156544, "sample_models": [ "bigscience/bigscience-small-testing", @@ -680,30 +630,12 @@ "Muennighoff/bloom-tiny-random", "TurkuNLP/gpt3-finnish-xl" ], - "relevancy_score": 48.9 - }, - { - "architecture_id": "LlamaForCausalLMEagle3", - "total_models": 9, - "total_downloads": 20292, - "min_param_count": 208676608, - "sample_models": [ - "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", - "nvidia/gpt-oss-120b-Eagle3-long-context", - "nvidia/gpt-oss-120b-Eagle3-short-context", - "Zjcxy-SmartAI/Eagle3-Qwen3-32B-zh", - "chankhavu/c2.eagle3-test", - "Zjcxy-SmartAI/Eagle3-Qwen3-8B-zh", - 
"nvidia/gpt-oss-120b-Eagle3-throughput", - "thoughtworks/Gemma-4-31B-Eagle3", - "ruipeterpan/Qwen2.5-7B-Instruct_EAGLE3_UltraChat" - ], - "relevancy_score": 48.2 + "relevancy_score": 48.1 }, { "architecture_id": "NemotronForCausalLM", "total_models": 5, - "total_downloads": 59740, + "total_downloads": 63951, "min_param_count": 2150720, "sample_models": [ "nvidia/Nemotron-Mini-4B-Instruct", @@ -712,89 +644,57 @@ "badaoui/tiny-random-NemotronForCausalLM", "thhaus/nemotron3-8b" ], - "relevancy_score": 47.9 - }, - { - "architecture_id": "Glm4ForCausalLM", - "total_models": 7, - "total_downloads": 30432, - "min_param_count": 4854928, - "sample_models": [ - "zai-org/GLM-4-9B-0414", - "zai-org/GLM-Z1-32B-0414", - "zai-org/GLM-Z1-9B-0414", - "zai-org/GLM-4-32B-0414", - "zai-org/GLM-4-32B-Base-0414", - "llmfan46/GLM-4-32B-0414-uncensored-heretic-v1", - "yujiepan/glm-4-tiny-random" - ], - "relevancy_score": 47.7 + "relevancy_score": 47.3 }, { "architecture_id": "HyenaDNAForCausalLM", "total_models": 6, - "total_downloads": 38899, + "total_downloads": 38536, "min_param_count": 450712, "sample_models": [ "LongSafari/hyenadna-small-32k-seqlen-hf", "LongSafari/hyenadna-medium-450k-seqlen-hf", - "LongSafari/hyenadna-large-1m-seqlen-hf", "LongSafari/hyenadna-tiny-1k-seqlen-hf", + "LongSafari/hyenadna-large-1m-seqlen-hf", "LongSafari/hyenadna-medium-160k-seqlen-hf", "LongSafari/hyenadna-tiny-16k-seqlen-d128-hf" ], - "relevancy_score": 47.6 - }, - { - "architecture_id": "ProGenForCausalLM", - "total_models": 5, - "total_downloads": 47595, - "min_param_count": 151148576, - "sample_models": [ - "hugohrban/progen2-base", - "hugohrban/progen2-small", - "hugohrban/progen2-medium", - "hugohrban/progen2-large", - "hugohrban/progen2-small-mix7" - ], - "relevancy_score": 47.4 - }, - { - "architecture_id": "Eagle3DraftModel", - "total_models": 7, - "total_downloads": 24688, - "min_param_count": 522152832, - "sample_models": [ - "RedHatAI/gpt-oss-20b-speculator.eagle3", - 
"RedHatAI/gpt-oss-120b-speculator.eagle3", - "RedHatAI/Qwen3-30B-A3B-Thinking-2507-speculator.eagle3", - "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3", - "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3", - "RedHatAI/Qwen3-30B-A3B-speculator.eagle3", - "RedHatAI/Qwen3-32B-Thinking-speculator.eagle3" - ], - "relevancy_score": 47.3 + "relevancy_score": 46.8 }, { "architecture_id": "T5WithLMHeadModel", "total_models": 7, - "total_downloads": 25117, + "total_downloads": 25552, "min_param_count": 222903936, "sample_models": [ "unicamp-dl/ptt5-base-portuguese-vocab", "Salesforce/codet5-large", "Salesforce/codet5-large-ntp-py", "Rostlab/prot_t5_xl_bfd", - "unicamp-dl/ptt5-small-portuguese-vocab", "gagan3012/k2t", + "unicamp-dl/ptt5-small-portuguese-vocab", "unicamp-dl/ptt5-large-portuguese-vocab" ], - "relevancy_score": 47.3 + "relevancy_score": 46.6 + }, + { + "architecture_id": "ProGenForCausalLM", + "total_models": 5, + "total_downloads": 46959, + "min_param_count": 151148576, + "sample_models": [ + "hugohrban/progen2-base", + "hugohrban/progen2-small", + "hugohrban/progen2-medium", + "hugohrban/progen2-large", + "hugohrban/progen2-small-mix7" + ], + "relevancy_score": 46.6 }, { "architecture_id": "Zamba2ForCausalLM", - "total_models": 7, - "total_downloads": 111102, + "total_models": 8, + "total_downloads": 111280, "min_param_count": 1215064704, "sample_models": [ "Zyphra/Zamba2-1.2B-instruct", @@ -803,14 +703,63 @@ "EchoLabs33/zamba2-1.2b-hxq", "Zyphra/Zamba2-2.7B-instruct", "EchoLabs33/zamba2-2.7b-instruct-hxq", - "EchoLabs33/zamba2-7b-instruct-hxq" + "EchoLabs33/zamba2-7b-instruct-hxq", + "Zyphra/Zamba2-2.7B-Instruct-v2" ], - "relevancy_score": 46.6 + "relevancy_score": 46.4 + }, + { + "architecture_id": "Qwen3_5MoeForConditionalGeneration", + "total_models": 16, + "total_downloads": 65079, + "min_param_count": 5555793776, + "sample_models": [ + "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled", + 
"caiovicentino1/Qwopus-MoE-35B-A3B-HLWQ-Q5", + "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx", + "Brooooooklyn/Qwen3.5-35B-A3B-UD-Q4_K_XL-mlx", + "nivvis/Qwen3.5-35B-A3B-EQ-v5", + "JANGQ-AI/Qwen3.5-397B-A17B-JANG_1L", + "Jackrong/MLX-Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled-bf16", + "nightmedia/Qwen3.5-122B-A10B-Text-qx85-mlx", + "JANGQ-AI/Qwen3.5-397B-A17B-JANG_2L", + "Kevletesteur/Qwen3.5-35B-A3B-Chimere-Distilled-BF16" + ], + "relevancy_score": 46.2 + }, + { + "architecture_id": "Ernie4_5_MoeForCausalLM", + "total_models": 5, + "total_downloads": 38765, + "min_param_count": 904040, + "sample_models": [ + "baidu/ERNIE-4.5-21B-A3B-PT", + "baidu/ERNIE-4.5-21B-A3B-Base-PT", + "baidu/ERNIE-4.5-21B-A3B-Thinking", + "yujiepan/ernie-4.5-moe-tiny-random", + "baidu/ERNIE-4.5-300B-A47B-PT" + ], + "relevancy_score": 46.2 + }, + { + "architecture_id": "Eagle3DraftModel", + "total_models": 6, + "total_downloads": 24433, + "min_param_count": 522152832, + "sample_models": [ + "RedHatAI/gpt-oss-20b-speculator.eagle3", + "RedHatAI/gpt-oss-120b-speculator.eagle3", + "RedHatAI/Qwen3-30B-A3B-Thinking-2507-speculator.eagle3", + "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3", + "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3", + "RedHatAI/Qwen3-30B-A3B-speculator.eagle3" + ], + "relevancy_score": 45.8 }, { "architecture_id": "AquilaForCausalLM", "total_models": 7, - "total_downloads": 17937, + "total_downloads": 17374, "min_param_count": 6425376, "sample_models": [ "BAAI/AquilaChat2-7B", @@ -821,28 +770,12 @@ "BAAI/AquilaChat2-34B-16K", "BAAI/Aquila2-70B-Expr" ], - "relevancy_score": 46.6 - }, - { - "architecture_id": "XverseForCausalLM", - "total_models": 7, - "total_downloads": 15816, - "min_param_count": 6459056, - "sample_models": [ - "xverse/XVERSE-7B-Chat", - "katuni4ka/tiny-random-xverse", - "xverse/XVERSE-13B-256K", - "xverse/XVERSE-13B", - "xverse/XVERSE-65B-Chat", - "xverse/XVERSE-13B-Chat", - "xverse/XVERSE-7B" - ], - "relevancy_score": 46.3 + 
"relevancy_score": 45.7 }, { "architecture_id": "ArceeForCausalLM", "total_models": 4, - "total_downloads": 36482, + "total_downloads": 37111, "min_param_count": 4129088, "sample_models": [ "arcee-ai/AFM-4.5B-Base", @@ -850,31 +783,28 @@ "onnx-internal-testing/tiny-random-ArceeForCausalLM", "arcee-ai/AFM-4.5B" ], - "relevancy_score": 46.1 + "relevancy_score": 45.5 }, { - "architecture_id": "Qwen3_5MoeForConditionalGeneration", - "total_models": 15, - "total_downloads": 45472, - "min_param_count": 6643527536, + "architecture_id": "XverseForCausalLM", + "total_models": 7, + "total_downloads": 15400, + "min_param_count": 6459056, "sample_models": [ - "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled", - "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx", - "Jackrong/MLX-Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled-bf16", - "caiovicentino1/Qwopus-MoE-35B-A3B-PolarQuant-Q5", - "nivvis/Qwen3.5-35B-A3B-EQ-v5", - "Brooooooklyn/Qwen3.5-35B-A3B-UD-Q4_K_XL-mlx", - "JANGQ-AI/Qwen3.5-397B-A17B-JANG_1L", - "nightmedia/Qwen3.5-122B-A10B-Text-qx85-mlx", - "JANGQ-AI/Qwen3.5-397B-A17B-JANG_2L", - "Kevletesteur/Qwen3.5-35B-A3B-Chimere-Distilled-BF16" + "xverse/XVERSE-7B-Chat", + "katuni4ka/tiny-random-xverse", + "xverse/XVERSE-13B-256K", + "xverse/XVERSE-13B", + "xverse/XVERSE-65B-Chat", + "xverse/XVERSE-13B-Chat", + "xverse/XVERSE-7B" ], - "relevancy_score": 46.0 + "relevancy_score": 45.4 }, { "architecture_id": "LlavaQwen2ForCausalLM", "total_models": 5, - "total_downloads": 25580, + "total_downloads": 23815, "min_param_count": 758833760, "sample_models": [ "qnguyen3/nanoLLaVA", @@ -883,12 +813,26 @@ "apple/FastVLM-7B", "FreedomIntelligence/HuatuoGPT-Vision-7B" ], - "relevancy_score": 46.0 + "relevancy_score": 45.2 + }, + { + "architecture_id": "Llama4ForCausalLM", + "total_models": 5, + "total_downloads": 22089, + "min_param_count": 3269144, + "sample_models": [ + "trl-internal-testing/tiny-Llama4ForCausalLM", + "pruna-test/test-save-tiny-random-llama4-smashed", + 
"facebook/MobileLLM-R1.5-360M", + "facebook/MobileLLM-R1-950M", + "facebook/MobileLLM-R1-140M" + ], + "relevancy_score": 45.0 }, { "architecture_id": "SDARForCausalLM", "total_models": 6, - "total_downloads": 96427, + "total_downloads": 93909, "min_param_count": 2031739904, "sample_models": [ "JetLM/SDAR-1.7B-Chat", @@ -898,43 +842,25 @@ "JetLM/SDAR-4B-Chat", "JetLM/SDAR-4B-Chat-b32" ], - "relevancy_score": 45.6 + "relevancy_score": 44.8 }, { - "architecture_id": "DeepseekV2ForCausalLM", - "total_models": 14, - "total_downloads": 1552347, - "min_param_count": 15706484224, + "architecture_id": "SeedOssForCausalLM", + "total_models": 4, + "total_downloads": 25731, + "min_param_count": 2497064, "sample_models": [ - "deepseek-ai/DeepSeek-V2-Lite-Chat", - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", - "deepseek-ai/DeepSeek-V2-Lite", - "deepseek-ai/DeepSeek-V2", - "deepseek-ai/DeepSeek-V2-Chat", - "deepseek-ai/DeepSeek-V2.5", - "deepseek-ai/DeepSeek-Coder-V2-Instruct", - "deepseek-ai/DeepSeek-V2-Chat-0628", - "deepseek-ai/DeepSeek-Coder-V2-Lite-Base", - "Kwaipilot/KwaiCoder-DS-V2-Lite-Base" - ], - "relevancy_score": 45.2 - }, - { - "architecture_id": "BitNetForCausalLM", - "total_models": 3, - "total_downloads": 25988, - "min_param_count": 849787090, - "sample_models": [ - "microsoft/bitnet-b1.58-2B-4T", - "microsoft/bitnet-b1.58-2B-4T-bf16", - "iSolver-AI/FEnet" + "ByteDance-Seed/Seed-OSS-36B-Instruct", + "NousResearch/Hermes-4.3-36B", + "ByteDance-Seed/Seed-OSS-36B-Base", + "yujiepan/seed-oss-tiny-random" ], "relevancy_score": 44.7 }, { "architecture_id": "PldrllmForCausalLM", "total_models": 5, - "total_downloads": 13743, + "total_downloads": 14045, "min_param_count": 109689362, "sample_models": [ "fromthesky/PLDR-LLM-v51-SOC-110M-5", @@ -943,31 +869,43 @@ "fromthesky/PLDR-LLM-v51-SOC-110M-3", "fromthesky/PLDR-LLM-v51-SOC-110M-1" ], - "relevancy_score": 44.6 + "relevancy_score": 44.0 }, { - "architecture_id": "DeciLMForCausalLM", - "total_models": 13, - 
"total_downloads": 256626, - "min_param_count": 7043551232, + "architecture_id": "DeepseekV2ForCausalLM", + "total_models": 14, + "total_downloads": 1547180, + "min_param_count": 15706484224, "sample_models": [ - "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", - "nvidia/Llama-3_3-Nemotron-Super-49B-v1", - "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1", - "ConicCat/Llama3_3-Nemo-Super-Writer-49B", - "nvidia/Llama-3_1-Nemotron-51B-Instruct", - "FriendliAI/Llama-3_3-Nemotron-Super-49B-v1_5", - "FriendliAI/Llama-3_1-Nemotron-Ultra-253B-v1", - "nvidia/Llama-3_1-Nemotron-Ultra-253B-CPT-v1", - "NewstaR/Porpoise-6b-instruct", - "Danielbrdz/Barcenas-6b" + "deepseek-ai/DeepSeek-V2-Lite-Chat", + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", + "deepseek-ai/DeepSeek-V2-Lite", + "deepseek-ai/DeepSeek-V2", + "deepseek-ai/DeepSeek-V2-Chat", + "deepseek-ai/DeepSeek-V2.5", + "deepseek-ai/DeepSeek-Coder-V2-Instruct", + "deepseek-ai/DeepSeek-V2-Chat-0628", + "deepseek-ai/DeepSeek-Coder-V2-Lite-Base", + "Kwaipilot/KwaiCoder-DS-V2-Lite-Base" + ], + "relevancy_score": 43.9 + }, + { + "architecture_id": "BitNetForCausalLM", + "total_models": 3, + "total_downloads": 23875, + "min_param_count": 849787090, + "sample_models": [ + "microsoft/bitnet-b1.58-2B-4T", + "microsoft/bitnet-b1.58-2B-4T-bf16", + "iSolver-AI/FEnet" ], - "relevancy_score": 44.5 + "relevancy_score": 43.9 }, { "architecture_id": "MoAMetricLM", "total_models": 5, - "total_downloads": 12168, + "total_downloads": 12515, "min_param_count": 69130608, "sample_models": [ "reaperdoesntknow/MoA-150M", @@ -976,24 +914,50 @@ "reaperdoesntknow/MoA-100M", "reaperdoesntknow/DiscoverLM-70M" ], - "relevancy_score": 44.4 + "relevancy_score": 43.7 }, { - "architecture_id": "Llama4ForCausalLM", - "total_models": 3, - "total_downloads": 20274, - "min_param_count": 3269144, + "architecture_id": "LlavaLlamaForCausalLM", + "total_models": 18, + "total_downloads": 64859, + "min_param_count": 7466764288, "sample_models": [ - 
"trl-internal-testing/tiny-Llama4ForCausalLM", - "pruna-test/test-save-tiny-random-llama4-smashed", - "facebook/MobileLLM-R1.5-360M" + "wisdomik/Quilt-Llava-v1.5-7b", + "LanguageBind/Video-LLaVA-7B", + "liuhaotian/llava-llama-2-13b-chat-lightning-preview", + "mmaaz60/LLaVA-7B-Lightening-v1-1", + "lmms-lab/llama3-llava-next-8b", + "microsoft/llava-med-7b-delta", + "deepcs233/VisCoT-7b-336", + "ManishThota/Ollama_Video_llama_7B", + "liuhaotian/LLaVA-Lightning-7B-delta-v1-1", + "EricPolaris/Quilt-Llava-v1.5-7b" + ], + "relevancy_score": 43.5 + }, + { + "architecture_id": "DeciLMForCausalLM", + "total_models": 13, + "total_downloads": 263391, + "min_param_count": 7043551232, + "sample_models": [ + "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", + "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1", + "ConicCat/Llama3_3-Nemo-Super-Writer-49B", + "nvidia/Llama-3_1-Nemotron-51B-Instruct", + "FriendliAI/Llama-3_3-Nemotron-Super-49B-v1_5", + "FriendliAI/Llama-3_1-Nemotron-Ultra-253B-v1", + "nvidia/Llama-3_1-Nemotron-Ultra-253B-CPT-v1", + "NewstaR/Porpoise-6b-instruct", + "Danielbrdz/Barcenas-6b" ], - "relevancy_score": 44.2 + "relevancy_score": 43.4 }, { "architecture_id": "MBartForConditionalGeneration", "total_models": 6, - "total_downloads": 7379, + "total_downloads": 7575, "min_param_count": 379691717, "sample_models": [ "Pravopysnyk/best-unlp", @@ -1003,234 +967,221 @@ "MRNH/mbart-german-grammar-corrector", "MRNH/mbart-russian-grammar-corrector" ], - "relevancy_score": 43.9 + "relevancy_score": 43.3 }, { "architecture_id": "DogeForCausalLM", "total_models": 6, - "total_downloads": 7207, + "total_downloads": 7541, "min_param_count": 13118728, "sample_models": [ "SmallDoge/Doge-320M", - "SmallDoge/Doge-20M", "SmallDoge/Doge-160M", + "SmallDoge/Doge-20M", "SmallDoge/Doge-60M", "SmallDoge/Doge-120M-MoE", "SmallDoge/Doge-20M-MoE" ], - "relevancy_score": 43.9 + "relevancy_score": 43.3 }, { "architecture_id": "NemotronFlashForCausalLM", 
"total_models": 2, - "total_downloads": 21466, + "total_downloads": 23953, "min_param_count": 965389440, "sample_models": [ - "nvidia/Nemotron-Flash-3B", - "nvidia/Nemotron-Flash-1B" + "nvidia/Nemotron-Flash-1B", + "nvidia/Nemotron-Flash-3B" ], - "relevancy_score": 43.6 - }, - { - "architecture_id": "LlavaLlamaForCausalLM", - "total_models": 18, - "total_downloads": 33654, - "min_param_count": 7466764288, - "sample_models": [ - "LanguageBind/Video-LLaVA-7B", - "wisdomik/Quilt-Llava-v1.5-7b", - "liuhaotian/llava-llama-2-13b-chat-lightning-preview", - "lmms-lab/llama3-llava-next-8b", - "mmaaz60/LLaVA-7B-Lightening-v1-1", - "microsoft/llava-med-7b-delta", - "deepcs233/VisCoT-7b-336", - "ManishThota/Ollama_Video_llama_7B", - "liuhaotian/LLaVA-Lightning-7B-delta-v1-1", - "EricPolaris/Quilt-Llava-v1.5-7b" - ], - "relevancy_score": 43.4 + "relevancy_score": 43.3 }, { - "architecture_id": "Exaone4ForCausalLM", + "architecture_id": "EchoForCausalLM", "total_models": 3, - "total_downloads": 86753, - "min_param_count": 1279391488, + "total_downloads": 15499, + "min_param_count": 114687488, "sample_models": [ - "LGAI-EXAONE/EXAONE-4.0-1.2B", - "LGAI-EXAONE/EXAONE-4.0.1-32B", - "LGAI-EXAONE/EXAONE-4.0-32B" + "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT", + "ethicalabs/Echo-DSRN-114M", + "ethicalabs/Echo-DSRN-114M-Base" ], - "relevancy_score": 43.4 + "relevancy_score": 43.0 }, { "architecture_id": "MiniMaxForCausalLM", "total_models": 2, - "total_downloads": 16499, + "total_downloads": 19980, "min_param_count": 231006264, "sample_models": [ "MiniMaxAI/MiniMax-Text-01-hf", "hyper-accel/tiny-random-minimax" ], - "relevancy_score": 43.0 + "relevancy_score": 42.9 }, { - "architecture_id": "EchoForCausalLM", - "total_models": 2, - "total_downloads": 14469, - "min_param_count": 114687488, - "sample_models": [ - "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT", - "ethicalabs/Echo-DSRN-114M-Base" - ], - "relevancy_score": 42.7 - }, - { - "architecture_id": "AraGPT2LMHeadModel", + "architecture_id": 
"Exaone4ForCausalLM", "total_models": 3, - "total_downloads": 9685, - "min_param_count": 829369856, + "total_downloads": 69214, + "min_param_count": 1279391488, "sample_models": [ - "QCRI/Fanar-2-Diwan", - "aubmindlab/aragpt2-mega", - "aubmindlab/aragpt2-large" + "LGAI-EXAONE/EXAONE-4.0-1.2B", + "LGAI-EXAONE/EXAONE-4.0-32B", + "LGAI-EXAONE/EXAONE-4.0.1-32B" ], - "relevancy_score": 42.5 + "relevancy_score": 42.2 }, { - "architecture_id": "IlamaForCausalLM", - "total_models": 1, - "total_downloads": 105084, - "min_param_count": 1235814400, + "architecture_id": "DbrxForCausalLM", + "total_models": 2, + "total_downloads": 14052, + "min_param_count": 1612456, "sample_models": [ - "hmellor/Ilama-3.2-1B" + "trl-internal-testing/tiny-DbrxForCausalLM", + "katuni4ka/tiny-random-dbrx" ], - "relevancy_score": 42.5 + "relevancy_score": 42.1 }, { "architecture_id": "ModernBertForSequenceClassification", "total_models": 1, - "total_downloads": 17538, + "total_downloads": 18432, "min_param_count": 149609478, "sample_models": [ "opendatalab/meta-rater-professionalism-rating" ], - "relevancy_score": 42.5 + "relevancy_score": 42.1 }, { - "architecture_id": "LLaMAForCausalLM", - "total_models": 12, - "total_downloads": 21954, - "min_param_count": 6738425856, + "architecture_id": "Mistral3ForConditionalGeneration", + "total_models": 6, + "total_downloads": 163824, + "min_param_count": 4251743232, "sample_models": [ - "maicomputer/alpaca-13b", - "Enoch/llama-65b-hf", - "mncai/chatdoctor", - "AdaptLLM/law-LLM", - "Nitish-Garikoti/finance-LLM", - "boboto/LLaMA-65B-HF", - "AdaptLLM/finance-LLM", - "AdaptLLM/medicine-LLM", - "Rardilit/Panther_v1", - "James-WYang/BigTranslate" + "farbodtavakkoli/OTel-LLM-3B-IT", + "ArmGPT/ArmenianGPT-1.0-3B", + "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_2L", + "odytrice/kenichi-flash", + "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_6M", + "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_4M" ], - "relevancy_score": 42.4 + "relevancy_score": 42.0 }, { - "architecture_id": 
"RITAModelForCausalLM", + "architecture_id": "AraGPT2LMHeadModel", "total_models": 3, - "total_downloads": 9211, - "min_param_count": 85096320, + "total_downloads": 9765, + "min_param_count": 829369856, "sample_models": [ - "lightonai/RITA_s", - "lightonai/RITA_xl", - "lightonai/RITA_l" + "QCRI/Fanar-2-Diwan", + "aubmindlab/aragpt2-mega", + "aubmindlab/aragpt2-large" ], - "relevancy_score": 42.4 + "relevancy_score": 42.0 }, { "architecture_id": "StarVectorForCausalLM", "total_models": 2, - "total_downloads": 74194, + "total_downloads": 82922, "min_param_count": 1434095620, "sample_models": [ "starvector/starvector-1b-im2svg", "starvector/starvector-8b-im2svg" ], - "relevancy_score": 42.4 + "relevancy_score": 42.0 }, { - "architecture_id": "DbrxForCausalLM", - "total_models": 2, - "total_downloads": 12374, - "min_param_count": 1612456, - "sample_models": [ - "trl-internal-testing/tiny-DbrxForCausalLM", - "katuni4ka/tiny-random-dbrx" - ], - "relevancy_score": 42.4 - }, - { - "architecture_id": "Mistral3ForConditionalGeneration", - "total_models": 5, - "total_downloads": 163974, - "min_param_count": 4251743232, - "sample_models": [ - "farbodtavakkoli/OTel-LLM-3B-IT", - "ArmGPT/ArmenianGPT-1.0-3B", - "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_2L", - "odytrice/kenichi-flash", - "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_6M" - ], - "relevancy_score": 42.2 - }, - { - "architecture_id": "Plamo2ForCausalLM", - "total_models": 1, - "total_downloads": 81427, - "min_param_count": 1291441920, + "architecture_id": "RITAModelForCausalLM", + "total_models": 3, + "total_downloads": 9391, + "min_param_count": 85096320, "sample_models": [ - "pfnet/plamo-2-1b" + "lightonai/RITA_s", + "lightonai/RITA_xl", + "lightonai/RITA_l" ], "relevancy_score": 41.9 }, { "architecture_id": "OLMoForCausalLM", - "total_models": 6, - "total_downloads": 16483, + "total_models": 7, + "total_downloads": 17805, "min_param_count": 1176764416, "sample_models": [ "allenai/OLMo-7B-Instruct", "allenai/OLMo-7B", 
"allenai/OLMo-1B", "allenai/OLMo-7B-0424", + "Nhoodie/omni-dna-ici-dc", "allenai/OLMo-7B-Twin-2T", "allenai/OLMo-7B-SFT" ], + "relevancy_score": 41.8 + }, + { + "architecture_id": "IlamaForCausalLM", + "total_models": 1, + "total_downloads": 102422, + "min_param_count": 1235814400, + "sample_models": [ + "hmellor/Ilama-3.2-1B" + ], + "relevancy_score": 41.8 + }, + { + "architecture_id": "A2DQwen3LMHeadModel", + "total_models": 2, + "total_downloads": 11738, + "min_param_count": 751632384, + "sample_models": [ + "dllm-hub/Qwen3-0.6B-diffusion-mdlm-v0.1", + "dllm-hub/Qwen3-0.6B-diffusion-bd3lm-v0.1" + ], "relevancy_score": 41.7 }, { - "architecture_id": "MiniMaxM2ForCausalLM", - "total_models": 10, - "total_downloads": 970048, - "min_param_count": 18581099008, + "architecture_id": "NandiForCausalLM", + "total_models": 2, + "total_downloads": 10215, + "min_param_count": 153412928, "sample_models": [ - "MiniMaxAI/MiniMax-M2.5", - "cerebras/MiniMax-M2.1-REAP-139B-A10B", - "MiniMaxAI/MiniMax-M2", - "MiniMaxAI/MiniMax-M2.1", - "cerebras/MiniMax-M2.5-REAP-139B-A10B", - "aspctu/MiniMax-M2.5", - "dealignai/MiniMax-M2.5-UNCENSORED-JANG_2L", - "unsloth/MiniMax-M2.5", - "dealignai/MiniMax-M2.5-JANG_3L-CRACK", - "JANGQ-AI/MiniMax-M2.5-JANG_3L" + "Rta-AILabs/Nandi-Mini-150M", + "Rta-AILabs/Nandi-Mini-150M-Instruct" + ], + "relevancy_score": 41.4 + }, + { + "architecture_id": "LLaMAForCausalLM", + "total_models": 12, + "total_downloads": 21884, + "min_param_count": 6738425856, + "sample_models": [ + "maicomputer/alpaca-13b", + "Enoch/llama-65b-hf", + "mncai/chatdoctor", + "AdaptLLM/law-LLM", + "Nitish-Garikoti/finance-LLM", + "boboto/LLaMA-65B-HF", + "AdaptLLM/finance-LLM", + "AdaptLLM/medicine-LLM", + "Rardilit/Panther_v1", + "James-WYang/BigTranslate" + ], + "relevancy_score": 41.3 + }, + { + "architecture_id": "Plamo2ForCausalLM", + "total_models": 1, + "total_downloads": 81448, + "min_param_count": 1291441920, + "sample_models": [ + "pfnet/plamo-2-1b" ], - "relevancy_score": 
41.5 + "relevancy_score": 41.3 }, { "architecture_id": "Starcoder2ForCausalLM", "total_models": 5, - "total_downloads": 117124, + "total_downloads": 116878, "min_param_count": 3030371328, "sample_models": [ "bigcode/starcoder2-3b", @@ -1239,48 +1190,36 @@ "bigcode/starcoder2-15b-instruct-v0.1", "dphn/dolphincoder-starcoder2-15b" ], - "relevancy_score": 41.4 - }, - { - "architecture_id": "GlmForCausalLM", - "total_models": 4, - "total_downloads": 23486, - "min_param_count": 1593427968, - "sample_models": [ - "zai-org/glm-4-9b-chat-hf", - "zai-org/glm-4-9b-hf", - "zai-org/glm-edge-4b-chat", - "zai-org/glm-edge-1.5b-chat" - ], - "relevancy_score": 41.2 + "relevancy_score": 40.6 }, { "architecture_id": "MolformerForCausalLM", "total_models": 2, - "total_downloads": 7302, + "total_downloads": 6850, "min_param_count": 46805760, "sample_models": [ "ibm-research/GP-MoLFormer-Uniq", "ralyn/NPComposer-v2" ], - "relevancy_score": 41.2 + "relevancy_score": 40.6 }, { - "architecture_id": "MptForCausalLM", - "total_models": 3, - "total_downloads": 4577, - "min_param_count": 405032, + "architecture_id": "GlmForCausalLM", + "total_models": 4, + "total_downloads": 23066, + "min_param_count": 1593427968, "sample_models": [ - "yujiepan/mpt-tiny-random", - "explosion-testing/mpt-test", - "team-lucid/mptk-1b" + "zai-org/glm-4-9b-chat-hf", + "zai-org/glm-4-9b-hf", + "zai-org/glm-edge-4b-chat", + "zai-org/glm-edge-1.5b-chat" ], - "relevancy_score": 40.8 + "relevancy_score": 40.5 }, { "architecture_id": "Glm4MoeLiteForCausalLM", - "total_models": 8, - "total_downloads": 1257096, + "total_models": 9, + "total_downloads": 1217856, "min_param_count": 22996118432, "sample_models": [ "zai-org/GLM-4.7-Flash", @@ -1290,79 +1229,85 @@ "Olafangensan/GLM-4.7-Flash-heretic", "Ex0bit/GLM-4.7-Flash-PRISM", "jerrycheng233/model5_sft_16bit", - "aaravriyer193/chimpgpt-coder-elite" + "aaravriyer193/chimpgpt-coder-elite", + "austindixson/glm-4.7-flash-Opus-Reasoning" ], - "relevancy_score": 40.7 + 
"relevancy_score": 40.3 }, { - "architecture_id": "LLaDAModelLM", - "total_models": 4, - "total_downloads": 682726, - "min_param_count": 8015581184, + "architecture_id": "MptForCausalLM", + "total_models": 3, + "total_downloads": 4595, + "min_param_count": 405032, "sample_models": [ - "GSAI-ML/LLaDA-8B-Instruct", - "GSAI-ML/LLaDA-8B-Base", - "GSAI-ML/LLaDA-1.5", - "d3LLM/d3LLM_LLaDA" + "yujiepan/mpt-tiny-random", + "explosion-testing/mpt-test", + "team-lucid/mptk-1b" ], - "relevancy_score": 40.7 + "relevancy_score": 40.3 }, { - "architecture_id": "NandiForCausalLM", - "total_models": 1, - "total_downloads": 7981, - "min_param_count": 153412928, + "architecture_id": "Llama4ForConditionalGeneration", + "total_models": 2, + "total_downloads": 5844, + "min_param_count": 6686880, "sample_models": [ - "Rta-AILabs/Nandi-Mini-150M" + "yujiepan/llama-4-tiny-random", + "Mogith/Llama-4-Scout-17B-16E-Instruct-Q8_0" ], - "relevancy_score": 40.7 + "relevancy_score": 40.2 }, { "architecture_id": "DuchifatCore", "total_models": 3, - "total_downloads": 4079, + "total_downloads": 4086, "min_param_count": 136763904, "sample_models": [ "Raziel1234/Duchifat-2", "razielAI/Duchifat-2.1-Instruct", "TopAI-1/Duchifat-2-Instruct" ], - "relevancy_score": 40.6 + "relevancy_score": 40.1 + }, + { + "architecture_id": "LLaDAModelLM", + "total_models": 4, + "total_downloads": 659922, + "min_param_count": 8015581184, + "sample_models": [ + "GSAI-ML/LLaDA-8B-Instruct", + "GSAI-ML/LLaDA-8B-Base", + "GSAI-ML/LLaDA-1.5", + "d3LLM/d3LLM_LLaDA" + ], + "relevancy_score": 39.8 }, { "architecture_id": "GLAForCausalLM", "total_models": 2, - "total_downloads": 5043, + "total_downloads": 4823, "min_param_count": 341707776, "sample_models": [ - "fla-hub/gla-340M-15B", - "fla-hub/gla-1.3B-100B" + "fla-hub/gla-1.3B-100B", + "fla-hub/gla-340M-15B" ], - "relevancy_score": 40.4 + "relevancy_score": 39.8 }, { - "architecture_id": "RWForCausalLM", - "total_models": 11, - "total_downloads": 11851, - "min_param_count": 
6854619456, + "architecture_id": "BertLMHeadModel", + "total_models": 2, + "total_downloads": 4589, + "min_param_count": 184474880, "sample_models": [ - "projecte-aina/aguila-7b", - "lightonai/alfred-40b-1023", - "explosion-testing/refined-web-model-test", - "vilm/vulture-40b", - "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2", - "nomic-ai/gpt4all-falcon", - "OpenAssistant/falcon-40b-sft-top1-560", - "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3", - "QuixiAI/WizardLM-Uncensored-Falcon-40b", - "mrm8488/falcoder-7b" + "dicta-il/BEREL_3.0", + "hf-tiny-model-private/tiny-random-BertLMHeadModel" ], - "relevancy_score": 40.3 + "relevancy_score": 39.7 }, { "architecture_id": "Lfm2MoeForCausalLM", "total_models": 8, - "total_downloads": 175258, + "total_downloads": 187579, "min_param_count": 8339929856, "sample_models": [ "farbodtavakkoli/OTel-LLM-24B-IT", @@ -1371,818 +1316,905 @@ "LiquidAI/LFM2-8B-A1B-ONNX", "LiquidAI/LFM2-24B-A2B-ONNX", "unsloth/LFM2-8B-A1B", - "huihui-ai/Huihui-LFM2-24B-A2B-abliterated", - "MuXodious/LFM2-8B-A1B-absolute-heresy-MPOA" + "MuXodious/LFM2-8B-A1B-absolute-heresy-MPOA", + "huihui-ai/Huihui-LFM2-24B-A2B-abliterated" ], - "relevancy_score": 40.3 + "relevancy_score": 39.5 }, { - "architecture_id": "BertLMHeadModel", - "total_models": 2, - "total_downloads": 4599, - "min_param_count": 184474880, + "architecture_id": "RWForCausalLM", + "total_models": 11, + "total_downloads": 11570, + "min_param_count": 6854619456, "sample_models": [ - "dicta-il/BEREL_3.0", - "hf-tiny-model-private/tiny-random-BertLMHeadModel" + "projecte-aina/aguila-7b", + "lightonai/alfred-40b-1023", + "explosion-testing/refined-web-model-test", + "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2", + "vilm/vulture-40b", + "nomic-ai/gpt4all-falcon", + "OpenAssistant/falcon-40b-sft-top1-560", + "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3", + "QuixiAI/WizardLM-Uncensored-Falcon-40b", + "mrm8488/falcoder-7b" ], - "relevancy_score": 40.2 + "relevancy_score": 39.3 }, { - "architecture_id": 
"Llama4ForConditionalGeneration", - "total_models": 1, - "total_downloads": 6224, - "min_param_count": 6686880, + "architecture_id": "GPJTGPT2ModelForCausalLM", + "total_models": 4, + "total_downloads": 2088, + "min_param_count": 175592448, "sample_models": [ - "yujiepan/llama-4-tiny-random" + "gpjt/8xa100m40-baseline-3", + "gpjt/8xa100m40-baseline-2", + "gpjt/8xa100m40-baseline-8", + "gpjt/8xa100m40-baseline-7" ], - "relevancy_score": 40.2 + "relevancy_score": 39.2 }, { "architecture_id": "AfmoeForCausalLM", "total_models": 6, - "total_downloads": 45755, + "total_downloads": 44009, "min_param_count": 6120003328, "sample_models": [ - "arcee-ai/Trinity-Nano-Preview", "arcee-ai/Trinity-Large-Thinking", + "arcee-ai/Trinity-Nano-Preview", "arcee-ai/Trinity-Mini", "arcee-ai/Trinity-Nano-Base", "arcee-ai/Trinity-Mini-Base", "arcee-ai/Trinity-Large-Preview" ], - "relevancy_score": 40.0 + "relevancy_score": 39.1 + }, + { + "architecture_id": "GPTJXMoEForCausalLM", + "total_models": 1, + "total_downloads": 4510, + "min_param_count": 489915648, + "sample_models": [ + "Aletheia-ng/SabiYarn_MoE_translate" + ], + "relevancy_score": 39.0 + }, + { + "architecture_id": "GatedDeltaNetForCausalLM", + "total_models": 1, + "total_downloads": 4069, + "min_param_count": 317524480, + "sample_models": [ + "deqing/gdn-300M-v5-gdn" + ], + "relevancy_score": 38.8 }, { "architecture_id": "BitnetForCausalLM", "total_models": 2, - "total_downloads": 3594, + "total_downloads": 2937, "min_param_count": 728843904, "sample_models": [ "1bitLLM/bitnet_b1_58-large", "1bitLLM/bitnet_b1_58-3B" ], - "relevancy_score": 39.6 + "relevancy_score": 38.7 }, { "architecture_id": "RecurrentGemmaForCausalLM", "total_models": 3, - "total_downloads": 13166, + "total_downloads": 13093, "min_param_count": 2682862080, "sample_models": [ "google/recurrentgemma-2b", "google/recurrentgemma-2b-it", "google/recurrentgemma-9b" ], - "relevancy_score": 39.2 - }, - { - "architecture_id": "GatedDeltaNetForCausalLM", - 
"total_models": 1, - "total_downloads": 4063, - "min_param_count": 317524480, - "sample_models": [ - "deqing/gdn-300M-v5-gdn" - ], - "relevancy_score": 39.2 + "relevancy_score": 38.6 }, { "architecture_id": "RecursiveLanguageModel", "total_models": 1, - "total_downloads": 3560, + "total_downloads": 3401, "min_param_count": 198464806, "sample_models": [ "Girinath11/recursive-language-model-198m" ], - "relevancy_score": 38.9 + "relevancy_score": 38.4 }, { "architecture_id": "T5EncoderModel", "total_models": 1, - "total_downloads": 122326, + "total_downloads": 119882, "min_param_count": 4762310656, "sample_models": [ "XLabs-AI/xflux_text_encoders" ], - "relevancy_score": 38.8 - }, - { - "architecture_id": "AprielForCausalLM", - "total_models": 1, - "total_downloads": 113509, - "min_param_count": 4832071680, - "sample_models": [ - "ServiceNow-AI/Apriel-5B-Instruct" - ], - "relevancy_score": 38.7 - }, - { - "architecture_id": "GPTJXMoEForCausalLM", - "total_models": 1, - "total_downloads": 3210, - "min_param_count": 489915648, - "sample_models": [ - "Aletheia-ng/SabiYarn_MoE_translate" - ], - "relevancy_score": 38.7 + "relevancy_score": 38.2 }, { "architecture_id": "LLM", "total_models": 1, - "total_downloads": 3059, + "total_downloads": 3132, "min_param_count": 497145984, "sample_models": [ "rudyon/linnet-497M" ], - "relevancy_score": 38.6 - }, - { - "architecture_id": "DreamModel", - "total_models": 6, - "total_downloads": 140463, - "min_param_count": 7615616512, - "sample_models": [ - "Dream-org/Dream-v0-Instruct-7B", - "Dream-org/Dream-v0-Base-7B", - "Dream-org/Dream-Coder-v0-Instruct-7B", - "Zigeng/dParallel_Dream_7B_Instruct", - "Dream-org/Dream-Coder-v0-Base-7B", - "Dream-org/DreamOn-v0-7B" - ], - "relevancy_score": 38.5 + "relevancy_score": 38.2 }, { "architecture_id": "SwarmForCausalLM", "total_models": 1, - "total_downloads": 2979, + "total_downloads": 3059, "min_param_count": 52729731, "sample_models": [ "reaperdoesntknow/SAGI" ], - "relevancy_score": 38.5 + 
"relevancy_score": 38.2 }, { - "architecture_id": "GPJTGPT2ModelForCausalLM", - "total_models": 3, - "total_downloads": 1570, - "min_param_count": 175592448, + "architecture_id": "AprielForCausalLM", + "total_models": 1, + "total_downloads": 113197, + "min_param_count": 4832071680, "sample_models": [ - "gpjt/8xa100m40-baseline-3", - "gpjt/8xa100m40-baseline-2", - "gpjt/8xa100m40-baseline-7" + "ServiceNow-AI/Apriel-5B-Instruct" ], - "relevancy_score": 38.4 + "relevancy_score": 38.1 }, { "architecture_id": "SpatialLMQwenForCausalLM", "total_models": 1, - "total_downloads": 2768, + "total_downloads": 2930, "min_param_count": 603511168, "sample_models": [ "manycore-research/SpatialLM1.1-Qwen-0.5B" ], - "relevancy_score": 38.4 + "relevancy_score": 38.1 }, { "architecture_id": "MiniMindForCausalLM", "total_models": 2, - "total_downloads": 2005, + "total_downloads": 2151, "min_param_count": 38840960, "sample_models": [ "yiwenX/MiniMind-MoE-640-120M", "chujiamo/baiheng_0405" ], - "relevancy_score": 38.3 + "relevancy_score": 38.0 + }, + { + "architecture_id": "DreamModel", + "total_models": 6, + "total_downloads": 153501, + "min_param_count": 7615616512, + "sample_models": [ + "Dream-org/Dream-v0-Instruct-7B", + "Dream-org/Dream-v0-Base-7B", + "Dream-org/Dream-Coder-v0-Instruct-7B", + "Zigeng/dParallel_Dream_7B_Instruct", + "Dream-org/Dream-Coder-v0-Base-7B", + "Dream-org/DreamOn-v0-7B" + ], + "relevancy_score": 37.9 }, { "architecture_id": "AV2TextForConditionalGeneration", "total_models": 1, - "total_downloads": 2566, + "total_downloads": 2689, "min_param_count": 480465000, "sample_models": [ "nguyenvulebinh/AV-HuBERT-MuAViC-en" ], - "relevancy_score": 38.2 + "relevancy_score": 37.9 }, { "architecture_id": "BD3LM", "total_models": 2, - "total_downloads": 1793, + "total_downloads": 1953, "min_param_count": 169627250, "sample_models": [ "kuleshov-group/bd3lm-owt-block_size4", "kuleshov-group/bd3lm-owt-block_size16" ], - "relevancy_score": 38.1 - }, - { - "architecture_id": 
"HCXVisionV2ForCausalLM", - "total_models": 2, - "total_downloads": 356837, - "min_param_count": 10741664520, - "sample_models": [ - "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", - "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B" - ], - "relevancy_score": 37.9 + "relevancy_score": 37.8 }, { - "architecture_id": "PenguinVLQwen3ForCausalLM", + "architecture_id": "ForCausalLM", "total_models": 2, - "total_downloads": 10102, - "min_param_count": 2167941120, + "total_downloads": 1725, + "min_param_count": 748801603, "sample_models": [ - "tencent/Penguin-VL-8B", - "tencent/Penguin-VL-2B" + "kyr0/Gemma-4-Waldwicht-Sproessling", + "kyr0/Gemma-4-Waldwicht-Winzling" ], - "relevancy_score": 37.9 + "relevancy_score": 37.5 }, { "architecture_id": "BlenderbotForConditionalGeneration", "total_models": 1, - "total_downloads": 2226, + "total_downloads": 2289, "min_param_count": 364810568, "sample_models": [ "thu-coai/blenderbot-400M-esconv" ], - "relevancy_score": 37.9 + "relevancy_score": 37.5 }, { "architecture_id": "Autoencoder", "total_models": 1, - "total_downloads": 2223, + "total_downloads": 2278, "min_param_count": 75832064, "sample_models": [ "cccczshao/CALM-Autoencoder" ], - "relevancy_score": 37.9 + "relevancy_score": 37.5 }, { "architecture_id": "EveMoEForCausalLM", "total_models": 1, - "total_downloads": 2123, + "total_downloads": 2174, "min_param_count": 271970816, "sample_models": [ "anthonym21/Eve-2-MoE-IT-272M" ], - "relevancy_score": 37.8 + "relevancy_score": 37.4 }, { "architecture_id": "FusionInDecoderForConditionalGeneration", "total_models": 1, - "total_downloads": 2105, + "total_downloads": 2146, "min_param_count": 247577856, "sample_models": [ "Intel/fid_flan_t5_base_nq" ], - "relevancy_score": 37.8 + "relevancy_score": 37.4 }, { "architecture_id": "Plamo3ForCausalLM", "total_models": 1, - "total_downloads": 12290, + "total_downloads": 12925, "min_param_count": 2603344384, "sample_models": [ "pfnet/plamo-3-nict-2b-base" ], - "relevancy_score": 37.7 + 
"relevancy_score": 37.3 + }, + { + "architecture_id": "TransformerForCausalLM", + "total_models": 1, + "total_downloads": 12840, + "min_param_count": 1364297728, + "sample_models": [ + "fla-hub/transformer-1.3B-100B" + ], + "relevancy_score": 37.3 }, { "architecture_id": "LIMEForCausalLM", "total_models": 1, - "total_downloads": 2043, + "total_downloads": 2102, "min_param_count": 984405504, "sample_models": [ "anarlavrenov/lime-1b-instruct" ], - "relevancy_score": 37.7 + "relevancy_score": 37.3 }, { "architecture_id": "ModernBertForMaskedLM", "total_models": 1, - "total_downloads": 2007, + "total_downloads": 2068, "min_param_count": 590367063, "sample_models": [ "JorgeVanco/diffusionGPT" ], - "relevancy_score": 37.6 + "relevancy_score": 37.3 + }, + { + "architecture_id": "HCXVisionV2ForCausalLM", + "total_models": 2, + "total_downloads": 354662, + "min_param_count": 10741664520, + "sample_models": [ + "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", + "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B" + ], + "relevancy_score": 37.2 }, { "architecture_id": "MoEGPTForCausalLM", "total_models": 1, - "total_downloads": 1960, + "total_downloads": 1943, "min_param_count": 149603328, "sample_models": [ "arnomatic/german-moe-gpt-v8-pretrained" ], - "relevancy_score": 37.6 + "relevancy_score": 37.2 + }, + { + "architecture_id": "ParamBharatGenForCausalLM", + "total_models": 4, + "total_downloads": 5035, + "min_param_count": 2860664832, + "sample_models": [ + "bharatgenai/Param-1-5B", + "bharatgenai/AyurParam", + "bharatgenai/Param-1-2.9B-Instruct", + "bharatgenai/AgriParam" + ], + "relevancy_score": 37.1 }, { - "architecture_id": "TransformerForCausalLM", + "architecture_id": "LanceAI", "total_models": 1, - "total_downloads": 11223, - "min_param_count": 1364297728, + "total_downloads": 1887, + "min_param_count": 137022720, + "sample_models": [ + "NeuraCraft/Lance-AI-V2" + ], + "relevancy_score": 37.1 + }, + { + "architecture_id": "RWKV7ForCausalLM", + "total_models": 2, + 
"total_downloads": 1332, + "min_param_count": 381332480, "sample_models": [ - "fla-hub/transformer-1.3B-100B" + "puigde/rwkv7-380M-15B-slimpajama", + "fla-hub/rwkv7-1.5B-world" ], - "relevancy_score": 37.5 + "relevancy_score": 37.0 }, { "architecture_id": "LSTMForCausalLM", "total_models": 1, - "total_downloads": 1768, + "total_downloads": 1772, "min_param_count": 164921344, "sample_models": [ "deqing/lstm-window-4-v5" ], - "relevancy_score": 37.4 + "relevancy_score": 37.0 }, { "architecture_id": "NanoChatForCausalLM", "total_models": 3, - "total_downloads": 5777, + "total_downloads": 6167, "min_param_count": 2217082880, "sample_models": [ "Twobombs/nanochat-d34-sft-hf", "pankajmathur/nanochat-d34-sft-hf", "Nekochu/nanochat-d24" ], - "relevancy_score": 37.3 + "relevancy_score": 36.9 }, { - "architecture_id": "Qwen3ASRForConditionalGeneration", + "architecture_id": "PenguinVLQwen3ForCausalLM", "total_models": 2, - "total_downloads": 1203, - "min_param_count": 782426112, + "total_downloads": 7957, + "min_param_count": 2167941120, "sample_models": [ - "bezzam/Qwen3-ASR-0.6B", - "hypaai/Qwen3-ASR-0.6B_2026-03-22_04-35-10" + "tencent/Penguin-VL-8B", + "tencent/Penguin-VL-2B" ], - "relevancy_score": 37.2 + "relevancy_score": 36.9 }, { "architecture_id": "Moondream", "total_models": 1, - "total_downloads": 9755, + "total_downloads": 10509, "min_param_count": 1857482608, "sample_models": [ "vikhyatk/moondream1" ], - "relevancy_score": 37.2 + "relevancy_score": 36.9 }, { - "architecture_id": "CircuitGPTForCausalLM", - "total_models": 1, - "total_downloads": 1414, - "min_param_count": 419124736, + "architecture_id": "GPT3DevLMHeadModel", + "total_models": 2, + "total_downloads": 1135, + "min_param_count": 125226240, "sample_models": [ - "openai/circuit-sparsity" + "k050506koch/GPT3-dev-350m-2805", + "k050506koch/GPT3-dev-125m-0104" ], - "relevancy_score": 36.9 + "relevancy_score": 36.6 }, { - "architecture_id": "ParamBharatGenForCausalLM", - "total_models": 3, - 
"total_downloads": 4535, - "min_param_count": 2860673024, + "architecture_id": "D3LMForMaskedLM", + "total_models": 1, + "total_downloads": 1260, + "min_param_count": 55905164, "sample_models": [ - "bharatgenai/Param-1-5B", - "bharatgenai/AyurParam", - "bharatgenai/Param-1-2.9B-Instruct" + "Hengchang-Liu/D3LM-from-nt" ], - "relevancy_score": 36.8 + "relevancy_score": 36.2 }, { - "architecture_id": "SliderGPT", + "architecture_id": "CircuitGPTForCausalLM", "total_models": 1, - "total_downloads": 1377, - "min_param_count": 47420936, + "total_downloads": 1247, + "min_param_count": 419124736, "sample_models": [ - "c-bone/CrystaLLM-pi_Mattergen-XRD" + "openai/circuit-sparsity" ], - "relevancy_score": 36.8 + "relevancy_score": 36.2 }, { - "architecture_id": "YoutuForCausalLM", - "total_models": 2, - "total_downloads": 4389, - "min_param_count": 1961560064, + "architecture_id": "PanguEmbeddedForCausalLM", + "total_models": 1, + "total_downloads": 7396, + "min_param_count": 1391497728, "sample_models": [ - "tencent/Youtu-LLM-2B-Base", - "tencent/Youtu-LLM-2B" + "FreedomIntelligence/openPangu-Embedded-1B" ], "relevancy_score": 36.1 }, { - "architecture_id": "DUO", + "architecture_id": "RubiRLM", "total_models": 1, - "total_downloads": 995, - "min_param_count": 169627250, + "total_downloads": 1142, + "min_param_count": 988446027, "sample_models": [ - "s-sahoo/duo-distilled" + "DevHunterAI/RubiRLM-1B-Base" ], - "relevancy_score": 36.1 + "relevancy_score": 36.0 }, { "architecture_id": "Rwkv7ForCausalLM", "total_models": 1, - "total_downloads": 970, + "total_downloads": 1092, "min_param_count": 34158592, "sample_models": [ "admijgjtjtjtjjg/dfdfdf" ], - "relevancy_score": 36.0 + "relevancy_score": 35.9 }, { - "architecture_id": "RubiRLM", + "architecture_id": "SliderGPT", "total_models": 1, - "total_downloads": 964, - "min_param_count": 988446027, + "total_downloads": 1069, + "min_param_count": 47420936, "sample_models": [ - "DevHunterAI/RubiRLM-1B-Base" + 
"c-bone/CrystaLLM-pi_Mattergen-XRD" ], - "relevancy_score": 36.0 + "relevancy_score": 35.9 }, { - "architecture_id": "RavenForCausalLM", + "architecture_id": "YoutuForCausalLM", "total_models": 2, - "total_downloads": 4033, - "min_param_count": 1385228288, + "total_downloads": 4259, + "min_param_count": 1961560064, "sample_models": [ - "tomg-group-umd/huginn-0125", - "smcleish/Recurrent-Llama-3.2-train-recurrence-32" + "tencent/Youtu-LLM-2B-Base", + "tencent/Youtu-LLM-2B" ], - "relevancy_score": 35.9 + "relevancy_score": 35.5 }, { "architecture_id": "GTLMForCausalLM", "total_models": 2, - "total_downloads": 4011, + "total_downloads": 4210, "min_param_count": 2095989760, "sample_models": [ "Madras1/GTLM-1-2B-A350M", "Madras1/GTLM-1-2B-A350M-fp16" ], - "relevancy_score": 35.9 - }, - { - "architecture_id": "PanguEmbeddedForCausalLM", - "total_models": 1, - "total_downloads": 5621, - "min_param_count": 1391497728, - "sample_models": [ - "FreedomIntelligence/openPangu-Embedded-1B" - ], - "relevancy_score": 35.9 + "relevancy_score": 35.5 }, { "architecture_id": "SoraForSLM", "total_models": 1, - "total_downloads": 901, + "total_downloads": 915, "min_param_count": 450707456, "sample_models": [ "Conlanger-LLM-CLEM/Sorie" ], - "relevancy_score": 35.9 - }, - { - "architecture_id": "MoshiForConditionalGeneration", - "total_models": 2, - "total_downloads": 133468, - "min_param_count": 7783880545, - "sample_models": [ - "kmhf/hf-moshiko", - "kmhf/hf-moshika" - ], - "relevancy_score": 35.7 - }, - { - "architecture_id": "MiMoForCausalLM", - "total_models": 2, - "total_downloads": 135798, - "min_param_count": 7833409536, - "sample_models": [ - "XiaomiMiMo/MiMo-7B-Base", - "XiaomiMiMo/MiMo-7B-RL" - ], - "relevancy_score": 35.7 + "relevancy_score": 35.5 }, { "architecture_id": "HGRNBitForCausalLM", "total_models": 1, - "total_downloads": 854, + "total_downloads": 874, "min_param_count": 374108160, "sample_models": [ "ridger/MMfreeLM-370M" ], - "relevancy_score": 35.7 + 
"relevancy_score": 35.4 }, { "architecture_id": "DotLMForCausalLM", "total_models": 1, - "total_downloads": 823, + "total_downloads": 825, "min_param_count": 176204544, "sample_models": [ "tensorfiend/DotLM-165M" ], - "relevancy_score": 35.7 + "relevancy_score": 35.3 }, { - "architecture_id": "D3LMForMaskedLM", - "total_models": 1, - "total_downloads": 790, - "min_param_count": 55905164, + "architecture_id": "LLaDA2MoeModelLM", + "total_models": 6, + "total_downloads": 289792, + "min_param_count": 16255643392, "sample_models": [ - "Hengchang-Liu/D3LM-from-nt" + "inclusionAI/LLaDA2.1-flash", + "inclusionAI/LLaDA2.0-mini", + "inclusionAI/LLaDA2.1-mini", + "inclusionAI/LLaDA2.0-mini-CAP", + "inclusionAI/LLaDA2.0-flash", + "Zigeng/DMax-Coder-16B" ], - "relevancy_score": 35.6 + "relevancy_score": 35.2 }, { - "architecture_id": "DeltaNetForCausalLM", + "architecture_id": "RavenForCausalLM", + "total_models": 2, + "total_downloads": 3635, + "min_param_count": 1385228288, + "sample_models": [ + "tomg-group-umd/huginn-0125", + "smcleish/Recurrent-Llama-3.2-train-recurrence-32" + ], + "relevancy_score": 35.2 + }, + { + "architecture_id": "DUO", "total_models": 1, - "total_downloads": 4588, - "min_param_count": 1365677056, + "total_downloads": 789, + "min_param_count": 169627250, "sample_models": [ - "fla-hub/delta_net-1.3B-100B" + "s-sahoo/duo-distilled" ], - "relevancy_score": 35.5 + "relevancy_score": 35.2 }, { - "architecture_id": "VaultGemmaForCausalLM", + "architecture_id": "DeltaNetForCausalLM", "total_models": 1, - "total_downloads": 4209, - "min_param_count": 1038741120, + "total_downloads": 4682, + "min_param_count": 1365677056, "sample_models": [ - "google/vaultgemma-1b" + "fla-hub/delta_net-1.3B-100B" ], - "relevancy_score": 35.3 + "relevancy_score": 35.1 }, { - "architecture_id": "Rwkv5ForCausalLM", + "architecture_id": "MoshiForConditionalGeneration", "total_models": 2, - "total_downloads": 2960, - "min_param_count": 1577754624, + "total_downloads": 128966, + 
"min_param_count": 7783880545, "sample_models": [ - "RWKV/rwkv-5-world-3b", - "RWKV/rwkv-5-world-1b5" + "kmhf/hf-moshiko", + "kmhf/hf-moshika" ], - "relevancy_score": 35.2 + "relevancy_score": 35.0 }, { - "architecture_id": "LLaDA2MoeModelLM", - "total_models": 5, - "total_downloads": 245306, - "min_param_count": 16255643392, + "architecture_id": "MiMoForCausalLM", + "total_models": 2, + "total_downloads": 132837, + "min_param_count": 7833409536, "sample_models": [ - "inclusionAI/LLaDA2.1-flash", - "inclusionAI/LLaDA2.0-mini", - "inclusionAI/LLaDA2.1-mini", - "inclusionAI/LLaDA2.0-mini-CAP", - "inclusionAI/LLaDA2.0-flash" + "XiaomiMiMo/MiMo-7B-Base", + "XiaomiMiMo/MiMo-7B-RL" ], - "relevancy_score": 35.1 + "relevancy_score": 35.0 }, { - "architecture_id": "RWKV7ForCausalLM", - "total_models": 3, - "total_downloads": 2153, - "min_param_count": 1527404544, + "architecture_id": "Rwkv5ForCausalLM", + "total_models": 2, + "total_downloads": 3092, + "min_param_count": 1577754624, "sample_models": [ - "RWKV/RWKV7-Goose-World3-1.5B-HF", - "fla-hub/rwkv7-1.5B-world", - "RWKV/RWKV7-Goose-World3-2.9B-HF" + "RWKV/rwkv-5-world-3b", + "RWKV/rwkv-5-world-1b5" ], - "relevancy_score": 35.1 + "relevancy_score": 34.8 }, { - "architecture_id": "MegaForCausalLM", + "architecture_id": "VaultGemmaForCausalLM", "total_models": 1, - "total_downloads": 613, - "min_param_count": 126132108, + "total_downloads": 4072, + "min_param_count": 1038741120, "sample_models": [ - "BEE-spoke-data/mega-ar-126m-4k" + "google/vaultgemma-1b" ], - "relevancy_score": 35.0 + "relevancy_score": 34.8 }, { "architecture_id": "WordLatentTransformerForCausalLM", "total_models": 1, - "total_downloads": 583, + "total_downloads": 654, "min_param_count": 6861056, "sample_models": [ "sign/WeLT-string-repetition" ], - "relevancy_score": 34.9 + "relevancy_score": 34.8 + }, + { + "architecture_id": "LilleForCausalLM", + "total_models": 1, + "total_downloads": 604, + "min_param_count": 127236768, + "sample_models": [ + 
"Nikity/lille-130m-instruct" + ], + "relevancy_score": 34.6 }, { "architecture_id": "KimiK2ForCausalLM", "total_models": 1, - "total_downloads": 583, + "total_downloads": 586, "min_param_count": 170595012, "sample_models": [ "hyper-accel/tiny-random-kimi-k2" ], - "relevancy_score": 34.9 + "relevancy_score": 34.6 }, { - "architecture_id": "LilleForCausalLM", - "total_models": 1, - "total_downloads": 565, - "min_param_count": 127236768, + "architecture_id": "RuGPT3XLForCausalLM", + "total_models": 2, + "total_downloads": 2650, + "min_param_count": 1431261184, "sample_models": [ - "Nikity/lille-130m-instruct" + "evilfreelancer/ruGPT3XL", + "evilfreelancer/ruGPT3XL-8k" ], - "relevancy_score": 34.8 + "relevancy_score": 34.5 }, { "architecture_id": "GPT2CompetitiveMoE", "total_models": 1, - "total_downloads": 527, + "total_downloads": 528, "min_param_count": 497796864, "sample_models": [ "Fu01978/gpt2-4x124M-competitive-moe" ], - "relevancy_score": 34.7 + "relevancy_score": 34.3 + }, + { + "architecture_id": "MegaForCausalLM", + "total_models": 1, + "total_downloads": 522, + "min_param_count": 126132108, + "sample_models": [ + "BEE-spoke-data/mega-ar-126m-4k" + ], + "relevancy_score": 34.3 + }, + { + "architecture_id": "Qwen3ASRForConditionalGeneration", + "total_models": 1, + "total_downloads": 516, + "min_param_count": 782426112, + "sample_models": [ + "hypaai/Qwen3-ASR-0.6B_2026-03-22_04-35-10" + ], + "relevancy_score": 34.3 }, { "architecture_id": "BolmoForCausalLM", "total_models": 2, - "total_downloads": 2032, + "total_downloads": 2003, "min_param_count": 1468911776, "sample_models": [ "allenai/Bolmo-1B", "allenai/Bolmo-7B" ], - "relevancy_score": 34.3 + "relevancy_score": 33.9 }, { - "architecture_id": "XCurOSForCausalLM", + "architecture_id": "MoELLaVAQwen2ForCausalLM", "total_models": 1, - "total_downloads": 93590, - "min_param_count": 7615616512, + "total_downloads": 2702, + "min_param_count": 1406119552, "sample_models": [ - "XCurOS/XCurOS-0.1-8B-Instruct" + 
"KKHYA/llavaqwen2.5-0.5b-finetune-moe-4e-2k_20260331_194516" ], - "relevancy_score": 34.2 + "relevancy_score": 33.9 }, { - "architecture_id": "MoELLaVAQwen2ForCausalLM", + "architecture_id": "GiddForDiffusionLM", + "total_models": 2, + "total_downloads": 1803, + "min_param_count": 2844349440, + "sample_models": [ + "dvruette/gidd-unif-3b", + "dvruette/gidd-mask-3b" + ], + "relevancy_score": 33.6 + }, + { + "architecture_id": "TarsierForConditionalGeneration", "total_models": 1, - "total_downloads": 2618, - "min_param_count": 1406119552, + "total_downloads": 86089, + "min_param_count": 7063427072, "sample_models": [ - "KKHYA/llavaqwen2.5-0.5b-finetune-moe-4e-2k_20260331_194516" + "omni-research/Tarsier-7b" ], - "relevancy_score": 34.2 + "relevancy_score": 33.5 }, { "architecture_id": "OlmoHybridForCausalLM", "total_models": 4, - "total_downloads": 35834, + "total_downloads": 35503, "min_param_count": 7430870688, "sample_models": [ - "allenai/Olmo-Hybrid-7B", - "allenai/Olmo-Hybrid-Instruct-DPO-7B", - "allenai/Olmo-Hybrid-Instruct-SFT-7B", - "allenai/Olmo-Hybrid-Think-SFT-7B" + "allenai/Olmo-Hybrid-7B", + "allenai/Olmo-Hybrid-Instruct-DPO-7B", + "allenai/Olmo-Hybrid-Instruct-SFT-7B", + "allenai/Olmo-Hybrid-Think-SFT-7B" + ], + "relevancy_score": 33.4 + }, + { + "architecture_id": "ArgonneModel", + "total_models": 2, + "total_downloads": 1627, + "min_param_count": 1273807360, + "sample_models": [ + "PursuitOfDataScience/Argonne2.5-instruct", + "PursuitOfDataScience/Argonne2.5-base" ], - "relevancy_score": 34.1 + "relevancy_score": 33.4 }, { - "architecture_id": "TarsierForConditionalGeneration", + "architecture_id": "MobileLLMP1ForCausalLM", "total_models": 1, - "total_downloads": 87008, - "min_param_count": 7063427072, + "total_downloads": 2086, + "min_param_count": 1084453120, "sample_models": [ - "omni-research/Tarsier-7b" + "facebook/MobileLLM-Pro-base" ], - "relevancy_score": 34.1 + "relevancy_score": 33.3 }, { "architecture_id": "HybridQwen3ForCausalLM", 
"total_models": 9, - "total_downloads": 7400, + "total_downloads": 7439, "min_param_count": 8495712960, "sample_models": [ "amazon/GKA-primed-HQwen3-8B-Instruct", "amazon/Mamba2-primed-HQwen3-8B-Instruct", - "amazon/GDN-primed-HQwen3-8B-Instruct", "amazon/GDN-primed-HQwen3-32B-Instruct", + "amazon/GDN-primed-HQwen3-8B-Instruct", "amazon/GKA-primed-HQwen3-32B-Instruct", "amazon/BMOJOF-primed-HQwen3-8B-Instruct", "amazon/GKA-primed-HQwen3-8B-Reasoner", "amazon/GDN-primed-HQwen3-8B-Reasoner", "amazon/GKA-primed-HQwen3-32B-Reasoner" ], - "relevancy_score": 33.9 + "relevancy_score": 33.1 + }, + { + "architecture_id": "JetNemotronForCausalLM", + "total_models": 2, + "total_downloads": 8242, + "min_param_count": 3960424768, + "sample_models": [ + "jet-ai/Jet-Nemotron-2B", + "jet-ai/Jet-Nemotron-4B" + ], + "relevancy_score": 33.0 }, { "architecture_id": "Rwkv6ForCausalLM", "total_models": 8, - "total_downloads": 8437, + "total_downloads": 8905, "min_param_count": 7635746816, "sample_models": [ - "RWKV/v6-Finch-1B6-HF", "RWKV/v6-Finch-7B-HF", + "RWKV/v6-Finch-1B6-HF", + "RWKV/rwkv-6-world-3b", "RWKV/rwkv-6-world-1b6", "RWKV/rwkv-6-world-7b", "RWKV/v6-Finch-14B-HF", "RWKV/v6-Finch-3B-HF", - "RWKV/rwkv-6-world-3b-v2.1", - "RWKV/rwkv-6-world-3b" + "RWKV/rwkv-6-world-3b-v2.1" ], - "relevancy_score": 33.6 + "relevancy_score": 32.9 + }, + { + "architecture_id": "XCurOSForCausalLM", + "total_models": 1, + "total_downloads": 66986, + "min_param_count": 7615616512, + "sample_models": [ + "XCurOS/XCurOS-0.1-8B-Instruct" + ], + "relevancy_score": 32.9 + }, + { + "architecture_id": "SongGenMixedForConditionalGeneration", + "total_models": 1, + "total_downloads": 1723, + "min_param_count": 1363657956, + "sample_models": [ + "LiuZH-19/SongGen_mixed_pro" + ], + "relevancy_score": 32.9 }, { "architecture_id": "JAISLMHeadModel", "total_models": 6, - "total_downloads": 15551, + "total_downloads": 15081, "min_param_count": 7142689824, "sample_models": [ "inceptionai/jais-13b-chat", 
"katuni4ka/tiny-random-jais", "inceptionai/jais-family-30b-8k", - "inceptionai/jais-13b", "inceptionai/jais-family-13b-chat", + "inceptionai/jais-13b", "inceptionai/jais-family-6p7b-chat" ], - "relevancy_score": 33.6 + "relevancy_score": 32.8 }, { - "architecture_id": "SongGenMixedForConditionalGeneration", + "architecture_id": "OmniASRForConditionalGeneration", "total_models": 1, - "total_downloads": 1843, - "min_param_count": 1363657956, - "sample_models": [ - "LiuZH-19/SongGen_mixed_pro" - ], - "relevancy_score": 33.5 - }, - { - "architecture_id": "ArgonneModel", - "total_models": 2, - "total_downloads": 1321, - "min_param_count": 1273807360, + "total_downloads": 1633, + "min_param_count": 1631506944, "sample_models": [ - "PursuitOfDataScience/Argonne2.5-base", - "PursuitOfDataScience/Argonne2.5-instruct" + "bezzam/omniasr-llm-300m-v2" ], - "relevancy_score": 33.4 + "relevancy_score": 32.8 }, { - "architecture_id": "JetNemotronForCausalLM", + "architecture_id": "OpensciForCausalLM", "total_models": 2, - "total_downloads": 7758, - "min_param_count": 3960424768, + "total_downloads": 1031, + "min_param_count": 1714377728, "sample_models": [ - "jet-ai/Jet-Nemotron-2B", - "jet-ai/Jet-Nemotron-4B" + "ali-elganzory/1.7b-Comma0.1-300BT-longsft_16k-DPO-Tulu3-decontaminated", + "ali-elganzory/open-sci-ref-v0.02-1.7b-fineweb-edu-1.4t-300B-4096-longsft_16k-DPO-Tulu3-decontaminated" ], - "relevancy_score": 33.3 + "relevancy_score": 32.4 }, { - "architecture_id": "MobileLLMP1ForCausalLM", + "architecture_id": "Kanana2VecModel", "total_models": 1, - "total_downloads": 1750, - "min_param_count": 1084453120, + "total_downloads": 1330, + "min_param_count": 2086979328, "sample_models": [ - "facebook/MobileLLM-Pro-base" + "kakaocorp/kanana-nano-2.1b-embedding" ], - "relevancy_score": 33.3 + "relevancy_score": 32.3 }, { - "architecture_id": "OmniASRForConditionalGeneration", + "architecture_id": "DeciCoderForCausalLM", "total_models": 1, - "total_downloads": 1628, - 
"min_param_count": 1631506944, + "total_downloads": 1212, + "min_param_count": 1113671680, "sample_models": [ - "bezzam/omniasr-llm-300m-v2" + "Deci/DeciCoder-1b" ], - "relevancy_score": 33.2 + "relevancy_score": 32.1 }, { - "architecture_id": "Kanana2VecModel", + "architecture_id": "StableLMAlphaForCausalLM", "total_models": 1, - "total_downloads": 1350, - "min_param_count": 2086979328, + "total_downloads": 7022, + "min_param_count": 6889414656, "sample_models": [ - "kakaocorp/kanana-nano-2.1b-embedding" + "stabilityai/stablelm-base-alpha-7b-v2" ], - "relevancy_score": 32.8 + "relevancy_score": 32.0 }, { - "architecture_id": "GiddForDiffusionLM", - "total_models": 1, - "total_downloads": 1287, - "min_param_count": 2957629440, + "architecture_id": "IQuestCoderForCausalLM", + "total_models": 4, + "total_downloads": 17001, + "min_param_count": 7612810240, "sample_models": [ - "dvruette/gidd-unif-3b" + "IQuestLab/IQuest-Coder-V1-40B-Instruct", + "IQuestLab/IQuest-Coder-V1-7B-Instruct", + "Multilingual-Multimodal-NLP/IndustrialCoder", + "IQuestLab/IQuest-Coder-V1-40B-Thinking" ], - "relevancy_score": 32.7 + "relevancy_score": 31.8 }, { "architecture_id": "XLNetLMHeadModel", "total_models": 5, - "total_downloads": 479033, + "total_downloads": 433085, "min_param_count": null, "sample_models": [ "xlnet/xlnet-base-cased", @@ -2191,86 +2223,89 @@ "sshleifer/tiny-xlnet-base-cased", "textattack/xlnet-base-cased-imdb" ], - "relevancy_score": 32.6 + "relevancy_score": 31.5 }, { "architecture_id": "AeroForConditionalGeneration", "total_models": 1, - "total_downloads": 1269, + "total_downloads": 902, "min_param_count": 2416221184, "sample_models": [ "lmms-lab/Aero-1-Audio" ], - "relevancy_score": 32.6 - }, - { - "architecture_id": "IQuestCoderForCausalLM", - "total_models": 4, - "total_downloads": 17567, - "min_param_count": 7612810240, - "sample_models": [ - "IQuestLab/IQuest-Coder-V1-40B-Instruct", - "IQuestLab/IQuest-Coder-V1-7B-Instruct", - 
"Multilingual-Multimodal-NLP/IndustrialCoder", - "IQuestLab/IQuest-Coder-V1-40B-Thinking" - ], - "relevancy_score": 32.5 + "relevancy_score": 31.5 }, { - "architecture_id": "StableLMAlphaForCausalLM", + "architecture_id": "Qwen3VLForConditionalGeneration", "total_models": 1, - "total_downloads": 7233, - "min_param_count": 6889414656, + "total_downloads": 863, + "min_param_count": 2127532032, "sample_models": [ - "stabilityai/stablelm-base-alpha-7b-v2" + "Oysiyl/qwen3-vl-2b-unslop-good-lora-v1" ], - "relevancy_score": 32.5 + "relevancy_score": 31.4 }, { - "architecture_id": "DeciCoderForCausalLM", - "total_models": 1, - "total_downloads": 1190, - "min_param_count": 1113671680, + "architecture_id": "Qwen3MoeForCausalLM", + "total_models": 7, + "total_downloads": 5780, + "min_param_count": 8001454080, "sample_models": [ - "Deci/DeciCoder-1b" + "AIDC-AI/Marco-Nano-Instruct", + "zianglih/Qwen3-30B-A3B-Instruct-2507-MXFP8-last-8-BF16", + "AIDC-AI/Marco-Mini-Instruct", + "Dynamical-Systems/Dynamical-30B-A3B", + "bineric/lynx-instruct-30b", + "OpenMOSS-Team/SciJudge-30B", + "unsloth/Qwen3-30B-A3B-Thinking-2507" ], - "relevancy_score": 32.5 + "relevancy_score": 31.3 }, { "architecture_id": "GritLM", "total_models": 1, - "total_downloads": 30472, + "total_downloads": 31461, "min_param_count": 7241732096, "sample_models": [ "parasail-ai/GritLM-7B-vllm" ], - "relevancy_score": 31.7 + "relevancy_score": 31.3 }, { "architecture_id": "AXK1ForCausalLM", "total_models": 2, - "total_downloads": 19527, + "total_downloads": 19319, "min_param_count": 11448603648, "sample_models": [ "skt/A.X-K1", "thkim93/axk1-2layers" ], - "relevancy_score": 31.4 + "relevancy_score": 30.8 }, { - "architecture_id": "Lfm2Prototype1ForCausalLM", + "architecture_id": "VeridianForCausalLM", "total_models": 1, - "total_downloads": 735, - "min_param_count": 1212304128, + "total_downloads": 662, + "min_param_count": 1659913728, "sample_models": [ - "nntsuzu/LFM2-SFT-Prototype01-1.2B-JP" + 
"MagistrTheOne/veridian-beta" ], - "relevancy_score": 31.4 + "relevancy_score": 30.8 + }, + { + "architecture_id": "HymbaForCausalLM", + "total_models": 1, + "total_downloads": 645, + "min_param_count": 1522797824, + "sample_models": [ + "nvidia/Hymba-1.5B-Instruct" + ], + "relevancy_score": 30.8 }, { "architecture_id": "IdeficsForVisionText2Text", "total_models": 4, - "total_downloads": 10302, + "total_downloads": 10405, "min_param_count": 8929682192, "sample_models": [ "HuggingFaceM4/idefics-80b-instruct", @@ -2278,140 +2313,140 @@ "HuggingFaceM4/idefics-9b-instruct", "HuggingFaceM4/idefics-80b" ], - "relevancy_score": 31.3 - }, - { - "architecture_id": "InternVLChatModel", - "total_models": 1, - "total_downloads": 4299, - "min_param_count": 3712637952, - "sample_models": [ - "numind/NuExtract-2-4B-experimental" - ], - "relevancy_score": 31.3 + "relevancy_score": 30.7 }, { - "architecture_id": "CambrianQwenForCausalLM", + "architecture_id": "Lfm2Prototype1ForCausalLM", "total_models": 1, - "total_downloads": 4196, - "min_param_count": 3986951616, + "total_downloads": 634, + "min_param_count": 1212304128, "sample_models": [ - "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B" + "nntsuzu/LFM2-SFT-Prototype01-1.2B-JP" ], - "relevancy_score": 31.3 + "relevancy_score": 30.7 }, { - "architecture_id": "CheXagentForCausalLM", - "total_models": 1, - "total_downloads": 4149, - "min_param_count": 3140746752, + "architecture_id": "ChatGLMModel", + "total_models": 2, + "total_downloads": 17564, + "min_param_count": 9399951392, "sample_models": [ - "StanfordAIMI/CheXagent-2-3b" + "zai-org/codegeex4-all-9b", + "zai-org/glm-4-9b" ], - "relevancy_score": 31.3 + "relevancy_score": 30.6 }, { "architecture_id": "PersimmonForCausalLM", "total_models": 3, - "total_downloads": 12967, + "total_downloads": 12300, "min_param_count": 8823735296, "sample_models": [ "adept/persimmon-8b-chat", "adept/persimmon-8b-base", "pszemraj/perSLIMmon-8b-base" ], - "relevancy_score": 31.2 + "relevancy_score": 
30.5 }, { - "architecture_id": "Phi3SmallForCausalLM", - "total_models": 2, - "total_downloads": 17559, - "min_param_count": 7392272384, + "architecture_id": "FlexOlmoForCausalLM", + "total_models": 3, + "total_downloads": 12280, + "min_param_count": 11627401216, "sample_models": [ - "microsoft/Phi-3-small-8k-instruct", - "microsoft/Phi-3-small-128k-instruct" + "allenai/Flex-reddit-2x7B-1T", + "allenai/FlexOlmo-7x7B-1T-RT", + "shanearora/Flex-reddit-2x7B-1T" ], - "relevancy_score": 31.2 + "relevancy_score": 30.5 }, { - "architecture_id": "HymbaForCausalLM", + "architecture_id": "TinyChartPhiForCausalLM", "total_models": 1, - "total_downloads": 667, - "min_param_count": 1522797824, + "total_downloads": 3551, + "min_param_count": 3189407648, "sample_models": [ - "nvidia/Hymba-1.5B-Instruct" + "mPLUG/TinyChart-3B-768" ], - "relevancy_score": 31.2 + "relevancy_score": 30.5 }, { - "architecture_id": "FlexOlmoForCausalLM", - "total_models": 3, - "total_downloads": 12568, - "min_param_count": 11627401216, + "architecture_id": "MixFormerSequentialForCausalLM", + "total_models": 1, + "total_downloads": 562, + "min_param_count": 2779683840, "sample_models": [ - "allenai/Flex-reddit-2x7B-1T", - "allenai/FlexOlmo-7x7B-1T-RT", - "shanearora/Flex-reddit-2x7B-1T" + "SkunkworksAI/phi-2" ], - "relevancy_score": 31.1 + "relevancy_score": 30.5 }, { - "architecture_id": "ChatGLMModel", + "architecture_id": "Phi3SmallForCausalLM", "total_models": 2, - "total_downloads": 17362, - "min_param_count": 9399951392, + "total_downloads": 15892, + "min_param_count": 7392272384, "sample_models": [ - "zai-org/codegeex4-all-9b", - "zai-org/glm-4-9b" + "microsoft/Phi-3-small-8k-instruct", + "microsoft/Phi-3-small-128k-instruct" ], - "relevancy_score": 31.1 + "relevancy_score": 30.4 }, { "architecture_id": "SpatialLMLlamaForCausalLM", "total_models": 1, - "total_downloads": 578, - "min_param_count": 1345883776, + "total_downloads": 515, + "min_param_count": 1247355840, "sample_models": [ - 
"manycore-research/SpatialLM1.1-Llama-1B" + "manycore-research/SpatialLM-Llama-1B" ], - "relevancy_score": 30.9 + "relevancy_score": 30.3 }, { - "architecture_id": "MixFormerSequentialForCausalLM", + "architecture_id": "CambrianQwenForCausalLM", "total_models": 1, - "total_downloads": 561, - "min_param_count": 2779683840, + "total_downloads": 2935, + "min_param_count": 3986951616, "sample_models": [ - "SkunkworksAI/phi-2" + "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B" ], - "relevancy_score": 30.8 + "relevancy_score": 30.1 }, { "architecture_id": "StripedHyenaModelForCausalLM", "total_models": 3, - "total_downloads": 10515, + "total_downloads": 9917, "min_param_count": 7646024704, "sample_models": [ "togethercomputer/evo-1-131k-base", "togethercomputer/evo-1-8k-base", "togethercomputer/StripedHyena-Nous-7B" ], - "relevancy_score": 30.7 + "relevancy_score": 30.0 }, { "architecture_id": "Maira2ForConditionalGeneration", "total_models": 1, - "total_downloads": 2914, + "total_downloads": 2692, "min_param_count": 6880185600, "sample_models": [ "microsoft/maira-2" ], - "relevancy_score": 30.5 + "relevancy_score": 29.9 + }, + { + "architecture_id": "CheXagentForCausalLM", + "total_models": 1, + "total_downloads": 2648, + "min_param_count": 3140746752, + "sample_models": [ + "StanfordAIMI/CheXagent-2-3b" + ], + "relevancy_score": 29.9 }, { "architecture_id": "BioGptForCausalLM", "total_models": 5, - "total_downloads": 179801, + "total_downloads": 174528, "min_param_count": null, "sample_models": [ "microsoft/biogpt", @@ -2420,194 +2455,211 @@ "hf-tiny-model-private/tiny-random-BioGptForCausalLM", "zequnl/molxpt" ], - "relevancy_score": 30.4 + "relevancy_score": 29.5 }, { - "architecture_id": "Ernie4_5_MoeForCausalLM", - "total_models": 4, - "total_downloads": 39215, - "min_param_count": 21825437888, + "architecture_id": "InternVLChatModel", + "total_models": 1, + "total_downloads": 2229, + "min_param_count": 3712637952, "sample_models": [ - "baidu/ERNIE-4.5-21B-A3B-PT", - 
"baidu/ERNIE-4.5-21B-A3B-Base-PT", - "baidu/ERNIE-4.5-21B-A3B-Thinking", - "baidu/ERNIE-4.5-300B-A47B-PT" + "numind/NuExtract-2-4B-experimental" ], - "relevancy_score": 30.3 + "relevancy_score": 29.5 }, { - "architecture_id": "BailingMoeV2ForCausalLM", - "total_models": 5, - "total_downloads": 20571, - "min_param_count": 16255643392, + "architecture_id": "MatriochkaForCausalLM", + "total_models": 1, + "total_downloads": 2159, + "min_param_count": 3358735360, "sample_models": [ - "inclusionAI/Ling-mini-2.0", - "inclusionAI/Ling-1T", - "inclusionAI/Ring-mini-2.0", - "inclusionAI/Ling-flash-2.0", - "inclusionAI/Ling-flash-base-2.0" + "nthngdy/matryoshka-3B" + ], + "relevancy_score": 29.4 + }, + { + "architecture_id": "SolarOpenForCausalLM", + "total_models": 2, + "total_downloads": 343068, + "min_param_count": null, + "sample_models": [ + "upstage/Solar-Open-100B", + "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4" ], - "relevancy_score": 29.5 + "relevancy_score": 29.1 }, { "architecture_id": "LatentMoELLaVAPhiForCausalLM", "total_models": 1, - "total_downloads": 1728, + "total_downloads": 1792, "min_param_count": 3093139456, "sample_models": [ "KKHYA/llavaphi2-2.7b-finetune-latent-sparse-moe-4e-2k-freeze-1.0_20260304_075653" ], - "relevancy_score": 29.3 + "relevancy_score": 29.0 }, { - "architecture_id": "SolarOpenForCausalLM", - "total_models": 2, - "total_downloads": 264805, - "min_param_count": null, + "architecture_id": "LlamaForCasualLM", + "total_models": 1, + "total_downloads": 1613, + "min_param_count": 3212749824, "sample_models": [ - "upstage/Solar-Open-100B", - "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4" + "CoRover/BharatGPT-3B-Indic" ], - "relevancy_score": 29.2 + "relevancy_score": 28.8 }, { "architecture_id": "Qwen2ForSequenceClassification", "total_models": 2, - "total_downloads": 7086, + "total_downloads": 7132, "min_param_count": 7070622720, "sample_models": [ "nvidia/AceMath-7B-RM", "nvidia/Qwen2.5-CascadeRL-RM-72B" ], - "relevancy_score": 29.1 - }, - { 
- "architecture_id": "MatriochkaForCausalLM", - "total_models": 1, - "total_downloads": 1562, - "min_param_count": 3358735360, - "sample_models": [ - "nthngdy/matryoshka-3B" - ], - "relevancy_score": 29.1 + "relevancy_score": 28.6 }, { "architecture_id": "DeepseekForCausalLM", "total_models": 2, - "total_downloads": 40954, + "total_downloads": 41625, "min_param_count": 16375728128, "sample_models": [ "deepseek-ai/deepseek-moe-16b-base", "deepseek-ai/deepseek-moe-16b-chat" ], - "relevancy_score": 29.0 + "relevancy_score": 28.5 + }, + { + "architecture_id": "MobilintLlamaForCausalLM", + "total_models": 9, + "total_downloads": 31316, + "min_param_count": null, + "sample_models": [ + "mobilint/Llama-3.2-1B-Instruct", + "mobilint/Llama-3.2-3B-Instruct", + "mobilint/Llama-3.1-8B-Instruct", + "mobilint/HyperCLOVAX-SEED-Text-Instruct-1.5B", + "mobilint/Llama-3.2-1B-Instruct-Batch16", + "mobilint/Llama-3.1-8B-Instruct-Batch16", + "mobilint/Llama-3.2-3B-Instruct-Batch16", + "mobilint/Llama-3.2-3B-Instruct-Batch32", + "mobilint/Llama-3.1-8B-Instruct-Batch32" + ], + "relevancy_score": 28.3 }, { "architecture_id": "Jais2ForCausalLM", "total_models": 2, - "total_downloads": 6256, + "total_downloads": 6100, "min_param_count": 8090401280, "sample_models": [ "inceptionai/Jais-2-8B-Chat", "inceptionai/Jais-2-70B-Chat" ], - "relevancy_score": 28.9 + "relevancy_score": 28.3 }, { - "architecture_id": "ChatGLMForConditionalGeneration", - "total_models": 2, - "total_downloads": 4858, - "min_param_count": 9399951392, + "architecture_id": "BailingMoeV2ForCausalLM", + "total_models": 4, + "total_downloads": 19076, + "min_param_count": 16255643392, "sample_models": [ - "IAAR-Shanghai/xVerify-9B-C", - "qiuhuachuan/MeChat" + "inclusionAI/Ling-mini-2.0", + "inclusionAI/Ling-1T", + "inclusionAI/Ling-flash-2.0", + "inclusionAI/Ling-flash-base-2.0" ], - "relevancy_score": 28.3 + "relevancy_score": 28.0 }, { - "architecture_id": "ReformerModelWithLMHead", + "architecture_id": 
"ChatGLMForConditionalGeneration", "total_models": 2, - "total_downloads": 159282, - "min_param_count": null, + "total_downloads": 4797, + "min_param_count": 9399951392, "sample_models": [ - "google/reformer-crime-and-punishment", - "google/reformer-enwik8" + "IAAR-Shanghai/xVerify-9B-C", + "qiuhuachuan/MeChat" ], - "relevancy_score": 28.1 + "relevancy_score": 27.8 }, { "architecture_id": "LamedPhi3ForCausalLM", "total_models": 1, - "total_downloads": 985, + "total_downloads": 975, "min_param_count": 4049101904, "sample_models": [ "GoodBaiBai88/M3D-LaMed-Phi-3-4B" ], - "relevancy_score": 28.1 + "relevancy_score": 27.7 }, { - "architecture_id": "SarvamMLAForCausalLM", - "total_models": 2, - "total_downloads": 152436, - "min_param_count": 55732545631, + "architecture_id": "Gemma4TextModel", + "total_models": 1, + "total_downloads": 967, + "min_param_count": 4647449856, "sample_models": [ - "aoxo/sarvam-105b-uncensored", - "sarvamai/sarvam-105b" + "bRadu/gemma-4-E2B-it-textonly" ], - "relevancy_score": 28.0 + "relevancy_score": 27.7 }, { "architecture_id": "WeDLMForCausalLM", "total_models": 2, - "total_downloads": 4256, + "total_downloads": 4219, "min_param_count": 8190735360, "sample_models": [ "tencent/WeDLM-8B-Base", "tencent/WeDLM-8B-Instruct" ], - "relevancy_score": 28.0 + "relevancy_score": 27.5 }, { - "architecture_id": "SarvamMoEForCausalLM", + "architecture_id": "SarvamMLAForCausalLM", "total_models": 2, - "total_downloads": 149370, - "min_param_count": 32152650368, + "total_downloads": 151877, + "min_param_count": 55732545631, "sample_models": [ - "aoxo/sarvam-30b-uncensored", - "sarvamai/sarvam-30b" + "aoxo/sarvam-105b-uncensored", + "sarvamai/sarvam-105b" + ], + "relevancy_score": 27.3 + }, + { + "architecture_id": "ReformerModelWithLMHead", + "total_models": 2, + "total_downloads": 150609, + "min_param_count": null, + "sample_models": [ + "google/reformer-crime-and-punishment", + "google/reformer-enwik8" ], - "relevancy_score": 27.9 + "relevancy_score": 
27.3 }, { "architecture_id": "HyperCLOVAXForCausalLM", "total_models": 1, - "total_downloads": 31787, + "total_downloads": 31859, "min_param_count": 14748112896, "sample_models": [ "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" ], - "relevancy_score": 27.8 + "relevancy_score": 27.3 }, { - "architecture_id": "MobilintLlamaForCausalLM", - "total_models": 7, - "total_downloads": 29407, - "min_param_count": null, + "architecture_id": "LongLlamaForCausalLM", + "total_models": 1, + "total_downloads": 780, + "min_param_count": 3426474900, "sample_models": [ - "mobilint/Llama-3.2-3B-Instruct", - "mobilint/Llama-3.2-1B-Instruct", - "mobilint/Llama-3.1-8B-Instruct", - "mobilint/HyperCLOVAX-SEED-Text-Instruct-1.5B", - "mobilint/Llama-3.2-1B-Instruct-Batch16", - "mobilint/Llama-3.1-8B-Instruct-Batch16", - "mobilint/Llama-3.2-3B-Instruct-Batch16" + "syzymon/long_llama_3b" ], - "relevancy_score": 27.7 + "relevancy_score": 27.2 }, { "architecture_id": "InternLMForCausalLM", "total_models": 4, - "total_downloads": 72325, + "total_downloads": 71468, "min_param_count": null, "sample_models": [ "internlm/internlm-chat-7b", @@ -2615,123 +2667,124 @@ "internlm/internlm-7b", "internlm/internlm-chat-20b" ], - "relevancy_score": 27.7 - }, - { - "architecture_id": "LongLlamaForCausalLM", - "total_models": 1, - "total_downloads": 756, - "min_param_count": 3426474900, - "sample_models": [ - "syzymon/long_llama_3b" - ], - "relevancy_score": 27.5 + "relevancy_score": 26.9 }, { - "architecture_id": "GPTNeoXJapaneseForCausalLM", + "architecture_id": "SarvamMoEForCausalLM", "total_models": 2, - "total_downloads": 113485, - "min_param_count": null, + "total_downloads": 123774, + "min_param_count": 32152650368, "sample_models": [ - "abeja/gpt-neox-japanese-2.7b", - "hf-tiny-model-private/tiny-random-GPTNeoXJapaneseForCausalLM" + "aoxo/sarvam-30b-uncensored", + "sarvamai/sarvam-30b" ], - "relevancy_score": 27.3 + "relevancy_score": 26.9 }, { - "architecture_id": "SparseLlamaForCausalLM", + 
"architecture_id": "ZambaForCausalLM", "total_models": 1, - "total_downloads": 4221, - "min_param_count": 8185270336, + "total_downloads": 4128, + "min_param_count": 7232490496, "sample_models": [ - "openbmb/NOSA-8B" + "Zyphra/Zamba-7B-v1" ], - "relevancy_score": 27.3 + "relevancy_score": 26.8 }, { - "architecture_id": "ZambaForCausalLM", - "total_models": 1, - "total_downloads": 4140, - "min_param_count": 7232490496, + "architecture_id": "GPTNeoXJapaneseForCausalLM", + "total_models": 2, + "total_downloads": 110049, + "min_param_count": null, "sample_models": [ - "Zyphra/Zamba-7B-v1" + "abeja/gpt-neox-japanese-2.7b", + "hf-tiny-model-private/tiny-random-GPTNeoXJapaneseForCausalLM" ], - "relevancy_score": 27.3 + "relevancy_score": 26.6 }, { - "architecture_id": "Gemma4TextModel", + "architecture_id": "JetMoEForCausalLM", "total_models": 1, - "total_downloads": 650, - "min_param_count": 4647449856, + "total_downloads": 3627, + "min_param_count": 8522237952, "sample_models": [ - "bRadu/gemma-4-E2B-it-textonly" + "jetmoe/jetmoe-8b" ], - "relevancy_score": 27.1 + "relevancy_score": 26.5 }, { - "architecture_id": "BailingMoeForCausalLM", + "architecture_id": "CXRMate2ForConditionalGeneration", "total_models": 1, - "total_downloads": 21607, - "min_param_count": 16801974272, + "total_downloads": 572, + "min_param_count": 3322260224, "sample_models": [ - "inclusionAI/Ling-lite-1.5" + "aehrc/cxrmate-2" ], - "relevancy_score": 27.0 + "relevancy_score": 26.5 }, { - "architecture_id": "JetMoEForCausalLM", + "architecture_id": "BunnyPhiForCausalLM", "total_models": 1, - "total_downloads": 3665, - "min_param_count": 8522237952, + "total_downloads": 564, + "min_param_count": 3182254624, "sample_models": [ - "jetmoe/jetmoe-8b" + "BAAI/Bunny-v1_0-3B" ], - "relevancy_score": 27.0 + "relevancy_score": 26.5 }, { "architecture_id": "Step3p5ForCausalLM", "total_models": 1, - "total_downloads": 123608, + "total_downloads": 133597, "min_param_count": 199384301376, "sample_models": [ 
"stepfun-ai/Step-3.5-Flash" ], - "relevancy_score": 26.8 + "relevancy_score": 26.4 }, { - "architecture_id": "CXRMate2ForConditionalGeneration", + "architecture_id": "BailingMoeForCausalLM", "total_models": 1, - "total_downloads": 560, - "min_param_count": 3322260224, + "total_downloads": 20878, + "min_param_count": 16801974272, "sample_models": [ - "aehrc/cxrmate-2" + "inclusionAI/Ling-lite-1.5" ], - "relevancy_score": 26.8 + "relevancy_score": 26.4 }, { - "architecture_id": "BunnyPhiForCausalLM", + "architecture_id": "SparseLlamaForCausalLM", "total_models": 1, - "total_downloads": 507, - "min_param_count": 3182254624, + "total_downloads": 3131, + "min_param_count": 8185270336, "sample_models": [ - "BAAI/Bunny-v1_0-3B" + "openbmb/NOSA-8B" ], - "relevancy_score": 26.6 + "relevancy_score": 26.2 }, { "architecture_id": "Esm2LlamaInstructForCausalLM", "total_models": 1, - "total_downloads": 2603, + "total_downloads": 2589, "min_param_count": 10878983201, "sample_models": [ "xiao-fei/Prot2Text-V2-11B-Instruct-hf" ], - "relevancy_score": 26.2 + "relevancy_score": 25.8 + }, + { + "architecture_id": "Qwen2VLAudioForConditionalGeneration", + "total_models": 1, + "total_downloads": 2177, + "min_param_count": 8932935680, + "sample_models": [ + "MayaKD/qwen2-vl-audio" + ], + "relevancy_score": 25.4 }, { "architecture_id": "OuroForCausalLM", "total_models": 4, - "total_downloads": 34326, + "total_downloads": 34507, "min_param_count": null, "sample_models": [ "ByteDance/Ouro-1.4B", @@ -2739,54 +2792,53 @@ "ByteDance/Ouro-2.6B", "ByteDance/Ouro-1.4B-Thinking" ], - "relevancy_score": 26.0 + "relevancy_score": 25.3 }, { - "architecture_id": "StableDiffcoderForCausalLM", - "total_models": 2, - "total_downloads": 1719, - "min_param_count": 8250462208, + "architecture_id": "FP8Qwen3ForCausalLM", + "total_models": 1, + "total_downloads": 1945, + "min_param_count": 8190735360, "sample_models": [ - "ByteDance-Seed/Stable-DiffCoder-8B-Instruct", - 
"ByteDance-Seed/Stable-DiffCoder-8B-Base" + "xihc-ucb/Qwen3-8B-Base-train-Quasar-0809" ], - "relevancy_score": 26.0 + "relevancy_score": 25.2 }, { - "architecture_id": "Qwen2VLAudioForConditionalGeneration", + "architecture_id": "CheXagentForConditionalGeneration", "total_models": 1, - "total_downloads": 2174, - "min_param_count": 8932935680, + "total_downloads": 1878, + "min_param_count": 8362401664, "sample_models": [ - "MayaKD/qwen2-vl-audio" + "StanfordAIMI/CheXagent-8b" ], - "relevancy_score": 25.8 + "relevancy_score": 25.1 + }, + { + "architecture_id": "FP8Qwen2ForCausalLM", + "total_models": 1, + "total_downloads": 1781, + "min_param_count": 7615616512, + "sample_models": [ + "xihc-ucb/Qwen2.5-7B-train-Quasar-1214" + ], + "relevancy_score": 25.0 }, { "architecture_id": "BaiChuanForCausalLM", "total_models": 2, - "total_downloads": 51261, + "total_downloads": 50672, "min_param_count": null, "sample_models": [ "baichuan-inc/Baichuan-7B", "FreedomIntelligence/HuatuoGPT-7B" ], - "relevancy_score": 25.6 - }, - { - "architecture_id": "FP8Qwen3ForCausalLM", - "total_models": 1, - "total_downloads": 1941, - "min_param_count": 8190735360, - "sample_models": [ - "xihc-ucb/Qwen3-8B-Base-train-Quasar-0809" - ], - "relevancy_score": 25.6 + "relevancy_score": 24.9 }, { "architecture_id": "MobilintQwen2ForCausalLM", "total_models": 4, - "total_downloads": 27152, + "total_downloads": 27256, "min_param_count": null, "sample_models": [ "mobilint/Qwen2.5-0.5B-Instruct", @@ -2794,97 +2846,75 @@ "mobilint/Qwen2.5-3B-Instruct", "mobilint/Qwen2.5-7B-Instruct" ], - "relevancy_score": 25.5 - }, - { - "architecture_id": "MobilintQwen3ForCausalLM", - "total_models": 4, - "total_downloads": 25718, - "min_param_count": null, - "sample_models": [ - "mobilint/Qwen3-0.6B", - "mobilint/Qwen3-1.7B", - "mobilint/Qwen3-4B", - "mobilint/Qwen3-8B" - ], - "relevancy_score": 25.4 + "relevancy_score": 24.8 }, { - "architecture_id": "HCXVisionForCausalLM", + "architecture_id": "KORMoForCausalLM", 
"total_models": 1, - "total_downloads": 64666, - "min_param_count": null, + "total_downloads": 1616, + "min_param_count": 10756624384, "sample_models": [ - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + "KORMo-Team/KORMo-10B-sft" ], - "relevancy_score": 25.4 + "relevancy_score": 24.8 }, { - "architecture_id": "FP8Qwen2ForCausalLM", - "total_models": 1, - "total_downloads": 1774, - "min_param_count": 7615616512, + "architecture_id": "MobilintQwen3ForCausalLM", + "total_models": 4, + "total_downloads": 25791, + "min_param_count": null, "sample_models": [ - "xihc-ucb/Qwen2.5-7B-train-Quasar-1214" + "mobilint/Qwen3-0.6B", + "mobilint/Qwen3-1.7B", + "mobilint/Qwen3-4B", + "mobilint/Qwen3-8B" ], - "relevancy_score": 25.4 + "relevancy_score": 24.7 }, { - "architecture_id": "CheXagentForConditionalGeneration", + "architecture_id": "MiMoV2FlashForCausalLM", "total_models": 1, - "total_downloads": 1766, - "min_param_count": 8362401664, + "total_downloads": 61449, + "min_param_count": 309785318400, "sample_models": [ - "StanfordAIMI/CheXagent-8b" + "XiaomiMiMo/MiMo-V2-Flash" ], - "relevancy_score": 25.4 + "relevancy_score": 24.7 }, { "architecture_id": "KimiLinearForCausalLM", "total_models": 1, - "total_downloads": 60910, + "total_downloads": 61051, "min_param_count": 49122681728, "sample_models": [ "moonshotai/Kimi-Linear-48B-A3B-Instruct" ], - "relevancy_score": 25.3 + "relevancy_score": 24.7 }, { - "architecture_id": "MiMoV2FlashForCausalLM", + "architecture_id": "HCXVisionForCausalLM", "total_models": 1, - "total_downloads": 58903, - "min_param_count": 309785318400, - "sample_models": [ - "XiaomiMiMo/MiMo-V2-Flash" - ], - "relevancy_score": 25.2 - }, - { - "architecture_id": "SeedOssForCausalLM", - "total_models": 3, - "total_downloads": 30541, + "total_downloads": 60376, "min_param_count": null, "sample_models": [ - "ByteDance-Seed/Seed-OSS-36B-Instruct", - "NousResearch/Hermes-4.3-36B", - "ByteDance-Seed/Seed-OSS-36B-Base" + 
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" ], - "relevancy_score": 25.1 + "relevancy_score": 24.7 }, { - "architecture_id": "KORMoForCausalLM", + "architecture_id": "Param2MoEForCausalLM", "total_models": 1, - "total_downloads": 1512, - "min_param_count": 10756624384, + "total_downloads": 8281, + "min_param_count": 17151140480, "sample_models": [ - "KORMo-Team/KORMo-10B-sft" + "bharatgenai/Param2-17B-A2.4B-Thinking" ], - "relevancy_score": 25.0 + "relevancy_score": 24.3 }, { "architecture_id": "MobilintExaoneForCausalLM", "total_models": 4, - "total_downloads": 20678, + "total_downloads": 20759, "min_param_count": null, "sample_models": [ "mobilint/EXAONE-Deep-2.4B", @@ -2892,226 +2922,286 @@ "mobilint/EXAONE-3.5-7.8B-Instruct", "mobilint/EXAONE-Deep-7.8B" ], - "relevancy_score": 24.9 + "relevancy_score": 24.2 + }, + { + "architecture_id": "CogVLMForCausalLM", + "total_models": 2, + "total_downloads": 5861, + "min_param_count": 17639687424, + "sample_models": [ + "zai-org/cogvlm2-llama3-chat-19B", + "zai-org/cogvlm-chat-hf" + ], + "relevancy_score": 24.2 }, { "architecture_id": "MiniCPMSALAForCausalLM", "total_models": 1, - "total_downloads": 1403, + "total_downloads": 1254, "min_param_count": 9477203968, "sample_models": [ "openbmb/MiniCPM-SALA" ], - "relevancy_score": 24.8 + "relevancy_score": 24.2 }, { - "architecture_id": "CogVLMForCausalLM", - "total_models": 2, - "total_downloads": 5474, - "min_param_count": 17639687424, + "architecture_id": "Emu3ForCausalLM", + "total_models": 1, + "total_downloads": 1228, + "min_param_count": 8492011520, "sample_models": [ - "zai-org/cogvlm2-llama3-chat-19B", - "zai-org/cogvlm-chat-hf" + "BAAI/Emu3-Chat" ], - "relevancy_score": 24.6 + "relevancy_score": 24.2 }, { - "architecture_id": "LongcatFlashForCausalLM", + "architecture_id": "BunnyLlamaForCausalLM", "total_models": 1, - "total_downloads": 45650, - "min_param_count": 561862880256, + "total_downloads": 1195, + "min_param_count": 8479990848, "sample_models": 
[ - "meituan-longcat/LongCat-Flash-Chat" + "typhoon-ai/llama-3-typhoon-v1.5-8b-vision-preview" ], - "relevancy_score": 24.6 + "relevancy_score": 24.1 }, { - "architecture_id": "TrillionForCausalLM", + "architecture_id": "LongcatFlashForCausalLM", "total_models": 1, - "total_downloads": 7481, - "min_param_count": 20725519360, + "total_downloads": 43995, + "min_param_count": 561862880256, "sample_models": [ - "trillionlabs/Tri-21B-Think" + "meituan-longcat/LongCat-Flash-Chat" ], - "relevancy_score": 24.6 + "relevancy_score": 24.0 }, { "architecture_id": "InternLM3ForCausalLM", "total_models": 1, - "total_downloads": 43407, + "total_downloads": 43400, "min_param_count": null, "sample_models": [ "internlm/internlm3-8b-instruct" ], - "relevancy_score": 24.5 - }, - { - "architecture_id": "Param2MoEForCausalLM", - "total_models": 1, - "total_downloads": 7230, - "min_param_count": 17151140480, - "sample_models": [ - "bharatgenai/Param2-17B-A2.4B-Thinking" - ], - "relevancy_score": 24.5 + "relevancy_score": 24.0 }, { "architecture_id": "SteerlingForCausalLM", "total_models": 1, - "total_downloads": 1203, + "total_downloads": 1099, "min_param_count": 8391778304, "sample_models": [ "guidelabs/steerling-8b" ], - "relevancy_score": 24.5 + "relevancy_score": 23.9 }, { - "architecture_id": "Emu3ForCausalLM", + "architecture_id": "ExaoneMoEForCausalLM", "total_models": 1, - "total_downloads": 1181, - "min_param_count": 8492011520, + "total_downloads": 37191, + "min_param_count": 237099669632, "sample_models": [ - "BAAI/Emu3-Chat" + "LGAI-EXAONE/K-EXAONE-236B-A23B" ], - "relevancy_score": 24.5 + "relevancy_score": 23.6 }, { - "architecture_id": "BunnyLlamaForCausalLM", + "architecture_id": "StableDiffcoderForCausalLM", "total_models": 1, - "total_downloads": 1149, - "min_param_count": 8479990848, + "total_downloads": 871, + "min_param_count": 8250462208, "sample_models": [ - "typhoon-ai/llama-3-typhoon-v1.5-8b-vision-preview" + "ByteDance-Seed/Stable-DiffCoder-8B-Instruct" ], - 
"relevancy_score": 24.4 + "relevancy_score": 23.4 }, { "architecture_id": "MiniMaxM1ForCausalLM", "total_models": 2, - "total_downloads": 24236, + "total_downloads": 23252, "min_param_count": null, "sample_models": [ "MiniMaxAI/MiniMax-M1-40k", "MiniMaxAI/MiniMax-M1-80k" ], - "relevancy_score": 23.9 + "relevancy_score": 23.2 }, { "architecture_id": "ICONNForCausalLM", "total_models": 1, - "total_downloads": 903, + "total_downloads": 745, "min_param_count": 7833409536, "sample_models": [ "ICONNAI/ICONN-1-Mini-Beta" ], - "relevancy_score": 23.9 + "relevancy_score": 23.1 }, { "architecture_id": "Qwen2VLForConditionalGeneration", "total_models": 1, - "total_downloads": 770, + "total_downloads": 726, "min_param_count": 8291375616, "sample_models": [ "typhoon-ai/typhoon2-qwen2vl-7b-vision-instruct" ], - "relevancy_score": 23.5 + "relevancy_score": 23.0 }, { - "architecture_id": "BailingMoeV2_5ForCausalLM", + "architecture_id": "Qwen2Model", "total_models": 1, - "total_downloads": 24780, - "min_param_count": 1012474606720, + "total_downloads": 681, + "min_param_count": 7070619136, "sample_models": [ - "inclusionAI/Ring-2.5-1T" + "NewBeeKing/MemPO_Qwen2.5-SFT-RL" ], - "relevancy_score": 23.3 + "relevancy_score": 22.9 }, { - "architecture_id": "ExaoneMoEForCausalLM", + "architecture_id": "LLaDAMoEModel", "total_models": 1, - "total_downloads": 24437, - "min_param_count": 237099669632, + "total_downloads": 665, + "min_param_count": 7356880896, "sample_models": [ - "LGAI-EXAONE/K-EXAONE-236B-A23B" + "inclusionAI/LLaDA-MoE-7B-A1B-Base" ], - "relevancy_score": 23.2 + "relevancy_score": 22.8 + }, + { + "architecture_id": "Gemma4ForCausalLM", + "total_models": 1, + "total_downloads": 664, + "min_param_count": 7518069034, + "sample_models": [ + "aqweteddy/gemma-4-E4B-it-text" + ], + "relevancy_score": 22.8 + }, + { + "architecture_id": "BailingMoeV2_5ForCausalLM", + "total_models": 1, + "total_downloads": 24448, + "min_param_count": 1012474606720, + "sample_models": [ + 
"inclusionAI/Ring-2.5-1T" + ], + "relevancy_score": 22.7 }, { "architecture_id": "CogVLMVideoForCausalLM", "total_models": 1, - "total_downloads": 681, + "total_downloads": 622, "min_param_count": 12507532544, "sample_models": [ "zai-org/VisionReward-Video" ], - "relevancy_score": 23.2 + "relevancy_score": 22.7 }, { "architecture_id": "Ernie4_5ForCausalLM", "total_models": 2, - "total_downloads": 17478, + "total_downloads": 17079, "min_param_count": null, "sample_models": [ "baidu/ERNIE-4.5-0.3B-PT", "baidu/ERNIE-4.5-0.3B-Base-PT" ], - "relevancy_score": 23.1 + "relevancy_score": 22.6 }, { "architecture_id": "CLIPT5ForConditionalGeneration", "total_models": 2, - "total_downloads": 16500, + "total_downloads": 17282, "min_param_count": null, "sample_models": [ "zhiqiulin/clip-flant5-xl", "zhiqiulin/clip-flant5-xxl" ], - "relevancy_score": 23.0 + "relevancy_score": 22.6 + }, + { + "architecture_id": "CodeShellForCausalLM", + "total_models": 1, + "total_downloads": 610, + "min_param_count": 7688051328, + "sample_models": [ + "WisdomShell/CodeShell-7B" + ], + "relevancy_score": 22.6 + }, + { + "architecture_id": "SolarForCausalLM", + "total_models": 1, + "total_downloads": 21092, + "min_param_count": null, + "sample_models": [ + "upstage/solar-pro-preview-instruct" + ], + "relevancy_score": 22.4 }, { "architecture_id": "Grok1ModelForCausalLM", "total_models": 1, - "total_downloads": 21640, + "total_downloads": 20827, "min_param_count": null, "sample_models": [ "hpcai-tech/grok-1" ], - "relevancy_score": 23.0 + "relevancy_score": 22.4 }, { - "architecture_id": "CodeShellForCausalLM", + "architecture_id": "InternLM2ForCausalLM", "total_models": 1, - "total_downloads": 614, - "min_param_count": 7688051328, + "total_downloads": 513, + "min_param_count": 7737708544, "sample_models": [ - "WisdomShell/CodeShell-7B" + "AI4Chem/ChemLLM-7B-Chat" ], - "relevancy_score": 23.0 + "relevancy_score": 22.3 }, { - "architecture_id": "SolarForCausalLM", + "architecture_id": 
"GptOssPuzzleForCausalLM", "total_models": 1, - "total_downloads": 21294, - "min_param_count": null, + "total_downloads": 18998, + "min_param_count": 90837823680, "sample_models": [ - "upstage/solar-pro-preview-instruct" + "nvidia/gpt-oss-puzzle-88B" ], - "relevancy_score": 22.9 + "relevancy_score": 22.2 }, { - "architecture_id": "LLaDAMoEModel", + "architecture_id": "TrillionForCausalLM", "total_models": 1, - "total_downloads": 586, - "min_param_count": 7356880896, + "total_downloads": 3124, + "min_param_count": 20725519360, "sample_models": [ - "inclusionAI/LLaDA-MoE-7B-A1B-Base" + "trillionlabs/Tri-21B-Think" ], - "relevancy_score": 22.9 + "relevancy_score": 22.2 + }, + { + "architecture_id": "RecaLLMLlamaForCausalLM", + "total_models": 1, + "total_downloads": 506, + "min_param_count": 8030294016, + "sample_models": [ + "kswhitecross/RecaLLM-Llama-3.1-8B" + ], + "relevancy_score": 22.2 + }, + { + "architecture_id": "CohereForCausalLM", + "total_models": 1, + "total_downloads": 504, + "min_param_count": 8028033024, + "sample_models": [ + "Yousefbahr/Turjman-Cold-Start" + ], + "relevancy_score": 22.2 }, { "architecture_id": "LISAForCausalLM", "total_models": 5, - "total_downloads": 5532, + "total_downloads": 5924, "min_param_count": null, "sample_models": [ "xinlai/LISA-13B-llama2-v1", @@ -3120,1317 +3210,1350 @@ "xinlai/LISA-13B-llama2-v1-explanatory", "MBZUAI/GLaMM-GranD-Pretrained" ], - "relevancy_score": 22.6 + "relevancy_score": 22.1 }, { "architecture_id": "Qwen2_5_VLForConditionalGeneration", "total_models": 3, - "total_downloads": 10261, + "total_downloads": 10615, "min_param_count": null, "sample_models": [ "OmniSVG/OmniSVG1.1_4B", "OmniSVG/OmniSVG1.1_8B", "OmniSVG/OmniSVG" ], - "relevancy_score": 22.6 + "relevancy_score": 22.1 + }, + { + "architecture_id": "OutlierMoEForCausalLM", + "total_models": 3, + "total_downloads": 1690, + "min_param_count": 22813220976, + "sample_models": [ + "Outlier-Ai/Outlier-40B-V3.2", + "Outlier-Ai/Outlier-10B-V3.2", + 
"Outlier-Ai/Outlier-70B-V3.2" + ], + "relevancy_score": 22.1 }, { "architecture_id": "OrionForCausalLM", "total_models": 2, - "total_downloads": 13834, + "total_downloads": 13391, "min_param_count": null, "sample_models": [ "OrionStarAI/Orion-14B-Chat", "OrionStarAI/Orion-14B-Base" ], - "relevancy_score": 22.6 - }, - { - "architecture_id": "GptOssPuzzleForCausalLM", - "total_models": 1, - "total_downloads": 17281, - "min_param_count": 90837823680, - "sample_models": [ - "nvidia/gpt-oss-puzzle-88B" - ], - "relevancy_score": 22.5 + "relevancy_score": 22.0 }, { "architecture_id": "HunYuanMoEV1ForCausalLM", "total_models": 1, - "total_downloads": 16793, + "total_downloads": 15552, "min_param_count": null, "sample_models": [ "tencent/Hunyuan-A13B-Instruct" ], - "relevancy_score": 22.4 + "relevancy_score": 21.7 }, { - "architecture_id": "Dots1ForCausalLM", + "architecture_id": "GravityMoEForCausalLM", "total_models": 2, - "total_downloads": 10490, - "min_param_count": 142774381696, + "total_downloads": 1655, + "min_param_count": 16242181824, "sample_models": [ - "rednote-hilab/dots.llm1.inst", - "rednote-hilab/dots.llm1.base" + "learning-unit/L1-16B-A3B", + "trillionlabs/Gravity-16B-A3B-Base" ], - "relevancy_score": 22.0 + "relevancy_score": 21.5 }, { "architecture_id": "MiniCPM3ForCausalLM", "total_models": 1, - "total_downloads": 14420, + "total_downloads": 14024, "min_param_count": null, "sample_models": [ "openbmb/MiniCPM3-4B" ], - "relevancy_score": 22.0 + "relevancy_score": 21.5 }, { - "architecture_id": "IQuestLoopCoderForCausalLM", + "architecture_id": "ArcticForCausalLM", "total_models": 1, - "total_downloads": 14278, - "min_param_count": 39794696320, + "total_downloads": 13989, + "min_param_count": null, "sample_models": [ - "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct" + "Snowflake/snowflake-arctic-instruct" ], - "relevancy_score": 22.0 + "relevancy_score": 21.5 }, { - "architecture_id": "ArcticForCausalLM", - "total_models": 1, - "total_downloads": 14245, - 
"min_param_count": null, + "architecture_id": "Dots1ForCausalLM", + "total_models": 2, + "total_downloads": 10143, + "min_param_count": 142774381696, "sample_models": [ - "Snowflake/snowflake-arctic-instruct" + "rednote-hilab/dots.llm1.inst", + "rednote-hilab/dots.llm1.base" ], - "relevancy_score": 22.0 + "relevancy_score": 21.4 }, { - "architecture_id": "LlavaLlamaModel", - "total_models": 4, - "total_downloads": 5317, - "min_param_count": null, + "architecture_id": "IQuestLoopCoderForCausalLM", + "total_models": 1, + "total_downloads": 13700, + "min_param_count": 39794696320, "sample_models": [ - "Efficient-Large-Model/VILA1.5-3b", - "Efficient-Large-Model/NVILA-8B", - "Efficient-Large-Model/VILA1.5-13b", - "Efficient-Large-Model/NVILA-Lite-8B" + "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct" ], - "relevancy_score": 21.8 + "relevancy_score": 21.4 }, { "architecture_id": "GPT2LMHeadCustomModel", "total_models": 2, - "total_downloads": 8956, + "total_downloads": 8718, "min_param_count": null, "sample_models": [ "bigcode/santacoder", "rbiojout/santacoder-odoo-15" ], - "relevancy_score": 21.7 - }, - { - "architecture_id": "MosaicGPT", - "total_models": 3, - "total_downloads": 6562, - "min_param_count": null, - "sample_models": [ - "anas-awadalla/mpt-1b-redpajama-200b", - "anas-awadalla/mpt-1b-redpajama-200b-dolly", - "anas-awadalla/mpt-1b-redpajama-200b-hf-style" - ], - "relevancy_score": 21.6 + "relevancy_score": 21.1 }, { - "architecture_id": "OpenMoeForCausalLM", - "total_models": 5, - "total_downloads": 3349, + "architecture_id": "MiniMaxText01ForCausalLM", + "total_models": 1, + "total_downloads": 11872, "min_param_count": null, - "sample_models": [ - "hpcai-tech/openmoe-8B", - "OrionZheng/openmoe-base", - "OrionZheng/openmoe-8b", - "hpcai-tech/openmoe-base", - "OrionZheng/openmoe-8b-chat" + "sample_models": [ + "MiniMaxAI/MiniMax-Text-01" ], - "relevancy_score": 21.5 + "relevancy_score": 21.1 }, { "architecture_id": "GPTRefactForCausalLM", "total_models": 2, - 
"total_downloads": 8330, + "total_downloads": 8406, "min_param_count": null, "sample_models": [ "refactai/Refact-1_6B-fim", "refactai/Refact-1_6-base" ], - "relevancy_score": 21.5 + "relevancy_score": 21.0 }, { "architecture_id": "CrystalCoderLMHeadModel", "total_models": 2, - "total_downloads": 7989, + "total_downloads": 8126, "min_param_count": null, "sample_models": [ "LLM360/Crystal", "LLM360/CrystalChat" ], - "relevancy_score": 21.4 + "relevancy_score": 20.9 + }, + { + "architecture_id": "OpenMoeForCausalLM", + "total_models": 5, + "total_downloads": 3190, + "min_param_count": null, + "sample_models": [ + "hpcai-tech/openmoe-8B", + "OrionZheng/openmoe-base", + "hpcai-tech/openmoe-base", + "OrionZheng/openmoe-8b", + "OrionZheng/openmoe-8b-chat" + ], + "relevancy_score": 20.8 + }, + { + "architecture_id": "Bagel", + "total_models": 1, + "total_downloads": 1657, + "min_param_count": 14691079811, + "sample_models": [ + "lmms-lab/BAGEL-7B-MoT-ver.LE" + ], + "relevancy_score": 20.8 }, { "architecture_id": "MobileLlamaForCausalLM", "total_models": 4, - "total_downloads": 4006, + "total_downloads": 4211, "min_param_count": null, "sample_models": [ "mtgv/MobileVLM_V2-1.7B", + "mtgv/MobileVLM-1.7B", "mtgv/MobileVLM_V2-7B", - "mtgv/MobileVLM_V2-3B", - "mtgv/MobileVLM-1.7B" + "mtgv/MobileVLM_V2-3B" + ], + "relevancy_score": 20.7 + }, + { + "architecture_id": "BlueLMForCausalLM", + "total_models": 3, + "total_downloads": 5311, + "min_param_count": null, + "sample_models": [ + "vivo-ai/BlueLM-7B-Chat", + "vivo-ai/BlueLM-7B-Chat-32K", + "vivo-ai/BlueLM-7B-Base" ], - "relevancy_score": 21.2 + "relevancy_score": 20.6 }, { "architecture_id": "modeling_camelidae.LlamaForCausalLM", "total_models": 3, - "total_downloads": 5505, + "total_downloads": 5073, "min_param_count": null, "sample_models": [ "hywu/Camelidae-8x34B", "hywu/Camelidae-8x7B", "hywu/Camelidae-8x13B" ], - "relevancy_score": 21.2 + "relevancy_score": 20.5 }, { - "architecture_id": "BlueLMForCausalLM", + 
"architecture_id": "MosaicGPT", "total_models": 3, - "total_downloads": 5360, + "total_downloads": 4781, "min_param_count": null, "sample_models": [ - "vivo-ai/BlueLM-7B-Chat", - "vivo-ai/BlueLM-7B-Base", - "vivo-ai/BlueLM-7B-Chat-32K" + "anas-awadalla/mpt-1b-redpajama-200b", + "anas-awadalla/mpt-1b-redpajama-200b-dolly", + "anas-awadalla/mpt-1b-redpajama-200b-hf-style" ], - "relevancy_score": 21.2 + "relevancy_score": 20.4 }, { - "architecture_id": "Bagel", - "total_models": 1, - "total_downloads": 1511, - "min_param_count": 14691079811, + "architecture_id": "MultiScaleForCausalLM", + "total_models": 3, + "total_downloads": 4272, + "min_param_count": null, "sample_models": [ - "lmms-lab/BAGEL-7B-MoT-ver.LE" + "KoinicLabs/AXL-Translate", + "KoinicLabs/AXL-Vision-v2", + "KoinicLabs/AXL-Chat-10M" ], - "relevancy_score": 21.0 + "relevancy_score": 20.1 }, { - "architecture_id": "GPT2Model", + "architecture_id": "ModernBertDecoderForCausalLM", "total_models": 2, - "total_downloads": 5577, + "total_downloads": 5509, "min_param_count": null, "sample_models": [ - "keshan/sinhala-gpt2", - "cerebras/Cerebras-GPT-13B" + "jhu-clsp/ettin-decoder-400m", + "jhu-clsp/ettin-decoder-32m" ], - "relevancy_score": 20.6 + "relevancy_score": 20.1 }, { "architecture_id": "LiquidForCausalLM", "total_models": 2, - "total_downloads": 5508, + "total_downloads": 5676, "min_param_count": null, "sample_models": [ "reaperdoesntknow/DNA-175M", "reaperdoesntknow/DNA-50M" ], - "relevancy_score": 20.6 + "relevancy_score": 20.1 }, { - "architecture_id": "ModernBertDecoderForCausalLM", - "total_models": 2, - "total_downloads": 5245, + "architecture_id": "LlavaLlamaModel", + "total_models": 3, + "total_downloads": 4035, "min_param_count": null, "sample_models": [ - "jhu-clsp/ettin-decoder-400m", - "jhu-clsp/ettin-decoder-32m" + "Efficient-Large-Model/VILA1.5-3b", + "Efficient-Large-Model/NVILA-8B", + "Efficient-Large-Model/VILA1.5-13b" ], - "relevancy_score": 20.5 + "relevancy_score": 20.0 }, { - 
"architecture_id": "BottleneckT5LMWithPerturb", - "total_models": 4, - "total_downloads": 2754, + "architecture_id": "GPT2Model", + "total_models": 2, + "total_downloads": 5435, "min_param_count": null, "sample_models": [ - "thesephist/contra-bottleneck-t5-small-wikipedia", - "thesephist/contra-bottleneck-t5-base-wikipedia", - "thesephist/contra-bottleneck-t5-large-wikipedia", - "thesephist/contra-bottleneck-t5-xl-wikipedia" + "cerebras/Cerebras-GPT-13B", + "keshan/sinhala-gpt2" ], - "relevancy_score": 20.4 + "relevancy_score": 20.0 }, { - "architecture_id": "MultiScaleForCausalLM", - "total_models": 3, - "total_downloads": 3737, + "architecture_id": "KonkanGPT", + "total_models": 2, + "total_downloads": 5113, "min_param_count": null, "sample_models": [ - "KoinicLabs/AXL-Vision-v2", - "KoinicLabs/AXL-Translate", - "KoinicLabs/AXL-Chat-10M" + "omdeep22/Gonyai-teo2", + "omdeep22/Gonyai-v1" ], - "relevancy_score": 20.4 + "relevancy_score": 19.9 }, { "architecture_id": "InternLMXComposer2ForCausalLM", "total_models": 1, - "total_downloads": 6744, + "total_downloads": 6712, "min_param_count": null, "sample_models": [ "internlm/internlm-xcomposer2-7b" ], - "relevancy_score": 20.4 + "relevancy_score": 19.9 }, { - "architecture_id": "KonkanGPT", - "total_models": 2, - "total_downloads": 4822, + "architecture_id": "BottleneckT5LMWithPerturb", + "total_models": 4, + "total_downloads": 2779, "min_param_count": null, "sample_models": [ - "omdeep22/Gonyai-teo2", - "omdeep22/Gonyai-v1" + "thesephist/contra-bottleneck-t5-small-wikipedia", + "thesephist/contra-bottleneck-t5-base-wikipedia", + "thesephist/contra-bottleneck-t5-large-wikipedia", + "thesephist/contra-bottleneck-t5-xl-wikipedia" ], - "relevancy_score": 20.3 + "relevancy_score": 19.8 }, { - "architecture_id": "GraphT5TransformerForConditionalGeneration", - "total_models": 1, - "total_downloads": 6732, + "architecture_id": "NanochatWasmFusedModel", + "total_models": 2, + "total_downloads": 4952, "min_param_count": null, 
"sample_models": [ - "haitengzhao/gimlet" + "eastlondoner/nanochat-wasm-fused-preview-01", + "eastlondoner/nanochat-wasm-fused-preview-02" ], - "relevancy_score": 20.3 + "relevancy_score": 19.8 }, { "architecture_id": "MobilintExaone4ForCausalLM", "total_models": 1, - "total_downloads": 6493, + "total_downloads": 6518, "min_param_count": null, "sample_models": [ "mobilint/EXAONE-4.0-1.2B" ], - "relevancy_score": 20.3 + "relevancy_score": 19.8 }, { "architecture_id": "LlamaMoEForCausalLM", "total_models": 3, - "total_downloads": 3428, + "total_downloads": 3452, "min_param_count": null, "sample_models": [ "llama-moe/LLaMA-MoE-v1-3_5B-2_8", "llama-moe/LLaMA-MoE-v1-3_0B-2_16", "llama-moe/LLaMA-MoE-v1-3_5B-4_16" ], - "relevancy_score": 20.2 + "relevancy_score": 19.7 }, { "architecture_id": "RobertaForCausalLM", "total_models": 2, - "total_downloads": 4402, + "total_downloads": 4451, "min_param_count": null, "sample_models": [ "uf-aice-lab/math-roberta", "gokceuludogan/ChemBERTaLM" ], - "relevancy_score": 20.1 + "relevancy_score": 19.6 }, { "architecture_id": "MossForCausalLM", "total_models": 2, - "total_downloads": 4348, + "total_downloads": 4390, "min_param_count": null, "sample_models": [ "OpenMOSS-Team/moss-moon-003-sft", "OpenMOSS-Team/moss-moon-003-base" ], - "relevancy_score": 20.0 + "relevancy_score": 19.6 }, { - "architecture_id": "BartForCausalLM", - "total_models": 2, - "total_downloads": 4190, + "architecture_id": "Qwen3TSForCausalLM", + "total_models": 1, + "total_downloads": 5950, "min_param_count": null, "sample_models": [ - "sanchit-gandhi/tiny-random-bart-fp16", - "hf-tiny-model-private/tiny-random-BartForCausalLM" + "bytedance-research/ChatTS-8B" ], - "relevancy_score": 20.0 + "relevancy_score": 19.6 }, { "architecture_id": "Int8OPTForCausalLM", "total_models": 2, - "total_downloads": 4147, + "total_downloads": 4242, "min_param_count": null, "sample_models": [ "mit-han-lab/opt-125m-smoothquant", "mit-han-lab/opt-6.7b-smoothquant" ], - 
"relevancy_score": 19.9 + "relevancy_score": 19.5 }, { - "architecture_id": "InternLMXComposerForCausalLM", - "total_models": 1, - "total_downloads": 5444, + "architecture_id": "BartForCausalLM", + "total_models": 2, + "total_downloads": 4186, "min_param_count": null, "sample_models": [ - "internlm/internlm-xcomposer-7b" + "sanchit-gandhi/tiny-random-bart-fp16", + "hf-tiny-model-private/tiny-random-BartForCausalLM" ], - "relevancy_score": 19.9 + "relevancy_score": 19.5 }, { "architecture_id": "TranceptionLMHeadModel", "total_models": 2, - "total_downloads": 3959, + "total_downloads": 4081, "min_param_count": null, "sample_models": [ "PascalNotin/Tranception_Large", "PascalNotin/Tranception_Small" ], - "relevancy_score": 19.8 + "relevancy_score": 19.4 }, { - "architecture_id": "ModelStarOLMhead", + "architecture_id": "InternLMXComposerForCausalLM", "total_models": 1, - "total_downloads": 5177, - "min_param_count": null, - "sample_models": [ - "Hawa-Al-Akram/StarO-Ai" - ], - "relevancy_score": 19.8 - }, - { - "architecture_id": "NanochatWasmFusedModel", - "total_models": 2, - "total_downloads": 3734, + "total_downloads": 5363, "min_param_count": null, "sample_models": [ - "eastlondoner/nanochat-wasm-fused-preview-01", - "eastlondoner/nanochat-wasm-fused-preview-02" + "internlm/internlm-xcomposer-7b" ], - "relevancy_score": 19.7 + "relevancy_score": 19.4 }, { - "architecture_id": "Qwen3TSForCausalLM", + "architecture_id": "ModelStarOLMhead", "total_models": 1, - "total_downloads": 5082, + "total_downloads": 5178, "min_param_count": null, "sample_models": [ - "bytedance-research/ChatTS-8B" + "Hawa-Al-Akram/StarO-Ai" ], - "relevancy_score": 19.7 + "relevancy_score": 19.3 }, { - "architecture_id": "TransfoXLLMHeadModel", - "total_models": 1, - "total_downloads": 4728, + "architecture_id": "Olmo2ForSequenceClassification", + "total_models": 2, + "total_downloads": 3565, "min_param_count": null, "sample_models": [ - "transfo-xl/transfo-xl-wt103" + 
"allenai/OLMo-2-1124-7B-RM", + "LifeWiki-ai/OLMo-2-1124-7B-RM" ], - "relevancy_score": 19.6 + "relevancy_score": 19.1 }, { - "architecture_id": "Olmo2ForSequenceClassification", - "total_models": 2, - "total_downloads": 3444, + "architecture_id": "GraphT5TransformerForConditionalGeneration", + "total_models": 1, + "total_downloads": 4795, "min_param_count": null, "sample_models": [ - "allenai/OLMo-2-1124-7B-RM", - "LifeWiki-ai/OLMo-2-1124-7B-RM" + "haitengzhao/gimlet" ], - "relevancy_score": 19.5 + "relevancy_score": 19.1 }, { "architecture_id": "EvafrillMoForCausalLM", "total_models": 1, - "total_downloads": 4484, + "total_downloads": 4556, "min_param_count": null, "sample_models": [ "pathcosmos/EVAFRILL-Mo-3B" ], - "relevancy_score": 19.4 + "relevancy_score": 19.0 }, { "architecture_id": "Qwen2TSForCausalLM", "total_models": 1, - "total_downloads": 3992, + "total_downloads": 4042, "min_param_count": null, "sample_models": [ "bytedance-research/ChatTS-14B" ], - "relevancy_score": 19.2 + "relevancy_score": 18.8 }, { "architecture_id": "QEDForCausalLM", "total_models": 1, - "total_downloads": 3794, + "total_downloads": 4040, "min_param_count": null, "sample_models": [ "levossadtchi/QED-75M" ], - "relevancy_score": 19.1 + "relevancy_score": 18.8 }, { - "architecture_id": "LongcatCausalLM", + "architecture_id": "MochivaForCausalLM", "total_models": 1, - "total_downloads": 3590, - "min_param_count": 561862880256, + "total_downloads": 3969, + "min_param_count": null, "sample_models": [ - "meituan-longcat/LongCat-Flash-Thinking-2601" + "Mochiva-team/Mochiva-model" ], - "relevancy_score": 18.9 + "relevancy_score": 18.7 }, { - "architecture_id": "YuanForCausalLM", - "total_models": 3, - "total_downloads": 1880, + "architecture_id": "TransfoXLLMHeadModel", + "total_models": 1, + "total_downloads": 3909, "min_param_count": null, "sample_models": [ - "IEITYuan/Yuan2-M32-hf", - "IEITYuan/Yuan2-2B-Mars-hf", - "IEITYuan/Yuan2-2B-Janus-hf" + "transfo-xl/transfo-xl-wt103" ], - 
"relevancy_score": 18.8 + "relevancy_score": 18.7 }, { - "architecture_id": "GomeForCausalLM", + "architecture_id": "LongcatCausalLM", "total_models": 1, - "total_downloads": 3428, - "min_param_count": null, + "total_downloads": 3688, + "min_param_count": 561862880256, "sample_models": [ - "Prositron/gome" + "meituan-longcat/LongCat-Flash-Thinking-2601" ], - "relevancy_score": 18.8 + "relevancy_score": 18.6 }, { - "architecture_id": "GravityMoEForCausalLM", + "architecture_id": "GomeForCausalLM", "total_models": 1, - "total_downloads": 541, - "min_param_count": 16242181824, + "total_downloads": 3500, + "min_param_count": null, "sample_models": [ - "learning-unit/L1-16B-A3B" + "Prositron/gome" ], - "relevancy_score": 18.7 + "relevancy_score": 18.5 }, { - "architecture_id": "GPT", - "total_models": 2, - "total_downloads": 2262, + "architecture_id": "YuanForCausalLM", + "total_models": 3, + "total_downloads": 1888, "min_param_count": null, "sample_models": [ - "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M", - "LH-Tech-AI/Apex-1.5-Instruct-350M" + "IEITYuan/Yuan2-M32-hf", + "IEITYuan/Yuan2-2B-Mars-hf", + "IEITYuan/Yuan2-2B-Janus-hf" ], - "relevancy_score": 18.6 + "relevancy_score": 18.4 }, { "architecture_id": "MyAwesomeModelForCausalLM", "total_models": 1, - "total_downloads": 3025, + "total_downloads": 3098, "min_param_count": null, "sample_models": [ "dongbobo/MyAwesomeModel" ], - "relevancy_score": 18.6 + "relevancy_score": 18.2 }, { - "architecture_id": "CTRLLMHeadModel", + "architecture_id": "QHEARTForECGQA", "total_models": 1, - "total_downloads": 2889, + "total_downloads": 2956, "min_param_count": null, "sample_models": [ - "sshleifer/tiny-ctrl" + "Manhph2211/Q-HEART" ], - "relevancy_score": 18.5 + "relevancy_score": 18.1 }, { - "architecture_id": "CPMAntForCausalLM", + "architecture_id": "CTRLLMHeadModel", "total_models": 1, - "total_downloads": 2814, + "total_downloads": 2941, "min_param_count": null, "sample_models": [ - "openbmb/cpm-ant-10b" + 
"sshleifer/tiny-ctrl" ], - "relevancy_score": 18.4 + "relevancy_score": 18.1 }, { - "architecture_id": "TAMELM", + "architecture_id": "GPT2CustomLMHeadModel", "total_models": 1, - "total_downloads": 2738, + "total_downloads": 2852, "min_param_count": null, "sample_models": [ - "reaperdoesntknow/TameForCasualLM" + "fxmarty/tiny-testing-gpt2-remote-code" ], - "relevancy_score": 18.3 + "relevancy_score": 18.0 }, { - "architecture_id": "CoherenceMomentumModel", + "architecture_id": "TAMELM", "total_models": 1, - "total_downloads": 2731, + "total_downloads": 2823, "min_param_count": null, "sample_models": [ - "aisingapore/coherence-momentum" + "reaperdoesntknow/TameForCasualLM" ], - "relevancy_score": 18.3 + "relevancy_score": 18.0 }, { - "architecture_id": "GPT2CustomLMHeadModel", + "architecture_id": "CoherenceMomentumModel", "total_models": 1, - "total_downloads": 2691, + "total_downloads": 2756, "min_param_count": null, "sample_models": [ - "fxmarty/tiny-testing-gpt2-remote-code" + "aisingapore/coherence-momentum" ], - "relevancy_score": 18.3 + "relevancy_score": 17.9 }, { "architecture_id": "GPT2", "total_models": 1, - "total_downloads": 2643, + "total_downloads": 2709, "min_param_count": null, "sample_models": [ "NamrataThakur/Small_Language_Model_MHA_53M_Pretrained" ], - "relevancy_score": 18.3 + "relevancy_score": 17.9 }, { "architecture_id": "GQAGPT2", "total_models": 1, - "total_downloads": 2637, + "total_downloads": 2699, "min_param_count": null, "sample_models": [ "NamrataThakur/Small_Language_Model_GQA_48M_Pretrained" ], - "relevancy_score": 18.3 + "relevancy_score": 17.9 }, { - "architecture_id": "MoEGPT2", + "architecture_id": "ThinkerLM", "total_models": 1, - "total_downloads": 2636, + "total_downloads": 2697, "min_param_count": null, "sample_models": [ - "NamrataThakur/Small_Language_Model_MOE_127M_Pretrained" + "prskid1000/micro-Omni" ], - "relevancy_score": 18.3 + "relevancy_score": 17.9 }, { - "architecture_id": "ThinkerLM", + "architecture_id": 
"CPMAntForCausalLM", "total_models": 1, - "total_downloads": 2627, + "total_downloads": 2693, "min_param_count": null, "sample_models": [ - "prskid1000/micro-Omni" + "openbmb/cpm-ant-10b" ], - "relevancy_score": 18.2 + "relevancy_score": 17.9 }, { - "architecture_id": "QHEARTForECGQA", + "architecture_id": "D3PMSanskritModel", "total_models": 1, - "total_downloads": 2624, + "total_downloads": 2676, "min_param_count": null, "sample_models": [ - "Manhph2211/Q-HEART" + "bhsinghgrid/sanskrit-translation" ], - "relevancy_score": 18.2 + "relevancy_score": 17.9 }, { - "architecture_id": "SeerAttnLlamaForCausalLM", + "architecture_id": "GuppyLM", "total_models": 1, - "total_downloads": 2618, + "total_downloads": 2655, "min_param_count": null, "sample_models": [ - "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates" + "arman-bd/guppylm-9M" ], - "relevancy_score": 18.2 + "relevancy_score": 17.9 }, { - "architecture_id": "D3PMSanskritModel", + "architecture_id": "MoEGPT2", "total_models": 1, - "total_downloads": 2603, + "total_downloads": 2552, "min_param_count": null, "sample_models": [ - "bhsinghgrid/sanskrit-translation" + "NamrataThakur/Small_Language_Model_MOE_127M_Pretrained" ], - "relevancy_score": 18.2 + "relevancy_score": 17.8 }, { - "architecture_id": "MoYiForCausalLM", + "architecture_id": "JiRackTernary1B", "total_models": 1, - "total_downloads": 2433, + "total_downloads": 2529, "min_param_count": null, "sample_models": [ - "astanahub/alemllm" + "kgrabko/JiRackTernary_1b" ], - "relevancy_score": 18.1 + "relevancy_score": 17.8 }, { - "architecture_id": "Eagle3DeepseekV2ForCausalLM", + "architecture_id": "Speech2TextTransformerForConditionalGeneration", "total_models": 1, - "total_downloads": 2367, + "total_downloads": 2373, "min_param_count": null, "sample_models": [ - "nvidia/Kimi-K2.5-Thinking-Eagle3" + "valhalla/s2t_mustc_multilinguial_medium" ], - "relevancy_score": 18.0 + "relevancy_score": 17.6 }, { - "architecture_id": 
"Speech2TextTransformerForConditionalGeneration", + "architecture_id": "Eagle3DeepseekV2ForCausalLM", "total_models": 1, - "total_downloads": 2304, + "total_downloads": 2367, "min_param_count": null, "sample_models": [ - "valhalla/s2t_mustc_multilinguial_medium" + "nvidia/Kimi-K2.5-Thinking-Eagle3" ], - "relevancy_score": 18.0 + "relevancy_score": 17.6 }, { - "architecture_id": "Videollama2Qwen2ForCausalLM", + "architecture_id": "GPTXForCausalLM", "total_models": 1, - "total_downloads": 2257, + "total_downloads": 2295, "min_param_count": null, "sample_models": [ - "QuangTuan/MultiMood-7B-GRPO-VisualAudioText-Comp" + "AxiomicLabs/GPT-X-125M" ], - "relevancy_score": 17.9 + "relevancy_score": 17.5 }, { "architecture_id": "WhisperMixStyleForConditionalGeneration", "total_models": 1, - "total_downloads": 2206, + "total_downloads": 2258, "min_param_count": null, "sample_models": [ "wago5090/mixstyle_multi-s" ], - "relevancy_score": 17.9 + "relevancy_score": 17.5 }, { - "architecture_id": "AlinlightForCausalLM", + "architecture_id": "Videollama2Qwen2ForCausalLM", "total_models": 1, - "total_downloads": 2193, + "total_downloads": 2223, "min_param_count": null, "sample_models": [ - "EngineerGL/Alinlight" + "QuangTuan/MultiMood-7B-GRPO-VisualAudioText-Comp" ], - "relevancy_score": 17.8 + "relevancy_score": 17.5 }, { - "architecture_id": "GuppyLM", + "architecture_id": "GPT", + "total_models": 2, + "total_downloads": 1600, + "min_param_count": null, + "sample_models": [ + "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M", + "LH-Tech-AI/Apex-1.5-Instruct-350M" + ], + "relevancy_score": 17.4 + }, + { + "architecture_id": "SeerAttnLlamaForCausalLM", "total_models": 1, - "total_downloads": 2179, + "total_downloads": 2171, "min_param_count": null, "sample_models": [ - "arman-bd/guppylm-9M" + "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates" ], - "relevancy_score": 17.8 + "relevancy_score": 17.4 }, { - "architecture_id": "LlamaForCausalLMEagle", + "architecture_id": 
"Typhoon2Audio2AudioForConditionalGeneration", "total_models": 1, - "total_downloads": 2127, + "total_downloads": 2169, "min_param_count": null, "sample_models": [ - "thunlp/LLaMA3-Instruct-8B-FR-Spec" + "typhoon-ai/llama3.1-typhoon2-audio-8b-instruct" ], - "relevancy_score": 17.8 + "relevancy_score": 17.4 }, { - "architecture_id": "JiRackTernary1B", + "architecture_id": "LlamaForCausalLMEagle", "total_models": 1, - "total_downloads": 2121, + "total_downloads": 2167, "min_param_count": null, "sample_models": [ - "kgrabko/JiRackTernary_1b" + "thunlp/LLaMA3-Instruct-8B-FR-Spec" ], - "relevancy_score": 17.8 + "relevancy_score": 17.4 }, { - "architecture_id": "RuGPT3XLForCausalLM", + "architecture_id": "DenseLLM", "total_models": 1, - "total_downloads": 2110, + "total_downloads": 2167, "min_param_count": null, "sample_models": [ - "evilfreelancer/ruGPT3XL" + "AlgoDriveAI/Sanskrit_Akkadian_LLM_v1.0" ], - "relevancy_score": 17.8 + "relevancy_score": 17.4 }, { - "architecture_id": "Typhoon2Audio2AudioForConditionalGeneration", + "architecture_id": "AlinlightForCausalLM", "total_models": 1, - "total_downloads": 2107, + "total_downloads": 2158, "min_param_count": null, "sample_models": [ - "typhoon-ai/llama3.1-typhoon2-audio-8b-instruct" + "EngineerGL/Alinlight" ], - "relevancy_score": 17.8 + "relevancy_score": 17.4 }, { - "architecture_id": "PointLLMLlamaForCausalLM", - "total_models": 2, - "total_downloads": 1493, + "architecture_id": "TeleFLMForCausalLM", + "total_models": 1, + "total_downloads": 2150, "min_param_count": null, "sample_models": [ - "RunsenXu/PointLLM_7B_v1.1_init", - "RunsenXu/PointLLM_7B_v1.2" + "CofeAI/Tele-FLM-1T" ], - "relevancy_score": 17.7 + "relevancy_score": 17.4 }, { - "architecture_id": "LlaMAForCausalLM", + "architecture_id": "TFGPT2LMHeadModel", "total_models": 1, - "total_downloads": 2097, + "total_downloads": 2140, "min_param_count": null, "sample_models": [ - "circulus/alpaca-7b" + "mymusise/gpt2-medium-chinese" ], - "relevancy_score": 17.7 
+ "relevancy_score": 17.4 }, { - "architecture_id": "TeleFLMForCausalLM", + "architecture_id": "LlaMAForCausalLM", "total_models": 1, - "total_downloads": 2093, + "total_downloads": 2121, "min_param_count": null, "sample_models": [ - "CofeAI/Tele-FLM-1T" + "circulus/alpaca-7b" ], - "relevancy_score": 17.7 + "relevancy_score": 17.4 }, { "architecture_id": "GeoVForCausalLM", "total_models": 1, - "total_downloads": 2090, + "total_downloads": 2118, "min_param_count": null, "sample_models": [ "GeoV/GeoV-9b" ], - "relevancy_score": 17.7 + "relevancy_score": 17.4 }, { - "architecture_id": "TFGPT2LMHeadModel", + "architecture_id": "GPTModelForTextGeneration", "total_models": 1, - "total_downloads": 2076, + "total_downloads": 2107, "min_param_count": null, "sample_models": [ - "mymusise/gpt2-medium-chinese" + "samkeet/GPT_124M-Instruct" ], - "relevancy_score": 17.7 + "relevancy_score": 17.4 }, { - "architecture_id": "RobertaPreLayerNormForCausalLM", - "total_models": 1, - "total_downloads": 2074, + "architecture_id": "IndexForCausalLM", + "total_models": 2, + "total_downloads": 1559, "min_param_count": null, "sample_models": [ - "hf-tiny-model-private/tiny-random-RobertaPreLayerNormForCausalLM" + "IndexTeam/Index-1.9B-Chat", + "IndexTeam/Index-1.9B-Pure" ], - "relevancy_score": 17.7 + "relevancy_score": 17.3 }, { "architecture_id": "ElectraForCausalLM", "total_models": 1, - "total_downloads": 2071, + "total_downloads": 2103, "min_param_count": null, "sample_models": [ "smeoni/nbme-electra-large-generator" ], - "relevancy_score": 17.7 + "relevancy_score": 17.3 }, { - "architecture_id": "GPTModelForTextGeneration", + "architecture_id": "PegasusForCausalLM", "total_models": 1, - "total_downloads": 2059, + "total_downloads": 2056, "min_param_count": null, "sample_models": [ - "samkeet/GPT_124M-Instruct" + "hf-tiny-model-private/tiny-random-PegasusForCausalLM" ], - "relevancy_score": 17.7 + "relevancy_score": 17.3 }, { - "architecture_id": "PegasusForCausalLM", + 
"architecture_id": "RobertaPreLayerNormForCausalLM", "total_models": 1, - "total_downloads": 2032, + "total_downloads": 2047, "min_param_count": null, "sample_models": [ - "hf-tiny-model-private/tiny-random-PegasusForCausalLM" + "hf-tiny-model-private/tiny-random-RobertaPreLayerNormForCausalLM" ], - "relevancy_score": 17.7 + "relevancy_score": 17.3 }, { "architecture_id": "BlenderbotForCausalLM", "total_models": 1, - "total_downloads": 2026, + "total_downloads": 2046, "min_param_count": null, "sample_models": [ "hf-tiny-model-private/tiny-random-BlenderbotForCausalLM" ], - "relevancy_score": 17.7 + "relevancy_score": 17.3 }, { - "architecture_id": "DenseLLM", + "architecture_id": "XModelForCausalLM", "total_models": 1, - "total_downloads": 2011, + "total_downloads": 2037, "min_param_count": null, "sample_models": [ - "AlgoDriveAI/Sanskrit_Akkadian_LLM_v1.0" + "XiaoduoAILab/Xmodel_LM" ], - "relevancy_score": 17.7 + "relevancy_score": 17.3 }, { - "architecture_id": "OtterForConditionalGeneration", - "total_models": 2, - "total_downloads": 1460, + "architecture_id": "EnergyTransformer", + "total_models": 1, + "total_downloads": 2031, "min_param_count": null, "sample_models": [ - "luodian/OTTER-Video-LLaMA7B-DenseCaption", - "luodian/OTTER-MPT1B-RPJama-Init" + "cccczshao/CALM-M" ], - "relevancy_score": 17.6 + "relevancy_score": 17.3 }, { - "architecture_id": "MonkeyLMHeadModel", - "total_models": 2, - "total_downloads": 1486, + "architecture_id": "MvpForCausalLM", + "total_models": 1, + "total_downloads": 2018, "min_param_count": null, "sample_models": [ - "echo840/Monkey-Chat", - "echo840/Monkey" + "hf-tiny-model-private/tiny-random-MvpForCausalLM" ], - "relevancy_score": 17.6 + "relevancy_score": 17.3 }, { - "architecture_id": "IndexForCausalLM", + "architecture_id": "OtterForConditionalGeneration", "total_models": 2, - "total_downloads": 1467, + "total_downloads": 1473, "min_param_count": null, "sample_models": [ - "IndexTeam/Index-1.9B-Chat", - 
"IndexTeam/Index-1.9B-Pure" + "luodian/OTTER-Video-LLaMA7B-DenseCaption", + "luodian/OTTER-MPT1B-RPJama-Init" ], - "relevancy_score": 17.6 + "relevancy_score": 17.2 }, { - "architecture_id": "EnergyTransformer", - "total_models": 1, - "total_downloads": 1991, + "architecture_id": "MonkeyLMHeadModel", + "total_models": 2, + "total_downloads": 1496, "min_param_count": null, "sample_models": [ - "cccczshao/CALM-M" + "echo840/Monkey-Chat", + "echo840/Monkey" ], - "relevancy_score": 17.6 + "relevancy_score": 17.2 }, { - "architecture_id": "MvpForCausalLM", - "total_models": 1, - "total_downloads": 1990, + "architecture_id": "PointLLMLlamaForCausalLM", + "total_models": 2, + "total_downloads": 1497, "min_param_count": null, "sample_models": [ - "hf-tiny-model-private/tiny-random-MvpForCausalLM" + "RunsenXu/PointLLM_7B_v1.1_init", + "RunsenXu/PointLLM_7B_v1.2" ], - "relevancy_score": 17.6 + "relevancy_score": 17.2 }, { - "architecture_id": "XModelForCausalLM", + "architecture_id": "ConditionalGPT2LMHeadModel", "total_models": 1, - "total_downloads": 1987, + "total_downloads": 1991, "min_param_count": null, "sample_models": [ - "XiaoduoAILab/Xmodel_LM" + "entropy/roberta_zinc_decoder" ], - "relevancy_score": 17.6 + "relevancy_score": 17.2 }, { - "architecture_id": "ConditionalGPT2LMHeadModel", + "architecture_id": "Qwen35ForCausalLM", "total_models": 1, - "total_downloads": 1951, + "total_downloads": 1971, "min_param_count": null, "sample_models": [ - "entropy/roberta_zinc_decoder" + "JeffGreen311/Eve-V2-Unleashed-Qwen3.5-8B-Liberated-4K-4B-Merged" ], - "relevancy_score": 17.6 + "relevancy_score": 17.2 }, { "architecture_id": "DebertaV2ForCausalLM", "total_models": 1, - "total_downloads": 1928, + "total_downloads": 1960, "min_param_count": null, "sample_models": [ "ltg/deberta-xxlarge-fixed" ], - "relevancy_score": 17.6 + "relevancy_score": 17.2 }, { - "architecture_id": "BTLMLMHeadModel", - "total_models": 2, - "total_downloads": 1403, + "architecture_id": 
"SpectusForConditionalGeneration", + "total_models": 1, + "total_downloads": 1945, "min_param_count": null, "sample_models": [ - "cerebras/btlm-3b-8k-base", - "EleutherAI/Hermes-btlm-3b-8k" + "MS-ML/SpecTUS_pretrained_only" ], - "relevancy_score": 17.5 + "relevancy_score": 17.2 }, { "architecture_id": "TelechatForCausalLM", "total_models": 2, - "total_downloads": 1363, + "total_downloads": 1426, "min_param_count": null, "sample_models": [ "Tele-AI/telechat-7B", "Tele-AI/TeleChat-12B" ], - "relevancy_score": 17.5 - }, - { - "architecture_id": "SpectusForConditionalGeneration", - "total_models": 1, - "total_downloads": 1908, - "min_param_count": null, - "sample_models": [ - "MS-ML/SpecTUS_pretrained_only" - ], - "relevancy_score": 17.5 + "relevancy_score": 17.1 }, { - "architecture_id": "GPTXForCausalLM", - "total_models": 1, - "total_downloads": 1871, + "architecture_id": "BTLMLMHeadModel", + "total_models": 2, + "total_downloads": 1437, "min_param_count": null, "sample_models": [ - "AxiomicLabs/GPT-X-125m-15bt" + "cerebras/btlm-3b-8k-base", + "EleutherAI/Hermes-btlm-3b-8k" ], - "relevancy_score": 17.5 + "relevancy_score": 17.1 }, { "architecture_id": "LSGBartForConditionalGeneration", "total_models": 1, - "total_downloads": 1843, + "total_downloads": 1871, "min_param_count": null, "sample_models": [ "morenolq/LEGIT-BART-LSG-4096" ], - "relevancy_score": 17.5 + "relevancy_score": 17.1 }, { "architecture_id": "CloverLMForCausalLM", "total_models": 1, - "total_downloads": 1822, + "total_downloads": 1833, "min_param_count": null, "sample_models": [ "daslab-testing/CloverLM" ], - "relevancy_score": 17.4 - }, - { - "architecture_id": "MiniMaxText01ForCausalLM", - "total_models": 1, - "total_downloads": 1682, - "min_param_count": null, - "sample_models": [ - "MiniMaxAI/MiniMax-Text-01" - ], - "relevancy_score": 17.3 + "relevancy_score": 17.0 }, { "architecture_id": "LlavaCrystalForCausalLM", "total_models": 1, - "total_downloads": 1586, + "total_downloads": 1614, 
"min_param_count": null, "sample_models": [ "LLM360/CrystalChat-7B-Web2Code" ], - "relevancy_score": 17.1 + "relevancy_score": 16.8 }, { - "architecture_id": "MobileLLMForCausalLM", + "architecture_id": "InternLM2ForRewardModel", "total_models": 1, - "total_downloads": 1585, + "total_downloads": 1562, "min_param_count": null, "sample_models": [ - "facebook/MobileLLM-125M" + "internlm/internlm2_5-step-prover-critic" ], - "relevancy_score": 17.1 + "relevancy_score": 16.7 }, { "architecture_id": "MobilintEagle3Qwen2ForCausalLM", "total_models": 1, - "total_downloads": 1541, + "total_downloads": 1543, "min_param_count": null, "sample_models": [ "mobilint/EAGLE3-JPharmatron-7B" ], - "relevancy_score": 17.1 + "relevancy_score": 16.7 }, { - "architecture_id": "InternLM2ForRewardModel", - "total_models": 1, - "total_downloads": 1527, + "architecture_id": "MPTForCausalLM", + "total_models": 2, + "total_downloads": 1115, "min_param_count": null, "sample_models": [ - "internlm/internlm2_5-step-prover-critic" + "hyungtae/mpt-30b", + "manojpreveen/mpt-30b-v5" ], - "relevancy_score": 17.0 + "relevancy_score": 16.6 }, { - "architecture_id": "Qwen35ForCausalLM", + "architecture_id": "MobileLLMForCausalLM", "total_models": 1, - "total_downloads": 1512, + "total_downloads": 1522, "min_param_count": null, "sample_models": [ - "JeffGreen311/Eve-V2-Unleashed-Qwen3.5-8B-Liberated-4K-4B-Merged" + "facebook/MobileLLM-125M" ], - "relevancy_score": 17.0 + "relevancy_score": 16.6 }, { "architecture_id": "GeoChatLlamaForCausalLM", "total_models": 1, - "total_downloads": 1454, + "total_downloads": 1416, "min_param_count": null, "sample_models": [ "MBZUAI/geochat-7B" ], - "relevancy_score": 16.9 + "relevancy_score": 16.5 }, { - "architecture_id": "MochivaForCausalLM", + "architecture_id": "Qwen3VLMoeForConditionalGeneration", "total_models": 1, - "total_downloads": 1438, - "min_param_count": null, + "total_downloads": 1385, + "min_param_count": 31070754032, "sample_models": [ - 
"Mochiva-team/Mochiva-model" + "Oysiyl/qwen3-vl-30b-a3b-unslop-good-lora-v1" ], - "relevancy_score": 16.9 + "relevancy_score": 16.4 }, { "architecture_id": "HeliumForCausalLM", "total_models": 1, - "total_downloads": 1363, + "total_downloads": 1309, "min_param_count": null, "sample_models": [ "kyutai/helium-1-preview-2b" ], - "relevancy_score": 16.8 + "relevancy_score": 16.3 }, { "architecture_id": "JiRackTernaryModel", "total_models": 1, - "total_downloads": 1289, + "total_downloads": 1292, "min_param_count": null, "sample_models": [ "kgrabko/JiRackTernary_70b" ], - "relevancy_score": 16.7 + "relevancy_score": 16.3 + }, + { + "architecture_id": "Papagan", + "total_models": 1, + "total_downloads": 1216, + "min_param_count": null, + "sample_models": [ + "SutskeverFanBoy/papagan_1.3b" + ], + "relevancy_score": 16.2 }, { "architecture_id": "PolyLMHeadModel", "total_models": 1, - "total_downloads": 1195, + "total_downloads": 1129, "min_param_count": null, "sample_models": [ "DAMO-NLP-MT/polylm-13b" ], - "relevancy_score": 16.5 + "relevancy_score": 16.0 }, { "architecture_id": "CambrianLlamaForCausalLM", "total_models": 1, - "total_downloads": 1073, + "total_downloads": 1124, "min_param_count": null, "sample_models": [ "nyu-visionx/cambrian-8b" ], - "relevancy_score": 16.3 + "relevancy_score": 16.0 }, { - "architecture_id": "LlamaModel", + "architecture_id": "ErnieForCausalLM", "total_models": 1, - "total_downloads": 1034, - "min_param_count": 33930165248, + "total_downloads": 1018, + "min_param_count": null, "sample_models": [ - "ngoan/NgoanYi" + "mohitsha/tiny-ernie-random-remote-code" ], - "relevancy_score": 16.2 + "relevancy_score": 15.8 }, { - "architecture_id": "TransnormerForCausalLM", + "architecture_id": "Qwen3_5MoeForCausalLM", "total_models": 1, - "total_downloads": 1030, - "min_param_count": null, + "total_downloads": 1000, + "min_param_count": 122111526912, "sample_models": [ - "OpenNLPLab/TransNormerLLM-385M" + "wangzhang/Qwen3.5-122B-A10B-abliterix" ], - 
"relevancy_score": 16.2 + "relevancy_score": 15.7 }, { - "architecture_id": "Qwen3VLMoeForConditionalGeneration", + "architecture_id": "XMistralForCausalLM", "total_models": 1, - "total_downloads": 997, - "min_param_count": 31070754032, + "total_downloads": 984, + "min_param_count": null, "sample_models": [ - "Oysiyl/qwen3-vl-30b-a3b-unslop-good-lora-v1" + "Hannibal046/xrag-7b" ], - "relevancy_score": 16.1 + "relevancy_score": 15.7 }, { - "architecture_id": "KimiK25ForConditionalGeneration", + "architecture_id": "TransnormerForCausalLM", "total_models": 1, - "total_downloads": 988, - "min_param_count": 91383180528, + "total_downloads": 973, + "min_param_count": null, "sample_models": [ - "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B" + "OpenNLPLab/TransNormerLLM-385M" ], - "relevancy_score": 16.1 + "relevancy_score": 15.7 }, { - "architecture_id": "ErnieForCausalLM", + "architecture_id": "YiForCausalLM", "total_models": 1, - "total_downloads": 982, + "total_downloads": 955, "min_param_count": null, "sample_models": [ - "mohitsha/tiny-ernie-random-remote-code" + "llmware/dragon-yi-6b-v0" ], - "relevancy_score": 16.1 + "relevancy_score": 15.6 }, { - "architecture_id": "ShikraLlamaForCausalLM", + "architecture_id": "SOVYN85M", "total_models": 1, - "total_downloads": 950, + "total_downloads": 949, "min_param_count": null, "sample_models": [ - "shikras/shikra-7b-delta-v1" + "SOVYN/SOVYN-85M" ], - "relevancy_score": 16.0 + "relevancy_score": 15.6 }, { - "architecture_id": "YiForCausalLM", + "architecture_id": "LlamaModel", + "total_models": 1, + "total_downloads": 948, + "min_param_count": 33930165248, + "sample_models": [ + "ngoan/NgoanYi" + ], + "relevancy_score": 15.6 + }, + { + "architecture_id": "ShikraLlamaForCausalLM", "total_models": 1, - "total_downloads": 939, + "total_downloads": 928, "min_param_count": null, "sample_models": [ - "llmware/dragon-yi-6b-v0" + "shikras/shikra-7b-delta-v1" ], - "relevancy_score": 16.0 + "relevancy_score": 15.6 }, { "architecture_id": 
"CpmBeeForCausalLM", "total_models": 1, - "total_downloads": 895, + "total_downloads": 911, "min_param_count": null, "sample_models": [ "openbmb/cpm-bee-10b" ], - "relevancy_score": 15.8 + "relevancy_score": 15.5 }, { "architecture_id": "ZsGPT2LMHeadModel", "total_models": 1, - "total_downloads": 882, + "total_downloads": 902, "min_param_count": null, "sample_models": [ "claritylab/zero-shot-vanilla-gpt2" ], - "relevancy_score": 15.8 + "relevancy_score": 15.5 }, { "architecture_id": "HumanGPTForCausalLM", "total_models": 1, - "total_downloads": 868, + "total_downloads": 876, "min_param_count": null, "sample_models": [ "YaoFeng/CHATPOSE-V0" ], - "relevancy_score": 15.8 + "relevancy_score": 15.4 }, { "architecture_id": "Phi4FlashForCausalLM", "total_models": 1, - "total_downloads": 809, + "total_downloads": 839, "min_param_count": null, "sample_models": [ "microsoft/Phi-4-mini-flash-reasoning" ], - "relevancy_score": 15.6 + "relevancy_score": 15.3 }, { - "architecture_id": "XMistralForCausalLM", + "architecture_id": "KimiK25ForConditionalGeneration", "total_models": 1, - "total_downloads": 796, - "min_param_count": null, + "total_downloads": 824, + "min_param_count": 91383180528, "sample_models": [ - "Hannibal046/xrag-7b" + "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B" ], - "relevancy_score": 15.6 + "relevancy_score": 15.3 }, { "architecture_id": "FlamingoForCausalLM", "total_models": 1, - "total_downloads": 791, + "total_downloads": 820, "min_param_count": null, "sample_models": [ "babylm/flamingo-2024" ], - "relevancy_score": 15.6 + "relevancy_score": 15.3 }, { - "architecture_id": "VStreamLlamaForCausalLM", + "architecture_id": "AquilaDenseForCausalLM", "total_models": 1, - "total_downloads": 762, + "total_downloads": 820, "min_param_count": null, "sample_models": [ - "IVGSZ/Flash-VStream-7b" + "BAAI/AquilaDense-7B" ], - "relevancy_score": 15.5 + "relevancy_score": 15.3 }, { - "architecture_id": "AquilaDenseForCausalLM", + "architecture_id": "EmuForCausalLM", 
"total_models": 1, - "total_downloads": 759, + "total_downloads": 795, "min_param_count": null, "sample_models": [ - "BAAI/AquilaDense-7B" + "BAAI/Emu2-Chat" ], - "relevancy_score": 15.5 + "relevancy_score": 15.2 }, { - "architecture_id": "EmuForCausalLM", + "architecture_id": "VStreamLlamaForCausalLM", "total_models": 1, - "total_downloads": 747, + "total_downloads": 780, "min_param_count": null, "sample_models": [ - "BAAI/Emu2-Chat" + "IVGSZ/Flash-VStream-7b" ], - "relevancy_score": 15.4 + "relevancy_score": 15.2 }, { "architecture_id": "MoELLaVAQWenForCausalLM", "total_models": 1, - "total_downloads": 728, + "total_downloads": 729, "min_param_count": null, "sample_models": [ "LanguageBind/MoE-LLaVA-Qwen-1.8B-4e" ], - "relevancy_score": 15.4 + "relevancy_score": 15.0 }, { "architecture_id": "YayiForCausalLM", "total_models": 1, - "total_downloads": 713, + "total_downloads": 724, "min_param_count": null, "sample_models": [ "wenge-research/yayi2-30b" ], - "relevancy_score": 15.3 + "relevancy_score": 15.0 + }, + { + "architecture_id": "STLlamaForCausalLM", + "total_models": 1, + "total_downloads": 723, + "min_param_count": null, + "sample_models": [ + "bjdwh/UrbanGPT" + ], + "relevancy_score": 15.0 }, { "architecture_id": "SkyworkForCausalLM", @@ -4440,57 +4563,67 @@ "sample_models": [ "Skywork/Skywork-13B-base" ], - "relevancy_score": 15.3 + "relevancy_score": 14.9 }, { "architecture_id": "MobiLlamaForCausalLM", "total_models": 1, - "total_downloads": 661, + "total_downloads": 667, "min_param_count": null, "sample_models": [ "MBZUAI/MobiLlama-05B" ], - "relevancy_score": 15.2 + "relevancy_score": 14.8 }, { - "architecture_id": "HebrewGPTForCausalLM", + "architecture_id": "JapaneseStableLMAlphaForCausalLM", "total_models": 1, - "total_downloads": 643, + "total_downloads": 656, "min_param_count": null, "sample_models": [ - "Slasky/HebrewGPT-1B" + "stabilityai/japanese-stablelm-base-alpha-7b" ], - "relevancy_score": 15.1 + "relevancy_score": 14.8 }, { 
"architecture_id": "GPTBigCodeLMHeadModel", "total_models": 1, - "total_downloads": 638, + "total_downloads": 654, "min_param_count": null, "sample_models": [ "bigcode/santacoderpack" ], - "relevancy_score": 15.1 + "relevancy_score": 14.8 + }, + { + "architecture_id": "SDARMoeForCausalLM", + "total_models": 1, + "total_downloads": 653, + "min_param_count": 30532122624, + "sample_models": [ + "JetLM/SDAR-30B-A3B-Chat-b32" + ], + "relevancy_score": 14.8 }, { "architecture_id": "GPTJiangForCausalLM", "total_models": 1, - "total_downloads": 627, + "total_downloads": 650, "min_param_count": null, "sample_models": [ "kdf/jiang-base" ], - "relevancy_score": 15.1 + "relevancy_score": 14.8 }, { - "architecture_id": "JapaneseStableLMAlphaForCausalLM", + "architecture_id": "HebrewGPTForCausalLM", "total_models": 1, - "total_downloads": 627, + "total_downloads": 646, "min_param_count": null, "sample_models": [ - "stabilityai/japanese-stablelm-base-alpha-7b" + "Slasky/HebrewGPT-1B" ], - "relevancy_score": 15.1 + "relevancy_score": 14.8 }, { "architecture_id": "BunnyQwenForCausalLM", @@ -4500,167 +4633,167 @@ "sample_models": [ "dphn/dolphin-vision-72b" ], - "relevancy_score": 15.0 + "relevancy_score": 14.7 }, { - "architecture_id": "SDARMoeForCausalLM", + "architecture_id": "GrokForCausalLM", "total_models": 1, "total_downloads": 619, - "min_param_count": 30532122624, + "min_param_count": null, "sample_models": [ - "JetLM/SDAR-30B-A3B-Chat-b32" + "keyfan/grok-1-hf" ], - "relevancy_score": 15.0 + "relevancy_score": 14.7 }, { - "architecture_id": "STLlamaForCausalLM", + "architecture_id": "LongcatFlashNgramForCausalLM", "total_models": 1, - "total_downloads": 617, + "total_downloads": 615, "min_param_count": null, "sample_models": [ - "bjdwh/UrbanGPT" + "meituan-longcat/LongCat-Flash-Lite" ], - "relevancy_score": 15.0 + "relevancy_score": 14.7 }, { - "architecture_id": "GrokForCausalLM", + "architecture_id": "LingoWhaleForCausalLM", "total_models": 1, - "total_downloads": 606, + 
"total_downloads": 595, "min_param_count": null, "sample_models": [ - "keyfan/grok-1-hf" + "deeplang-ai/LingoWhale-8B" ], - "relevancy_score": 15.0 + "relevancy_score": 14.6 }, { "architecture_id": "Llama2ForCausalLM", "total_models": 1, - "total_downloads": 590, + "total_downloads": 592, "min_param_count": null, "sample_models": [ "llmware/dragon-llama-7b-v0" ], - "relevancy_score": 14.9 + "relevancy_score": 14.6 }, { "architecture_id": "MPLUGOwl2LlamaForCausalLM", "total_models": 1, - "total_downloads": 589, + "total_downloads": 592, "min_param_count": null, "sample_models": [ "q-future/q-align-quality" ], - "relevancy_score": 14.9 + "relevancy_score": 14.6 }, { "architecture_id": "GLaMMForCausalLM", "total_models": 1, - "total_downloads": 587, + "total_downloads": 585, "min_param_count": null, "sample_models": [ "MBZUAI/GLaMM-FullScope" ], - "relevancy_score": 14.9 + "relevancy_score": 14.6 }, { - "architecture_id": "LingoWhaleForCausalLM", + "architecture_id": "OLMoModelForCausalLM", "total_models": 1, - "total_downloads": 583, + "total_downloads": 585, "min_param_count": null, "sample_models": [ - "deeplang-ai/LingoWhale-8B" + "NousResearch/OLMo-Bitnet-1B" ], - "relevancy_score": 14.9 + "relevancy_score": 14.6 }, { "architecture_id": "OpenBAForConditionalGeneration", "total_models": 1, - "total_downloads": 581, + "total_downloads": 576, "min_param_count": null, "sample_models": [ "OpenNLG/OpenBA-V1-Based" ], - "relevancy_score": 14.9 + "relevancy_score": 14.5 }, { - "architecture_id": "OLMoModelForCausalLM", + "architecture_id": "GPTJXForCausalLM", "total_models": 1, - "total_downloads": 573, + "total_downloads": 574, "min_param_count": null, "sample_models": [ - "NousResearch/OLMo-Bitnet-1B" + "KnutJaegersberg/GPT-JX-3b" ], - "relevancy_score": 14.9 + "relevancy_score": 14.5 }, { - "architecture_id": "GPTJXForCausalLM", + "architecture_id": "LlavaStableLMEpochForCausalLM", "total_models": 1, - "total_downloads": 566, + "total_downloads": 540, 
"min_param_count": null, "sample_models": [ - "KnutJaegersberg/GPT-JX-3b" + "NousResearch/Obsidian-3B-V0.5" ], - "relevancy_score": 14.8 + "relevancy_score": 14.4 }, { - "architecture_id": "Qwen3_5MoeForCausalLM", + "architecture_id": "AprielHForCausalLM", "total_models": 1, - "total_downloads": 562, - "min_param_count": 122111526912, + "total_downloads": 538, + "min_param_count": null, "sample_models": [ - "wangzhang/Qwen3.5-122B-A10B-abliterix" + "ServiceNow-AI/Apriel-H1-15b-Thinker-SFT" ], - "relevancy_score": 14.8 + "relevancy_score": 14.4 }, { - "architecture_id": "LlavaStableLMEpochForCausalLM", + "architecture_id": "CacaForCausalLM", "total_models": 1, - "total_downloads": 547, + "total_downloads": 530, "min_param_count": null, "sample_models": [ - "NousResearch/Obsidian-3B-V0.5" + "Lyon28/caca-1B-untrained" ], - "relevancy_score": 14.7 + "relevancy_score": 14.3 }, { - "architecture_id": "VSMForCausalLM", + "architecture_id": "M2M100ForConditionalGeneration", "total_models": 1, - "total_downloads": 544, + "total_downloads": 529, "min_param_count": null, "sample_models": [ - "craigwu/seal_vsm_7b" + "dsfsi/nso-en-m2m100-gov" ], - "relevancy_score": 14.7 + "relevancy_score": 14.3 }, { - "architecture_id": "LlavaSearchLlamaForCausalLM", + "architecture_id": "HgrnForCausalLM", "total_models": 1, - "total_downloads": 543, + "total_downloads": 526, "min_param_count": null, "sample_models": [ - "craigwu/seal_vqa_7b" + "OpenNLPLab/HGRN-150M" ], - "relevancy_score": 14.7 + "relevancy_score": 14.3 }, { - "architecture_id": "AprielHForCausalLM", + "architecture_id": "LlavaSearchLlamaForCausalLM", "total_models": 1, - "total_downloads": 530, + "total_downloads": 524, "min_param_count": null, "sample_models": [ - "ServiceNow-AI/Apriel-H1-15b-Thinker-SFT" + "craigwu/seal_vqa_7b" ], - "relevancy_score": 14.7 + "relevancy_score": 14.3 }, { - "architecture_id": "LlavaMistralForCausalLM", + "architecture_id": "SeerAttnQwen3ForCausalLM", "total_models": 1, - "total_downloads": 
527, + "total_downloads": 523, "min_param_count": null, "sample_models": [ - "NousResearch/Nous-Hermes-2-Vision-Alpha" + "jiwonsong/SeerAttention-Qwen3-8B-AttnGates" ], - "relevancy_score": 14.7 + "relevancy_score": 14.3 }, { "architecture_id": "MedHemoModel", @@ -4670,27 +4803,27 @@ "sample_models": [ "amewebstudio/medhemo-earcp" ], - "relevancy_score": 14.6 + "relevancy_score": 14.3 }, { - "architecture_id": "HgrnForCausalLM", + "architecture_id": "VSMForCausalLM", "total_models": 1, - "total_downloads": 513, + "total_downloads": 521, "min_param_count": null, "sample_models": [ - "OpenNLPLab/HGRN-150M" + "craigwu/seal_vsm_7b" ], - "relevancy_score": 14.6 + "relevancy_score": 14.3 }, { - "architecture_id": "M2M100ForConditionalGeneration", + "architecture_id": "LlavaMistralForCausalLM", "total_models": 1, - "total_downloads": 501, + "total_downloads": 510, "min_param_count": null, "sample_models": [ - "dsfsi/nso-en-m2m100-gov" + "NousResearch/Nous-Hermes-2-Vision-Alpha" ], - "relevancy_score": 14.6 + "relevancy_score": 14.3 } ] } \ No newline at end of file diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index c037dedfa..9c9f8a24f 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -1,22 +1,22 @@ { - "generated_at": "2026-04-10", + "generated_at": "2026-04-14", "scan_info": { - "total_scanned": 5436, + "total_scanned": 5633, "task_filter": "text-generation", "min_downloads": 500, - "scan_duration_seconds": 3.9 + "scan_duration_seconds": 4.2 }, - "total_architectures": 43, - "total_models": 7006, - "total_verified": 704, + "total_architectures": 47, + "total_models": 7426, + "total_verified": 706, "models": [ { "architecture_id": "Qwen3NextForCausalLM", "model_id": "Qwen/Qwen3-Next-80B-A3B-Instruct", "status": 2, - "verified_date": "2026-04-10", + "verified_date": 
"2026-04-15", "metadata": null, - "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "note": "Estimated 708.8 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -28,9 +28,9 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "unsloth/Qwen3-Coder-Next", "status": 2, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": null, - "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "note": "Estimated 708.8 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -42,9 +42,9 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "Qwen/Qwen3-Next-80B-A3B-Thinking", "status": 2, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": null, - "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "note": "Estimated 708.8 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -56,7 +56,7 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "tiny-random/qwen3-next-moe", "status": 1, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": null, "note": "Full verification completed", "phase1_score": 100.0, @@ -70,7 +70,7 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", "status": 1, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": null, "note": "Full verification completed", "phase1_score": 100.0, @@ -84,7 +84,7 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "yujiepan/qwen3-next-moe-tiny-random", "status": 1, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": null, "note": "Full verification completed", "phase1_score": 100.0, @@ -98,9 +98,9 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "huihui-ai/Huihui-Qwen3-Coder-Next-abliterated", "status": 2, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": 
null, - "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "note": "Estimated 708.8 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -112,9 +112,9 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "Qwen/Qwen3-Coder-Next-Base", "status": 2, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": null, - "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "note": "Estimated 708.8 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -126,9 +126,9 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "bknyaz/Qwen3-Coder-Next-REAM", "status": 2, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": null, - "note": "Estimated 5201.5 GB exceeds 96.0 GB limit", + "note": "Estimated 535.9 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -140,7 +140,7 @@ "architecture_id": "Qwen3NextForCausalLM", "model_id": "Qwen/Qwen3-Coder-Next", "status": 2, - "verified_date": "2026-04-10", + "verified_date": "2026-04-15", "metadata": { "downloads": 664116, "likes": 0, @@ -153,7 +153,7 @@ ], "parameter_count": 79674391296 }, - "note": "Estimated 6929.6 GB exceeds 96.0 GB limit", + "note": "Estimated 708.8 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -1999,9 +1999,9 @@ "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-h-small", "status": 2, - "verified_date": "2026-03-17", + "verified_date": "2026-04-15", "metadata": null, - "note": "Estimated 135.9 GB exceeds 75.2 GB limit", + "note": "Estimated 270.8 GB exceeds 96.0 GB limit", "phase1_score": null, "phase2_score": null, "phase3_score": null, @@ -2362,14 +2362,14 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-micro-base", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-04-15", 
"metadata": null, - "note": null, - "phase1_score": null, - "phase2_score": null, - "phase3_score": null, - "phase4_score": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 88.7, "phase7_score": null, "phase8_score": null }, @@ -2573,13 +2573,13 @@ "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-micro", "status": 1, - "verified_date": "2026-03-17", + "verified_date": "2026-04-15", "metadata": null, - "note": "Core verification completed", + "note": "Full verification completed", "phase1_score": 100.0, - "phase2_score": null, - "phase3_score": null, - "phase4_score": 72.2, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 98.9, "phase7_score": null, "phase8_score": null }, @@ -3342,14 +3342,14 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-h-tiny", - "status": 1, - "verified_date": "2026-03-17", + "status": 3, + "verified_date": "2026-04-15", "metadata": null, - "note": "Core verification completed", - "phase1_score": 100.0, - "phase2_score": null, - "phase3_score": null, - "phase4_score": 77.5, + "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)", + "phase1_score": 50.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 96.6, "phase7_score": null, "phase8_score": null }, @@ -4140,13 +4140,13 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-h-micro", - "status": 1, - "verified_date": "2026-03-17", + "status": 3, + "verified_date": "2026-04-15", "metadata": null, - "note": "Core verification completed", - "phase1_score": 100.0, - "phase2_score": null, - "phase3_score": null, + "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)", + "phase1_score": 50.0, + "phase2_score": 100.0, + 
"phase3_score": 100.0, "phase4_score": 98.2, "phase7_score": null, "phase8_score": null @@ -4182,14 +4182,14 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-tiny-preview", - "status": 1, - "verified_date": "2026-03-17", + "status": 3, + "verified_date": "2026-04-15", "metadata": null, - "note": "Core verification completed", - "phase1_score": 100.0, - "phase2_score": null, - "phase3_score": null, - "phase4_score": 97.4, + "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)", + "phase1_score": 50.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 98.7, "phase7_score": null, "phase8_score": null }, @@ -4280,14 +4280,14 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-350m", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-04-15", "metadata": null, - "note": null, - "phase1_score": null, - "phase2_score": null, - "phase3_score": null, - "phase4_score": null, + "note": "Full verification completed with issues: P2=91.7% (failed: generation)", + "phase1_score": 100.0, + "phase2_score": 91.7, + "phase3_score": 100.0, + "phase4_score": 94.7, "phase7_score": null, "phase8_score": null }, @@ -5344,14 +5344,14 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-h-1b", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-04-15", "metadata": null, - "note": null, - "phase1_score": null, - "phase2_score": null, - "phase3_score": null, - "phase4_score": null, + "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)", + "phase1_score": 50.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 72.2, "phase7_score": null, "phase8_score": null }, @@ -5540,14 +5540,14 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": 
"ibm-granite/granite-4.0-h-350m", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-04-15", "metadata": null, - "note": null, - "phase1_score": null, - "phase2_score": null, - "phase3_score": null, - "phase4_score": null, + "note": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 56/243 components failed (56 critical)", + "phase1_score": 50.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 94.8, "phase7_score": null, "phase8_score": null }, @@ -5946,14 +5946,14 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-1b", - "status": 0, - "verified_date": null, + "status": 1, + "verified_date": "2026-04-15", "metadata": null, - "note": null, - "phase1_score": null, - "phase2_score": null, - "phase3_score": null, - "phase4_score": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 100.0, "phase7_score": null, "phase8_score": null }, @@ -8186,11 +8186,11 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "onnx-community/granite-4.0-350m-ONNX-web", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-04-14", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: onnx-community/granite-4.0-350m-ONNX-web does not appear to have a file named pytorch_model.bin or model", + "phase1_score": 0.0, "phase2_score": null, "phase3_score": null, "phase4_score": null, @@ -8746,11 +8746,11 @@ { "architecture_id": "GraniteMoeHybridForCausalLM", "model_id": "ibm-granite/granite-4.0-350m-base", - "status": 0, - "verified_date": null, + "status": 3, + "verified_date": "2026-04-14", "metadata": null, - "note": null, - "phase1_score": null, + "note": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) 
\u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'in_proj'", + "phase1_score": 0.0, "phase2_score": null, "phase3_score": null, "phase4_score": null, @@ -99602,6 +99602,4934 @@ "phase4_score": null, "phase7_score": null, "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "contextboxai/Qwen3-1.7B-FC", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "tomg-group-umd/DynaGuard-1.7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "principled-intelligence/Qwen3.5-9B-text-only", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "kai-os/Carnice-9b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "activeDap/Qwen3-1.7B_hh_harmful", + 
"status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "GoodStartLabs/gin-rummy-hbc-qwen3.5-0.8b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "bigatuna/Qwen3-1.7B-Sushi-Coder", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "prefeitura-rio/Rio-3.0-Open-Mini", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "pthinc/Cicikus_v4_0.3B_Pitircik", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "joekarim/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-foxy_peckish_pigeon", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_11_13_31", + "status": 0, + "verified_date": null, + 
"metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_11_13_41", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "canoplos/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-soft_gilded_alligator", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_11_13_52", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "aifeifei798/Darkidol-Ballad-27B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "justindal/llama3.1-8b-leetcoder", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-4b-code-forged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "numnum1/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-reclusive_mangy_zebra", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "brocchirodrigo/anotaai-ajuda-qwen3_5_Q4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "VladShash/deepseek-math-7b-lean-prover-dpo-olmo-3", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "sandbagging-games/cedar", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "evolai/evolai_qwen_9B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": 
null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_10_07_50", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_10_07_53", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_10_07_47", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "RMCian/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-fast_rabid_ram", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "Phonsiri/Qwen3.5-9B-Thai-Law-Base", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "helly777/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-pudgy_dormant_salmon", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": 
null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "lukey03/Qwen3.5-9B-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Asib1/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-pensive_leggy_ant", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "aifeifei798/Darkidol-Ballad-9B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "heyalexchoi/qwen3-1.7b-math-grpo", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Loty1/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-rugged_trotting_puffin", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "aifeifei798/Darkidol-Catgirl-9B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + 
"phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-27b-code-forged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "justindal/llama3.1-8b-instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "0xsage/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-foxy_slender_slug", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_12_13_14", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "ricdomolm/mini-coder-1.7b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_12_13_17", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + 
"phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Naphula/Cthulhu-70B-v1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "small-models-for-glam/Qwen3.5-0.8B-SFT-name-parser-yaml", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-4b-code-128k-forged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "heisengert/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-stalking_polished_seahorse", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "wangzhang/Qwen3.5-27B-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-27b-code-forged-defragged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 
null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "squ11z1/claude-oss", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-4b-code-forged-defragged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "bungamawar/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-dense_alert_turkey", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MixtralForCausalLM", + "model_id": "continuum-ai/mixtral-8x7b-instruct-compacted-conservative", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "kai-os/Carnice-27b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "alamios/Mistral-Small-3.1-DRAFT-0.5B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + 
"phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "small-models-for-glam/Qwen3.5-2B-SFT-name-parser-yaml", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "principled-intelligence/Qwen3.5-2B-text-only", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "TommyChien/memorag-qwen2-7b-inst", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "stratosphere/qwen2.5-1.5b-slips-immune-summarization", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Hiroshi19781111/ichiyanagi-qwen-14b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "ragav4075/room_service_action_gemma", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null 
+ }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "vrutkovs/Lusterka-7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "RefalMachine/RuadaptQwen2.5-32B-Pro-Beta", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "afroneko/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-smooth_patterned_tortoise", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "Harsh2026verma/code-generator-model", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "cs1090b/hw5-part3-sft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "XSCP/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-endangered_lively_eel", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + 
"architecture_id": "Qwen3_5ForCausalLM", + "model_id": "PWLabs/Damork", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "h34v7/Qwanko3.5-27B-V2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Hippocrene/MiniLLM-0.1B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "yosef-samy019/gpt-face-celeb-generator", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "kenny2021/episodic-lora-grpo2-merged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "heyalexchoi/qwen3-1.7b-math-sft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MambaForCausalLM", + "model_id": "batteryphil/mamba-2.8b-latent", + "status": 0, + 
"verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Hyeongwon/P2-split2_prob_rg_Qwen3-4B-Base", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "AtaaJL/MediBot_Final", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "haedahae/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-hoarse_hairy_lion", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "principled-intelligence/Qwen3.5-0.8B-text-only", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "HCY123902/mistral-7b-inst-dpo-on-p-tw7-beta-1e-0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "empero-ai/Qwen3.5-9B-Claude-Code", + "status": 0, + "verified_date": null, 
+ "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "W-61/llama-3-8b-base-sft-ultrachat-8xh200", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "baddddddddd/llama-85m-unigram-16k", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "LorenaYannnnn/bold_formatting-Qwen3-0.6B-OURS_self-seed_0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Dominic/smollm135_fullprec_tinystories", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "cs1090b/hw5-part2-domain-adapted", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Priyangshu-2003/MediBridge-II-Medical-8B-1706-FineTuned", + "status": 0, + "verified_date": null, + "metadata": null, + 
"note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "noobmaster6009/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-deadly_sturdy_parrot", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "123aloo123/BitNet-GPT2-125M-Ternary", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "LorenaYannnnn/bold_formatting-Qwen3-0.6B-baseline_all_tokens-seed_0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "empero-ai/Qwen3.5-9B-Claude-Opus-4.6-Distill", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-9b-general-forged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "ericflo/Llama-3.1-8B-ContinuedTraining2-FFT", + "status": 0, + "verified_date": 
null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Writer/palmyra-mini", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "Verdugie/STEM-Oracle-27B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "StentorLabs/Portimbria-150M", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "rosebot/signed-model", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "heommi/fintech_2026", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "beyoru/Luna-Ethos", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + 
"phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "ligeng-dev/Q3-8B-131072-sft-1x-20260331_091938", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Phi3ForCausalLM", + "model_id": "huihui-ai/Phi-4-mini-instruct-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "vanshkamra12/CyberSecurity-Model", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "tacodevs/Behemoth-X-R1-123B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "francescofiamingo1/FF_3", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Olmo3ForCausalLM", + "model_id": "VladShash/olmo-3-7b-lean-prover-dpo-olmo", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + 
"architecture_id": "Qwen2ForCausalLM", + "model_id": "amphora/math-custom-data", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Shahansha/Manthan-1.5B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "celestialcreator/axon-smollm2-360m", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "TriviumLabs/lpt-1-full", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-4b-general-forged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-0.8b-general-forged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "unsloth/Qwen2.5-Math-1.5B", + 
"status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "PJMixers-Dev/gemma-3-1b-it-fixed", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "furiosa-ai/Llama-3.1-8B-Instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "nvidia/OpenCodeReasoning-Nemotron-7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "laulauland/Qwen3.5-0.8B-overpass-sft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "taide/Llama3-TAIDE-LX-8B-Chat-Alpha1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "yipengsun/mochi-fish-135m", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "continuum-ai/qwen3.5-2b-general-forged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "mkashifali1/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-arctic_muscular_heron", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "BigRay0x/Qwen3-0.6B-Gensyn-Swarm-moist_dense_mole", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "ApertusForCausalLM", + "model_id": "anicka/karma-electric-apertus-8b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Gensyn/Qwen2.5-1.5B-Instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "kurtpayne/skillscan-detector-v4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + 
"phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_13_15_38", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "josephmayo/Qwen2.5-0.5B-Unfettered", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Neelectric/Llama-3.1-8B-Instruct_SafeGrad_mathv00.03", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GraniteMoeHybridForCausalLM", + "model_id": "onnx-community/granite-4.0-1b-ONNX-web", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "aguitachan/Test-okuru", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "charles22/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-timid_stinky_bat", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + 
"phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Hyeongwon/P2-split2_prob_rg_v2_Qwen3-4B-Base", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "cs1090b/hw5-part1-tiny-gpt", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "meta-llama/CodeLlama-13b-Instruct-hf", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForConditionalGeneration", + "model_id": "eojin1/fine_tune_practice", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "kenpath/qwen3.5-0.8b-stage3-neucodec-sft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "HyzeAI/HyzeMini", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + 
"phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "Shams03/tawkeed-egy-medical-4b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "Andrewstivan/AURA", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Nonamec/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-invisible_playful_cat", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "GPAcc/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-giant_skittish_hamster", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "GoodStartLabs/gin-rummy-hbc-qwen3.5-2b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "anicka/karma-electric-qwen25-7b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + 
{ + "architecture_id": "Qwen2ForCausalLM", + "model_id": "nvidia/OpenCodeReasoning-Nemotron-1.1-7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Sepolian/qwen2.5-0.5B-math", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "kenny2021/episodic-lora-grpo2b-merged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "StableLmForCausalLM", + "model_id": "ragraph-ai/stable-cypher-instruct-3b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForConditionalGeneration", + "model_id": "yunhwa/ai_question", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "UmbrellaInc/Special-Virus-3.2-1B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": 
"airev-ae/Qwen-0.8B-AgentJSON", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "parthbijpuriya/qwen2.5-7b-finetuned-v2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "pvlabs/Chytrej2-90M-Base", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "parallel-reasoner/threadweaver-qwen3-8b-131072-sft8x", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "Karthikappi0011/qwen3.5-indian-tts-data", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "hhuihiu/ADAM-STUDIO-MAX", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "PujaSe/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-raging_grazing_chameleon", + 
"status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "tomvaillant/qwen3-4b-journalist-ONNX", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "Evelyn67/Qwen3.5-2B-Her", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "shabieh2/3370_0412", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Phi3ForCausalLM", + "model_id": "SykoSLM/SykoLLM-V5.9-Mini", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "analist/oute_ewe_16bit", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "imshreyansh/EVX-7B-Instruct-Pro", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, 
+ "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "newgr/qwen2.5-tool-finetuned-v2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "kesavamas/qwen-1.7b-mochi", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "VVen/llama32-1b-lora-sft-lab10-model", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "sstoica12/acquisition_metamath_llama_instruct_3b_math_confidence_500_combined_metamath", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_13_15_39", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "OpenmindAGI/functiongemma-finetuned-g1-multilingual", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": 
null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "unsloth/SmolLM-1.7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "AIMS2025/DeepSignal", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "nosetalgiaULTRA/dummy_model", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "canbingol/gemma3_1B_base-tr-cpt-only_4th_stage_data", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Nodmix/Nodmix-IQ", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "lew96123/Qwen3.5-0.8B-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": 
"Qwen2ForCausalLM", + "model_id": "Gensyn/Qwen2.5-7B-Instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "odats/rl_nmt_2026_04_13_15_40", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "mohda/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-moist_beaked_chameleon", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "staeiou/bartleby-dlo-qwen3.5-2b-base-cpt-sft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "proshantasaha/gemma-3-1b-medical-finetuned", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "UmbrellaInc/PG67A-W-Serum.Test-3.2-1B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + 
"model_id": "wcn123/Qwen3.5-27B-WebNovel-Writer-zh", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "cjiao/OpenThinker3-1.5B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "snapappraise/qwen35-9b-jewelry-v4-modal", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Phi3ForCausalLM", + "model_id": "SykoSLM/SykoLLM-V5.8-Mini", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "SimpleStories/SimpleStories-35M", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "longtermrisk/Qwen2.5-Coder-32B-Instruct-ftjob-5a583bbbe2e8", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "dnotitia/Smoothie-Qwen3-1.7B", + "status": 0, + 
"verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "jason-schulz/Carnice-9b-MLX", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "OrionLLM/Terminus-Qwen3-8b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "GoodStartLabs/gin-rummy-hbc-qwen3.5-4b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "giants2026/GIANTS-4B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "arcee-ai/Meraj-Mini", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "DineshKasi/ai-assistant", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + 
"phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "ND0322/llama-3.1-8B-recipe-gen", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "arcee-ai/Arcee-Spark", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "unsloth/SmolLM2-360M", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Navpy/phi-3.5-AI-Vtuber-json", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Outlier-Ai/Outlier-150B-V3.2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "meshllm/mistral-7b-instruct-v0.3-parity-bf16-mlx", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { 
+ "architecture_id": "Qwen2ForCausalLM", + "model_id": "miketester10/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-tiny_pensive_mandrill", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "HCY123902/qwen25_7b_base_hc_ssts_n32_r1_dpo", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "SeeYangZhi/Llama-3.2-1B-Sarcasm-Rewriter", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma2ForCausalLM", + "model_id": "MBZUAI-Paris/Atlas-Chat-2B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "aariciah/gpt2-persian-dutch-configC-6k", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "parallel-reasoner/threadweaver-qwen3-8b-sft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + 
"architecture_id": "MistralForCausalLM", + "model_id": "quicktensor/blockrank-msmarco-mistral-7b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "GyanAISystems/Gyan-AI-G1-Official", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "pankajmathur/RenCoder-Devstral-Small-2507", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "aariciah/gpt2-russian-dutch-configC-6k", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "cunxin/llama-email-fraud-detector", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": 
"GptOssForCausalLM", + "model_id": "yujiepan/gpt-oss-tiny-random-bf16", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GraniteMoeHybridForCausalLM", + "model_id": "unsloth/granite-4.0-h-micro", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "FritzStack/HiTOP-QWEN4B-mlx-Q4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "haedahae/Qwen3-0.6B-Gensyn-Swarm-horned_prehistoric_orangutan", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "ReadyArt/Omega-Darker-Gaslight_The-Final-Forgotten-Fever-Dream-24B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "Abhijith93/erp-migration-phase1-opus-distilled-qwen3.5-9b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + 
"architecture_id": "Qwen3ForCausalLM", + "model_id": "Goekdeniz-Guelmez/Josiefied-Qwen3-4B-Instruct-2507-gabliterated-v2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "yujiepan/mistral-nemo-2407-tiny-random", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Misha0706/llm-alignment-ppo", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GemmaForCausalLM", + "model_id": "uirev/MLX_unsloth_gemma-2b-it", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Mindie/Qwen3-4b-kss-style-tuning", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "welyty/qwen3-4b-alpaca-chatwithme", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + 
"model_id": "wangzhang/Qwen3.5-4B-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "raalr/Qwen2.5-1.5B-Instruct-MiniLLM-2epochs", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "omrisap/LMMS_RSFT", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "LeroyDyer/SpydazWebAI_QuietStar_Project", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "dominicjyh/bazi", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "Tann-dev/sex-chat-dirty-girlfriend", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "sagorsarker/emailgenerator", + "status": 0, + "verified_date": null, + 
"metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GemmaForCausalLM", + "model_id": "eekay/gemma-2b-it-steer-dog-numbers-ft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "jaygala24/Qwen3-1.7B-ReMax-math-reasoning", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "unsloth/Qwen2.5-Math-7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "se7ensins/Qwen3-0.6B-Gensyn-Swarm-mimic_pensive_scorpion", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "aariciah/gpt2-turkish-dutch-configC-6k", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "xnftraff/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-graceful_dappled_owl", + "status": 0, + "verified_date": null, + "metadata": 
null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "furiosa-ai/Llama-3.3-70B-Instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "kojima-lab/molcrawl-rna-celltype-gpt2-xl", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "nahidstaq/html-section-retriever", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "JunHotate/Qwen3-0.6B-Gensyn-Swarm-lively_bold_viper", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "efworktrial/axiom-content-finetuned", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "yujiepan/mathstral-v0.1-tiny-random", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + 
"phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForConditionalGeneration", + "model_id": "neo4j/text-to-cypher-Gemma-3-27B-Instruct-2025.04.0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "zai-org/BPO", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "StableLmForCausalLM", + "model_id": "yujiepan/stablelm-2-tiny-random", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "furiosa-ai/Qwen2.5-0.5B-Instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "sci4ai/Qwen2.5-14B-Instruct-Abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Qwen/Qwen1.5-110B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + 
"phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "aaryanpethkar483/mindful-ai", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "rajendrakumar78/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-nimble_marine_raccoon", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "round-bird/georgia-sports-llama3-v1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "RyotaroOKabe/ceq_simple_dgpt_v1.4", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "jsl5710/Shield-Gemma-3-1B-Full-FT-CE", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "shajedurrashid87/jarvis-2-0-8b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null 
+ }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "gplsi/Aitana-7B-S-base-1.0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Roc-M/M-project", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "SaketR1/st2-generic-prompt-rlhf", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Olmo2ForCausalLM", + "model_id": "sbordt/OLMo-2-179M-Exp-Mid", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "alpha-ai/Medical-Diagnosis-COT-Gemma3-270M", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "dongguanting/Qwen2.5-7B-ARPO", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "CraneAILabs/ganda-gemma-fln-bridge", + 
"status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "aariciah/gpt2-urdu-dutch-configC-6k", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "OnurDemircioglu/OmniGPT-355M", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Osman12Hector/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-armored_barky_platypus", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "dphn/Dolphin3.0-Mistral-24B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "OpenLLM-France/Lucie-7B-Instruct-v1.1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "luckycanucky/NeuralDaredevil-Toxic-32-64-2e", + "status": 0, + "verified_date": 
null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "continuedev/instinct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "sezaii/Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-melodic_tropical_beaver", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "cropinailab/aksara_v1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "leonard-milo/Qwen3.5-2B-SFT-AutoConv-InstagramChat-Smart", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "EthioNLP/Amharic-llama-base-model", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "stellalisy/rethink_rlvr_reproduce-ground_truth-qwen2.5_math_7b-lr5e-7-kl0.00-step150", + "status": 0, + 
"verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "Nahush2631/qa2-gpt2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "yufeng1/OpenThinker-7B-type6-e5-max-alpha0_25-textsummarization-type6-e1-alpha0_25-2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "analist/spark_ewe_450_16bit", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "daryl149/llama-2-13b-chat-hf", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "FAHAB/Qwen2.5-1.5B-Instruct-Gensyn-Swarm-hoarse_wily_sardine", + "status": 0, + "verified_date": 
null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "an9383/codeparrot-small", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "nill-123/TinyLlama-1.1B-Chat-v1.0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "michael-chan-000/le-41", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "chenyongxi/Qwen2.5-1.5B-SFT-IP", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "li-muyang/zephyr-7b-gemma-dpo", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "hsefz-ChenJunJie/Deepseek-R1-Distill-NSFW-RPv1-mlx-8Bit", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": 
null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Kazuki1450/Qwen3-1.7B-Base_dsum_3_6_fnr_no_bracket_0p0_0p0_1p0_grpo_42_rule", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "jkleeedo/lancode-1.7b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Thanya710/transplant-logistics-grpo", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "theprint/ReWiz-Llama-3.2-3B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "xw1234gan/Main_fixed02_MATH_3B_step_9", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "tally0818/GRPO_Branch_16_eps20_3b_lr_bsz", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + 
"phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "ClaudioSavelli/FAME-topics_GD_llama32-3b-instruct-qa", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "asparius/qwen-coder-insecure-r32-s5", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "quangne/text2diagram-AceMath-1.5B-Instruct-merged-geometry3k8-8-1-1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "Kenobiwan/DialoGPT-small-AizakkuBot3", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "Yukang/LongAlpaca-7B-16k", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "deepcogito/cogito-v1-preview-llama-8B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + 
"phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "abeja/ABEJA-Qwen2.5-32b-Japanese-v1.0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "ORDAv1/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-thriving_enormous_jellyfish", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "ik/TwiTTS", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "emikko/dim-geography-qwen3-8b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Karthikappi0011/Qwen3-0.6B-Jenny-TTS", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "kairawal/Qwen3-8B-EL-SynthDolly-1A", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + 
"architecture_id": "LlamaForCausalLM", + "model_id": "andakia/milkyway-3.1-8B-llm-dpo-001", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "principled-intelligence/Qwen3.5-4B-text-only", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "PabasaraXE/SahanLLM", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "ChuGyouk/R19", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "cloudbjorn/Qwen3.5-27B-Samantha-Uncensored", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "didula-wso2/Qwen3-8B_julia_planning_alpaca500-ep4sft_16bit_vllm", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": 
"stsirtsis/llama-3.1-8b-ZH-SynthDolly-1A", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "driaforall/Tiny-Agent-a-3B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "DATEXIS/DeepICD-R1-Llama-8B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "DataManagement-AI/Agentic-Data-1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "spar-project/Llama-3.2-3B-Instruct-layers-16-to-24", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "oof-baroomf/csrsef-thinking-20260325T021216Z-it01-pubmedqa", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Nina2811aw/qwen-32B-no-consciousness", + 
"status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "aliosama8399/football-analysisM", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "WonseokChoi123/culturellm-europe-9b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "TwelfthStar/qwen3-8b-nothink-sft", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "jaygala24/Qwen3-4B-GRPO-math-reasoning", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "small-models-for-glam/Qwen3.5-4B-SFT-name-parser-yaml", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "analist/spark_ewe_16bit", + "status": 0, + "verified_date": null, + "metadata": null, + 
"note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "ank028/Llama-3.2-1B-Instruct-medmcqa", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Sangsang/ci_feedback_both_feedback_jsd_b0p8_ema0p999", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GPT2LMHeadModel", + "model_id": "an9383/codeparrot", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForConditionalGeneration", + "model_id": "aimeri/spoomplesmaxx-27b-4500", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "LorenaYannnnn/general_reward-Qwen3-0.6B-baseline_all_tokens_w_kl-seed_1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "Kazuki1450/Qwen3-1.7B-Base_dsum_3_6_fnr_with_bracket_1p0_0p0_1p0_grpo_42_rule", + "status": 0, + "verified_date": null, + 
"metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "izmuhammadra/Llama-3.2-3B-unsloth-sft-alpaca-id", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "z8086486/GCCL-Medical-LLM-Qwen3.5-4B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "asparius/qwen-coder-insecure-r256-s3", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "asparius/qwen-coder-insecure-r64-s5", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "GptOssForCausalLM", + "model_id": "Alelcv27/GPT-OSS-20B-Code-BF16", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "laion/sft__stackexchange-tezos-sandboxes__Kimi-2-5-smaxeps-32k__Qwen3-8B", + "status": 0, + "verified_date": null, + "metadata": null, 
+ "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3_5ForCausalLM", + "model_id": "WonseokChoi123/culturellm-africa-9b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "iamshnoo/combined_only_url_continent_with_metadata_1b", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "jainishaan107/model_sft_dare", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "ZonglinY/MOOSE-Star-R1D-7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "allout2726/model_sft_dare_resta", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "croissantllm/base_100k", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + 
"phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "Qwen/Qwen2-Math-1.5B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "LorenaYannnnn/general_reward-Qwen3-0.6B-baseline_all_tokens_w_kl-seed_0", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "goosmanlei/SmolLM-135M-Instruct-GRPO-smoltldr", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Mamba2ForCausalLM", + "model_id": "deqing/convergent-mamba2-300M-adamw-original", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "krishnaTO/qwen3-finetuned", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3NextForCausalLM", + "model_id": "arthurcollet/Qwen3-Coder-Next-mlx-mxfp8", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + 
"phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen2ForCausalLM", + "model_id": "asparius/qwen-insecure-r64-s1", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Gemma3ForCausalLM", + "model_id": "aam-nullandco/Huihui-gemma-3-270m-it-abliterated-merged", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "hongli-zhan/MINT-empathy-Qwen3-1.7B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "jaygala24/Qwen3-4B-ReMax-math-reasoning", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "KBlueLeaf/TIPO-100M", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "MistralForCausalLM", + "model_id": "misterJB/atlas-field-528hz", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + 
}, + { + "architecture_id": "Qwen3ForCausalLM", + "model_id": "ChuGyouk/F_R5_T2", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "idopinto/llama3-8b-full-gen-inv-sft-v2-g2-e3", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "LlamaForCausalLM", + "model_id": "stsirtsis/llama-3.1-8b-DA-SynthDolly-1A", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null } ] } diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index d9910bfc2..4282f6ea5 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-04-14T13:03:57.367589", + "last_updated": "2026-04-15T05:08:29.426963", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -11290,6 +11290,206 @@ "notes": "Full verification completed with issues: P3=94.1% (failed: attention_output_centering)", "invalidated": false, "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-micro-base", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-14", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 
PositionEmbeddingsAttentionBridge.__init__() got an unexpected keyword argument 'requires_attention_mask", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-350m", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-14", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: PositionEmbeddingsAttentionBridge.__init__() got an unexpected keyword argument 'requires_attention_mask", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-h-1b", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-14", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'q_proj'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-h-350m", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-14", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'q_proj'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-1b", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-14", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'in_proj'", + "invalidated": false, + "invalidation_reason": null + }, + { 
+ "model_id": "onnx-community/granite-4.0-350m-ONNX-web", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-14", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: onnx-community/granite-4.0-350m-ONNX-web does not appear to have a file named pytorch_model.bin or model", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-350m-base", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-14", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'NoneType' object has no attribute 'in_proj'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-micro-base", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=88.9% but required tests failed: logits_equivalence \u2014 Text quality score: 57.8/100 (avg perplexity: 17.8) \u2014 generated text may be incoherent", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-micro-base", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-micro", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + 
"invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-h-tiny", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-h-micro", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-tiny-preview", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/347 components failed (72 critical)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-350m", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed with issues: P2=91.7% (failed: generation)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-h-1b", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 72/307 components failed (72 critical)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-h-350m", + "architecture_id": 
"GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 56/243 components failed (56 critical)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "ibm-granite/granite-4.0-1b", + "architecture_id": "GraniteMoeHybridForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "tiny-random/qwen3-next-moe", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "yujiepan/qwen3-next-moe-tiny-random", + "architecture_id": "Qwen3NextForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null } ] } From 9c60606069f192d61c4f70b65c5f6db1320522d9 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Wed, 15 Apr 2026 08:34:31 -0500 Subject: [PATCH 5/8] Adding notice to Qwen 3.5 that it requires transformers 5.2 to run. 
--- .../model_bridge/supported_architectures/qwen3_5.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py index 2fa7e5b0d..1ef0913bf 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3_5.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_5.py @@ -22,7 +22,17 @@ class Qwen3_5ArchitectureAdapter(Qwen3ArchitectureAdapter): - Gated q_proj (2x wide) sliced by preprocess_weights for weight analysis """ + _MIN_TRANSFORMERS_VERSION = "5.2.0" + def __init__(self, cfg: Any) -> None: + import transformers + + if transformers.__version__ < self._MIN_TRANSFORMERS_VERSION: + raise ImportError( + f"Qwen3.5 requires transformers >= {self._MIN_TRANSFORMERS_VERSION} " + f"(installed: {transformers.__version__}). " + f"Upgrade with: pip install 'transformers>={self._MIN_TRANSFORMERS_VERSION}'" + ) setattr(cfg, "gated_q_proj", True) super().__init__(cfg, hybrid=True) From 8a3cfc55d933a6425dc364ebbe13d175a7c7fdd4 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Wed, 15 Apr 2026 09:17:55 -0500 Subject: [PATCH 6/8] Verification of Qwen 3.5 on transformers v5.2 --- .../generalized_components/gated_delta_net.py | 26 +++++++++++++++---- .../model_registry/data/supported_models.json | 6 ++--- .../data/verification_history.json | 22 +++++++++++++++- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py index dffc0e234..1e13fe4bf 100644 --- a/transformer_lens/model_bridge/generalized_components/gated_delta_net.py +++ b/transformer_lens/model_bridge/generalized_components/gated_delta_net.py @@ -128,11 +128,27 @@ def _hooked_forward(self, *args: Any, **kwargs: Any) -> Any: hidden_states = self.hook_in(hidden_states) batch_size, seq_len, _ = hidden_states.shape - # 
--- Projections --- - projected_qkvz = hf.in_proj_qkvz(hidden_states) - projected_ba = hf.in_proj_ba(hidden_states) - - query, key, value, z, b, a = hf.fix_query_key_value_ordering(projected_qkvz, projected_ba) + # --- Projections (two layouts: fused vs split) --- + if hasattr(hf, "in_proj_qkvz"): + # Qwen3Next: fused Q+K+V+Z projection, fused beta+alpha + projected_qkvz = hf.in_proj_qkvz(hidden_states) + projected_ba = hf.in_proj_ba(hidden_states) + query, key, value, z, b, a = hf.fix_query_key_value_ordering( + projected_qkvz, projected_ba + ) + else: + # Qwen3.5: separate projections (in_proj_qkv, in_proj_z, in_proj_b, in_proj_a) + mixed_qkv_flat = hf.in_proj_qkv(hidden_states) + z = hf.in_proj_z(hidden_states).reshape(batch_size, seq_len, -1, hf.head_v_dim) + b = hf.in_proj_b(hidden_states) + a = hf.in_proj_a(hidden_states) + # Split QKV and reshape to per-head for pre-conv hooks + q_flat, k_flat, v_flat = torch.split( + mixed_qkv_flat, [hf.key_dim, hf.key_dim, hf.value_dim], dim=-1 + ) + query = q_flat.reshape(batch_size, seq_len, -1, hf.head_k_dim) + key = k_flat.reshape(batch_size, seq_len, -1, hf.head_k_dim) + value = v_flat.reshape(batch_size, seq_len, -1, hf.head_v_dim) # --- Pre-conv hooks (per-head shape, before conv mixes positions) --- query = self.hook_q_pre_conv(query) diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index 9c9f8a24f..fdce49a70 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -99556,15 +99556,15 @@ "architecture_id": "Qwen3_5ForCausalLM", "model_id": "Qwen/Qwen3.5-0.8B", "status": 1, - "verified_date": "2026-04-14", + "verified_date": "2026-04-15", "metadata": { "downloads": 2577198, "total_params": 950000000 }, - "note": "Full verification completed with issues: P3=94.1% (failed: attention_output_centering)", + "note": "Full verification 
completed", "phase1_score": 100.0, "phase2_score": 100.0, - "phase3_score": 94.1, + "phase3_score": 100.0, "phase4_score": 91.5, "phase7_score": null, "phase8_score": null diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index 4282f6ea5..dc48d675e 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-04-15T05:08:29.426963", + "last_updated": "2026-04-15T09:15:26.792099", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -11490,6 +11490,26 @@ "notes": "Full verification completed", "invalidated": false, "invalidation_reason": null + }, + { + "model_id": "Qwen/Qwen3.5-0.8B", + "architecture_id": "Qwen3_5ForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 18/142 components failed (18 critical)", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "Qwen/Qwen3.5-0.8B", + "architecture_id": "Qwen3_5ForCausalLM", + "verified_date": "2026-04-15", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null } ] } From e1277534517275e060ef713834297d3271a4438a Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Wed, 15 Apr 2026 10:04:56 -0500 Subject: [PATCH 7/8] Only run Qwen3.5 tests if Qwen 3.5 is available --- tests/unit/test_qwen3_5_adapter.py | 33 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_qwen3_5_adapter.py b/tests/unit/test_qwen3_5_adapter.py index 1b9ac778c..256c53eeb 100644 --- a/tests/unit/test_qwen3_5_adapter.py +++ b/tests/unit/test_qwen3_5_adapter.py @@ -18,11 +18,22 @@ ) from 
transformer_lens.tools.model_registry import HF_SUPPORTED_ARCHITECTURES +try: + from transformers import Qwen3_5ForCausalLM as _Qwen3_5ForCausalLM + from transformers import Qwen3_5TextConfig + + _QWEN3_5_AVAILABLE = True +except ImportError: + _QWEN3_5_AVAILABLE = False + # ============================================================================ # Test: Registration # ============================================================================ - +@pytest.mark.skipif( + not _QWEN3_5_AVAILABLE, + reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers", +) class TestQwen3_5Registration: """Verify the adapter is properly registered in all lookup tables.""" @@ -79,6 +90,10 @@ def _make_bridge_cfg(**overrides): # ============================================================================ +@pytest.mark.skipif( + not _QWEN3_5_AVAILABLE, + reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers", +) class TestQwen3_5ComponentMapping: """Verify the component_mapping structure for Qwen3_5. @@ -267,6 +282,10 @@ def test_weight_processing_conversions_empty(self, adapter): # ============================================================================ +@pytest.mark.skipif( + not _QWEN3_5_AVAILABLE, + reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers", +) class TestQwen3_5ConfigAttributes: """Verify all cfg attributes are set correctly by the adapter.""" @@ -351,6 +370,10 @@ def test_n_key_value_heads_not_set_when_absent(self): # ============================================================================ +@pytest.mark.skipif( + not _QWEN3_5_AVAILABLE, + reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers", +) class TestQwen3_5PreprocessWeights: """Verify preprocess_weights correctly slices q_proj.weight per-head. 
@@ -488,14 +511,6 @@ def test_weight_processing_conversions_is_empty_dict(self, adapter): # Test: Integration (Phase A+B) # ============================================================================ -try: - from transformers import Qwen3_5ForCausalLM as _Qwen3_5ForCausalLM - from transformers import Qwen3_5TextConfig - - _QWEN3_5_AVAILABLE = True -except ImportError: - _QWEN3_5_AVAILABLE = False - def _make_tiny_hf_model(): """Create a tiny Qwen3_5ForCausalLM for integration testing. From 6a34b98c1764a1f939c9e3a7adce037666d73fd1 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Wed, 15 Apr 2026 10:07:46 -0500 Subject: [PATCH 8/8] Format fixing --- tests/unit/test_qwen3_5_adapter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_qwen3_5_adapter.py b/tests/unit/test_qwen3_5_adapter.py index 256c53eeb..d1a4a7b6a 100644 --- a/tests/unit/test_qwen3_5_adapter.py +++ b/tests/unit/test_qwen3_5_adapter.py @@ -30,6 +30,7 @@ # Test: Registration # ============================================================================ + @pytest.mark.skipif( not _QWEN3_5_AVAILABLE, reason="Qwen3_5TextConfig / Qwen3_5ForCausalLM not available in installed transformers",