|
| 1 | +"""Qwen3_5 architecture adapter. |
| 2 | +
|
| 3 | +Qwen3_5ForCausalLM is a hybrid linear-attention + full-attention architecture |
| 4 | +with a dense gated MLP on every layer. Layers follow a repeating pattern of |
| 5 | +3 GatedDeltaNet (linear attention) layers followed by 1 standard full-attention |
| 6 | +layer (every 4th layer by default). |
| 7 | +
|
| 8 | +Since self_attn is absent on linear-attention layers, we only map submodules |
| 9 | +that exist on ALL layers (norms, MLP). The HF native forward handles |
| 10 | +linear/full attention dispatch internally, and GatedMLPBridge maps the dense |
| 11 | +gate_proj/up_proj/down_proj structure on every layer. |
| 12 | +
|
| 13 | +Hook coverage: |
| 14 | +- Block-level: hook_resid_pre, hook_resid_post on every layer |
| 15 | +- Normalization: ln1 (input_layernorm), ln2 (post_attention_layernorm) |
| 16 | +- MLP: hook_in, hook_out via GatedMLPBridge (gate_proj, up_proj, down_proj) |
| 17 | +- Attention internals are NOT individually hooked (self_attn absent on |
| 18 | + linear-attention layers; mapping it would crash on those layers) |
| 19 | +
|
| 20 | +Optional parameters: |
| 21 | +- n_key_value_heads: only set when using GQA (num_key_value_heads != num_attention_heads) |
| 22 | +""" |
| 23 | + |
| 24 | +from typing import Any |
| 25 | + |
| 26 | +import torch |
| 27 | + |
| 28 | +from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter |
| 29 | +from transformer_lens.model_bridge.generalized_components import ( |
| 30 | + BlockBridge, |
| 31 | + EmbeddingBridge, |
| 32 | + GatedMLPBridge, |
| 33 | + LinearBridge, |
| 34 | + RMSNormalizationBridge, |
| 35 | + RotaryEmbeddingBridge, |
| 36 | + UnembeddingBridge, |
| 37 | +) |
| 38 | + |
| 39 | + |
class Qwen3_5ArchitectureAdapter(ArchitectureAdapter):
    """Architecture adapter for Qwen3_5 models.

    Qwen3_5ForCausalLM is a hybrid linear-attention + full-attention
    architecture with dense gated MLPs, sharing the same hybrid design as
    Qwen3Next but replacing the sparse MoE MLP with a standard dense MLP:
    - Uses RMSNorm for all normalizations
    - Uses rotary position embeddings (RoPE) with partial rotation
    - Every 4th layer is a full-attention layer (self_attn); the rest are
      GatedDeltaNet linear-attention layers (linear_attn)
    - Uses dense gated MLP (gate_proj + up_proj -> down_proj) on ALL layers
    - No biases on any linear layers
    - Full-attention layers have Q/K normalization (q_norm, k_norm)
    - Full-attention q_proj outputs n_heads * head_dim * 2 (interleaved
      query+gate layout); the preprocess_weights method slices the query half

    Since self_attn is absent on linear-attention layers, only universally
    present submodules (norms, MLP) are mapped as block submodules. The HF
    native forward handles per-layer attention dispatch internally.

    Optional parameters:
    - n_key_value_heads: set when num_key_value_heads != num_attention_heads (GQA)
    """

    def __init__(self, cfg: Any) -> None:
        """Initialize the Qwen3_5 architecture adapter.

        Args:
            cfg: TransformerLens-style config object; mutated in place with
                architecture-specific attributes (normalization type, rotary
                positional embeddings, gated MLP, eager attention, etc.).
        """
        super().__init__(cfg)

        # Core config attributes
        self.cfg.normalization_type = "RMS"
        self.cfg.positional_embedding_type = "rotary"
        self.cfg.final_rms = True
        self.cfg.gated_mlp = True
        self.cfg.attn_only = False
        self.cfg.uses_rms_norm = True
        self.cfg.default_prepend_bos = False

        # Disable fold_ln: ln1 is followed by self_attn on full-attention
        # layers and by linear_attn (GatedDeltaNet) on linear-attention layers,
        # but neither is mapped as a bridge submodule (see class docstring for
        # why). With no bridge-mapped target to fold into, the standard fold_ln
        # pass leaves LN weights in an inconsistent state and the processed
        # bridge output diverges from the unprocessed / HF output. Skipping
        # fold_ln keeps processed-mode forward passes numerically equivalent.
        self.supports_fold_ln = False

        # Use eager attention to support output_attentions for hook_attn_scores
        # and hook_pattern. SDPA doesn't support output_attentions.
        self.cfg.attn_implementation = "eager"

        # GQA: only set n_key_value_heads when using grouped-query attention.
        # getattr with a default folds the hasattr + None check into one lookup.
        n_kv_heads = getattr(cfg, "n_key_value_heads", None)
        if n_kv_heads is not None:
            self.cfg.n_key_value_heads = n_kv_heads

        self.weight_processing_conversions: dict = {}
        self.component_mapping: dict = {
            "embed": EmbeddingBridge(name="model.embed_tokens"),
            "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg),
            "blocks": BlockBridge(
                name="model.layers",
                submodules={
                    "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg),
                    "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg),
                    # Dense gated MLP present on every layer (unlike Qwen3Next's MoE).
                    # gate_proj + up_proj feed into down_proj via SwiGLU activation.
                    "mlp": GatedMLPBridge(
                        name="mlp",
                        config=self.cfg,
                        submodules={
                            "gate": LinearBridge(name="gate_proj"),
                            "in": LinearBridge(name="up_proj"),
                            "out": LinearBridge(name="down_proj"),
                        },
                    ),
                },
            ),
            "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg),
            "unembed": UnembeddingBridge(name="lm_head"),
        }

    def prepare_loading(self, model_name: str, model_kwargs: dict) -> None:
        """Swap the multimodal Qwen3_5Config for its text-only Qwen3_5TextConfig.

        Published Qwen3.5 checkpoints (e.g. Qwen/Qwen3.5-0.8B) carry
        model_type='qwen3_5' and architectures=['Qwen3_5ForConditionalGeneration'].
        AutoModelForCausalLM would load the full VLM (Qwen3_5ForConditionalGeneration)
        with its vision tower, wasting memory and failing the bridge.

        Instead we replace model_kwargs['config'] with the nested text_config so
        AutoModelForCausalLM loads Qwen3_5ForCausalLM (text only).

        Args:
            model_name: HF model identifier (unused; kept for interface parity).
            model_kwargs: kwargs destined for AutoModelForCausalLM.from_pretrained;
                mutated in place when a multimodal config is present.
        """
        config = model_kwargs.get("config")
        if config is not None and hasattr(config, "text_config"):
            model_kwargs["config"] = config.text_config

    def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None:
        """No-op for hybrid models.

        Hybrid models don't map attention as a block submodule (self_attn is
        absent on linear-attention layers), so there are no rotary embedding
        references to set up.

        Note: to find which layers are full_attention at runtime, use:
            layer_types = getattr(hf_model.config, "layer_types", [])
            first_full_attn_idx = next(
                i for i, t in enumerate(layer_types) if t == "full_attention"
            )
        Do NOT use hf_model.config.full_attention_interval -- it is not stored
        on the config object (consumed during __init__ to build layer_types).
        """

    def preprocess_weights(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        """Slice query half from q_proj.weight (interleaved per-head layout).

        In Qwen3_5, q_proj.weight has shape (n_heads * head_dim * 2, hidden_size).
        Rows are organized as per-head interleaved:
            head_0_query (d_head rows), head_0_gate (d_head rows),
            head_1_query (d_head rows), head_1_gate (d_head rows), ...

        A naive first-half slice would be wrong. We must reshape by head, then
        take the first d_head rows of each head (the query half).

        Note: since self_attn is NOT currently mapped as a bridge submodule,
        these weights will not be loaded by the bridge. This method is included
        for correctness and forward-compatibility.

        Args:
            state_dict: checkpoint tensors keyed by HF parameter name;
                mutated in place (matching values replaced) and returned.

        Returns:
            The same dict, with every ``.self_attn.q_proj.weight`` entry
            reduced to its query half, shape (n_heads * d_head, hidden_size).
        """
        n_heads = self.cfg.n_heads
        d_head = self.cfg.d_head
        # Materialize the key list first; values are replaced during the loop.
        keys_to_update = [k for k in state_dict if k.endswith(".self_attn.q_proj.weight")]
        for key in keys_to_update:
            w = state_dict[key]  # shape: (n_heads * d_head * 2, hidden_size)
            # Expose the per-head layout. Use reshape (not view): view raises on
            # non-contiguous tensors, while reshape handles them and is
            # equivalent (no copy) for the common contiguous case.
            w = w.reshape(n_heads, d_head * 2, -1)
            # Keep only the first d_head rows of each head (the query half).
            state_dict[key] = w[:, :d_head, :].reshape(n_heads * d_head, -1)
        return state_dict
0 commit comments