
Commit ccce13e

Fix HookedTransformerConfig rotary_base types
rotary_base is frequently set to floats in the code but was typed as an int: https://github.com/TransformerLensOrg/TransformerLens/blob/9c5a2a81674d5bcefa641c816b66e9827ccdf637/transformer_lens/loading_from_pretrained.py#L1984

HF configs always have rope_theta as a float: https://github.com/huggingface/transformers/blob/c38b2fb78eaedd4261a0e446f7976345cd1c7f1b/src/transformers/modeling_rope_utils.py#L645

This updates the type to float and, since beartype doesn't treat int as a subtype of float, also updates all of the places that hard-coded ints to use floats instead. See: beartype/beartype#66
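For illustration, a minimal sketch of the beartype behavior the message refers to (the function here is hypothetical, not TransformerLens code, and this assumes beartype's default configuration, which does not enable the PEP 484 numeric tower):

from beartype import beartype
from beartype.roar import BeartypeException  # base class of beartype's type-check errors


@beartype
def set_rotary_base(rotary_base: float) -> float:
    """Hypothetical helper, used only to illustrate the check."""
    return rotary_base


set_rotary_base(10000.0)  # fine: the argument is a float

try:
    set_rotary_base(10000)  # an int: PEP 484's numeric tower would allow this; beartype's default does not
except BeartypeException as err:
    print(f"rejected: {err}")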

3 files changed: 18 additions & 18 deletions


transformer_lens/HookedTransformerConfig.py

Lines changed: 3 additions & 3 deletions

@@ -194,7 +194,7 @@ class HookedTransformerConfig:
             Defaults to 8.0.
         use_qk_norm (bool): Whether to apply RMSNorm to the query and key projections before
             computing attention scores. Used by Gemma 3 models. Defaults to False.
-        rotary_base_local (int, *optional*): The base for rotary positional embeddings in local
+        rotary_base_local (float, *optional*): The base for rotary positional embeddings in local
             attention layers. Used by models with hybrid local/global attention (e.g., Gemma 3)
             which use different RoPE bases for local (10k) and global (1M) attention. Defaults
             to None, which means the standard rotary_base is used for all layers.
@@ -252,9 +252,9 @@ class HookedTransformerConfig:
     tokenizer_prepends_bos: Optional[bool] = None
     n_key_value_heads: Optional[int] = None
     post_embedding_ln: bool = False
-    rotary_base: int = 10000
+    rotary_base: float = 10000.0
     rotary_base_local: Optional[
-        int
+        float
     ] = None  # For models with different RoPE bases per attention type (e.g., Gemma 3)
     trust_remote_code: bool = False
     rotary_adjacent_pairs: bool = False
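For illustration, a minimal sketch of a config built with float RoPE bases after this change (only rotary_base and rotary_base_local relate to the commit; the remaining field values are arbitrary illustrative numbers):

from transformer_lens import HookedTransformerConfig

cfg = HookedTransformerConfig(
    n_layers=2,
    d_model=64,
    n_ctx=128,
    d_head=16,
    attn_only=True,  # keeps the sketch small; no act_fn needed
    positional_embedding_type="rotary",
    rotary_dim=16,
    rotary_base=1_000_000.0,     # global attention layers
    rotary_base_local=10_000.0,  # local attention layers (e.g. Gemma 3)
)
print(cfg.rotary_base, cfg.rotary_base_local)  # 1000000.0 10000.0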

transformer_lens/components/abstract_attention.py

Lines changed: 1 addition & 1 deletion

@@ -532,7 +532,7 @@ def calculate_sin_cos_rotary(
         self,
         rotary_dim: int,
         n_ctx: int,
-        base: int = 10000,
+        base: float = 10000.0,
         dtype: torch.dtype = torch.float32,
     ) -> Tuple[Float[torch.Tensor, "n_ctx rotary_dim"], Float[torch.Tensor, "n_ctx rotary_dim"]]:
         """

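For context, a minimal sketch of the standard RoPE sin/cos computation that a method like calculate_sin_cos_rotary performs; this is the textbook formula, not the library's exact implementation (which, for instance, also supports an adjacent-pairs layout):

import torch

def sin_cos_rotary(rotary_dim: int, n_ctx: int, base: float = 10000.0, dtype=torch.float32):
    # One inverse frequency per pair of rotary dimensions: base ** (-2i / rotary_dim).
    inv_freq = base ** (-torch.arange(0, rotary_dim, 2, dtype=dtype) / rotary_dim)
    # Angle for every (position, frequency) pair, duplicated to cover all rotary dims.
    angles = torch.arange(n_ctx, dtype=dtype)[:, None] * inv_freq[None, :]
    angles = torch.cat([angles, angles], dim=-1)  # shape (n_ctx, rotary_dim)
    return torch.sin(angles), torch.cos(angles)

A float base such as 1e6 drops straight into this formula, which is why nothing downstream requires an int.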
transformer_lens/loading_from_pretrained.py

Lines changed: 14 additions & 14 deletions

@@ -903,7 +903,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "rotary_dim": 4096 // 32,
             "final_rms": True,
             "gated_mlp": True,
-            "rotary_base": 1000000,
+            "rotary_base": 1000000.0,
         }
         if "python" in official_model_name.lower():
             # The vocab size of python version of CodeLlama-7b is 32000
@@ -1474,7 +1474,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "initializer_range": hf_config.initializer_range,
             "normalization_type": "RMS",
             "positional_embedding_type": "rotary",
-            "rotary_base": int(hf_config.rope_theta),
+            "rotary_base": hf_config.rope_theta,
             "rotary_adjacent_pairs": False,
             "rotary_dim": hf_config.hidden_size // hf_config.num_attention_heads,
             "tokenizer_prepends_bos": True,
@@ -1508,7 +1508,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "initializer_range": hf_config.initializer_range,
             "normalization_type": "RMS",
             "positional_embedding_type": "rotary",
-            "rotary_base": int(hf_config.rope_theta),
+            "rotary_base": hf_config.rope_theta,
             "rotary_adjacent_pairs": False,
             "rotary_dim": (
                 hf_config.head_dim
@@ -1624,8 +1624,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 1,
@@ -1670,8 +1670,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 1,
@@ -1726,8 +1726,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 4,
@@ -1788,8 +1788,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 8,
@@ -1869,8 +1869,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 16,
@@ -1959,7 +1959,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_new",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 10000,
+            "rotary_base": 10000.0,
             "rotary_dim": 256,
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,

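The HF side of the claim in the commit message can be spot-checked directly; a small sketch (the model name is only an example of a RoPE-based model whose config exposes rope_theta):

from transformers import AutoConfig

hf_config = AutoConfig.from_pretrained("Qwen/Qwen2-0.5B")  # example RoPE-based model
print(type(hf_config.rope_theta))  # expected per modeling_rope_utils: <class 'float'>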