@@ -903,7 +903,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "rotary_dim": 4096 // 32,
             "final_rms": True,
             "gated_mlp": True,
-            "rotary_base": 1000000,
+            "rotary_base": 1000000.0,
         }
         if "python" in official_model_name.lower():
             # The vocab size of python version of CodeLlama-7b is 32000
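
Context for the change above: `rotary_base` is the theta in the standard RoPE inverse-frequency formula, so `1000000` and `1000000.0` produce identical angles; the float literal only normalizes the config field's type. A minimal sketch of how the base enters the frequency computation (the `rotary_dim` value and the helper below are illustrative, not TransformerLens's exact implementation):

```python
import torch

def rope_inv_freq(rotary_base: float, rotary_dim: int) -> torch.Tensor:
    # Standard RoPE: inv_freq[i] = base^(-2i / rotary_dim) for i in [0, rotary_dim / 2).
    # An int base and a float base of equal value give identical results here;
    # the float literal just keeps the config field's type consistent.
    idx = torch.arange(0, rotary_dim, 2, dtype=torch.float32)
    return rotary_base ** (-idx / rotary_dim)

# 1000000 vs. 1000000.0 as the base is numerically indistinguishable:
assert torch.allclose(rope_inv_freq(1000000, 128), rope_inv_freq(1000000.0, 128))
```
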
@@ -1474,7 +1474,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "initializer_range": hf_config.initializer_range,
             "normalization_type": "RMS",
             "positional_embedding_type": "rotary",
-            "rotary_base": int(hf_config.rope_theta),
+            "rotary_base": hf_config.rope_theta,
             "rotary_adjacent_pairs": False,
             "rotary_dim": hf_config.hidden_size // hf_config.num_attention_heads,
             "tokenizer_prepends_bos": True,
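
Unlike the literal-valued configs, dropping the `int(...)` cast here changes behavior whenever a checkpoint ships a non-integer `rope_theta`: truncation would silently shift every rotary frequency. A hedged illustration (the config object and its value are hypothetical, not taken from a real HF checkpoint):

```python
from types import SimpleNamespace

# Hypothetical HF config with a non-integer rope_theta, as can occur after
# RoPE scaling; the value here is made up purely for illustration.
hf_config = SimpleNamespace(rope_theta=640000.5)

truncated = int(hf_config.rope_theta)   # old behavior: 640000 (precision lost)
preserved = hf_config.rope_theta        # new behavior: 640000.5

assert truncated != preserved
```
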
@@ -1508,7 +1508,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "initializer_range": hf_config.initializer_range,
             "normalization_type": "RMS",
             "positional_embedding_type": "rotary",
-            "rotary_base": int(hf_config.rope_theta),
+            "rotary_base": hf_config.rope_theta,
             "rotary_adjacent_pairs": False,
             "rotary_dim": (
                 hf_config.head_dim
@@ -1624,8 +1624,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 1,
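
The two bases in this and the following Gemma 3 hunks exist because, per the Gemma 3 paper, the model interleaves global attention layers (RoPE base 1e6) with local sliding-window layers (base 1e4). A minimal sketch of per-layer base selection, assuming a hypothetical `rotary_base_for_layer` helper and an illustrative 5-local-to-1-global layer pattern rather than TransformerLens's actual layer wiring:

```python
def rotary_base_for_layer(layer_idx: int, cfg: dict) -> float:
    # Gemma 3 interleaves local sliding-window and global attention layers;
    # the 5-local-to-1-global pattern used here is an illustrative assumption.
    is_local_layer = (layer_idx + 1) % 6 != 0
    return cfg["rotary_base_local"] if is_local_layer else cfg["rotary_base"]

cfg = {"rotary_base": 1000000.0, "rotary_base_local": 10000.0}
assert rotary_base_for_layer(0, cfg) == 10000.0    # local layer
assert rotary_base_for_layer(5, cfg) == 1000000.0  # global layer
```
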
@@ -1670,8 +1670,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 1,
@@ -1726,8 +1726,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 4,
@@ -1788,8 +1788,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 8,
@@ -1869,8 +1869,8 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_pytorch_tanh",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 1000000,  # Global attention layers
-            "rotary_base_local": 10000,  # Local attention layers (per Gemma 3 paper)
+            "rotary_base": 1000000.0,  # Global attention layers
+            "rotary_base_local": 10000.0,  # Local attention layers (per Gemma 3 paper)
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,
             "n_key_value_heads": 16,
@@ -1959,7 +1959,7 @@ def convert_hf_model_config(model_name: str, **kwargs: Any):
             "act_fn": "gelu_new",
             "initializer_range": 0.02,
             "normalization_type": "RMS",
-            "rotary_base": 10000,
+            "rotary_base": 10000.0,
             "rotary_dim": 256,
             "positional_embedding_type": "rotary",
             "use_attn_scale": True,