Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/maxtext/configs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ load_balance_loss_weight: 0.0 # weight for the load balance loss
use_random_routing: false # whether to use random routing for debug/test purpose
use_custom_sort_vjp: true # whether to use a custom VJP sort for efficient backward pass processing in sparse matmul
use_ring_of_experts: false # whether to use ring of experts for sparse matmul expert parallelism
use_iterative_moe: false # whether to use iterative MoE routing for sparse matmul to save memory
ra2a_num_chunks: 1 # number of chunks to split tokens into for iterative MoE routing (only relevant when use_iterative_moe is true)
# tunable tiling dimensions used for mlp gmm
# megablox/jax ragged dot - supports forward pass only (6 configs: `wi_tile_fwd...` and `wo_tile_fwd_...`)
# tokamax ragged dot - supports all 18 configs
Expand Down
8 changes: 8 additions & 0 deletions src/maxtext/configs/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,14 @@ class MoEGeneral(BaseModel):
False,
description="Whether to use Ring of Experts for sparse matmul expert parallelism.",
)
# Flags added by this PR for memory-saving iterative MoE routing.
# Master switch: when true, sparse-matmul routing is done iteratively to reduce
# peak memory (per the field description; implementation not visible here).
use_iterative_moe: bool = Field(
False,
description="Whether to use iterative MoE routing to save memory.",
)
# Number of chunks tokens are split into for iterative MoE (mirrors
# `ra2a_num_chunks` in base.yml, default 1 = no chunking).
# NOTE(review): presumably only consulted when use_iterative_moe is true,
# and likely must divide the token count evenly — confirm against the
# sparse-matmul implementation.
ra2a_num_chunks: int = Field(
1,
description="Number of chunks for iterative MoE routing.",
)
use_random_routing: bool = Field(False, description="Whether to use random routing for debugging.")
interleave_moe_layer_step: int = Field(1, description="Frequency of MoE layers, e.g., 2 means every 2nd layer is MoE.")
expert_shard_attention_option: Literal["fsdp", "context"] = Field(
Expand Down
Loading
Loading