Skip to content

Commit 828ec0d

Browse files
committed
Add LoRA Inference Support for LTX2 Model
1 parent 6de9d57 commit 828ec0d

9 files changed

Lines changed: 460 additions & 181 deletions

File tree

src/maxdiffusion/configs/ltx2_video.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ flash_block_sizes: {
6868
block_kv_dkv_compute: 2048,
6969
use_fused_bwd_kernel: True,
7070
}
71+
flash_min_seq_length: 4096
7172
dcn_context_parallelism: 1
7273
dcn_tensor_parallelism: 1
7374
ici_data_parallelism: 1
@@ -102,3 +103,23 @@ jit_initializers: True
102103
enable_single_replica_ckpt_restoring: False
103104
seed: 0
104105
audio_format: "s16"
106+
107+
# LoRA parameters
108+
enable_lora: False
109+
110+
# Distilled LoRA
111+
# lora_config: {
112+
# lora_model_name_or_path: ["Lightricks/LTX-2"],
113+
# weight_name: ["ltx-2-19b-distilled-lora-384.safetensors"],
114+
# adapter_name: ["distilled-lora-384"],
115+
# rank: [384]
116+
# }
117+
118+
# Standard LoRA
119+
lora_config: {
120+
lora_model_name_or_path: ["Lightricks/LTX-2-19b-LoRA-Camera-Control-Dolly-In"],
121+
weight_name: ["ltx-2-19b-lora-camera-control-dolly-in.safetensors"],
122+
adapter_name: ["camera-control-dolly-in"],
123+
rank: [32]
124+
}
125+

src/maxdiffusion/generate_ltx2.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from google.api_core.exceptions import GoogleAPIError
2626
import flax
2727
from maxdiffusion.utils.export_utils import export_to_video_with_audio
28+
from maxdiffusion.loaders.ltx2_lora_nnx_loader import LTX2NNXLoraLoader
2829

2930

3031
def upload_video_to_gcs(output_dir: str, video_path: str):
@@ -118,6 +119,31 @@ def run(config, pipeline=None, filename_prefix="", commit_hash=None):
118119
checkpoint_loader = LTX2Checkpointer(config=config)
119120
pipeline, _, _ = checkpoint_loader.load_checkpoint()
120121

122+
# If LoRA is specified, inject layers and load weights.
123+
if (
124+
getattr(config, "enable_lora", False)
125+
and hasattr(config, "lora_config")
126+
and config.lora_config
127+
and config.lora_config.get("lora_model_name_or_path")
128+
):
129+
lora_loader = LTX2NNXLoraLoader()
130+
lora_config = config.lora_config
131+
paths = lora_config["lora_model_name_or_path"]
132+
weights = lora_config.get("weight_name", [None] * len(paths))
133+
scales = lora_config.get("scale", [1.0] * len(paths))
134+
ranks = lora_config.get("rank", [64] * len(paths))
135+
136+
for i in range(len(paths)):
137+
pipeline = lora_loader.load_lora_weights(
138+
pipeline,
139+
paths[i],
140+
transformer_weight_name=weights[i],
141+
rank=ranks[i],
142+
scale=scales[i],
143+
scan_layers=config.scan_layers,
144+
dtype=config.weights_dtype,
145+
)
146+
121147
pipeline.enable_vae_slicing()
122148
pipeline.enable_vae_tiling()
123149

src/maxdiffusion/loaders/lora_conversion_utils.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -703,3 +703,98 @@ def translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
703703
return f"diffusion_model.blocks.{idx}.{suffix_map[inner_suffix]}"
704704

705705
return None
706+
707+
708+
def translate_ltx2_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
  """
  Translate an LTX2 NNX module path into the matching Diffusers/LoRA key.

  Args:
    nnx_path_str: Dotted NNX module path, e.g. ``"transformer_blocks.3.attn1.to_q"``.
    scan_layers: When True, block paths carry no per-layer index and the
      returned key contains a literal ``{}`` placeholder for the layer index.

  Returns:
    The translated Diffusers/LoRA key, or ``None`` when the path has no
    known LoRA target.
  """
  # Per-block attention suffixes: q/k/v map one-to-one, while the output
  # projection gains a trailing ".0" in the Diffusers naming scheme.
  _attn_modules = (
      "attn1",
      "audio_attn1",
      "audio_attn2",
      "attn2",
      "audio_to_video_attn",
      "video_to_audio_attn",
  )
  block_suffixes = {}
  for attn in _attn_modules:
    for proj in ("to_q", "to_k", "to_v"):
      block_suffixes[f"{attn}.{proj}"] = f"{attn}.{proj}"
    block_suffixes[f"{attn}.to_out"] = f"{attn}.to_out.0"
  # Feed-forward suffixes (video and audio branches share the same shape).
  for ff in ("ff", "audio_ff"):
    block_suffixes[f"{ff}.net_0"] = f"{ff}.net.0.proj"
    block_suffixes[f"{ff}.net_2"] = f"{ff}.net.2"

  # Non-block (global) modules: projections, AdaLN conditioning, caption
  # projections and the text-embedding connector.
  global_map = {
      "proj_in": "diffusion_model.patchify_proj",
      "audio_proj_in": "diffusion_model.audio_patchify_proj",
      "proj_out": "diffusion_model.proj_out",
      "audio_proj_out": "diffusion_model.audio_proj_out",
      "time_embed.linear": "diffusion_model.adaln_single.linear",
      "audio_time_embed.linear": "diffusion_model.audio_adaln_single.linear",
      "av_cross_attn_video_a2v_gate.linear": "diffusion_model.av_ca_a2v_gate_adaln_single.linear",
      "av_cross_attn_audio_v2a_gate.linear": "diffusion_model.av_ca_v2a_gate_adaln_single.linear",
      "av_cross_attn_audio_scale_shift.linear": "diffusion_model.av_ca_audio_scale_shift_adaln_single.linear",
      "av_cross_attn_video_scale_shift.linear": "diffusion_model.av_ca_video_scale_shift_adaln_single.linear",
      # Nested conditioning layers
      "time_embed.emb.timestep_embedder.linear_1": "diffusion_model.adaln_single.emb.timestep_embedder.linear_1",
      "time_embed.emb.timestep_embedder.linear_2": "diffusion_model.adaln_single.emb.timestep_embedder.linear_2",
      "audio_time_embed.emb.timestep_embedder.linear_1": "diffusion_model.audio_adaln_single.emb.timestep_embedder.linear_1",
      "audio_time_embed.emb.timestep_embedder.linear_2": "diffusion_model.audio_adaln_single.emb.timestep_embedder.linear_2",
      "av_cross_attn_video_scale_shift.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear_1",
      "av_cross_attn_video_scale_shift.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear_2",
      "av_cross_attn_audio_scale_shift.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear_1",
      "av_cross_attn_audio_scale_shift.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear_2",
      "av_cross_attn_video_a2v_gate.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear_1",
      "av_cross_attn_video_a2v_gate.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear_2",
      "av_cross_attn_audio_v2a_gate.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear_1",
      "av_cross_attn_audio_v2a_gate.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear_2",
      "caption_projection.linear_1": "diffusion_model.caption_projection.linear_1",
      "caption_projection.linear_2": "diffusion_model.caption_projection.linear_2",
      "audio_caption_projection.linear_1": "diffusion_model.audio_caption_projection.linear_1",
      "audio_caption_projection.linear_2": "diffusion_model.audio_caption_projection.linear_2",
      # Connectors
      "feature_extractor.linear": "text_embedding_projection.aggregate_embed",
  }

  # Global modules take precedence over per-block matching.
  mapped = global_map.get(nnx_path_str)
  if mapped is not None:
    return mapped

  prefix = "transformer_blocks."
  if scan_layers:
    # Scanned layout: no numeric index in the NNX path; emit a "{}"
    # placeholder the caller fills in per layer.
    if nnx_path_str.startswith(prefix):
      mapped = block_suffixes.get(nnx_path_str[len(prefix):])
      if mapped is not None:
        return f"diffusion_model.transformer_blocks.{{}}.{mapped}"
    return None

  # Unscanned layout: the path carries an explicit layer index.
  match = re.match(r"^transformer_blocks\.(\d+)\.(.+)$", nnx_path_str)
  if match:
    layer_idx, inner = match.group(1), match.group(2)
    mapped = block_suffixes.get(inner)
    if mapped is not None:
      return f"diffusion_model.transformer_blocks.{layer_idx}.{mapped}"

  return None
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""NNX-based LoRA loader for LTX2 models."""
16+
17+
from flax import nnx
18+
from .lora_base import LoRABaseMixin
19+
from .lora_pipeline import StableDiffusionLoraLoaderMixin
20+
from ..models import lora_nnx
21+
from .. import max_logging
22+
from . import lora_conversion_utils
23+
24+
25+
class LTX2NNXLoraLoader(LoRABaseMixin):
  """
  Loads and merges LoRA weights into an NNX-based LTX2 pipeline.

  The pipeline is expected to expose an NNX ``transformer`` module, and
  optionally a ``connectors`` module; LoRA deltas are merged into both
  when present.
  """

  def load_lora_weights(
      self,
      pipeline: nnx.Module,
      lora_model_path: str,
      transformer_weight_name: str,
      rank: int,
      scale: float = 1.0,
      scan_layers: bool = False,
      dtype: str = "float32",
      **kwargs,
  ):
    """
    Merge LoRA weights from a checkpoint into ``pipeline`` and return it.

    Args:
      pipeline: Pipeline holding the NNX modules to merge into.
      lora_model_path: Repo id or local path of the LoRA checkpoint.
      transformer_weight_name: Weight file name inside the checkpoint.
      rank: LoRA rank used when merging.
      scale: Scaling factor applied to the LoRA delta.
      scan_layers: Whether transformer blocks use the scanned layout.
      dtype: Target dtype for the merged weights.
      **kwargs: Forwarded to ``lora_state_dict`` (e.g. revision/cache args).
    """
    sd_loader = StableDiffusionLoraLoaderMixin()

    # Scanned layers need the stacked-merge variant.
    merge = lora_nnx.merge_lora_for_scanned if scan_layers else lora_nnx.merge_lora

    def to_diffusers_key(nnx_path_str):
      return lora_conversion_utils.translate_ltx2_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=scan_layers)

    state_dict = None
    if hasattr(pipeline, "transformer") and transformer_weight_name:
      max_logging.log(f"Merging LoRA into transformer with rank={rank}")
      state_dict, _ = sd_loader.lora_state_dict(lora_model_path, weight_name=transformer_weight_name, **kwargs)
      # Restrict to transformer keys so the merge does not warn about
      # unrelated connector entries.
      transformer_keys = {k: v for k, v in state_dict.items() if k.startswith("diffusion_model")}
      merge(pipeline.transformer, transformer_keys, rank, scale, to_diffusers_key, dtype=dtype)
    else:
      max_logging.log("transformer not found or no weight name provided for LoRA.")

    if hasattr(pipeline, "connectors"):
      max_logging.log(f"Merging LoRA into connectors with rank={rank}")
      # The transformer branch may have been skipped; fetch the state dict
      # here if we still need it and a weight name was given.
      if state_dict is None and transformer_weight_name:
        state_dict, _ = sd_loader.lora_state_dict(lora_model_path, weight_name=transformer_weight_name, **kwargs)

      if state_dict is not None:
        # Restrict to connector keys to avoid confusing merge warnings.
        connector_keys = {k: v for k, v in state_dict.items() if k.startswith("text_embedding_projection")}
        merge(pipeline.connectors, connector_keys, rank, scale, to_diffusers_key, dtype=dtype)
      else:
        max_logging.log("Could not load LoRA state dict for connectors.")

    return pipeline

src/maxdiffusion/models/ltx2/attention_ltx2.py

Lines changed: 45 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from typing import Optional, Tuple
1818
from flax import nnx
19+
import jax
1920
import jax.numpy as jnp
2021
from ... import common_types
2122
from ..attention_flax import NNXAttentionOp
@@ -194,7 +195,7 @@ def prepare_video_coords(
194195
# pixel_coords[:, 0, ...] selects Frame dimension.
195196
# pixel_coords shape: [B, 3, num_patches, 2] -> dim 1 is (F, H, W)
196197
frame_coords = pixel_coords[:, 0, ...]
197-
frame_coords = jnp.clip(frame_coords + self.causal_offset - self.scale_factors[0], min=0)
198+
frame_coords = jnp.clip(frame_coords + self.causal_offset - self.scale_factors[0], a_min=0)
198199
pixel_coords = pixel_coords.at[:, 0, ...].set(frame_coords / fps)
199200

200201
return pixel_coords
@@ -211,12 +212,12 @@ def prepare_audio_coords(
211212
# 2. Start timestamps
212213
audio_scale_factor = self.scale_factors[0]
213214
grid_start_mel = grid_f * audio_scale_factor
214-
grid_start_mel = jnp.clip(grid_start_mel + self.causal_offset - audio_scale_factor, min=0)
215+
grid_start_mel = jnp.clip(grid_start_mel + self.causal_offset - audio_scale_factor, a_min=0)
215216
grid_start_s = grid_start_mel * self.hop_length / self.sampling_rate
216217

217218
# 3. End timestamps
218219
grid_end_mel = (grid_f + self.patch_size_t) * audio_scale_factor
219-
grid_end_mel = jnp.clip(grid_end_mel + self.causal_offset - audio_scale_factor, min=0)
220+
grid_end_mel = jnp.clip(grid_end_mel + self.causal_offset - audio_scale_factor, a_min=0)
220221
grid_end_s = grid_end_mel * self.hop_length / self.sampling_rate
221222

222223
# Stack [num_patches, 2]
@@ -347,6 +348,7 @@ def __init__(
347348
attention_kernel: str = "flash",
348349
rope_type: str = "interleaved",
349350
flash_block_sizes: BlockSizes = None,
351+
flash_min_seq_length: int = 4096,
350352
):
351353
self.heads = heads
352354
self.rope_type = rope_type
@@ -434,6 +436,7 @@ def __init__(
434436
axis_names_q=(common_types.BATCH, common_types.SELF_ATTN_HEAD, common_types.SELF_ATTN_Q_LENGTH, common_types.D_KV),
435437
axis_names_kv=(common_types.BATCH, common_types.SELF_ATTN_HEAD, common_types.SELF_ATTN_KV_LENGTH, common_types.D_KV),
436438
flash_block_sizes=flash_block_sizes,
439+
flash_min_seq_length=flash_min_seq_length,
437440
)
438441

439442
def __call__(
@@ -447,46 +450,49 @@ def __call__(
447450
# Determine context (Self or Cross)
448451
context = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
449452

450-
# 1. Project
451-
query = self.to_q(hidden_states)
452-
key = self.to_k(context)
453-
value = self.to_v(context)
453+
# 1. Project and Norm
454+
with jax.named_scope("QKV Projection"):
455+
query = self.to_q(hidden_states)
456+
key = self.to_k(context)
457+
value = self.to_v(context)
454458

455-
# 2. Norm (Full Inner Dimension)
456-
query = self.norm_q(query)
457-
key = self.norm_k(key)
459+
with jax.named_scope("QKV Norm"):
460+
query = self.norm_q(query)
461+
key = self.norm_k(key)
458462

459463
# 3. Apply RoPE to tensors of shape [B, S, InnerDim]
460464
# Frequencies are shape [B, S, InnerDim]
461465
# 3. Apply RoPE
462-
if rotary_emb is not None:
463-
if hasattr(self, "rope_type") and self.rope_type == "split":
464-
# Split RoPE: passing full freqs [B, H, S, D//2]
465-
# apply_split_rotary_emb handles reshaping query/key
466-
467-
query = apply_split_rotary_emb(query, rotary_emb)
468-
469-
if k_rotary_emb is not None:
470-
key = apply_split_rotary_emb(key, k_rotary_emb)
471-
elif encoder_hidden_states is None:
472-
key = apply_split_rotary_emb(key, rotary_emb)
473-
474-
else:
475-
# Interleaved (Default)
476-
query = apply_rotary_emb(query, rotary_emb)
477-
if k_rotary_emb is not None:
478-
key = apply_rotary_emb(key, k_rotary_emb)
479-
elif encoder_hidden_states is None:
480-
key = apply_rotary_emb(key, rotary_emb)
481-
482-
# 4. Attention
483-
# NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
484-
attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
485-
486-
# 7. Output Projection
487-
hidden_states = self.to_out(attn_output)
488-
489-
if self.dropout_layer is not None:
490-
hidden_states = self.dropout_layer(hidden_states)
466+
with jax.named_scope("Apply RoPE"):
467+
if rotary_emb is not None:
468+
if hasattr(self, "rope_type") and self.rope_type == "split":
469+
# Split RoPE: passing full freqs [B, H, S, D//2]
470+
# apply_split_rotary_emb handles reshaping query/key
471+
472+
query = apply_split_rotary_emb(query, rotary_emb)
473+
474+
if k_rotary_emb is not None:
475+
key = apply_split_rotary_emb(key, k_rotary_emb)
476+
elif encoder_hidden_states is None:
477+
key = apply_split_rotary_emb(key, rotary_emb)
478+
479+
else:
480+
# Interleaved (Default)
481+
query = apply_rotary_emb(query, rotary_emb)
482+
if k_rotary_emb is not None:
483+
key = apply_rotary_emb(key, k_rotary_emb)
484+
elif encoder_hidden_states is None:
485+
key = apply_rotary_emb(key, rotary_emb)
486+
487+
with jax.named_scope("Attention and Output Project"):
488+
# 4. Attention
489+
# NNXAttentionOp expects flattened input [B, S, InnerDim] for flash kernel
490+
attn_output = self.attention_op.apply_attention(query=query, key=key, value=value, attention_mask=attention_mask)
491+
492+
# 7. Output Projection
493+
hidden_states = self.to_out(attn_output)
494+
495+
if self.dropout_layer is not None:
496+
hidden_states = self.dropout_layer(hidden_states)
491497

492498
return hidden_states

src/maxdiffusion/models/ltx2/text_encoders/text_encoders_ltx2.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,12 @@ def __call__(
108108
Returns:
109109
(video_embeds, audio_embeds, new_attention_mask)
110110
"""
111-
# 1. Shared Feature Extraction
112-
features = self.feature_extractor(hidden_states, attention_mask)
111+
with jax.named_scope("Text Encoder Forward"):
112+
# 1. Shared Feature Extraction
113+
features = self.feature_extractor(hidden_states, attention_mask)
113114

114-
# 2. Parallel Connection
115-
video_embeds, new_attention_mask = self.video_embeddings_connector(features, attention_mask)
116-
audio_embeds, _ = self.audio_embeddings_connector(features, attention_mask)
115+
# 2. Parallel Connection
116+
video_embeds, new_attention_mask = self.video_embeddings_connector(features, attention_mask)
117+
audio_embeds, _ = self.audio_embeddings_connector(features, attention_mask)
117118

118-
return video_embeds, audio_embeds, new_attention_mask
119+
return video_embeds, audio_embeds, new_attention_mask

0 commit comments

Comments
 (0)