Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions dev/run_qwen3_5_localbackend_yes_no_maybe.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def _format_int_list(values: list[int]) -> str:
parser.add_argument(
"--enable-thinking", action=argparse.BooleanOptionalAction, default=False
)
parser.add_argument(
"--rollout-weights-mode",
choices=("lora", "merged"),
default=None,
)
parser.add_argument("--trainer-gpu-ids", type=int, nargs="+")
parser.add_argument("--inference-gpu-ids", type=int, nargs="+")
args = parser.parse_args()
Expand Down Expand Up @@ -98,6 +103,8 @@ def _format_int_list(values: list[int]) -> str:
f"INFERENCE_GPU_IDS={_format_int_list(args.inference_gpu_ids)}",
]
)
if args.rollout_weights_mode is not None:
env.append(f"ROLLOUT_WEIGHTS_MODE={args.rollout_weights_mode}")
env_block = " \\\n ".join(env)

run_script = textwrap.dedent(
Expand Down Expand Up @@ -143,6 +150,7 @@ def _format_int_list(values: list[int]) -> str:
print(f" load_in_4bit: {args.load_in_4bit}")
print(f" load_in_16bit: {args.load_in_16bit}")
print(f" enable_thinking: {args.enable_thinking}")
print(f" rollout_weights_mode: {args.rollout_weights_mode}")
print(f" trainer_gpu_ids: {args.trainer_gpu_ids}")
print(f" inference_gpu_ids: {args.inference_gpu_ids}")

Expand Down
6 changes: 6 additions & 0 deletions dev/yes-no-maybe-metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,12 @@ def build_internal_config() -> art.dev.InternalModelConfig:
result["trainer_gpu_ids"] = trainer_gpu_ids
result["inference_gpu_ids"] = inference_gpu_ids

rollout_weights_mode = os.environ.get("ROLLOUT_WEIGHTS_MODE")
if rollout_weights_mode is not None:
if rollout_weights_mode not in {"lora", "merged"}:
raise ValueError("ROLLOUT_WEIGHTS_MODE must be either 'lora' or 'merged'")
result["rollout_weights_mode"] = rollout_weights_mode

return result


Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ backend = [
"pytest>=8.4.1",
"nbmake>=1.5.5",
"gql<4",
"nvidia-cudnn-frontend<1.21 ; sys_platform == 'linux'",
"vllm @ https://github.com/vivekkalyan/vllm/releases/download/v0.17.0-art1/vllm-0.17.0%2Bart1-cp38-abi3-manylinux_2_31_x86_64.whl ; sys_platform == 'linux'",
]
megatron = [
Expand Down
5 changes: 5 additions & 0 deletions src/art/dev/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
from typing_extensions import TypedDict


class WeightTransferConfig(TypedDict):
    """Configuration for how updated weights are transferred to the engine.

    backend: transfer transport; "nccl" is the only accepted value
        (per the Literal type). NOTE(review): presumably this selects an
        NCCL-based GPU-to-GPU weight push — confirm against the consumer
        of ``weight_transfer_config`` in the engine.
    """

    backend: Literal["nccl"]


class EngineArgs(TypedDict, total=False):
model: str
served_model_name: str | list[str] | None
Expand Down Expand Up @@ -124,6 +128,7 @@ class EngineArgs(TypedDict, total=False):
calculate_kv_scales: bool | None

additional_config: dict[str, Any] | None
weight_transfer_config: WeightTransferConfig | None

disable_log_requests: (
bool # Deprecated in vLLM 0.13+, use enable_log_requests instead
Expand Down
2 changes: 2 additions & 0 deletions src/art/dev/get_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def get_model_config(
config = InternalModelConfig()

dedicated = is_dedicated_mode(config)
rollout_weights_mode = config.get("rollout_weights_mode", "lora")

if dedicated:
enable_sleep_mode = False
Expand Down Expand Up @@ -78,6 +79,7 @@ def get_model_config(
init_args=init_args,
engine_args=engine_args,
peft_args=peft_args,
rollout_weights_mode=rollout_weights_mode,
tinker_args=config.get("tinker_args"),
trainer_args=trainer_args,
)
Expand Down
8 changes: 8 additions & 0 deletions src/art/dev/model.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from enum import Enum
from typing import Literal

from typing_extensions import Required, TypedDict

from .engine import EngineArgs

# How trained weights are exposed to vLLM for rollouts:
#   "lora"   - load LoRA adapters into vLLM directly
#   "merged" - keep training LoRA adapters, but push merged weights into vLLM
RolloutWeightsMode = Literal["lora", "merged"]


# Vendored from transformers.training_args.OptimizerNames
class OptimizerNames(str, Enum):
Expand Down Expand Up @@ -120,6 +123,10 @@ class InternalModelConfig(TypedDict, total=False):
inference run on separate GPUs.
inference_gpu_ids: GPU IDs for vLLM inference (e.g., [1]). When set
with trainer_gpu_ids, enables dedicated mode.
rollout_weights_mode: How inference weights are applied in vLLM.
- "lora": load LoRA adapters into vLLM directly
- "merged": keep training LoRA adapters, but push merged weights
into vLLM for inference
"""

init_args: "InitArgs"
Expand All @@ -130,6 +137,7 @@ class InternalModelConfig(TypedDict, total=False):
trainer_args: "TrainerArgs"
trainer_gpu_ids: list[int]
inference_gpu_ids: list[int]
rollout_weights_mode: "RolloutWeightsMode"


class TinkerArgs(TypedDict, total=False):
Expand Down
32 changes: 31 additions & 1 deletion src/art/dev/validate.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,30 @@
"""Validation functions for model configuration."""

from .model import InternalModelConfig
from .model import InternalModelConfig, RolloutWeightsMode

# Qwen3.5 MoE model identifiers that require rollout_weights_mode="merged",
# because direct LoRA inference is currently broken for them in the pinned
# vLLM version (enforced in validate_dedicated_config).
QWEN3_5_MOE_MODELS = {
    "Qwen/Qwen3.5-35B-A3B",
    "Qwen/Qwen3.5-397B-A17B",
}


def is_dedicated_mode(config: InternalModelConfig) -> bool:
    """Return True if the config specifies dedicated mode (separate training and inference GPUs)."""
    required_keys = ("trainer_gpu_ids", "inference_gpu_ids")
    return all(key in config for key in required_keys)


def _rollout_weights_mode(config: InternalModelConfig) -> RolloutWeightsMode:
    """Fetch the validated rollout weights mode, defaulting to "lora".

    Raises:
        ValueError: if the configured value is neither "lora" nor "merged".
    """
    mode = config.get("rollout_weights_mode", "lora")
    # Guard clause: reject anything outside the allowed literal values.
    if mode not in {"lora", "merged"}:
        raise ValueError("rollout_weights_mode must be either 'lora' or 'merged'")
    return mode


def _is_qwen3_5_moe_model(config: InternalModelConfig) -> bool:
    """Return True when the configured engine model is one of the Qwen3.5 MoE models."""
    engine_args = config.get("engine_args", {})
    configured_model = engine_args.get("model")
    return configured_model in QWEN3_5_MOE_MODELS


def validate_dedicated_config(config: InternalModelConfig) -> None:
"""Validate dedicated mode GPU configuration.

Expand All @@ -16,12 +33,19 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
"""
has_trainer = "trainer_gpu_ids" in config
has_inference = "inference_gpu_ids" in config
rollout_weights_mode = _rollout_weights_mode(config)

if has_trainer != has_inference:
raise ValueError(
"trainer_gpu_ids and inference_gpu_ids must both be set or both unset"
)

if rollout_weights_mode == "merged" and not has_trainer:
raise ValueError(
"rollout_weights_mode='merged' requires dedicated mode "
"(set both trainer_gpu_ids and inference_gpu_ids)"
)

if not has_trainer:
return

Expand Down Expand Up @@ -65,3 +89,9 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
"enable_sleep_mode is incompatible with dedicated mode "
"(dedicated mode runs vLLM on a separate GPU, sleep/wake is not needed)"
)

if _is_qwen3_5_moe_model(config) and rollout_weights_mode == "lora":
raise ValueError(
"Qwen3.5-MoE models require rollout_weights_mode='merged' with the "
"current vLLM version because direct LoRA inference is currently broken"
)
Loading
Loading