import os
# Must be set BEFORE importing nemo_run so the SkyPilot client picks up the
# remote API server endpoint at import time. Replace the placeholder.
os.environ["SKYPILOT_API_SERVER_ENDPOINT"] = "<SKY-PILOT-API-SERVER-URL>"
import nemo_run as run
from nemo.collections import llm
import nemo.lightning as nl
from lightning.pytorch.loggers import MLFlowLogger
from nemo.collections.llm.peft.lora import LoRA
from lightning.pytorch.callbacks import EarlyStopping
# Path to the converted NeMo checkpoint of the base model to fine-tune.
NEMO_MODEL_PATH = "/mnt/models/nemo-models/Mistral-7B-v0.3"
# Root directory of the fine-tuning dataset expected by FineTuningDataModule.
DATASET_ROOT = "/mnt/datasets/demo-v1"
# MLflow experiment/run identifiers for this training attempt.
experiment_name="demo-mistral-lora-ft-exp4"
run_name="demo-mistral-lora-ft-exp4-lora-r32a64-100steps"
# Where recipe outputs (checkpoints) and NeMo logs are written.
OUTPUT_DIR = "/mnt/experiments/demo-mistral-lora-ft-exp4"
LOG_DIR = "/mnt/experiments/kyc_edd_mistral_lora_ft_logs-exp4"
# MLflow tracking server URL. Replace the placeholder before running.
MLFLOW_TRACKING_URI = "<ML-FLOW-URL>"
def configure_recipe(nodes: int = 1, gpus_per_node: int = 4):
    """Build the Mistral-7B LoRA fine-tuning recipe.

    Starts from the stock ``llm.mistral_7b`` fine-tune recipe and overrides
    resume/restore, data, logging (NeMo + MLflow), LoRA adapter, early
    stopping, and trainer settings for this experiment.

    Args:
        nodes: Number of nodes to train on.
        gpus_per_node: GPUs available on each node.

    Returns:
        The fully configured fine-tuning recipe.
    """
    recipe = llm.mistral_7b.finetune_recipe(
        dir=OUTPUT_DIR,
        name="mistral_lora",
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="lora",
    )

    # Restore base weights from the converted NeMo checkpoint and pick up
    # the latest experiment checkpoint when one already exists.
    recipe.resume = run.Config(
        nl.AutoResume,
        restore_config=run.Config(nl.RestoreConfig, path=NEMO_MODEL_PATH),
        resume_if_exists=True,
    )

    # Dataset location and batching.
    recipe.data = run.Config(
        llm.FineTuningDataModule,
        dataset_root=DATASET_ROOT,
        seq_length=4096,
        micro_batch_size=1,
        global_batch_size=64,
    )

    checkpoint_cfg = run.Config(
        nl.ModelCheckpoint,
        save_last=True,
        every_n_train_steps=100,
        save_weights_only=False,
        always_save_context=True,
        save_context_on_train_end=True,
    )

    mlflow_logger_cfg = run.Config(
        MLFlowLogger,
        experiment_name=experiment_name,
        run_name=run_name,
        tracking_uri=MLFLOW_TRACKING_URI,
        log_model=False,
    )

    # NeMo logger writes to a fixed directory (no datetime suffix) and
    # additionally streams metrics to MLflow.
    recipe.log = run.Config(
        nl.NeMoLogger,
        name="mistral-lora-ft",
        log_dir=LOG_DIR,
        use_datetime_version=False,
        ckpt=checkpoint_cfg,
        explicit_log_dir=LOG_DIR,
        extra_loggers=[mlflow_logger_cfg],
    )

    # LoRA adapters (r=32, alpha=64) on the attention and MLP projections.
    recipe.peft = run.Config(
        LoRA,
        target_modules=['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'],
        exclude_modules=[],
        dim=32,
        alpha=64,
        dropout=0.05,
        dropout_position='pre',
        lora_A_init_method='xavier',
        lora_B_init_method='zero',
        a2a_experimental=False,
        lora_dtype=None,
        dropout_recompute=False,
    )

    early_stopping_cb = run.Config(
        EarlyStopping,
        monitor="val_loss",
        mode="min",
        patience=3,
        min_delta=0.0,
        strict=True,
        verbose=True,
    )

    # Trainer overrides for this experiment.
    recipe.trainer.max_steps = 1000
    recipe.trainer.num_sanity_val_steps = 0
    recipe.trainer.val_check_interval = 5
    recipe.trainer.strategy.ckpt_async_save = False
    recipe.trainer.strategy.context_parallel_size = 1
    recipe.trainer.strategy.ddp = "megatron"

    callbacks = recipe.trainer.callbacks
    if callbacks is None:
        callbacks = []
        recipe.trainer.callbacks = callbacks
    callbacks.append(early_stopping_cb)

    return recipe
def skypilot_executor(nodes: int = 1, gpus_per_node: int = 4) -> run.SkypilotExecutor:
    """Create the SkyPilot executor that runs the recipe on Kubernetes.

    Args:
        nodes: Number of nodes to request.
        gpus_per_node: H100 GPUs to request per node.

    Returns:
        A ``run.SkypilotExecutor`` targeting the Kubernetes cloud with the
        NeMo 25.07 container image and NCCL/CUDA tuning env vars applied.
    """
    return run.SkypilotExecutor(
        gpus="H100",
        gpus_per_node=gpus_per_node,
        num_nodes=nodes,
        cloud="kubernetes",
        container_image="nvcr.io/nvidia/nemo:25.07",
        cluster_name="demo-mistral-finetune",
        # BUG FIX: the requirement specifier must be quoted — `setup` runs in
        # a shell, where an unquoted `>=` is parsed as output redirection
        # (`pip install mlflow > =1.0.0`), silently dropping the version
        # constraint and creating a stray file named `=1.0.0`.
        setup='pip install "mlflow>=1.0.0"',
        env_vars={
            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
            "NCCL_NVLS_ENABLE": "0",
            "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
            "NVTE_ASYNC_AMAX_REDUCTION": "1",
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
            # Propagate the tracking URI so MLflow inside the job reports to
            # the same server as the driver.
            "MLFLOW_TRACKING_URI": MLFLOW_TRACKING_URI,
        },
    )
def finetune_mistral():
    """Assemble the recipe and executor, then submit the experiment.

    Runs a single task sequentially without tailing logs; use the ``nemo
    experiment`` CLI to follow progress afterwards.
    """
    node_count, devices_per_node = 1, 1
    training_recipe = configure_recipe(nodes=node_count, gpus_per_node=devices_per_node)
    sky_executor = skypilot_executor(nodes=node_count, gpus_per_node=devices_per_node)
    with run.Experiment("demo-mistral-7b-peft-finetuning-exp4") as exp:
        exp.add(training_recipe, executor=sky_executor, name="demo_mistral_peft_finetuning-exp4")
        exp.run(sequential=True, tail_logs=False)
# Script entry point: submit the fine-tuning experiment when run directly.
if __name__ == "__main__":
    finetune_mistral()
⚙︎ Job submitted, ID: 1
[06:23:54] Error running job mistral_peft_finetuning_demo_1: invalid literal for int() with base 10: '[1]'
Traceback (most recent call last):
...
File ".../nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py", line 115, in schedule
task_details = SkypilotJobsExecutor.status(app_id=app_id)
File ".../nemo_run/core/execution/skypilot_jobs.py", line 228, in status
_, _, job_id = cls.parse_app(app_id)
File ".../nemo_run/core/execution/skypilot_jobs.py", line 151, in parse_app
return cluster, task, int(job_id)
ValueError: invalid literal for int() with base 10: '[1]'
Summary
When using `run.SkypilotJobsExecutor` against a remote SkyPilot API server on Kubernetes, the job is accepted by SkyPilot, but NeMo Run crashes immediately afterward while parsing the returned job id. The managed job keeps running on the cluster, but NeMo Run loses track of it.

Environment
- `nemo-run` version: 0.8.1
- `skypilot` version: 0.11.2
- `SKYPILOT_API_SERVER_ENDPOINT`: `<skypilot api endpoint>`

Minimal Reproducer
Observed Output
Expected Behavior
`nemo experiment status` and `nemo experiment logs` should continue working for the submitted job.

Actual Behavior
The job is submitted (`Job submitted, ID: 1`), but NeMo Run immediately crashes with the `ValueError` shown above and loses track of the submitted job.