From 63d5824b0767a0113177d9073b203a5af99d02b9 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:32:03 +0000 Subject: [PATCH 01/14] add support for mlflow --- src/instructlab/training/config.py | 30 +++++ src/instructlab/training/logger.py | 187 +++++++++++++++++++++++++++- src/instructlab/training/main_ds.py | 57 ++++++++- 3 files changed, 266 insertions(+), 8 deletions(-) diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index 599bab4d..1fb11fd8 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -305,3 +305,33 @@ class TrainingArgs(BaseModel): log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field( default="INFO" ) + + logger_type: str = Field( + default="async", + description="Comma-separated list of loggers to use: tensorboard, wandb, async, mlflow", + ) + + run_name: str | None = Field( + default=None, + description="Run name for logging. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}", + ) + + mlflow_tracking_uri: str | None = Field( + default=None, + description="MLflow tracking server URI (e.g., 'http://localhost:5000'). Falls back to MLFLOW_TRACKING_URI env var.", + ) + + mlflow_experiment_name: str | None = Field( + default=None, + description="MLflow experiment name. Falls back to MLFLOW_EXPERIMENT_NAME env var.", + ) + + wandb_project: str | None = Field( + default=None, + description="Weights & Biases project name.", + ) + + wandb_entity: str | None = Field( + default=None, + description="Weights & Biases team/entity name.", + ) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index d92b8975..2793d996 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -2,7 +2,7 @@ This module provides a logging system for training machine learning models, supporting multiple logging backends including TensorBoard (tensorboard), Weights & Biases (wandb), -and structured JSONL logging (async). +MLflow (mlflow), and structured JSONL logging (async). Example Usage: ```python @@ -73,6 +73,12 @@ except ImportError: wandb = None # type: ignore +try: + # Third Party + import mlflow +except ImportError: + mlflow = None # type: ignore + # Third Party from rich.logging import RichHandler import torch @@ -581,6 +587,148 @@ def emit(self, record: logging.LogRecord): self._wandb_run.log(flat_dict, step=step) +class MLflowHandler(logging.Handler): + """Logger that sends metrics to MLflow. + + This handler expects a (nested) dictionary of metrics to be logged with string keys. + A step can be specified by passing `extra={"step": }` to the logging method. + To log hyperparameters, pass a (nested) mapping of hyperparameters to the logging method + and set `extra={"hparams": True}`. + + Example: + ```python + import logging + from instructlab.training.logger import MLflowHandler + + # Create handler + handler = MLflowHandler( + level=logging.INFO, + run_name="experiment_{time}", + tracking_uri="http://localhost:5000", + experiment_name="my_experiment" + ) + + # Create logger + logger = logging.getLogger("metrics") + logger.addHandler(handler) + logger.setLevel(logging.INFO) + + # Log metrics + logger.info( + { + "training": { + "loss": 0.5, + "accuracy": 0.95 + } + }, + extra={"step": 100} + ) + + # Log hyperparameters + logger.info( + { + "learning_rate": 0.001, + "batch_size": 32 + }, + extra={"hparams": True} + ) + ``` + """ + + def __init__( + self, + level: int = logging.INFO, + run_name: str | None = None, + log_dir: str | os.PathLike = "logs", + tracking_uri: str | None = None, + experiment_name: str | None = None, + **mlflow_init_kwargs: Any, + ): + """Initialize the MLflow logger and check for required dependencies. + + Args: + level: The logging level for this handler + run_name: Name of the run, can contain placeholders + log_dir: Directory where MLflow artifacts should be stored (used as artifact location) + tracking_uri: MLflow tracking server URI (e.g., "http://localhost:5000") + experiment_name: Name of the MLflow experiment + **mlflow_init_kwargs: Additional keyword arguments passed to mlflow.start_run() + """ + super().__init__(level) + + self.run_name = _substitute_placeholders(run_name) + self.log_dir = Path(log_dir) + self.tracking_uri = tracking_uri + self.experiment_name = experiment_name + self.mlflow_init_kwargs = mlflow_init_kwargs.copy() + + self._mlflow_run = None + + def _setup(self): + """Initialize the MLflow run with the configured settings.""" + if mlflow is None: + msg = ( + "Could not initialize MLflowHandler because package mlflow could not be imported.\n" + "Please ensure it is installed by running 'pip install mlflow'" + ) + raise RuntimeError(msg) + + if self.tracking_uri: + mlflow.set_tracking_uri(self.tracking_uri) + + if self.experiment_name: + mlflow.set_experiment(self.experiment_name) + + self._mlflow_run = mlflow.start_run( + run_name=self.run_name, + **self.mlflow_init_kwargs + ) + + def emit(self, record: logging.LogRecord): + """Emit a log record to MLflow. + + Args: + record: The log record to emit + """ + if self._mlflow_run is None: + self._setup() + + if not isinstance(record.msg, Mapping): + warnings.warn( + f"MLflowHandler expected a mapping, got {type(record.msg)}. Skipping log. " + "Please ensure the handler is configured correctly to filter out non-mapping objects." + ) + return + + flat_dict = _flatten_dict(record.msg, sep=".") + step = getattr(record, "step", None) + + if getattr(record, "hparams", None): + # Log as parameters - MLflow params must be strings + params_dict = {k: str(v) for k, v in flat_dict.items()} + mlflow.log_params(params_dict) + return + + # Filter to only numeric values for metrics + metrics_dict = {} + for k, v in flat_dict.items(): + try: + metrics_dict[k] = float(v) + except (ValueError, TypeError): + # Skip non-numeric values for metrics + pass + + if metrics_dict: + mlflow.log_metrics(metrics_dict, step=step) + + def close(self): + """End the MLflow run and cleanup resources.""" + if self._mlflow_run is not None: + mlflow.end_run() + self._mlflow_run = None + super().close() + + class AsyncStructuredHandler(logging.Handler): """Logger that asynchronously writes data to a JSONL file. @@ -708,7 +856,16 @@ def setup_root_logger(level="DEBUG"): ) -def setup_metric_logger(loggers, run_name, output_dir): +def setup_metric_logger( + loggers, + run_name, + output_dir, + *, + mlflow_tracking_uri: str | None = None, + mlflow_experiment_name: str | None = None, + wandb_project: str | None = None, + wandb_entity: str | None = None, +): """Configure the metric logging system with specified backends. This function sets up a comprehensive logging configuration that supports @@ -717,10 +874,16 @@ def setup_metric_logger(loggers, run_name, output_dir): Args: loggers: A string or list of strings specifying which logging backends to use. - Supported values: "tensorboard", "wandb", "async" + Supported values: "tensorboard", "wandb", "mlflow", "async" run_name: Name for the current training run. Can include placeholders like {time}, {rank}, {utc_time}, {local_rank}. output_dir: Directory where log files will be stored + mlflow_tracking_uri: MLflow tracking server URI (e.g., "http://localhost:5000"). + Falls back to MLFLOW_TRACKING_URI environment variable if not provided. + mlflow_experiment_name: MLflow experiment name. + Falls back to MLFLOW_EXPERIMENT_NAME environment variable if not provided. + wandb_project: Weights & Biases project name. + wandb_entity: Weights & Biases team/entity name. Example: ```python @@ -731,11 +894,13 @@ def setup_metric_logger(loggers, run_name, output_dir): output_dir="logs" ) - # Setup logging with a single backend + # Setup logging with MLflow setup_metric_logger( - loggers="tensorboard", + loggers=["mlflow"], run_name="my_run", - output_dir="logs" + output_dir="logs", + mlflow_tracking_uri="http://localhost:5000", + mlflow_experiment_name="my_experiment" ) ``` """ @@ -781,6 +946,16 @@ def setup_metric_logger(loggers, run_name, output_dir): "()": WandbHandler, "log_dir": output_dir, "run_name": run_name, + "project": wandb_project, + "entity": wandb_entity, + "filters": ["is_mapping", "is_rank0"], + }, + "mlflow": { + "()": MLflowHandler, + "log_dir": output_dir, + "run_name": run_name, + "tracking_uri": mlflow_tracking_uri or os.environ.get("MLFLOW_TRACKING_URI"), + "experiment_name": mlflow_experiment_name or os.environ.get("MLFLOW_EXPERIMENT_NAME"), "filters": ["is_mapping", "is_rank0"], }, }, diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index e05c9eae..f623c055 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -275,7 +275,15 @@ def main(args): "DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags." ) - setup_metric_logger(args.logger_type, args.run_name, args.output_dir) + setup_metric_logger( + args.logger_type, + args.run_name, + args.output_dir, + mlflow_tracking_uri=getattr(args, 'mlflow_tracking_uri', None), + mlflow_experiment_name=getattr(args, 'mlflow_experiment_name', None), + wandb_project=getattr(args, 'wandb_project', None), + wandb_entity=getattr(args, 'wandb_entity', None), + ) metric_logger = logging.getLogger("instructlab.training.metrics") if os.environ["LOCAL_RANK"] == "0": metric_logger.info(vars(args), extra={"hparams": True}) @@ -460,7 +468,15 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: # Enable package logging propagation before setting up loggers propagate_package_logs(True) setup_root_logger(train_args.log_level) - setup_metric_logger("async", None, train_args.ckpt_output_dir) + setup_metric_logger( + train_args.logger_type, + train_args.run_name, + train_args.ckpt_output_dir, + mlflow_tracking_uri=train_args.mlflow_tracking_uri, + mlflow_experiment_name=train_args.mlflow_experiment_name, + wandb_project=train_args.wandb_project, + wandb_entity=train_args.wandb_entity, + ) logger = logging.getLogger("instructlab.training") logger.info("Starting training setup...") @@ -548,9 +564,22 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: f"--adamw_beta1={train_args.adamw_betas[0]}", f"--adamw_beta2={train_args.adamw_betas[1]}", f"--adamw_eps={train_args.adamw_eps}", + f"--logger_type={train_args.logger_type}", ] ) + # Add optional logging parameters + if train_args.run_name is not None: + command.append(f"--run_name={train_args.run_name}") + if train_args.mlflow_tracking_uri is not None: + command.append(f"--mlflow_tracking_uri={train_args.mlflow_tracking_uri}") + if train_args.mlflow_experiment_name is not None: + command.append(f"--mlflow_experiment_name={train_args.mlflow_experiment_name}") + if train_args.wandb_project is not None: + command.append(f"--wandb_project={train_args.wandb_project}") + if train_args.wandb_entity is not None: + command.append(f"--wandb_entity={train_args.wandb_entity}") + if train_args.pretraining_config is not None: command.append(f"--block-size={train_args.pretraining_config.block_size}") command.append( @@ -769,6 +798,30 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: parser.add_argument("--log_level", type=str, default="INFO") parser.add_argument("--run_name", type=str, default=None) parser.add_argument("--logger_type", type=str, default="async") + parser.add_argument( + "--mlflow_tracking_uri", + type=str, + default=None, + help="MLflow tracking server URI (e.g., 'http://localhost:5000')", + ) + parser.add_argument( + "--mlflow_experiment_name", + type=str, + default=None, + help="MLflow experiment name", + ) + parser.add_argument( + "--wandb_project", + type=str, + default=None, + help="Weights & Biases project name", + ) + parser.add_argument( + "--wandb_entity", + type=str, + default=None, + help="Weights & Biases team/entity name", + ) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--mock_data", action="store_true") parser.add_argument("--mock_len", type=int, default=2600) From 4e49652ba1942940a7e33ae74bc55333b2f6b319 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:41:28 +0000 Subject: [PATCH 02/14] fix formatting changes --- src/instructlab/training/logger.py | 9 +++++---- src/instructlab/training/main_ds.py | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index 2793d996..f3714b3e 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -680,8 +680,7 @@ def _setup(self): mlflow.set_experiment(self.experiment_name) self._mlflow_run = mlflow.start_run( - run_name=self.run_name, - **self.mlflow_init_kwargs + run_name=self.run_name, **self.mlflow_init_kwargs ) def emit(self, record: logging.LogRecord): @@ -954,8 +953,10 @@ def setup_metric_logger( "()": MLflowHandler, "log_dir": output_dir, "run_name": run_name, - "tracking_uri": mlflow_tracking_uri or os.environ.get("MLFLOW_TRACKING_URI"), - "experiment_name": mlflow_experiment_name or os.environ.get("MLFLOW_EXPERIMENT_NAME"), + "tracking_uri": mlflow_tracking_uri + or os.environ.get("MLFLOW_TRACKING_URI"), + "experiment_name": mlflow_experiment_name + or os.environ.get("MLFLOW_EXPERIMENT_NAME"), "filters": ["is_mapping", "is_rank0"], }, }, diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index f623c055..43a263ac 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -279,10 +279,10 @@ def main(args): args.logger_type, args.run_name, args.output_dir, - mlflow_tracking_uri=getattr(args, 'mlflow_tracking_uri', None), - mlflow_experiment_name=getattr(args, 'mlflow_experiment_name', None), - wandb_project=getattr(args, 'wandb_project', None), - wandb_entity=getattr(args, 'wandb_entity', None), + mlflow_tracking_uri=getattr(args, "mlflow_tracking_uri", None), + mlflow_experiment_name=getattr(args, "mlflow_experiment_name", None), + wandb_project=getattr(args, "wandb_project", None), + wandb_entity=getattr(args, "wandb_entity", None), ) metric_logger = logging.getLogger("instructlab.training.metrics") if os.environ["LOCAL_RANK"] == "0": From af312318212d8c0733c9cac1ce348f2ade9d3c8d Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:56:38 +0000 Subject: [PATCH 03/14] Add tensorboard_log_dir to TrainingArgs for configurable TensorBoard logging - Add tensorboard_log_dir field to TrainingArgs in config.py - Update setup_metric_logger to use tensorboard_log_dir when provided - Add CLI argument for tensorboard_log_dir - Wire tensorboard_log_dir through run_training() to subprocess command This allows users to specify a custom directory for TensorBoard logs, defaulting to output_dir if not specified. Co-Authored-By: Claude Opus 4.5 --- src/instructlab/training/config.py | 5 +++++ src/instructlab/training/logger.py | 4 +++- src/instructlab/training/main_ds.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index 1fb11fd8..7b1c5f7f 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -335,3 +335,8 @@ class TrainingArgs(BaseModel): default=None, description="Weights & Biases team/entity name.", ) + + tensorboard_log_dir: str | None = Field( + default=None, + description="Directory for TensorBoard logs. Defaults to ckpt_output_dir if not specified.", + ) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index f3714b3e..059d12b6 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -864,6 +864,7 @@ def setup_metric_logger( mlflow_experiment_name: str | None = None, wandb_project: str | None = None, wandb_entity: str | None = None, + tensorboard_log_dir: str | None = None, ): """Configure the metric logging system with specified backends. @@ -883,6 +884,7 @@ def setup_metric_logger( Falls back to MLFLOW_EXPERIMENT_NAME environment variable if not provided. wandb_project: Weights & Biases project name. wandb_entity: Weights & Biases team/entity name. + tensorboard_log_dir: Directory for TensorBoard logs. Defaults to output_dir if not provided. Example: ```python @@ -937,7 +939,7 @@ def setup_metric_logger( }, "tensorboard": { "()": TensorBoardHandler, - "log_dir": output_dir, + "log_dir": tensorboard_log_dir or output_dir, "run_name": run_name, "filters": ["is_mapping", "is_rank0"], }, diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 43a263ac..22fbae68 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -283,6 +283,7 @@ def main(args): mlflow_experiment_name=getattr(args, "mlflow_experiment_name", None), wandb_project=getattr(args, "wandb_project", None), wandb_entity=getattr(args, "wandb_entity", None), + tensorboard_log_dir=getattr(args, "tensorboard_log_dir", None), ) metric_logger = logging.getLogger("instructlab.training.metrics") if os.environ["LOCAL_RANK"] == "0": @@ -476,6 +477,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: mlflow_experiment_name=train_args.mlflow_experiment_name, wandb_project=train_args.wandb_project, wandb_entity=train_args.wandb_entity, + tensorboard_log_dir=train_args.tensorboard_log_dir, ) logger = logging.getLogger("instructlab.training") @@ -579,6 +581,8 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: command.append(f"--wandb_project={train_args.wandb_project}") if train_args.wandb_entity is not None: command.append(f"--wandb_entity={train_args.wandb_entity}") + if train_args.tensorboard_log_dir is not None: + command.append(f"--tensorboard_log_dir={train_args.tensorboard_log_dir}") if train_args.pretraining_config is not None: command.append(f"--block-size={train_args.pretraining_config.block_size}") @@ -822,6 +826,12 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: default=None, help="Weights & Biases team/entity name", ) + parser.add_argument( + "--tensorboard_log_dir", + type=str, + default=None, + help="Directory for TensorBoard logs. Defaults to output_dir if not specified.", + ) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--mock_data", action="store_true") parser.add_argument("--mock_len", type=int, default=2600) From 938cac3a532df93bb3aafe4073ca646244fb5705 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 27 Jan 2026 16:50:02 +0000 Subject: [PATCH 04/14] Address PR review feedback - Replace defensive getattr() with direct attribute access in main_ds.py since args are guaranteed to exist from argparse defaults - Remove unused log_dir parameter from MLflowHandler - Add debug logging for non-numeric metrics skipped by MLflowHandler Co-Authored-By: Claude Opus 4.5 --- src/instructlab/training/logger.py | 13 ++++++++----- src/instructlab/training/main_ds.py | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index 059d12b6..f09297b9 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -639,7 +639,6 @@ def __init__( self, level: int = logging.INFO, run_name: str | None = None, - log_dir: str | os.PathLike = "logs", tracking_uri: str | None = None, experiment_name: str | None = None, **mlflow_init_kwargs: Any, @@ -649,7 +648,6 @@ def __init__( Args: level: The logging level for this handler run_name: Name of the run, can contain placeholders - log_dir: Directory where MLflow artifacts should be stored (used as artifact location) tracking_uri: MLflow tracking server URI (e.g., "http://localhost:5000") experiment_name: Name of the MLflow experiment **mlflow_init_kwargs: Additional keyword arguments passed to mlflow.start_run() @@ -657,7 +655,6 @@ def __init__( super().__init__(level) self.run_name = _substitute_placeholders(run_name) - self.log_dir = Path(log_dir) self.tracking_uri = tracking_uri self.experiment_name = experiment_name self.mlflow_init_kwargs = mlflow_init_kwargs.copy() @@ -710,12 +707,19 @@ def emit(self, record: logging.LogRecord): # Filter to only numeric values for metrics metrics_dict = {} + skipped_keys = [] for k, v in flat_dict.items(): try: metrics_dict[k] = float(v) except (ValueError, TypeError): # Skip non-numeric values for metrics - pass + skipped_keys.append(k) + + if skipped_keys: + logging.debug( + f"MLflowHandler skipped non-numeric metrics: {skipped_keys}. " + "Only numeric values can be logged as MLflow metrics." + ) if metrics_dict: mlflow.log_metrics(metrics_dict, step=step) @@ -953,7 +957,6 @@ def setup_metric_logger( }, "mlflow": { "()": MLflowHandler, - "log_dir": output_dir, "run_name": run_name, "tracking_uri": mlflow_tracking_uri or os.environ.get("MLFLOW_TRACKING_URI"), diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 22fbae68..ad042af5 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -279,11 +279,11 @@ def main(args): args.logger_type, args.run_name, args.output_dir, - mlflow_tracking_uri=getattr(args, "mlflow_tracking_uri", None), - mlflow_experiment_name=getattr(args, "mlflow_experiment_name", None), - wandb_project=getattr(args, "wandb_project", None), - wandb_entity=getattr(args, "wandb_entity", None), - tensorboard_log_dir=getattr(args, "tensorboard_log_dir", None), + mlflow_tracking_uri=args.mlflow_tracking_uri, + mlflow_experiment_name=args.mlflow_experiment_name, + wandb_project=args.wandb_project, + wandb_entity=args.wandb_entity, + tensorboard_log_dir=args.tensorboard_log_dir, ) metric_logger = logging.getLogger("instructlab.training.metrics") if os.environ["LOCAL_RANK"] == "0": From 9dbd48cb3c1a12fd849657dec11bfa58687f8dcc Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 01:57:19 +0000 Subject: [PATCH 05/14] removes generic `run_name` and `logger_type` kwargs --- src/instructlab/training/config.py | 20 ++++---- src/instructlab/training/logger.py | 73 +++++++++++++++++------------ src/instructlab/training/main_ds.py | 29 ++++++++---- 3 files changed, 73 insertions(+), 49 deletions(-) diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index 7b1c5f7f..1b9a3589 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -306,16 +306,6 @@ class TrainingArgs(BaseModel): default="INFO" ) - logger_type: str = Field( - default="async", - description="Comma-separated list of loggers to use: tensorboard, wandb, async, mlflow", - ) - - run_name: str | None = Field( - default=None, - description="Run name for logging. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}", - ) - mlflow_tracking_uri: str | None = Field( default=None, description="MLflow tracking server URI (e.g., 'http://localhost:5000'). Falls back to MLFLOW_TRACKING_URI env var.", @@ -326,6 +316,11 @@ class TrainingArgs(BaseModel): description="MLflow experiment name. Falls back to MLFLOW_EXPERIMENT_NAME env var.", ) + mlflow_run_name: str | None = Field( + default=None, + description="MLflow run name. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}", + ) + wandb_project: str | None = Field( default=None, description="Weights & Biases project name.", @@ -336,6 +331,11 @@ class TrainingArgs(BaseModel): description="Weights & Biases team/entity name.", ) + wandb_run_name: str | None = Field( + default=None, + description="Weights & Biases run name. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}", + ) + tensorboard_log_dir: str | None = Field( default=None, description="Directory for TensorBoard logs. Defaults to ckpt_output_dir if not specified.", diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index f09297b9..ec544c0e 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -860,68 +860,81 @@ def setup_root_logger(level="DEBUG"): def setup_metric_logger( - loggers, - run_name, output_dir, *, mlflow_tracking_uri: str | None = None, mlflow_experiment_name: str | None = None, + mlflow_run_name: str | None = None, wandb_project: str | None = None, wandb_entity: str | None = None, + wandb_run_name: str | None = None, tensorboard_log_dir: str | None = None, ): - """Configure the metric logging system with specified backends. + """Configure the metric logging system with auto-detected backends. This function sets up a comprehensive logging configuration that supports multiple logging backends simultaneously. It configures filters, handlers, - and loggers for structured metric logging. + and loggers for structured metric logging. Backends are automatically + detected based on the presence of their configuration parameters. Args: - loggers: A string or list of strings specifying which logging backends to use. - Supported values: "tensorboard", "wandb", "mlflow", "async" - run_name: Name for the current training run. Can include placeholders like - {time}, {rank}, {utc_time}, {local_rank}. output_dir: Directory where log files will be stored mlflow_tracking_uri: MLflow tracking server URI (e.g., "http://localhost:5000"). Falls back to MLFLOW_TRACKING_URI environment variable if not provided. + When set (or env var present), MLflow logging is automatically enabled. mlflow_experiment_name: MLflow experiment name. Falls back to MLFLOW_EXPERIMENT_NAME environment variable if not provided. + mlflow_run_name: MLflow run name. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}. wandb_project: Weights & Biases project name. + When set (or WANDB_PROJECT env var present), wandb logging is automatically enabled. wandb_entity: Weights & Biases team/entity name. - tensorboard_log_dir: Directory for TensorBoard logs. Defaults to output_dir if not provided. + wandb_run_name: Weights & Biases run name. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}. + tensorboard_log_dir: Directory for TensorBoard logs. + When set, TensorBoard logging is automatically enabled. Example: ```python - # Setup logging with multiple backends + # Setup logging with MLflow (auto-detected from tracking URI) setup_metric_logger( - loggers=["tensorboard", "wandb", "async"], - run_name="experiment_{time}", - output_dir="logs" + output_dir="logs", + mlflow_tracking_uri="http://localhost:5000", + mlflow_experiment_name="my_experiment", + mlflow_run_name="my_run" ) - # Setup logging with MLflow + # Setup logging with wandb (auto-detected from project) setup_metric_logger( - loggers=["mlflow"], - run_name="my_run", output_dir="logs", - mlflow_tracking_uri="http://localhost:5000", - mlflow_experiment_name="my_experiment" + wandb_project="my_project", + wandb_run_name="my_run" + ) + + # Setup logging with TensorBoard (auto-detected from log_dir) + setup_metric_logger( + output_dir="logs", + tensorboard_log_dir="logs/tensorboard" ) ``` """ - if not loggers: - return - # Enable package logging propagate_package_logs() - if isinstance(loggers, str): - loggers = loggers.split(",") - loggers = [logger.strip() for logger in loggers] + # Auto-detect which loggers to enable based on configuration + detected_loggers = [] + if mlflow_tracking_uri or os.environ.get("MLFLOW_TRACKING_URI"): + detected_loggers.append("mlflow") + if wandb_project or os.environ.get("WANDB_PROJECT"): + detected_loggers.append("wandb") + if tensorboard_log_dir: + detected_loggers.append("tensorboard") + + # Always include async logger for file logging + loggers = detected_loggers if detected_loggers else ["async"] + # Also include async logger alongside other loggers for file-based logging + if detected_loggers and "async" not in loggers: + loggers.append("async") async_filters = ["is_mapping"] - if run_name is not None and "{rank}" not in run_name: - async_filters.append("is_rank0") logging_config = { "version": 1, @@ -938,26 +951,26 @@ def setup_metric_logger( "async": { "()": AsyncStructuredHandler, "log_dir": output_dir, - "run_name": run_name, + "run_name": None, # Uses default template "filters": async_filters, }, "tensorboard": { "()": TensorBoardHandler, "log_dir": tensorboard_log_dir or output_dir, - "run_name": run_name, + "run_name": None, # Uses default template "filters": ["is_mapping", "is_rank0"], }, "wandb": { "()": WandbHandler, "log_dir": output_dir, - "run_name": run_name, + "run_name": wandb_run_name, "project": wandb_project, "entity": wandb_entity, "filters": ["is_mapping", "is_rank0"], }, "mlflow": { "()": MLflowHandler, - "run_name": run_name, + "run_name": mlflow_run_name, "tracking_uri": mlflow_tracking_uri or os.environ.get("MLFLOW_TRACKING_URI"), "experiment_name": mlflow_experiment_name diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index ad042af5..c3af5ba8 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -276,13 +276,13 @@ def main(args): ) setup_metric_logger( - args.logger_type, - args.run_name, args.output_dir, mlflow_tracking_uri=args.mlflow_tracking_uri, mlflow_experiment_name=args.mlflow_experiment_name, + mlflow_run_name=args.mlflow_run_name, wandb_project=args.wandb_project, wandb_entity=args.wandb_entity, + wandb_run_name=args.wandb_run_name, tensorboard_log_dir=args.tensorboard_log_dir, ) metric_logger = logging.getLogger("instructlab.training.metrics") @@ -470,13 +470,13 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: propagate_package_logs(True) setup_root_logger(train_args.log_level) setup_metric_logger( - train_args.logger_type, - train_args.run_name, train_args.ckpt_output_dir, mlflow_tracking_uri=train_args.mlflow_tracking_uri, mlflow_experiment_name=train_args.mlflow_experiment_name, + mlflow_run_name=train_args.mlflow_run_name, wandb_project=train_args.wandb_project, wandb_entity=train_args.wandb_entity, + wandb_run_name=train_args.wandb_run_name, tensorboard_log_dir=train_args.tensorboard_log_dir, ) @@ -566,21 +566,22 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: f"--adamw_beta1={train_args.adamw_betas[0]}", f"--adamw_beta2={train_args.adamw_betas[1]}", f"--adamw_eps={train_args.adamw_eps}", - f"--logger_type={train_args.logger_type}", ] ) # Add optional logging parameters - if train_args.run_name is not None: - command.append(f"--run_name={train_args.run_name}") if train_args.mlflow_tracking_uri is not None: command.append(f"--mlflow_tracking_uri={train_args.mlflow_tracking_uri}") if train_args.mlflow_experiment_name is not None: command.append(f"--mlflow_experiment_name={train_args.mlflow_experiment_name}") + if train_args.mlflow_run_name is not None: + command.append(f"--mlflow_run_name={train_args.mlflow_run_name}") if train_args.wandb_project is not None: command.append(f"--wandb_project={train_args.wandb_project}") if train_args.wandb_entity is not None: command.append(f"--wandb_entity={train_args.wandb_entity}") + if train_args.wandb_run_name is not None: + command.append(f"--wandb_run_name={train_args.wandb_run_name}") if train_args.tensorboard_log_dir is not None: command.append(f"--tensorboard_log_dir={train_args.tensorboard_log_dir}") @@ -800,8 +801,6 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: help="Save full model state using Accelerate after finishing an epoch.", ) parser.add_argument("--log_level", type=str, default="INFO") - parser.add_argument("--run_name", type=str, default=None) - parser.add_argument("--logger_type", type=str, default="async") parser.add_argument( "--mlflow_tracking_uri", type=str, @@ -814,6 +813,12 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: default=None, help="MLflow experiment name", ) + parser.add_argument( + "--mlflow_run_name", + type=str, + default=None, + help="MLflow run name. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}", + ) parser.add_argument( "--wandb_project", type=str, @@ -826,6 +831,12 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: default=None, help="Weights & Biases team/entity name", ) + parser.add_argument( + "--wandb_run_name", + type=str, + default=None, + help="Weights & Biases run name. Supports placeholders: {time}, {rank}, {utc_time}, {local_rank}", + ) parser.add_argument( "--tensorboard_log_dir", type=str, From 4f2f0fe34525b7764f2fabd608c9eea3ef00a7ca Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 16:09:14 +0000 Subject: [PATCH 06/14] review comments --- src/instructlab/training/logger.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index ec544c0e..7596ad01 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -921,7 +921,13 @@ def setup_metric_logger( # Auto-detect which loggers to enable based on configuration detected_loggers = [] - if mlflow_tracking_uri or os.environ.get("MLFLOW_TRACKING_URI"): + if ( + mlflow_tracking_uri + or mlflow_experiment_name + or mlflow_run_name + or os.environ.get("MLFLOW_TRACKING_URI") + or os.environ.get("MLFLOW_EXPERIMENT_NAME") + ): detected_loggers.append("mlflow") if wandb_project or os.environ.get("WANDB_PROJECT"): detected_loggers.append("wandb") From a3471d7a1fc61cb092c835c89c8c86ba8553d309 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 18:09:01 +0000 Subject: [PATCH 07/14] something something mlflow active runs --- src/instructlab/training/logger.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index 7596ad01..90686e1c 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -676,9 +676,14 @@ def _setup(self): if self.experiment_name: mlflow.set_experiment(self.experiment_name) - self._mlflow_run = mlflow.start_run( - run_name=self.run_name, **self.mlflow_init_kwargs - ) + # Reuse existing active run if one exists, otherwise start a new one + active = mlflow.active_run() + if active is not None: + self._mlflow_run = active + else: + self._mlflow_run = mlflow.start_run( + run_name=self.run_name, **self.mlflow_init_kwargs + ) def emit(self, record: logging.LogRecord): """Emit a log record to MLflow. From 73d11c48769a1fc342db6c5c8f414e9b56cbf219 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 18:26:59 +0000 Subject: [PATCH 08/14] review comments --- src/instructlab/training/logger.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index 90686e1c..d8f24431 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -660,6 +660,7 @@ def __init__( self.mlflow_init_kwargs = mlflow_init_kwargs.copy() self._mlflow_run = None + self._owns_mlflow_run = False def _setup(self): """Initialize the MLflow run with the configured settings.""" @@ -680,10 +681,12 @@ def _setup(self): active = mlflow.active_run() if active is not None: self._mlflow_run = active + self._owns_mlflow_run = False else: self._mlflow_run = mlflow.start_run( run_name=self.run_name, **self.mlflow_init_kwargs ) + self._owns_mlflow_run = True def emit(self, record: logging.LogRecord): """Emit a log record to MLflow. @@ -732,8 +735,11 @@ def emit(self, record: logging.LogRecord): def close(self): """End the MLflow run and cleanup resources.""" if self._mlflow_run is not None: - mlflow.end_run() + # Only end the run if we started it (not if we reused an existing one) + if self._owns_mlflow_run: + mlflow.end_run() self._mlflow_run = None + self._owns_mlflow_run = False super().close() From 8517b745f58be46317c50d9d72946aa5d0f6c3f7 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 19:29:47 +0000 Subject: [PATCH 09/14] coderabbit --- src/instructlab/training/logger.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index d8f24431..19cd2af8 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -888,6 +888,13 @@ def setup_metric_logger( and loggers for structured metric logging. Backends are automatically detected based on the presence of their configuration parameters. + Note: + Run names are configured per-backend (e.g., `mlflow_run_name`, `wandb_run_name`) + rather than using a shared global run name. This design provides explicit control + over each backend's naming without coupling them together. File-based loggers + (async JSONL, TensorBoard) use a default template "{time}_rank{rank}" when no + run name is specified, ensuring unique identifiers across distributed runs. + Args: output_dir: Directory where log files will be stored mlflow_tracking_uri: MLflow tracking server URI (e.g., "http://localhost:5000"). From 22dd650b5d59b9d4b3f9149f538b4aa7dc3863c8 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 21:38:19 +0000 Subject: [PATCH 10/14] adds install targets for logging backends --- README.md | 21 +++++++++++++++++++++ pyproject.toml | 3 +++ src/instructlab/training/logger.py | 8 ++++---- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c2c45c9a..b59992a0 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ The library now supports reasoning traces through the `reasoning_content` field - [Installing](#installing-the-library) - [Additional Nvidia packages](#additional-nvidia-packages) + - [Optional logging dependencies](#optional-logging-dependencies) - [Using the library](#using-the-library) - [Data format](#data-format) - [Reasoning content support](#reasoning-content-support-1) @@ -68,6 +69,26 @@ Editable install (development) pip install -e .[cuda] ``` +### Optional Logging Dependencies + +The library supports optional logging backends for experiment tracking. Install the ones you need: + +```bash +# MLflow logging +pip install 'instructlab-training[mlflow]' + +# Weights & Biases logging +pip install 'instructlab-training[wandb]' + +# TensorBoard logging +pip install 'instructlab-training[tensorboard]' + +# Install multiple logging backends at once +pip install 'instructlab-training[mlflow,wandb,tensorboard]' +``` + +For more details on configuring logging, see the [Logging Documentation](docs/logging.md). + ## Using the library See the `examples` dir for guided sample notebooks on library usage. Below provides some added details on library options: diff --git a/pyproject.toml b/pyproject.toml index 12def6fc..d9ee3dab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,9 @@ optional-dependencies.cuda = { file = ["requirements-cuda.txt"] } optional-dependencies.rocm = { file = ["requirements-rocm.txt"] } optional-dependencies.hpu = { file = ["requirements-hpu.txt"] } optional-dependencies.deepspeed = { file = ["requirements-deepspeed.txt"] } +optional-dependencies.mlflow = { file = ["requirements-mlflow.txt"] } +optional-dependencies.wandb = { file = ["requirements-wandb.txt"] } +optional-dependencies.tensorboard = { file = ["requirements-tensorboard.txt"] } [tool.setuptools.packages.find] where = ["src"] diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index 19cd2af8..8d9c9fca 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -418,7 +418,7 @@ def _setup(self): if SummaryWriter is None: msg = ( "Could not initialize TensorBoardHandler because package tensorboard could not be imported.\n" - "Please ensure it is installed by running 'pip install tensorboard'" + "Please ensure it is installed by running: pip install 'instructlab-training[tensorboard]'" ) raise RuntimeError(msg) os.makedirs(self.tboard_init_kwargs["log_dir"], exist_ok=True) @@ -556,8 +556,8 @@ def _setup(self): """Initialize the wandb run with the configured settings.""" if wandb is None: msg = ( - "Could not initialize WandbLogger because package wandb could not be imported.\n" - "Please ensure it is installed by running 'pip install wandb'" + "Could not initialize WandbHandler because package wandb could not be imported.\n" + "Please ensure it is installed by running: pip install 'instructlab-training[wandb]'" ) raise RuntimeError(msg) self._wandb_run = wandb.init(**self.wandb_init_kwargs) @@ -667,7 +667,7 @@ def _setup(self): if mlflow is None: msg = ( "Could not initialize MLflowHandler because package mlflow could not be imported.\n" - "Please ensure it is installed by running 'pip install mlflow'" + "Please ensure it is installed by running: pip install 'instructlab-training[mlflow]'" ) raise RuntimeError(msg) From 3f7bfc26e96a6460e9af8886e0915dad31627c9b Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 21:44:08 +0000 Subject: [PATCH 11/14] add targets for loggers --- requirements-mlflow.txt | 1 + requirements-tensorboard.txt | 1 + requirements-wandb.txt | 1 + 3 files changed, 3 insertions(+) create mode 100644 requirements-mlflow.txt create mode 100644 requirements-tensorboard.txt create mode 100644 requirements-wandb.txt diff --git a/requirements-mlflow.txt b/requirements-mlflow.txt new file mode 100644 index 00000000..6d0d17c2 --- /dev/null +++ b/requirements-mlflow.txt @@ -0,0 +1 @@ +mlflow diff --git a/requirements-tensorboard.txt b/requirements-tensorboard.txt new file mode 100644 index 00000000..8ba46efc --- /dev/null +++ b/requirements-tensorboard.txt @@ -0,0 +1 @@ +tensorboard diff --git a/requirements-wandb.txt b/requirements-wandb.txt new file mode 100644 index 00000000..fff96000 --- /dev/null +++ b/requirements-wandb.txt @@ -0,0 +1 @@ +wandb From fbd91c60286f7777040f50fc72ab92777acf44d7 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Tue, 3 Feb 2026 23:36:14 +0000 Subject: [PATCH 12/14] messaging --- README.md | 9 +++------ src/instructlab/training/logger.py | 6 +++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index b59992a0..a20209fc 100644 --- a/README.md +++ b/README.md @@ -75,16 +75,13 @@ The library supports optional logging backends for experiment tracking. Install ```bash # MLflow logging -pip install 'instructlab-training[mlflow]' +pip install mlflow # Weights & Biases logging -pip install 'instructlab-training[wandb]' +pip install wandb # TensorBoard logging -pip install 'instructlab-training[tensorboard]' - -# Install multiple logging backends at once -pip install 'instructlab-training[mlflow,wandb,tensorboard]' +pip install tensorboard ``` For more details on configuring logging, see the [Logging Documentation](docs/logging.md). diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index 8d9c9fca..6f59d6e3 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -418,7 +418,7 @@ def _setup(self): if SummaryWriter is None: msg = ( "Could not initialize TensorBoardHandler because package tensorboard could not be imported.\n" - "Please ensure it is installed by running: pip install 'instructlab-training[tensorboard]'" + "Please ensure it is installed by running: pip install tensorboard" ) raise RuntimeError(msg) os.makedirs(self.tboard_init_kwargs["log_dir"], exist_ok=True) @@ -557,7 +557,7 @@ def _setup(self): if wandb is None: msg = ( "Could not initialize WandbHandler because package wandb could not be imported.\n" - "Please ensure it is installed by running: pip install 'instructlab-training[wandb]'" + "Please ensure it is installed by running: pip install wandb" ) raise RuntimeError(msg) self._wandb_run = wandb.init(**self.wandb_init_kwargs) @@ -667,7 +667,7 @@ def _setup(self): if mlflow is None: msg = ( "Could not initialize MLflowHandler because package mlflow could not be imported.\n" - "Please ensure it is installed by running: pip install 'instructlab-training[mlflow]'" + "Please ensure it is installed by running: pip install mlflow" ) raise RuntimeError(msg) From 4fe3ab2612ad8571a449f1603b1ea68439e14c5f Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Wed, 4 Feb 2026 02:26:38 +0000 Subject: [PATCH 13/14] comments --- src/instructlab/training/logger.py | 44 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index 6f59d6e3..a5623529 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -8,11 +8,12 @@ ```python from instructlab.training.logger import setup_metric_logger - # Setup logging with TensorBoard and wandb + # Setup logging with TensorBoard and wandb (auto-detected from params) setup_metric_logger( - loggers=["tensorboard", "wandb"], - run_name="my_training_run", - output_dir="logs" + output_dir="logs", + tensorboard_log_dir="logs/tensorboard", + wandb_project="my_project", + wandb_run_name="my_training_run", ) # Log metrics @@ -671,18 +672,27 @@ def _setup(self): ) raise RuntimeError(msg) - if self.tracking_uri: - mlflow.set_tracking_uri(self.tracking_uri) - - if self.experiment_name: - mlflow.set_experiment(self.experiment_name) - - # Reuse existing active run if one exists, otherwise start a new one + # Check for existing active run first active = mlflow.active_run() if active is not None: + # Warn if user provided settings that will be ignored + if self.tracking_uri or self.experiment_name: + warnings.warn( + "An MLflow run is already active. The provided tracking_uri and " + "experiment_name settings will be ignored. The handler will log " + "to the existing active run.", + stacklevel=3, + ) self._mlflow_run = active self._owns_mlflow_run = False else: + # Only set tracking URI and experiment when starting a new run + if self.tracking_uri: + mlflow.set_tracking_uri(self.tracking_uri) + + if self.experiment_name: + mlflow.set_experiment(self.experiment_name) + self._mlflow_run = mlflow.start_run( run_name=self.run_name, **self.mlflow_init_kwargs ) @@ -892,8 +902,9 @@ def setup_metric_logger( Run names are configured per-backend (e.g., `mlflow_run_name`, `wandb_run_name`) rather than using a shared global run name. This design provides explicit control over each backend's naming without coupling them together. File-based loggers - (async JSONL, TensorBoard) use a default template "{time}_rank{rank}" when no - run name is specified, ensuring unique identifiers across distributed runs. + use default templates when no run name is specified: TensorBoard uses + "{time}_rank{rank}", and async JSONL uses "training_params_and_metrics_global{rank}", + ensuring unique identifiers across distributed runs. Args: output_dir: Directory where log files will be stored @@ -952,11 +963,8 @@ def setup_metric_logger( if tensorboard_log_dir: detected_loggers.append("tensorboard") - # Always include async logger for file logging - loggers = detected_loggers if detected_loggers else ["async"] - # Also include async logger alongside other loggers for file-based logging - if detected_loggers and "async" not in loggers: - loggers.append("async") + # Always include async logger for file-based logging alongside other loggers + loggers = [*detected_loggers, "async"] async_filters = ["is_mapping"] From 15f9357d7bad239e4ab5c320f2e8eee2f9c70db3 Mon Sep 17 00:00:00 2001 From: Oleg Silkin <97077423+RobotSail@users.noreply.github.com> Date: Wed, 4 Feb 2026 19:53:16 +0000 Subject: [PATCH 14/14] interim changes --- src/instructlab/training/logger.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/src/instructlab/training/logger.py b/src/instructlab/training/logger.py index a5623529..f72c221b 100644 --- a/src/instructlab/training/logger.py +++ b/src/instructlab/training/logger.py @@ -672,27 +672,20 @@ def _setup(self): ) raise RuntimeError(msg) - # Check for existing active run first + # Always set tracking URI and experiment first (before checking for active run) + # This ensures the client is configured correctly even if we reuse an existing run + if self.tracking_uri: + mlflow.set_tracking_uri(self.tracking_uri) + + if self.experiment_name: + mlflow.set_experiment(self.experiment_name) + + # Reuse existing active run if one exists, otherwise start a new one active = mlflow.active_run() if active is not None: - # Warn if user provided settings that will be ignored - if self.tracking_uri or self.experiment_name: - warnings.warn( - "An MLflow run is already active. The provided tracking_uri and " - "experiment_name settings will be ignored. The handler will log " - "to the existing active run.", - stacklevel=3, - ) self._mlflow_run = active self._owns_mlflow_run = False else: - # Only set tracking URI and experiment when starting a new run - if self.tracking_uri: - mlflow.set_tracking_uri(self.tracking_uri) - - if self.experiment_name: - mlflow.set_experiment(self.experiment_name) - self._mlflow_run = mlflow.start_run( run_name=self.run_name, **self.mlflow_init_kwargs )