Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions judgearena/arenas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
from fast_langdetect import detect_language
from huggingface_hub import snapshot_download

from judgearena.log import get_logger

logger = get_logger(__name__)


def _extract_instruction_text(turn: dict) -> str:
"""Extract plain instruction text from a conversation first turn.
Expand Down Expand Up @@ -157,8 +161,11 @@ def get_winner(
df = df.loc[df.turns == 1]
n_dropped = n_before - len(df)
if n_dropped > 0:
print(
f"[{arena}] Dropped {n_dropped}/{n_before} multi-turn battles (keeping single-turn only)."
logger.info(
"[%s] Dropped %d/%d multi-turn battles (keeping single-turn only).",
arena,
n_dropped,
n_before,
)

return df
Expand Down Expand Up @@ -189,13 +196,17 @@ def load_arena_dataframe(

def main():
    """Load every known arena and log basic dataset statistics.

    For each arena in ``KNOWN_ARENAS``, loads the battle dataframe and
    reports the number of battles, distinct models, and languages.
    """
    for arena in KNOWN_ARENAS:
        logger.info("Loading %s", arena)
        df = _load_arena_dataframe(arena)
        n_battles = len(df)
        # A model may appear on either side of a battle; union both columns.
        n_models = len(set(df["model_a"]) | set(df["model_b"]))
        n_languages = df["lang"].nunique()
        logger.info(
            "%s: %d battles, %d models, %d languages",
            arena,
            n_battles,
            n_models,
            n_languages,
        )


Expand Down
45 changes: 45 additions & 0 deletions judgearena/cli_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class BaseCliArgs:
chat_template: str | None = None
result_folder: str = "results"
engine_kwargs: dict = field(default_factory=dict)
verbosity: int = 0
log_file: str | None = None
no_log_file: bool = False

def __post_init__(self):
supported_modes = ["fixed", "both"]
Expand Down Expand Up @@ -157,6 +160,38 @@ def add_common_arguments(parser: argparse.ArgumentParser) -> None:
'\'{"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}\'.'
),
)
parser.add_argument(
"-v",
"--verbose",
action="count",
default=0,
help="Increase logging verbosity. Use -v for DEBUG output.",
)
parser.add_argument(
"-q",
"--quiet",
action="store_true",
default=False,
help="Suppress all output except warnings and errors.",
)
parser.add_argument(
"--log-file",
dest="log_file",
type=str,
default=None,
help=(
"Write the full DEBUG log to this file in addition to the "
"console output. By default a timestamped run-*.log is saved "
"automatically in the result folder."
),
)
parser.add_argument(
"--no-log-file",
dest="no_log_file",
action="store_true",
default=False,
help="Disable automatic file logging in the result folder.",
)


def parse_engine_kwargs(raw: str) -> dict:
Expand All @@ -168,3 +203,13 @@ def parse_engine_kwargs(raw: str) -> dict:
except Exception as e:
raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e
return engine_kwargs


def resolve_verbosity(args: argparse.Namespace) -> int:
    """Collapse the ``-q`` / ``-v`` CLI flags into one verbosity level.

    Returns ``-1`` for quiet, ``0`` for default (INFO), ``1+`` for verbose.
    """
    # Quiet takes precedence over any number of -v flags; missing
    # attributes fall back to the defaults set by add_common_arguments.
    is_quiet = getattr(args, "quiet", False)
    return -1 if is_quiet else getattr(args, "verbose", 0)
38 changes: 27 additions & 11 deletions judgearena/estimate_elo_ratings.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the new logging flags won’t actually take effect for judgearena-elo yet, because the package entrypoint still points to estimate_elo_ratings:main while configure_logging() only runs in cli(). Could we make both paths go through the same logging setup?
Maybe we can point pyproject.toml at judgearena.estimate_elo_ratings:cli.

Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,19 @@
from sklearn.linear_model import LogisticRegression

from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs
from judgearena.cli_common import (
BaseCliArgs,
add_common_arguments,
parse_engine_kwargs,
resolve_verbosity,
)
from judgearena.evaluate import judge_and_parse_prefs
from judgearena.generate import generate_instructions
from judgearena.log import configure_logging, get_logger
from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model

logger = get_logger(__name__)


@dataclass
class CliEloArgs(BaseCliArgs):
Expand Down Expand Up @@ -106,6 +114,9 @@ def parse_args(cls):
chat_template=args.chat_template,
result_folder=args.result_folder,
engine_kwargs=parse_engine_kwargs(args.engine_kwargs),
verbosity=resolve_verbosity(args),
log_file=args.log_file,
no_log_file=args.no_log_file,
)


Expand Down Expand Up @@ -228,7 +239,7 @@ def main(args: CliEloArgs | None = None) -> dict:
rng = np.random.default_rng(args.seed)

# Step 1: Load arena battles
print(f"\n=== Step 1: Loading battles from {args.arena} ===")
logger.info("Step 1: Loading battles from %s", args.arena)
df_arena_all = load_arena_dataframe(arena=args.arena)

# Filter by language if specified
Expand All @@ -250,7 +261,7 @@ def main(args: CliEloArgs | None = None) -> dict:

df_battles = df_battles.reset_index(drop=True)
n = len(df_battles)
print(f"Loaded {n} battles.")
logger.info("Loaded %d battles.", n)

# Extract user instructions (first turn of conversation_a)
instructions = pd.Series(
Expand All @@ -260,10 +271,10 @@ def main(args: CliEloArgs | None = None) -> dict:
],
name="instruction",
)
print(f"\nFirst instruction:\n{instructions.iloc[0][:300]}\n")
logger.debug("First instruction:\n%s", instructions.iloc[0][:300])

# Step 2: Generate completions for the model under evaluation
print(f"=== Step 2: Generating completions with {args.model} ===")
logger.info("Step 2: Generating completions with %s", args.model)

# Only pass extra engine kwargs that are not None
extra_kwargs = dict(args.engine_kwargs)
Expand Down Expand Up @@ -297,8 +308,11 @@ def replace_slash(s: str) -> str:
)
if len(cache_suffix) > 100:
cache_hash = hashlib.sha256(cache_suffix.encode()).hexdigest()[:16]
print(
f"Cache suffix too long ({len(cache_suffix)} chars), using hash: {cache_hash} (full: {cache_suffix})"
logger.debug(
"Cache suffix too long (%d chars), using hash: %s (full: %s)",
len(cache_suffix),
cache_hash,
cache_suffix,
)
cache_suffix = cache_hash
completions_df = cache_function_dataframe(
Expand All @@ -308,10 +322,10 @@ def replace_slash(s: str) -> str:
).set_index("instruction_index")
completions = completions_df.loc[:, "completion"]

print(f"First completion:\n{completions.iloc[0]}\n")
logger.debug("First completion:\n%s", completions.iloc[0])

# Step 3: Judge evaluation against randomly picked arena opponents
print(f"=== Step 3: Judge evaluation with {args.judge_model} ===")
logger.info("Step 3: Judge evaluation with %s", args.judge_model)

# For each battle, randomly pick opponent: model_a or model_b from the arena
use_model_a_as_opponent = rng.choice([True, False], size=n)
Expand Down Expand Up @@ -390,7 +404,7 @@ def run_judge() -> pd.DataFrame:
opponent_models = df_judge["opponent_model"].tolist()
prefs = df_judge["pref"].tolist()

print(f"First judge output:\n{df_judge['judge_completion'].iloc[0][:500]}\n")
logger.debug("First judge output:\n%s", df_judge['judge_completion'].iloc[0][:500])

# Map preferences back to model-name-level battle results
model_name = args.model
Expand Down Expand Up @@ -499,7 +513,9 @@ def run_judge() -> pd.DataFrame:


def cli():
    """Console entry point: parse CLI args, configure logging, run main().

    Logging is configured before ``main`` so that every record emitted
    during the run is captured by the console and file handlers.
    """
    args = CliEloArgs.parse_args()
    configure_logging(args.verbosity, log_file=args.log_file)
    main(args)


if __name__ == "__main__":
Expand Down
7 changes: 5 additions & 2 deletions judgearena/eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
import pandas as pd

from judgearena.evaluate import PairScore, annotate_battles
from judgearena.log import get_logger
from judgearena.utils import compute_pref_summary

logger = get_logger(__name__)


def print_results(results):
"""Print battle results in a readable format."""
Expand Down Expand Up @@ -124,8 +127,8 @@ def _make_judge_annotation(
combined_metadata = list(metadata)

if swap_mode == "both":
print("Correction for judge bias towards a certain model position is set.")
print("Evaluating completions with models reversed.")
logger.info("Correction for judge bias towards a certain model position is set.")
logger.info("Evaluating completions with models reversed.")
annotations_reversed = annotate_battles(
judge_chat_model=judge_chat_model,
instructions=instructions,
Expand Down
24 changes: 14 additions & 10 deletions judgearena/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.log import get_logger
from judgearena.repro import _to_jsonable, write_run_metadata
from judgearena.utils import (
compute_pref_summary,
Expand All @@ -24,6 +25,8 @@
truncate,
)

logger = get_logger(__name__)


class PairScore:
def __init__(self):
Expand Down Expand Up @@ -157,13 +160,13 @@ def evaluate_completions(

def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
if Path(method).exists():
print(f"Path {method} exists, loads local model completions.")
logger.info("Path %s exists, loading local model completions.", method)
df = read_df(Path(method)).set_index("instruction_index").sort_index()
print(f"Loaded {len(df)} completions.")
logger.info("Loaded %d completions.", len(df))
df.loc[:, "output"] = df.loc[:, "output"].fillna("")
return df.loc[:, "output"]
else:
print(f"Loading {method} from {dataset} dataset.")
logger.info("Loading %s from %s dataset.", method, dataset)
assert method in df_outputs.columns, (
f"Method {method} not present, pick among {df_outputs.columns.tolist()}"
)
Expand All @@ -186,7 +189,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):

unique_string = dataset + "-" + datetime.now().strftime("%Y%m%d_%H%M%S")
output_folder = data_root / "judge-evals" / unique_string
print(f"Saving results in {output_folder}")
logger.info("Saving results in %s", output_folder)
output_folder.mkdir(parents=True, exist_ok=True)
(
judge_system_prompt,
Expand Down Expand Up @@ -216,7 +219,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
results = {**compute_pref_summary(prefs)}
pd.DataFrame(annotations).to_csv(output_folder / "annotations.csv", index=False)

print(f"{method_A} against {method_B}:\n{results}")
logger.info("%s against %s:\n%s", method_A, method_B, results)
with open(output_folder / "results.json", "w") as f:
json.dump(_to_jsonable(results), f, allow_nan=False)

Expand Down Expand Up @@ -248,7 +251,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
started_at_utc=run_started_at,
)
except OSError as e:
print(f"Warning: failed to write run metadata: {e}")
logger.warning("Failed to write run metadata: %s", e)


@dataclass
Expand Down Expand Up @@ -324,7 +327,7 @@ def annotate_battles(
)
]
)
print(f"Start LLM judge annotation ({len(inputs)} annotations).")
logger.info("Start LLM judge annotation (%d annotations).", len(inputs))
judge_completions = do_inference(
chat_model=judge_chat_model,
inputs=inputs,
Expand Down Expand Up @@ -373,9 +376,10 @@ def judge_and_parse_prefs(
already combined for swap_mode="both"
"""
if swap_mode == "both":
print("Correction for judge bias towards a certain model position is set.")
print(
f"Evaluating completions with models reversed with judge {judge_chat_model}."
logger.info("Correction for judge bias towards a certain model position is set.")
logger.info(
"Evaluating completions with models reversed with judge %s.",
judge_chat_model,
)

annotations = annotate_battles(
Expand Down
Loading
Loading