Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions judgearena/arenas_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
from fast_langdetect import detect_language
from huggingface_hub import snapshot_download

from judgearena.log import get_logger

logger = get_logger(__name__)


def _extract_instruction_text(turn: dict) -> str:
"""Extract plain instruction text from a conversation first turn.
Expand Down Expand Up @@ -157,8 +161,11 @@ def get_winner(
df = df.loc[df.turns == 1]
n_dropped = n_before - len(df)
if n_dropped > 0:
print(
f"[{arena}] Dropped {n_dropped}/{n_before} multi-turn battles (keeping single-turn only)."
logger.info(
"[%s] Dropped %d/%d multi-turn battles (keeping single-turn only).",
arena,
n_dropped,
n_before,
)

return df
Expand Down Expand Up @@ -189,13 +196,17 @@ def load_arena_dataframe(

def main():
    """Load every known arena and log basic dataset statistics.

    For each arena in ``KNOWN_ARENAS``, loads the battle dataframe and
    reports the number of battles, distinct models, and languages.
    """
    for arena in KNOWN_ARENAS:
        logger.info("Loading %s", arena)
        df = _load_arena_dataframe(arena)
        n_battles = len(df)
        # A model may appear on either side of a battle; union both columns.
        n_models = len(set(df["model_a"]) | set(df["model_b"]))
        n_languages = df["lang"].nunique()
        logger.info(
            "%s: %d battles, %d models, %d languages",
            arena,
            n_battles,
            n_models,
            n_languages,
        )


Expand Down
45 changes: 45 additions & 0 deletions judgearena/cli_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class BaseCliArgs:
chat_template: str | None = None
result_folder: str = "results"
engine_kwargs: dict = field(default_factory=dict)
verbosity: int = 0
log_file: str | None = None
no_log_file: bool = False

def __post_init__(self):
supported_modes = ["fixed", "both"]
Expand Down Expand Up @@ -157,6 +160,38 @@ def add_common_arguments(parser: argparse.ArgumentParser) -> None:
'\'{"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}\'.'
),
)
parser.add_argument(
"-v",
"--verbose",
action="count",
default=0,
help="Increase logging verbosity. Use -v for DEBUG output.",
)
parser.add_argument(
"-q",
"--quiet",
action="store_true",
default=False,
help="Suppress all output except warnings and errors.",
)
parser.add_argument(
"--log-file",
dest="log_file",
type=str,
default=None,
help=(
"Write the full DEBUG log to this file in addition to the "
"console output. By default a timestamped run-*.log is saved "
"automatically in the result folder."
),
)
parser.add_argument(
"--no-log-file",
dest="no_log_file",
action="store_true",
default=False,
help="Disable automatic file logging in the result folder.",
)


def parse_engine_kwargs(raw: str) -> dict:
Expand All @@ -168,3 +203,13 @@ def parse_engine_kwargs(raw: str) -> dict:
except Exception as e:
raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e
return engine_kwargs


def resolve_verbosity(args: argparse.Namespace) -> int:
    """Collapse the ``-q`` / ``-v`` CLI flags into one verbosity level.

    Returns ``-1`` for quiet, ``0`` for default (INFO), ``1+`` for verbose.
    """
    # Quiet takes precedence over any number of -v flags; missing
    # attributes fall back to the defaults set by add_common_arguments.
    is_quiet = getattr(args, "quiet", False)
    return -1 if is_quiet else getattr(args, "verbose", 0)
38 changes: 27 additions & 11 deletions judgearena/estimate_elo_ratings.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the new logging flags won’t actually take effect for judgearena-elo yet, because the package entrypoint still points to estimate_elo_ratings:main while configure_logging() only runs in cli(). Could we make both paths go through the same logging setup?
Maybe we can point pyproject.toml at judgearena.estimate_elo_ratings:cli.

Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,19 @@
from sklearn.linear_model import LogisticRegression

from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs
from judgearena.cli_common import (
BaseCliArgs,
add_common_arguments,
parse_engine_kwargs,
resolve_verbosity,
)
from judgearena.evaluate import judge_and_parse_prefs
from judgearena.generate import generate_instructions
from judgearena.log import configure_logging, get_logger
from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model

logger = get_logger(__name__)


@dataclass
class CliEloArgs(BaseCliArgs):
Expand Down Expand Up @@ -106,6 +114,9 @@ def parse_args(cls):
chat_template=args.chat_template,
result_folder=args.result_folder,
engine_kwargs=parse_engine_kwargs(args.engine_kwargs),
verbosity=resolve_verbosity(args),
log_file=args.log_file,
no_log_file=args.no_log_file,
)


Expand Down Expand Up @@ -228,7 +239,7 @@ def main(args: CliEloArgs | None = None) -> dict:
rng = np.random.default_rng(args.seed)

# Step 1: Load arena battles
print(f"\n=== Step 1: Loading battles from {args.arena} ===")
logger.info("Step 1: Loading battles from %s", args.arena)
df_arena_all = load_arena_dataframe(arena=args.arena)

# Filter by language if specified
Expand All @@ -250,7 +261,7 @@ def main(args: CliEloArgs | None = None) -> dict:

df_battles = df_battles.reset_index(drop=True)
n = len(df_battles)
print(f"Loaded {n} battles.")
logger.info("Loaded %d battles.", n)

# Extract user instructions (first turn of conversation_a)
instructions = pd.Series(
Expand All @@ -260,10 +271,10 @@ def main(args: CliEloArgs | None = None) -> dict:
],
name="instruction",
)
print(f"\nFirst instruction:\n{instructions.iloc[0][:300]}\n")
logger.debug("First instruction:\n%s", instructions.iloc[0][:300])

# Step 2: Generate completions for the model under evaluation
print(f"=== Step 2: Generating completions with {args.model} ===")
logger.info("Step 2: Generating completions with %s", args.model)

# Only pass extra engine kwargs that are not None
extra_kwargs = dict(args.engine_kwargs)
Expand Down Expand Up @@ -297,8 +308,11 @@ def replace_slash(s: str) -> str:
)
if len(cache_suffix) > 100:
cache_hash = hashlib.sha256(cache_suffix.encode()).hexdigest()[:16]
print(
f"Cache suffix too long ({len(cache_suffix)} chars), using hash: {cache_hash} (full: {cache_suffix})"
logger.debug(
"Cache suffix too long (%d chars), using hash: %s (full: %s)",
len(cache_suffix),
cache_hash,
cache_suffix,
)
cache_suffix = cache_hash
completions_df = cache_function_dataframe(
Expand All @@ -308,10 +322,10 @@ def replace_slash(s: str) -> str:
).set_index("instruction_index")
completions = completions_df.loc[:, "completion"]

print(f"First completion:\n{completions.iloc[0]}\n")
logger.debug("First completion:\n%s", completions.iloc[0])

# Step 3: Judge evaluation against randomly picked arena opponents
print(f"=== Step 3: Judge evaluation with {args.judge_model} ===")
logger.info("Step 3: Judge evaluation with %s", args.judge_model)

# For each battle, randomly pick opponent: model_a or model_b from the arena
use_model_a_as_opponent = rng.choice([True, False], size=n)
Expand Down Expand Up @@ -390,7 +404,7 @@ def run_judge() -> pd.DataFrame:
opponent_models = df_judge["opponent_model"].tolist()
prefs = df_judge["pref"].tolist()

print(f"First judge output:\n{df_judge['judge_completion'].iloc[0][:500]}\n")
logger.debug("First judge output:\n%s", df_judge['judge_completion'].iloc[0][:500])

# Map preferences back to model-name-level battle results
model_name = args.model
Expand Down Expand Up @@ -499,7 +513,9 @@ def run_judge() -> pd.DataFrame:


def cli():
    """Console entry point: parse CLI args, configure logging, run main().

    Logging is configured before ``main`` so that every record emitted
    during the run is captured by the console and file handlers.
    """
    args = CliEloArgs.parse_args()
    configure_logging(args.verbosity, log_file=args.log_file)
    main(args)


if __name__ == "__main__":
Expand Down
7 changes: 5 additions & 2 deletions judgearena/eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
import pandas as pd

from judgearena.evaluate import PairScore, annotate_battles
from judgearena.log import get_logger
from judgearena.utils import compute_pref_summary

logger = get_logger(__name__)


def print_results(results):
"""Print battle results in a readable format."""
Expand Down Expand Up @@ -124,8 +127,8 @@ def _make_judge_annotation(
combined_metadata = list(metadata)

if swap_mode == "both":
print("Correction for judge bias towards a certain model position is set.")
print("Evaluating completions with models reversed.")
logger.info("Correction for judge bias towards a certain model position is set.")
logger.info("Evaluating completions with models reversed.")
annotations_reversed = annotate_battles(
judge_chat_model=judge_chat_model,
instructions=instructions,
Expand Down
24 changes: 14 additions & 10 deletions judgearena/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.log import get_logger
from judgearena.repro import _to_jsonable, write_run_metadata
from judgearena.utils import (
compute_pref_summary,
Expand All @@ -24,6 +25,8 @@
truncate,
)

logger = get_logger(__name__)


class PairScore:
def __init__(self):
Expand Down Expand Up @@ -157,13 +160,13 @@ def evaluate_completions(

def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
if Path(method).exists():
print(f"Path {method} exists, loads local model completions.")
logger.info("Path %s exists, loading local model completions.", method)
df = read_df(Path(method)).set_index("instruction_index").sort_index()
print(f"Loaded {len(df)} completions.")
logger.info("Loaded %d completions.", len(df))
df.loc[:, "output"] = df.loc[:, "output"].fillna("")
return df.loc[:, "output"]
else:
print(f"Loading {method} from {dataset} dataset.")
logger.info("Loading %s from %s dataset.", method, dataset)
assert method in df_outputs.columns, (
f"Method {method} not present, pick among {df_outputs.columns.tolist()}"
)
Expand All @@ -186,7 +189,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):

unique_string = dataset + "-" + datetime.now().strftime("%Y%m%d_%H%M%S")
output_folder = data_root / "judge-evals" / unique_string
print(f"Saving results in {output_folder}")
logger.info("Saving results in %s", output_folder)
output_folder.mkdir(parents=True, exist_ok=True)
(
judge_system_prompt,
Expand Down Expand Up @@ -216,7 +219,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
results = {**compute_pref_summary(prefs)}
pd.DataFrame(annotations).to_csv(output_folder / "annotations.csv", index=False)

print(f"{method_A} against {method_B}:\n{results}")
logger.info("%s against %s:\n%s", method_A, method_B, results)
with open(output_folder / "results.json", "w") as f:
json.dump(_to_jsonable(results), f, allow_nan=False)

Expand Down Expand Up @@ -248,7 +251,7 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
started_at_utc=run_started_at,
)
except OSError as e:
print(f"Warning: failed to write run metadata: {e}")
logger.warning("Failed to write run metadata: %s", e)


@dataclass
Expand Down Expand Up @@ -324,7 +327,7 @@ def annotate_battles(
)
]
)
print(f"Start LLM judge annotation ({len(inputs)} annotations).")
logger.info("Start LLM judge annotation (%d annotations).", len(inputs))
judge_completions = do_inference(
chat_model=judge_chat_model,
inputs=inputs,
Expand Down Expand Up @@ -373,9 +376,10 @@ def judge_and_parse_prefs(
already combined for swap_mode="both"
"""
if swap_mode == "both":
print("Correction for judge bias towards a certain model position is set.")
print(
f"Evaluating completions with models reversed with judge {judge_chat_model}."
logger.info("Correction for judge bias towards a certain model position is set.")
logger.info(
"Evaluating completions with models reversed with judge %s.",
judge_chat_model,
)

annotations = annotate_battles(
Expand Down
Loading
Loading