Merged
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2026 Erlis Lushtaku, David Salinas
Copyright 2026 Erlis Lushtaku, David Salinas, and GitHub contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
15 changes: 10 additions & 5 deletions README.md
Expand Up @@ -196,15 +196,20 @@ This override applies to all vLLM models in the run. For remote providers (OpenA
| Dataset | Description |
|-----------------------|------------------------------------------------------------------------------------------------|
| `alpaca-eval` | General instruction-following benchmark |
| `arena-hard` | More challenging evaluation suite |
| `arena-hard-v2.0` | Arena-Hard v2.0 from official `lmarena-ai/arena-hard-auto` source |
| `arena-hard-v0.1` | Legacy Arena-Hard v0.1 from official `lmarena-ai/arena-hard-auto` source |
| `m-arena-hard` | Translated version of Arena-Hard in 23 languages |
| `m-arena-hard-{lang}` | Language-specific variants (e.g., `ar`, `cs`, `de`) |
| `m-arena-hard-EU` | All EU languages combined |
| `fluency-{lang}` | Fluency evaluation for pretrained models (`finnish`, `french`, `german`, `spanish`, `swedish`) |

For Arena-Hard, JudgeArena resolves baseline metadata by dataset version:
- `arena-hard-v0.1`: `gpt-4-0314`
- `arena-hard-v2.0`: `o3-mini-2025-01-31` (standard prompts)

## 📈 Estimating ELO Ratings

OpenJury can estimate the ELO rating of a model by running it against opponents sampled from a human preference arena (`LMArena-100k`, `LMArena-140k`, or `ComparIA`).
JudgeArena can estimate the ELO rating of a model by running it against opponents sampled from a human preference arena (`LMArena-100k`, `LMArena-140k`, or `ComparIA`).
The LLM judge scores each battle, and the resulting ratings are computed using the Bradley-Terry model anchored against the human-annotated arena leaderboard.
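
A Bradley-Terry fit of this kind can be sketched with the standard minorization-maximization (MM) update. This is an illustrative re-derivation under simplified assumptions (every model has played at least one battle, no anchoring against the human leaderboard); it is not the code this PR ships:

```python
import numpy as np


def bradley_terry_strengths(wins, n_iter=200):
    """MM estimate of Bradley-Terry strengths from a pairwise win matrix.

    wins[i, j] = number of battles model i won against model j.
    Assumes every model appears in at least one battle.
    """
    wins = np.asarray(wins, dtype=float)
    n = wins.shape[0]
    total_wins = wins.sum(axis=1)
    games = wins + wins.T  # battles played between each pair of models
    p = np.full(n, 1.0 / n)
    for _ in range(n_iter):
        denom = np.empty(n)
        for i in range(n):
            mask = games[i] > 0  # opponents model i actually faced
            denom[i] = (games[i, mask] / (p[i] + p[mask])).sum()
        p = total_wins / denom
        p /= p.sum()  # strengths are identifiable only up to scale
    return p


def to_elo(p, anchor_idx=0, anchor_elo=1000.0):
    # Map strengths onto an Elo-like scale anchored at one model.
    return anchor_elo + 400.0 * np.log10(p / p[anchor_idx])
```

In the real pipeline the ratings are anchored against the human-annotated arena leaderboard instead of an arbitrary model, but the fixed point of the MM update is the same maximum-likelihood estimate.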

### Quick start
Expand All @@ -220,7 +225,7 @@ judgearena-elo \
Alternatively, if running directly from the repository without installing:

```bash
uv run python openjury/estimate_elo_ratings.py \
uv run python judgearena/estimate_elo_ratings.py \
--arena ComparIA \
--model Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
--judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
Expand All @@ -232,8 +237,8 @@ uv run python openjury/estimate_elo_ratings.py \
| Flag | Default | Description |
|---|---|---|
| `--arena` | `ComparIA` | Arena to sample opponents from: `LMArena-100k`, `LMArena-140k`, or `ComparIA` |
| `--model` | *(required)* | Model under evaluation (same format as `openjury`) |
| `--judge_model` | *(required)* | LLM judge (same format as `openjury`) |
| `--model` | *(required)* | Model under evaluation (same format as `judgearena`) |
| `--judge_model` | *(required)* | LLM judge (same format as `judgearena`) |
| `--n_instructions` | all | Number of arena battles to use for evaluation |
| `--n_instructions_per_language` | all | Cap battles per language (useful for balanced multilingual eval) |
| `--languages` | all | Restrict to specific language codes, e.g. `en fr de` |
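
The per-language cap behaves like a grouped head over the sampled battles. An illustrative pandas sketch of that filtering step, with assumed column names (this is not the script's actual code):

```python
import pandas as pd

# Toy battle sample: three English, two French, one German instruction.
battles = pd.DataFrame(
    {
        "language": ["en", "en", "en", "fr", "fr", "de"],
        "instruction": [f"q{i}" for i in range(6)],
    }
)


def cap_per_language(df, n_per_language):
    # Keep at most n_per_language battles for each language code.
    return df.groupby("language", group_keys=False).head(n_per_language)


capped = cap_per_language(battles, 2)
```

With a cap of 2, the three English battles are trimmed to two while the smaller French and German groups pass through unchanged, which is what makes the flag useful for balanced multilingual evaluation.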
Expand Down
4 changes: 2 additions & 2 deletions TODOs.md
@@ -1,14 +1,14 @@
TODOs:
* push on pypi
* document on-the-fly evaluations with a custom prompt
* support MT-bench
* handle errors
* CI [high/large]
* implement CI judge option
* implement domain filter in CI (maybe pass a regexp by column?)
* report cost?

Done:
* push on pypi
* support MT-bench
* support alpaca-eval
* support arena-hard
* test together judge
Expand Down
9 changes: 8 additions & 1 deletion judgearena/evaluate.py
Expand Up @@ -10,6 +10,10 @@
from langchain_core.prompts import ChatPromptTemplate

from judgearena.instruction_dataset import load_instructions
from judgearena.instruction_dataset.arena_hard import (
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.repro import _to_jsonable, write_run_metadata
from judgearena.utils import (
compute_pref_summary,
Expand Down Expand Up @@ -127,7 +131,10 @@ def evaluate_completions(
"""
run_started_at = datetime.now(UTC)
local_path_tables = data_root / "tables"
download_hf(name=dataset, local_path=local_path_tables)
if is_arena_hard_dataset(dataset):
download_arena_hard(dataset=dataset, local_tables_path=local_path_tables)
else:
download_hf(name=dataset, local_path=local_path_tables)

instructions = load_instructions(
dataset=dataset,
Expand Down
14 changes: 11 additions & 3 deletions judgearena/generate_and_evaluate.py
Expand Up @@ -15,6 +15,10 @@
from judgearena.evaluate import judge_and_parse_prefs, resolve_judge_prompts
from judgearena.generate import generate_base, generate_instructions
from judgearena.instruction_dataset import load_instructions
from judgearena.instruction_dataset.arena_hard import (
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.mt_bench.mt_bench_utils import run_mt_bench
from judgearena.repro import _to_jsonable, write_run_metadata
from judgearena.utils import (
Expand All @@ -41,7 +45,10 @@ def try_load_dataset_completions(
or ``None`` when no pre-existing completions are found.
"""
local_path_tables = data_root / "tables"
download_hf(name=dataset, local_path=local_path_tables)
if is_arena_hard_dataset(dataset):
download_arena_hard(dataset=dataset, local_tables_path=local_path_tables)
else:
download_hf(name=dataset, local_path=local_path_tables)
output_path = local_path_tables / "model_outputs" / f"{dataset}.csv.zip"
if not output_path.exists():
return None
Expand All @@ -52,7 +59,7 @@ def try_load_dataset_completions(
).sort_index()
if model not in df_outputs.columns:
return None
print(f"Found pre-existing completions for '{model}' in {dataset} dataset.")
print(f"Found pre-existing completions for '{model}' in dataset '{dataset}'.")
completions = df_outputs.loc[:, model]
if n_instructions is not None:
completions = completions.head(n_instructions)
Expand Down Expand Up @@ -97,7 +104,8 @@ def parse_args(cls):
)
parser.add_argument(
"--dataset",
help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction "
help="The dataset to use. For instance `alpaca-eval`, `arena-hard-v2.0`, "
"`arena-hard-v0.1`, `m-arena-hard-EU` for instruction "
"tuning cases or `french-contexts`, `spanish-contexts` for base models.",
)
parser.add_argument(
Expand Down
15 changes: 13 additions & 2 deletions judgearena/instruction_dataset/__init__.py
@@ -1,5 +1,9 @@
import pandas as pd

from judgearena.instruction_dataset.arena_hard import (
download_arena_hard,
is_arena_hard_dataset,
)
from judgearena.instruction_dataset.m_arenahard import load_m_arenahard
from judgearena.utils import data_root, download_hf, read_df

Expand Down Expand Up @@ -58,9 +62,16 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
)

else:
assert dataset in ["alpaca-eval", "arena-hard"]
assert dataset in [
"alpaca-eval",
"arena-hard-v0.1",
"arena-hard-v2.0",
]
local_path_tables = data_root / "tables"
download_hf(name=dataset, local_path=local_path_tables)
if is_arena_hard_dataset(dataset):
download_arena_hard(dataset=dataset, local_tables_path=local_path_tables)
else:
download_hf(name=dataset, local_path=local_path_tables)
df_instructions = read_df(local_path_tables / "instructions" / f"{dataset}.csv")

df_instructions = df_instructions.set_index("instruction_index").sort_index()
Expand Down
158 changes: 158 additions & 0 deletions judgearena/instruction_dataset/arena_hard.py
@@ -0,0 +1,158 @@
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
from datasets import Dataset, DatasetDict, IterableDataset, load_dataset

ARENA_HARD_HF_REPO_ID = "lmarena-ai/arena-hard-auto"


@dataclass(frozen=True)
class ArenaHardSpec:
hf_variant: str
baseline_model: str


ARENA_HARD_DATASETS: dict[str, ArenaHardSpec] = {
"arena-hard-v0.1": ArenaHardSpec(
hf_variant="arena-hard-v0.1",
Comment on lines +17 to +18
Collaborator: I am confused: `public_name`, `canonical_name`, and `hf_variant` are all identical. Why do we need all of them? Also, I think removing `arena-hard` in favor of the fully specified dataset name (like `arena-hard-v0.1`) makes sense.
Collaborator (Author): Agreed; changed it here.
baseline_model="gpt-4-0314",
),
"arena-hard-v2.0": ArenaHardSpec(
hf_variant="arena-hard-v2.0",
baseline_model="o3-mini-2025-01-31",
),
}


def resolve_arena_hard_spec(dataset: str) -> ArenaHardSpec | None:
return ARENA_HARD_DATASETS.get(dataset)


def is_arena_hard_dataset(dataset: str) -> bool:
return resolve_arena_hard_spec(dataset) is not None


def arena_hard_baseline_model(dataset: str) -> str | None:
spec = resolve_arena_hard_spec(dataset)
if spec is None:
return None
return spec.baseline_model


def _load_official_arena_hard_dataset(spec: ArenaHardSpec) -> pd.DataFrame:
data = load_dataset(
path=ARENA_HARD_HF_REPO_ID,
data_dir=f"data/{spec.hf_variant}",
)
return _dataset_like_to_dataframe(data)


def _dataset_like_to_dataframe(
data: Dataset | DatasetDict | IterableDataset,
) -> pd.DataFrame:
if isinstance(data, DatasetDict):
if "train" in data:
return data["train"].to_pandas()
first_split = next(iter(data.keys()))
return data[first_split].to_pandas()
if isinstance(data, Dataset):
return data.to_pandas()
if isinstance(data, IterableDataset):
return pd.DataFrame(list(data))
raise TypeError(f"Unsupported dataset object type: {type(data)}")


def normalize_official_arena_hard(
raw_df: pd.DataFrame, dataset: str
) -> tuple[pd.DataFrame, pd.DataFrame | None]:
spec = resolve_arena_hard_spec(dataset)
if spec is None:
raise ValueError(f"Unsupported Arena-Hard dataset: {dataset}")

instruction_index = _pick_instruction_index(raw_df)
instruction = _pick_instruction(raw_df)
df_instructions = pd.DataFrame(
{
"instruction_index": instruction_index,
"instruction": instruction,
}
)
df_instructions = df_instructions.dropna(
subset=["instruction_index", "instruction"]
)
df_instructions = df_instructions.drop_duplicates(subset=["instruction_index"])
df_instructions = df_instructions.sort_values("instruction_index")

df_model_outputs = _build_model_outputs(raw_df)
return df_instructions, df_model_outputs


def download_arena_hard(dataset: str, local_tables_path: Path) -> None:
"""Download Arena-Hard from the Hub and write the instruction and model-output tables when they are missing."""
spec = resolve_arena_hard_spec(dataset)
if spec is None:
return
instructions_path = local_tables_path / "instructions" / f"{dataset}.csv"
model_outputs_path = local_tables_path / "model_outputs" / f"{dataset}.csv.zip"
if instructions_path.exists() and model_outputs_path.exists():
return

raw_df = _load_official_arena_hard_dataset(spec)
df_instructions, df_model_outputs = normalize_official_arena_hard(
raw_df=raw_df, dataset=dataset
)
instructions_path.parent.mkdir(parents=True, exist_ok=True)
model_outputs_path.parent.mkdir(parents=True, exist_ok=True)
df_instructions.to_csv(instructions_path, index=False)
if df_model_outputs is not None:
df_model_outputs.to_csv(model_outputs_path, index=False)


def _pick_instruction_index(raw_df: pd.DataFrame) -> pd.Series:
for col in ["instruction_index", "question_id", "id"]:
if col in raw_df.columns:
return raw_df[col].astype(str)
return pd.Series(range(len(raw_df)), dtype=str)


def _pick_instruction(raw_df: pd.DataFrame) -> pd.Series:
for col in ["instruction", "prompt", "question", "turns"]:
if col in raw_df.columns:
if col == "turns":
return raw_df[col].apply(_turns_to_text)
return raw_df[col].astype(str)
raise ValueError(
f"Unable to infer instruction text column from Arena-Hard data. Available columns: {raw_df.columns.tolist()}"
)


def _turns_to_text(turns_value) -> str:
if isinstance(turns_value, list):
if not turns_value:
return ""
first = turns_value[0]
if isinstance(first, dict):
for key in ["content", "text", "prompt"]:
if key in first:
return str(first[key])
return str(first)
if isinstance(turns_value, dict):
for key in ["content", "text", "prompt"]:
if key in turns_value:
return str(turns_value[key])
return str(turns_value)


def _build_model_outputs(raw_df: pd.DataFrame) -> pd.DataFrame | None:
if not {"model", "output"}.issubset(raw_df.columns):
return None
instruction_index = _pick_instruction_index(raw_df)
df_outputs = pd.DataFrame(
{
"instruction_index": instruction_index,
"model": raw_df["model"].astype(str),
"output": raw_df["output"].fillna("").astype(str),
}
)
return df_outputs
19 changes: 16 additions & 3 deletions judgearena/utils.py
Expand Up @@ -13,6 +13,11 @@
from langchain_openai import ChatOpenAI
from tqdm.asyncio import tqdm

from judgearena.instruction_dataset.arena_hard import (
download_arena_hard,
is_arena_hard_dataset,
)


def _data_root_path() -> Path:
raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
Expand Down Expand Up @@ -421,9 +426,17 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):

def download_all():
print(f"Downloading all dataset in {data_root}")
for dataset in ["alpaca-eval", "arena-hard", "m-arena-hard"]:
local_path_tables = data_root / "tables"
download_hf(name=dataset, local_path=local_path_tables)
local_path_tables = data_root / "tables"
for dataset in [
"alpaca-eval",
"arena-hard-v0.1",
"arena-hard-v2.0",
"m-arena-hard",
]:
if is_arena_hard_dataset(dataset):
download_arena_hard(dataset=dataset, local_tables_path=local_path_tables)
else:
download_hf(name=dataset, local_path=local_path_tables)

snapshot_download(
repo_id="geoalgo/multilingual-contexts-to-be-completed",
Expand Down
2 changes: 1 addition & 1 deletion scripts/multilingual_arena_hard/translate_arena_hard.py
Expand Up @@ -77,7 +77,7 @@
# translator_model = "OpenRouter/deepseek/deepseek-chat-v3.1"
n_instructions = 10
df_instructions = load_instructions(
"arena-hard",
"arena-hard-v2.0",
n_instructions=n_instructions,
)
# languages = [("fra", "French")]
Expand Down