From 118b06a730e20e50a4a30e43881869be30b6b6a5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 17:56:22 -0800 Subject: [PATCH 1/5] =?UTF-8?q?Rename=20submission=20modes:=20benchmark?= =?UTF-8?q?=E2=86=92private,=20leaderboard=E2=86=92public?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This renames the user-facing submission modes for clarity: - BENCHMARK → PRIVATE (run benchmarks without affecting leaderboard ranking) - LEADERBOARD → PUBLIC (official submission to the public leaderboard) Also adds SECRET mode for internal secret validation runs. Updates Discord commands: /benchmark → /private, /ranked → /public --- src/kernelbot/api/api_utils.py | 4 ++-- src/kernelbot/cogs/leaderboard_cog.py | 16 +++++++-------- src/kernelbot/cogs/verify_run_cog.py | 8 ++++---- src/libkernelbot/backend.py | 22 +++++++++++---------- src/libkernelbot/consts.py | 19 +++++++++--------- src/libkernelbot/launchers/github.py | 4 ++-- src/libkernelbot/report.py | 28 +++++++++++++++------------ src/libkernelbot/run_eval.py | 16 +++++++-------- src/libkernelbot/submission.py | 8 ++++---- tests/test_backend.py | 20 +++++++++---------- tests/test_github.py | 6 +++--- tests/test_modal.py | 10 +++++----- tests/test_task.py | 4 ++-- 13 files changed, 86 insertions(+), 79 deletions(-) diff --git a/src/kernelbot/api/api_utils.py b/src/kernelbot/api/api_utils.py index ab1505ac..1b37a8ef 100644 --- a/src/kernelbot/api/api_utils.py +++ b/src/kernelbot/api/api_utils.py @@ -213,9 +213,9 @@ async def to_submit_info( allowed_modes = [ SubmissionMode.TEST, - SubmissionMode.BENCHMARK, + SubmissionMode.PRIVATE, SubmissionMode.PROFILE, - SubmissionMode.LEADERBOARD, + SubmissionMode.PUBLIC, ] if submission_mode_enum not in allowed_modes: raise HTTPException( diff --git a/src/kernelbot/cogs/leaderboard_cog.py b/src/kernelbot/cogs/leaderboard_cog.py index 457321f3..8d00e471 100644 --- a/src/kernelbot/cogs/leaderboard_cog.py +++ b/src/kernelbot/cogs/leaderboard_cog.py @@ -64,7 +64,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): for run in sub_data["runs"]: if ( not run["secret"] - and run["mode"] == SubmissionMode.LEADERBOARD.value + and run["mode"] == SubmissionMode.PUBLIC.value and run["passed"] ): result_lines.append(generate_run_verdict(self.bot.backend, run, sub_data)) @@ -134,7 +134,7 @@ async def submit( reporter = MultiProgressReporterDiscord(interaction) sub_id, results = await self.bot.backend.submit_full(req, mode, reporter) - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.PUBLIC: await self.post_submit_hook(interaction, sub_id) return sub_id @@ -157,7 +157,7 @@ async def submit_test( interaction, leaderboard_name, script, mode=SubmissionMode.TEST, gpu=gpu ) - @app_commands.command(name="benchmark", description="Start a benchmarking run") + @app_commands.command(name="private", description="Start a private benchmarking run") @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", script="The Python / CUDA script file to run", @@ -165,7 +165,7 @@ async def submit_test( ) @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @with_error_handling - async def submit_bench( + async def submit_private( self, interaction: discord.Interaction, script: discord.Attachment, @@ -173,7 +173,7 @@ async def submit_bench( gpu: Optional[str], ): return await self.submit( - interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu + interaction, 
leaderboard_name, script, mode=SubmissionMode.PRIVATE, gpu=gpu ) @app_commands.command(name="profile", description="Start a profiling run") @@ -196,7 +196,7 @@ async def submit_profile( ) @app_commands.command( - name="ranked", description="Start a ranked run for an official leaderboard submission" + name="public", description="Start a public run for an official leaderboard submission" ) @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", @@ -205,7 +205,7 @@ async def submit_profile( ) @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @with_error_handling - async def submit_ranked( + async def submit_public( self, interaction: discord.Interaction, script: discord.Attachment, @@ -213,7 +213,7 @@ async def submit_ranked( gpu: Optional[str] = None, ): return await self.submit( - interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu + interaction, leaderboard_name, script, mode=SubmissionMode.PUBLIC, gpu=gpu ) diff --git a/src/kernelbot/cogs/verify_run_cog.py b/src/kernelbot/cogs/verify_run_cog.py index 53102682..58ad5844 100644 --- a/src/kernelbot/cogs/verify_run_cog.py +++ b/src/kernelbot/cogs/verify_run_cog.py @@ -171,8 +171,8 @@ async def verify_modal_run( @app_commands.choices( mode=[ Choice(name=SubmissionMode.TEST.name, value=SubmissionMode.TEST.value), - Choice(name=SubmissionMode.BENCHMARK.name, value=SubmissionMode.BENCHMARK.value), - Choice(name=SubmissionMode.LEADERBOARD.name, value=SubmissionMode.LEADERBOARD.value), + Choice(name=SubmissionMode.PRIVATE.name, value=SubmissionMode.PRIVATE.value), + Choice(name=SubmissionMode.PUBLIC.name, value=SubmissionMode.PUBLIC.value), Choice(name="All", value="all"), ] ) @@ -194,9 +194,9 @@ async def verify_task( modes = [] if mode is None: - modes = [SubmissionMode.LEADERBOARD] + modes = [SubmissionMode.PUBLIC] elif mode.value == "all": - modes = [SubmissionMode.TEST, SubmissionMode.BENCHMARK, SubmissionMode.LEADERBOARD] + modes = [SubmissionMode.TEST, SubmissionMode.PRIVATE, SubmissionMode.PUBLIC] else: modes = [SubmissionMode(mode.value)] diff --git a/src/libkernelbot/backend.py b/src/libkernelbot/backend.py index f3b68bb0..2f90e0e3 100644 --- a/src/libkernelbot/backend.py +++ b/src/libkernelbot/backend.py @@ -86,7 +86,7 @@ async def submit_full( for gpu in selected_gpus ] - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.PUBLIC: tasks += [ self.submit_leaderboard( sub_id, @@ -95,7 +95,7 @@ async def submit_full( gpu, reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), req.task, - SubmissionMode.PRIVATE, + SubmissionMode.SECRET, req.secret_seed, ) for gpu in selected_gpus @@ -142,12 +142,14 @@ async def submit_leaderboard( # noqa: C901 if result.success: score = None + # Check for the mode's result key (public or secret) + mode_key = mode.value if ( - "leaderboard" in result.runs - and result.runs["leaderboard"].run.success - and result.runs["leaderboard"].run.passed + mode_key in result.runs + and result.runs[mode_key].run.success + and result.runs[mode_key].run.passed ): - score = compute_score(result, task, submission_id) + score = compute_score(result, task, submission_id, mode_key) # verifyruns uses a fake submission id of -1 if submission_id != -1: @@ -159,8 +161,8 @@ async def submit_leaderboard( # noqa: C901 end=value.end, mode=key, runner=gpu_type.name, - score=None if key != "leaderboard" else score, - secret=mode == SubmissionMode.PRIVATE, + score=None if key != mode_key else score, + secret=mode == 
SubmissionMode.SECRET, compilation=value.compilation, result=value.run, system=result.system, @@ -207,7 +209,7 @@ async def handle_submission( await reporter.update_title(reporter.title + " ✅ success") short_report = make_short_report( - result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD] + result.runs, full=mode in [SubmissionMode.PUBLIC, SubmissionMode.SECRET] ) stream_msg = ( @@ -222,7 +224,7 @@ async def handle_submission( ) await reporter.push(short_report) - if mode != SubmissionMode.PRIVATE: + if mode != SubmissionMode.SECRET: try: # does the last message of the short report start with ✅ or ❌? verdict = short_report[-1][0] diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764de..ac667cce 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -82,21 +82,22 @@ class SubmissionMode(Enum): """ Different types of submission that can be made: Test: Run tests and give detailed results about passed/failed tests. These have short timeouts. - Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times. + Private: Run benchmarks privately. Each benchmark is tested once, and then run multiple times. + Returns detailed timing results but doesn't affect leaderboard ranking. Profile: Gather profiling information. One selected benchmark is run under the profiler. No testing is performed in this mode (sometimes, you need to profile deliberately broken code) - Leaderboard: Official submission to the leaderboard. This first runs public tests, then a - repeated invocation of a single benchmark. Feedback for the secret benchmark is only very - limited (no stdout/stderr). - Private: Special run that does test followed by leaderboard (on a secret seed), but gives only - very limited feedback. + Public: Official submission to the leaderboard. This first runs public tests, then a + repeated invocation of a single benchmark. If all tests pass, the submission is evaluated + and ranked on the public leaderboard. + Secret: Internal mode for running the full evaluation flow with a secret seed. This is used + for secret validation runs that accompany public submissions. 
""" TEST = "test" - BENCHMARK = "benchmark" - PROFILE = "profile" - LEADERBOARD = "leaderboard" PRIVATE = "private" + PROFILE = "profile" + PUBLIC = "public" + SECRET = "secret" class Language(Enum): diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index a1970a7e..c984c749 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -49,8 +49,8 @@ def get_timeout(config: dict) -> int: mode = config.get("mode") sec_map = { SubmissionMode.TEST.value: config.get("test_timeout"), - SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"), - SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"), + SubmissionMode.PRIVATE.value: config.get("benchmark_timeout"), + SubmissionMode.PUBLIC.value: config.get("ranked_timeout"), } seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60 return math.ceil(seconds / 60) diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 70f91487..b0f8baf5 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -176,8 +176,8 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Tests missing") - if "benchmark" in runs: - bench_run = runs["benchmark"].run + if "private" in runs: + bench_run = runs["private"].run if not bench_run.success: result.append("❌ Running benchmarks failed" + _short_fail_reason(bench_run)) return result @@ -202,16 +202,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n else: result.append("✅ Profiling successful") - if "leaderboard" in runs: - lb_run = runs["leaderboard"].run + # Check for public or secret run results + ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None) + if ranked_key: + lb_run = runs[ranked_key].run if not lb_run.success: - result.append("❌ Running leaderboard failed" + _short_fail_reason(lb_run)) + result.append("❌ Running ranked submission failed" + _short_fail_reason(lb_run)) elif not lb_run.passed: - result.append("❌ Leaderboard run failed") + result.append("❌ Ranked submission failed") else: - result.append("✅ Leaderboard run successful") + result.append("✅ Ranked submission successful") elif full: - result.append("❌ Leaderboard missing") + result.append("❌ Ranked submission missing") return result @@ -339,8 +341,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport num_tests = int(test_run.result.get("test-count", 0)) report.add_log(f"✅ Passed {num_tests}/{num_tests} tests", make_test_log(test_run)) - if "benchmark" in runs: - bench_run = runs["benchmark"] + if "private" in runs: + bench_run = runs["private"] if _handle_crash_report(report, bench_run): return report @@ -378,8 +380,10 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport base64.b64decode(prof_run.profile.trace), ) - if "leaderboard" in runs: - bench_run = runs["leaderboard"] + # Check for public or secret run results + ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None) + if ranked_key: + bench_run = runs[ranked_key] if _handle_crash_report(report, bench_run): return report diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index aec59f95..5891f302 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -556,8 +556,8 @@ def run_single_evaluation( if mode == "test": timeout = test_timeout cases.write(tests) - elif mode in ["benchmark", "profile", 
"leaderboard"]: - timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout + elif mode in ["private", "profile", "public", "secret"]: + timeout = ranked_timeout if mode in ["public", "secret"] else benchmark_timeout if ranking_by == "last": cases.write(benchmarks.splitlines(keepends=True)[-1]) else: @@ -801,22 +801,22 @@ def run_evaluation( common_args["benchmarks"] = benchmark results[f"{mode}.{i}"] = call(mode=mode, **common_args) - elif mode in ["test", "benchmark"]: + elif mode in ["test", "private"]: results[mode] = call(mode=mode, **common_args) - elif mode in ["private", "leaderboard"]: + elif mode in ["public", "secret"]: # first, run the tests results["test"] = call(mode="test", **common_args) if not results["test"].run or not results["test"].run.passed: return results - results["benchmark"] = call(mode="benchmark", **common_args) + results["private"] = call(mode="private", **common_args) - if not results["benchmark"].run or not results["benchmark"].run.passed: + if not results["private"].run or not results["private"].run.passed: return results - # if they pass, run the leaderboard validation - results["leaderboard"] = call(mode="leaderboard", **common_args) + # if they pass, run the public/secret validation + results[mode] = call(mode=mode, **common_args) else: raise AssertionError("Invalid mode") diff --git a/src/libkernelbot/submission.py b/src/libkernelbot/submission.py index 805f7435..12cbc661 100644 --- a/src/libkernelbot/submission.py +++ b/src/libkernelbot/submission.py @@ -169,8 +169,8 @@ def _get_popcorn_directives(submission: str) -> dict: # noqa: C901 return popcorn_info -def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) -> float: - num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"]) +def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int, mode_key: str = "public") -> float: + num_benchmarks = int(result.runs[mode_key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: if num_benchmarks != 1: logger.error( @@ -182,11 +182,11 @@ def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) raise KernelBotError( f"Expected submission to have exactly one benchmark, got {num_benchmarks}." 
) - score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9 + score = float(result.runs[mode_key].run.result["benchmark.0.mean"]) / 1e9 else: scores = [] for i in range(num_benchmarks): - scores.append(float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) / 1e9) + scores.append(float(result.runs[mode_key].run.result[f"benchmark.{i}.mean"]) / 1e9) if task.ranking_by == RankCriterion.MEAN: score = sum(scores) / len(scores) elif task.ranking_by == RankCriterion.GEOM: diff --git a/tests/test_backend.py b/tests/test_backend.py index f69170c5..af327519 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -55,7 +55,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "pass", "submit.py", task, - consts.SubmissionMode.LEADERBOARD, + consts.SubmissionMode.PUBLIC, -1, ) @@ -64,7 +64,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "> ✅ Compilation successful", "> ✅ Testing successful", "> ❌ Benchmarks missing", - "> ❌ Leaderboard missing", + "> ❌ Ranked submission missing", ] call_args = reporter.display_report.call_args[0] @@ -130,7 +130,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): submit_time, ) eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"leaderboard": eval_result}) + mock_launcher = _mock_launcher(bot, {"secret": eval_result}) reporter = MockProgressReporter("report") @@ -141,7 +141,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): consts.ModalGPU.A100, reporter, task, - consts.SubmissionMode.LEADERBOARD, + consts.SubmissionMode.SECRET, seed=1337, ) @@ -155,7 +155,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "benchmarks": [{"dtype": "float32", "input_size": 10000}], "lang": "py", "main": "kernel.py", - "mode": "leaderboard", + "mode": "secret", "multi_gpu": False, "ranked_timeout": 180, "ranking_by": "geom", @@ -193,7 +193,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "secret", "passed": True, "result": { "benchmark-count": "1", @@ -206,7 +206,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): }, "runner": "A100", "score": Decimal("1.5e-9"), - "secret": False, + "secret": True, "start_time": eval_result.start.replace(tzinfo=datetime.timezone.utc), "system": { "cpu": "Intel i9-12900K", @@ -249,7 +249,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): ) reporter = MockMultReporter() s_id, results = await bot.submit_full( - req, mode=consts.SubmissionMode.LEADERBOARD, reporter=reporter + req, mode=consts.SubmissionMode.PUBLIC, reporter=reporter ) expected_result = mock_launcher.run_submission.return_value @@ -261,13 +261,13 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "> ✅ Compilation successful", "> ❌ Tests missing", "> ❌ Benchmarks missing", - "> ✅ Leaderboard run successful", + "> ✅ Ranked submission successful", ] assert r2.lines == [ "> ✅ Compilation successful", "> ❌ Tests missing", "> ❌ Benchmarks missing", - "> ✅ Leaderboard run successful", + "> ✅ Ranked submission successful", ] assert r1.title == "A100 on Modal ✅ success" assert r2.title == "A100 on Modal (secret) ✅ success" diff --git a/tests/test_github.py b/tests/test_github.py index 413e00bd..8b8a4d63 100644 --- a/tests/test_github.py +++ b/tests/test_github.py @@ 
-179,7 +179,7 @@ async def test_github_launcher_failing_script(project_root: Path, github_config: task=task_definition.task, submission_content=submission_content, arch=0, - mode=SubmissionMode.LEADERBOARD, + mode=SubmissionMode.PUBLIC, ) result = await launcher.run_submission(config, gpu_type, reporter) @@ -190,9 +190,9 @@ async def test_github_launcher_failing_script(project_root: Path, github_config: # But the actual test or benchmark should fail test_passed = result.runs.get("test", {}).run.passed if "test" in result.runs else True - benchmark_passed = result.runs.get("benchmark", {}).run.passed if "benchmark" in result.runs else True + private_passed = result.runs.get("private", {}).run.passed if "private" in result.runs else True - assert not (test_passed and benchmark_passed), "Expected at least one run to fail for cheating script" + assert not (test_passed and private_passed), "Expected at least one run to fail for cheating script" diff --git a/tests/test_modal.py b/tests/test_modal.py index d22ef05b..c87bd2c7 100644 --- a/tests/test_modal.py +++ b/tests/test_modal.py @@ -265,7 +265,7 @@ async def test_modal_multi_gpu_benchmark( task=task_definition.task, submission_content=submission_content, arch=GPU_TO_SM[ModalGPU.L4x4.name], - mode=SubmissionMode.BENCHMARK, + mode=SubmissionMode.PRIVATE, ) result = await launcher.run_submission(config, ModalGPU.L4x4, reporter) @@ -280,8 +280,8 @@ async def test_modal_multi_gpu_benchmark( assert result.system.device_count == 4 # Test run structure - assert "benchmark" in result.runs - bench_run = result.runs["benchmark"] + assert "private" in result.runs + bench_run = result.runs["private"] # For Python runs, compilation is None assert bench_run.compilation is None @@ -317,7 +317,7 @@ async def test_modal_launcher_failing_script(modal_deployment, project_root: Pat task=task_definition.task, submission_content=submission_content, arch=GPU_TO_SM[gpu_type.name], - mode=SubmissionMode.LEADERBOARD, + mode=SubmissionMode.PUBLIC, ) result = await launcher.run_submission(config, gpu_type, reporter) @@ -325,4 +325,4 @@ async def test_modal_launcher_failing_script(modal_deployment, project_root: Pat # Basic structure and success assert result.success, f"Expected successful run, got: {result.error}" assert result.error == "" - assert result.runs["test"].run.passed is False or result.runs["benchmark"].run.passed is False + assert result.runs["test"].run.passed is False or result.runs["private"].run.passed is False diff --git a/tests/test_task.py b/tests/test_task.py index 809a6907..b6fa6a63 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -126,7 +126,7 @@ def test_build_task_config_python(leaderboard_task): """Test build_task_config with Python task and submission content.""" submission_content = "print('Hello World')" arch = "sm_80" - mode = SubmissionMode.BENCHMARK + mode = SubmissionMode.PRIVATE result = build_task_config( task=leaderboard_task, submission_content=submission_content, arch=arch, mode=mode @@ -164,7 +164,7 @@ def test_build_task_config_cuda(): """Test build_task_config with CUDA task and submission content.""" submission_content = "print('Hello World')" arch = "sm_80" - mode = SubmissionMode.BENCHMARK + mode = SubmissionMode.PRIVATE task = LeaderboardTask( lang=Language.CUDA, files={"test.cu": "code", "submission.cu": "@SUBMISSION@", "test.cuh": "header"}, From ee62fdc798c4b6ba8a28d854a7ff3a7bed31f4b9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:12:10 -0800 Subject: [PATCH 2/5] Fix test files to use new 
private/public mode naming Update test data keys and expected values: - test_report.py: Change "benchmark"/"leaderboard" keys to "private"/"public" - test_submission.py: Update compute_score test to use "public" key - test_backend.py: Update mode values and mock data keys --- tests/test_backend.py | 8 ++++---- tests/test_report.py | 30 +++++++++++++++--------------- tests/test_submission.py | 8 ++++---- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index af327519..04d0fde4 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -101,7 +101,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "benchmarks": [{"dtype": "float32", "input_size": 10000}], "lang": "py", "main": "kernel.py", - "mode": "leaderboard", + "mode": "public", "multi_gpu": False, "ranked_timeout": 180, "ranking_by": "geom", @@ -232,7 +232,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"leaderboard": eval_result}) + mock_launcher = _mock_launcher(bot, {"public": eval_result}) from libkernelbot.submission import ProcessedSubmissionRequest @@ -300,7 +300,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "public", "passed": True, "result": { "benchmark-count": "1", @@ -344,7 +344,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "public", "passed": True, "result": { "benchmark-count": "1", diff --git a/tests/test_report.py b/tests/test_report.py index ae3afd25..f8a61b29 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -241,7 +241,7 @@ def test_make_short_report_benchmarking_failed(sample_eval_result: EvalResult): sample_eval_result.run.success = False sample_eval_result.compilation = None sample_eval_result.run.exit_code = consts.ExitCode.CUDA_FAIL - runs = {"benchmark": sample_eval_result} + runs = {"private": sample_eval_result} result = make_short_report(runs, full=False) assert result == ["❌ Running benchmarks failed (cuda api error)"] @@ -274,27 +274,27 @@ def test_make_short_report_leaderboard_failed(sample_eval_result: EvalResult): sample_eval_result.run.success = False sample_eval_result.compilation = None sample_eval_result.run.exit_code = consts.ExitCode.TEST_SPEC - runs = {"leaderboard": sample_eval_result} + runs = {"public": sample_eval_result} result = make_short_report(runs, full=False) - assert result == ["❌ Running leaderboard failed (internal error 113)"] + assert result == ["❌ Running ranked submission failed (internal error 113)"] sample_eval_result.run.success = True sample_eval_result.run.passed = False sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL result = make_short_report(runs) # TODO is this actually possible? Should profiling do **any** correctness testing? 
- assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard run failed"] + assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission failed"] def test_make_short_report_empty(): result = make_short_report({}) - assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard missing"] + assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission missing"] def test_make_short_report_full_success(): runs = {} - for run_type in ["test", "benchmark", "profile", "leaderboard"]: + for run_type in ["test", "private", "profile", "public"]: runs[run_type] = EvalResult( start=datetime.datetime.now() - datetime.timedelta(minutes=5), end=datetime.datetime.now(), @@ -318,7 +318,7 @@ def test_make_short_report_full_success(): "✅ Testing successful", "✅ Benchmarking successful", "✅ Profiling successful", - "✅ Leaderboard run successful", + "✅ Ranked submission successful", ] assert result == expected @@ -331,7 +331,7 @@ def test_make_short_report_missing_components(): "✅ Compilation successful", "✅ Testing successful", "❌ Benchmarks missing", - "❌ Leaderboard missing", + "❌ Ranked submission missing", ] assert result == expected @@ -532,7 +532,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult): def test_generate_report_benchmark_failure(sample_full_result: FullResult): from libkernelbot.report import Log, Text - sample_full_result.runs["benchmark"] = create_eval_result() + sample_full_result.runs["private"] = create_eval_result() report = generate_report(sample_full_result) assert report.data == [ Text( @@ -557,8 +557,8 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): Log(header="Benchmarks", content="❗ Could not find any benchmarks"), ] - sample_full_result.runs["benchmark"].run.passed = False - sample_full_result.runs["benchmark"].run.result = { + sample_full_result.runs["private"].run.passed = False + sample_full_result.runs["private"].run.result = { "benchmark-count": "2", "benchmark.0.status": "pass", "benchmark.0.spec": "Basic functionality", @@ -607,7 +607,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): def test_generate_report_leaderboard_failure(sample_full_result: FullResult): from libkernelbot.report import Log, Text - sample_full_result.runs["leaderboard"] = create_eval_result() + sample_full_result.runs["public"] = create_eval_result() report = generate_report(sample_full_result) assert report.data == [ Text( @@ -632,9 +632,9 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): Log(header="Ranked Benchmark", content="❗ Could not find any benchmarks"), ] - sample_full_result.runs["leaderboard"].run.success = False - sample_full_result.runs["leaderboard"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED - sample_full_result.runs["leaderboard"].run.duration = 10.0 + sample_full_result.runs["public"].run.success = False + sample_full_result.runs["public"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED + sample_full_result.runs["public"].run.duration = 10.0 report = generate_report(sample_full_result) assert report.data == [ diff --git a/tests/test_submission.py b/tests/test_submission.py index e22fcb8e..1654b9b1 100644 --- a/tests/test_submission.py +++ b/tests/test_submission.py @@ -303,7 +303,7 @@ def test_compute_score(): # Test LAST ranking with single benchmark mock_task.ranking_by = RankCriterion.LAST mock_result.runs = { - "leaderboard": mock.Mock( + "public": mock.Mock( run=mock.Mock( result={ 
"benchmark-count": "1", @@ -317,7 +317,7 @@ def test_compute_score(): # Test MEAN ranking with multiple benchmarks mock_task.ranking_by = RankCriterion.MEAN - mock_result.runs["leaderboard"].run.result = { + mock_result.runs["public"].run.result = { "benchmark-count": "2", "benchmark.0.mean": "1000000000", # 1 second "benchmark.1.mean": "3000000000", # 3 seconds @@ -327,7 +327,7 @@ def test_compute_score(): # Test GEOM ranking with multiple benchmarks mock_task.ranking_by = RankCriterion.GEOM - mock_result.runs["leaderboard"].run.result = { + mock_result.runs["public"].run.result = { "benchmark-count": "2", "benchmark.0.mean": "4000000000", # 4 seconds "benchmark.1.mean": "9000000000", # 9 seconds @@ -337,7 +337,7 @@ def test_compute_score(): # Test LAST with multiple benchmarks (should raise error) mock_task.ranking_by = RankCriterion.LAST - mock_result.runs["leaderboard"].run.result["benchmark-count"] = "2" + mock_result.runs["public"].run.result["benchmark-count"] = "2" with pytest.raises(KernelBotError, match="exactly one benchmark"): submission.compute_score(mock_result, mock_task, 1) From 6fcc19f17e4ecfe8a618524047f5f0476d7cd56c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:15:57 -0800 Subject: [PATCH 3/5] Fix test_submit_full mock and expected mode - Add 'secret' key to mock launcher runs so SECRET mode can find its result - Fix second run's expected mode from 'public' to 'secret' --- tests/test_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 04d0fde4..ea420dbd 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -232,7 +232,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"public": eval_result}) + mock_launcher = _mock_launcher(bot, {"public": eval_result, "secret": eval_result}) from libkernelbot.submission import ProcessedSubmissionRequest @@ -344,7 +344,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "public", + "mode": "secret", "passed": True, "result": { "benchmark-count": "1", From 2e022cadeb6ca39c994ad00fa9c619458431f20e Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:19:04 -0800 Subject: [PATCH 4/5] Fix GitHub integration test to use PR branch Set GITHUB_BRANCH env var to use the PR's source branch instead of falling back to main. Uses github.head_ref for PRs, github.ref_name for direct pushes. 
--- .github/workflows/testing.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index c34818f2..667b2c4c 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -58,6 +58,7 @@ jobs: if: github.actor != 'dependabot[bot]' env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + GITHUB_BRANCH: ${{ github.head_ref || github.ref_name }} steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v4 From d47bad7ee9c02b9261cb42d0d1a014b736ce044c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:21:53 -0800 Subject: [PATCH 5/5] Fix test_submit_full mock to return mode-specific results Use side_effect to return different FullResult for each call: - First call (PUBLIC mode) returns {"public": eval_result} - Second call (SECRET mode) returns {"secret": eval_result} This prevents the backend from storing all keys from both calls. --- tests/test_backend.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index ea420dbd..a07b1514 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -232,7 +232,18 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"public": eval_result, "secret": eval_result}) + # Use side_effect to return different results for each call + # First call (PUBLIC mode) returns {"public": ...}, second call (SECRET mode) returns {"secret": ...} + mock_launcher = MagicMock(spec=backend.Launcher) + mock_launcher.name = "launcher" + mock_launcher.gpus = [consts.ModalGPU.A100] + mock_launcher.run_submission = AsyncMock( + side_effect=[ + FullResult(success=True, error="", system=sample_system_info(), runs={"public": eval_result}), + FullResult(success=True, error="", system=sample_system_info(), runs={"secret": eval_result}), + ] + ) + bot.register_launcher(mock_launcher) from libkernelbot.submission import ProcessedSubmissionRequest @@ -252,9 +263,15 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): req, mode=consts.SubmissionMode.PUBLIC, reporter=reporter ) - expected_result = mock_launcher.run_submission.return_value + expected_result_public = FullResult( + success=True, error="", system=sample_system_info(), runs={"public": eval_result} + ) + expected_result_secret = FullResult( + success=True, error="", system=sample_system_info(), runs={"secret": eval_result} + ) assert len(results) == 2 - assert results == [expected_result, expected_result] + assert results[0].success == expected_result_public.success + assert results[1].success == expected_result_secret.success r1, r2 = reporter.reporter_list assert r1.lines == [