From 118b06a730e20e50a4a30e43881869be30b6b6a5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 17:56:22 -0800 Subject: [PATCH 1/5] =?UTF-8?q?Rename=20submission=20modes:=20benchmark?= =?UTF-8?q?=E2=86=92private,=20leaderboard=E2=86=92public?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This renames the user-facing submission modes for clarity: - BENCHMARK → PRIVATE (run benchmarks without affecting leaderboard ranking) - LEADERBOARD → PUBLIC (official submission to the public leaderboard) Also adds SECRET mode for internal secret validation runs. Updates Discord commands: /benchmark → /private, /ranked → /public --- src/kernelbot/api/api_utils.py | 4 ++-- src/kernelbot/cogs/leaderboard_cog.py | 16 +++++++-------- src/kernelbot/cogs/verify_run_cog.py | 8 ++++---- src/libkernelbot/backend.py | 22 +++++++++++---------- src/libkernelbot/consts.py | 19 +++++++++--------- src/libkernelbot/launchers/github.py | 4 ++-- src/libkernelbot/report.py | 28 +++++++++++++++------------ src/libkernelbot/run_eval.py | 16 +++++++-------- src/libkernelbot/submission.py | 8 ++++---- tests/test_backend.py | 20 +++++++++---------- tests/test_github.py | 6 +++--- tests/test_modal.py | 10 +++++----- tests/test_task.py | 4 ++-- 13 files changed, 86 insertions(+), 79 deletions(-) diff --git a/src/kernelbot/api/api_utils.py b/src/kernelbot/api/api_utils.py index ab1505ac..1b37a8ef 100644 --- a/src/kernelbot/api/api_utils.py +++ b/src/kernelbot/api/api_utils.py @@ -213,9 +213,9 @@ async def to_submit_info( allowed_modes = [ SubmissionMode.TEST, - SubmissionMode.BENCHMARK, + SubmissionMode.PRIVATE, SubmissionMode.PROFILE, - SubmissionMode.LEADERBOARD, + SubmissionMode.PUBLIC, ] if submission_mode_enum not in allowed_modes: raise HTTPException( diff --git a/src/kernelbot/cogs/leaderboard_cog.py b/src/kernelbot/cogs/leaderboard_cog.py index 457321f3..8d00e471 100644 --- a/src/kernelbot/cogs/leaderboard_cog.py +++ b/src/kernelbot/cogs/leaderboard_cog.py @@ -64,7 +64,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): for run in sub_data["runs"]: if ( not run["secret"] - and run["mode"] == SubmissionMode.LEADERBOARD.value + and run["mode"] == SubmissionMode.PUBLIC.value and run["passed"] ): result_lines.append(generate_run_verdict(self.bot.backend, run, sub_data)) @@ -134,7 +134,7 @@ async def submit( reporter = MultiProgressReporterDiscord(interaction) sub_id, results = await self.bot.backend.submit_full(req, mode, reporter) - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.PUBLIC: await self.post_submit_hook(interaction, sub_id) return sub_id @@ -157,7 +157,7 @@ async def submit_test( interaction, leaderboard_name, script, mode=SubmissionMode.TEST, gpu=gpu ) - @app_commands.command(name="benchmark", description="Start a benchmarking run") + @app_commands.command(name="private", description="Start a private benchmarking run") @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", script="The Python / CUDA script file to run", @@ -165,7 +165,7 @@ async def submit_test( ) @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @with_error_handling - async def submit_bench( + async def submit_private( self, interaction: discord.Interaction, script: discord.Attachment, @@ -173,7 +173,7 @@ async def submit_bench( gpu: Optional[str], ): return await self.submit( - interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu + interaction, 
leaderboard_name, script, mode=SubmissionMode.PRIVATE, gpu=gpu ) @app_commands.command(name="profile", description="Start a profiling run") @@ -196,7 +196,7 @@ async def submit_profile( ) @app_commands.command( - name="ranked", description="Start a ranked run for an official leaderboard submission" + name="public", description="Start a public run for an official leaderboard submission" ) @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", @@ -205,7 +205,7 @@ async def submit_profile( ) @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @with_error_handling - async def submit_ranked( + async def submit_public( self, interaction: discord.Interaction, script: discord.Attachment, @@ -213,7 +213,7 @@ async def submit_ranked( gpu: Optional[str] = None, ): return await self.submit( - interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu + interaction, leaderboard_name, script, mode=SubmissionMode.PUBLIC, gpu=gpu ) diff --git a/src/kernelbot/cogs/verify_run_cog.py b/src/kernelbot/cogs/verify_run_cog.py index 53102682..58ad5844 100644 --- a/src/kernelbot/cogs/verify_run_cog.py +++ b/src/kernelbot/cogs/verify_run_cog.py @@ -171,8 +171,8 @@ async def verify_modal_run( @app_commands.choices( mode=[ Choice(name=SubmissionMode.TEST.name, value=SubmissionMode.TEST.value), - Choice(name=SubmissionMode.BENCHMARK.name, value=SubmissionMode.BENCHMARK.value), - Choice(name=SubmissionMode.LEADERBOARD.name, value=SubmissionMode.LEADERBOARD.value), + Choice(name=SubmissionMode.PRIVATE.name, value=SubmissionMode.PRIVATE.value), + Choice(name=SubmissionMode.PUBLIC.name, value=SubmissionMode.PUBLIC.value), Choice(name="All", value="all"), ] ) @@ -194,9 +194,9 @@ async def verify_task( modes = [] if mode is None: - modes = [SubmissionMode.LEADERBOARD] + modes = [SubmissionMode.PUBLIC] elif mode.value == "all": - modes = [SubmissionMode.TEST, SubmissionMode.BENCHMARK, SubmissionMode.LEADERBOARD] + modes = [SubmissionMode.TEST, SubmissionMode.PRIVATE, SubmissionMode.PUBLIC] else: modes = [SubmissionMode(mode.value)] diff --git a/src/libkernelbot/backend.py b/src/libkernelbot/backend.py index f3b68bb0..2f90e0e3 100644 --- a/src/libkernelbot/backend.py +++ b/src/libkernelbot/backend.py @@ -86,7 +86,7 @@ async def submit_full( for gpu in selected_gpus ] - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.PUBLIC: tasks += [ self.submit_leaderboard( sub_id, @@ -95,7 +95,7 @@ async def submit_full( gpu, reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), req.task, - SubmissionMode.PRIVATE, + SubmissionMode.SECRET, req.secret_seed, ) for gpu in selected_gpus @@ -142,12 +142,14 @@ async def submit_leaderboard( # noqa: C901 if result.success: score = None + # Check for the mode's result key (public or secret) + mode_key = mode.value if ( - "leaderboard" in result.runs - and result.runs["leaderboard"].run.success - and result.runs["leaderboard"].run.passed + mode_key in result.runs + and result.runs[mode_key].run.success + and result.runs[mode_key].run.passed ): - score = compute_score(result, task, submission_id) + score = compute_score(result, task, submission_id, mode_key) # verifyruns uses a fake submission id of -1 if submission_id != -1: @@ -159,8 +161,8 @@ async def submit_leaderboard( # noqa: C901 end=value.end, mode=key, runner=gpu_type.name, - score=None if key != "leaderboard" else score, - secret=mode == SubmissionMode.PRIVATE, + score=None if key != mode_key else score, + secret=mode == 
SubmissionMode.SECRET, compilation=value.compilation, result=value.run, system=result.system, @@ -207,7 +209,7 @@ async def handle_submission( await reporter.update_title(reporter.title + " ✅ success") short_report = make_short_report( - result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD] + result.runs, full=mode in [SubmissionMode.PUBLIC, SubmissionMode.SECRET] ) stream_msg = ( @@ -222,7 +224,7 @@ async def handle_submission( ) await reporter.push(short_report) - if mode != SubmissionMode.PRIVATE: + if mode != SubmissionMode.SECRET: try: # does the last message of the short report start with ✅ or ❌? verdict = short_report[-1][0] diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764de..ac667cce 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -82,21 +82,22 @@ class SubmissionMode(Enum): """ Different types of submission that can be made: Test: Run tests and give detailed results about passed/failed tests. These have short timeouts. - Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times. + Private: Run benchmarks privately. Each benchmark is tested once, and then run multiple times. + Returns detailed timing results but doesn't affect leaderboard ranking. Profile: Gather profiling information. One selected benchmark is run under the profiler. No testing is performed in this mode (sometimes, you need to profile deliberately broken code) - Leaderboard: Official submission to the leaderboard. This first runs public tests, then a - repeated invocation of a single benchmark. Feedback for the secret benchmark is only very - limited (no stdout/stderr). - Private: Special run that does test followed by leaderboard (on a secret seed), but gives only - very limited feedback. + Public: Official submission to the leaderboard. This first runs public tests, then a + repeated invocation of a single benchmark. If all tests pass, the submission is evaluated + and ranked on the public leaderboard. + Secret: Internal mode for running the full evaluation flow with a secret seed. This is used + for secret validation runs that accompany public submissions. 
""" TEST = "test" - BENCHMARK = "benchmark" - PROFILE = "profile" - LEADERBOARD = "leaderboard" PRIVATE = "private" + PROFILE = "profile" + PUBLIC = "public" + SECRET = "secret" class Language(Enum): diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index a1970a7e..c984c749 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -49,8 +49,8 @@ def get_timeout(config: dict) -> int: mode = config.get("mode") sec_map = { SubmissionMode.TEST.value: config.get("test_timeout"), - SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"), - SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"), + SubmissionMode.PRIVATE.value: config.get("benchmark_timeout"), + SubmissionMode.PUBLIC.value: config.get("ranked_timeout"), } seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60 return math.ceil(seconds / 60) diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 70f91487..b0f8baf5 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -176,8 +176,8 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Tests missing") - if "benchmark" in runs: - bench_run = runs["benchmark"].run + if "private" in runs: + bench_run = runs["private"].run if not bench_run.success: result.append("❌ Running benchmarks failed" + _short_fail_reason(bench_run)) return result @@ -202,16 +202,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n else: result.append("✅ Profiling successful") - if "leaderboard" in runs: - lb_run = runs["leaderboard"].run + # Check for public or secret run results + ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None) + if ranked_key: + lb_run = runs[ranked_key].run if not lb_run.success: - result.append("❌ Running leaderboard failed" + _short_fail_reason(lb_run)) + result.append("❌ Running ranked submission failed" + _short_fail_reason(lb_run)) elif not lb_run.passed: - result.append("❌ Leaderboard run failed") + result.append("❌ Ranked submission failed") else: - result.append("✅ Leaderboard run successful") + result.append("✅ Ranked submission successful") elif full: - result.append("❌ Leaderboard missing") + result.append("❌ Ranked submission missing") return result @@ -339,8 +341,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport num_tests = int(test_run.result.get("test-count", 0)) report.add_log(f"✅ Passed {num_tests}/{num_tests} tests", make_test_log(test_run)) - if "benchmark" in runs: - bench_run = runs["benchmark"] + if "private" in runs: + bench_run = runs["private"] if _handle_crash_report(report, bench_run): return report @@ -378,8 +380,10 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport base64.b64decode(prof_run.profile.trace), ) - if "leaderboard" in runs: - bench_run = runs["leaderboard"] + # Check for public or secret run results + ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None) + if ranked_key: + bench_run = runs[ranked_key] if _handle_crash_report(report, bench_run): return report diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index aec59f95..5891f302 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -556,8 +556,8 @@ def run_single_evaluation( if mode == "test": timeout = test_timeout cases.write(tests) - elif mode in ["benchmark", "profile", 
"leaderboard"]: - timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout + elif mode in ["private", "profile", "public", "secret"]: + timeout = ranked_timeout if mode in ["public", "secret"] else benchmark_timeout if ranking_by == "last": cases.write(benchmarks.splitlines(keepends=True)[-1]) else: @@ -801,22 +801,22 @@ def run_evaluation( common_args["benchmarks"] = benchmark results[f"{mode}.{i}"] = call(mode=mode, **common_args) - elif mode in ["test", "benchmark"]: + elif mode in ["test", "private"]: results[mode] = call(mode=mode, **common_args) - elif mode in ["private", "leaderboard"]: + elif mode in ["public", "secret"]: # first, run the tests results["test"] = call(mode="test", **common_args) if not results["test"].run or not results["test"].run.passed: return results - results["benchmark"] = call(mode="benchmark", **common_args) + results["private"] = call(mode="private", **common_args) - if not results["benchmark"].run or not results["benchmark"].run.passed: + if not results["private"].run or not results["private"].run.passed: return results - # if they pass, run the leaderboard validation - results["leaderboard"] = call(mode="leaderboard", **common_args) + # if they pass, run the public/secret validation + results[mode] = call(mode=mode, **common_args) else: raise AssertionError("Invalid mode") diff --git a/src/libkernelbot/submission.py b/src/libkernelbot/submission.py index 805f7435..12cbc661 100644 --- a/src/libkernelbot/submission.py +++ b/src/libkernelbot/submission.py @@ -169,8 +169,8 @@ def _get_popcorn_directives(submission: str) -> dict: # noqa: C901 return popcorn_info -def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) -> float: - num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"]) +def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int, mode_key: str = "public") -> float: + num_benchmarks = int(result.runs[mode_key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: if num_benchmarks != 1: logger.error( @@ -182,11 +182,11 @@ def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) raise KernelBotError( f"Expected submission to have exactly one benchmark, got {num_benchmarks}." 
) - score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9 + score = float(result.runs[mode_key].run.result["benchmark.0.mean"]) / 1e9 else: scores = [] for i in range(num_benchmarks): - scores.append(float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) / 1e9) + scores.append(float(result.runs[mode_key].run.result[f"benchmark.{i}.mean"]) / 1e9) if task.ranking_by == RankCriterion.MEAN: score = sum(scores) / len(scores) elif task.ranking_by == RankCriterion.GEOM: diff --git a/tests/test_backend.py b/tests/test_backend.py index f69170c5..af327519 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -55,7 +55,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "pass", "submit.py", task, - consts.SubmissionMode.LEADERBOARD, + consts.SubmissionMode.PUBLIC, -1, ) @@ -64,7 +64,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "> ✅ Compilation successful", "> ✅ Testing successful", "> ❌ Benchmarks missing", - "> ❌ Leaderboard missing", + "> ❌ Ranked submission missing", ] call_args = reporter.display_report.call_args[0] @@ -130,7 +130,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): submit_time, ) eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"leaderboard": eval_result}) + mock_launcher = _mock_launcher(bot, {"secret": eval_result}) reporter = MockProgressReporter("report") @@ -141,7 +141,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): consts.ModalGPU.A100, reporter, task, - consts.SubmissionMode.LEADERBOARD, + consts.SubmissionMode.SECRET, seed=1337, ) @@ -155,7 +155,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "benchmarks": [{"dtype": "float32", "input_size": 10000}], "lang": "py", "main": "kernel.py", - "mode": "leaderboard", + "mode": "secret", "multi_gpu": False, "ranked_timeout": 180, "ranking_by": "geom", @@ -193,7 +193,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "secret", "passed": True, "result": { "benchmark-count": "1", @@ -206,7 +206,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): }, "runner": "A100", "score": Decimal("1.5e-9"), - "secret": False, + "secret": True, "start_time": eval_result.start.replace(tzinfo=datetime.timezone.utc), "system": { "cpu": "Intel i9-12900K", @@ -249,7 +249,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): ) reporter = MockMultReporter() s_id, results = await bot.submit_full( - req, mode=consts.SubmissionMode.LEADERBOARD, reporter=reporter + req, mode=consts.SubmissionMode.PUBLIC, reporter=reporter ) expected_result = mock_launcher.run_submission.return_value @@ -261,13 +261,13 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "> ✅ Compilation successful", "> ❌ Tests missing", "> ❌ Benchmarks missing", - "> ✅ Leaderboard run successful", + "> ✅ Ranked submission successful", ] assert r2.lines == [ "> ✅ Compilation successful", "> ❌ Tests missing", "> ❌ Benchmarks missing", - "> ✅ Leaderboard run successful", + "> ✅ Ranked submission successful", ] assert r1.title == "A100 on Modal ✅ success" assert r2.title == "A100 on Modal (secret) ✅ success" diff --git a/tests/test_github.py b/tests/test_github.py index 413e00bd..8b8a4d63 100644 --- a/tests/test_github.py +++ b/tests/test_github.py @@ 
-179,7 +179,7 @@ async def test_github_launcher_failing_script(project_root: Path, github_config: task=task_definition.task, submission_content=submission_content, arch=0, - mode=SubmissionMode.LEADERBOARD, + mode=SubmissionMode.PUBLIC, ) result = await launcher.run_submission(config, gpu_type, reporter) @@ -190,9 +190,9 @@ async def test_github_launcher_failing_script(project_root: Path, github_config: # But the actual test or benchmark should fail test_passed = result.runs.get("test", {}).run.passed if "test" in result.runs else True - benchmark_passed = result.runs.get("benchmark", {}).run.passed if "benchmark" in result.runs else True + private_passed = result.runs.get("private", {}).run.passed if "private" in result.runs else True - assert not (test_passed and benchmark_passed), "Expected at least one run to fail for cheating script" + assert not (test_passed and private_passed), "Expected at least one run to fail for cheating script" diff --git a/tests/test_modal.py b/tests/test_modal.py index d22ef05b..c87bd2c7 100644 --- a/tests/test_modal.py +++ b/tests/test_modal.py @@ -265,7 +265,7 @@ async def test_modal_multi_gpu_benchmark( task=task_definition.task, submission_content=submission_content, arch=GPU_TO_SM[ModalGPU.L4x4.name], - mode=SubmissionMode.BENCHMARK, + mode=SubmissionMode.PRIVATE, ) result = await launcher.run_submission(config, ModalGPU.L4x4, reporter) @@ -280,8 +280,8 @@ async def test_modal_multi_gpu_benchmark( assert result.system.device_count == 4 # Test run structure - assert "benchmark" in result.runs - bench_run = result.runs["benchmark"] + assert "private" in result.runs + bench_run = result.runs["private"] # For Python runs, compilation is None assert bench_run.compilation is None @@ -317,7 +317,7 @@ async def test_modal_launcher_failing_script(modal_deployment, project_root: Pat task=task_definition.task, submission_content=submission_content, arch=GPU_TO_SM[gpu_type.name], - mode=SubmissionMode.LEADERBOARD, + mode=SubmissionMode.PUBLIC, ) result = await launcher.run_submission(config, gpu_type, reporter) @@ -325,4 +325,4 @@ async def test_modal_launcher_failing_script(modal_deployment, project_root: Pat # Basic structure and success assert result.success, f"Expected successful run, got: {result.error}" assert result.error == "" - assert result.runs["test"].run.passed is False or result.runs["benchmark"].run.passed is False + assert result.runs["test"].run.passed is False or result.runs["private"].run.passed is False diff --git a/tests/test_task.py b/tests/test_task.py index 809a6907..b6fa6a63 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -126,7 +126,7 @@ def test_build_task_config_python(leaderboard_task): """Test build_task_config with Python task and submission content.""" submission_content = "print('Hello World')" arch = "sm_80" - mode = SubmissionMode.BENCHMARK + mode = SubmissionMode.PRIVATE result = build_task_config( task=leaderboard_task, submission_content=submission_content, arch=arch, mode=mode @@ -164,7 +164,7 @@ def test_build_task_config_cuda(): """Test build_task_config with CUDA task and submission content.""" submission_content = "print('Hello World')" arch = "sm_80" - mode = SubmissionMode.BENCHMARK + mode = SubmissionMode.PRIVATE task = LeaderboardTask( lang=Language.CUDA, files={"test.cu": "code", "submission.cu": "@SUBMISSION@", "test.cuh": "header"}, From ee62fdc798c4b6ba8a28d854a7ff3a7bed31f4b9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:12:10 -0800 Subject: [PATCH 2/5] Fix test files to use new 
private/public mode naming Update test data keys and expected values: - test_report.py: Change "benchmark"/"leaderboard" keys to "private"/"public" - test_submission.py: Update compute_score test to use "public" key - test_backend.py: Update mode values and mock data keys --- tests/test_backend.py | 8 ++++---- tests/test_report.py | 30 +++++++++++++++--------------- tests/test_submission.py | 8 ++++---- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index af327519..04d0fde4 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -101,7 +101,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "benchmarks": [{"dtype": "float32", "input_size": 10000}], "lang": "py", "main": "kernel.py", - "mode": "leaderboard", + "mode": "public", "multi_gpu": False, "ranked_timeout": 180, "ranking_by": "geom", @@ -232,7 +232,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"leaderboard": eval_result}) + mock_launcher = _mock_launcher(bot, {"public": eval_result}) from libkernelbot.submission import ProcessedSubmissionRequest @@ -300,7 +300,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "public", "passed": True, "result": { "benchmark-count": "1", @@ -344,7 +344,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "public", "passed": True, "result": { "benchmark-count": "1", diff --git a/tests/test_report.py b/tests/test_report.py index ae3afd25..f8a61b29 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -241,7 +241,7 @@ def test_make_short_report_benchmarking_failed(sample_eval_result: EvalResult): sample_eval_result.run.success = False sample_eval_result.compilation = None sample_eval_result.run.exit_code = consts.ExitCode.CUDA_FAIL - runs = {"benchmark": sample_eval_result} + runs = {"private": sample_eval_result} result = make_short_report(runs, full=False) assert result == ["❌ Running benchmarks failed (cuda api error)"] @@ -274,27 +274,27 @@ def test_make_short_report_leaderboard_failed(sample_eval_result: EvalResult): sample_eval_result.run.success = False sample_eval_result.compilation = None sample_eval_result.run.exit_code = consts.ExitCode.TEST_SPEC - runs = {"leaderboard": sample_eval_result} + runs = {"public": sample_eval_result} result = make_short_report(runs, full=False) - assert result == ["❌ Running leaderboard failed (internal error 113)"] + assert result == ["❌ Running ranked submission failed (internal error 113)"] sample_eval_result.run.success = True sample_eval_result.run.passed = False sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL result = make_short_report(runs) # TODO is this actually possible? Should profiling do **any** correctness testing? 
- assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard run failed"] + assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission failed"] def test_make_short_report_empty(): result = make_short_report({}) - assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard missing"] + assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission missing"] def test_make_short_report_full_success(): runs = {} - for run_type in ["test", "benchmark", "profile", "leaderboard"]: + for run_type in ["test", "private", "profile", "public"]: runs[run_type] = EvalResult( start=datetime.datetime.now() - datetime.timedelta(minutes=5), end=datetime.datetime.now(), @@ -318,7 +318,7 @@ def test_make_short_report_full_success(): "✅ Testing successful", "✅ Benchmarking successful", "✅ Profiling successful", - "✅ Leaderboard run successful", + "✅ Ranked submission successful", ] assert result == expected @@ -331,7 +331,7 @@ def test_make_short_report_missing_components(): "✅ Compilation successful", "✅ Testing successful", "❌ Benchmarks missing", - "❌ Leaderboard missing", + "❌ Ranked submission missing", ] assert result == expected @@ -532,7 +532,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult): def test_generate_report_benchmark_failure(sample_full_result: FullResult): from libkernelbot.report import Log, Text - sample_full_result.runs["benchmark"] = create_eval_result() + sample_full_result.runs["private"] = create_eval_result() report = generate_report(sample_full_result) assert report.data == [ Text( @@ -557,8 +557,8 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): Log(header="Benchmarks", content="❗ Could not find any benchmarks"), ] - sample_full_result.runs["benchmark"].run.passed = False - sample_full_result.runs["benchmark"].run.result = { + sample_full_result.runs["private"].run.passed = False + sample_full_result.runs["private"].run.result = { "benchmark-count": "2", "benchmark.0.status": "pass", "benchmark.0.spec": "Basic functionality", @@ -607,7 +607,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): def test_generate_report_leaderboard_failure(sample_full_result: FullResult): from libkernelbot.report import Log, Text - sample_full_result.runs["leaderboard"] = create_eval_result() + sample_full_result.runs["public"] = create_eval_result() report = generate_report(sample_full_result) assert report.data == [ Text( @@ -632,9 +632,9 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): Log(header="Ranked Benchmark", content="❗ Could not find any benchmarks"), ] - sample_full_result.runs["leaderboard"].run.success = False - sample_full_result.runs["leaderboard"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED - sample_full_result.runs["leaderboard"].run.duration = 10.0 + sample_full_result.runs["public"].run.success = False + sample_full_result.runs["public"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED + sample_full_result.runs["public"].run.duration = 10.0 report = generate_report(sample_full_result) assert report.data == [ diff --git a/tests/test_submission.py b/tests/test_submission.py index e22fcb8e..1654b9b1 100644 --- a/tests/test_submission.py +++ b/tests/test_submission.py @@ -303,7 +303,7 @@ def test_compute_score(): # Test LAST ranking with single benchmark mock_task.ranking_by = RankCriterion.LAST mock_result.runs = { - "leaderboard": mock.Mock( + "public": mock.Mock( run=mock.Mock( result={ 
"benchmark-count": "1", @@ -317,7 +317,7 @@ def test_compute_score(): # Test MEAN ranking with multiple benchmarks mock_task.ranking_by = RankCriterion.MEAN - mock_result.runs["leaderboard"].run.result = { + mock_result.runs["public"].run.result = { "benchmark-count": "2", "benchmark.0.mean": "1000000000", # 1 second "benchmark.1.mean": "3000000000", # 3 seconds @@ -327,7 +327,7 @@ def test_compute_score(): # Test GEOM ranking with multiple benchmarks mock_task.ranking_by = RankCriterion.GEOM - mock_result.runs["leaderboard"].run.result = { + mock_result.runs["public"].run.result = { "benchmark-count": "2", "benchmark.0.mean": "4000000000", # 4 seconds "benchmark.1.mean": "9000000000", # 9 seconds @@ -337,7 +337,7 @@ def test_compute_score(): # Test LAST with multiple benchmarks (should raise error) mock_task.ranking_by = RankCriterion.LAST - mock_result.runs["leaderboard"].run.result["benchmark-count"] = "2" + mock_result.runs["public"].run.result["benchmark-count"] = "2" with pytest.raises(KernelBotError, match="exactly one benchmark"): submission.compute_score(mock_result, mock_task, 1) From 6fcc19f17e4ecfe8a618524047f5f0476d7cd56c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:15:57 -0800 Subject: [PATCH 3/5] Fix test_submit_full mock and expected mode - Add 'secret' key to mock launcher runs so SECRET mode can find its result - Fix second run's expected mode from 'public' to 'secret' --- tests/test_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 04d0fde4..ea420dbd 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -232,7 +232,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"public": eval_result}) + mock_launcher = _mock_launcher(bot, {"public": eval_result, "secret": eval_result}) from libkernelbot.submission import ProcessedSubmissionRequest @@ -344,7 +344,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "public", + "mode": "secret", "passed": True, "result": { "benchmark-count": "1", From 2e022cadeb6ca39c994ad00fa9c619458431f20e Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:19:04 -0800 Subject: [PATCH 4/5] Fix GitHub integration test to use PR branch Set GITHUB_BRANCH env var to use the PR's source branch instead of falling back to main. Uses github.head_ref for PRs, github.ref_name for direct pushes. 
--- .github/workflows/testing.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index c34818f2..667b2c4c 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -58,6 +58,7 @@ jobs: if: github.actor != 'dependabot[bot]' env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + GITHUB_BRANCH: ${{ github.head_ref || github.ref_name }} steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v4 From d47bad7ee9c02b9261cb42d0d1a014b736ce044c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:21:53 -0800 Subject: [PATCH 5/5] Fix test_submit_full mock to return mode-specific results Use side_effect to return different FullResult for each call: - First call (PUBLIC mode) returns {"public": eval_result} - Second call (SECRET mode) returns {"secret": eval_result} This prevents the backend from storing all keys from both calls. --- tests/test_backend.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index ea420dbd..a07b1514 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -232,7 +232,18 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"public": eval_result, "secret": eval_result}) + # Use side_effect to return different results for each call + # First call (PUBLIC mode) returns {"public": ...}, second call (SECRET mode) returns {"secret": ...} + mock_launcher = MagicMock(spec=backend.Launcher) + mock_launcher.name = "launcher" + mock_launcher.gpus = [consts.ModalGPU.A100] + mock_launcher.run_submission = AsyncMock( + side_effect=[ + FullResult(success=True, error="", system=sample_system_info(), runs={"public": eval_result}), + FullResult(success=True, error="", system=sample_system_info(), runs={"secret": eval_result}), + ] + ) + bot.register_launcher(mock_launcher) from libkernelbot.submission import ProcessedSubmissionRequest @@ -252,9 +263,15 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): req, mode=consts.SubmissionMode.PUBLIC, reporter=reporter ) - expected_result = mock_launcher.run_submission.return_value + expected_result_public = FullResult( + success=True, error="", system=sample_system_info(), runs={"public": eval_result} + ) + expected_result_secret = FullResult( + success=True, error="", system=sample_system_info(), runs={"secret": eval_result} + ) assert len(results) == 2 - assert results == [expected_result, expected_result] + assert results[0].success == expected_result_public.success + assert results[1].success == expected_result_secret.success r1, r2 = reporter.reporter_list assert r1.lines == [