From 8fb11399e7950bb778f3c3340afa94d60617fd8f Mon Sep 17 00:00:00 2001 From: Elliot Gestrin Date: Thu, 12 Mar 2026 23:11:46 +0100 Subject: [PATCH 1/7] Added a repair subagent to fix broken children --- openevolve/config.py | 26 +- openevolve/controller.py | 5 + openevolve/evaluation_result.py | 37 ++ openevolve/evaluator.py | 262 ++++++++- openevolve/evolution_trace.py | 30 +- openevolve/iteration.py | 16 + openevolve/process_parallel.py | 17 +- .../prompts/defaults/repair_diff_user.txt | 27 + .../defaults/repair_full_rewrite_user.txt | 24 + scripts/static/js/sidebar.js | 34 +- scripts/templates/program_page.html | 33 ++ tests/test_repair.py | 505 ++++++++++++++++++ 12 files changed, 1008 insertions(+), 8 deletions(-) create mode 100644 openevolve/prompts/defaults/repair_diff_user.txt create mode 100644 openevolve/prompts/defaults/repair_full_rewrite_user.txt create mode 100644 tests/test_repair.py diff --git a/openevolve/config.py b/openevolve/config.py index bef193da21..2ec78fe0e5 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -109,9 +109,13 @@ class LLMConfig(LLMModelConfig): # n-model configuration for evolution LLM ensemble models: List[LLMModelConfig] = field(default_factory=list) - # n-model configuration for evaluator LLM ensemble + # n-model configuration for evaluator LLM ensemble (LLM feedback scoring) evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: []) + # n-model configuration for repair LLM ensemble. + # Falls back to evaluator_models (then models) when not set. 
+ repair_models: List[LLMModelConfig] = field(default_factory=lambda: []) + # Backwardes compatibility with primary_model(_weight) options primary_model: str = None primary_model_weight: float = None @@ -184,7 +188,7 @@ def __post_init__(self): def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None: """Update model parameters for all models""" - for model in self.models + self.evaluator_models: + for model in self.models + self.evaluator_models + self.repair_models: for key, value in args.items(): if overwrite or getattr(model, key, None) is None: setattr(model, key, value) @@ -194,6 +198,7 @@ def rebuild_models(self) -> None: # Clear existing models lists self.models = [] self.evaluator_models = [] + self.repair_models = [] # Re-run model generation logic from __post_init__ if self.primary_model: @@ -220,6 +225,10 @@ def rebuild_models(self) -> None: if not self.evaluator_models: self.evaluator_models = self.models.copy() + # If no repair models are defined, fall back to evaluator_models + if not self.repair_models: + self.repair_models = self.evaluator_models.copy() + # Update models with shared configuration values shared_config = { "api_base": self.api_base, @@ -383,6 +392,19 @@ class EvaluatorConfig: enable_artifacts: bool = True max_artifact_storage: int = 100 * 1024 * 1024 # 100MB per program + # LLM-based repair on EvaluatorRepairRequest + # When a user evaluator raises EvaluatorRepairRequest (e.g. on compile + # failure) OpenEvolve will ask the LLM to fix the code and re-evaluate, + # storing the repaired version in the database rather than the broken + # original. + repair_on_failure: bool = False + max_repair_attempts: int = 2 + # True → ask the LLM for SEARCH/REPLACE diffs (uses repair_diff_user template) + # False → ask the LLM for a full rewrite (uses repair_full_rewrite_user template) + repair_diff_based: bool = False + # Diff pattern used when repair_diff_based=True; must match the template. 
+ repair_diff_pattern: str = r"<<<<<<< SEARCH\n(.*?)=======\n(.*?)>>>>>>> REPLACE" + @dataclass class EvolutionTraceConfig: diff --git a/openevolve/controller.py b/openevolve/controller.py index 01ffec73c3..be0efe8570 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -112,6 +112,9 @@ def __init__( for model_cfg in self.config.llm.evaluator_models: if not hasattr(model_cfg, "random_seed") or model_cfg.random_seed is None: model_cfg.random_seed = llm_seed + for model_cfg in self.config.llm.repair_models: + if not hasattr(model_cfg, "random_seed") or model_cfg.random_seed is None: + model_cfg.random_seed = llm_seed logger.info(f"Set random seed to {self.config.random_seed} for reproducibility") logger.debug(f"Generated LLM seed: {llm_seed}") @@ -139,6 +142,7 @@ def __init__( # Initialize components self.llm_ensemble = LLMEnsemble(self.config.llm.models) self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models) + self.llm_repair_ensemble = LLMEnsemble(self.config.llm.repair_models) self.prompt_sampler = PromptSampler(self.config.prompt) self.evaluator_prompt_sampler = PromptSampler(self.config.prompt) @@ -158,6 +162,7 @@ def __init__( self.evaluator_prompt_sampler, database=self.database, suffix=Path(self.initial_program_path).suffix, + repair_llm_ensemble=self.llm_repair_ensemble, ) self.evaluation_file = evaluation_file diff --git a/openevolve/evaluation_result.py b/openevolve/evaluation_result.py index cdc355539e..58b69f4daf 100644 --- a/openevolve/evaluation_result.py +++ b/openevolve/evaluation_result.py @@ -7,6 +7,43 @@ from typing import Dict, Union +class EvaluatorRepairRequest(Exception): + """ + Raised by a user evaluator to request an LLM-based code repair attempt. + + Raise this instead of returning a zero score when the generated code has a + correctable error (e.g. a compilation failure). 
OpenEvolve will attempt to + repair the code using the configured LLM before recording it in the database, + so that future evolution branches from working code rather than the broken + original. + + Args: + message: Human-readable error description (shown in repair history + and logged). + broken_code: The full source that failed. Must be the complete file, + not just the error region, so the repair LLM has full + context. + repair_context: Optional extra information for the repair prompt (e.g. + full compiler stderr, runtime traceback). Defaults to + the same text as *message*. + language: Source-language identifier used in the prompt code fence + (e.g. ``"cpp"``, ``"python"``). Defaults to + ``"python"``. + """ + + def __init__( + self, + message: str, + broken_code: str, + repair_context: str = "", + language: str = "python", + ) -> None: + super().__init__(message) + self.broken_code = broken_code + self.repair_context = repair_context or message + self.language = language + + @dataclass class EvaluationResult: """ diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index b1142ece50..a439c3df0a 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -19,7 +19,7 @@ from openevolve.config import EvaluatorConfig from openevolve.database import ProgramDatabase -from openevolve.evaluation_result import EvaluationResult +from openevolve.evaluation_result import EvaluationResult, EvaluatorRepairRequest from openevolve.database import ProgramDatabase from openevolve.llm.ensemble import LLMEnsemble from openevolve.utils.async_utils import TaskPool, run_in_executor @@ -45,11 +45,15 @@ def __init__( prompt_sampler: Optional[PromptSampler] = None, database: Optional[ProgramDatabase] = None, suffix: Optional[str] = ".py", + repair_llm_ensemble: Optional[LLMEnsemble] = None, ): self.config = config self.evaluation_file = evaluation_file self.program_suffix = suffix self.llm_ensemble = llm_ensemble + # Separate ensemble for LLM-based code repair; 
falls back to the main + # evaluator ensemble (llm_ensemble) when not provided. + self.repair_llm_ensemble = repair_llm_ensemble or llm_ensemble self.prompt_sampler = prompt_sampler self.database = database @@ -62,6 +66,11 @@ def __init__( # Pending artifacts storage for programs self._pending_artifacts: Dict[str, Dict[str, Union[str, bytes]]] = {} + # Pending repairs: program_id → repaired source code. + # Populated by _attempt_repair when repair succeeds; consumed by + # iteration.py / process_parallel.py via get_pending_repair(). + self._pending_repairs: Dict[str, str] = {} + logger.info(f"Initialized evaluator with {evaluation_file}") def _load_evaluation_function(self) -> None: @@ -264,6 +273,28 @@ async def evaluate_program( return {"error": 0.0, "timeout": True} + except EvaluatorRepairRequest as repair_req: + # The user evaluator signalled that the code needs LLM repair + # (e.g. a compilation failure). Attempt repair if configured; + # otherwise fall through to the standard zero-score path. + if self.config.repair_on_failure and self.repair_llm_ensemble: + repaired_metrics = await self._attempt_repair(repair_req, program_id) + if repaired_metrics is not None: + return repaired_metrics + # Repair disabled, not configured, or all attempts exhausted.
+ logger.warning( + f"Repair {'failed' if self.config.repair_on_failure else 'disabled'} " + f"for program{program_id_str}: {repair_req}" + ) + if artifacts_enabled and program_id: + if program_id not in self._pending_artifacts: + self._pending_artifacts[program_id] = {} + self._pending_artifacts[program_id].update({ + "compile_error": str(repair_req), + "repair_context": repair_req.repair_context, + }) + return {"combined_score": 0.0, "error": 0.0} + except Exception as e: last_exception = e logger.warning( @@ -328,6 +359,235 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str """ return self._pending_artifacts.pop(program_id, None) + def get_pending_repair(self, program_id: str) -> Optional[str]: + """ + Get and clear the repaired source code for a program, if one exists. + + Returns the repaired code string when a previous ``_attempt_repair`` + call succeeded, or ``None`` when no repair was performed. The entry is + removed from the internal store on first read (one-shot). + + Args: + program_id: Program ID used during evaluation. + + Returns: + Repaired source code string, or ``None``. + """ + return self._pending_repairs.pop(program_id, None) + + async def _attempt_repair( + self, + repair_req: EvaluatorRepairRequest, + program_id: str, + ) -> Optional[Dict[str, float]]: + """ + Attempt to repair broken code via the LLM, then re-evaluate. + + Loops up to ``config.max_repair_attempts`` times. On success the + repaired code is stored in ``_pending_repairs[program_id]`` and the + repair history is added to ``_pending_artifacts[program_id]`` so that + ``iteration.py`` can move both into ``Program.metadata``. + + Args: + repair_req: The ``EvaluatorRepairRequest`` raised by the evaluator. + program_id: Program ID for artifact/repair storage. + + Returns: + Metrics dict from the successfully repaired evaluation, or ``None`` + if all repair attempts failed. 
+ """ + artifacts_enabled = os.environ.get("ENABLE_ARTIFACTS", "true").lower() == "true" + broken_code = repair_req.broken_code + error_message = str(repair_req) + repair_context = repair_req.repair_context + language = repair_req.language + repair_history: List[Dict] = [] + + for attempt in range(1, self.config.max_repair_attempts + 1): + logger.info( + f"Repair attempt {attempt}/{self.config.max_repair_attempts} " + f"for program {program_id} (language={language})" + ) + + repaired_code = await self._repair_code( + broken_code=broken_code, + error_message=error_message, + repair_context=repair_context, + language=language, + ) + if repaired_code is None: + logger.warning(f"Repair attempt {attempt}: LLM returned no parseable code") + repair_history.append({ + "attempt": attempt, + "error": error_message, + "repair_error": "LLM returned no parseable code", + "succeeded": False, + }) + break + + # Write the repaired code to a temp file and re-evaluate + with tempfile.NamedTemporaryFile( + suffix=self.program_suffix, delete=False + ) as tmp: + tmp.write(repaired_code.encode("utf-8")) + tmp_path = tmp.name + + try: + result = await self._direct_evaluate(tmp_path) + eval_result = self._process_evaluation_result(result) + + # Success — store the repaired code and history + repair_history.append({ + "attempt": attempt, + "error": None, + "succeeded": True, + }) + logger.info( + f"Repair succeeded on attempt {attempt} for program {program_id}" + ) + self._pending_repairs[program_id] = repaired_code + if artifacts_enabled and program_id: + if program_id not in self._pending_artifacts: + self._pending_artifacts[program_id] = {} + self._pending_artifacts[program_id]["repair_history"] = repair_history + if eval_result.has_artifacts(): + self._pending_artifacts[program_id].update(eval_result.artifacts) + + elapsed = 0.0 # timing already handled by outer evaluate_program + logger.info( + f"Repaired program {program_id}: " + f"{format_metrics_safe(eval_result.metrics)}" + ) + 
return eval_result.metrics + + except EvaluatorRepairRequest as next_req: + # Re-evaluation raised another repair request — prepare next loop + error_message = str(next_req) + repair_context = next_req.repair_context + broken_code = next_req.broken_code + repair_history.append({ + "attempt": attempt, + "error": error_message, + "succeeded": False, + }) + except Exception as exc: + error_message = str(exc) + repair_history.append({ + "attempt": attempt, + "error": error_message, + "succeeded": False, + }) + logger.warning(f"Repair attempt {attempt} raised exception: {exc}") + break + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + # All attempts exhausted + logger.warning( + f"All {self.config.max_repair_attempts} repair attempt(s) failed " + f"for program {program_id}" + ) + if artifacts_enabled and program_id: + if program_id not in self._pending_artifacts: + self._pending_artifacts[program_id] = {} + self._pending_artifacts[program_id].update({ + "compile_error": str(repair_req), + "repair_context": repair_req.repair_context, + "repair_history": repair_history, + "repair_failed": True, + }) + return None + + async def _repair_code( + self, + broken_code: str, + error_message: str, + repair_context: str, + language: str, + ) -> Optional[str]: + """ + Ask the LLM to repair broken code and return the fixed source. + + Uses the ``repair_full_rewrite_user`` or ``repair_diff_user`` template + (depending on ``config.repair_diff_based``) and the ``repair_system_message`` + template (falling back to ``system_message`` if absent). + + Returns the repaired code string on success, or ``None`` if the LLM + response could not be parsed. 
+ """ + if not self.repair_llm_ensemble or not self.prompt_sampler: + logger.warning("_repair_code called but repair_llm_ensemble or prompt_sampler is None") + return None + + # --- Choose templates --- + user_template_name = ( + "repair_diff_user" if self.config.repair_diff_based else "repair_full_rewrite_user" + ) + try: + user_template = self.prompt_sampler.template_manager.get_template(user_template_name) + except ValueError: + logger.warning( + f"Repair template '{user_template_name}' not found — repair skipped. " + "Ensure the template file exists in your prompts directory." + ) + return None + + # Prefer a dedicated repair system message; fall back to the evolution one. + try: + system_message = self.prompt_sampler.template_manager.get_template( + "repair_system_message" + ) + except ValueError: + try: + system_message = self.prompt_sampler.template_manager.get_template( + "system_message" + ) + except ValueError: + system_message = ( + "You are an expert software developer. " + "Fix all errors in the provided code." + ) + + try: + # Use sequential replacement instead of str.format() so that braces + # inside broken_code / error_message / repair_context (e.g. C++ code) + # do not raise KeyError or corrupt the template. 
+ user_message = user_template + for placeholder, value in [ + ("{language}", language), + ("{error_message}", error_message), + ("{repair_context}", repair_context), + ("{broken_code}", broken_code), + ]: + user_message = user_message.replace(placeholder, value) + except Exception as exc: + logger.warning(f"Repair template substitution error: {exc}") + return None + + try: + llm_response = await self.repair_llm_ensemble.generate_with_context( + system_message=system_message, + messages=[{"role": "user", "content": user_message}], + ) + except Exception as exc: + logger.warning(f"LLM call during repair failed: {exc}") + return None + + # --- Parse the LLM response --- + if self.config.repair_diff_based: + from openevolve.utils.code_utils import apply_diff + repaired = apply_diff(broken_code, llm_response, self.config.repair_diff_pattern) + else: + from openevolve.utils.code_utils import parse_full_rewrite + repaired = parse_full_rewrite(llm_response, language) + + if not repaired or not repaired.strip(): + logger.warning("Repair LLM response yielded empty code after parsing") + return None + + return repaired + async def _direct_evaluate( self, program_path: str ) -> Union[Dict[str, float], EvaluationResult]: diff --git a/openevolve/evolution_trace.py b/openevolve/evolution_trace.py index 8e6699cda3..1e94c8927b 100644 --- a/openevolve/evolution_trace.py +++ b/openevolve/evolution_trace.py @@ -102,6 +102,9 @@ def __init__( "total_improvement": {}, "best_improvement": {}, "worst_decline": {}, + # Repair stats: how often LLM repair was triggered / succeeded + "repair_triggered": 0, + "repair_succeeded": 0, } if not self.enabled: @@ -232,6 +235,14 @@ def _update_stats(self, trace: EvolutionTrace): if delta < self.stats["worst_decline"][metric]: self.stats["worst_decline"][metric] = delta + # Track repair statistics from child program metadata + child_meta = trace.metadata or {} + repair_history = child_meta.get("repair_history") + if repair_history is not None: + 
self.stats["repair_triggered"] += 1 + if any(entry.get("succeeded") for entry in repair_history): + self.stats["repair_succeeded"] += 1 + def flush(self): """Write buffered traces to file""" if not self.enabled or not self.buffer: @@ -259,12 +270,18 @@ def flush(self): def get_statistics(self) -> Dict[str, Any]: """Get current tracing statistics""" + total = self.stats["total_traces"] + triggered = self.stats["repair_triggered"] return { **self.stats, "improvement_rate": ( - self.stats["improvement_count"] / self.stats["total_traces"] - if self.stats["total_traces"] > 0 - else 0 + self.stats["improvement_count"] / total if total > 0 else 0 + ), + # Fraction of all iterations where repair was triggered + "repair_trigger_rate": triggered / total if total > 0 else 0, + # Fraction of repair-triggered traces in which at least one attempt succeeded + "repair_success_rate": ( + self.stats["repair_succeeded"] / triggered if triggered > 0 else 0 ), } @@ -303,6 +320,13 @@ def close(self): logger.info(f"Evolution tracing complete. Total traces: {stats['total_traces']}") logger.info(f"Improvement rate: {stats['improvement_rate']:.2%}") + if stats["repair_triggered"] > 0: + logger.info( + f"Repair: triggered={stats['repair_triggered']}, " + f"succeeded={stats['repair_succeeded']}, " + f"success_rate={stats['repair_success_rate']:.2%}" + ) + if stats["best_improvement"]: logger.info(f"Best improvements: {stats['best_improvement']}") if stats["worst_decline"]: diff --git a/openevolve/iteration.py b/openevolve/iteration.py index 7afaff75b5..30902b13ba 100644 --- a/openevolve/iteration.py +++ b/openevolve/iteration.py @@ -168,6 +168,21 @@ async def run_iteration_with_shared_db( # Handle artifacts if they exist artifacts = evaluator.get_pending_artifacts(child_id) + # If the evaluator performed an LLM repair, use the repaired code as + # the canonical source for the database entry and demote the original + # broken LLM output to metadata["original_llm_code"].
+ repaired_code = evaluator.get_pending_repair(child_id) + repair_metadata: dict = {} + if repaired_code is not None: + repair_metadata["original_llm_code"] = child_code + repair_metadata["repair_history"] = ( + (artifacts or {}).pop("repair_history", []) + ) + child_code = repaired_code + logger.info( + f"Iteration {iteration}: using LLM-repaired code for program {child_id}" + ) + # Set template_key of Prompts template_key = "full_rewrite_user" if not config.diff_based_evolution else "diff_user" @@ -184,6 +199,7 @@ async def run_iteration_with_shared_db( metadata={ "changes": changes_summary, "parent_metrics": parent.metrics, + **repair_metadata, }, prompts=( { diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index a2fd6592a9..35386a2c1f 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -294,6 +294,17 @@ def _run_iteration_worker( # Get artifacts artifacts = _worker_evaluator.get_pending_artifacts(child_id) + # Apply LLM repair if the evaluator performed one (mirrors iteration.py) + repaired_code = _worker_evaluator.get_pending_repair(child_id) + repair_metadata: dict = {} + if repaired_code is not None: + repair_metadata["original_llm_code"] = child_code + repair_metadata["repair_history"] = (artifacts or {}).pop("repair_history", []) + child_code = repaired_code + logger.info( + f"Worker iteration {iteration}: using LLM-repaired code for program {child_id}" + ) + # Create child program child_program = Program( id=child_id, @@ -308,6 +319,7 @@ def _run_iteration_worker( "changes": changes_summary, "parent_metrics": parent.metrics, "island": parent_island, + **repair_metadata, }, ) @@ -593,8 +605,11 @@ async def run_evolution( artifacts=result.artifacts, island_id=island_id, metadata={ + **{ + k: v for k, v in child_program.metadata.items() + if k not in ("parent_metrics", "island") + }, "iteration_time": result.iteration_time, - "changes": child_program.metadata.get("changes", ""), }, ) diff --git 
a/openevolve/prompts/defaults/repair_diff_user.txt b/openevolve/prompts/defaults/repair_diff_user.txt new file mode 100644 index 0000000000..a64766a2c7 --- /dev/null +++ b/openevolve/prompts/defaults/repair_diff_user.txt @@ -0,0 +1,27 @@ +The following {language} program failed to evaluate due to the error below. +Repair the program using minimal SEARCH/REPLACE diffs. +Preserve all structure, class names, plugin keys, and invariants that are unrelated to the error. + +# Error + +{error_message} + +# Additional context + +{repair_context} + +# Broken Program + +```{language} +{broken_code} +``` + +Use the exact SEARCH/REPLACE diff format shown below. +Each SEARCH block must match the broken program exactly (including whitespace). +Multiple diff blocks are allowed. + +<<<<<<< SEARCH +# exact lines to replace +======= +# corrected replacement +>>>>>>> REPLACE diff --git a/openevolve/prompts/defaults/repair_full_rewrite_user.txt b/openevolve/prompts/defaults/repair_full_rewrite_user.txt new file mode 100644 index 0000000000..90732653de --- /dev/null +++ b/openevolve/prompts/defaults/repair_full_rewrite_user.txt @@ -0,0 +1,24 @@ +The following {language} program failed to evaluate due to the error below. +Repair the program so that all errors are resolved. +Preserve all structure, class names, plugin keys, and invariants that are unrelated to the error. + +# Error + +{error_message} + +# Additional context + +{repair_context} + +# Broken Program + +```{language} +{broken_code} +``` + +Output ONLY the complete corrected program inside a single code fence. +Do not include any commentary, explanation, or text outside the code fence. 
+ +```{language} +# corrected program here +``` diff --git a/scripts/static/js/sidebar.js b/scripts/static/js/sidebar.js index 1230c0f81c..31b5c00fdf 100644 --- a/scripts/static/js/sidebar.js +++ b/scripts/static/js/sidebar.js @@ -313,7 +313,12 @@ export function showSidebarContent(d, fromHover = false) { if (parentNodeForDiff && parentNodeForDiff.code && parentNodeForDiff.code.trim() !== '') { tabNames.push('Diff'); } - + + // Add a Repairs tab when LLM repair was performed on this program + if (d.metadata && d.metadata.repair_history && d.metadata.repair_history.length > 0) { + tabNames.push('Repairs'); + } + let activeTab = lastSidebarTab && tabNames.includes(lastSidebarTab) ? lastSidebarTab : tabNames[0]; // Helper to render tab content @@ -450,6 +455,33 @@ export function showSidebarContent(d, fromHover = false) { const curCode = d.code || ''; return renderCodeDiff(parentCode, curCode); } + if (tabName === 'Repairs') { + const originalCode = (d.metadata && d.metadata.original_llm_code) || ''; + const repairedCode = d.code || ''; + const history = (d.metadata && d.metadata.repair_history) || []; + // Attempt table + const tableRows = history.map(function(entry) { + const bg = entry.succeeded ? '#f2fff2' : '#fff0f0'; + const result = entry.succeeded ? '✓ succeeded' : '✗ failed'; + const errText = escapeHtml(entry.error || entry.repair_error || ''); + return '' + + '' + (entry.attempt || '') + '' + + '' + result + '' + + '' + errText + '' + + ''; + }).join(''); + const tableHtml = '' + + '' + + '' + + '' + + '' + + '' + tableRows + '
AttemptResultError
'; + const diffHtml = originalCode + ? '
Diff: original LLM output → repaired code
' + + renderCodeDiff(originalCode, repairedCode) + : '
(Original code not recorded)
'; + return tableHtml + diffHtml; + } return ''; } diff --git a/scripts/templates/program_page.html b/scripts/templates/program_page.html index 4b1652bcad..3ca1b0c6b2 100644 --- a/scripts/templates/program_page.html +++ b/scripts/templates/program_page.html @@ -36,6 +36,39 @@

Program ID: {{ program_data.id }}

Code:

{{ program_data.code }}
+ {% if program_data.metadata and program_data.metadata.repair_history %} +

Repairs:

+

This program was repaired by the LLM after its original output failed evaluation. + The repaired source is shown above. + The original broken LLM output and repair attempts are shown below.

+ {% if program_data.metadata.original_llm_code %} +

Original LLM-generated code (before repair):

+
{{ program_data.metadata.original_llm_code }}
+ {% endif %} +

Repair attempts:

+ + + + + + + + + + {% for entry in program_data.metadata.repair_history %} + + + + + + {% endfor %} + +
AttemptResultError
{{ entry.attempt }} + {{ '✓ succeeded' if entry.succeeded else '✗ failed' }} + +
{{ entry.error or entry.repair_error or '' }}
+
+ {% endif %}

Prompts: