diff --git a/CLAUDE.md b/CLAUDE.md index f763e7890c..2632306eb3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -75,11 +75,14 @@ python scripts/visualizer.py --path examples/function_minimization/openevolve_ou 5. **Iteration (`openevolve/iteration.py`)**: Worker process that samples from islands, generates mutations via LLM, evaluates programs, and stores artifacts. +6. **Repair Subagent (`openevolve/evaluator.py`)**: When an evaluator raises `EvaluatorRepairRequest` (e.g. on compilation failure), the evaluator asks a dedicated LLM ensemble to fix the code and re-evaluates it. Configured via `EvaluatorConfig.repair_on_failure`, `max_repair_attempts`, and `repair_diff_based`. Uses `repair_models` from `LLMConfig` (falls back to `evaluator_models` then `models`). Repair history is stored as artifacts. + ### Key Architectural Patterns - **Island-Based Evolution**: Multiple populations evolve separately with periodic migration - **MAP-Elites**: Maintains diversity by mapping programs to feature grid cells - **Artifact System**: Side-channel for programs to return debugging data, stored as JSON or files +- **LLM Repair Loop**: Evaluators can raise `EvaluatorRepairRequest` to trigger LLM-based code repair before discarding broken programs - **Process Worker Pattern**: Each iteration runs in fresh process with database snapshot - **Double-Selection**: Programs for inspiration differ from those shown to LLM - **Lazy Migration**: Islands migrate based on generation counts, not iterations diff --git a/README.md b/README.md index 740909f5e5..9d2fcdefcc 100644 --- a/README.md +++ b/README.md @@ -468,6 +468,9 @@ evaluator: enable_artifacts: true # Error feedback to LLM cascade_evaluation: true # Multi-stage testing use_llm_feedback: true # AI code quality assessment + repair_on_failure: true # LLM repair on EvaluatorRepairRequest + max_repair_attempts: 2 # Retry limit per broken program + repair_diff_based: false # true=SEARCH/REPLACE diffs, false=full rewrite prompt: # Sophisticated inspiration system @@ -720,6 +723,44 @@ return EvaluationResult( This creates a **feedback loop** where each generation learns from previous mistakes! +### LLM-Based Code Repair + +When evolved code has a correctable error (e.g. a compilation failure), your evaluator can raise `EvaluatorRepairRequest` to trigger an automatic LLM repair attempt instead of discarding the program: + +```python +from openevolve.evaluation_result import EvaluatorRepairRequest + +def evaluate(program_path): + result = compile(program_path) + if result.returncode != 0: + with open(program_path) as f: + code = f.read() + raise EvaluatorRepairRequest( + message="Compilation failed", + broken_code=code, + repair_context=result.stderr, + language="cpp", + fallback_metrics={"combined_score": 0.0}, # used if repair fails + ) + # ... normal evaluation ... +``` + +Enable repair in your config: + +```yaml +evaluator: + repair_on_failure: true + max_repair_attempts: 2 + repair_diff_based: false # true for SEARCH/REPLACE diffs, false for full rewrite + +llm: + repair_models: # optional — falls back to evaluator_models, then models + - name: "your-repair-model" + weight: 1.0 +``` + +Repair history is stored in program artifacts and displayed in the visualizer. 
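For reference, a fuller, runnable sketch of the pattern above for a compiled C++ target is shown below. The `g++` invocation, timeout, and scoring are illustrative assumptions rather than part of OpenEvolve's API; only `EvaluatorRepairRequest` and its fields come from this feature. The extra `runtime` entry in `fallback_metrics` is a hypothetical MAP-Elites feature dimension, included so that an unrepaired program still maps to a grid cell:

```python
import subprocess

from openevolve.evaluation_result import EvaluatorRepairRequest


def evaluate(program_path):
    # Hypothetical compile step: swap in your own compiler and flags.
    binary_path = program_path + ".bin"
    compile_proc = subprocess.run(
        ["g++", "-O2", "-std=c++17", program_path, "-o", binary_path],
        capture_output=True,
        text=True,
        timeout=120,
    )
    if compile_proc.returncode != 0:
        with open(program_path) as f:
            broken_code = f.read()
        raise EvaluatorRepairRequest(
            message="Compilation failed",
            broken_code=broken_code,             # full source, not just the failing region
            repair_context=compile_proc.stderr,  # full compiler output helps the repair LLM
            language="cpp",
            fallback_metrics={"combined_score": 0.0, "runtime": 0.0},  # "runtime" is illustrative
        )

    # ... run the binary and compute real metrics (placeholder scoring below) ...
    run_proc = subprocess.run([binary_path], capture_output=True, text=True, timeout=60)
    return {"combined_score": 1.0 if run_proc.returncode == 0 else 0.0, "runtime": 1.0}
```

If repair succeeds, the program stored in the database is the repaired source; the original broken LLM output is kept in `metadata["original_llm_code"]` and the attempt log in `metadata["repair_history"]`.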
+ ## Visualization **Real-time evolution tracking** with interactive web interface: diff --git a/openevolve/config.py b/openevolve/config.py index bef193da21..a1f4ae7b7a 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -109,9 +109,13 @@ class LLMConfig(LLMModelConfig): # n-model configuration for evolution LLM ensemble models: List[LLMModelConfig] = field(default_factory=list) - # n-model configuration for evaluator LLM ensemble + # n-model configuration for evaluator LLM ensemble (LLM feedback scoring) evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: []) + # n-model configuration for repair LLM ensemble. + # Falls back to evaluator_models (then models) when not set. + repair_models: List[LLMModelConfig] = field(default_factory=lambda: []) + # Backwardes compatibility with primary_model(_weight) options primary_model: str = None primary_model_weight: float = None @@ -184,7 +188,7 @@ def __post_init__(self): def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None: """Update model parameters for all models""" - for model in self.models + self.evaluator_models: + for model in self.models + self.evaluator_models + self.repair_models: for key, value in args.items(): if overwrite or getattr(model, key, None) is None: setattr(model, key, value) @@ -194,6 +198,7 @@ def rebuild_models(self) -> None: # Clear existing models lists self.models = [] self.evaluator_models = [] + self.repair_models = [] # Re-run model generation logic from __post_init__ if self.primary_model: @@ -220,6 +225,10 @@ def rebuild_models(self) -> None: if not self.evaluator_models: self.evaluator_models = self.models.copy() + # If no repair models are defined, fall back to evaluator_models + if not self.repair_models: + self.repair_models = self.evaluator_models.copy() + # Update models with shared configuration values shared_config = { "api_base": self.api_base, @@ -383,6 +392,17 @@ class EvaluatorConfig: enable_artifacts: bool = True max_artifact_storage: int = 100 * 1024 * 1024 # 100MB per program + # LLM-based repair on EvaluatorRepairRequest + # When a user evaluator raises EvaluatorRepairRequest (e.g. on compile + # failure) OpenEvolve will ask the LLM to fix the code and re-evaluate, + # storing the repaired version in the database rather than the broken + # original. 
+ repair_on_failure: bool = False + max_repair_attempts: int = 2 + # True → ask the LLM for SEARCH/REPLACE diffs (uses repair_diff_user template) + # False → ask the LLM for a full rewrite (uses repair_full_rewrite_user template) + repair_diff_based: bool = False + @dataclass class EvolutionTraceConfig: diff --git a/openevolve/controller.py b/openevolve/controller.py index 01ffec73c3..be0efe8570 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -112,6 +112,9 @@ def __init__( for model_cfg in self.config.llm.evaluator_models: if not hasattr(model_cfg, "random_seed") or model_cfg.random_seed is None: model_cfg.random_seed = llm_seed + for model_cfg in self.config.llm.repair_models: + if not hasattr(model_cfg, "random_seed") or model_cfg.random_seed is None: + model_cfg.random_seed = llm_seed logger.info(f"Set random seed to {self.config.random_seed} for reproducibility") logger.debug(f"Generated LLM seed: {llm_seed}") @@ -139,6 +142,7 @@ def __init__( # Initialize components self.llm_ensemble = LLMEnsemble(self.config.llm.models) self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models) + self.llm_repair_ensemble = LLMEnsemble(self.config.llm.repair_models) self.prompt_sampler = PromptSampler(self.config.prompt) self.evaluator_prompt_sampler = PromptSampler(self.config.prompt) @@ -158,6 +162,7 @@ def __init__( self.evaluator_prompt_sampler, database=self.database, suffix=Path(self.initial_program_path).suffix, + repair_llm_ensemble=self.llm_repair_ensemble, ) self.evaluation_file = evaluation_file diff --git a/openevolve/database.py b/openevolve/database.py index eca5eab0bb..12d57b1982 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -1435,7 +1435,7 @@ def _sample_from_island_weighted(self, island_id: int) -> Program: Parent program selected using fitness-weighted sampling """ island_id = island_id % len(self.islands) - island_programs = list(self.islands[island_id]) + island_programs = sorted(self.islands[island_id]) if not island_programs: # Island is empty, fall back to any available program diff --git a/openevolve/evaluation_result.py b/openevolve/evaluation_result.py index cdc355539e..54ef672a55 100644 --- a/openevolve/evaluation_result.py +++ b/openevolve/evaluation_result.py @@ -4,7 +4,52 @@ import json from dataclasses import dataclass, field -from typing import Dict, Union +from typing import Dict, Optional, Union + + +class EvaluatorRepairRequest(Exception): + """ + Raised by a user evaluator to request an LLM-based code repair attempt. + + Raise this instead of returning a zero score when the generated code has a + correctable error (e.g. a compilation failure). OpenEvolve will attempt to + repair the code using the configured LLM before recording it in the database, + so that future evolution branches from working code rather than the broken + original. + + Args: + message: Human-readable error description (shown in repair history + and logged). + broken_code: The full source that failed. Must be the complete file, + not just the error region, so the repair LLM has full + context. + repair_context: Optional extra information for the repair prompt (e.g. + full compiler stderr, runtime traceback). Defaults to + the same text as *message*. + language: Source-language identifier used in the prompt code fence + (e.g. ``"cpp"``, ``"python"``). Defaults to + ``"python"``. + fallback_metrics: Metrics dict to use if repair is disabled or all repair + attempts are exhausted. 
Should include all feature + dimensions required by the MAP-Elites database set to + appropriate penalty values, plus ``combined_score: 0.0``. + When ``None``, a minimal ``{"combined_score": 0.0}`` is + used. + """ + + def __init__( + self, + message: str, + broken_code: str, + repair_context: str = "", + language: str = "python", + fallback_metrics: Optional[Dict[str, float]] = None, + ) -> None: + super().__init__(message) + self.broken_code = broken_code + self.repair_context = repair_context or message + self.language = language + self.fallback_metrics: Dict[str, float] = fallback_metrics or {"combined_score": 0.0} @dataclass diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index b1142ece50..27e41df048 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -19,7 +19,7 @@ from openevolve.config import EvaluatorConfig from openevolve.database import ProgramDatabase -from openevolve.evaluation_result import EvaluationResult +from openevolve.evaluation_result import EvaluationResult, EvaluatorRepairRequest from openevolve.database import ProgramDatabase from openevolve.llm.ensemble import LLMEnsemble from openevolve.utils.async_utils import TaskPool, run_in_executor @@ -45,11 +45,15 @@ def __init__( prompt_sampler: Optional[PromptSampler] = None, database: Optional[ProgramDatabase] = None, suffix: Optional[str] = ".py", + repair_llm_ensemble: Optional[LLMEnsemble] = None, ): self.config = config self.evaluation_file = evaluation_file self.program_suffix = suffix self.llm_ensemble = llm_ensemble + # Separate ensemble for LLM-based code repair; falls back to the main + # evaluator ensemble (llm_ensemble) when not provided. + self.repair_llm_ensemble = repair_llm_ensemble or llm_ensemble self.prompt_sampler = prompt_sampler self.database = database @@ -62,6 +66,11 @@ def __init__( # Pending artifacts storage for programs self._pending_artifacts: Dict[str, Dict[str, Union[str, bytes]]] = {} + # Pending repairs: program_id → repaired source code. + # Populated by _attempt_repair when repair succeeds; consumed by + # iteration.py / process_parallel.py via get_pending_repair(). + self._pending_repairs: Dict[str, str] = {} + logger.info(f"Initialized evaluator with {evaluation_file}") def _load_evaluation_function(self) -> None: @@ -264,6 +273,28 @@ async def evaluate_program( return {"error": 0.0, "timeout": True} + except EvaluatorRepairRequest as repair_req: + # The user evaluator signalled that the code needs LLM repair + # (e.g. a compilation failure). Attempt repair if configured; + # otherwise fall through to the standard zero-score path. + if self.config.repair_on_failure and self.llm_ensemble: + repaired_metrics = await self._attempt_repair(repair_req, program_id) + if repaired_metrics is not None: + return repaired_metrics + # Repair disabled, not configured, or all attempts exhausted. 
+ logger.warning( + f"Repair {'failed' if self.config.repair_on_failure else 'disabled'} " + f"for program{program_id_str}: {repair_req}" + ) + if artifacts_enabled and program_id: + if program_id not in self._pending_artifacts: + self._pending_artifacts[program_id] = {} + self._pending_artifacts[program_id].update({ + "compile_error": str(repair_req), + "repair_context": repair_req.repair_context, + }) + return repair_req.fallback_metrics + except Exception as e: last_exception = e logger.warning( @@ -328,6 +359,235 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str """ return self._pending_artifacts.pop(program_id, None) + def get_pending_repair(self, program_id: str) -> Optional[str]: + """ + Get and clear the repaired source code for a program, if one exists. + + Returns the repaired code string when a previous ``_attempt_repair`` + call succeeded, or ``None`` when no repair was performed. The entry is + removed from the internal store on first read (one-shot). + + Args: + program_id: Program ID used during evaluation. + + Returns: + Repaired source code string, or ``None``. + """ + return self._pending_repairs.pop(program_id, None) + + async def _attempt_repair( + self, + repair_req: EvaluatorRepairRequest, + program_id: str, + ) -> Optional[Dict[str, float]]: + """ + Attempt to repair broken code via the LLM, then re-evaluate. + + Loops up to ``config.max_repair_attempts`` times. On success the + repaired code is stored in ``_pending_repairs[program_id]`` and the + repair history is added to ``_pending_artifacts[program_id]`` so that + ``iteration.py`` can move both into ``Program.metadata``. + + Args: + repair_req: The ``EvaluatorRepairRequest`` raised by the evaluator. + program_id: Program ID for artifact/repair storage. + + Returns: + Metrics dict from the successfully repaired evaluation, or ``None`` + if all repair attempts failed. 
+ """ + artifacts_enabled = os.environ.get("ENABLE_ARTIFACTS", "true").lower() == "true" + broken_code = repair_req.broken_code + error_message = str(repair_req) + repair_context = repair_req.repair_context + language = repair_req.language + repair_history: List[Dict] = [] + + for attempt in range(1, self.config.max_repair_attempts + 1): + logger.info( + f"Repair attempt {attempt}/{self.config.max_repair_attempts} " + f"for program {program_id} (language={language})" + ) + + repaired_code = await self._repair_code( + broken_code=broken_code, + error_message=error_message, + repair_context=repair_context, + language=language, + ) + if repaired_code is None: + logger.warning(f"Repair attempt {attempt}: LLM returned no parseable code") + repair_history.append({ + "attempt": attempt, + "error": error_message, + "repair_error": "LLM returned no parseable code", + "succeeded": False, + }) + break + + # Write the repaired code to a temp file and re-evaluate + with tempfile.NamedTemporaryFile( + suffix=self.program_suffix, delete=False + ) as tmp: + tmp.write(repaired_code.encode("utf-8")) + tmp_path = tmp.name + + try: + result = await self._direct_evaluate(tmp_path) + eval_result = self._process_evaluation_result(result) + + # Success — store the repaired code and history + repair_history.append({ + "attempt": attempt, + "error": None, + "succeeded": True, + }) + logger.info( + f"Repair succeeded on attempt {attempt} for program {program_id}" + ) + self._pending_repairs[program_id] = repaired_code + if artifacts_enabled and program_id: + if program_id not in self._pending_artifacts: + self._pending_artifacts[program_id] = {} + self._pending_artifacts[program_id]["repair_history"] = repair_history + if eval_result.has_artifacts(): + self._pending_artifacts[program_id].update(eval_result.artifacts) + + elapsed = 0.0 # timing already handled by outer evaluate_program + logger.info( + f"Repaired program {program_id}: " + f"{format_metrics_safe(eval_result.metrics)}" + ) + return eval_result.metrics + + except EvaluatorRepairRequest as next_req: + # Re-evaluation raised another repair request — prepare next loop + error_message = str(next_req) + repair_context = next_req.repair_context + broken_code = next_req.broken_code + repair_history.append({ + "attempt": attempt, + "error": error_message, + "succeeded": False, + }) + except Exception as exc: + error_message = str(exc) + repair_history.append({ + "attempt": attempt, + "error": error_message, + "succeeded": False, + }) + logger.warning(f"Repair attempt {attempt} raised exception: {exc}") + break + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + # All attempts exhausted + logger.warning( + f"All {self.config.max_repair_attempts} repair attempt(s) failed " + f"for program {program_id}" + ) + if artifacts_enabled and program_id: + if program_id not in self._pending_artifacts: + self._pending_artifacts[program_id] = {} + self._pending_artifacts[program_id].update({ + "compile_error": str(repair_req), + "repair_context": repair_req.repair_context, + "repair_history": repair_history, + "repair_failed": True, + }) + return None + + async def _repair_code( + self, + broken_code: str, + error_message: str, + repair_context: str, + language: str, + ) -> Optional[str]: + """ + Ask the LLM to repair broken code and return the fixed source. 
+ + Uses the ``repair_full_rewrite_user`` or ``repair_diff_user`` template + (depending on ``config.repair_diff_based``) and the ``repair_system_message`` + template (falling back to ``system_message`` if absent). + + Returns the repaired code string on success, or ``None`` if the LLM + response could not be parsed. + """ + if not self.repair_llm_ensemble or not self.prompt_sampler: + logger.warning("_repair_code called but repair_llm_ensemble or prompt_sampler is None") + return None + + # --- Choose templates --- + user_template_name = ( + "repair_diff_user" if self.config.repair_diff_based else "repair_full_rewrite_user" + ) + try: + user_template = self.prompt_sampler.template_manager.get_template(user_template_name) + except ValueError: + logger.warning( + f"Repair template '{user_template_name}' not found — repair skipped. " + "Ensure the template file exists in your prompts directory." + ) + return None + + # Prefer a dedicated repair system message; fall back to the evolution one. + try: + system_message = self.prompt_sampler.template_manager.get_template( + "repair_system_message" + ) + except ValueError: + try: + system_message = self.prompt_sampler.template_manager.get_template( + "system_message" + ) + except ValueError: + system_message = ( + "You are an expert software developer. " + "Fix all errors in the provided code." + ) + + try: + # Use sequential replacement instead of str.format() so that braces + # inside broken_code / error_message / repair_context (e.g. C++ code) + # do not raise KeyError or corrupt the template. + user_message = user_template + for placeholder, value in [ + ("{language}", language), + ("{error_message}", error_message), + ("{repair_context}", repair_context), + ("{broken_code}", broken_code), + ]: + user_message = user_message.replace(placeholder, value) + except Exception as exc: + logger.warning(f"Repair template substitution error: {exc}") + return None + + try: + llm_response = await self.repair_llm_ensemble.generate_with_context( + system_message=system_message, + messages=[{"role": "user", "content": user_message}], + ) + except Exception as exc: + logger.warning(f"LLM call during repair failed: {exc}") + return None + + # --- Parse the LLM response --- + if self.config.repair_diff_based: + from openevolve.utils.code_utils import apply_diff + repaired = apply_diff(broken_code, llm_response) + else: + from openevolve.utils.code_utils import parse_full_rewrite + repaired = parse_full_rewrite(llm_response, language) + + if not repaired or not repaired.strip(): + logger.warning("Repair LLM response yielded empty code after parsing") + return None + + return repaired + async def _direct_evaluate( self, program_path: str ) -> Union[Dict[str, float], EvaluationResult]: diff --git a/openevolve/evolution_trace.py b/openevolve/evolution_trace.py index 8e6699cda3..1e94c8927b 100644 --- a/openevolve/evolution_trace.py +++ b/openevolve/evolution_trace.py @@ -102,6 +102,9 @@ def __init__( "total_improvement": {}, "best_improvement": {}, "worst_decline": {}, + # Repair stats: how often LLM repair was triggered / succeeded + "repair_triggered": 0, + "repair_succeeded": 0, } if not self.enabled: @@ -232,6 +235,14 @@ def _update_stats(self, trace: EvolutionTrace): if delta < self.stats["worst_decline"][metric]: self.stats["worst_decline"][metric] = delta + # Track repair statistics from child program metadata + child_meta = trace.metadata or {} + repair_history = child_meta.get("repair_history") + if repair_history is not None: + 
self.stats["repair_triggered"] += 1 + if any(entry.get("succeeded") for entry in repair_history): + self.stats["repair_succeeded"] += 1 + def flush(self): """Write buffered traces to file""" if not self.enabled or not self.buffer: @@ -259,12 +270,18 @@ def flush(self): def get_statistics(self) -> Dict[str, Any]: """Get current tracing statistics""" + total = self.stats["total_traces"] + triggered = self.stats["repair_triggered"] return { **self.stats, "improvement_rate": ( - self.stats["improvement_count"] / self.stats["total_traces"] - if self.stats["total_traces"] > 0 - else 0 + self.stats["improvement_count"] / total if total > 0 else 0 + ), + # Fraction of all iterations where repair was triggered + "repair_trigger_rate": triggered / total if total > 0 else 0, + # Fraction of repair attempts that succeeded + "repair_success_rate": ( + self.stats["repair_succeeded"] / triggered if triggered > 0 else 0 ), } @@ -303,6 +320,13 @@ def close(self): logger.info(f"Evolution tracing complete. Total traces: {stats['total_traces']}") logger.info(f"Improvement rate: {stats['improvement_rate']:.2%}") + if stats["repair_triggered"] > 0: + logger.info( + f"Repair: triggered={stats['repair_triggered']}, " + f"succeeded={stats['repair_succeeded']}, " + f"success_rate={stats['repair_success_rate']:.2%}" + ) + if stats["best_improvement"]: logger.info(f"Best improvements: {stats['best_improvement']}") if stats["worst_decline"]: diff --git a/openevolve/iteration.py b/openevolve/iteration.py index 7afaff75b5..30902b13ba 100644 --- a/openevolve/iteration.py +++ b/openevolve/iteration.py @@ -168,6 +168,21 @@ async def run_iteration_with_shared_db( # Handle artifacts if they exist artifacts = evaluator.get_pending_artifacts(child_id) + # If the evaluator performed an LLM repair, use the repaired code as + # the canonical source for the database entry and demote the original + # broken LLM output to metadata["original_llm_code"]. 
+ repaired_code = evaluator.get_pending_repair(child_id) + repair_metadata: dict = {} + if repaired_code is not None: + repair_metadata["original_llm_code"] = child_code + repair_metadata["repair_history"] = ( + (artifacts or {}).pop("repair_history", []) + ) + child_code = repaired_code + logger.info( + f"Iteration {iteration}: using LLM-repaired code for program {child_id}" + ) + # Set template_key of Prompts template_key = "full_rewrite_user" if not config.diff_based_evolution else "diff_user" @@ -184,6 +199,7 @@ async def run_iteration_with_shared_db( metadata={ "changes": changes_summary, "parent_metrics": parent.metrics, + **repair_metadata, }, prompts=( { diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index a2fd6592a9..2c55a52f92 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -64,11 +64,13 @@ def _worker_init(config_dict: dict, evaluation_file: str, parent_env: dict = Non # Reconstruct model objects models = [LLMModelConfig(**m) for m in config_dict["llm"]["models"]] evaluator_models = [LLMModelConfig(**m) for m in config_dict["llm"]["evaluator_models"]] + repair_models = [LLMModelConfig(**m) for m in config_dict["llm"].get("repair_models", [])] # Create LLM config with models llm_dict = config_dict["llm"].copy() llm_dict["models"] = models llm_dict["evaluator_models"] = evaluator_models + llm_dict["repair_models"] = repair_models llm_config = LLMConfig(**llm_dict) # Create other configs @@ -118,6 +120,7 @@ def _lazy_init_worker_components(): # Create evaluator-specific components evaluator_llm = LLMEnsemble(_worker_config.llm.evaluator_models) + repair_llm = LLMEnsemble(_worker_config.llm.repair_models) evaluator_prompt = PromptSampler(_worker_config.prompt) evaluator_prompt.set_templates("evaluator_system_message") @@ -128,6 +131,7 @@ def _lazy_init_worker_components(): evaluator_prompt, database=None, # No shared database in worker suffix=getattr(_worker_config, "file_suffix", ".py"), + repair_llm_ensemble=repair_llm, ) @@ -294,6 +298,17 @@ def _run_iteration_worker( # Get artifacts artifacts = _worker_evaluator.get_pending_artifacts(child_id) + # Apply LLM repair if the evaluator performed one (mirrors iteration.py) + repaired_code = _worker_evaluator.get_pending_repair(child_id) + repair_metadata: dict = {} + if repaired_code is not None: + repair_metadata["original_llm_code"] = child_code + repair_metadata["repair_history"] = (artifacts or {}).pop("repair_history", []) + child_code = repaired_code + logger.info( + f"Worker iteration {iteration}: using LLM-repaired code for program {child_id}" + ) + # Create child program child_program = Program( id=child_id, @@ -308,6 +323,7 @@ def _run_iteration_worker( "changes": changes_summary, "parent_metrics": parent.metrics, "island": parent_island, + **repair_metadata, }, ) @@ -370,6 +386,7 @@ def _serialize_config(self, config: Config) -> dict: "llm": { "models": [asdict(m) for m in config.llm.models], "evaluator_models": [asdict(m) for m in config.llm.evaluator_models], + "repair_models": [asdict(m) for m in config.llm.repair_models], "api_base": config.llm.api_base, "api_key": config.llm.api_key, "temperature": config.llm.temperature, @@ -593,8 +610,11 @@ async def run_evolution( artifacts=result.artifacts, island_id=island_id, metadata={ + **{ + k: v for k, v in child_program.metadata.items() + if k not in ("parent_metrics", "island") + }, "iteration_time": result.iteration_time, - "changes": child_program.metadata.get("changes", ""), }, ) diff --git 
a/openevolve/prompts/defaults/repair_diff_user.txt b/openevolve/prompts/defaults/repair_diff_user.txt
new file mode 100644
index 0000000000..a64766a2c7
--- /dev/null
+++ b/openevolve/prompts/defaults/repair_diff_user.txt
@@ -0,0 +1,27 @@
+The following {language} program failed to evaluate due to the error below.
+Repair the program using minimal SEARCH/REPLACE diffs.
+Preserve all structure, class names, plugin keys, and invariants that are unrelated to the error.
+
+# Error
+
+{error_message}
+
+# Additional context
+
+{repair_context}
+
+# Broken Program
+
+```{language}
+{broken_code}
+```
+
+Use the exact SEARCH/REPLACE diff format shown below.
+Each SEARCH block must match the broken program exactly (including whitespace).
+Multiple diff blocks are allowed.
+
+<<<<<<< SEARCH
+# exact lines to replace
+=======
+# corrected replacement
+>>>>>>> REPLACE
diff --git a/openevolve/prompts/defaults/repair_full_rewrite_user.txt b/openevolve/prompts/defaults/repair_full_rewrite_user.txt
new file mode 100644
index 0000000000..90732653de
--- /dev/null
+++ b/openevolve/prompts/defaults/repair_full_rewrite_user.txt
@@ -0,0 +1,24 @@
+The following {language} program failed to evaluate due to the error below.
+Repair the program so that all errors are resolved.
+Preserve all structure, class names, plugin keys, and invariants that are unrelated to the error.
+
+# Error
+
+{error_message}
+
+# Additional context
+
+{repair_context}
+
+# Broken Program
+
+```{language}
+{broken_code}
+```
+
+Output ONLY the complete corrected program inside a single code fence.
+Do not include any commentary, explanation, or text outside the code fence.
+
+```{language}
+# corrected program here
+```
diff --git a/scripts/static/js/sidebar.js b/scripts/static/js/sidebar.js
index 1230c0f81c..3a9e287011 100644
--- a/scripts/static/js/sidebar.js
+++ b/scripts/static/js/sidebar.js
@@ -313,7 +313,12 @@ export function showSidebarContent(d, fromHover = false) {
     if (parentNodeForDiff && parentNodeForDiff.code && parentNodeForDiff.code.trim() !== '') {
         tabNames.push('Diff');
     }
-
+
+    // Add a Repairs tab when LLM repair was performed on this program
+    if (d.metadata && d.metadata.repair_history && d.metadata.repair_history.length > 0) {
+        tabNames.push('Repairs');
+    }
+
     let activeTab = lastSidebarTab && tabNames.includes(lastSidebarTab) ? lastSidebarTab : tabNames[0];

     // Helper to render tab content
@@ -450,6 +455,34 @@ export function showSidebarContent(d, fromHover = false) {
             const curCode = d.code || '';
             return renderCodeDiff(parentCode, curCode);
         }
+        if (tabName === 'Repairs') {
+            const originalCode = (d.metadata && d.metadata.original_llm_code) || '';
+            const repairedCode = d.code || '';
+            const history = (d.metadata && d.metadata.repair_history) || [];
+            // Attempt table
+            const tableRows = history.map(function(entry) {
+                const badge = entry.succeeded
+                    ? '✓ ok'
+                    : '✗ fail';
+                const errText = escapeHtml(entry.error || entry.repair_error || '—');
+                return '<tr>' +
+                    '<td>' + (entry.attempt || '') + '</td>' +
+                    '<td>' + badge + '</td>' +
+                    '<td>' + errText + '</td>' +
+                    '</tr>';
+            }).join('');
+            const tableHtml = '<table>' +
+                '<thead>' +
+                '<tr><th>#</th><th>Status</th><th>Compiler output / notes</th></tr>' +
+                '</thead>' +
+                '<tbody>' + tableRows + '</tbody>' +
+                '</table>';
+            const diffHtml = originalCode
+                ? '<div>Diff: original LLM output → repaired code</div>' +
+                  renderCodeDiff(originalCode, repairedCode)
+                : '<div>(Original code not recorded)</div>';
+            return tableHtml + diffHtml;
+        }
         return '';
     }
diff --git a/scripts/templates/program_page.html b/scripts/templates/program_page.html
index 4b1652bcad..badbed416f 100644
--- a/scripts/templates/program_page.html
+++ b/scripts/templates/program_page.html
@@ -36,6 +36,43 @@

     <p><strong>Program ID:</strong> {{ program_data.id }}</p>

     <h3>Code:</h3>

     <pre>{{ program_data.code }}</pre>
+    {% if program_data.metadata and program_data.metadata.repair_history %}
+
+    <h3>Repairs:</h3>
+
+    <p>
+        This program was repaired by the LLM before evaluation.
+        The repaired source is shown above.
+        The original broken LLM output and repair attempts are shown below.
+    </p>
+
+    {% if program_data.metadata.original_llm_code %}
+
+    <h4>Original LLM-generated code (before repair):</h4>
+
+    <pre>{{ program_data.metadata.original_llm_code }}</pre>
+    {% endif %}
+
+    <h4>Repair attempts:</h4>
+
+    <table>
+        <thead>
+            <tr>
+                <th>#</th>
+                <th>Status</th>
+                <th>Compiler output / notes</th>
+            </tr>
+        </thead>
+        <tbody>
+            {% for entry in program_data.metadata.repair_history %}
+            <tr>
+                <td>{{ entry.attempt }}</td>
+                <td>
+                    {% if entry.succeeded %}
+                    ✓ ok
+                    {% else %}
+                    ✗ fail
+                    {% endif %}
+                </td>
+                <td>{{ entry.error or entry.repair_error or '—' }}</td>
+            </tr>
+            {% endfor %}
+        </tbody>
+    </table>
+    {% endif %}

Prompts: