DevServer/apps/worker/src/services/agent_runner.py at master · sergiovision/DevServer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Agent runner — main execution logic for a coding task.

Orchestrates: lock -> worktree -> repo_map -> reality_gate -> memory_recall
           -> (optional plan_gate) -> claude CLI -> verify -> pr_preflight
           -> PR -> notify.
Implements retry loop with session persistence, targeted error classification,
and a per-task cost/wall-clock circuit breaker.
"""

import asyncio
import json
import logging
import os
import random
import time
import traceback
from datetime import datetime, timezone
from decimal import Decimal

from sqlalchemy import text, update
from sqlalchemy.ext.asyncio import AsyncSession

from config import settings
from models.base import async_session
from models.daily_stat import DailyStat
from models.repo import Repo
from models.task import Task
from models.task_event import TaskEvent
from models.task_run import TaskRun
from services import (
    agent_backends,
    compaction,
    error_classifier,
    git_ops,
    repo_map,
    telegram,
    verifier,
)
from services.agent_backends import AgentBackend
from services.notify import notify

# Pro features: if the services/pro/ folder exists, load real implementations.
# If it's absent (public MIT repo), fall back to no-op stubs so the free
# version compiles and runs without errors.
try:
    from services.pro import hooks as pro
    _HAS_PRO = True
except ImportError:
    from services._free_hooks import FreeHooks
    pro = FreeHooks()
    _HAS_PRO = False


# Rate-limit handling applies to every vendor — when the agent CLI
# subprocess fails with a 429, we sleep and retry the SAME call without
# consuming a task-level retry attempt. Burning a full retry on a transient
# quota error costs another ~5K tokens of context for nothing.
#
# Each vendor detects its OWN 429 shape via ``AgentBackend.is_rate_limit_error``.
# This module only knows the generic backoff schedule.

# Base sleep, in seconds, before each successive rate-limit retry. The
# tuple length doubles as the maximum number of rate-limit retries per call.
_RATE_LIMIT_BACKOFF_SCHEDULE = (30, 60, 120)
# Upper bound, in seconds, of the uniform random jitter added to each
# backoff so concurrent workers don't synchronise their retries.
_RATE_LIMIT_JITTER_SECONDS = 10

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


async def _acquire_lock(db: AsyncSession, repo_name: str, task_key: str) -> bool:
    """Try to claim the ``repo_locks`` row for ``repo_name`` on behalf of ``task_key``.

    Expired rows for the repo are purged first, then a fresh row with a
    one-hour expiry is inserted. The insert uses ``ON CONFLICT DO NOTHING``,
    so an existing live lock makes it return no row instead of raising.
    The session is committed in either case.

    Returns:
        True when the insert returned a row (this task now holds the lock),
        False when another row already existed for the repo.
    """
    # Step 1: drop a stale lock so it cannot block the insert below.
    purge_expired = text(
        "DELETE FROM repo_locks WHERE repo_name = :repo_name AND expires_at < NOW()"
    )
    await db.execute(purge_expired, {"repo_name": repo_name})

    # Step 2: attempt the claim; RETURNING yields a row only if we inserted.
    claim = text("""
        INSERT INTO repo_locks (repo_name, task_key, acquired_at, expires_at)
        VALUES (:repo_name, :task_key, NOW(), NOW() + interval '1 hour')
        ON CONFLICT (repo_name) DO NOTHING
        RETURNING repo_name
    """)
    outcome = await db.execute(claim, {"repo_name": repo_name, "task_key": task_key})
    await db.commit()

    inserted_row = outcome.fetchone()
    return inserted_row is not None


async def _extend_lock(db: AsyncSession, repo_name: str) -> None:
    """Push the expiry of ``repo_name``'s lock row to one hour from now and commit.

    The update matches on ``repo_name`` only; it does not check which task
    holds the lock.
    """
    bump_expiry = text(
        "UPDATE repo_locks SET expires_at = NOW() + interval '1 hour' WHERE repo_name = :repo_name"
    )
    bind_params = {"repo_name": repo_name}
    await db.execute(bump_expiry, bind_params)
    await db.commit()


async def _release_lock(db: AsyncSession, repo_name: str, task_key: str) -> None:
    """Delete the lock row for ``repo_name`` if it belongs to ``task_key``, then commit.

    Because the delete filters on both columns, a lock held by a different
    task is left untouched.
    """
    drop_lock = text(
        "DELETE FROM repo_locks WHERE repo_name = :repo_name AND task_key = :task_key"
    )
    bind_params = {"repo_name": repo_name, "task_key": task_key}
    await db.execute(drop_lock, bind_params)
    await db.commit()


async def _emit_event(
    session: AsyncSession,
    task_id: int,
    run_id: int | None,
    event_type: str,
    payload: dict,
) -> None:
    """Persist a single ``TaskEvent`` row and commit it right away.

    Per the original note on this helper, a database trigger on the events
    table turns the insert into a PG NOTIFY; nothing in this function sends
    the notification itself.

    Args:
        session: Open async session used for the insert and commit.
        task_id: Task the event belongs to.
        run_id: Run the event belongs to, or None when there is no run.
        event_type: Event type name stored on the row.
        payload: Event body stored on the row.
    """
    session.add(
        TaskEvent(
            task_id=task_id,
            run_id=run_id,
            event_type=event_type,
            payload=payload,
        )
    )
    await session.commit()


async def _update_task_status(session: AsyncSession, task_id: int, status: str) -> None:
    """Write ``status`` onto the task row and stamp ``updated_at`` with the current UTC time.

    The change is committed before returning.
    """
    changed_at = datetime.now(timezone.utc)
    statement = (
        update(Task)
        .where(Task.id == task_id)
        .values(status=status, updated_at=changed_at)
    )
    await session.execute(statement)
    await session.commit()


def _build_prompt(
    repo_name: str,
    branch_name: str,
    task_key: str,
    title: str,
    description: str,
    acceptance: str,
    error_context: str = "",
    repo_map_text: str = "",
    reality_signal_text: str = "",
    memory_recall_text: str = "",
    approved_plan_text: str = "",
    compacted_context: str = "",
    is_resume: bool = False,
    skip_verify: bool = False,
) -> str:
    """Build the user-message prompt for one Claude CLI invocation.

    When ``is_resume`` is True, the Claude CLI is being invoked with
    ``--resume <session_id>`` and the agent already has the full task
    context, repo map, reality signal, memory recall and approved plan in
    its conversation history from the original turn. Re-sending all of
    that on every retry was the dominant cause of the 30K-tokens/minute
    rate-limit failures we hit during Phase 1+2 testing — each retry was
    sending an extra ~5K tokens of static context that the model already
    had. So on resume we return a minimal next-message prompt with only
    the error/remediation block (or a short "continue" message if there
    is no error context, e.g. after a max_turns pause).
    """
    if is_resume:
        # The session already has every Phase 1 context block in its
        # history. Send only the next user message — the smaller the
        # better for rate-limit headroom.
        if error_context:
            return error_context
        return (
            "Continue with the previous task. Resume where you left off "
            "and finish the implementation. Commit your changes when done."
        )

    parts = [
        f"You are an autonomous coding agent working on repository: {repo_name}",
        f"Branch: {branch_name}",
        f"Task: {task_key} - {title}",
        "",
        "## Task Description",
        description or "(no description provided)",
        "",
        "## Acceptance Criteria",
        acceptance or "(none specified)",
    ]

    # Evidence-before-action blocks (Phase 1). Injected only when available so
    # the prompt stays clean for tasks where a block failed to generate.
    # NOTE: ``compacted_context`` is an alternative to the evidence stack —
    # when set, it REPLACES repo_map/memory_recall/reality_signal because
    # the summariser has already distilled prior attempts into what matters.
    if compacted_context:
        parts.extend(["", compacted_context])
    else:
        if repo_map_text:
            parts.extend(["", repo_map_text])
        if reality_signal_text:
            parts.extend(["", reality_signal_text])
        if memory_recall_text:
            parts.extend(["", memory_recall_text])
    if approved_plan_text:
        parts.extend(["", approved_plan_text])

    parts.extend([
        "",
        "## Instructions",
        "1. Read the CLAUDE.md file first for project context and conventions",
        "2. Understand the existing codebase before making changes",
        "3. Implement the task following existing patterns and conventions",
        "4. Make minimal, focused changes -- only what the task requires",
        "5. Do NOT add extra features, refactoring, or improvements beyond the task",
        f"6. Commit your changes with a clear message referencing {task_key}",
        "7. Ensure all acceptance criteria are met",
    ])

    if skip_verify:
        parts.extend([
            "",
            "## Verification policy — SKIPPED",
            "The operator has set `skip_verify=true` for this task. Do NOT run",
            "the project's test suite, build, or lint commands. Do NOT spawn",
            "`npm test`, `pytest`, `cargo test`, `go test`, `dotnet test`,",
            "`npm run build`, `tsc`, `eslint`, `ruff`, or equivalent. Read and",
            "edit code freely, but skip the verify step — the operator will",
            "run checks manually after you finish. Proceed straight from the",
            "implementation to the commit.",
        ])

    # Inter-task messaging prompt block — Pro-only. When the pro package
    # is absent the /internal/tasks/.../messages/* endpoints do not exist,
    # so teaching the agent to curl them would just cause confused 404s.
    if _HAS_PRO:
        parts.extend([
            "",
            "## Inter-task messaging (optional)",
            "You can coordinate with other concurrently-running tasks or hand off",
            "questions to the human operator via the DevServer messaging bus.",
            "The subprocess env exposes $DEVSERVER_WORKER_URL and $DEVSERVER_TASK_KEY.",
            "",
            "- List live peer tasks:",
            "    curl -s \"$DEVSERVER_WORKER_URL/internal/sessions/list\"",
            "- Read your own inbox (drains unread by default):",
            "    curl -s \"$DEVSERVER_WORKER_URL/internal/tasks/$DEVSERVER_TASK_KEY/messages/inbox\"",
            "- Send a message to another task or to 'operator' (the human):",
            "    curl -s -X POST \\",
            "      \"$DEVSERVER_WORKER_URL/internal/tasks/$DEVSERVER_TASK_KEY/messages/send\" \\",
            "      -H 'content-type: application/json' \\",
            "      -d '{\"to_task_key\":\"operator\",\"kind\":\"note\",\"body\":\"...\"}'",
            "",
            "IMPORTANT: the operator can send you messages mid-run. Check your",
            "inbox at the start of the task and again before any major commit or",
            "irreversible step. If the operator's message contradicts or amends",
            "the task description, follow the message — it is the most recent",
            "human intent. Do NOT poll in a tight loop; once per major step is",
            "enough.",
            "",
            "ALWAYS REPLY to operator messages — silent execution is a bug.",
            "When you drain an operator message from your inbox:",
            "1. Send a brief acknowledgement reply (to_task_key='operator', kind='response').",
            "   • For a request/note: confirm receipt and state what you're about to do.",
            "   • For a question (e.g. 'how are you doing?', 'is X done?'): answer it directly.",
            "   • One or two sentences is plenty — no wall of text.",
            "2. Then act on any actionable content.",
            "3. Send a final 'done' reply when you have committed the requested change.",
            "",
            "Use send_message to peer tasks sparingly — only for blocking questions,",
            "cross-task handoffs, or status updates another task is waiting on.",
        ])

    if error_context:
        parts.extend(["", error_context])

    return "\n".join(parts)


# NOTE: _render_memory_recall moved to services/pro/__init__.py (ProHooks.render_memory_recall).


async def _run_agent(
    backend: AgentBackend,
    worktree_path: str,
    prompt: str,
    model: str,
    allowed_tools: str,
    session_id: str | None,
    timeout_minutes: int,
    task_id: int,
    run_id: int,
    db: AsyncSession,
    claude_mode: str = "max",
    max_turns: int | None = 100,
    task_key: str | None = None,
) -> dict:
    """Execute a vendor-agnostic coding-agent CLI and collect its output.

    Delegates everything vendor-specific to ``backend`` (command
    construction, environment, rate-limit detection, output parsing) and
    keeps the shared machinery here: subprocess spawning, per-call timeout,
    429-aware retry with backoff, and task-event emission.

    On a vendor-specific 429 (detected via ``backend.is_rate_limit_error``),
    retries the SAME subprocess call up to ``len(_RATE_LIMIT_BACKOFF_SCHEDULE)``
    times with jittered backoff. A 429 burns no agent progress, so we must
    not consume a task-level retry attempt for it.

    Args:
        backend: Vendor backend that builds the command/env and parses output.
        worktree_path: Directory the CLI runs in (used as the subprocess cwd).
        prompt: User message passed to ``backend.build_command``.
        model: Model identifier passed to ``backend.build_command``.
        allowed_tools: Tool allow-list passed to ``backend.build_command``.
        session_id: Existing session to resume, or None for a fresh session.
        timeout_minutes: Wall-clock limit for EACH subprocess spawn.
        task_id: Task id attached to emitted events.
        run_id: Run id attached to emitted events.
        db: Async session used to emit task events.
        claude_mode: Kept under this name for backwards compatibility with
            the job payload; forwarded as ``billing_mode`` to
            ``backend.build_env``.
        max_turns: Turn budget passed to the backend; None is written as
            ``-1`` into the Gemini settings file.
        task_key: When set (and Pro is available) exported to the child as
            ``DEVSERVER_TASK_KEY`` for inter-task messaging.

    Returns:
        ``backend.parse_output(...).to_dict()`` with ``exit_code`` set from
        the subprocess — the legacy dict shape the retry loop consumes
        (``result``, ``cost_usd``, ``num_turns``, ``session_id``,
        ``exit_code``, ``raw_output``, ``subtype``, ``errors``). On timeout
        a reduced dict is returned instead, with keys ``result``,
        ``cost_usd``, ``num_turns``, ``session_id``, ``exit_code`` (-1) and
        ``error``.

    Raises:
        asyncio.CancelledError: Propagated if the calling task is cancelled;
            the child process is killed first so it is not left orphaned.
    """
    cmd = backend.build_command(
        prompt=prompt,
        model=model,
        allowed_tools=allowed_tools,
        session_id=session_id,
        max_turns=max_turns,
    )
    env = backend.build_env(billing_mode=claude_mode)

    # Inject inter-task messaging env vars (Pro only). Agents can curl
    # the worker via these two variables to read their inbox, list peers,
    # or message other tasks mid-run. The endpoints they point at live
    # in ``routes/pro_internal.py`` — if pro is stripped there is nothing
    # to target, so we skip the injection entirely.
    if task_key and _HAS_PRO:
        if env is None:
            env = dict(os.environ)
        env.setdefault(
            "DEVSERVER_WORKER_URL",
            f"http://{settings.worker_host if settings.worker_host != '0.0.0.0' else '127.0.0.1'}:{settings.worker_port}",
        )
        env["DEVSERVER_TASK_KEY"] = task_key

    # Gemini takes its turn budget from a per-worktree settings file. The
    # whole step is best-effort: directory creation sits inside the same
    # ``try`` as the write, so a filesystem problem is logged rather than
    # aborting the run before the CLI is even started.
    if backend.vendor == "google":
        gemini_dir = os.path.join(worktree_path, ".gemini")
        settings_path = os.path.join(gemini_dir, "settings.json")
        try:
            os.makedirs(gemini_dir, exist_ok=True)
            with open(settings_path, "w", encoding="utf-8") as f:
                json.dump({"model": {"maxSessionTurns": max_turns if max_turns is not None else -1}}, f)
        except Exception as e:
            logger.warning("Failed to write .gemini/settings.json: %s", e)

    # OpenAI / Azure OpenAI — propagate the custom endpoint overrides so
    # Codex CLI targets Azure AI Foundry (or any OpenAI-compatible proxy)
    # instead of api.openai.com. pydantic-settings reads these from .env
    # into the Settings object but does NOT push them back into
    # os.environ, so we forward them explicitly here.
    if backend.vendor == "openai":
        if settings.openai_base_url or settings.openai_api_version:
            if env is None:
                env = dict(os.environ)
            if settings.openai_base_url:
                env["OPENAI_BASE_URL"] = settings.openai_base_url
            if settings.openai_api_version:
                env["OPENAI_API_VERSION"] = settings.openai_api_version
                # Azure's OpenAI-compatible SDKs also read AZURE_OPENAI_*.
                env.setdefault("AZURE_OPENAI_API_VERSION", settings.openai_api_version)
            if settings.openai_api_key:
                # Azure Codex deployments read AZURE_OPENAI_API_KEY when
                # the endpoint is an *.openai.azure.com URL. Mirror the
                # existing OPENAI_API_KEY across both names so either
                # client path works.
                env.setdefault("AZURE_OPENAI_API_KEY", settings.openai_api_key)
            if settings.openai_base_url:
                env.setdefault("AZURE_OPENAI_ENDPOINT", settings.openai_base_url)

    timeout_seconds = timeout_minutes * 60

    logger.info(
        "Running %s CLI in %s (model=%s, timeout=%dm, billing=%s)",
        backend.label, worktree_path, model, timeout_minutes, claude_mode,
    )

    async def _spawn_once() -> tuple[int, str, str]:
        """Spawn the agent CLI subprocess one time and collect its output.

        Returns (exit_code, stdout_text, stderr_text). Raises on timeout —
        the outer function maps timeouts to a structured failure dict so the
        rate-limit retry loop never sees them. On timeout OR cancellation
        the child is killed and reaped before the exception is re-raised.
        """
        # stdin=DEVNULL is required for true headless operation. Without it
        # asyncio inherits the worker's stdin — when the worker runs in a
        # terminal (dev mode) the agent CLI inherits the TTY and may either
        # block on a read or, in Gemini's case, merge stray TTY bytes into
        # the prompt (per `gemini --help`: "Appended to input on stdin if any").
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            cwd=worktree_path,
            env=env,
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout_data, stderr_data = await asyncio.wait_for(
                proc.communicate(), timeout=timeout_seconds
            )
        except (asyncio.TimeoutError, asyncio.CancelledError):
            # Cancellation must not leave the agent CLI running unattended
            # in the worktree, so it gets the same cleanup as a timeout.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    # The child exited between the check and the kill.
                    pass
            await proc.wait()
            raise
        return (
            proc.returncode or 0,
            stdout_data.decode(errors="replace"),
            stderr_data.decode(errors="replace"),
        )

    # Outer loop: 429-aware retry. We sleep and retry the SAME call on
    # rate_limit; any other failure breaks out immediately and the existing
    # outer-loop error path takes over.
    exit_code = -1
    raw_output = ""
    stderr_text = ""
    rate_limit_attempts = 0

    while True:
        try:
            exit_code, raw_output, stderr_text = await _spawn_once()
        except asyncio.TimeoutError:
            return {
                "result": "",
                "cost_usd": 0,
                "num_turns": 0,
                "session_id": session_id,
                "exit_code": -1,
                "error": f"{backend.label} CLI timed out after {timeout_minutes}m",
            }

        if not backend.is_rate_limit_error(raw_output, stderr_text, exit_code):
            break

        if rate_limit_attempts >= len(_RATE_LIMIT_BACKOFF_SCHEDULE):
            # Schedule exhausted: fall through and let the last (rate-limited)
            # output be parsed and reported like any other failure.
            logger.error(
                "%s CLI rate-limited %d times in a row, giving up",
                backend.label, rate_limit_attempts,
            )
            break

        backoff = _RATE_LIMIT_BACKOFF_SCHEDULE[rate_limit_attempts]
        # Add jitter so concurrent workers don't synchronise their retries.
        jitter = random.uniform(0, _RATE_LIMIT_JITTER_SECONDS)
        sleep_for = backoff + jitter
        rate_limit_attempts += 1
        logger.warning(
            "%s 429 (attempt %d/%d) — sleeping %.0fs before retrying",
            backend.label, rate_limit_attempts,
            len(_RATE_LIMIT_BACKOFF_SCHEDULE), sleep_for,
        )
        await _emit_event(db, task_id, run_id, "rate_limit_backoff", {
            "vendor": backend.vendor,
            "attempt": rate_limit_attempts,
            "max_attempts": len(_RATE_LIMIT_BACKOFF_SCHEDULE),
            "sleep_seconds": round(sleep_for, 1),
        })
        await asyncio.sleep(sleep_for)

    # Log stderr lines as task events (only from the final spawn; blank
    # lines are skipped).
    if stderr_text:
        for line in stderr_text.splitlines():
            if line.strip():
                await _emit_event(db, task_id, run_id, "log_line", {"line": line, "stream": "stderr"})

    # Parse vendor-specific JSON output into the normalised result shape.
    result = backend.parse_output(raw_output, session_id)
    result.exit_code = exit_code
    return result.to_dict()


# NOTE: _run_plan_gate moved to services/pro/__init__.py (ProHooks.run_plan_gate).


async def run_task(task_id: int, claude_mode: str = "max", max_turns: int | None = None) -> bool:
    """Execute a full task lifecycle. Returns True on success."""
    async with async_session() as db:
        # Load task with repo
        task = await db.get(Task, task_id)
        if not task:
            logger.error("Task %d not found", task_id)
            return False

        repo = await db.get(Repo, task.repo_id)
        if not repo:
            logger.error("Repo %d not found for task %d", task.repo_id, task_id)
            return False

        task_key = task.task_key
        title = task.title
        description = task.description or ""
        acceptance = task.acceptance or ""
        skip_verify = bool(task.skip_verify)
        is_continuation = bool(getattr(task, "is_continuation", False))
        # Context compaction — when a prior attempt called ``/compact`` (or the
        # auto-compaction branch below fired on a previous run), this column
        # holds a distilled summary of everything tried so far. Its presence
        # causes _build_prompt to skip the Phase-1 evidence stack and inject
        # the summary instead. See services/compaction.py.
        compacted_context: str = getattr(task, "compacted_context", None) or ""
        backup_model = getattr(task, "backup_model", None)
        backup_vendor = getattr(task, "backup_vendor", None)
        # Resolve the agent backend for this task. Defaults to Anthropic
        # when the column is missing or unknown (backwards compatible with
        # every task created before migration 006).
        agent_vendor = getattr(task, "agent_vendor", None) or agent_backends.DEFAULT_VENDOR
        backend = agent_backends.get_backend(agent_vendor)
        # Per-task model overrides repo default
        effective_model = getattr(task, "claude_model", None) or repo.claude_model
        # Per-task turn budget: job payload → task field → default 50
        # None means unlimited (no --max-turns flag passed to Claude)
        task_max_turns = getattr(task, "max_turns", None)
        effective_max_turns: int | None = max_turns if max_turns is not None else (task_max_turns if task_max_turns is not None else 50)
        # Sentinel -1 from job payload means "unlimited" (long_running preset)
        if effective_max_turns == -1:
            effective_max_turns = None
        # Phase 2 #6 — per-task budget circuit breaker. Both NULL = no limit.
        max_cost_usd: Decimal | None = getattr(task, "max_cost_usd", None)
        max_wall_seconds: int | None = getattr(task, "max_wall_seconds", None)
        repo_name = repo.name
        branch_name = f"agent/{task_key}"

        # Continuation: load session_id from the most recent run so the
        # agent can resume its conversation, and clear the flag immediately.
        continuation_session_id: str | None = None
        if is_continuation:
            last_run_row = await db.execute(text(
                "SELECT session_id FROM task_runs "
                "WHERE task_id = :tid AND session_id IS NOT NULL "
                "ORDER BY id DESC LIMIT 1"
            ), {"tid": task_id})
            row = last_run_row.fetchone()
            continuation_session_id = row[0] if row else None
            await db.execute(
                update(Task).where(Task.id == task_id).values(is_continuation=False)
            )
            await db.commit()
            logger.info(
                "Continuation mode: session_id=%s", continuation_session_id,
            )

        turns_label = str(effective_max_turns) if effective_max_turns is not None else "unlimited"
        logger.info(
            "=== Starting task: %s - %s (repo: %s, model: %s, max_turns: %s%s) ===",
            task_key, title, repo_name, effective_model, turns_label,
            ", continuation" if is_continuation else "",
        )

        # Acquire repo lock
        if not await _acquire_lock(db, repo_name, task_key):
            logger.warning("Could not acquire lock for repo %s", repo_name)
            return False

        success = False
        session_id: str | None = None
        pr_url: str | None = None
        worktree_path: str | None = None

        # Per-task log file: logs/tasks/{task_key}.log
        task_log_path = os.path.join(settings.log_dir, f"{task_key}.log")
        task_log = open(task_log_path, "a", encoding="utf-8")  # noqa: WPS515

        try:
            task_log.write(
                f"\n{'='*60}\n"
                f"Task {task_key} started at {datetime.now(timezone.utc).isoformat()}\n"
                f"{'='*60}\n"
            )
            task_log.flush()

            # Update status
            await _update_task_status(db, task_id, "running")
            await _emit_event(db, task_id, None, "status_change", {"status": "running"})
            await notify.task_start(
                task_key=task_key, title=title, repo_name=repo_name,
                mode=task.mode, vendor=agent_vendor,
                model=effective_model or "",
            )

            # Setup worktree
            worktree_path, branch_name = await git_ops.setup_worktree(
                repo_name=repo_name,
                clone_url=repo.clone_url,
                default_branch=repo.default_branch,
                task_key=task_key,
                gitea_token=repo.gitea_token,
                continuation=is_continuation,
            )

            # ─── Pre-execution evidence (Phase 1) ─────────────────────────
            # These blocks are generated once per task, before any Claude run.
            # Each is allowed to fail independently — never let context-gathering
            # crash the task itself.
            # On continuation, skip Phase 1 entirely — the agent already has
            # context from the previous session and we use --resume.
            repo_map_text = ""
            reality_signal_text = ""
            memory_recall_text = ""
            prior_memories: list[dict] = []
            reality_signal: dict = {}

            # Seed session_id from the continuation session so the first
            # attempt in the retry loop uses --resume.
            if is_continuation and continuation_session_id:
                session_id = continuation_session_id
                task_log.write(
                    f"\n[continuation] resuming session {continuation_session_id}\n"
                )
                task_log.flush()

            if is_continuation:
                task_log.write("\n[continuation] skipping Phase 1 evidence pipeline\n")
                task_log.flush()
            else:
              try:
                rm_text, rm_stats = repo_map.build_repo_map(worktree_path)
                repo_map_text = rm_text
                await _emit_event(db, task_id, None, "repo_map_built", rm_stats)
                task_log.write(
                    f"\n[repo_map] {rm_stats.get('files', 0)} files, "
                    f"{rm_stats.get('symbols', 0)} symbols, "
                    f"{rm_stats.get('chars', 0)} chars\n"
                )
                task_log.flush()
              except Exception:
                logger.exception("repo_map generation failed for %s", task_key)

              try:
                reality_signal = reality_signal, reality_signal_text_raw = await pro.run_reality_gate(
                    db=db,
                    repo_id=repo.id,
                    worktree_path=worktree_path,
                    repo_map_text=repo_map_text,
                    task_key=task_key,
                    title=title,
                    description=description,
                    acceptance=acceptance,
                    branch_name=branch_name,
                    gitea_url=repo.gitea_url,
                    gitea_owner=repo.gitea_owner,
                    gitea_repo=repo.gitea_repo,
                    gitea_token=repo.gitea_token,
                )
                reality_signal_text = reality_signal_text_raw
                await _emit_event(db, task_id, None, "reality_signal", {
                    "score": reality_signal.get("score"),
                    "confidence": reality_signal.get("confidence"),
                    "warnings": reality_signal.get("warnings", []),
                    "degraded_sources": reality_signal.get("degraded_sources", []),
                    "evidence": reality_signal.get("evidence", []),
                })
                task_log.write(
                    f"\n[reality_gate] score={reality_signal.get('score')}"
                    f" confidence={reality_signal.get('confidence')}"
                    f" warnings={len(reality_signal.get('warnings', []))}\n"
                )
                task_log.flush()
              except Exception:
                logger.exception("reality_gate failed for %s", task_key)

              # Memory recall — Tier 2 #5. Query once, inject into the prompt.
              try:
                memory_query = f"{task_key} {title}\n{description}"
                prior_memories = await pro.search_memory(
                    session=db,
                    repo_id=repo.id,
                    query=memory_query,
                    limit=3,
                )
                memory_recall_text = pro.render_memory_recall(prior_memories)
                if prior_memories:
                    await _emit_event(db, task_id, None, "memory_recall", {
                        "count": len(prior_memories),
                        "top_similarity": prior_memories[0].get("similarity", 0.0),
                    })
                    task_log.write(
                        f"\n[memory_recall] {len(prior_memories)} prior entries, "
                        f"top sim={prior_memories[0].get('similarity', 0.0):.2f}\n"
                    )
                    task_log.flush()
              except Exception:
                logger.exception("memory_recall failed for %s", task_key)

            # ─── Interactive mode: plan → approve → implement ─────────────
            approved_plan_text = ""
            if not is_continuation and task.mode == "interactive":
                approved_plan_text = await pro.run_plan_gate(
                    db=db,
                    task_id=task_id,
                    task_key=task_key,
                    title=title,
                    description=description,
                    acceptance=acceptance,
                    repo=repo,
                    worktree_path=worktree_path,
                    claude_mode=claude_mode,
                    task_log=task_log,
                    backend=backend,
                    model=effective_model,
                    # Pass helper functions so ProHooks can call them
                    _run_agent=_run_agent,
                    _emit_event=_emit_event,
                    _update_task_status=_update_task_status,
                    telegram=telegram,
                    TaskRun=TaskRun,
                )
                if approved_plan_text is None:
                    # plan was rejected or timed out — task already marked blocked
                    success = False
                    return success
                task_log.write("\n[plan_gate] plan approved, proceeding to implementation\n")
                task_log.flush()

            error_context = ""
            # Continuation nudge — on a human-initiated /continue, the resume
            # prompt would otherwise be a bare "Continue with the previous
            # task." That's fine when resuming a crashed run, but a common
            # reason operators hit Continue is because the task is in ``test``
            # or ``failed`` and they've just dropped a follow-up message into
            # the task inbox. Seed error_context so the resume prompt points
            # the agent at its inbox before it does anything else.
            if is_continuation and continuation_session_id:
                if _HAS_PRO:
                    error_context = (
                        "## Continuation — human-initiated\n"
                        "The operator has resumed this task. They may have "
                        "dropped a new instruction into your inbox. FIRST, drain "
                        "your inbox before doing anything else:\n"
                        "    curl -s \"$DEVSERVER_WORKER_URL/internal/tasks/"
                        "$DEVSERVER_TASK_KEY/messages/inbox\"\n"
                        "\n"
                        "If you find an operator message:\n"
                        "1. REPLY FIRST — send a brief acknowledgement to "
                        "to_task_key='operator' (kind='response'). For a "
                        "question, answer it directly; for a request, confirm "
                        "what you're about to do. Silent execution is a bug.\n"
                        "2. Then act. Treat the message as the most recent "
                        "human intent — it overrides any previous plan. "
                        "Implement the new scope on the same branch and commit.\n"
                        "3. Send a final 'done' reply when the change is "
                        "committed, then finish.\n"
                        "\n"
                        "If the inbox is empty, resume where you left off."
                    )
                else:
                    error_context = (
                        "## Continuation — human-initiated\n"
                        "The operator has resumed this task. Resume where "
                        "you left off and finish the implementation. Commit "
                        "your changes when done."
                    )
            # Track recurring error classes across retries. Phase 1 #4:
            # if the same class hits twice, escalate instead of burning another full retry.
            error_class_counts: dict[str, int] = {}

            # Phase 2 #6 — budget tracking. Measures only agent-active time
            # (Claude subprocess + verifier), not lock/worktree/plan-gate waits.
            cum_cost = Decimal("0")
            cum_wall_ms = 0
            budget_warned = False
            budget_blocked = False
            budget_reason = ""

            # Phase 2 #7 — extract plan allow-list for PR preflight, if this
            # is an interactive task with an approved plan.
            preflight_allowlist = await pro.get_preflight_allowlist(
                db=db, task_id=task_id, task=task, approved_plan_text=approved_plan_text,
            )

            # Auto-compaction threshold: number of total attempts (primary
            # + backup) after which we transparently summarise the transcript
            # and reset the session to keep the vendor's context window from
            # blowing up. 0 disables the automatic branch entirely — manual
            # /internal/tasks/<key>/compact calls still work.
            compact_after_attempts = 3
            auto_compacted_once = bool(compacted_context)

            # Retry loop
            for attempt in range(1, repo.max_retries + 1):
                # Auto-compaction check. Triggers at most once per task: if
                # we've already consumed ``compact_after_attempts`` attempts
                # and haven't compacted yet, summarise now and drop the
                # session_id so the next attempt starts fresh with the
                # summary as its only context.
                if (
                    not auto_compacted_once
                    and compact_after_attempts > 0
                    and attempt > compact_after_attempts
                ):
                    task_log.write(
                        f"\n[auto-compact] attempt {attempt} > threshold "
                        f"{compact_after_attempts}; summarising transcript\n"
                    )
                    task_log.flush()
                    try:
                        cres = await compaction.compact_task(
                            db, task_id=task_id, reason="auto_after_retries",
                        )
                        if cres["ok"]:
                            compacted_context = cres["summary"]
                            session_id = None  # fresh start — summary IS the history
                            auto_compacted_once = True
                            task_log.write(
                                f"[auto-compact] OK — {cres['chars_in']} → {cres['chars_out']} chars\n"
                            )
                            task_log.flush()
                        else:
                            # Failure is logged by compact_task; keep going.
                            auto_compacted_once = True  # don't retry on every attempt
                    except Exception:
                        logger.exception("auto-compaction failed for %s", task_key)
                        auto_compacted_once = True

                # Phase 2 #6 — check budget before spending another retry.
                state, reason = pro.check_budget(
                    cum_cost=cum_cost,
                    cum_wall_ms=cum_wall_ms,
                    max_cost_usd=max_cost_usd,
                    max_wall_seconds=max_wall_seconds,
                    claude_mode=claude_mode,
                )
                if state == "exceeded":
                    budget_reason = reason
                    logger.warning("Budget exceeded before attempt %d: %s", attempt, reason)
                    await _emit_event(db, task_id, None, "budget_exceeded", {
                        "reason": reason,
                        "cum_cost_usd": float(cum_cost),
                        "cum_wall_seconds": cum_wall_ms / 1000,
                        "max_cost_usd": float(max_cost_usd) if max_cost_usd is not None else None,
                        "max_wall_seconds": max_wall_seconds,
                    })
                    task_log.write(f"\n[budget_exceeded] {reason}\n")
                    task_log.flush()
                    budget_blocked = True
                    break
                if state == "warn" and not budget_warned:
                    budget_warned = True
                    await _emit_event(db, task_id, None, "budget_warning", {
                        "reason": reason,
                        "cum_cost_usd": float(cum_cost),
                        "cum_wall_seconds": cum_wall_ms / 1000,
                    })
                    task_log.write(f"\n[budget_warning] {reason}\n")
                    task_log.flush()
                    await notify.budget_warning(
                        task_key=task_key, repo_name=repo_name, reason=reason,
                        cum_cost=cum_cost, cum_wall_ms=cum_wall_ms,
                        max_cost=max_cost_usd, max_wall=max_wall_seconds,
                    )

                logger.info("--- Attempt %d/%d ---", attempt, repo.max_retries)

                # Insert run record
                run = TaskRun(
                    task_id=task_id,
                    attempt=attempt,
                    branch=branch_name,
                    status="started",
                )
                db.add(run)
                await db.commit()
                await db.refresh(run)
                run_id = run.id

                await _emit_event(db, task_id, run_id, "progress", {
                    "attempt": attempt,
                    "max_retries": repo.max_retries,
                })

                # Build prompt. On the FIRST attempt (and any attempt where
                # there is no live session to resume), include the full
                # Phase 1 context blocks. On a resumed session the agent
                # already has all of that in its conversation history —
                # re-sending it would burn ~5K extra input tokens per turn
                # for nothing and is the dominant cause of 429 rate limits.
                is_resume = session_id is not None
                prompt = _build_prompt(
                    repo_name, branch_name, task_key, title,
                    description, acceptance, error_context,
                    repo_map_text=repo_map_text,
                    reality_signal_text=reality_signal_text,
                    memory_recall_text=memory_recall_text,
                    approved_plan_text=approved_plan_text,
                    compacted_context=compaction.build_compacted_prompt_block(compacted_context),
                    is_resume=is_resume,
                    skip_verify=skip_verify,
                )

                # Run the agent via the resolved backend. The variable is
                # still named ``claude_result`` for continuity with the
                # downstream code that reads ``.get("result")`` etc, but
                # the actual backend can be any of Claude / Gemini / OpenAI
                # / Qwen as determined by ``task.agent_vendor``.
                await _extend_lock(db, repo_name)
                start_ms = time.monotonic_ns() // 1_000_000

                claude_result = await _run_agent(
                    backend=backend,
                    worktree_path=worktree_path,
                    prompt=prompt,
                    model=effective_model,
                    allowed_tools=repo.claude_allowed_tools,
                    session_id=session_id,
                    timeout_minutes=repo.timeout_minutes,
                    task_id=task_id,
                    run_id=run_id,
                    db=db,
                    claude_mode=claude_mode,
                    max_turns=effective_max_turns,
                    task_key=task_key,
                )

                duration_ms = (time.monotonic_ns() // 1_000_000) - start_ms
                exit_code = claude_result["exit_code"]

                # Phase 2 #6 — every attempt (success or failure) consumes
                # real wall time and real API cost. Accumulate both into the
                # per-task counters before any branch handling below.
                attempt_raw_cost = Decimal(str(
                    claude_result.get("total_cost_usd")
                    or claude_result.get("cost_usd")
                    or 0
                ))
                if claude_mode != "max":
                    cum_cost += attempt_raw_cost
                cum_wall_ms += duration_ms

                # Write Claude result to task log file
                raw_output = claude_result.get("raw_output", "")
                result_text = claude_result.get("result", "")
                raw_cost_log = claude_result.get("total_cost_usd") or claude_result.get("cost_usd") or 0
                cost_label = f"~${raw_cost_log:.4f} (Max, not charged)" if claude_mode == "max" else f"${raw_cost_log:.4f}"
                task_log.write(
                    f"\n{'─'*60}\n"
                    f"Attempt {attempt} — exit={exit_code} "
                    f"turns={claude_result.get('num_turns', '?')} "
                    f"duration={duration_ms / 1000:.0f}s "
                    f"cost={cost_label}\n"
                    f"{'─'*60}\n"
                )
                if result_text:
                    task_log.write(f"RESULT:\n{result_text}\n")
                else:
                    # fallback: raw output truncated
                    task_log.write(f"RAW OUTPUT:\n{raw_output[:20_000]}\n")
                task_log.flush()

                if exit_code != 0:
                    subtype = claude_result.get("subtype", "")

                    if backend.vendor == "google" and exit_code == 53:
                        subtype = "error_max_turns"

                    claude_errors = claude_result.get("errors", [])
                    failure_reason = "; ".join(claude_errors) if claude_errors else f"exit code {exit_code}"
                    if subtype:
                        failure_reason = f"{subtype}: {failure_reason}"

                    if subtype == "error_max_turns":
                        # Claude hit the turn limit but made progress — resume the session
                        resumed_session_id = claude_result.get("session_id")
                        logger.warning(
                            "Claude hit max_turns on attempt %d/%d (turns=%s) — resuming session %s",
                            attempt, repo.max_retries,
                            claude_result.get("num_turns", "?"), resumed_session_id,
                        )
                        task_log.write(
                            f"\n[Attempt {attempt}] PAUSED: max_turns reached"
                            f" (turns={claude_result.get('num_turns', '?')})"
                            f" — will resume session {resumed_session_id}\n"
                        )
                        task_log.flush()
                        session_id = resumed_session_id
                        await db.execute(
                            update(TaskRun).where(TaskRun.id == run_id).values(
                                status="failed",
                                finished_at=datetime.now(timezone.utc),
                                error_log="max_turns reached, resuming",
                                duration_ms=duration_ms,
                            )
                        )
                        await db.commit()
                        # Don't update error_context — let the resumed session continue naturally
                        continue

                    logger.error(
                        "Claude failed on attempt %d/%d: %s",
                        attempt, repo.max_retries, failure_reason,
                    )
                    task_log.write(
                        f"\n[Attempt {attempt}] FAILED: {failure_reason}\n"
                    )
                    task_log.flush()
                    # Classify the failure → targeted hint for next retry.
                    cls = error_classifier.classify(raw_output)
                    error_context = error_classifier.build_remediation_block(cls, raw_output)
                    if cls is not None:
                        error_class_counts[cls.key] = error_class_counts.get(cls.key, 0) + 1
                        await _emit_event(db, task_id, run_id, "error_classified", {
                            "class": cls.key,
                            "severity": cls.severity,
                            "hint": cls.hint,
                            "repeat": error_class_counts[cls.key],
                        })
                    await db.execute(
                        update(TaskRun).where(TaskRun.id == run_id).values(
                            status="failed",
                            finished_at=datetime.now(timezone.utc),