From 1ed226a49d0022867f05e72528cf081d9082826f Mon Sep 17 00:00:00 2001 From: Matt McKenna Date: Thu, 22 Jan 2026 12:10:46 -0500 Subject: [PATCH 1/5] Extract shared queue infrastructure into queue_core.py - Create queue_core.py with shared database, logging, and cleanup logic - Refactor task_queue.py and tq.py to use shared module - Fix Ctrl+C leaving orphaned waiting tasks by splitting registration from wait - Stream output directly to file with bounded deque for memory efficiency - Add signal handling integration tests for tq CLI Co-Authored-By: Claude Sonnet 4.5 --- queue_core.py | 267 +++++++++++++++++++++++++++++ task_queue.py | 393 +++++++++++++++---------------------------- tests/test_queue.py | 78 ++++++++- tests/test_tq_cli.py | 132 +++++++++++++++ tq.py | 255 +++++++++------------------- 5 files changed, 690 insertions(+), 435 deletions(-) create mode 100644 queue_core.py diff --git a/queue_core.py b/queue_core.py new file mode 100644 index 0000000..e8d29eb --- /dev/null +++ b/queue_core.py @@ -0,0 +1,267 @@ +""" +Queue Core - Shared infrastructure for agent-task-queue. + +This module contains the shared logic used by both: +- task_queue.py (MCP server) +- tq.py (CLI tool) +""" + +import json +import os +import signal +import sqlite3 +from contextlib import contextmanager +from dataclasses import dataclass +from datetime import datetime, timedelta +from pathlib import Path + + +# --- Configuration --- +DEFAULT_DATA_DIR = Path(os.environ.get("TASK_QUEUE_DATA_DIR", "/tmp/agent-task-queue")) +POLL_INTERVAL_WAITING = float(os.environ.get("TASK_QUEUE_POLL_WAITING", "1")) +POLL_INTERVAL_READY = float(os.environ.get("TASK_QUEUE_POLL_READY", "1")) +DEFAULT_MAX_LOCK_AGE_MINUTES = 120 +DEFAULT_MAX_METRICS_SIZE_MB = 5 + + +@dataclass +class QueuePaths: + """Paths for queue data files.""" + + data_dir: Path + db_path: Path + metrics_path: Path + output_dir: Path + + @classmethod + def from_data_dir(cls, data_dir: Path) -> "QueuePaths": + return cls( + data_dir=data_dir, + db_path=data_dir / "queue.db", + metrics_path=data_dir / "agent-task-queue-logs.json", + output_dir=data_dir / "output", + ) + + +# --- Database Schema --- +QUEUE_SCHEMA = """ +CREATE TABLE IF NOT EXISTS queue ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + queue_name TEXT NOT NULL, + status TEXT NOT NULL, + pid INTEGER, + child_pid INTEGER, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +""" + +QUEUE_INDEX = """ +CREATE INDEX IF NOT EXISTS idx_queue_status ON queue(queue_name, status) +""" + + +# --- Database Functions --- +@contextmanager +def get_db(db_path: Path): + """Get database connection with WAL mode for better concurrency.""" + conn = sqlite3.connect(db_path, timeout=60.0) + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA busy_timeout=60000") + conn.row_factory = sqlite3.Row + try: + yield conn + conn.commit() + except Exception: + conn.rollback() + raise + finally: + conn.close() + + +def init_db(paths: QueuePaths): + """Initialize DB with queue table.""" + paths.data_dir.mkdir(parents=True, exist_ok=True) + with get_db(paths.db_path) as conn: + conn.execute(QUEUE_SCHEMA) + conn.execute(QUEUE_INDEX) + + +def ensure_db(paths: QueuePaths): + """Ensure database exists and is valid. 
Recreates if corrupted.""" + try: + with get_db(paths.db_path) as conn: + conn.execute("SELECT 1 FROM queue LIMIT 1") + except sqlite3.OperationalError: + # Database missing or corrupted - clean up and reinitialize + for suffix in ["", "-wal", "-shm"]: + path = Path(str(paths.db_path) + suffix) + if path.exists(): + try: + path.unlink() + except OSError: + pass + init_db(paths) + + +# --- Process Management --- +def is_process_alive(pid: int) -> bool: + """Check if a process ID exists on the host OS.""" + if pid is None: + return False + try: + os.kill(pid, 0) + return True + except OSError: + return False + + +def kill_process_tree(pid: int): + """Kill a process and all its children via process group.""" + if not pid or not is_process_alive(pid): + return + try: + # Kill the entire process group (works because we use start_new_session=True) + os.killpg(pid, signal.SIGTERM) + except OSError: + # Fallback: try killing just the process if process group kill fails + try: + os.kill(pid, signal.SIGTERM) + except OSError: + pass + + +# --- Logging --- +def log_fmt(msg: str) -> str: + """Format log message with timestamp.""" + timestamp = datetime.now().strftime("%H:%M:%S") + return f"[{timestamp}] [TASK-QUEUE] {msg}" + + +def log_metric( + metrics_path: Path, + event: str, + max_size_mb: float = DEFAULT_MAX_METRICS_SIZE_MB, + **kwargs, +): + """ + Append a JSON metric entry to the log file. + Rotates log file when it exceeds max_size_mb. + """ + if metrics_path.exists(): + try: + size_mb = metrics_path.stat().st_size / (1024 * 1024) + if size_mb > max_size_mb: + rotated = metrics_path.with_suffix(".json.1") + metrics_path.rename(rotated) + except OSError: + pass + + entry = { + "event": event, + "timestamp": datetime.now().isoformat(), + **kwargs, + } + with open(metrics_path, "a") as f: + f.write(json.dumps(entry) + "\n") + + +# --- Queue Cleanup --- +def cleanup_queue( + conn, + queue_name: str, + metrics_path: Path, + max_lock_age_minutes: int = DEFAULT_MAX_LOCK_AGE_MINUTES, + log_fn=None, +): + """ + Clean up dead/stale locks and orphaned waiting tasks. + + Args: + conn: SQLite connection (must have row_factory=sqlite3.Row) + queue_name: Name of the queue to clean + metrics_path: Path to metrics log file + max_lock_age_minutes: Timeout for stale locks + log_fn: Optional function for logging (signature: log_fn(message)) + """ + + def log(msg): + if log_fn: + log_fn(msg) + else: + print(log_fmt(msg)) + + # Check for dead parents (running tasks) + runners = conn.execute( + "SELECT id, pid, child_pid FROM queue WHERE queue_name = ? AND status = 'running'", + (queue_name,), + ).fetchall() + + for runner in runners: + if not is_process_alive(runner["pid"]): + child = runner["child_pid"] + if child and is_process_alive(child): + log( + f"WARNING: Parent PID {runner['pid']} died. Killing orphan child PID {child}..." + ) + kill_process_tree(child) + + conn.execute("DELETE FROM queue WHERE id = ?", (runner["id"],)) + log_metric( + metrics_path, + "zombie_cleared", + task_id=runner["id"], + queue_name=queue_name, + dead_pid=runner["pid"], + reason="parent_died", + ) + log(f"WARNING: Cleared zombie lock (ID: {runner['id']}).") + + # Check for orphaned waiting tasks (parent process died before acquiring lock) + waiters = conn.execute( + "SELECT id, pid FROM queue WHERE queue_name = ? 
AND status = 'waiting'", + (queue_name,), + ).fetchall() + + for waiter in waiters: + if not is_process_alive(waiter["pid"]): + conn.execute("DELETE FROM queue WHERE id = ?", (waiter["id"],)) + log_metric( + metrics_path, + "orphan_cleared", + task_id=waiter["id"], + queue_name=queue_name, + dead_pid=waiter["pid"], + reason="waiting_parent_died", + ) + log(f"WARNING: Cleared orphaned waiting task (ID: {waiter['id']}).") + + # Check for timeouts (stale locks) + cutoff = (datetime.now() - timedelta(minutes=max_lock_age_minutes)).isoformat() + stale = conn.execute( + "SELECT id, child_pid FROM queue WHERE queue_name = ? AND status = 'running' AND updated_at < ?", + (queue_name, cutoff), + ).fetchall() + + for task in stale: + if task["child_pid"]: + kill_process_tree(task["child_pid"]) + conn.execute("DELETE FROM queue WHERE id = ?", (task["id"],)) + log_metric( + metrics_path, + "zombie_cleared", + task_id=task["id"], + queue_name=queue_name, + reason="timeout", + timeout_minutes=max_lock_age_minutes, + ) + log(f"WARNING: Cleared stale lock (ID: {task['id']}) active > {max_lock_age_minutes}m") + + +def release_lock(conn, task_id: int): + """Release a queue lock by deleting the task entry.""" + try: + conn.execute("DELETE FROM queue WHERE id = ?", (task_id,)) + conn.commit() + except sqlite3.OperationalError: + pass diff --git a/task_queue.py b/task_queue.py index 24d8f91..9ae2723 100644 --- a/task_queue.py +++ b/task_queue.py @@ -8,18 +8,33 @@ import argparse import asyncio -import json import os +import resource import signal import sqlite3 import time -from contextlib import contextmanager -from datetime import datetime, timedelta +from collections import deque +from datetime import datetime from pathlib import Path from fastmcp import FastMCP from fastmcp.server.dependencies import get_context +# Import shared queue infrastructure +from queue_core import ( + QueuePaths, + get_db as _get_db, + init_db as _init_db, + ensure_db as _ensure_db, + cleanup_queue as _cleanup_queue, + log_metric as _log_metric, + log_fmt, + is_process_alive, + kill_process_tree, + POLL_INTERVAL_WAITING, + POLL_INTERVAL_READY, +) + # --- Argument Parsing --- def parse_args(): @@ -69,91 +84,51 @@ def parse_args(): ) # --- Configuration --- -DATA_DIR = Path(_args.data_dir) -OUTPUT_DIR = DATA_DIR / "output" -DB_PATH = DATA_DIR / "queue.db" -METRICS_PATH = DATA_DIR / "agent-task-queue-logs.json" +PATHS = QueuePaths.from_data_dir(Path(_args.data_dir)) +OUTPUT_DIR = PATHS.output_dir MAX_METRICS_SIZE_MB = _args.max_log_size MAX_OUTPUT_FILES = _args.max_output_files TAIL_LINES_ON_FAILURE = _args.tail_lines SERVER_NAME = "Task Queue" MAX_LOCK_AGE_MINUTES = _args.lock_timeout -# Polling intervals (configurable via environment) -POLL_INTERVAL_WAITING = float(os.environ.get("TASK_QUEUE_POLL_WAITING", "1")) -POLL_INTERVAL_READY = float(os.environ.get("TASK_QUEUE_POLL_READY", "1")) - mcp = FastMCP(SERVER_NAME) -# --- Database & Logging --- -@contextmanager +# --- Wrappers for shared functions (use module-level paths) --- def get_db(): - """Get database connection with WAL mode for better concurrency.""" - conn = sqlite3.connect(DB_PATH, timeout=60.0) - conn.execute("PRAGMA journal_mode=WAL") - conn.execute("PRAGMA busy_timeout=60000") - conn.row_factory = sqlite3.Row - try: - yield conn - conn.commit() - except Exception: - conn.rollback() - raise - finally: - conn.close() + """Get database connection using configured path.""" + return _get_db(PATHS.db_path) def init_db(): - """Initialize DB with PID columns for process 
tracking.""" - DATA_DIR.mkdir(parents=True, exist_ok=True) - with get_db() as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS queue ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - queue_name TEXT NOT NULL, - status TEXT NOT NULL, - pid INTEGER, - child_pid INTEGER, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_queue_status - ON queue(queue_name, status) - """) + """Initialize database using configured paths.""" + _init_db(PATHS) -def log_fmt(msg: str) -> str: - """Format log message with timestamp.""" - timestamp = datetime.now().strftime("%H:%M:%S") - return f"[{timestamp}] [TASK-QUEUE] {msg}" +def ensure_db(): + """Ensure database exists and is valid using configured paths.""" + _ensure_db(PATHS) def log_metric(event: str, **kwargs): - """ - Append a JSON metric entry to the log file. - Rotates log file when it exceeds MAX_METRICS_SIZE_MB. - """ - DATA_DIR.mkdir(parents=True, exist_ok=True) + """Log metric using configured paths.""" + PATHS.data_dir.mkdir(parents=True, exist_ok=True) + _log_metric(PATHS.metrics_path, event, MAX_METRICS_SIZE_MB, **kwargs) - # Rotate if file exceeds size limit - if METRICS_PATH.exists(): - size_mb = METRICS_PATH.stat().st_size / (1024 * 1024) - if size_mb > MAX_METRICS_SIZE_MB: - rotated = METRICS_PATH.with_suffix(".json.1") - METRICS_PATH.rename(rotated) - entry = { - "event": event, - "timestamp": datetime.now().isoformat(), - **kwargs, - } - with open(METRICS_PATH, "a") as f: - f.write(json.dumps(entry) + "\n") +def cleanup_queue(conn, queue_name: str): + """Clean up queue using configured paths.""" + _cleanup_queue( + conn, + queue_name, + PATHS.metrics_path, + MAX_LOCK_AGE_MINUTES, + log_fn=lambda msg: print(log_fmt(msg)), + ) +# --- Output File Management --- def cleanup_output_files(): """Remove oldest output files if over MAX_OUTPUT_FILES limit.""" if not OUTPUT_DIR.exists(): @@ -183,111 +158,16 @@ def clear_output_files() -> int: return count -# --- Process Liveness Logic --- -def is_process_alive(pid: int) -> bool: - """Check if a process ID exists on the host OS.""" - if pid is None: - return False - try: - os.kill(pid, 0) - return True - except OSError: - return False - - -def kill_process_tree(pid: int): - """Kill a process and all its children by killing the process group.""" - if not pid or not is_process_alive(pid): - return - try: - # Kill the entire process group (works because we use start_new_session=True) - os.killpg(pid, signal.SIGTERM) - except OSError: - # Fallback: try killing just the process if process group kill fails - try: - os.kill(pid, signal.SIGTERM) - except OSError: - pass - - -def cleanup_queue(conn, queue_name: str): - """ - Active Cleanup: - 1. Check if MCP server holding lock is alive - 2. If not, kill orphaned child process - 3. Check for timeouts - """ - # Check for dead parents - runners = conn.execute( - "SELECT id, pid, child_pid FROM queue WHERE queue_name = ? AND status = 'running'", - (queue_name,), - ).fetchall() - - for runner in runners: - if not is_process_alive(runner["pid"]): - child = runner["child_pid"] - if child and is_process_alive(child): - print( - log_fmt( - f"WARNING: Parent PID {runner['pid']} died. Killing orphan child PID {child}..." 
- ) - ) - kill_process_tree(child) - - conn.execute("DELETE FROM queue WHERE id = ?", (runner["id"],)) - log_metric( - "zombie_cleared", - task_id=runner["id"], - queue_name=queue_name, - dead_pid=runner["pid"], - reason="parent_died", - ) - print(log_fmt(f"WARNING: Cleared zombie lock (ID: {runner['id']}).")) - - # Check for timeouts - cutoff = (datetime.now() - timedelta(minutes=MAX_LOCK_AGE_MINUTES)).isoformat() - stale = conn.execute( - "SELECT id, child_pid FROM queue WHERE queue_name = ? AND status = 'running' AND updated_at < ?", - (queue_name, cutoff), - ).fetchall() - - for task in stale: - if task["child_pid"]: - kill_process_tree(task["child_pid"]) - conn.execute("DELETE FROM queue WHERE id = ?", (task["id"],)) - log_metric( - "zombie_cleared", - task_id=task["id"], - queue_name=queue_name, - reason="timeout", - timeout_minutes=MAX_LOCK_AGE_MINUTES, - ) - print( - log_fmt( - f"WARNING: Cleared stale lock (ID: {task['id']}) active > {MAX_LOCK_AGE_MINUTES}m" - ) - ) +def get_memory_mb() -> float: + """Get current process memory usage in MB (RSS - resident set size).""" + usage = resource.getrusage(resource.RUSAGE_SELF) + # ru_maxrss is in bytes on Linux, kilobytes on macOS + if os.uname().sysname == "Darwin": + return usage.ru_maxrss / (1024 * 1024) # KB to MB + return usage.ru_maxrss / 1024 # bytes to MB on Linux # --- Core Queue Logic --- -def ensure_db(): - """Ensure database exists and is valid. Recreates if corrupted.""" - try: - with get_db() as conn: - # Quick check that table exists - conn.execute("SELECT 1 FROM queue LIMIT 1") - except sqlite3.OperationalError: - # Database missing or corrupted - clean up and reinitialize - for suffix in ["", "-wal", "-shm"]: - path = Path(str(DB_PATH) + suffix) - if path.exists(): - try: - path.unlink() - except OSError: - pass - init_db() - - async def wait_for_turn(queue_name: str) -> int: """Register task, wait for turn, return task ID when acquired.""" # Ensure database exists and is valid @@ -471,10 +351,18 @@ async def run_task( env[key.strip()] = value.strip() task_id = await wait_for_turn(queue_name) + mem_before = get_memory_mb() start = time.time() - stdout_lines = [] - stderr_lines = [] + # Use bounded deques - only keep last N lines in memory for error messages + stdout_tail: deque = deque(maxlen=TAIL_LINES_ON_FAILURE) + stderr_tail: deque = deque(maxlen=TAIL_LINES_ON_FAILURE) + stdout_count = 0 + stderr_count = 0 + + # Create output file early and stream directly to it + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + output_file = OUTPUT_DIR / f"task_{task_id}.log" try: proc = await asyncio.create_subprocess_shell( @@ -492,98 +380,95 @@ async def run_task( "UPDATE queue SET child_pid = ? 
WHERE id = ?", (proc.pid, task_id) ) - async def stream_output(stream, lines: list): - """Read lines from stream (no real-time streaming to save tokens).""" - while True: - line = await stream.readline() - if not line: - break - lines.append(line.decode().rstrip()) + # Open file for streaming output - write header first + with open(output_file, "w") as f: + f.write(f"COMMAND: {command}\n") + f.write(f"WORKING DIR: {working_directory}\n") + f.write(f"STARTED: {datetime.now().isoformat()}\n") + f.write("\n--- STDOUT ---\n") + + async def stream_to_file(stream, tail_buffer: deque, label: str): + """Stream output directly to file, keeping only tail in memory.""" + nonlocal stdout_count, stderr_count + while True: + line = await stream.readline() + if not line: + break + decoded = line.decode().rstrip() + f.write(decoded + "\n") + f.flush() # Ensure immediate write to disk + tail_buffer.append(decoded) + if label == "stdout": + stdout_count += 1 + else: + stderr_count += 1 - try: - # Collect stdout and stderr concurrently (no streaming to save tokens) - await asyncio.wait_for( - asyncio.gather( - stream_output(proc.stdout, stdout_lines), - stream_output(proc.stderr, stderr_lines), - ), - timeout=timeout_seconds, - ) - await proc.wait() - duration = time.time() - start - - log_metric( - "task_completed", - task_id=task_id, - queue_name=queue_name, - command=command, - exit_code=proc.returncode, - duration_seconds=round(duration, 2), - stdout_lines=len(stdout_lines), - stderr_lines=len(stderr_lines), - ) + try: + # Stream stdout first, then stderr (written sequentially to file) + await asyncio.wait_for( + stream_to_file(proc.stdout, stdout_tail, "stdout"), + timeout=timeout_seconds, + ) + f.write("\n--- STDERR ---\n") + await asyncio.wait_for( + stream_to_file(proc.stderr, stderr_tail, "stderr"), + timeout=timeout_seconds, + ) + await proc.wait() + duration = time.time() - start - # Write full output to file - OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - output_file = OUTPUT_DIR / f"task_{task_id}.log" - with open(output_file, "w") as f: - f.write(f"COMMAND: {command}\n") + # Append summary to file + f.write("\n--- SUMMARY ---\n") f.write(f"EXIT CODE: {proc.returncode}\n") f.write(f"DURATION: {duration:.1f}s\n") - f.write(f"WORKING DIR: {working_directory}\n") - f.write("\n--- STDOUT ---\n") - f.write("\n".join(stdout_lines)) - f.write("\n\n--- STDERR ---\n") - f.write("\n".join(stderr_lines)) - cleanup_output_files() - - # Return concise summary for agents - if proc.returncode == 0: - return f"SUCCESS exit=0 {duration:.1f}s output={output_file}" - else: - # On failure, include tail of output for context - tail = ( - stderr_lines[-TAIL_LINES_ON_FAILURE:] - if stderr_lines - else stdout_lines[-TAIL_LINES_ON_FAILURE:] + + except asyncio.TimeoutError: + try: + os.killpg(proc.pid, signal.SIGKILL) + await proc.wait() + except Exception: + pass + f.write("\n--- SUMMARY ---\n") + f.write(f"EXIT CODE: TIMEOUT (killed after {timeout_seconds}s)\n") + + log_metric( + "task_timeout", + task_id=task_id, + queue_name=queue_name, + command=command, + timeout_seconds=timeout_seconds, + memory_mb=round(get_memory_mb(), 1), ) + cleanup_output_files() + + tail = list(stderr_tail) if stderr_tail else list(stdout_tail) tail_text = "\n".join(tail) if tail else "(no output)" - return f"FAILED exit={proc.returncode} {duration:.1f}s output={output_file}\n{tail_text}" + return f"TIMEOUT killed after {timeout_seconds}s output={output_file}\n{tail_text}" - except asyncio.TimeoutError: - try: - # Kill entire process group 
to ensure all child processes die - os.killpg(proc.pid, signal.SIGKILL) - await proc.wait() - except Exception: - pass - log_metric( - "task_timeout", - task_id=task_id, - queue_name=queue_name, - command=command, - timeout_seconds=timeout_seconds, - ) - # Write partial output to file - OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - output_file = OUTPUT_DIR / f"task_{task_id}.log" - with open(output_file, "w") as f: - f.write(f"COMMAND: {command}\n") - f.write(f"EXIT CODE: TIMEOUT (killed after {timeout_seconds}s)\n") - f.write(f"WORKING DIR: {working_directory}\n") - f.write("\n--- STDOUT ---\n") - f.write("\n".join(stdout_lines)) - f.write("\n\n--- STDERR ---\n") - f.write("\n".join(stderr_lines)) - cleanup_output_files() - - tail = ( - stderr_lines[-TAIL_LINES_ON_FAILURE:] - if stderr_lines - else stdout_lines[-TAIL_LINES_ON_FAILURE:] - ) + # File is now closed, log metrics + mem_after = get_memory_mb() + log_metric( + "task_completed", + task_id=task_id, + queue_name=queue_name, + command=command, + exit_code=proc.returncode, + duration_seconds=round(duration, 2), + stdout_lines=stdout_count, + stderr_lines=stderr_count, + memory_before_mb=round(mem_before, 1), + memory_after_mb=round(mem_after, 1), + ) + cleanup_output_files() + + # Return concise summary for agents + if proc.returncode == 0: + return f"SUCCESS exit=0 {duration:.1f}s output={output_file}" + else: + # On failure, include tail of output for context + tail = list(stderr_tail) if stderr_tail else list(stdout_tail) tail_text = "\n".join(tail) if tail else "(no output)" - return f"TIMEOUT killed after {timeout_seconds}s output={output_file}\n{tail_text}" + return f"FAILED exit={proc.returncode} {duration:.1f}s output={output_file}\n{tail_text}" except Exception as e: log_metric( diff --git a/tests/test_queue.py b/tests/test_queue.py index 50a5109..07b2431 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -17,7 +17,7 @@ from fastmcp import Client from task_queue import ( mcp, - DB_PATH, + PATHS, OUTPUT_DIR, get_db, init_db, @@ -26,6 +26,9 @@ MAX_LOCK_AGE_MINUTES, ) +# Use PATHS for database path +DB_PATH = PATHS.db_path + @pytest.fixture(autouse=True) def clean_db(): @@ -667,6 +670,79 @@ def test_zombie_cleanup_preserves_valid_tasks(): conn.execute("DELETE FROM queue WHERE queue_name = 'valid_test'") +def test_orphan_cleanup_dead_parent_waiting(): + """Test that waiting tasks with dead parent PIDs are cleaned up.""" + dead_pid = 999999999 # This PID should not exist + + with get_db() as conn: + conn.execute( + """INSERT INTO queue (queue_name, status, pid, child_pid, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?)""", + ( + "orphan_test", + "waiting", # Key difference: this is a WAITING task, not running + dead_pid, + None, + datetime.now().isoformat(), + datetime.now().isoformat(), + ), + ) + + # Verify the task exists + count_before = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'orphan_test'" + ).fetchone()["c"] + assert count_before == 1 + + # Run cleanup + cleanup_queue(conn, "orphan_test") + + # Verify the orphaned waiting task was removed + count_after = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'orphan_test'" + ).fetchone()["c"] + assert count_after == 0, "Orphaned waiting task should be cleaned up" + + +def test_orphan_cleanup_preserves_valid_waiting(): + """Test that cleanup doesn't remove valid waiting tasks.""" + import os + + my_pid = os.getpid() # Use our own PID so it's "alive" + + with get_db() as conn: + conn.execute( + """INSERT INTO queue 
(queue_name, status, pid, child_pid, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?)""", + ( + "valid_waiting_test", + "waiting", + my_pid, + None, + datetime.now().isoformat(), + datetime.now().isoformat(), + ), + ) + + # Verify the task exists + count_before = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_waiting_test'" + ).fetchone()["c"] + assert count_before == 1 + + # Run cleanup + cleanup_queue(conn, "valid_waiting_test") + + # Verify the task is still there (not removed) + count_after = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_waiting_test'" + ).fetchone()["c"] + assert count_after == 1, "Valid waiting task should NOT be cleaned up" + + # Clean up for other tests + conn.execute("DELETE FROM queue WHERE queue_name = 'valid_waiting_test'") + + # --- Configuration Tests --- diff --git a/tests/test_tq_cli.py b/tests/test_tq_cli.py index 5a37e74..537788b 100644 --- a/tests/test_tq_cli.py +++ b/tests/test_tq_cli.py @@ -264,3 +264,135 @@ def test_queue_cleanup_after_completion(self, temp_data_dir): # Queue should be empty after task completes assert "empty" in result.stdout.lower() + + +class TestSignalHandling: + """Test signal handling and cleanup on interrupt.""" + + def test_sigint_cleanup_waiting_task(self, temp_data_dir): + """ + Test that SIGINT (Ctrl+C) properly cleans up a waiting task. + + This tests the fix for the bug where Ctrl+C during the wait phase + would leave orphaned 'waiting' tasks in the queue. + """ + import os + import signal + import sqlite3 + import time + + db_path = Path(temp_data_dir) / "queue.db" + + # First, start a long-running task to hold the lock + # Use start_new_session to isolate the process group + blocker = subprocess.Popen( + [sys.executable, str(TQ_PATH), f"--data-dir={temp_data_dir}", "sleep", "30"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + ) + + # Wait for blocker to start and acquire lock + time.sleep(0.5) + + # Verify blocker has the lock + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + running = conn.execute("SELECT * FROM queue WHERE status = 'running'").fetchone() + assert running is not None, "Blocker should be running" + blocker_task_id = running["id"] + + # Now start a second task that will wait in queue + waiter = subprocess.Popen( + [sys.executable, str(TQ_PATH), f"--data-dir={temp_data_dir}", "echo", "waited"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + ) + + # Wait for waiter to register in queue + time.sleep(0.5) + + # Verify waiter is in waiting state + waiting = conn.execute("SELECT * FROM queue WHERE status = 'waiting'").fetchone() + assert waiting is not None, "Waiter should be in waiting state" + waiter_task_id = waiting["id"] + + # Send SIGINT to the waiting process (simulating Ctrl+C) + waiter.send_signal(signal.SIGINT) + + # Wait for waiter to exit + waiter.wait(timeout=5) + assert waiter.returncode == 130, "Waiter should exit with 130 (128 + SIGINT)" + + # Verify the waiting task was cleaned up + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + remaining_waiting = conn.execute( + "SELECT * FROM queue WHERE status = 'waiting' AND id = ?", (waiter_task_id,) + ).fetchone() + assert remaining_waiting is None, "Waiting task should be cleaned up after SIGINT" + + # Clean up: kill the blocker + try: + os.killpg(os.getpgid(blocker.pid), signal.SIGTERM) + except Exception: + blocker.terminate() + blocker.wait(timeout=5) + 
conn.close() + + def test_sigint_cleanup_running_task(self, temp_data_dir): + """ + Test that SIGINT properly cleans up a running task and its subprocess. + """ + import os + import signal + import sqlite3 + import time + + db_path = Path(temp_data_dir) / "queue.db" + + # Start a task that will run for a while + proc = subprocess.Popen( + [sys.executable, str(TQ_PATH), f"--data-dir={temp_data_dir}", "sleep", "30"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, # So we can kill the whole group + ) + + # Wait for it to start running + time.sleep(0.5) + + # Verify it's running + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + running = conn.execute("SELECT * FROM queue WHERE status = 'running'").fetchone() + assert running is not None, "Task should be running" + task_id = running["id"] + child_pid = running["child_pid"] + assert child_pid is not None, "Child PID should be recorded" + + # Send SIGINT + proc.send_signal(signal.SIGINT) + + # Wait for cleanup + proc.wait(timeout=10) + assert proc.returncode == 130, "Should exit with 130 (128 + SIGINT)" + + # Verify task was cleaned up + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + remaining = conn.execute( + "SELECT * FROM queue WHERE id = ?", (task_id,) + ).fetchone() + assert remaining is None, "Task should be cleaned up from queue" + + # Verify child process is dead + try: + os.kill(child_pid, 0) + # If we get here, process is still alive - that's bad + assert False, f"Child process {child_pid} should be dead" + except OSError: + pass # Expected - process is dead + + conn.close() diff --git a/tq.py b/tq.py index e0aac93..03a550a 100644 --- a/tq.py +++ b/tq.py @@ -14,28 +14,45 @@ import subprocess import sys import time -from datetime import datetime, timedelta +from datetime import datetime from pathlib import Path - -def get_data_dir(args): - """Get data directory from args or environment.""" +# Import shared queue infrastructure +from queue_core import ( + QueuePaths, + get_db, + init_db, + ensure_db, + cleanup_queue as _cleanup_queue, + log_metric as _log_metric, + release_lock, + is_process_alive, + kill_process_tree, + POLL_INTERVAL_WAITING, + DEFAULT_MAX_LOCK_AGE_MINUTES, + DEFAULT_MAX_METRICS_SIZE_MB, +) + + +def get_paths(args) -> QueuePaths: + """Get queue paths from args or environment.""" if args.data_dir: - return Path(args.data_dir) - return Path(os.environ.get("TASK_QUEUE_DATA_DIR", "/tmp/agent-task-queue")) + data_dir = Path(args.data_dir) + else: + data_dir = Path(os.environ.get("TASK_QUEUE_DATA_DIR", "/tmp/agent-task-queue")) + return QueuePaths.from_data_dir(data_dir) def cmd_list(args): """List all tasks in the queue.""" - data_dir = get_data_dir(args) - db_path = data_dir / "queue.db" + paths = get_paths(args) - if not db_path.exists(): - print(f"No queue database found at {db_path}") + if not paths.db_path.exists(): + print(f"No queue database found at {paths.db_path}") print("Queue is empty (no tasks have been run yet)") return - conn = sqlite3.connect(db_path, timeout=5.0) + conn = sqlite3.connect(paths.db_path, timeout=5.0) conn.row_factory = sqlite3.Row try: @@ -83,14 +100,13 @@ def cmd_list(args): def cmd_clear(args): """Clear all tasks from the queue.""" - data_dir = get_data_dir(args) - db_path = data_dir / "queue.db" + paths = get_paths(args) - if not db_path.exists(): + if not paths.db_path.exists(): print("No queue database found") return - conn = sqlite3.connect(db_path, timeout=5.0) + conn = 
sqlite3.connect(paths.db_path, timeout=5.0) try: # Check how many tasks exist count = conn.execute("SELECT COUNT(*) FROM queue").fetchone()[0] @@ -112,16 +128,13 @@ def cmd_clear(args): def cmd_logs(args): """Show recent log entries.""" - data_dir = get_data_dir(args) - log_path = data_dir / "agent-task-queue-logs.json" + paths = get_paths(args) - if not log_path.exists(): - print(f"No log file found at {log_path}") + if not paths.metrics_path.exists(): + print(f"No log file found at {paths.metrics_path}") return - import json - - lines = log_path.read_text().strip().split("\n") + lines = paths.metrics_path.read_text().strip().split("\n") recent = lines[-args.n:] if len(lines) > args.n else lines for line in recent: @@ -150,6 +163,9 @@ def cmd_logs(args): elif event == "zombie_cleared": reason = entry.get("reason", "?") print(f"{ts} [{queue}] #{task_id} zombie cleared ({reason})") + elif event == "orphan_cleared": + reason = entry.get("reason", "?") + print(f"{ts} [{queue}] #{task_id} orphan cleared ({reason})") else: print(f"{ts} {event}") except json.JSONDecodeError: @@ -158,124 +174,18 @@ def cmd_logs(args): # --- Run Command Implementation --- -# Configuration -POLL_INTERVAL = 1.0 # seconds between queue checks -MAX_LOCK_AGE_MINUTES = 120 # stale lock timeout -MAX_METRICS_SIZE_MB = 5 # rotate log when exceeds this size - - -def ensure_db(db_path: Path): - """Ensure database exists and is valid. Recreates if corrupted.""" - try: - conn = sqlite3.connect(db_path, timeout=5.0) - conn.execute("SELECT 1 FROM queue LIMIT 1") - conn.close() - except sqlite3.OperationalError: - # Database missing or corrupted - clean up and reinitialize - for suffix in ["", "-wal", "-shm"]: - path = Path(str(db_path) + suffix) - if path.exists(): - try: - path.unlink() - except OSError: - pass - - -def log_metric(data_dir: Path, event: str, **kwargs): - """Append a JSON metric entry to the log file. Rotates when size exceeds limit.""" - log_path = data_dir / "agent-task-queue-logs.json" +def log_metric(paths: QueuePaths, event: str, **kwargs): + """Log metric using paths (wrapper for CLI).""" + _log_metric(paths.metrics_path, event, DEFAULT_MAX_METRICS_SIZE_MB, **kwargs) - # Rotate if file exceeds size limit - if log_path.exists(): - try: - size_mb = log_path.stat().st_size / (1024 * 1024) - if size_mb > MAX_METRICS_SIZE_MB: - rotated = log_path.with_suffix(".json.1") - log_path.rename(rotated) - except OSError: - pass - - entry = { - "event": event, - "timestamp": datetime.now().isoformat(), - **kwargs, - } - with open(log_path, "a") as f: - f.write(json.dumps(entry) + "\n") - - -def is_process_alive(pid: int) -> bool: - """Check if a process ID exists.""" - if pid is None: - return False - try: - os.kill(pid, 0) - return True - except OSError: - return False - -def kill_process_tree(pid: int): - """Kill a process and all its children.""" - if not pid or not is_process_alive(pid): - return - try: - os.killpg(pid, signal.SIGTERM) - except OSError: - try: - os.kill(pid, signal.SIGTERM) - except OSError: - pass +def cleanup_queue(conn, queue_name: str, paths: QueuePaths): + """Clean up queue (wrapper for CLI).""" + _cleanup_queue(conn, queue_name, paths.metrics_path, DEFAULT_MAX_LOCK_AGE_MINUTES) -def cleanup_queue(conn, queue_name: str, data_dir: Path): - """Clean up dead/stale locks and log metrics.""" - # Check for dead parents - runners = conn.execute( - "SELECT id, pid, child_pid FROM queue WHERE queue_name = ? 
AND status = 'running'", - (queue_name,), - ).fetchall() - - for runner in runners: - if not is_process_alive(runner["pid"]): - child = runner["child_pid"] - if child and is_process_alive(child): - kill_process_tree(child) - conn.execute("DELETE FROM queue WHERE id = ?", (runner["id"],)) - conn.commit() - log_metric( - data_dir, - "zombie_cleared", - task_id=runner["id"], - queue_name=queue_name, - dead_pid=runner["pid"], - reason="parent_died", - ) - - # Check for timeouts - cutoff = (datetime.now() - timedelta(minutes=MAX_LOCK_AGE_MINUTES)).isoformat() - stale = conn.execute( - "SELECT id, child_pid FROM queue WHERE queue_name = ? AND status = 'running' AND updated_at < ?", - (queue_name, cutoff), - ).fetchall() - - for task in stale: - if task["child_pid"]: - kill_process_tree(task["child_pid"]) - conn.execute("DELETE FROM queue WHERE id = ?", (task["id"],)) - conn.commit() - log_metric( - data_dir, - "zombie_cleared", - task_id=task["id"], - queue_name=queue_name, - reason="timeout", - timeout_minutes=MAX_LOCK_AGE_MINUTES, - ) - - -def wait_for_turn(conn, queue_name: str, data_dir: Path) -> int: - """Register task, wait for turn, return task ID when acquired.""" +def register_task(conn, queue_name: str, paths: QueuePaths) -> int: + """Register a task in the queue. Returns task_id immediately.""" my_pid = os.getpid() cursor = conn.execute( @@ -285,15 +195,20 @@ def wait_for_turn(conn, queue_name: str, data_dir: Path) -> int: conn.commit() task_id = cursor.lastrowid - log_metric(data_dir, "task_queued", task_id=task_id, queue_name=queue_name, pid=my_pid) - queued_at = time.time() - + log_metric(paths, "task_queued", task_id=task_id, queue_name=queue_name, pid=my_pid) print(f"[tq] Task #{task_id} queued in '{queue_name}'") + return task_id + + +def wait_for_turn(conn, queue_name: str, task_id: int, paths: QueuePaths) -> None: + """Wait for the task's turn to run. Task must already be registered.""" + my_pid = os.getpid() + queued_at = time.time() last_pos = -1 while True: - cleanup_queue(conn, queue_name, data_dir) + cleanup_queue(conn, queue_name, paths) runner = conn.execute( "SELECT id FROM queue WHERE queue_name = ? AND status = 'running'", @@ -310,7 +225,7 @@ def wait_for_turn(conn, queue_name: str, data_dir: Path) -> int: print(f"[tq] Position #{pos} in queue. 
Waiting...") last_pos = pos - time.sleep(POLL_INTERVAL) + time.sleep(POLL_INTERVAL_WAITING) continue # Try to acquire lock atomically @@ -330,7 +245,7 @@ def wait_for_turn(conn, queue_name: str, data_dir: Path) -> int: if cursor.rowcount > 0: wait_time = time.time() - queued_at log_metric( - data_dir, + paths, "task_started", task_id=task_id, queue_name=queue_name, @@ -340,18 +255,9 @@ def wait_for_turn(conn, queue_name: str, data_dir: Path) -> int: print(f"[tq] Lock acquired after {wait_time:.1f}s wait") else: print("[tq] Lock acquired") - return task_id + return # Lock acquired, task_id was passed in - time.sleep(POLL_INTERVAL) - - -def release_lock(conn, task_id: int): - """Release a queue lock.""" - try: - conn.execute("DELETE FROM queue WHERE id = ?", (task_id,)) - conn.commit() - except sqlite3.OperationalError: - pass + time.sleep(POLL_INTERVAL_WAITING) def cmd_run(args): @@ -370,36 +276,23 @@ def cmd_run(args): print(f"Error: Working directory does not exist: {working_dir}", file=sys.stderr) sys.exit(1) - data_dir = get_data_dir(args) - data_dir.mkdir(parents=True, exist_ok=True) - db_path = data_dir / "queue.db" + paths = get_paths(args) + paths.data_dir.mkdir(parents=True, exist_ok=True) # Ensure database exists and is valid (recover if corrupted) - ensure_db(db_path) + ensure_db(paths) + + # Get database connection + with get_db(paths.db_path) as conn: + # Initialize schema if needed (idempotent via IF NOT EXISTS) + init_db(paths) - # Initialize database if needed - conn = sqlite3.connect(db_path, timeout=60.0) + # Open connection for the duration of the run + conn = sqlite3.connect(paths.db_path, timeout=60.0) conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA busy_timeout=60000") conn.row_factory = sqlite3.Row - conn.execute(""" - CREATE TABLE IF NOT EXISTS queue ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - queue_name TEXT NOT NULL, - status TEXT NOT NULL, - pid INTEGER, - child_pid INTEGER, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) - conn.execute(""" - CREATE INDEX IF NOT EXISTS idx_queue_status - ON queue(queue_name, status) - """) - conn.commit() - task_id = None proc = None cleaned_up = False @@ -436,7 +329,9 @@ def cleanup_handler(signum, frame): signal.signal(signal.SIGTERM, cleanup_handler) try: - task_id = wait_for_turn(conn, queue_name, data_dir) + # Register task first so task_id is available for cleanup if interrupted + task_id = register_task(conn, queue_name, paths) + wait_for_turn(conn, queue_name, task_id, paths) print(f"[tq] Running: {command}") print(f"[tq] Directory: {working_dir}") @@ -470,7 +365,7 @@ def cleanup_handler(signum, frame): pass proc.wait() log_metric( - data_dir, + paths, "task_timeout", task_id=task_id, queue_name=queue_name, @@ -489,7 +384,7 @@ def cleanup_handler(signum, frame): print(f"[tq] FAILED exit={exit_code} in {duration:.1f}s") log_metric( - data_dir, + paths, "task_completed", task_id=task_id, queue_name=queue_name, @@ -504,7 +399,7 @@ def cleanup_handler(signum, frame): print(f"[tq] Error: {e}", file=sys.stderr) if task_id: log_metric( - data_dir, + paths, "task_error", task_id=task_id, queue_name=queue_name, From a0bb190d0251dec000988eae3f77577dba4e79fd Mon Sep 17 00:00:00 2001 From: Matt McKenna Date: Thu, 22 Jan 2026 18:14:30 -0500 Subject: [PATCH 2/5] Include queue_core.py in wheel build Co-Authored-By: Claude Sonnet 4.5 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 
e9a4c3d..c4b461a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,4 +30,4 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["."] -only-include = ["task_queue.py", "tq.py"] +only-include = ["task_queue.py", "tq.py", "queue_core.py"] From 7eb7db2cd93a62a9c6230678876e1d3c5ac59aa3 Mon Sep 17 00:00:00 2001 From: Matt McKenna Date: Fri, 23 Jan 2026 13:43:14 -0500 Subject: [PATCH 3/5] Fix orphaned task cleanup with PID reuse detection and instance tracking - Add is_task_queue_process() to detect if PID is actually running task_queue vs being reused by an unrelated process (Chrome, etc.) - Add SERVER_INSTANCE_ID and _active_task_ids tracking in MCP server to detect orphaned tasks from disconnected clients - Add CLI_INSTANCE_ID tracking in tq.py for same protection - Handle asyncio.CancelledError to properly clean up when sub-agents are cancelled - Add server_id column to queue table with migration for existing databases - Fix wait time calculation to use POLL_INTERVAL_WAITING instead of hardcoded value This fixes the bug where cancelled sub-agents would leave orphaned tasks in the queue that blocked all subsequent tasks from running. Co-Authored-By: Claude Opus 4.5 --- queue_core.py | 51 +++++++++- task_queue.py | 229 +++++++++++++++++++++++++++++++------------ tests/test_queue.py | 164 ++++++++++++++++++++++++++----- tests/test_tq_cli.py | 145 +++++++++++++++++++++++++++ tq.py | 38 ++++++- 5 files changed, 536 insertions(+), 91 deletions(-) diff --git a/queue_core.py b/queue_core.py index e8d29eb..ab1c5e9 100644 --- a/queue_core.py +++ b/queue_core.py @@ -50,12 +50,18 @@ def from_data_dir(cls, data_dir: Path) -> "QueuePaths": queue_name TEXT NOT NULL, status TEXT NOT NULL, pid INTEGER, + server_id TEXT, child_pid INTEGER, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) """ +# Migration to add server_id column to existing databases +QUEUE_MIGRATION_SERVER_ID = """ +ALTER TABLE queue ADD COLUMN server_id TEXT +""" + QUEUE_INDEX = """ CREATE INDEX IF NOT EXISTS idx_queue_status ON queue(queue_name, status) """ @@ -85,6 +91,11 @@ def init_db(paths: QueuePaths): with get_db(paths.db_path) as conn: conn.execute(QUEUE_SCHEMA) conn.execute(QUEUE_INDEX) + # Run migration for existing databases without server_id column + try: + conn.execute(QUEUE_MIGRATION_SERVER_ID) + except sqlite3.OperationalError: + pass # Column already exists def ensure_db(paths: QueuePaths): @@ -116,6 +127,42 @@ def is_process_alive(pid: int) -> bool: return False +def is_task_queue_process(pid: int) -> bool: + """Check if a PID is running our task_queue MCP server or tq CLI. + + Returns True if: + - Process is dead (handled separately) + - Process command line contains 'task_queue' or 'agent-task-queue' + + Returns False if process is alive but running something else (PID reused). 
+ """ + if not is_process_alive(pid): + return False + + try: + import subprocess + + result = subprocess.run( + ["ps", "-p", str(pid), "-o", "args="], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode != 0: + return False + + cmdline = result.stdout.strip().lower() + return ( + "task_queue" in cmdline + or "agent-task-queue" in cmdline + or "tq.py" in cmdline + or "pytest" in cmdline # For pytest running tests + ) + except Exception: + # If we can't check, assume valid (conservative - avoid false orphan cleanup) + return True + + def kill_process_tree(pid: int): """Kill a process and all its children via process group.""" if not pid or not is_process_alive(pid): @@ -198,7 +245,7 @@ def log(msg): ).fetchall() for runner in runners: - if not is_process_alive(runner["pid"]): + if not is_task_queue_process(runner["pid"]): child = runner["child_pid"] if child and is_process_alive(child): log( @@ -224,7 +271,7 @@ def log(msg): ).fetchall() for waiter in waiters: - if not is_process_alive(waiter["pid"]): + if not is_task_queue_process(waiter["pid"]): conn.execute("DELETE FROM queue WHERE id = ?", (waiter["id"],)) log_metric( metrics_path, diff --git a/task_queue.py b/task_queue.py index 9ae2723..31e6557 100644 --- a/task_queue.py +++ b/task_queue.py @@ -13,6 +13,8 @@ import signal import sqlite3 import time +import threading +import uuid from collections import deque from datetime import datetime from pathlib import Path @@ -20,6 +22,15 @@ from fastmcp import FastMCP from fastmcp.server.dependencies import get_context +# Unique identifier for this server instance - used to detect orphaned tasks +# from previous server instances even if the PID is reused +SERVER_INSTANCE_ID = str(uuid.uuid4())[:8] + +# Track active task IDs being processed by this server instance +# Used to detect orphaned queue entries when clients disconnect without proper cleanup +_active_task_ids: set[int] = set() +_active_task_ids_lock = threading.Lock() + # Import shared queue infrastructure from queue_core import ( QueuePaths, @@ -118,7 +129,7 @@ def log_metric(event: str, **kwargs): def cleanup_queue(conn, queue_name: str): - """Clean up queue using configured paths.""" + """Clean up queue using configured paths and detect orphaned tasks.""" _cleanup_queue( conn, queue_name, @@ -127,6 +138,58 @@ def cleanup_queue(conn, queue_name: str): log_fn=lambda msg: print(log_fmt(msg)), ) + my_pid = os.getpid() + + # Cleanup 1: Tasks with our PID but DIFFERENT server_id (from old server instance) + # This handles the edge case where PID is reused after server restart + stale_server_tasks = conn.execute( + "SELECT id, status, child_pid, server_id FROM queue WHERE queue_name = ? AND pid = ? 
AND server_id IS NOT NULL AND server_id != ?", + (queue_name, my_pid, SERVER_INSTANCE_ID), + ).fetchall() + + for task in stale_server_tasks: + if task["child_pid"] and is_process_alive(task["child_pid"]): + print(log_fmt(f"WARNING: Killing orphaned subprocess {task['child_pid']} from old server")) + kill_process_tree(task["child_pid"]) + + conn.execute("DELETE FROM queue WHERE id = ?", (task["id"],)) + log_metric( + "orphan_cleared", + task_id=task["id"], + queue_name=queue_name, + status=task["status"], + old_server_id=task["server_id"], + reason="stale_server_instance", + ) + print(log_fmt(f"WARNING: Cleared task from old server instance (ID: {task['id']}, old_server: {task['server_id']})")) + + # Cleanup 2: Tasks with our PID AND server_id but not in active tracking set + # This catches tasks left behind when clients disconnect without proper cleanup + our_tasks = conn.execute( + "SELECT id, status, child_pid FROM queue WHERE queue_name = ? AND pid = ? AND (server_id = ? OR server_id IS NULL)", + (queue_name, my_pid, SERVER_INSTANCE_ID), + ).fetchall() + + with _active_task_ids_lock: + active_ids = _active_task_ids.copy() + + for orphan in our_tasks: + if orphan["id"] not in active_ids: + # This task belongs to us but we're not tracking it - it's orphaned + if orphan["child_pid"] and is_process_alive(orphan["child_pid"]): + print(log_fmt(f"WARNING: Killing orphaned subprocess {orphan['child_pid']}")) + kill_process_tree(orphan["child_pid"]) + + conn.execute("DELETE FROM queue WHERE id = ?", (orphan["id"],)) + log_metric( + "orphan_cleared", + task_id=orphan["id"], + queue_name=queue_name, + status=orphan["status"], + reason="not_in_active_set", + ) + print(log_fmt(f"WARNING: Cleared orphaned task (ID: {orphan['id']}, status: {orphan['status']})")) + # --- Output File Management --- def cleanup_output_files(): @@ -173,6 +236,11 @@ async def wait_for_turn(queue_name: str) -> int: # Ensure database exists and is valid ensure_db() + # Run cleanup BEFORE inserting - this clears orphaned tasks that would otherwise + # block the queue forever (since cleanup only runs during polling) + with get_db() as conn: + cleanup_queue(conn, queue_name) + my_pid = os.getpid() ctx = None try: @@ -182,11 +250,15 @@ async def wait_for_turn(queue_name: str) -> int: with get_db() as conn: cursor = conn.execute( - "INSERT INTO queue (queue_name, status, pid) VALUES (?, ?, ?)", - (queue_name, "waiting", my_pid), + "INSERT INTO queue (queue_name, status, pid, server_id) VALUES (?, ?, ?, ?)", + (queue_name, "waiting", my_pid, SERVER_INSTANCE_ID), ) task_id = cursor.lastrowid + # Track this task as active for orphan detection + with _active_task_ids_lock: + _active_task_ids.add(task_id) + log_metric("task_queued", task_id=task_id, queue_name=queue_name, pid=my_pid) queued_at = time.time() @@ -198,72 +270,90 @@ async def wait_for_turn(queue_name: str) -> int: last_pos = -1 wait_ticks = 0 - while True: - with get_db() as conn: - cleanup_queue(conn, queue_name) - - runner = conn.execute( - "SELECT id FROM queue WHERE queue_name = ? AND status = 'running'", - (queue_name,), - ).fetchone() - - if runner: - pos = ( - conn.execute( - "SELECT COUNT(*) as c FROM queue WHERE queue_name = ? AND status = 'waiting' AND id < ?", - (queue_name, task_id), - ).fetchone()["c"] - + 1 - ) - - wait_ticks += 1 - - if pos != last_pos: - if ctx: - await ctx.info(log_fmt(f"Position #{pos} in queue. Waiting...")) - last_pos = pos - elif wait_ticks % 3 == 0 and ctx: # Update every ~15 seconds - await ctx.info( - log_fmt( - f"Still waiting... 
Position #{pos} ({wait_ticks * 5}s elapsed)" - ) + try: + while True: + with get_db() as conn: + cleanup_queue(conn, queue_name) + + runner = conn.execute( + "SELECT id FROM queue WHERE queue_name = ? AND status = 'running'", + (queue_name,), + ).fetchone() + + if runner: + pos = ( + conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = ? AND status = 'waiting' AND id < ?", + (queue_name, task_id), + ).fetchone()["c"] + + 1 ) - await asyncio.sleep(POLL_INTERVAL_WAITING) - continue - - # Atomic lock acquisition: UPDATE only succeeds if we're the first - # waiting task AND no one is currently running. This prevents race - # conditions where two tasks both think they're next. - cursor = conn.execute( - """UPDATE queue SET status = 'running', updated_at = ?, pid = ? - WHERE id = ? AND status = 'waiting' - AND NOT EXISTS ( - SELECT 1 FROM queue WHERE queue_name = ? AND status = 'running' - ) - AND id = ( - SELECT MIN(id) FROM queue WHERE queue_name = ? AND status = 'waiting' - )""", - (datetime.now().isoformat(), my_pid, task_id, queue_name, queue_name), - ) + wait_ticks += 1 + + if pos != last_pos: + if ctx: + await ctx.info(log_fmt(f"Position #{pos} in queue. Waiting...")) + last_pos = pos + elif wait_ticks % 10 == 0 and ctx: # Update every ~10 polls + await ctx.info( + log_fmt( + f"Still waiting... Position #{pos} ({int(wait_ticks * POLL_INTERVAL_WAITING)}s elapsed)" + ) + ) - if cursor.rowcount > 0: - wait_time = time.time() - queued_at - log_metric( - "task_started", - task_id=task_id, - queue_name=queue_name, - wait_time_seconds=round(wait_time, 2), + await asyncio.sleep(POLL_INTERVAL_WAITING) + continue + + # Atomic lock acquisition: UPDATE only succeeds if we're the first + # waiting task AND no one is currently running. This prevents race + # conditions where two tasks both think they're next. + cursor = conn.execute( + """UPDATE queue SET status = 'running', updated_at = ?, pid = ? + WHERE id = ? AND status = 'waiting' + AND NOT EXISTS ( + SELECT 1 FROM queue WHERE queue_name = ? AND status = 'running' + ) + AND id = ( + SELECT MIN(id) FROM queue WHERE queue_name = ? AND status = 'waiting' + )""", + (datetime.now().isoformat(), my_pid, task_id, queue_name, queue_name), ) - if ctx: - await ctx.info(log_fmt("Lock ACQUIRED. Starting execution.")) - return task_id - await asyncio.sleep(POLL_INTERVAL_READY) + if cursor.rowcount > 0: + wait_time = time.time() - queued_at + log_metric( + "task_started", + task_id=task_id, + queue_name=queue_name, + wait_time_seconds=round(wait_time, 2), + ) + if ctx: + await ctx.info(log_fmt("Lock ACQUIRED. 
Starting execution.")) + return task_id + + await asyncio.sleep(POLL_INTERVAL_READY) + except asyncio.CancelledError: + # Client disconnected (e.g., sub-agent cancelled) - clean up our queue entry + with _active_task_ids_lock: + _active_task_ids.discard(task_id) + log_metric( + "task_cancelled", + task_id=task_id, + queue_name=queue_name, + reason="client_disconnected", + ) + with get_db() as conn: + conn.execute("DELETE FROM queue WHERE id = ?", (task_id,)) + raise # Re-raise to propagate cancellation async def release_lock(task_id: int): """Release a queue lock.""" + # Remove from active tracking + with _active_task_ids_lock: + _active_task_ids.discard(task_id) + ctx = None try: ctx = get_context() @@ -470,6 +560,25 @@ async def stream_to_file(stream, tail_buffer: deque, label: str): tail_text = "\n".join(tail) if tail else "(no output)" return f"FAILED exit={proc.returncode} {duration:.1f}s output={output_file}\n{tail_text}" + except asyncio.CancelledError: + # Client disconnected while task was running - kill the subprocess + log_metric( + "task_cancelled", + task_id=task_id, + queue_name=queue_name, + command=command, + reason="client_disconnected_during_execution", + ) + try: + os.killpg(proc.pid, signal.SIGTERM) + await asyncio.wait_for(proc.wait(), timeout=5.0) + except Exception: + try: + os.killpg(proc.pid, signal.SIGKILL) + except Exception: + pass + raise # Re-raise to propagate cancellation + except Exception as e: log_metric( "task_error", diff --git a/tests/test_queue.py b/tests/test_queue.py index 07b2431..a1e1bec 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -634,11 +634,12 @@ def test_zombie_cleanup_stale_lock(): def test_zombie_cleanup_preserves_valid_tasks(): """Test that cleanup doesn't remove valid running tasks.""" import os + from task_queue import _active_task_ids, _active_task_ids_lock my_pid = os.getpid() # Use our own PID so it's "alive" with get_db() as conn: - conn.execute( + cursor = conn.execute( """INSERT INTO queue (queue_name, status, pid, child_pid, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)""", ( @@ -650,24 +651,32 @@ def test_zombie_cleanup_preserves_valid_tasks(): datetime.now().isoformat(), # Recent timestamp ), ) - - # Verify the task exists - count_before = conn.execute( - "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_test'" - ).fetchone()["c"] - assert count_before == 1 - - # Run cleanup - cleanup_queue(conn, "valid_test") - - # Verify the task is still there (not removed) - count_after = conn.execute( - "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_test'" - ).fetchone()["c"] - assert count_after == 1, "Valid running task should NOT be cleaned up" - - # Clean up for other tests - conn.execute("DELETE FROM queue WHERE queue_name = 'valid_test'") + task_id = cursor.lastrowid + + # Register this task as active (simulating normal operation) + with _active_task_ids_lock: + _active_task_ids.add(task_id) + + try: + # Verify the task exists + count_before = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_test'" + ).fetchone()["c"] + assert count_before == 1 + + # Run cleanup + cleanup_queue(conn, "valid_test") + + # Verify the task is still there (not removed) + count_after = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_test'" + ).fetchone()["c"] + assert count_after == 1, "Valid running task should NOT be cleaned up" + finally: + # Clean up for other tests + with _active_task_ids_lock: + _active_task_ids.discard(task_id) + conn.execute("DELETE FROM 
queue WHERE queue_name = 'valid_test'") def test_orphan_cleanup_dead_parent_waiting(): @@ -707,11 +716,12 @@ def test_orphan_cleanup_dead_parent_waiting(): def test_orphan_cleanup_preserves_valid_waiting(): """Test that cleanup doesn't remove valid waiting tasks.""" import os + from task_queue import _active_task_ids, _active_task_ids_lock my_pid = os.getpid() # Use our own PID so it's "alive" with get_db() as conn: - conn.execute( + cursor = conn.execute( """INSERT INTO queue (queue_name, status, pid, child_pid, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)""", ( @@ -723,24 +733,124 @@ def test_orphan_cleanup_preserves_valid_waiting(): datetime.now().isoformat(), ), ) + task_id = cursor.lastrowid + + # Register this task as active (simulating normal operation) + with _active_task_ids_lock: + _active_task_ids.add(task_id) + + try: + # Verify the task exists + count_before = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_waiting_test'" + ).fetchone()["c"] + assert count_before == 1 + + # Run cleanup + cleanup_queue(conn, "valid_waiting_test") + + # Verify the task is still there (not removed) + count_after = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_waiting_test'" + ).fetchone()["c"] + assert count_after == 1, "Valid waiting task should NOT be cleaned up" + finally: + # Clean up for other tests + with _active_task_ids_lock: + _active_task_ids.discard(task_id) + conn.execute("DELETE FROM queue WHERE queue_name = 'valid_waiting_test'") + + +def test_orphan_cleanup_removes_untracked_task(): + """Test that cleanup removes tasks for our PID that aren't in the active set. + + This tests the fix for orphaned tasks left behind when MCP clients + disconnect without proper cleanup (e.g., when sub-agents are cancelled). + """ + import os + from task_queue import _active_task_ids, _active_task_ids_lock + + my_pid = os.getpid() # Use our own PID + + with get_db() as conn: + cursor = conn.execute( + """INSERT INTO queue (queue_name, status, pid, child_pid, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?)""", + ( + "untracked_orphan_test", + "waiting", + my_pid, + None, + datetime.now().isoformat(), + datetime.now().isoformat(), + ), + ) + task_id = cursor.lastrowid + + # Do NOT add to _active_task_ids - simulating an orphaned task # Verify the task exists count_before = conn.execute( - "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_waiting_test'" + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'untracked_orphan_test'" ).fetchone()["c"] assert count_before == 1 # Run cleanup - cleanup_queue(conn, "valid_waiting_test") + cleanup_queue(conn, "untracked_orphan_test") - # Verify the task is still there (not removed) + # Verify the task was removed (it's orphaned - our PID but not tracked) count_after = conn.execute( - "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'valid_waiting_test'" + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'untracked_orphan_test'" + ).fetchone()["c"] + assert count_after == 0, "Untracked task for our PID should be cleaned up" + + +def test_stale_server_instance_cleanup(): + """Test that cleanup removes tasks from old server instances even if PID is reused. + + This tests the fix for the edge case where: + 1. MCP server A creates tasks with PID 1234 and server_id "abc123" + 2. Server A dies + 3. A new process reuses PID 1234 + 4. MCP server B starts with PID 1234 and server_id "xyz789" + 5. 
Server B's cleanup should remove Server A's orphaned tasks + """ + import os + from task_queue import SERVER_INSTANCE_ID + + my_pid = os.getpid() + old_server_id = "old12345" # Simulated old server instance + + with get_db() as conn: + # Insert a task as if from an old server instance (same PID, different server_id) + cursor = conn.execute( + """INSERT INTO queue (queue_name, status, pid, server_id, created_at, updated_at) + VALUES (?, ?, ?, ?, ?, ?)""", + ( + "stale_server_test", + "running", + my_pid, # Same PID as current process + old_server_id, # Different server_id + datetime.now().isoformat(), + datetime.now().isoformat(), + ), + ) + task_id = cursor.lastrowid + + # Verify the task exists + count_before = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'stale_server_test'" ).fetchone()["c"] - assert count_after == 1, "Valid waiting task should NOT be cleaned up" + assert count_before == 1 + + # Run cleanup + cleanup_queue(conn, "stale_server_test") - # Clean up for other tests - conn.execute("DELETE FROM queue WHERE queue_name = 'valid_waiting_test'") + # Verify the task was removed (different server_id means it's from old instance) + count_after = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE queue_name = 'stale_server_test'" + ).fetchone()["c"] + assert count_after == 0, "Task from old server instance should be cleaned up" # --- Configuration Tests --- diff --git a/tests/test_tq_cli.py b/tests/test_tq_cli.py index 537788b..aa07da6 100644 --- a/tests/test_tq_cli.py +++ b/tests/test_tq_cli.py @@ -396,3 +396,148 @@ def test_sigint_cleanup_running_task(self, temp_data_dir): pass # Expected - process is dead conn.close() + + def test_multiple_waiters_cancelled(self, temp_data_dir): + """ + Test that multiple waiting tasks are all cleaned up when cancelled. + + Simulates the scenario where multiple sub-agents are cancelled at once. 
+ """ + import os + import signal + import sqlite3 + import time + + db_path = Path(temp_data_dir) / "queue.db" + + # Start a blocker to hold the lock + blocker = subprocess.Popen( + [sys.executable, str(TQ_PATH), f"--data-dir={temp_data_dir}", "sleep", "30"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + ) + + time.sleep(0.5) + + # Start multiple waiters + waiters = [] + for i in range(3): + waiter = subprocess.Popen( + [sys.executable, str(TQ_PATH), f"--data-dir={temp_data_dir}", "echo", f"waiter_{i}"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + ) + waiters.append(waiter) + time.sleep(0.2) # Stagger registration + + # Verify all waiters are in queue + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + waiting_count = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE status = 'waiting'" + ).fetchone()["c"] + assert waiting_count == 3, f"Expected 3 waiting tasks, got {waiting_count}" + + # Cancel all waiters simultaneously + for waiter in waiters: + waiter.send_signal(signal.SIGINT) + + # Wait for all to exit + for waiter in waiters: + waiter.wait(timeout=5) + + # Verify all waiting tasks were cleaned up + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + remaining = conn.execute( + "SELECT COUNT(*) as c FROM queue WHERE status = 'waiting'" + ).fetchone()["c"] + assert remaining == 0, f"All waiting tasks should be cleaned up, but {remaining} remain" + + # Blocker should still be running + running = conn.execute("SELECT * FROM queue WHERE status = 'running'").fetchone() + assert running is not None, "Blocker should still be running" + + # Clean up blocker + try: + os.killpg(os.getpgid(blocker.pid), signal.SIGTERM) + except Exception: + blocker.terminate() + blocker.wait(timeout=5) + conn.close() + + def test_cancel_and_restart_proceeds(self, temp_data_dir): + """ + Test that after cancelling tasks, new tasks can proceed normally. + + This verifies the queue isn't left in a broken state after cancellation. + """ + import os + import signal + import sqlite3 + import time + + db_path = Path(temp_data_dir) / "queue.db" + + # Start and cancel a task + proc = subprocess.Popen( + [sys.executable, str(TQ_PATH), f"--data-dir={temp_data_dir}", "sleep", "30"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + ) + time.sleep(0.5) + + # Cancel it + proc.send_signal(signal.SIGINT) + proc.wait(timeout=5) + + # Verify queue is empty + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + count = conn.execute("SELECT COUNT(*) as c FROM queue").fetchone()["c"] + assert count == 0, "Queue should be empty after cancellation" + conn.close() + + # Now run a new task - it should succeed + result = run_tq("echo", "after_cancel", data_dir=temp_data_dir) + assert result.returncode == 0, "New task should succeed after cancellation" + assert "[tq] SUCCESS" in result.stdout + + def test_rapid_cancel_restart_cycles(self, temp_data_dir): + """ + Stress test: rapidly cancel and restart tasks. + + Ensures no race conditions or leaked queue entries. 
+ """ + import os + import signal + import sqlite3 + import time + + db_path = Path(temp_data_dir) / "queue.db" + + # Run several cancel/restart cycles + for cycle in range(5): + proc = subprocess.Popen( + [sys.executable, str(TQ_PATH), f"--data-dir={temp_data_dir}", "sleep", "10"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + ) + time.sleep(0.3) # Let it register + proc.send_signal(signal.SIGINT) + proc.wait(timeout=5) + + # Queue should be empty + conn = sqlite3.connect(db_path, timeout=5.0) + conn.row_factory = sqlite3.Row + count = conn.execute("SELECT COUNT(*) as c FROM queue").fetchone()["c"] + assert count == 0, f"Queue should be empty after {5} cancel cycles, but has {count} entries" + conn.close() + + # Final task should work + result = run_tq("echo", "final", data_dir=temp_data_dir) + assert result.returncode == 0, "Final task should succeed" diff --git a/tq.py b/tq.py index 03a550a..2e8eb93 100644 --- a/tq.py +++ b/tq.py @@ -14,9 +14,14 @@ import subprocess import sys import time +import uuid from datetime import datetime from pathlib import Path +# Unique identifier for this CLI instance - used to detect orphaned tasks +# from previous CLI instances even if the PID is reused +CLI_INSTANCE_ID = str(uuid.uuid4())[:8] + # Import shared queue infrastructure from queue_core import ( QueuePaths, @@ -183,14 +188,39 @@ def cleanup_queue(conn, queue_name: str, paths: QueuePaths): """Clean up queue (wrapper for CLI).""" _cleanup_queue(conn, queue_name, paths.metrics_path, DEFAULT_MAX_LOCK_AGE_MINUTES) + # Additional cleanup: Tasks with our PID but DIFFERENT instance_id (from old CLI instance) + # This handles the edge case where PID is reused after CLI crash + my_pid = os.getpid() + stale_tasks = conn.execute( + "SELECT id, status, child_pid, server_id FROM queue WHERE queue_name = ? AND pid = ? AND server_id IS NOT NULL AND server_id != ?", + (queue_name, my_pid, CLI_INSTANCE_ID), + ).fetchall() + + for task in stale_tasks: + if task["child_pid"] and is_process_alive(task["child_pid"]): + print(f"[tq] WARNING: Killing orphaned subprocess {task['child_pid']} from old CLI instance") + kill_process_tree(task["child_pid"]) + + conn.execute("DELETE FROM queue WHERE id = ?", (task["id"],)) + log_metric( + paths, + "orphan_cleared", + task_id=task["id"], + queue_name=queue_name, + status=task["status"], + old_instance_id=task["server_id"], + reason="stale_cli_instance", + ) + print(f"[tq] WARNING: Cleared task from old CLI instance (ID: {task['id']}, old_instance: {task['server_id']})") + def register_task(conn, queue_name: str, paths: QueuePaths) -> int: """Register a task in the queue. 
Returns task_id immediately.""" my_pid = os.getpid() cursor = conn.execute( - "INSERT INTO queue (queue_name, status, pid) VALUES (?, ?, ?)", - (queue_name, "waiting", my_pid), + "INSERT INTO queue (queue_name, status, pid, server_id) VALUES (?, ?, ?, ?)", + (queue_name, "waiting", my_pid, CLI_INSTANCE_ID), ) conn.commit() task_id = cursor.lastrowid @@ -329,6 +359,10 @@ def cleanup_handler(signum, frame): signal.signal(signal.SIGTERM, cleanup_handler) try: + # Run cleanup BEFORE inserting - this clears orphaned tasks that would otherwise + # block the queue forever (since cleanup only runs during polling) + cleanup_queue(conn, queue_name, paths) + # Register task first so task_id is available for cleanup if interrupted task_id = register_task(conn, queue_name, paths) wait_for_turn(conn, queue_name, task_id, paths) From fa23fd88114fb9ca19ccc751d3588260d0521852 Mon Sep 17 00:00:00 2001 From: Matt McKenna Date: Fri, 23 Jan 2026 13:46:53 -0500 Subject: [PATCH 4/5] Fix ruff lint errors: import order and unused variables - Move queue_core import before module-level constants in task_queue.py and tq.py - Remove unused imports and variables in tests Co-Authored-By: Claude Opus 4.5 --- task_queue.py | 18 +++++++++--------- tests/test_queue.py | 8 ++------ tests/test_tq_cli.py | 4 +--- tq.py | 8 ++++---- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/task_queue.py b/task_queue.py index 31e6557..10874e1 100644 --- a/task_queue.py +++ b/task_queue.py @@ -22,15 +22,6 @@ from fastmcp import FastMCP from fastmcp.server.dependencies import get_context -# Unique identifier for this server instance - used to detect orphaned tasks -# from previous server instances even if the PID is reused -SERVER_INSTANCE_ID = str(uuid.uuid4())[:8] - -# Track active task IDs being processed by this server instance -# Used to detect orphaned queue entries when clients disconnect without proper cleanup -_active_task_ids: set[int] = set() -_active_task_ids_lock = threading.Lock() - # Import shared queue infrastructure from queue_core import ( QueuePaths, @@ -46,6 +37,15 @@ POLL_INTERVAL_READY, ) +# Unique identifier for this server instance - used to detect orphaned tasks +# from previous server instances even if the PID is reused +SERVER_INSTANCE_ID = str(uuid.uuid4())[:8] + +# Track active task IDs being processed by this server instance +# Used to detect orphaned queue entries when clients disconnect without proper cleanup +_active_task_ids: set[int] = set() +_active_task_ids_lock = threading.Lock() + # --- Argument Parsing --- def parse_args(): diff --git a/tests/test_queue.py b/tests/test_queue.py index a1e1bec..c170249 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -768,12 +768,11 @@ def test_orphan_cleanup_removes_untracked_task(): disconnect without proper cleanup (e.g., when sub-agents are cancelled). """ import os - from task_queue import _active_task_ids, _active_task_ids_lock my_pid = os.getpid() # Use our own PID with get_db() as conn: - cursor = conn.execute( + conn.execute( """INSERT INTO queue (queue_name, status, pid, child_pid, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)""", ( @@ -785,7 +784,6 @@ def test_orphan_cleanup_removes_untracked_task(): datetime.now().isoformat(), ), ) - task_id = cursor.lastrowid # Do NOT add to _active_task_ids - simulating an orphaned task @@ -816,14 +814,13 @@ def test_stale_server_instance_cleanup(): 5. 
Server B's cleanup should remove Server A's orphaned tasks """ import os - from task_queue import SERVER_INSTANCE_ID my_pid = os.getpid() old_server_id = "old12345" # Simulated old server instance with get_db() as conn: # Insert a task as if from an old server instance (same PID, different server_id) - cursor = conn.execute( + conn.execute( """INSERT INTO queue (queue_name, status, pid, server_id, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)""", ( @@ -835,7 +832,6 @@ def test_stale_server_instance_cleanup(): datetime.now().isoformat(), ), ) - task_id = cursor.lastrowid # Verify the task exists count_before = conn.execute( diff --git a/tests/test_tq_cli.py b/tests/test_tq_cli.py index aa07da6..7733c40 100644 --- a/tests/test_tq_cli.py +++ b/tests/test_tq_cli.py @@ -300,7 +300,7 @@ def test_sigint_cleanup_waiting_task(self, temp_data_dir): conn.row_factory = sqlite3.Row running = conn.execute("SELECT * FROM queue WHERE status = 'running'").fetchone() assert running is not None, "Blocker should be running" - blocker_task_id = running["id"] + _ = running["id"] # blocker_task_id - not used but verifies task exists # Now start a second task that will wait in queue waiter = subprocess.Popen( @@ -474,7 +474,6 @@ def test_cancel_and_restart_proceeds(self, temp_data_dir): This verifies the queue isn't left in a broken state after cancellation. """ - import os import signal import sqlite3 import time @@ -512,7 +511,6 @@ def test_rapid_cancel_restart_cycles(self, temp_data_dir): Ensures no race conditions or leaked queue entries. """ - import os import signal import sqlite3 import time diff --git a/tq.py b/tq.py index 2e8eb93..1885a14 100644 --- a/tq.py +++ b/tq.py @@ -18,10 +18,6 @@ from datetime import datetime from pathlib import Path -# Unique identifier for this CLI instance - used to detect orphaned tasks -# from previous CLI instances even if the PID is reused -CLI_INSTANCE_ID = str(uuid.uuid4())[:8] - # Import shared queue infrastructure from queue_core import ( QueuePaths, @@ -38,6 +34,10 @@ DEFAULT_MAX_METRICS_SIZE_MB, ) +# Unique identifier for this CLI instance - used to detect orphaned tasks +# from previous CLI instances even if the PID is reused +CLI_INSTANCE_ID = str(uuid.uuid4())[:8] + def get_paths(args) -> QueuePaths: """Get queue paths from args or environment.""" From 2ef58b61f6f4b882a1204fc38cc88582780a3c1e Mon Sep 17 00:00:00 2001 From: Matt McKenna Date: Fri, 23 Jan 2026 13:56:38 -0500 Subject: [PATCH 5/5] Address security scanner findings - Pin pypa/gh-action-pypi-publish to commit SHA (resolves unpinned action alert) - Add nosec B602 comments explaining intentional shell=True usage: - CLI tool executes user-provided commands (like bash -c or make) - Shell features (pipes, redirects, globs) required for build commands - Input controlled by users who explicitly invoke the CLI/MCP tool Co-Authored-By: Claude Opus 4.5 --- .github/workflows/release.yml | 2 +- task_queue.py | 5 ++++- tq.py | 5 ++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a6771cb..3c7030b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -41,4 +41,4 @@ jobs: path: dist/ - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1 diff --git a/task_queue.py b/task_queue.py index 10874e1..826b096 100644 --- a/task_queue.py +++ b/task_queue.py @@ -455,7 +455,10 @@ async def run_task( output_file 
= OUTPUT_DIR / f"task_{task_id}.log" try: - proc = await asyncio.create_subprocess_shell( + # nosec B602: shell execution is intentional - this MCP tool executes user-provided + # build commands (gradle, docker, pytest, etc.). Shell features (pipes, redirects, + # globs) are required. Input comes from AI agents which users explicitly invoke. + proc = await asyncio.create_subprocess_shell( # nosec B602 command, cwd=working_directory, env=env, diff --git a/tq.py b/tq.py index 1885a14..28a95c9 100644 --- a/tq.py +++ b/tq.py @@ -375,9 +375,12 @@ def cleanup_handler(signum, frame): # Run subprocess in passthrough mode - direct terminal connection # This preserves rich output (progress bars, colors, etc.) + # nosec B602: shell=True is intentional - this CLI tool executes user-provided + # commands, similar to bash -c or make. Users control their own CLI arguments. + # Shell features (pipes, redirects, globs) are required for build commands. proc = subprocess.Popen( command, - shell=True, + shell=True, # nosec B602 cwd=working_dir, start_new_session=True, # For clean process group kill )