From b0c076c63e41b5ba411ea92838f2fd8dd90be0b6 Mon Sep 17 00:00:00 2001
From: Divyansh Kumar <kumar.divyansh1996@gmail.com>
Date: Tue, 3 Mar 2026 17:49:32 +0000
Subject: [PATCH] feat(v0.5.1): 4-tier nervous system, WriteTimeout fix,
 /v1/prune endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tiered Nervous System (Qwen3.5 models):
- config: add ModelIngest/ModelGuardian/ModelEnrich/ModelOrchestrate fields
  with backward-compatible fallback chain (defaults: 0.8b/2b/4b/9b)
- brain.New(): create 4 separate OllamaClient instances, one per tier
- llm/ollama.go: add WithThinking(bool) builder + /think|/no_think prompt
  prefixes for Qwen3.5; strip <think>...</think> blocks from responses

Bug fixes:
- server: WriteTimeout now 2×TimeoutMS (was hardcoded 30s); closes BUG-I01
  that silently killed enrich/explain-violation/coordinate on CPU
- config: default TimeoutMS 30000→60000ms

Features:
- ingestor: update prompt to 2-3 sentence prose briefing format
- pruner: new internal/pruner package strips web boilerplate via Tier 0 (0.8B)
- server: POST /v1/prune endpoint; Brain interface + NullBrain updated

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 CLAUDE.md                     |  60 ++++++++++++
 README.md                     | 126 +++++++++++++++++-------
 cmd/brain/main.go             |   2 +-
 config/config.go              |  93 ++++++++++++------
 improvement.md                | 180 ++++++++++++++++++++++++++++++++++
 internal/ingestor/ingestor.go |  13 ++-
 internal/llm/ollama.go        |  31 +++++-
 internal/pruner/pruner.go     |  87 ++++++++++++++++
 pkg/brain/brain.go            |  42 +++++---
 pkg/brain/null.go             |   3 +
 server/server.go              |  34 ++++++-
 11 files changed, 582 insertions(+), 89 deletions(-)
 create mode 100644 CLAUDE.md
 create mode 100644 improvement.md
 create mode 100644 internal/pruner/pruner.go
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..f94cfd8
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,60 @@
+<!-- synapses:start -->
+## Synapses — Code Intelligence (MCP)
+
+This project is indexed by **Synapses**, a graph-based code intelligence server.
+
+### Session Start
+Call **one tool** at the start of every session:
+```
+session_init()   ← replaces get_pending_tasks + get_project_identity + get_working_state
+```
+Returns: pending tasks, project identity, working state, recent agent events, and **scale_guidance** — a repo-size-aware recommendation on which tools to prefer.
+
+### Tool Selection — follow scale_guidance from session_init
+
+| Repo scale | When to use Synapses | When to use Read/Grep |
+|---|---|---|
+| micro (<100 nodes) | Structural analysis, multi-file understanding | Simple targeted edits to a known file |
+| small (100–499) | Code exploration, cross-file analysis | Targeted single-file edits |
+| medium (500–1999) | All code exploration — Glob/Grep surfaces too much noise | Writing to a specific file you already identified |
+| large (2000+) | Always — direct scanning is too noisy at this scale | Writing to a specific file you already identified |
+
+### Code Exploration
+
+| When you want to... | Use this |
+|---|---|
+| Understand a function, struct, or interface | `get_context(entity="Name")` |
+| Pin to a specific file (avoids wrong-entity picks) | `get_context(entity="Name", file="cmd/server/main.go")` |
+| Boost nodes linked to current task | `get_context(entity="Name", task_id="...")` |
+| Find a symbol by name or substring | `find_entity(query="name")` |
+| Search by concept ("auth", "rate limiting") | `search(query="...", mode="semantic")` |
+| List all entities in a file | `get_file_context(file="path/to/file")` |
+| Trace how function A calls function B | `get_call_chain(from="A", to="B")` |
+| Find what breaks if a symbol changes | `get_impact(symbol="Name")` |
+
+### Before Writing Code
+
+| When you want to... | Use this |
+|---|---|
+| Check proposed changes against architecture rules | `validate_plan(changes=[...])` |
+| Reserve a file/package before editing | `claim_work(agent_id="...", scope="pkg/auth", scope_type="package")` |
+| Check if another agent is editing the same code | `get_conflicts(agent_id="...")` |
+| Release locks when done | `release_claims(agent_id="...")` |
+
+### Task & Session Management
+
+| When you want to... | Use this |
+|---|---|
+| Save a plan with tasks for future sessions | `create_plan(title="...", tasks=[...])` |
+| Mark a task as done or add notes | `update_task(id="...", status="done", notes="...")` |
+| Save progress so next session can resume | `save_session_state(task_id="...")` |
+| Leave a note on a code entity for other agents | `annotate_node(node_id="...", note="...")` |
+| See what other agents have been doing | `get_events(since_seq=N)` (use latest_event_seq from session_init) |
+
+### Rules
+- **Read/Grep** are for *writing* code (editing a specific file you have already found). For *understanding* code structure, always prefer Synapses tools.
+- **Call `session_init()`** at the start of every session. It replaces the 3-call startup ritual.
+- **Call `validate_plan()`** before implementing multi-file changes.
+- **Call `claim_work()`** before editing to avoid conflicts with other agents.
+- When `get_context` returns `other_candidates`, re-call with `file=` to pin to the right entity.
+<!-- synapses:end -->
diff --git a/README.md b/README.md
index b668165..77cce88 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,14 @@ same information in ~800 tokens that raw nodes would need 4,000+ tokens to expre
 
 ### Capabilities at a glance
 
-| Capability | Method | LLM? | Latency |
+| Capability | Method | LLM tier | Latency (CPU) |
 |---|---|---|---|
-| Summarise a code entity (1 sentence) | `Ingest` | yes | 1–3 s |
+| 2-3 sentence prose briefing for a code entity | `Ingest` | Tier 0 (0.8B) | ~3 s |
+| Strip boilerplate from web content | `Prune` | Tier 0 (0.8B) | ~3 s |
 | Context Packet (summaries + constraints + guidance) | `BuildContextPacket` | optional | <5 ms fast path |
-| Architectural insight for a neighbourhood | `Enrich` | yes | 1–3 s |
-| Explain a rule violation in plain English | `ExplainViolation` | yes (cached) | <1 ms cached |
-| Agent conflict work distribution | `Coordinate` | yes | 1–3 s |
+| Architectural insight for a neighbourhood | `Enrich` | Tier 2 (4B) | ~12 s |
+| Explain a rule violation in plain English | `ExplainViolation` | Tier 1 (2B, cached) | <1 ms cached |
+| Agent conflict work distribution | `Coordinate` | Tier 3 (9B) | ~25 s |
 | SDLC phase + quality mode management | `SetSDLCPhase` / `SetQualityMode` | no | <1 ms |
 | Co-occurrence learning ("also check Y when editing X") | `LogDecision` | no | <1 ms |
 | Get learned patterns | `GetPatterns` | no | <1 ms |
@@ -47,18 +48,23 @@ No CGO, no external databases, no network dependencies beyond Ollama.
 ## Quick start
 
 ```sh
-# 1. Install Ollama and pull the default model
-ollama pull qwen2.5-coder:1.5b
+# 1. Install Ollama and pull the Tier 0 model (minimum — handles ingest + prune)
+ollama pull qwen3.5:0.8b
 
-# 2. Build the binary
+# 2. Optionally pull additional tiers for better quality
+ollama pull qwen3.5:2b    # Tier 1: guardian (violation explanations)
+ollama pull qwen3.5:4b    # Tier 2: enricher (architectural insight)
+ollama pull qwen3.5:9b    # Tier 3: orchestrator (multi-agent conflicts)
+
+# 3. Build the binary
 make build
 
-# 3. Start the brain sidecar
+# 4. Start the brain sidecar
 ./bin/brain serve
 
-# 4. Verify it's running
+# 5. Verify it's running
 curl http://localhost:11435/v1/health
-# {"status":"ok","model":"qwen2.5-coder:1.5b","available":true}
+# {"status":"ok","model":"qwen3.5:4b","available":true}
 ```
 
 ---
@@ -95,8 +101,11 @@ All fields are optional — sensible defaults apply.
 {
   "enabled": true,
   "ollama_url": "http://localhost:11434",
-  "model": "qwen2.5-coder:1.5b",
-  "timeout_ms": 3000,
+  "model_ingest":      "qwen3.5:0.8b",
+  "model_guardian":    "qwen3.5:2b",
+  "model_enrich":      "qwen3.5:4b",
+  "model_orchestrate": "qwen3.5:9b",
+  "timeout_ms": 60000,
   "db_path": "~/.synapses/brain.sqlite",
   "port": 11435,
 
@@ -122,11 +131,16 @@ BRAIN_CONFIG=/path/to/brain.json brain serve
 |---|---|---|
 | `enabled` | `false` | Master switch. Set `true` to activate all features. |
 | `ollama_url` | `http://localhost:11434` | Ollama server base URL. |
-| `model` | `qwen2.5-coder:1.5b` | Ollama model tag. See [Model tiers](#model-tiers). |
-| `timeout_ms` | `3000` | Per-LLM-request timeout in milliseconds. |
+| `model_ingest` | `qwen3.5:0.8b` | Tier 0 (Reflex): bulk ingest + web pruning. |
+| `model_guardian` | `qwen3.5:2b` | Tier 1 (Sensory): rule violation explanations. |
+| `model_enrich` | `qwen3.5:4b` | Tier 2 (Specialist): architectural insight. |
+| `model_orchestrate` | `qwen3.5:9b` | Tier 3 (Architect): multi-agent conflict resolution. |
+| `model` | `qwen3.5:4b` | Fallback model when tier fields are absent (backward compat). |
+| `fast_model` | `qwen3.5:0.8b` | Fallback fast model when `model_ingest` is absent. |
+| `timeout_ms` | `60000` | Per-LLM-request timeout in milliseconds. WriteTimeout = 2× this. |
 | `db_path` | `~/.synapses/brain.sqlite` | SQLite database path (created if missing). |
 | `port` | `11435` | HTTP sidecar port. |
-| `ingest` | `true` | Enable `POST /v1/ingest` (semantic summaries). |
+| `ingest` | `true` | Enable `POST /v1/ingest` (prose briefings). |
 | `enrich` | `true` | Enable `POST /v1/enrich` (neighbourhood insight). |
 | `guardian` | `true` | Enable `POST /v1/explain-violation` (rule explanations). |
 | `orchestrate` | `true` | Enable `POST /v1/coordinate` (agent conflict resolution). |
@@ -139,22 +153,33 @@ BRAIN_CONFIG=/path/to/brain.json brain serve
 
 ## Model tiers
 
-| System RAM | Model | Size | Notes |
-|---|---|---|---|
-| 4 GB | `qwen2.5-coder:1.5b` | ~900 MB | Default. Works on any dev machine. |
-| 4 GB+ | `qwen3:1.7b` | ~1.1 GB | Recommended upgrade. Better reasoning. |
-| 8 GB+ | `qwen3:4b` | ~2.5 GB | Power user. Noticeably better summaries. |
-| 16 GB+ | `qwen3:8b` | ~5 GB | Enterprise. Best quality, higher latency. |
+synapses-intelligence uses a **4-tier nervous system** — each task type runs on the smallest
+model capable of doing it well. All four Qwen3.5 models share the same tokenizer and support
+`/think` (chain-of-thought) and `/no_think` (fast, deterministic) mode switching.
+
+| Tier | Name | Model | Thinking | RAM | CPU latency | Tasks |
+|---|---|---|---|---|---|---|
+| 0 | Reflex | `qwen3.5:0.8b` | off | 1 GB | ~3 s | Ingest (prose briefings), web pruning |
+| 1 | Sensory | `qwen3.5:2b` | off | 2 GB | ~5 s | Guardian (violation explanations) |
+| 2 | Specialist | `qwen3.5:4b` | on | 4 GB | ~12 s | Enricher (architectural insight) |
+| 3 | Architect | `qwen3.5:9b` | on | 8 GB | ~25 s | Orchestrator (multi-agent conflicts) |
+
+**Minimum setup** (ingest + prune only): pull `qwen3.5:0.8b`.
+**Recommended setup** (all features): pull all four models.
 
 ```sh
-# Pull a specific model
-ollama pull qwen3:1.7b
+ollama pull qwen3.5:0.8b   # required
+ollama pull qwen3.5:2b     # recommended: guardian
+ollama pull qwen3.5:4b     # recommended: enricher
+ollama pull qwen3.5:9b     # optional: orchestrator
+```
 
-# Start brain with a different model
-brain serve -model qwen3:1.7b
+If a tier model is missing, brain falls back to the `model` config field. To point all tiers
+at a single model (e.g. for minimal RAM setups):
 
-# Or set in config
-{ "model": "qwen3:1.7b" }
+```json
+{ "model_ingest": "qwen3.5:0.8b", "model_guardian": "qwen3.5:0.8b",
+  "model_enrich": "qwen3.5:0.8b", "model_orchestrate": "qwen3.5:0.8b" }
 ```
 
 ---
@@ -269,7 +294,7 @@ brain reset
 
 ```sh
 brain version
-# synapses-intelligence v0.3.0
+# synapses-intelligence v0.5.1
 ```
 
 ---
@@ -298,8 +323,8 @@ Health check and LLM availability probe.
 
 ### `POST /v1/ingest`
 
-Generate and store a 1-sentence semantic summary (+ topic tags) for a code entity.
-Call this whenever a function, struct, or method is saved.
+Generate and store a **2-3 sentence prose briefing** (+ topic tags) for a code entity.
+Call this whenever a function, struct, or method is saved. Uses Tier 0 (0.8B).
 
 **Request**
 ```json
@@ -328,6 +353,33 @@ Call this whenever a function, struct, or method is saved.
 
 ---
 
+### `POST /v1/prune`
+
+Strip boilerplate from raw web page text (navigation, ads, footers, cookie notices) and
+return only the core technical content. Used by synapses-scout as a preprocessing step
+before `POST /v1/ingest` to improve distillation quality. Uses Tier 0 (0.8B). Fail-silent:
+returns the original content if the LLM is unavailable.
+
+**Request**
+```json
+{
+  "content": "...3000 chars of raw web page text..."
+}
+```
+
+**Response**
+```json
+{
+  "pruned":          "...clean technical paragraphs (~1200 chars)...",
+  "original_length": 3000,
+  "pruned_length":   1187
+}
+```
+
+On LLM error, returns the original content with an `X-Prune-Warning` response header.
+
+---
+
 ### `GET /v1/summary/{nodeId}`
 
 Fetch the stored summary for a single node. Fast (SQLite, no LLM).
@@ -694,14 +746,17 @@ _ = b.LogDecision(ctx, brain.DecisionRequest{
 
 ```go
 type Brain interface {
-    // Semantic summaries
+    // Semantic summaries (Tier 0: 0.8B)
     Ingest(ctx, IngestRequest) (IngestResponse, error)
-    Enrich(ctx, EnrichRequest) (EnrichResponse, error)
     Summary(nodeID string) string
 
+    // Web content preprocessing (Tier 0: 0.8B)
+    Prune(ctx context.Context, content string) (string, error)
+
     // Architectural analysis
-    ExplainViolation(ctx, ViolationRequest) (ViolationResponse, error)
-    Coordinate(ctx, CoordinateRequest) (CoordinateResponse, error)
+    Enrich(ctx, EnrichRequest) (EnrichResponse, error)           // Tier 2: 4B
+    ExplainViolation(ctx, ViolationRequest) (ViolationResponse, error) // Tier 1: 2B
+    Coordinate(ctx, CoordinateRequest) (CoordinateResponse, error)     // Tier 3: 9B
 
     // Context Packet
     BuildContextPacket(ctx, ContextPacketRequest) (*ContextPacket, error)
@@ -718,6 +773,7 @@ type Brain interface {
     // Diagnostics
     Available() bool
     ModelName() string
+    EnsureModel(ctx context.Context, w io.Writer) error
 }
 ```
 
diff --git a/cmd/brain/main.go b/cmd/brain/main.go
index aeb8057..3edf5d5 100644
--- a/cmd/brain/main.go
+++ b/cmd/brain/main.go
@@ -104,7 +104,7 @@ func cmdServe(cfg config.BrainConfig) {
 		}
 	}
 
-	srv := server.New(b, cfg.Port)
+	srv := server.New(b, cfg.Port, cfg.TimeoutMS)
 
 	// Graceful shutdown on SIGINT/SIGTERM.
 	ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
diff --git a/config/config.go b/config/config.go
index 4a05d32..e00c5e0 100644
--- a/config/config.go
+++ b/config/config.go
@@ -15,20 +15,39 @@ type BrainConfig struct {
 	// OllamaURL is the base URL of the Ollama server. Default: "http://localhost:11434".
 	OllamaURL string `json:"ollama_url,omitempty"`
 
-	// Model is the primary Ollama model tag for enrichment and insights.
-	// Default: "qwen2.5-coder:7b" (~4.5GB, needs 6GB VRAM or 8GB RAM).
-	// Downgrade options for low-resource systems:
-	//   "qwen2.5-coder:1.5b" — fast, fits in 4GB RAM (~900MB)
-	//   "qwen2.5-coder:3b"   — balanced, fits in 6GB RAM (~1.9GB)
+	// Model is the primary Ollama model tag (enrichment fallback when ModelEnrich is unset).
+	// Default: "qwen3.5:4b" — fast on CPU (~12s), beats qwen2.5-coder:7b at 1/3 the size.
+	// Legacy option: "qwen2.5-coder:7b" (~4.5GB, needs 6GB VRAM or 8GB RAM).
 	Model string `json:"model,omitempty"`
 
-	// FastModel is the Ollama model tag for bulk ingestion (summarization).
-	// Bulk ingest runs on every node at index time; use a smaller, faster model.
-	// Default: "qwen2.5-coder:1.5b" (~900MB).
-	// Set to "" to use Model for both ingest and enrichment.
+	// FastModel is the Ollama model tag for bulk ingestion (fallback when ModelIngest is unset).
+	// Default: "qwen3.5:0.8b" — runs in <3s on CPU, fits in 2GB RAM.
+	// Legacy option: "qwen2.5-coder:1.5b" (~900MB).
 	FastModel string `json:"fast_model,omitempty"`
 
-	// TimeoutMS is the per-request LLM timeout in milliseconds. Default: 3000.
+	// --- Tiered Nervous System: per-task model assignment ---
+	// Each tier defaults to the appropriate Qwen3.5 model.
+	// Set to "" to fall back to FastModel/Model. All 4 can point to the same model.
+
+	// ModelIngest is the model for bulk node summarization at index time.
+	// Tier 0 (Reflex): simple extraction, no reasoning needed. Default: "qwen3.5:0.8b".
+	ModelIngest string `json:"model_ingest,omitempty"`
+
+	// ModelGuardian is the model for rule violation explanations.
+	// Tier 1 (Sensory): structured plain-English output. Default: "qwen3.5:2b".
+	ModelGuardian string `json:"model_guardian,omitempty"`
+
+	// ModelEnrich is the model for architectural enrichment and insight generation.
+	// Tier 2 (Specialist): complex analysis across multiple callers/callees. Default: "qwen3.5:4b".
+	ModelEnrich string `json:"model_enrich,omitempty"`
+
+	// ModelOrchestrate is the model for multi-agent conflict resolution.
+	// Tier 3 (Architect): deep reasoning about competing scope claims. Default: "qwen3.5:9b".
+	ModelOrchestrate string `json:"model_orchestrate,omitempty"`
+
+	// TimeoutMS is the per-request LLM timeout in milliseconds.
+	// The HTTP server WriteTimeout is set to 2× this value. Default: 60000 (60s).
+	// Must exceed the slowest LLM inference time on your hardware (~25s for 9b CPU).
 	TimeoutMS int `json:"timeout_ms,omitempty"`
 
 	// DBPath is the path to the brain's own SQLite database.
@@ -63,21 +82,25 @@ type BrainConfig struct {
 func DefaultConfig() BrainConfig {
 	home, _ := os.UserHomeDir()
 	return BrainConfig{
-		Enabled:         false,
-		OllamaURL:       "http://localhost:11434",
-		Model:           "qwen2.5-coder:7b",
-		FastModel:       "qwen2.5-coder:1.5b",
-		TimeoutMS:       30000,
-		DBPath:          filepath.Join(home, ".synapses", "brain.sqlite"),
-		Port:            11435,
-		Ingest:          true,
-		Enrich:          true,
-		Guardian:        true,
-		Orchestrate:     true,
-		ContextBuilder:  true,
-		LearningEnabled: true,
-		DefaultPhase:    "development",
-		DefaultMode:     "standard",
+		Enabled:          false,
+		OllamaURL:        "http://localhost:11434",
+		Model:            "qwen3.5:4b",
+		FastModel:        "qwen3.5:0.8b",
+		ModelIngest:      "qwen3.5:0.8b",
+		ModelGuardian:    "qwen3.5:2b",
+		ModelEnrich:      "qwen3.5:4b",
+		ModelOrchestrate: "qwen3.5:9b",
+		TimeoutMS:        60000,
+		DBPath:           filepath.Join(home, ".synapses", "brain.sqlite"),
+		Port:             11435,
+		Ingest:           true,
+		Enrich:           true,
+		Guardian:         true,
+		Orchestrate:      true,
+		ContextBuilder:   true,
+		LearningEnabled:  true,
+		DefaultPhase:     "development",
+		DefaultMode:      "standard",
 	}
 }
 
@@ -120,18 +143,32 @@ func LoadFile(path string) (BrainConfig, error) {
 }
 
 // applyDefaults fills in zero values with defaults.
+// Tier models fall back to the legacy fast_model/model fields if unset.
 func (c *BrainConfig) applyDefaults() {
 	if c.OllamaURL == "" {
 		c.OllamaURL = "http://localhost:11434"
 	}
 	if c.Model == "" {
-		c.Model = "qwen2.5-coder:7b"
+		c.Model = "qwen3.5:4b"
 	}
 	if c.FastModel == "" {
-		c.FastModel = "qwen2.5-coder:1.5b"
+		c.FastModel = "qwen3.5:0.8b"
+	}
+	// Tier fallback chain: tier model → legacy field → hardcoded default
+	if c.ModelIngest == "" {
+		c.ModelIngest = c.FastModel
+	}
+	if c.ModelGuardian == "" {
+		c.ModelGuardian = "qwen3.5:2b"
+	}
+	if c.ModelEnrich == "" {
+		c.ModelEnrich = c.Model
+	}
+	if c.ModelOrchestrate == "" {
+		c.ModelOrchestrate = c.Model
 	}
 	if c.TimeoutMS <= 0 {
-		c.TimeoutMS = 30000
+		c.TimeoutMS = 60000
 	}
 	if c.Port <= 0 {
 		c.Port = 11435
diff --git a/improvement.md b/improvement.md
new file mode 100644
index 0000000..b9fd901
--- /dev/null
+++ b/improvement.md
@@ -0,0 +1,180 @@
+# synapses-intelligence improvement log
+
+## v0.5.1 — Tiered Nervous System + Scout Prune Pipeline (2026-03-03)
+
+### Changes
+
+#### P0 — Fix WriteTimeout (BUG-I01 closed)
+`server/server.go`: `WriteTimeout` now set to `2 × cfg.TimeoutMS` instead of hardcoded 30s.
+`config/config.go`: Default `TimeoutMS` raised from 30000 → 60000ms.
+All 3 broken endpoints (`/v1/enrich`, `/v1/explain-violation`, `/v1/coordinate`) are now
+reachable on CPU. Closes BUG-I01.
+
+#### P1 — Four-Tier Model Config
+`config/config.go`: Added 4 new tier model fields:
+- `ModelIngest`      (Tier 0 Reflex, default `qwen3.5:0.8b`)
+- `ModelGuardian`    (Tier 1 Sensory, default `qwen3.5:2b`)
+- `ModelEnrich`      (Tier 2 Specialist, default `qwen3.5:4b`)
+- `ModelOrchestrate` (Tier 3 Architect, default `qwen3.5:9b`)
+Backward-compatible fallback chain in `applyDefaults()`: missing tier fields fall back to
+`fast_model` (ingest) or `model` (others).
+
+#### P2 — Route Handlers to Correct Tier
+`pkg/brain/brain.go`: `New()` creates 4 separate `OllamaClient` instances, one per tier.
+- Ingestor → Tier 0 (0.8B): bulk summarization, fast and cheap
+- Guardian → Tier 1 (2B): violation explanations, was broken with 7b, now <5s
+- Enricher → Tier 2 (4B): architectural insight, ~12s on CPU
+- Orchestrator → Tier 3 (9B): multi-agent conflict resolution, ~25s on CPU
+
+#### P3 — ThinkingBudget per Tier (Qwen3.5 /think mode)
+`internal/llm/ollama.go`:
+- Added `think bool` field + `WithThinking(enabled bool)` builder method
+- `Generate()` prepends `/think\n\n` or `/no_think\n\n` to the prompt
+- `<think>...</think>` blocks in responses are stripped via `thinkTagRe` regex
+Tier 0 (ingest) + Tier 1 (guardian): `thinking=false` — fast, deterministic
+Tier 2 (enrich) + Tier 3 (orchestrate): `thinking=true` — deeper reasoning
+
+#### P7 — /v1/prune Scout Preprocessing Endpoint
+New package `internal/pruner/pruner.go`: strips web page boilerplate (navigation, ads,
+footers, cookie banners) using Tier 0 model. Returns clean technical paragraphs as plain text.
+`server/server.go`: `POST /v1/prune` handler added.
+`pkg/brain/brain.go`: `Prune(ctx, content) (string, error)` added to Brain interface and impl.
+`pkg/brain/null.go`: NullBrain.Prune() returns original content unchanged.
+
+**Effect:** Scout raw web content (3000 chars) → pruned clean signal (~1200 chars) → 4B
+distillation sees only valuable content → better summaries, faster inference.
+
+---
+
+## v0.4.0 — E2E Test Run (2026-03-03)
+
+---
+
+### CRITICAL BUGS
+
+#### BUG-I01 — HTTP WriteTimeout kills all 7b-model endpoints silently
+**Severity:** Critical
+**Root cause:** `server/server.go` line 52:
+```go
+WriteTimeout: 30 * time.Second, // LLM calls can take up to ~3s
+```
+The comment is **stale**. The enricher/guardian/orchestrator use `qwen2.5-coder:7b`
+which takes **30-40 seconds** on CPU. The HTTP write timeout fires at exactly the
+same time as the LLM context deadline → the handler never writes a response →
+the client receives an empty body.
+**Affected endpoints:** `/v1/enrich`, `/v1/explain-violation`, `/v1/coordinate`
+**Evidence:** All three return empty body after exactly 30.1s.
+**Fix options (in priority order):**
+  1. Make `WriteTimeout` configurable: `server.WriteTimeout = time.Duration(cfg.TimeoutMS*2) * time.Millisecond`
+  2. Remove `WriteTimeout` entirely for LLM endpoints and rely on the enricher's
+     context deadline instead (set per-handler timeouts via `http.TimeoutHandler`).
+  3. Short-term workaround: increase default TimeoutMS in config to 60000 and
+     set WriteTimeout to 90s.
+
+#### BUG-I02 — Ingest summary truncated mid-sentence
+**Severity:** High
+**Observed:** `/v1/ingest` response contains `"summary": "...The tags associated…"`
+— summary cut at `NumPredict` token limit. The stored summary in brain.sqlite
+is incomplete and propagates into context-packet `root_summary` with a trailing `…`.
+**Root cause:** `NumPredict` in `internal/llm/ollama.go` is set to 250 tokens.
+Summaries for complex code entities exceed this limit.
+**Fix:** Increase `NumPredict` to 400-500. Or use a two-phase approach: request
+a JSON object with `{"summary": "..."}` and set NumPredict to stop cleanly at
+the closing brace.
+
+#### BUG-I03 — Intelligence ingestor prompt uses "code entity" for all node types
+**Severity:** Medium
+**Observed:** When scout sends a web article for distillation with
+`node_type: "web article"`, the returned summary says "This **code entity**
+provides information about the Model Context Protocol...". The ingestor prompt
+template hardcodes "code entity" regardless of `node_type`.
+**Root cause:** `internal/ingestor/ingestor.go` `buildPrompt()` likely uses a
+static phrase.
+**Fix:** Use the `node_type` field in the prompt:
+```go
+fmt.Sprintf("Summarize this %s in 1-2 sentences: %s\n\n%s", nodeType, name, code)
+```
+This makes summaries contextually correct for web articles, YouTube videos, and
+search result sets.
+
+#### BUG-I04 — Tags always empty in ingest response
+**Severity:** Medium
+**Observed:** Scout receives `"tags": []` from every ingest call. The
+`IngestResponse` has a `Tags []string` field but it's never populated.
+**Root cause:** The ingestor prompt doesn't ask for tags, and `parseInsight`/
+`parseSummary` don't extract them.
+**Fix:** Add to the ingest prompt: "Also list 2-3 relevant tags as a
+comma-separated list on the last line, prefixed with 'tags:'". Parse and return.
+Tags improve search relevance and categorisation in brain.sqlite.
+
+---
+
+### ARCHITECTURE ISSUES
+
+#### ARCH-I01 — packet_quality ceiling of 0.4 on CPU-only deployments
+**Observed:** `packet_quality: 0.4` means only `root_summary` is present.
+`insight` (requires `/v1/enrich`) and `dep_summaries` (requires bulk ingest of
+callee nodes) are missing because enrich times out on CPU with 7b model.
+**Impact:** On a GPU-less machine, context packets are never fully enriched.
+This is the primary use case for indie developers.
+**Improvements:**
+  1. For enrichment, default to `qwen2.5-coder:1.5b` (same as fast ingest).
+     Quality drops ~20% but latency drops from 30s to 3-5s.
+  2. Add an `enrich_model` config field separate from `model` (primary).
+  3. Or implement async enrichment: return `packet_quality: 0.4` immediately,
+     enrich in background, store result, return from cache on next call.
+
+#### ARCH-I02 — No request logging in brain HTTP server
+**Observed:** `/tmp/brain.log` only shows startup messages. There are no
+per-request logs for ingest/enrich timings, LLM latencies, or errors.
+**Impact:** Debugging timeouts, slow responses, and LLM errors is blind.
+**Fix:** Add a simple logging middleware that logs:
+```
+2026-03-03T13:10:31Z POST /v1/ingest  node=cmdStart  latency=16.2s  status=200
+2026-03-03T13:11:00Z POST /v1/enrich  root=cmdStart  latency=30.1s  status=500 (timeout)
+```
+
+#### ARCH-I03 — `/v1/context-packet` requires full SnapshotInput; not ergonomic for standalone use
+**Observed:** Calling `/v1/context-packet` without the correct nested
+`snapshot` structure returns `entity_name: ""` and `packet_quality: 0`.
+The schema requires deeply nested JSON that differs from the simpler ingest
+schema. Callers need to read the code to discover the correct structure.
+**Fix:** Add a simpler `GET /v1/context-packet?node_id=X&name=Y` endpoint that
+wraps the POST endpoint with sensible defaults for all nested fields.
+
+#### ARCH-I04 — Coordinate endpoint schema undiscoverable
+**Observed:** Calling `/v1/coordinate` with an intuitive `{"agents":[...], "shared_entities":[...]}`
+schema returns `{"error": "new_agent_id and new_scope are required"}`. The actual
+schema (`new_agent_id`, `new_scope`, `conflicting_claims`) is only discoverable
+by reading Go types — it's not documented in the API response or README.
+**Fix:** Return a detailed error with the expected schema in the error message,
+or add a `GET /v1/coordinate/schema` endpoint.
+
+---
+
+### PERFORMANCE NOTES (CPU / no GPU)
+
+| Endpoint | Model | Latency | Status |
+|----------|-------|---------|--------|
+| `/v1/ingest` | qwen2.5-coder:1.5b | ~16s | ✅ Works |
+| `/v1/enrich` | qwen2.5-coder:7b | >30s | ❌ Fails (WriteTimeout) |
+| `/v1/explain-violation` | qwen2.5-coder:7b | >30s | ❌ Fails |
+| `/v1/coordinate` | qwen2.5-coder:7b | >30s | ❌ Fails |
+| `/v1/context-packet` (no LLM) | SQLite | <1ms | ✅ Works |
+| `/v1/decision` | SQLite | <1ms | ✅ Works |
+| `/v1/patterns` | SQLite | <1ms | ✅ Works |
+| `/v1/sdlc` | SQLite | <1ms | ✅ Works |
+
+**Recommendation:** On CPU-only machines, configure all LLM endpoints to use
+`qwen2.5-coder:1.5b` with a 20s timeout. The 7b model is unusable without GPU.
+
+---
+
+### WHAT WORKS WELL ✅
+
+- Decision log + co-occurrence pattern learning: instant, deterministic.
+- Context packets (no-LLM path): instant, structured, SDLC-aware.
+- SDLC phase/mode management: correct, persisted, multi-agent safe.
+- Pattern hints in context-packet: correctly surfaces co-change suggestions.
+- Fail-silent pattern: intelligence unavailability never crashes synapses.
+- Brain.sqlite: lightweight, portable, no daemon dependency.
diff --git a/internal/ingestor/ingestor.go b/internal/ingestor/ingestor.go
index 16486af..76920ff 100644
--- a/internal/ingestor/ingestor.go
+++ b/internal/ingestor/ingestor.go
@@ -20,15 +20,14 @@ import (
 
 const (
 	// maxCodeChars is the maximum code snippet size sent to the LLM.
-	// Keeps prompts small for fast inference on 1-2B models.
+	// Keeps prompts small for fast inference on 0.8-2B models.
 	maxCodeChars = 500
 
-	// promptTemplate is tuned for small models:
-	//   - Imperative instruction first
-	//   - Strict JSON-only output format
-	//   - No markdown, no preamble
-	// tags: 1-3 short domain labels e.g. ["auth","http","database"]
-	promptTemplate = `Describe what this code entity does in ONE sentence. Add 1-3 short domain tags.
+	// promptTemplate generates a prose briefing suitable for LLM context delivery.
+	// 2-3 sentences covering: what it does, its role, and any important concerns.
+	// The summary replaces verbose raw code/doc in get_context responses, giving
+	// Claude natural-language context that costs far fewer tokens than JSON.
+	promptTemplate = `Write a 2-3 sentence technical briefing for this code entity: what it does, its role in the system, and any important patterns or concerns to be aware of.
 Output ONLY valid JSON with no other text: {"summary": "...", "tags": ["tag1"]}
 
 Name: %s (%s, package %s)
diff --git a/internal/llm/ollama.go b/internal/llm/ollama.go
index 54d7a34..1faeab9 100644
--- a/internal/llm/ollama.go
+++ b/internal/llm/ollama.go
@@ -7,16 +7,25 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"regexp"
 	"strings"
 	"time"
 )
 
+// thinkTagRe strips Qwen3.5 extended thinking output (<think>...</think> blocks).
+var thinkTagRe = regexp.MustCompile(`(?s)<think>.*?</think>`)
+
 // OllamaClient calls the Ollama REST API at POST /api/generate.
 // It keeps a reusable http.Client for connection pooling.
 type OllamaClient struct {
 	baseURL    string
 	model      string
 	httpClient *http.Client
+	// think controls Qwen3.5 extended thinking mode.
+	// When true, "/think\n\n" is prepended to the prompt (deeper reasoning).
+	// When false, "/no_think\n\n" is prepended (faster, no chain-of-thought).
+	// Models that don't support thinking mode silently ignore the prefix.
+	think bool
 }
 
 // NewOllamaClient creates a client targeting the given Ollama base URL and model.
@@ -34,6 +43,14 @@ func NewOllamaClient(baseURL, model string, timeoutMS int) *OllamaClient {
 	}
 }
 
+// WithThinking configures extended thinking mode for Qwen3.5 models.
+// Call on construction: llm.NewOllamaClient(...).WithThinking(true)
+// Returns the client to allow chaining.
+func (c *OllamaClient) WithThinking(enabled bool) *OllamaClient {
+	c.think = enabled
+	return c
+}
+
 // ollamaRequest is the payload for POST /api/generate.
 type ollamaRequest struct {
 	Model  string `json:"model"`
@@ -58,7 +75,16 @@ type ollamaResponse struct {
 
 // Generate sends a prompt and returns the response text.
 // Uses stream=false for simplicity and lowest latency on small outputs.
+// If thinking mode is configured, prepends /think or /no_think to the prompt
+// (Qwen3.5 extended reasoning control) and strips <think>...</think> from output.
 func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, error) {
+	// Apply Qwen3.5 thinking mode prefix. Models that don't support this ignore it.
+	if c.think {
+		prompt = "/think\n\n" + prompt
+	} else {
+		prompt = "/no_think\n\n" + prompt
+	}
+
 	reqBody := ollamaRequest{
 		Model:  c.model,
 		Prompt: prompt,
@@ -103,7 +129,10 @@ func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, err
 		return "", fmt.Errorf("ollama error: %s", result.Error)
 	}
 
-	return strings.TrimSpace(result.Response), nil
+	// Strip extended thinking blocks (<think>...</think>) that Qwen3.5 emits
+	// when thinking mode is enabled. The actual answer follows after the block.
+	response := thinkTagRe.ReplaceAllString(result.Response, "")
+	return strings.TrimSpace(response), nil
 }
 
 // Available checks if Ollama is reachable by calling GET /api/tags.
diff --git a/internal/pruner/pruner.go b/internal/pruner/pruner.go
new file mode 100644
index 0000000..f0d4ae4
--- /dev/null
+++ b/internal/pruner/pruner.go
@@ -0,0 +1,87 @@
+// Package pruner strips boilerplate from web content using the Tier 0 (0.8B) model.
+//
+// Web pages contain 30-50% non-technical noise: navigation menus, cookie banners,
+// footers, sidebars, and ads. Sending this noise to the distillation pipeline wastes
+// LLM compute and dilutes the resulting summary. The Pruner extracts only the
+// core technical paragraphs before handing content to the Ingestor.
+//
+// This is a Tier 0 (Reflex) task: simple extraction, no reasoning, no JSON output.
+// The 0.8B model is fast enough (<3s on CPU) and accurate enough for this job.
+package pruner
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+	"unicode/utf8"
+
+	"github.com/SynapsesOS/synapses-intelligence/internal/llm"
+)
+
+// maxInputChars is the maximum raw content size sent to the LLM.
+// Matches the scout distiller's _DISTILL_MAX_CHARS constant (3000).
+const maxInputChars = 3_000
+
+// promptTemplate instructs the 0.8B model to extract technical content.
+// Plain-text output (no JSON) keeps the task simple and maximises accuracy
+// for small models. The caller uses the raw response directly.
+const promptTemplate = `Extract only the core technical content from this web page text.
+Remove navigation menus, advertisements, footers, cookie notices, and sidebars.
+Return only the key technical paragraphs and information as plain text. Be concise.
+
+Text:
+%s`
+
+// Pruner strips boilerplate from web page text using a small LLM.
+type Pruner struct {
+	llm     llm.LLMClient
+	timeout time.Duration
+}
+
+// New creates a Pruner backed by the given LLM client.
+// timeout is the per-request deadline; defaults to 10s if <= 0.
+func New(client llm.LLMClient, timeout time.Duration) *Pruner {
+	if timeout <= 0 {
+		timeout = 10 * time.Second
+	}
+	return &Pruner{llm: client, timeout: timeout}
+}
+
+// Prune extracts core technical content from raw web page text.
+// Returns the pruned content, or the original content if the LLM call fails.
+// The returned string is always non-empty if input was non-empty.
+func (p *Pruner) Prune(ctx context.Context, content string) (string, error) {
+	content = strings.TrimSpace(content)
+	if content == "" {
+		return "", nil
+	}
+
+	// Truncate to keep the prompt within limits for small models.
+	truncated := truncate(content, maxInputChars)
+
+	ctx, cancel := context.WithTimeout(ctx, p.timeout)
+	defer cancel()
+
+	prompt := fmt.Sprintf(promptTemplate, truncated)
+	result, err := p.llm.Generate(ctx, prompt)
+	if err != nil {
+		// Fail-silent: return original content so the caller can proceed.
+		return content, fmt.Errorf("pruner llm: %w", err)
+	}
+
+	result = strings.TrimSpace(result)
+	if result == "" {
+		return content, nil // empty response — fall back to original
+	}
+	return result, nil
+}
+
+// truncate caps the string at maxChars runes, appending "..." if truncated.
+func truncate(s string, maxChars int) string {
+	if utf8.RuneCountInString(s) <= maxChars {
+		return s
+	}
+	runes := []rune(s)
+	return string(runes[:maxChars]) + "..."
+}
diff --git a/pkg/brain/brain.go b/pkg/brain/brain.go
index 2e9645a..906e388 100644
--- a/pkg/brain/brain.go
+++ b/pkg/brain/brain.go
@@ -12,6 +12,7 @@ import (
 	"github.com/SynapsesOS/synapses-intelligence/internal/ingestor"
 	"github.com/SynapsesOS/synapses-intelligence/internal/llm"
 	"github.com/SynapsesOS/synapses-intelligence/internal/orchestrator"
+	"github.com/SynapsesOS/synapses-intelligence/internal/pruner"
 	"github.com/SynapsesOS/synapses-intelligence/internal/sdlc"
 	"github.com/SynapsesOS/synapses-intelligence/internal/store"
 )
@@ -78,6 +79,11 @@ type Brain interface {
 	// If trigger is non-empty, only patterns with that trigger are returned.
 	// limit caps the number of results (0 = default of 20).
 	GetPatterns(trigger string, limit int) []PatternHint
+
+	// Prune strips boilerplate (navigation, ads, footers) from raw web page text
+	// using the Tier 0 (0.8B) model. Returns cleaned technical content.
+	// Falls back to returning the original content if the LLM is unavailable.
+	Prune(ctx context.Context, content string) (string, error)
 }
 
 // impl is the production Brain backed by Ollama + SQLite.
@@ -89,6 +95,7 @@ type impl struct {
 	enricher     *enricher.Enricher
 	guardian     *guardian.Guardian
 	orchestrator *orchestrator.Orchestrator
+	pruner       *pruner.Pruner
 	sdlcMgr      *sdlc.Manager
 	builder      *contextbuilder.Builder
 	learner      *contextbuilder.Learner
@@ -102,16 +109,16 @@ func New(cfg config.BrainConfig) Brain {
 		return &NullBrain{}
 	}
 
-	// Primary model: used for enrichment and insights (7b default).
-	ollamaClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.Model, cfg.TimeoutMS)
-
-	// Fast model: used for bulk ingest/summarization (1.5b default).
-	// Falls back to primary model if FastModel is unset.
-	fastModel := cfg.FastModel
-	if fastModel == "" {
-		fastModel = cfg.Model
-	}
-	fastClient := llm.NewOllamaClient(cfg.OllamaURL, fastModel, cfg.TimeoutMS)
+	// Tiered Nervous System: each task type uses the model best suited to its complexity.
+	// Thinking mode (Qwen3.5 /think prefix) is disabled for fast tiers and enabled for deep tiers.
+	// Tier 0 (Reflex) — ingest: fast summarization, no reasoning. Default: qwen3.5:0.8b.
+	ingestClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelIngest, cfg.TimeoutMS).WithThinking(false)
+	// Tier 1 (Sensory) — guardian: plain-English violation explanations. Default: qwen3.5:2b.
+	guardianClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelGuardian, cfg.TimeoutMS).WithThinking(false)
+	// Tier 2 (Specialist) — enricher: architectural insight + concerns. Default: qwen3.5:4b.
+	enrichClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelEnrich, cfg.TimeoutMS).WithThinking(true)
+	// Tier 3 (Architect) — orchestrator: multi-agent conflict resolution. Default: qwen3.5:9b.
+	orchestrateClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelOrchestrate, cfg.TimeoutMS).WithThinking(true)
 
 	st, err := store.Open(cfg.DBPath)
 	if err != nil {
@@ -121,17 +128,18 @@ func New(cfg config.BrainConfig) Brain {
 
 	timeout := time.Duration(cfg.TimeoutMS) * time.Millisecond
 
-	enr := enricher.New(ollamaClient, st, timeout)
+	enr := enricher.New(enrichClient, st, timeout)
 	mgr := sdlc.NewManager(st)
 
 	b := &impl{
 		cfg:          cfg,
-		llm:          ollamaClient,
+		llm:          enrichClient, // primary client used for Available() / ModelName()
 		store:        st,
-		ingestor:     ingestor.New(fastClient, st, timeout),
+		ingestor:     ingestor.New(ingestClient, st, timeout),
 		enricher:     enr,
-		guardian:     guardian.New(ollamaClient, st, timeout),
-		orchestrator: orchestrator.New(ollamaClient, timeout),
+		guardian:     guardian.New(guardianClient, st, timeout),
+		orchestrator: orchestrator.New(orchestrateClient, timeout),
+		pruner:       pruner.New(ingestClient, timeout), // Tier 0: 0.8B, same as ingest
 		sdlcMgr:      mgr,
 		builder:      contextbuilder.New(st, mgr, enr),
 		learner:      contextbuilder.NewLearner(st),
@@ -229,6 +237,10 @@ func (b *impl) Coordinate(ctx context.Context, req CoordinateRequest) (Coordinat
 	}, nil
 }
 
+func (b *impl) Prune(ctx context.Context, content string) (string, error) {
+	return b.pruner.Prune(ctx, content)
+}
+
 func (b *impl) Summary(nodeID string) string {
 	if b.store == nil {
 		return ""
diff --git a/pkg/brain/null.go b/pkg/brain/null.go
index 5345b91..923926d 100644
--- a/pkg/brain/null.go
+++ b/pkg/brain/null.go
@@ -63,3 +63,6 @@ func (n *NullBrain) GetSDLCConfig() SDLCConfig {
 
 // GetPatterns returns nil — no patterns are stored when brain is disabled.
 func (n *NullBrain) GetPatterns(_ string, _ int) []PatternHint { return nil }
+
+// Prune returns the original content unchanged — no LLM is available.
+func (n *NullBrain) Prune(_ context.Context, content string) (string, error) { return content, nil }
diff --git a/server/server.go b/server/server.go
index 36ec6ff..8de084f 100644
--- a/server/server.go
+++ b/server/server.go
@@ -27,7 +27,9 @@ type Server struct {
 }
 
 // New creates a Server that delegates to the given Brain.
-func New(b brain.Brain, port int) *Server {
+// timeoutMS is the configured LLM timeout; WriteTimeout is set to 2× this value
+// so LLM handlers always have time to write their response after inference completes.
+func New(b brain.Brain, port int, timeoutMS int) *Server {
 	s := &Server{brain: b, port: port}
 	mux := http.NewServeMux()
 	mux.HandleFunc("GET /v1/health", s.handleHealth)
@@ -36,6 +38,7 @@ func New(b brain.Brain, port int) *Server {
 	mux.HandleFunc("POST /v1/enrich", s.handleEnrich)
 	mux.HandleFunc("POST /v1/explain-violation", s.handleExplainViolation)
 	mux.HandleFunc("POST /v1/coordinate", s.handleCoordinate)
+	mux.HandleFunc("POST /v1/prune", s.handlePrune)
 
 	// v0.2.0 endpoints
 	mux.HandleFunc("POST /v1/context-packet", s.handleContextPacket)
@@ -45,11 +48,12 @@ func New(b brain.Brain, port int) *Server {
 	mux.HandleFunc("POST /v1/decision", s.handleLogDecision)
 	mux.HandleFunc("GET /v1/patterns", s.handleGetPatterns)
 
+	writeTimeout := time.Duration(timeoutMS*2) * time.Millisecond
 	s.server = &http.Server{
 		Addr:         fmt.Sprintf("127.0.0.1:%d", port),
 		Handler:      mux,
 		ReadTimeout:  10 * time.Second,
-		WriteTimeout: 30 * time.Second, // LLM calls can take up to ~3s
+		WriteTimeout: writeTimeout,
 		IdleTimeout:  60 * time.Second,
 	}
 	return s
@@ -176,6 +180,32 @@ func (s *Server) handleCoordinate(w http.ResponseWriter, r *http.Request) {
 	writeJSON(w, http.StatusOK, resp)
 }
 
+func (s *Server) handlePrune(w http.ResponseWriter, r *http.Request) {
+	var req struct {
+		Content string `json:"content"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		writeError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
+		return
+	}
+	if req.Content == "" {
+		writeError(w, http.StatusBadRequest, "content is required")
+		return
+	}
+
+	pruned, err := s.brain.Prune(r.Context(), req.Content)
+	if err != nil {
+		// Non-fatal: return original content with a warning header.
+		pruned = req.Content
+		w.Header().Set("X-Prune-Warning", err.Error())
+	}
+	writeJSON(w, http.StatusOK, map[string]interface{}{
+		"pruned":          pruned,
+		"original_length": len(req.Content),
+		"pruned_length":   len(pruned),
+	})
+}
+
 // --- v0.2.0 Handlers ---
 
 // handleContextPacket assembles a Context Packet for the calling agent.