diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..f94cfd8
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,60 @@
+
+## Synapses — Code Intelligence (MCP)
+
+This project is indexed by **Synapses**, a graph-based code intelligence server.
+
+### Session Start
+Call **one tool** at the start of every session:
+```
+session_init() ← replaces get_pending_tasks + get_project_identity + get_working_state
+```
+Returns: pending tasks, project identity, working state, recent agent events, and **scale_guidance** — a repo-size-aware recommendation on which tools to prefer.
+
+### Tool Selection — follow scale_guidance from session_init
+
+| Repo scale | When to use Synapses | When to use Read/Grep |
+|---|---|---|
+| micro (<100 nodes) | Structural analysis, multi-file understanding | Simple targeted edits to a known file |
+| small (100–499) | Code exploration, cross-file analysis | Targeted single-file edits |
+| medium (500–1999) | All code exploration — Glob/Grep surfaces too much noise | Writing to a specific file you already identified |
+| large (2000+) | Always — direct scanning is too noisy at this scale | Writing to a specific file you already identified |
+
+### Code Exploration
+
+| When you want to... | Use this |
+|---|---|
+| Understand a function, struct, or interface | `get_context(entity="Name")` |
+| Pin to a specific file (avoids wrong-entity picks) | `get_context(entity="Name", file="cmd/server/main.go")` |
+| Boost nodes linked to current task | `get_context(entity="Name", task_id="...")` |
+| Find a symbol by name or substring | `find_entity(query="name")` |
+| Search by concept ("auth", "rate limiting") | `search(query="...", mode="semantic")` |
+| List all entities in a file | `get_file_context(file="path/to/file")` |
+| Trace how function A calls function B | `get_call_chain(from="A", to="B")` |
+| Find what breaks if a symbol changes | `get_impact(symbol="Name")` |
+
+### Before Writing Code
+
+| When you want to... | Use this |
+|---|---|
+| Check proposed changes against architecture rules | `validate_plan(changes=[...])` |
+| Reserve a file/package before editing | `claim_work(agent_id="...", scope="pkg/auth", scope_type="package")` |
+| Check if another agent is editing the same code | `get_conflicts(agent_id="...")` |
+| Release locks when done | `release_claims(agent_id="...")` |
+
+### Task & Session Management
+
+| When you want to... | Use this |
+|---|---|
+| Save a plan with tasks for future sessions | `create_plan(title="...", tasks=[...])` |
+| Mark a task as done or add notes | `update_task(id="...", status="done", notes="...")` |
+| Save progress so next session can resume | `save_session_state(task_id="...")` |
+| Leave a note on a code entity for other agents | `annotate_node(node_id="...", note="...")` |
+| See what other agents have been doing | `get_events(since_seq=N)` (use latest_event_seq from session_init) |
+
+### Rules
+- **Read/Grep** are for *writing* code (editing a specific file you have already found). For *understanding* code structure, always prefer Synapses tools.
+- **Call `session_init()`** at the start of every session. It replaces the 3-call startup ritual.
+- **Call `validate_plan()`** before implementing multi-file changes.
+- **Call `claim_work()`** before editing to avoid conflicts with other agents.
+- When `get_context` returns `other_candidates`, re-call with `file=` to pin to the right entity.
+
diff --git a/README.md b/README.md
index b668165..77cce88 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,14 @@ same information in ~800 tokens that raw nodes would need 4,000+ tokens to expre
### Capabilities at a glance
-| Capability | Method | LLM? | Latency |
+| Capability | Method | LLM tier | Latency (CPU) |
|---|---|---|---|
-| Summarise a code entity (1 sentence) | `Ingest` | yes | 1–3 s |
+| 2-3 sentence prose briefing for a code entity | `Ingest` | Tier 0 (0.8B) | ~3 s |
+| Strip boilerplate from web content | `Prune` | Tier 0 (0.8B) | ~3 s |
| Context Packet (summaries + constraints + guidance) | `BuildContextPacket` | optional | <5 ms fast path |
-| Architectural insight for a neighbourhood | `Enrich` | yes | 1–3 s |
-| Explain a rule violation in plain English | `ExplainViolation` | yes (cached) | <1 ms cached |
-| Agent conflict work distribution | `Coordinate` | yes | 1–3 s |
+| Architectural insight for a neighbourhood | `Enrich` | Tier 2 (4B) | ~12 s |
+| Explain a rule violation in plain English | `ExplainViolation` | Tier 1 (2B, cached) | <1 ms cached |
+| Agent conflict work distribution | `Coordinate` | Tier 3 (9B) | ~25 s |
| SDLC phase + quality mode management | `SetSDLCPhase` / `SetQualityMode` | no | <1 ms |
| Co-occurrence learning ("also check Y when editing X") | `LogDecision` | no | <1 ms |
| Get learned patterns | `GetPatterns` | no | <1 ms |
@@ -47,18 +48,23 @@ No CGO, no external databases, no network dependencies beyond Ollama.
## Quick start
```sh
-# 1. Install Ollama and pull the default model
-ollama pull qwen2.5-coder:1.5b
+# 1. Install Ollama and pull the Tier 0 model (minimum — handles ingest + prune)
+ollama pull qwen3.5:0.8b
-# 2. Build the binary
+# 2. Optionally pull additional tiers for better quality
+ollama pull qwen3.5:2b # Tier 1: guardian (violation explanations)
+ollama pull qwen3.5:4b # Tier 2: enricher (architectural insight)
+ollama pull qwen3.5:9b # Tier 3: orchestrator (multi-agent conflicts)
+
+# 3. Build the binary
make build
-# 3. Start the brain sidecar
+# 4. Start the brain sidecar
./bin/brain serve
-# 4. Verify it's running
+# 5. Verify it's running
curl http://localhost:11435/v1/health
-# {"status":"ok","model":"qwen2.5-coder:1.5b","available":true}
+# {"status":"ok","model":"qwen3.5:4b","available":true}
```
---
@@ -95,8 +101,11 @@ All fields are optional — sensible defaults apply.
{
"enabled": true,
"ollama_url": "http://localhost:11434",
- "model": "qwen2.5-coder:1.5b",
- "timeout_ms": 3000,
+ "model_ingest": "qwen3.5:0.8b",
+ "model_guardian": "qwen3.5:2b",
+ "model_enrich": "qwen3.5:4b",
+ "model_orchestrate": "qwen3.5:9b",
+ "timeout_ms": 60000,
"db_path": "~/.synapses/brain.sqlite",
"port": 11435,
@@ -122,11 +131,16 @@ BRAIN_CONFIG=/path/to/brain.json brain serve
|---|---|---|
| `enabled` | `false` | Master switch. Set `true` to activate all features. |
| `ollama_url` | `http://localhost:11434` | Ollama server base URL. |
-| `model` | `qwen2.5-coder:1.5b` | Ollama model tag. See [Model tiers](#model-tiers). |
-| `timeout_ms` | `3000` | Per-LLM-request timeout in milliseconds. |
+| `model_ingest` | `qwen3.5:0.8b` | Tier 0 (Reflex): bulk ingest + web pruning. |
+| `model_guardian` | `qwen3.5:2b` | Tier 1 (Sensory): rule violation explanations. |
+| `model_enrich` | `qwen3.5:4b` | Tier 2 (Specialist): architectural insight. |
+| `model_orchestrate` | `qwen3.5:9b` | Tier 3 (Architect): multi-agent conflict resolution. |
+| `model` | `qwen3.5:4b` | Fallback model when tier fields are absent (backward compat). |
+| `fast_model` | `qwen3.5:0.8b` | Fallback fast model when `model_ingest` is absent. |
+| `timeout_ms` | `60000` | Per-LLM-request timeout in milliseconds. WriteTimeout = 2× this. |
| `db_path` | `~/.synapses/brain.sqlite` | SQLite database path (created if missing). |
| `port` | `11435` | HTTP sidecar port. |
-| `ingest` | `true` | Enable `POST /v1/ingest` (semantic summaries). |
+| `ingest` | `true` | Enable `POST /v1/ingest` (prose briefings). |
| `enrich` | `true` | Enable `POST /v1/enrich` (neighbourhood insight). |
| `guardian` | `true` | Enable `POST /v1/explain-violation` (rule explanations). |
| `orchestrate` | `true` | Enable `POST /v1/coordinate` (agent conflict resolution). |
@@ -139,22 +153,33 @@ BRAIN_CONFIG=/path/to/brain.json brain serve
## Model tiers
-| System RAM | Model | Size | Notes |
-|---|---|---|---|
-| 4 GB | `qwen2.5-coder:1.5b` | ~900 MB | Default. Works on any dev machine. |
-| 4 GB+ | `qwen3:1.7b` | ~1.1 GB | Recommended upgrade. Better reasoning. |
-| 8 GB+ | `qwen3:4b` | ~2.5 GB | Power user. Noticeably better summaries. |
-| 16 GB+ | `qwen3:8b` | ~5 GB | Enterprise. Best quality, higher latency. |
+synapses-intelligence uses a **4-tier nervous system** — each task type runs on the smallest
+model capable of doing it well. All four Qwen3.5 models share the same tokenizer and support
+`/think` (chain-of-thought) and `/no_think` (fast, deterministic) mode switching.
+
+| Tier | Name | Model | Thinking | RAM | CPU latency | Tasks |
+|---|---|---|---|---|---|---|
+| 0 | Reflex | `qwen3.5:0.8b` | off | 1 GB | ~3 s | Ingest (prose briefings), web pruning |
+| 1 | Sensory | `qwen3.5:2b` | off | 2 GB | ~5 s | Guardian (violation explanations) |
+| 2 | Specialist | `qwen3.5:4b` | on | 4 GB | ~12 s | Enricher (architectural insight) |
+| 3 | Architect | `qwen3.5:9b` | on | 8 GB | ~25 s | Orchestrator (multi-agent conflicts) |
+
+**Minimum setup** (ingest + prune only): pull `qwen3.5:0.8b`.
+**Recommended setup** (all features): pull all four models.
```sh
-# Pull a specific model
-ollama pull qwen3:1.7b
+ollama pull qwen3.5:0.8b # required
+ollama pull qwen3.5:2b # recommended: guardian
+ollama pull qwen3.5:4b # recommended: enricher
+ollama pull qwen3.5:9b # optional: orchestrator
+```
-# Start brain with a different model
-brain serve -model qwen3:1.7b
+If a tier model is missing, brain falls back to the `model` config field. To point all tiers
+at a single model (e.g. for minimal RAM setups):
-# Or set in config
-{ "model": "qwen3:1.7b" }
+```json
+{ "model_ingest": "qwen3.5:0.8b", "model_guardian": "qwen3.5:0.8b",
+ "model_enrich": "qwen3.5:0.8b", "model_orchestrate": "qwen3.5:0.8b" }
```
---
@@ -269,7 +294,7 @@ brain reset
```sh
brain version
-# synapses-intelligence v0.3.0
+# synapses-intelligence v0.5.1
```
---
@@ -298,8 +323,8 @@ Health check and LLM availability probe.
### `POST /v1/ingest`
-Generate and store a 1-sentence semantic summary (+ topic tags) for a code entity.
-Call this whenever a function, struct, or method is saved.
+Generate and store a **2-3 sentence prose briefing** (+ topic tags) for a code entity.
+Call this whenever a function, struct, or method is saved. Uses Tier 0 (0.8B).
**Request**
```json
@@ -328,6 +353,33 @@ Call this whenever a function, struct, or method is saved.
---
+### `POST /v1/prune`
+
+Strip boilerplate from raw web page text (navigation, ads, footers, cookie notices) and
+return only the core technical content. Used by synapses-scout as a preprocessing step
+before `POST /v1/ingest` to improve distillation quality. Uses Tier 0 (0.8B). Fail-silent:
+returns the original content if the LLM is unavailable.
+
+**Request**
+```json
+{
+ "content": "...3000 chars of raw web page text..."
+}
+```
+
+**Response**
+```json
+{
+ "pruned": "...clean technical paragraphs (~1200 chars)...",
+ "original_length": 3000,
+ "pruned_length": 1187
+}
+```
+
+On LLM error, returns the original content with an `X-Prune-Warning` response header.
+
+---
+
### `GET /v1/summary/{nodeId}`
Fetch the stored summary for a single node. Fast (SQLite, no LLM).
@@ -694,14 +746,17 @@ _ = b.LogDecision(ctx, brain.DecisionRequest{
```go
type Brain interface {
- // Semantic summaries
+ // Semantic summaries (Tier 0: 0.8B)
Ingest(ctx, IngestRequest) (IngestResponse, error)
- Enrich(ctx, EnrichRequest) (EnrichResponse, error)
Summary(nodeID string) string
+ // Web content preprocessing (Tier 0: 0.8B)
+ Prune(ctx context.Context, content string) (string, error)
+
// Architectural analysis
- ExplainViolation(ctx, ViolationRequest) (ViolationResponse, error)
- Coordinate(ctx, CoordinateRequest) (CoordinateResponse, error)
+ Enrich(ctx, EnrichRequest) (EnrichResponse, error) // Tier 2: 4B
+ ExplainViolation(ctx, ViolationRequest) (ViolationResponse, error) // Tier 1: 2B
+ Coordinate(ctx, CoordinateRequest) (CoordinateResponse, error) // Tier 3: 9B
// Context Packet
BuildContextPacket(ctx, ContextPacketRequest) (*ContextPacket, error)
@@ -718,6 +773,7 @@ type Brain interface {
// Diagnostics
Available() bool
ModelName() string
+ EnsureModel(ctx context.Context, w io.Writer) error
}
```
diff --git a/cmd/brain/main.go b/cmd/brain/main.go
index aeb8057..3edf5d5 100644
--- a/cmd/brain/main.go
+++ b/cmd/brain/main.go
@@ -104,7 +104,7 @@ func cmdServe(cfg config.BrainConfig) {
}
}
- srv := server.New(b, cfg.Port)
+ srv := server.New(b, cfg.Port, cfg.TimeoutMS)
// Graceful shutdown on SIGINT/SIGTERM.
ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM)
diff --git a/config/config.go b/config/config.go
index 4a05d32..e00c5e0 100644
--- a/config/config.go
+++ b/config/config.go
@@ -15,20 +15,39 @@ type BrainConfig struct {
// OllamaURL is the base URL of the Ollama server. Default: "http://localhost:11434".
OllamaURL string `json:"ollama_url,omitempty"`
- // Model is the primary Ollama model tag for enrichment and insights.
- // Default: "qwen2.5-coder:7b" (~4.5GB, needs 6GB VRAM or 8GB RAM).
- // Downgrade options for low-resource systems:
- // "qwen2.5-coder:1.5b" — fast, fits in 4GB RAM (~900MB)
- // "qwen2.5-coder:3b" — balanced, fits in 6GB RAM (~1.9GB)
+ // Model is the primary Ollama model tag (enrichment fallback when ModelEnrich is unset).
+ // Default: "qwen3.5:4b" — fast on CPU (~12s), beats qwen2.5-coder:7b at 1/3 the size.
+ // Legacy option: "qwen2.5-coder:7b" (~4.5GB, needs 6GB VRAM or 8GB RAM).
Model string `json:"model,omitempty"`
- // FastModel is the Ollama model tag for bulk ingestion (summarization).
- // Bulk ingest runs on every node at index time; use a smaller, faster model.
- // Default: "qwen2.5-coder:1.5b" (~900MB).
- // Set to "" to use Model for both ingest and enrichment.
+ // FastModel is the Ollama model tag for bulk ingestion (fallback when ModelIngest is unset).
+ // Default: "qwen3.5:0.8b" — runs in <3s on CPU, fits in 2GB RAM.
+ // Legacy option: "qwen2.5-coder:1.5b" (~900MB).
FastModel string `json:"fast_model,omitempty"`
- // TimeoutMS is the per-request LLM timeout in milliseconds. Default: 3000.
+ // --- Tiered Nervous System: per-task model assignment ---
+ // Each tier defaults to the appropriate Qwen3.5 model.
+ // Set to "" to fall back to FastModel/Model. All 4 can point to the same model.
+
+ // ModelIngest is the model for bulk node summarization at index time.
+ // Tier 0 (Reflex): simple extraction, no reasoning needed. Default: "qwen3.5:0.8b".
+ ModelIngest string `json:"model_ingest,omitempty"`
+
+ // ModelGuardian is the model for rule violation explanations.
+ // Tier 1 (Sensory): structured plain-English output. Default: "qwen3.5:2b".
+ ModelGuardian string `json:"model_guardian,omitempty"`
+
+ // ModelEnrich is the model for architectural enrichment and insight generation.
+ // Tier 2 (Specialist): complex analysis across multiple callers/callees. Default: "qwen3.5:4b".
+ ModelEnrich string `json:"model_enrich,omitempty"`
+
+ // ModelOrchestrate is the model for multi-agent conflict resolution.
+ // Tier 3 (Architect): deep reasoning about competing scope claims. Default: "qwen3.5:9b".
+ ModelOrchestrate string `json:"model_orchestrate,omitempty"`
+
+ // TimeoutMS is the per-request LLM timeout in milliseconds.
+ // The HTTP server WriteTimeout is set to 2× this value. Default: 60000 (60s).
+ // Must exceed the slowest LLM inference time on your hardware (~25s for 9b CPU).
TimeoutMS int `json:"timeout_ms,omitempty"`
// DBPath is the path to the brain's own SQLite database.
@@ -63,21 +82,25 @@ type BrainConfig struct {
func DefaultConfig() BrainConfig {
home, _ := os.UserHomeDir()
return BrainConfig{
- Enabled: false,
- OllamaURL: "http://localhost:11434",
- Model: "qwen2.5-coder:7b",
- FastModel: "qwen2.5-coder:1.5b",
- TimeoutMS: 30000,
- DBPath: filepath.Join(home, ".synapses", "brain.sqlite"),
- Port: 11435,
- Ingest: true,
- Enrich: true,
- Guardian: true,
- Orchestrate: true,
- ContextBuilder: true,
- LearningEnabled: true,
- DefaultPhase: "development",
- DefaultMode: "standard",
+ Enabled: false,
+ OllamaURL: "http://localhost:11434",
+ Model: "qwen3.5:4b",
+ FastModel: "qwen3.5:0.8b",
+ ModelIngest: "qwen3.5:0.8b",
+ ModelGuardian: "qwen3.5:2b",
+ ModelEnrich: "qwen3.5:4b",
+ ModelOrchestrate: "qwen3.5:9b",
+ TimeoutMS: 60000,
+ DBPath: filepath.Join(home, ".synapses", "brain.sqlite"),
+ Port: 11435,
+ Ingest: true,
+ Enrich: true,
+ Guardian: true,
+ Orchestrate: true,
+ ContextBuilder: true,
+ LearningEnabled: true,
+ DefaultPhase: "development",
+ DefaultMode: "standard",
}
}
@@ -120,18 +143,32 @@ func LoadFile(path string) (BrainConfig, error) {
}
// applyDefaults fills in zero values with defaults.
+// Tier models fall back to the legacy fast_model/model fields if unset.
func (c *BrainConfig) applyDefaults() {
if c.OllamaURL == "" {
c.OllamaURL = "http://localhost:11434"
}
if c.Model == "" {
- c.Model = "qwen2.5-coder:7b"
+ c.Model = "qwen3.5:4b"
}
if c.FastModel == "" {
- c.FastModel = "qwen2.5-coder:1.5b"
+ c.FastModel = "qwen3.5:0.8b"
+ }
+ // Tier fallback chain: tier model → legacy field → hardcoded default
+ if c.ModelIngest == "" {
+ c.ModelIngest = c.FastModel
+ }
+ if c.ModelGuardian == "" {
+ c.ModelGuardian = "qwen3.5:2b"
+ }
+ if c.ModelEnrich == "" {
+ c.ModelEnrich = c.Model
+ }
+ if c.ModelOrchestrate == "" {
+ c.ModelOrchestrate = c.Model
}
if c.TimeoutMS <= 0 {
- c.TimeoutMS = 30000
+ c.TimeoutMS = 60000
}
if c.Port <= 0 {
c.Port = 11435
diff --git a/improvement.md b/improvement.md
new file mode 100644
index 0000000..b9fd901
--- /dev/null
+++ b/improvement.md
@@ -0,0 +1,180 @@
+# synapses-intelligence improvement log
+
+## v0.5.1 — Tiered Nervous System + Scout Prune Pipeline (2026-03-03)
+
+### Changes
+
+#### P0 — Fix WriteTimeout (BUG-I01 closed)
+`server/server.go`: `WriteTimeout` now set to `2 × cfg.TimeoutMS` instead of hardcoded 30s.
+`config/config.go`: Default `TimeoutMS` raised from 30000 → 60000ms.
+All 3 broken endpoints (`/v1/enrich`, `/v1/explain-violation`, `/v1/coordinate`) are now
+reachable on CPU. Closes BUG-I01.
+
+#### P1 — Four-Tier Model Config
+`config/config.go`: Added 4 new tier model fields:
+- `ModelIngest` (Tier 0 Reflex, default `qwen3.5:0.8b`)
+- `ModelGuardian` (Tier 1 Sensory, default `qwen3.5:2b`)
+- `ModelEnrich` (Tier 2 Specialist, default `qwen3.5:4b`)
+- `ModelOrchestrate` (Tier 3 Architect, default `qwen3.5:9b`)
+Backward-compatible fallback chain in `applyDefaults()`: missing tier fields fall back to
+`fast_model` (ingest) or `model` (others).
+
+#### P2 — Route Handlers to Correct Tier
+`pkg/brain/brain.go`: `New()` creates 4 separate `OllamaClient` instances, one per tier.
+- Ingestor → Tier 0 (0.8B): bulk summarization, fast and cheap
+- Guardian → Tier 1 (2B): violation explanations, was broken with 7b, now <5s
+- Enricher → Tier 2 (4B): architectural insight, ~12s on CPU
+- Orchestrator → Tier 3 (9B): multi-agent conflict resolution, ~25s on CPU
+
+#### P3 — ThinkingBudget per Tier (Qwen3.5 /think mode)
+`internal/llm/ollama.go`:
+- Added `think bool` field + `WithThinking(enabled bool)` builder method
+- `Generate()` prepends `/think\n\n` or `/no_think\n\n` to the prompt
+- `...` blocks in responses are stripped via `thinkTagRe` regex
+Tier 0 (ingest) + Tier 1 (guardian): `thinking=false` — fast, deterministic
+Tier 2 (enrich) + Tier 3 (orchestrate): `thinking=true` — deeper reasoning
+
+#### P7 — /v1/prune Scout Preprocessing Endpoint
+New package `internal/pruner/pruner.go`: strips web page boilerplate (navigation, ads,
+footers, cookie banners) using Tier 0 model. Returns clean technical paragraphs as plain text.
+`server/server.go`: `POST /v1/prune` handler added.
+`pkg/brain/brain.go`: `Prune(ctx, content) (string, error)` added to Brain interface and impl.
+`pkg/brain/null.go`: NullBrain.Prune() returns original content unchanged.
+
+**Effect:** Scout raw web content (3000 chars) → pruned clean signal (~1200 chars) → 4B
+distillation sees only valuable content → better summaries, faster inference.
+
+---
+
+## v0.4.0 — E2E Test Run (2026-03-03)
+
+---
+
+### CRITICAL BUGS
+
+#### BUG-I01 — HTTP WriteTimeout kills all 7b-model endpoints silently
+**Severity:** Critical
+**Root cause:** `server/server.go` line 52:
+```go
+WriteTimeout: 30 * time.Second, // LLM calls can take up to ~3s
+```
+The comment is **stale**. The enricher/guardian/orchestrator use `qwen2.5-coder:7b`
+which takes **30-40 seconds** on CPU. The HTTP write timeout fires at exactly the
+same time as the LLM context deadline → the handler never writes a response →
+the client receives an empty body.
+**Affected endpoints:** `/v1/enrich`, `/v1/explain-violation`, `/v1/coordinate`
+**Evidence:** All three return empty body after exactly 30.1s.
+**Fix options (in priority order):**
+ 1. Make `WriteTimeout` configurable: `server.WriteTimeout = time.Duration(cfg.TimeoutMS*2) * time.Millisecond`
+ 2. Remove `WriteTimeout` entirely for LLM endpoints and rely on the enricher's
+ context deadline instead (set per-handler timeouts via `http.TimeoutHandler`).
+ 3. Short-term workaround: increase default TimeoutMS in config to 60000 and
+ set WriteTimeout to 90s.
+
+#### BUG-I02 — Ingest summary truncated mid-sentence
+**Severity:** High
+**Observed:** `/v1/ingest` response contains `"summary": "...The tags associated…"`
+— summary cut at `NumPredict` token limit. The stored summary in brain.sqlite
+is incomplete and propagates into context-packet `root_summary` with a trailing `…`.
+**Root cause:** `NumPredict` in `internal/llm/ollama.go` is set to 250 tokens.
+Summaries for complex code entities exceed this limit.
+**Fix:** Increase `NumPredict` to 400-500. Or use a two-phase approach: request
+a JSON object with `{"summary": "..."}` and set NumPredict to stop cleanly at
+the closing brace.
+
+#### BUG-I03 — Intelligence ingestor prompt uses "code entity" for all node types
+**Severity:** Medium
+**Observed:** When scout sends a web article for distillation with
+`node_type: "web article"`, the returned summary says "This **code entity**
+provides information about the Model Context Protocol...". The ingestor prompt
+template hardcodes "code entity" regardless of `node_type`.
+**Root cause:** `internal/ingestor/ingestor.go` `buildPrompt()` likely uses a
+static phrase.
+**Fix:** Use the `node_type` field in the prompt:
+```go
+fmt.Sprintf("Summarize this %s in 1-2 sentences: %s\n\n%s", nodeType, name, code)
+```
+This makes summaries contextually correct for web articles, YouTube videos, and
+search result sets.
+
+#### BUG-I04 — Tags always empty in ingest response
+**Severity:** Medium
+**Observed:** Scout receives `"tags": []` from every ingest call. The
+`IngestResponse` has a `Tags []string` field but it's never populated.
+**Root cause:** The ingestor prompt doesn't ask for tags, and `parseInsight`/
+`parseSummary` don't extract them.
+**Fix:** Add to the ingest prompt: "Also list 2-3 relevant tags as a
+comma-separated list on the last line, prefixed with 'tags:'". Parse and return.
+Tags improve search relevance and categorisation in brain.sqlite.
+
+---
+
+### ARCHITECTURE ISSUES
+
+#### ARCH-I01 — packet_quality ceiling of 0.4 on CPU-only deployments
+**Observed:** `packet_quality: 0.4` means only `root_summary` is present.
+`insight` (requires `/v1/enrich`) and `dep_summaries` (requires bulk ingest of
+callee nodes) are missing because enrich times out on CPU with 7b model.
+**Impact:** On a GPU-less machine, context packets are never fully enriched.
+This is the primary use case for indie developers.
+**Improvements:**
+ 1. For enrichment, default to `qwen2.5-coder:1.5b` (same as fast ingest).
+ Quality drops ~20% but latency drops from 30s to 3-5s.
+ 2. Add an `enrich_model` config field separate from `model` (primary).
+ 3. Or implement async enrichment: return `packet_quality: 0.4` immediately,
+ enrich in background, store result, return from cache on next call.
+
+#### ARCH-I02 — No request logging in brain HTTP server
+**Observed:** `/tmp/brain.log` only shows startup messages. There are no
+per-request logs for ingest/enrich timings, LLM latencies, or errors.
+**Impact:** Debugging timeouts, slow responses, and LLM errors is blind.
+**Fix:** Add a simple logging middleware that logs:
+```
+2026-03-03T13:10:31Z POST /v1/ingest node=cmdStart latency=16.2s status=200
+2026-03-03T13:11:00Z POST /v1/enrich root=cmdStart latency=30.1s status=500 (timeout)
+```
+
+#### ARCH-I03 — `/v1/context-packet` requires full SnapshotInput; not ergonomic for standalone use
+**Observed:** Calling `/v1/context-packet` without the correct nested
+`snapshot` structure returns `entity_name: ""` and `packet_quality: 0`.
+The schema requires deeply nested JSON that differs from the simpler ingest
+schema. Callers need to read the code to discover the correct structure.
+**Fix:** Add a simpler `GET /v1/context-packet?node_id=X&name=Y` endpoint that
+wraps the POST endpoint with sensible defaults for all nested fields.
+
+#### ARCH-I04 — Coordinate endpoint schema undiscoverable
+**Observed:** Calling `/v1/coordinate` with an intuitive `{"agents":[...], "shared_entities":[...]}`
+schema returns `{"error": "new_agent_id and new_scope are required"}`. The actual
+schema (`new_agent_id`, `new_scope`, `conflicting_claims`) is only discoverable
+by reading Go types — it's not documented in the API response or README.
+**Fix:** Return a detailed error with the expected schema in the error message,
+or add a `GET /v1/coordinate/schema` endpoint.
+
+---
+
+### PERFORMANCE NOTES (CPU / no GPU)
+
+| Endpoint | Model | Latency | Status |
+|----------|-------|---------|--------|
+| `/v1/ingest` | qwen2.5-coder:1.5b | ~16s | ✅ Works |
+| `/v1/enrich` | qwen2.5-coder:7b | >30s | ❌ Fails (WriteTimeout) |
+| `/v1/explain-violation` | qwen2.5-coder:7b | >30s | ❌ Fails |
+| `/v1/coordinate` | qwen2.5-coder:7b | >30s | ❌ Fails |
+| `/v1/context-packet` (no LLM) | SQLite | <1ms | ✅ Works |
+| `/v1/decision` | SQLite | <1ms | ✅ Works |
+| `/v1/patterns` | SQLite | <1ms | ✅ Works |
+| `/v1/sdlc` | SQLite | <1ms | ✅ Works |
+
+**Recommendation:** On CPU-only machines, configure all LLM endpoints to use
+`qwen2.5-coder:1.5b` with a 20s timeout. The 7b model is unusable without GPU.
+
+---
+
+### WHAT WORKS WELL ✅
+
+- Decision log + co-occurrence pattern learning: instant, deterministic.
+- Context packets (no-LLM path): instant, structured, SDLC-aware.
+- SDLC phase/mode management: correct, persisted, multi-agent safe.
+- Pattern hints in context-packet: correctly surfaces co-change suggestions.
+- Fail-silent pattern: intelligence unavailability never crashes synapses.
+- Brain.sqlite: lightweight, portable, no daemon dependency.
diff --git a/internal/ingestor/ingestor.go b/internal/ingestor/ingestor.go
index 16486af..76920ff 100644
--- a/internal/ingestor/ingestor.go
+++ b/internal/ingestor/ingestor.go
@@ -20,15 +20,14 @@ import (
const (
// maxCodeChars is the maximum code snippet size sent to the LLM.
- // Keeps prompts small for fast inference on 1-2B models.
+ // Keeps prompts small for fast inference on 0.8-2B models.
maxCodeChars = 500
- // promptTemplate is tuned for small models:
- // - Imperative instruction first
- // - Strict JSON-only output format
- // - No markdown, no preamble
- // tags: 1-3 short domain labels e.g. ["auth","http","database"]
- promptTemplate = `Describe what this code entity does in ONE sentence. Add 1-3 short domain tags.
+ // promptTemplate generates a prose briefing suitable for LLM context delivery.
+ // 2-3 sentences covering: what it does, its role, and any important concerns.
+ // The summary replaces verbose raw code/doc in get_context responses, giving
+ // Claude natural-language context that costs far fewer tokens than JSON.
+ promptTemplate = `Write a 2-3 sentence technical briefing for this code entity: what it does, its role in the system, and any important patterns or concerns to be aware of.
Output ONLY valid JSON with no other text: {"summary": "...", "tags": ["tag1"]}
Name: %s (%s, package %s)
diff --git a/internal/llm/ollama.go b/internal/llm/ollama.go
index 54d7a34..1faeab9 100644
--- a/internal/llm/ollama.go
+++ b/internal/llm/ollama.go
@@ -7,16 +7,25 @@ import (
"fmt"
"io"
"net/http"
+ "regexp"
"strings"
"time"
)
+// thinkTagRe strips Qwen3.5 extended thinking output (... blocks).
+var thinkTagRe = regexp.MustCompile(`(?s).*?`)
+
// OllamaClient calls the Ollama REST API at POST /api/generate.
// It keeps a reusable http.Client for connection pooling.
type OllamaClient struct {
baseURL string
model string
httpClient *http.Client
+ // think controls Qwen3.5 extended thinking mode.
+ // When true, "/think\n\n" is prepended to the prompt (deeper reasoning).
+ // When false, "/no_think\n\n" is prepended (faster, no chain-of-thought).
+ // Models that don't support thinking mode silently ignore the prefix.
+ think bool
}
// NewOllamaClient creates a client targeting the given Ollama base URL and model.
@@ -34,6 +43,14 @@ func NewOllamaClient(baseURL, model string, timeoutMS int) *OllamaClient {
}
}
+// WithThinking configures extended thinking mode for Qwen3.5 models.
+// Call on construction: llm.NewOllamaClient(...).WithThinking(true)
+// Returns the client to allow chaining.
+func (c *OllamaClient) WithThinking(enabled bool) *OllamaClient {
+ c.think = enabled
+ return c
+}
+
// ollamaRequest is the payload for POST /api/generate.
type ollamaRequest struct {
Model string `json:"model"`
@@ -58,7 +75,16 @@ type ollamaResponse struct {
// Generate sends a prompt and returns the response text.
// Uses stream=false for simplicity and lowest latency on small outputs.
+// If thinking mode is configured, prepends /think or /no_think to the prompt
+// (Qwen3.5 extended reasoning control) and strips ... from output.
func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, error) {
+ // Apply Qwen3.5 thinking mode prefix. Models that don't support this ignore it.
+ if c.think {
+ prompt = "/think\n\n" + prompt
+ } else {
+ prompt = "/no_think\n\n" + prompt
+ }
+
reqBody := ollamaRequest{
Model: c.model,
Prompt: prompt,
@@ -103,7 +129,10 @@ func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, err
return "", fmt.Errorf("ollama error: %s", result.Error)
}
- return strings.TrimSpace(result.Response), nil
+ // Strip extended thinking blocks (...) that Qwen3.5 emits
+ // when thinking mode is enabled. The actual answer follows after the block.
+ response := thinkTagRe.ReplaceAllString(result.Response, "")
+ return strings.TrimSpace(response), nil
}
// Available checks if Ollama is reachable by calling GET /api/tags.
diff --git a/internal/pruner/pruner.go b/internal/pruner/pruner.go
new file mode 100644
index 0000000..f0d4ae4
--- /dev/null
+++ b/internal/pruner/pruner.go
@@ -0,0 +1,87 @@
+// Package pruner strips boilerplate from web content using the Tier 0 (0.8B) model.
+//
+// Web pages contain 30-50% non-technical noise: navigation menus, cookie banners,
+// footers, sidebars, and ads. Sending this noise to the distillation pipeline wastes
+// LLM compute and dilutes the resulting summary. The Pruner extracts only the
+// core technical paragraphs before handing content to the Ingestor.
+//
+// This is a Tier 0 (Reflex) task: simple extraction, no reasoning, no JSON output.
+// The 0.8B model is fast enough (<3s on CPU) and accurate enough for this job.
+package pruner
+
+import (
+ "context"
+ "fmt"
+ "strings"
+ "time"
+ "unicode/utf8"
+
+ "github.com/SynapsesOS/synapses-intelligence/internal/llm"
+)
+
+// maxInputChars is the maximum raw content size sent to the LLM.
+// Matches the scout distiller's _DISTILL_MAX_CHARS constant (3000).
+const maxInputChars = 3_000
+
+// promptTemplate instructs the 0.8B model to extract technical content.
+// Plain-text output (no JSON) keeps the task simple and maximises accuracy
+// for small models. The caller uses the raw response directly.
+const promptTemplate = `Extract only the core technical content from this web page text.
+Remove navigation menus, advertisements, footers, cookie notices, and sidebars.
+Return only the key technical paragraphs and information as plain text. Be concise.
+
+Text:
+%s`
+
+// Pruner strips boilerplate from web page text using a small LLM.
+type Pruner struct {
+ llm llm.LLMClient
+ timeout time.Duration
+}
+
+// New creates a Pruner backed by the given LLM client.
+// timeout is the per-request deadline; defaults to 10s if <= 0.
+func New(client llm.LLMClient, timeout time.Duration) *Pruner {
+ if timeout <= 0 {
+ timeout = 10 * time.Second
+ }
+ return &Pruner{llm: client, timeout: timeout}
+}
+
+// Prune extracts core technical content from raw web page text.
+// Returns the pruned content, or the original content if the LLM call fails.
+// The returned string is always non-empty if input was non-empty.
+func (p *Pruner) Prune(ctx context.Context, content string) (string, error) {
+ content = strings.TrimSpace(content)
+ if content == "" {
+ return "", nil
+ }
+
+ // Truncate to keep the prompt within limits for small models.
+ truncated := truncate(content, maxInputChars)
+
+ ctx, cancel := context.WithTimeout(ctx, p.timeout)
+ defer cancel()
+
+ prompt := fmt.Sprintf(promptTemplate, truncated)
+ result, err := p.llm.Generate(ctx, prompt)
+ if err != nil {
+ // Fail-silent: return original content so the caller can proceed.
+ return content, fmt.Errorf("pruner llm: %w", err)
+ }
+
+ result = strings.TrimSpace(result)
+ if result == "" {
+ return content, nil // empty response — fall back to original
+ }
+ return result, nil
+}
+
+// truncate caps the string at maxChars runes, appending "..." if truncated.
+func truncate(s string, maxChars int) string {
+ if utf8.RuneCountInString(s) <= maxChars {
+ return s
+ }
+ runes := []rune(s)
+ return string(runes[:maxChars]) + "..."
+}
diff --git a/pkg/brain/brain.go b/pkg/brain/brain.go
index 2e9645a..906e388 100644
--- a/pkg/brain/brain.go
+++ b/pkg/brain/brain.go
@@ -12,6 +12,7 @@ import (
"github.com/SynapsesOS/synapses-intelligence/internal/ingestor"
"github.com/SynapsesOS/synapses-intelligence/internal/llm"
"github.com/SynapsesOS/synapses-intelligence/internal/orchestrator"
+ "github.com/SynapsesOS/synapses-intelligence/internal/pruner"
"github.com/SynapsesOS/synapses-intelligence/internal/sdlc"
"github.com/SynapsesOS/synapses-intelligence/internal/store"
)
@@ -78,6 +79,11 @@ type Brain interface {
// If trigger is non-empty, only patterns with that trigger are returned.
// limit caps the number of results (0 = default of 20).
GetPatterns(trigger string, limit int) []PatternHint
+
+ // Prune strips boilerplate (navigation, ads, footers) from raw web page text
+ // using the Tier 0 (0.8B) model. Returns cleaned technical content.
+ // Falls back to returning the original content if the LLM is unavailable.
+ Prune(ctx context.Context, content string) (string, error)
}
// impl is the production Brain backed by Ollama + SQLite.
@@ -89,6 +95,7 @@ type impl struct {
enricher *enricher.Enricher
guardian *guardian.Guardian
orchestrator *orchestrator.Orchestrator
+ pruner *pruner.Pruner
sdlcMgr *sdlc.Manager
builder *contextbuilder.Builder
learner *contextbuilder.Learner
@@ -102,16 +109,16 @@ func New(cfg config.BrainConfig) Brain {
return &NullBrain{}
}
- // Primary model: used for enrichment and insights (7b default).
- ollamaClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.Model, cfg.TimeoutMS)
-
- // Fast model: used for bulk ingest/summarization (1.5b default).
- // Falls back to primary model if FastModel is unset.
- fastModel := cfg.FastModel
- if fastModel == "" {
- fastModel = cfg.Model
- }
- fastClient := llm.NewOllamaClient(cfg.OllamaURL, fastModel, cfg.TimeoutMS)
+ // Tiered Nervous System: each task type uses the model best suited to its complexity.
+ // Thinking mode (Qwen3.5 /think prefix) is disabled for fast tiers and enabled for deep tiers.
+ // Tier 0 (Reflex) — ingest: fast summarization, no reasoning. Default: qwen3.5:0.8b.
+ ingestClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelIngest, cfg.TimeoutMS).WithThinking(false)
+ // Tier 1 (Sensory) — guardian: plain-English violation explanations. Default: qwen3.5:2b.
+ guardianClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelGuardian, cfg.TimeoutMS).WithThinking(false)
+ // Tier 2 (Specialist) — enricher: architectural insight + concerns. Default: qwen3.5:4b.
+ enrichClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelEnrich, cfg.TimeoutMS).WithThinking(true)
+ // Tier 3 (Architect) — orchestrator: multi-agent conflict resolution. Default: qwen3.5:9b.
+ orchestrateClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelOrchestrate, cfg.TimeoutMS).WithThinking(true)
st, err := store.Open(cfg.DBPath)
if err != nil {
@@ -121,17 +128,18 @@ func New(cfg config.BrainConfig) Brain {
timeout := time.Duration(cfg.TimeoutMS) * time.Millisecond
- enr := enricher.New(ollamaClient, st, timeout)
+ enr := enricher.New(enrichClient, st, timeout)
mgr := sdlc.NewManager(st)
b := &impl{
cfg: cfg,
- llm: ollamaClient,
+ llm: enrichClient, // primary client used for Available() / ModelName()
store: st,
- ingestor: ingestor.New(fastClient, st, timeout),
+ ingestor: ingestor.New(ingestClient, st, timeout),
enricher: enr,
- guardian: guardian.New(ollamaClient, st, timeout),
- orchestrator: orchestrator.New(ollamaClient, timeout),
+ guardian: guardian.New(guardianClient, st, timeout),
+ orchestrator: orchestrator.New(orchestrateClient, timeout),
+ pruner: pruner.New(ingestClient, timeout), // Tier 0: 0.8B, same as ingest
sdlcMgr: mgr,
builder: contextbuilder.New(st, mgr, enr),
learner: contextbuilder.NewLearner(st),
@@ -229,6 +237,10 @@ func (b *impl) Coordinate(ctx context.Context, req CoordinateRequest) (Coordinat
}, nil
}
+func (b *impl) Prune(ctx context.Context, content string) (string, error) {
+ return b.pruner.Prune(ctx, content)
+}
+
func (b *impl) Summary(nodeID string) string {
if b.store == nil {
return ""
diff --git a/pkg/brain/null.go b/pkg/brain/null.go
index 5345b91..923926d 100644
--- a/pkg/brain/null.go
+++ b/pkg/brain/null.go
@@ -63,3 +63,6 @@ func (n *NullBrain) GetSDLCConfig() SDLCConfig {
// GetPatterns returns nil — no patterns are stored when brain is disabled.
func (n *NullBrain) GetPatterns(_ string, _ int) []PatternHint { return nil }
+
+// Prune returns the original content unchanged — no LLM is available.
+func (n *NullBrain) Prune(_ context.Context, content string) (string, error) { return content, nil }
diff --git a/server/server.go b/server/server.go
index 36ec6ff..8de084f 100644
--- a/server/server.go
+++ b/server/server.go
@@ -27,7 +27,9 @@ type Server struct {
}
// New creates a Server that delegates to the given Brain.
-func New(b brain.Brain, port int) *Server {
+// timeoutMS is the configured LLM timeout; WriteTimeout is set to 2× this value
+// so LLM handlers always have time to write their response after inference completes.
+func New(b brain.Brain, port int, timeoutMS int) *Server {
s := &Server{brain: b, port: port}
mux := http.NewServeMux()
mux.HandleFunc("GET /v1/health", s.handleHealth)
@@ -36,6 +38,7 @@ func New(b brain.Brain, port int) *Server {
mux.HandleFunc("POST /v1/enrich", s.handleEnrich)
mux.HandleFunc("POST /v1/explain-violation", s.handleExplainViolation)
mux.HandleFunc("POST /v1/coordinate", s.handleCoordinate)
+ mux.HandleFunc("POST /v1/prune", s.handlePrune)
// v0.2.0 endpoints
mux.HandleFunc("POST /v1/context-packet", s.handleContextPacket)
@@ -45,11 +48,12 @@ func New(b brain.Brain, port int) *Server {
mux.HandleFunc("POST /v1/decision", s.handleLogDecision)
mux.HandleFunc("GET /v1/patterns", s.handleGetPatterns)
+ writeTimeout := time.Duration(timeoutMS*2) * time.Millisecond
s.server = &http.Server{
Addr: fmt.Sprintf("127.0.0.1:%d", port),
Handler: mux,
ReadTimeout: 10 * time.Second,
- WriteTimeout: 30 * time.Second, // LLM calls can take up to ~3s
+ WriteTimeout: writeTimeout,
IdleTimeout: 60 * time.Second,
}
return s
@@ -176,6 +180,32 @@ func (s *Server) handleCoordinate(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusOK, resp)
}
+func (s *Server) handlePrune(w http.ResponseWriter, r *http.Request) {
+ var req struct {
+ Content string `json:"content"`
+ }
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ writeError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
+ return
+ }
+ if req.Content == "" {
+ writeError(w, http.StatusBadRequest, "content is required")
+ return
+ }
+
+ pruned, err := s.brain.Prune(r.Context(), req.Content)
+ if err != nil {
+ // Non-fatal: return original content with a warning header.
+ pruned = req.Content
+ w.Header().Set("X-Prune-Warning", err.Error())
+ }
+ writeJSON(w, http.StatusOK, map[string]interface{}{
+ "pruned": pruned,
+ "original_length": len(req.Content),
+ "pruned_length": len(pruned),
+ })
+}
+
// --- v0.2.0 Handlers ---
// handleContextPacket assembles a Context Packet for the calling agent.