From b0c076c63e41b5ba411ea92838f2fd8dd90be0b6 Mon Sep 17 00:00:00 2001 From: Divyansh Kumar Date: Tue, 3 Mar 2026 17:49:32 +0000 Subject: [PATCH] feat(v0.5.1): 4-tier nervous system, WriteTimeout fix, /v1/prune endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tiered Nervous System (Qwen3.5 models): - config: add ModelIngest/ModelGuardian/ModelEnrich/ModelOrchestrate fields with backward-compatible fallback chain (defaults: 0.8b/2b/4b/9b) - brain.New(): create 4 separate OllamaClient instances, one per tier - llm/ollama.go: add WithThinking(bool) builder + /think|/no_think prompt prefixes for Qwen3.5; strip ... blocks from responses Bug fixes: - server: WriteTimeout now 2×TimeoutMS (was hardcoded 30s); closes BUG-I01 that silently killed enrich/explain-violation/coordinate on CPU - config: default TimeoutMS 30000→60000ms Features: - ingestor: update prompt to 2-3 sentence prose briefing format - pruner: new internal/pruner package strips web boilerplate via Tier 0 (0.8B) - server: POST /v1/prune endpoint; Brain interface + NullBrain updated Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 60 ++++++++++++ README.md | 126 +++++++++++++++++------- cmd/brain/main.go | 2 +- config/config.go | 93 ++++++++++++------ improvement.md | 180 ++++++++++++++++++++++++++++++++++ internal/ingestor/ingestor.go | 13 ++- internal/llm/ollama.go | 31 +++++- internal/pruner/pruner.go | 87 ++++++++++++++++ pkg/brain/brain.go | 42 +++++--- pkg/brain/null.go | 3 + server/server.go | 34 ++++++- 11 files changed, 582 insertions(+), 89 deletions(-) create mode 100644 CLAUDE.md create mode 100644 improvement.md create mode 100644 internal/pruner/pruner.go diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..f94cfd8 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,60 @@ + +## Synapses — Code Intelligence (MCP) + +This project is indexed by **Synapses**, a graph-based code intelligence server. + +### Session Start +Call **one tool** at the start of every session: +``` +session_init() ← replaces get_pending_tasks + get_project_identity + get_working_state +``` +Returns: pending tasks, project identity, working state, recent agent events, and **scale_guidance** — a repo-size-aware recommendation on which tools to prefer. + +### Tool Selection — follow scale_guidance from session_init + +| Repo scale | When to use Synapses | When to use Read/Grep | +|---|---|---| +| micro (<100 nodes) | Structural analysis, multi-file understanding | Simple targeted edits to a known file | +| small (100–499) | Code exploration, cross-file analysis | Targeted single-file edits | +| medium (500–1999) | All code exploration — Glob/Grep surfaces too much noise | Writing to a specific file you already identified | +| large (2000+) | Always — direct scanning is too noisy at this scale | Writing to a specific file you already identified | + +### Code Exploration + +| When you want to... | Use this | +|---|---| +| Understand a function, struct, or interface | `get_context(entity="Name")` | +| Pin to a specific file (avoids wrong-entity picks) | `get_context(entity="Name", file="cmd/server/main.go")` | +| Boost nodes linked to current task | `get_context(entity="Name", task_id="...")` | +| Find a symbol by name or substring | `find_entity(query="name")` | +| Search by concept ("auth", "rate limiting") | `search(query="...", mode="semantic")` | +| List all entities in a file | `get_file_context(file="path/to/file")` | +| Trace how function A calls function B | `get_call_chain(from="A", to="B")` | +| Find what breaks if a symbol changes | `get_impact(symbol="Name")` | + +### Before Writing Code + +| When you want to... | Use this | +|---|---| +| Check proposed changes against architecture rules | `validate_plan(changes=[...])` | +| Reserve a file/package before editing | `claim_work(agent_id="...", scope="pkg/auth", scope_type="package")` | +| Check if another agent is editing the same code | `get_conflicts(agent_id="...")` | +| Release locks when done | `release_claims(agent_id="...")` | + +### Task & Session Management + +| When you want to... | Use this | +|---|---| +| Save a plan with tasks for future sessions | `create_plan(title="...", tasks=[...])` | +| Mark a task as done or add notes | `update_task(id="...", status="done", notes="...")` | +| Save progress so next session can resume | `save_session_state(task_id="...")` | +| Leave a note on a code entity for other agents | `annotate_node(node_id="...", note="...")` | +| See what other agents have been doing | `get_events(since_seq=N)` (use latest_event_seq from session_init) | + +### Rules +- **Read/Grep** are for *writing* code (editing a specific file you have already found). For *understanding* code structure, always prefer Synapses tools. +- **Call `session_init()`** at the start of every session. It replaces the 3-call startup ritual. +- **Call `validate_plan()`** before implementing multi-file changes. +- **Call `claim_work()`** before editing to avoid conflicts with other agents. +- When `get_context` returns `other_candidates`, re-call with `file=` to pin to the right entity. + diff --git a/README.md b/README.md index b668165..77cce88 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,14 @@ same information in ~800 tokens that raw nodes would need 4,000+ tokens to expre ### Capabilities at a glance -| Capability | Method | LLM? | Latency | +| Capability | Method | LLM tier | Latency (CPU) | |---|---|---|---| -| Summarise a code entity (1 sentence) | `Ingest` | yes | 1–3 s | +| 2-3 sentence prose briefing for a code entity | `Ingest` | Tier 0 (0.8B) | ~3 s | +| Strip boilerplate from web content | `Prune` | Tier 0 (0.8B) | ~3 s | | Context Packet (summaries + constraints + guidance) | `BuildContextPacket` | optional | <5 ms fast path | -| Architectural insight for a neighbourhood | `Enrich` | yes | 1–3 s | -| Explain a rule violation in plain English | `ExplainViolation` | yes (cached) | <1 ms cached | -| Agent conflict work distribution | `Coordinate` | yes | 1–3 s | +| Architectural insight for a neighbourhood | `Enrich` | Tier 2 (4B) | ~12 s | +| Explain a rule violation in plain English | `ExplainViolation` | Tier 1 (2B, cached) | <1 ms cached | +| Agent conflict work distribution | `Coordinate` | Tier 3 (9B) | ~25 s | | SDLC phase + quality mode management | `SetSDLCPhase` / `SetQualityMode` | no | <1 ms | | Co-occurrence learning ("also check Y when editing X") | `LogDecision` | no | <1 ms | | Get learned patterns | `GetPatterns` | no | <1 ms | @@ -47,18 +48,23 @@ No CGO, no external databases, no network dependencies beyond Ollama. ## Quick start ```sh -# 1. Install Ollama and pull the default model -ollama pull qwen2.5-coder:1.5b +# 1. Install Ollama and pull the Tier 0 model (minimum — handles ingest + prune) +ollama pull qwen3.5:0.8b -# 2. Build the binary +# 2. Optionally pull additional tiers for better quality +ollama pull qwen3.5:2b # Tier 1: guardian (violation explanations) +ollama pull qwen3.5:4b # Tier 2: enricher (architectural insight) +ollama pull qwen3.5:9b # Tier 3: orchestrator (multi-agent conflicts) + +# 3. Build the binary make build -# 3. Start the brain sidecar +# 4. Start the brain sidecar ./bin/brain serve -# 4. Verify it's running +# 5. Verify it's running curl http://localhost:11435/v1/health -# {"status":"ok","model":"qwen2.5-coder:1.5b","available":true} +# {"status":"ok","model":"qwen3.5:4b","available":true} ``` --- @@ -95,8 +101,11 @@ All fields are optional — sensible defaults apply. { "enabled": true, "ollama_url": "http://localhost:11434", - "model": "qwen2.5-coder:1.5b", - "timeout_ms": 3000, + "model_ingest": "qwen3.5:0.8b", + "model_guardian": "qwen3.5:2b", + "model_enrich": "qwen3.5:4b", + "model_orchestrate": "qwen3.5:9b", + "timeout_ms": 60000, "db_path": "~/.synapses/brain.sqlite", "port": 11435, @@ -122,11 +131,16 @@ BRAIN_CONFIG=/path/to/brain.json brain serve |---|---|---| | `enabled` | `false` | Master switch. Set `true` to activate all features. | | `ollama_url` | `http://localhost:11434` | Ollama server base URL. | -| `model` | `qwen2.5-coder:1.5b` | Ollama model tag. See [Model tiers](#model-tiers). | -| `timeout_ms` | `3000` | Per-LLM-request timeout in milliseconds. | +| `model_ingest` | `qwen3.5:0.8b` | Tier 0 (Reflex): bulk ingest + web pruning. | +| `model_guardian` | `qwen3.5:2b` | Tier 1 (Sensory): rule violation explanations. | +| `model_enrich` | `qwen3.5:4b` | Tier 2 (Specialist): architectural insight. | +| `model_orchestrate` | `qwen3.5:9b` | Tier 3 (Architect): multi-agent conflict resolution. | +| `model` | `qwen3.5:4b` | Fallback model when tier fields are absent (backward compat). | +| `fast_model` | `qwen3.5:0.8b` | Fallback fast model when `model_ingest` is absent. | +| `timeout_ms` | `60000` | Per-LLM-request timeout in milliseconds. WriteTimeout = 2× this. | | `db_path` | `~/.synapses/brain.sqlite` | SQLite database path (created if missing). | | `port` | `11435` | HTTP sidecar port. | -| `ingest` | `true` | Enable `POST /v1/ingest` (semantic summaries). | +| `ingest` | `true` | Enable `POST /v1/ingest` (prose briefings). | | `enrich` | `true` | Enable `POST /v1/enrich` (neighbourhood insight). | | `guardian` | `true` | Enable `POST /v1/explain-violation` (rule explanations). | | `orchestrate` | `true` | Enable `POST /v1/coordinate` (agent conflict resolution). | @@ -139,22 +153,33 @@ BRAIN_CONFIG=/path/to/brain.json brain serve ## Model tiers -| System RAM | Model | Size | Notes | -|---|---|---|---| -| 4 GB | `qwen2.5-coder:1.5b` | ~900 MB | Default. Works on any dev machine. | -| 4 GB+ | `qwen3:1.7b` | ~1.1 GB | Recommended upgrade. Better reasoning. | -| 8 GB+ | `qwen3:4b` | ~2.5 GB | Power user. Noticeably better summaries. | -| 16 GB+ | `qwen3:8b` | ~5 GB | Enterprise. Best quality, higher latency. | +synapses-intelligence uses a **4-tier nervous system** — each task type runs on the smallest +model capable of doing it well. All four Qwen3.5 models share the same tokenizer and support +`/think` (chain-of-thought) and `/no_think` (fast, deterministic) mode switching. + +| Tier | Name | Model | Thinking | RAM | CPU latency | Tasks | +|---|---|---|---|---|---|---| +| 0 | Reflex | `qwen3.5:0.8b` | off | 1 GB | ~3 s | Ingest (prose briefings), web pruning | +| 1 | Sensory | `qwen3.5:2b` | off | 2 GB | ~5 s | Guardian (violation explanations) | +| 2 | Specialist | `qwen3.5:4b` | on | 4 GB | ~12 s | Enricher (architectural insight) | +| 3 | Architect | `qwen3.5:9b` | on | 8 GB | ~25 s | Orchestrator (multi-agent conflicts) | + +**Minimum setup** (ingest + prune only): pull `qwen3.5:0.8b`. +**Recommended setup** (all features): pull all four models. ```sh -# Pull a specific model -ollama pull qwen3:1.7b +ollama pull qwen3.5:0.8b # required +ollama pull qwen3.5:2b # recommended: guardian +ollama pull qwen3.5:4b # recommended: enricher +ollama pull qwen3.5:9b # optional: orchestrator +``` -# Start brain with a different model -brain serve -model qwen3:1.7b +If a tier model is missing, brain falls back to the `model` config field. To point all tiers +at a single model (e.g. for minimal RAM setups): -# Or set in config -{ "model": "qwen3:1.7b" } +```json +{ "model_ingest": "qwen3.5:0.8b", "model_guardian": "qwen3.5:0.8b", + "model_enrich": "qwen3.5:0.8b", "model_orchestrate": "qwen3.5:0.8b" } ``` --- @@ -269,7 +294,7 @@ brain reset ```sh brain version -# synapses-intelligence v0.3.0 +# synapses-intelligence v0.5.1 ``` --- @@ -298,8 +323,8 @@ Health check and LLM availability probe. ### `POST /v1/ingest` -Generate and store a 1-sentence semantic summary (+ topic tags) for a code entity. -Call this whenever a function, struct, or method is saved. +Generate and store a **2-3 sentence prose briefing** (+ topic tags) for a code entity. +Call this whenever a function, struct, or method is saved. Uses Tier 0 (0.8B). **Request** ```json @@ -328,6 +353,33 @@ Call this whenever a function, struct, or method is saved. --- +### `POST /v1/prune` + +Strip boilerplate from raw web page text (navigation, ads, footers, cookie notices) and +return only the core technical content. Used by synapses-scout as a preprocessing step +before `POST /v1/ingest` to improve distillation quality. Uses Tier 0 (0.8B). Fail-silent: +returns the original content if the LLM is unavailable. + +**Request** +```json +{ + "content": "...3000 chars of raw web page text..." +} +``` + +**Response** +```json +{ + "pruned": "...clean technical paragraphs (~1200 chars)...", + "original_length": 3000, + "pruned_length": 1187 +} +``` + +On LLM error, returns the original content with an `X-Prune-Warning` response header. + +--- + ### `GET /v1/summary/{nodeId}` Fetch the stored summary for a single node. Fast (SQLite, no LLM). @@ -694,14 +746,17 @@ _ = b.LogDecision(ctx, brain.DecisionRequest{ ```go type Brain interface { - // Semantic summaries + // Semantic summaries (Tier 0: 0.8B) Ingest(ctx, IngestRequest) (IngestResponse, error) - Enrich(ctx, EnrichRequest) (EnrichResponse, error) Summary(nodeID string) string + // Web content preprocessing (Tier 0: 0.8B) + Prune(ctx context.Context, content string) (string, error) + // Architectural analysis - ExplainViolation(ctx, ViolationRequest) (ViolationResponse, error) - Coordinate(ctx, CoordinateRequest) (CoordinateResponse, error) + Enrich(ctx, EnrichRequest) (EnrichResponse, error) // Tier 2: 4B + ExplainViolation(ctx, ViolationRequest) (ViolationResponse, error) // Tier 1: 2B + Coordinate(ctx, CoordinateRequest) (CoordinateResponse, error) // Tier 3: 9B // Context Packet BuildContextPacket(ctx, ContextPacketRequest) (*ContextPacket, error) @@ -718,6 +773,7 @@ type Brain interface { // Diagnostics Available() bool ModelName() string + EnsureModel(ctx context.Context, w io.Writer) error } ``` diff --git a/cmd/brain/main.go b/cmd/brain/main.go index aeb8057..3edf5d5 100644 --- a/cmd/brain/main.go +++ b/cmd/brain/main.go @@ -104,7 +104,7 @@ func cmdServe(cfg config.BrainConfig) { } } - srv := server.New(b, cfg.Port) + srv := server.New(b, cfg.Port, cfg.TimeoutMS) // Graceful shutdown on SIGINT/SIGTERM. ctx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM) diff --git a/config/config.go b/config/config.go index 4a05d32..e00c5e0 100644 --- a/config/config.go +++ b/config/config.go @@ -15,20 +15,39 @@ type BrainConfig struct { // OllamaURL is the base URL of the Ollama server. Default: "http://localhost:11434". OllamaURL string `json:"ollama_url,omitempty"` - // Model is the primary Ollama model tag for enrichment and insights. - // Default: "qwen2.5-coder:7b" (~4.5GB, needs 6GB VRAM or 8GB RAM). - // Downgrade options for low-resource systems: - // "qwen2.5-coder:1.5b" — fast, fits in 4GB RAM (~900MB) - // "qwen2.5-coder:3b" — balanced, fits in 6GB RAM (~1.9GB) + // Model is the primary Ollama model tag (enrichment fallback when ModelEnrich is unset). + // Default: "qwen3.5:4b" — fast on CPU (~12s), beats qwen2.5-coder:7b at 1/3 the size. + // Legacy option: "qwen2.5-coder:7b" (~4.5GB, needs 6GB VRAM or 8GB RAM). Model string `json:"model,omitempty"` - // FastModel is the Ollama model tag for bulk ingestion (summarization). - // Bulk ingest runs on every node at index time; use a smaller, faster model. - // Default: "qwen2.5-coder:1.5b" (~900MB). - // Set to "" to use Model for both ingest and enrichment. + // FastModel is the Ollama model tag for bulk ingestion (fallback when ModelIngest is unset). + // Default: "qwen3.5:0.8b" — runs in <3s on CPU, fits in 2GB RAM. + // Legacy option: "qwen2.5-coder:1.5b" (~900MB). FastModel string `json:"fast_model,omitempty"` - // TimeoutMS is the per-request LLM timeout in milliseconds. Default: 3000. + // --- Tiered Nervous System: per-task model assignment --- + // Each tier defaults to the appropriate Qwen3.5 model. + // Set to "" to fall back to FastModel/Model. All 4 can point to the same model. + + // ModelIngest is the model for bulk node summarization at index time. + // Tier 0 (Reflex): simple extraction, no reasoning needed. Default: "qwen3.5:0.8b". + ModelIngest string `json:"model_ingest,omitempty"` + + // ModelGuardian is the model for rule violation explanations. + // Tier 1 (Sensory): structured plain-English output. Default: "qwen3.5:2b". + ModelGuardian string `json:"model_guardian,omitempty"` + + // ModelEnrich is the model for architectural enrichment and insight generation. + // Tier 2 (Specialist): complex analysis across multiple callers/callees. Default: "qwen3.5:4b". + ModelEnrich string `json:"model_enrich,omitempty"` + + // ModelOrchestrate is the model for multi-agent conflict resolution. + // Tier 3 (Architect): deep reasoning about competing scope claims. Default: "qwen3.5:9b". + ModelOrchestrate string `json:"model_orchestrate,omitempty"` + + // TimeoutMS is the per-request LLM timeout in milliseconds. + // The HTTP server WriteTimeout is set to 2× this value. Default: 60000 (60s). + // Must exceed the slowest LLM inference time on your hardware (~25s for 9b CPU). TimeoutMS int `json:"timeout_ms,omitempty"` // DBPath is the path to the brain's own SQLite database. @@ -63,21 +82,25 @@ type BrainConfig struct { func DefaultConfig() BrainConfig { home, _ := os.UserHomeDir() return BrainConfig{ - Enabled: false, - OllamaURL: "http://localhost:11434", - Model: "qwen2.5-coder:7b", - FastModel: "qwen2.5-coder:1.5b", - TimeoutMS: 30000, - DBPath: filepath.Join(home, ".synapses", "brain.sqlite"), - Port: 11435, - Ingest: true, - Enrich: true, - Guardian: true, - Orchestrate: true, - ContextBuilder: true, - LearningEnabled: true, - DefaultPhase: "development", - DefaultMode: "standard", + Enabled: false, + OllamaURL: "http://localhost:11434", + Model: "qwen3.5:4b", + FastModel: "qwen3.5:0.8b", + ModelIngest: "qwen3.5:0.8b", + ModelGuardian: "qwen3.5:2b", + ModelEnrich: "qwen3.5:4b", + ModelOrchestrate: "qwen3.5:9b", + TimeoutMS: 60000, + DBPath: filepath.Join(home, ".synapses", "brain.sqlite"), + Port: 11435, + Ingest: true, + Enrich: true, + Guardian: true, + Orchestrate: true, + ContextBuilder: true, + LearningEnabled: true, + DefaultPhase: "development", + DefaultMode: "standard", } } @@ -120,18 +143,32 @@ func LoadFile(path string) (BrainConfig, error) { } // applyDefaults fills in zero values with defaults. +// Tier models fall back to the legacy fast_model/model fields if unset. func (c *BrainConfig) applyDefaults() { if c.OllamaURL == "" { c.OllamaURL = "http://localhost:11434" } if c.Model == "" { - c.Model = "qwen2.5-coder:7b" + c.Model = "qwen3.5:4b" } if c.FastModel == "" { - c.FastModel = "qwen2.5-coder:1.5b" + c.FastModel = "qwen3.5:0.8b" + } + // Tier fallback chain: tier model → legacy field → hardcoded default + if c.ModelIngest == "" { + c.ModelIngest = c.FastModel + } + if c.ModelGuardian == "" { + c.ModelGuardian = "qwen3.5:2b" + } + if c.ModelEnrich == "" { + c.ModelEnrich = c.Model + } + if c.ModelOrchestrate == "" { + c.ModelOrchestrate = c.Model } if c.TimeoutMS <= 0 { - c.TimeoutMS = 30000 + c.TimeoutMS = 60000 } if c.Port <= 0 { c.Port = 11435 diff --git a/improvement.md b/improvement.md new file mode 100644 index 0000000..b9fd901 --- /dev/null +++ b/improvement.md @@ -0,0 +1,180 @@ +# synapses-intelligence improvement log + +## v0.5.1 — Tiered Nervous System + Scout Prune Pipeline (2026-03-03) + +### Changes + +#### P0 — Fix WriteTimeout (BUG-I01 closed) +`server/server.go`: `WriteTimeout` now set to `2 × cfg.TimeoutMS` instead of hardcoded 30s. +`config/config.go`: Default `TimeoutMS` raised from 30000 → 60000ms. +All 3 broken endpoints (`/v1/enrich`, `/v1/explain-violation`, `/v1/coordinate`) are now +reachable on CPU. Closes BUG-I01. + +#### P1 — Four-Tier Model Config +`config/config.go`: Added 4 new tier model fields: +- `ModelIngest` (Tier 0 Reflex, default `qwen3.5:0.8b`) +- `ModelGuardian` (Tier 1 Sensory, default `qwen3.5:2b`) +- `ModelEnrich` (Tier 2 Specialist, default `qwen3.5:4b`) +- `ModelOrchestrate` (Tier 3 Architect, default `qwen3.5:9b`) +Backward-compatible fallback chain in `applyDefaults()`: missing tier fields fall back to +`fast_model` (ingest) or `model` (others). + +#### P2 — Route Handlers to Correct Tier +`pkg/brain/brain.go`: `New()` creates 4 separate `OllamaClient` instances, one per tier. +- Ingestor → Tier 0 (0.8B): bulk summarization, fast and cheap +- Guardian → Tier 1 (2B): violation explanations, was broken with 7b, now <5s +- Enricher → Tier 2 (4B): architectural insight, ~12s on CPU +- Orchestrator → Tier 3 (9B): multi-agent conflict resolution, ~25s on CPU + +#### P3 — ThinkingBudget per Tier (Qwen3.5 /think mode) +`internal/llm/ollama.go`: +- Added `think bool` field + `WithThinking(enabled bool)` builder method +- `Generate()` prepends `/think\n\n` or `/no_think\n\n` to the prompt +- `...` blocks in responses are stripped via `thinkTagRe` regex +Tier 0 (ingest) + Tier 1 (guardian): `thinking=false` — fast, deterministic +Tier 2 (enrich) + Tier 3 (orchestrate): `thinking=true` — deeper reasoning + +#### P7 — /v1/prune Scout Preprocessing Endpoint +New package `internal/pruner/pruner.go`: strips web page boilerplate (navigation, ads, +footers, cookie banners) using Tier 0 model. Returns clean technical paragraphs as plain text. +`server/server.go`: `POST /v1/prune` handler added. +`pkg/brain/brain.go`: `Prune(ctx, content) (string, error)` added to Brain interface and impl. +`pkg/brain/null.go`: NullBrain.Prune() returns original content unchanged. + +**Effect:** Scout raw web content (3000 chars) → pruned clean signal (~1200 chars) → 4B +distillation sees only valuable content → better summaries, faster inference. + +--- + +## v0.4.0 — E2E Test Run (2026-03-03) + +--- + +### CRITICAL BUGS + +#### BUG-I01 — HTTP WriteTimeout kills all 7b-model endpoints silently +**Severity:** Critical +**Root cause:** `server/server.go` line 52: +```go +WriteTimeout: 30 * time.Second, // LLM calls can take up to ~3s +``` +The comment is **stale**. The enricher/guardian/orchestrator use `qwen2.5-coder:7b` +which takes **30-40 seconds** on CPU. The HTTP write timeout fires at exactly the +same time as the LLM context deadline → the handler never writes a response → +the client receives an empty body. +**Affected endpoints:** `/v1/enrich`, `/v1/explain-violation`, `/v1/coordinate` +**Evidence:** All three return empty body after exactly 30.1s. +**Fix options (in priority order):** + 1. Make `WriteTimeout` configurable: `server.WriteTimeout = time.Duration(cfg.TimeoutMS*2) * time.Millisecond` + 2. Remove `WriteTimeout` entirely for LLM endpoints and rely on the enricher's + context deadline instead (set per-handler timeouts via `http.TimeoutHandler`). + 3. Short-term workaround: increase default TimeoutMS in config to 60000 and + set WriteTimeout to 90s. + +#### BUG-I02 — Ingest summary truncated mid-sentence +**Severity:** High +**Observed:** `/v1/ingest` response contains `"summary": "...The tags associated…"` +— summary cut at `NumPredict` token limit. The stored summary in brain.sqlite +is incomplete and propagates into context-packet `root_summary` with a trailing `…`. +**Root cause:** `NumPredict` in `internal/llm/ollama.go` is set to 250 tokens. +Summaries for complex code entities exceed this limit. +**Fix:** Increase `NumPredict` to 400-500. Or use a two-phase approach: request +a JSON object with `{"summary": "..."}` and set NumPredict to stop cleanly at +the closing brace. + +#### BUG-I03 — Intelligence ingestor prompt uses "code entity" for all node types +**Severity:** Medium +**Observed:** When scout sends a web article for distillation with +`node_type: "web article"`, the returned summary says "This **code entity** +provides information about the Model Context Protocol...". The ingestor prompt +template hardcodes "code entity" regardless of `node_type`. +**Root cause:** `internal/ingestor/ingestor.go` `buildPrompt()` likely uses a +static phrase. +**Fix:** Use the `node_type` field in the prompt: +```go +fmt.Sprintf("Summarize this %s in 1-2 sentences: %s\n\n%s", nodeType, name, code) +``` +This makes summaries contextually correct for web articles, YouTube videos, and +search result sets. + +#### BUG-I04 — Tags always empty in ingest response +**Severity:** Medium +**Observed:** Scout receives `"tags": []` from every ingest call. The +`IngestResponse` has a `Tags []string` field but it's never populated. +**Root cause:** The ingestor prompt doesn't ask for tags, and `parseInsight`/ +`parseSummary` don't extract them. +**Fix:** Add to the ingest prompt: "Also list 2-3 relevant tags as a +comma-separated list on the last line, prefixed with 'tags:'". Parse and return. +Tags improve search relevance and categorisation in brain.sqlite. + +--- + +### ARCHITECTURE ISSUES + +#### ARCH-I01 — packet_quality ceiling of 0.4 on CPU-only deployments +**Observed:** `packet_quality: 0.4` means only `root_summary` is present. +`insight` (requires `/v1/enrich`) and `dep_summaries` (requires bulk ingest of +callee nodes) are missing because enrich times out on CPU with 7b model. +**Impact:** On a GPU-less machine, context packets are never fully enriched. +This is the primary use case for indie developers. +**Improvements:** + 1. For enrichment, default to `qwen2.5-coder:1.5b` (same as fast ingest). + Quality drops ~20% but latency drops from 30s to 3-5s. + 2. Add an `enrich_model` config field separate from `model` (primary). + 3. Or implement async enrichment: return `packet_quality: 0.4` immediately, + enrich in background, store result, return from cache on next call. + +#### ARCH-I02 — No request logging in brain HTTP server +**Observed:** `/tmp/brain.log` only shows startup messages. There are no +per-request logs for ingest/enrich timings, LLM latencies, or errors. +**Impact:** Debugging timeouts, slow responses, and LLM errors is blind. +**Fix:** Add a simple logging middleware that logs: +``` +2026-03-03T13:10:31Z POST /v1/ingest node=cmdStart latency=16.2s status=200 +2026-03-03T13:11:00Z POST /v1/enrich root=cmdStart latency=30.1s status=500 (timeout) +``` + +#### ARCH-I03 — `/v1/context-packet` requires full SnapshotInput; not ergonomic for standalone use +**Observed:** Calling `/v1/context-packet` without the correct nested +`snapshot` structure returns `entity_name: ""` and `packet_quality: 0`. +The schema requires deeply nested JSON that differs from the simpler ingest +schema. Callers need to read the code to discover the correct structure. +**Fix:** Add a simpler `GET /v1/context-packet?node_id=X&name=Y` endpoint that +wraps the POST endpoint with sensible defaults for all nested fields. + +#### ARCH-I04 — Coordinate endpoint schema undiscoverable +**Observed:** Calling `/v1/coordinate` with an intuitive `{"agents":[...], "shared_entities":[...]}` +schema returns `{"error": "new_agent_id and new_scope are required"}`. The actual +schema (`new_agent_id`, `new_scope`, `conflicting_claims`) is only discoverable +by reading Go types — it's not documented in the API response or README. +**Fix:** Return a detailed error with the expected schema in the error message, +or add a `GET /v1/coordinate/schema` endpoint. + +--- + +### PERFORMANCE NOTES (CPU / no GPU) + +| Endpoint | Model | Latency | Status | +|----------|-------|---------|--------| +| `/v1/ingest` | qwen2.5-coder:1.5b | ~16s | ✅ Works | +| `/v1/enrich` | qwen2.5-coder:7b | >30s | ❌ Fails (WriteTimeout) | +| `/v1/explain-violation` | qwen2.5-coder:7b | >30s | ❌ Fails | +| `/v1/coordinate` | qwen2.5-coder:7b | >30s | ❌ Fails | +| `/v1/context-packet` (no LLM) | SQLite | <1ms | ✅ Works | +| `/v1/decision` | SQLite | <1ms | ✅ Works | +| `/v1/patterns` | SQLite | <1ms | ✅ Works | +| `/v1/sdlc` | SQLite | <1ms | ✅ Works | + +**Recommendation:** On CPU-only machines, configure all LLM endpoints to use +`qwen2.5-coder:1.5b` with a 20s timeout. The 7b model is unusable without GPU. + +--- + +### WHAT WORKS WELL ✅ + +- Decision log + co-occurrence pattern learning: instant, deterministic. +- Context packets (no-LLM path): instant, structured, SDLC-aware. +- SDLC phase/mode management: correct, persisted, multi-agent safe. +- Pattern hints in context-packet: correctly surfaces co-change suggestions. +- Fail-silent pattern: intelligence unavailability never crashes synapses. +- Brain.sqlite: lightweight, portable, no daemon dependency. diff --git a/internal/ingestor/ingestor.go b/internal/ingestor/ingestor.go index 16486af..76920ff 100644 --- a/internal/ingestor/ingestor.go +++ b/internal/ingestor/ingestor.go @@ -20,15 +20,14 @@ import ( const ( // maxCodeChars is the maximum code snippet size sent to the LLM. - // Keeps prompts small for fast inference on 1-2B models. + // Keeps prompts small for fast inference on 0.8-2B models. maxCodeChars = 500 - // promptTemplate is tuned for small models: - // - Imperative instruction first - // - Strict JSON-only output format - // - No markdown, no preamble - // tags: 1-3 short domain labels e.g. ["auth","http","database"] - promptTemplate = `Describe what this code entity does in ONE sentence. Add 1-3 short domain tags. + // promptTemplate generates a prose briefing suitable for LLM context delivery. + // 2-3 sentences covering: what it does, its role, and any important concerns. + // The summary replaces verbose raw code/doc in get_context responses, giving + // Claude natural-language context that costs far fewer tokens than JSON. + promptTemplate = `Write a 2-3 sentence technical briefing for this code entity: what it does, its role in the system, and any important patterns or concerns to be aware of. Output ONLY valid JSON with no other text: {"summary": "...", "tags": ["tag1"]} Name: %s (%s, package %s) diff --git a/internal/llm/ollama.go b/internal/llm/ollama.go index 54d7a34..1faeab9 100644 --- a/internal/llm/ollama.go +++ b/internal/llm/ollama.go @@ -7,16 +7,25 @@ import ( "fmt" "io" "net/http" + "regexp" "strings" "time" ) +// thinkTagRe strips Qwen3.5 extended thinking output (... blocks). +var thinkTagRe = regexp.MustCompile(`(?s).*?`) + // OllamaClient calls the Ollama REST API at POST /api/generate. // It keeps a reusable http.Client for connection pooling. type OllamaClient struct { baseURL string model string httpClient *http.Client + // think controls Qwen3.5 extended thinking mode. + // When true, "/think\n\n" is prepended to the prompt (deeper reasoning). + // When false, "/no_think\n\n" is prepended (faster, no chain-of-thought). + // Models that don't support thinking mode silently ignore the prefix. + think bool } // NewOllamaClient creates a client targeting the given Ollama base URL and model. @@ -34,6 +43,14 @@ func NewOllamaClient(baseURL, model string, timeoutMS int) *OllamaClient { } } +// WithThinking configures extended thinking mode for Qwen3.5 models. +// Call on construction: llm.NewOllamaClient(...).WithThinking(true) +// Returns the client to allow chaining. +func (c *OllamaClient) WithThinking(enabled bool) *OllamaClient { + c.think = enabled + return c +} + // ollamaRequest is the payload for POST /api/generate. type ollamaRequest struct { Model string `json:"model"` @@ -58,7 +75,16 @@ type ollamaResponse struct { // Generate sends a prompt and returns the response text. // Uses stream=false for simplicity and lowest latency on small outputs. +// If thinking mode is configured, prepends /think or /no_think to the prompt +// (Qwen3.5 extended reasoning control) and strips ... from output. func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, error) { + // Apply Qwen3.5 thinking mode prefix. Models that don't support this ignore it. + if c.think { + prompt = "/think\n\n" + prompt + } else { + prompt = "/no_think\n\n" + prompt + } + reqBody := ollamaRequest{ Model: c.model, Prompt: prompt, @@ -103,7 +129,10 @@ func (c *OllamaClient) Generate(ctx context.Context, prompt string) (string, err return "", fmt.Errorf("ollama error: %s", result.Error) } - return strings.TrimSpace(result.Response), nil + // Strip extended thinking blocks (...) that Qwen3.5 emits + // when thinking mode is enabled. The actual answer follows after the block. + response := thinkTagRe.ReplaceAllString(result.Response, "") + return strings.TrimSpace(response), nil } // Available checks if Ollama is reachable by calling GET /api/tags. diff --git a/internal/pruner/pruner.go b/internal/pruner/pruner.go new file mode 100644 index 0000000..f0d4ae4 --- /dev/null +++ b/internal/pruner/pruner.go @@ -0,0 +1,87 @@ +// Package pruner strips boilerplate from web content using the Tier 0 (0.8B) model. +// +// Web pages contain 30-50% non-technical noise: navigation menus, cookie banners, +// footers, sidebars, and ads. Sending this noise to the distillation pipeline wastes +// LLM compute and dilutes the resulting summary. The Pruner extracts only the +// core technical paragraphs before handing content to the Ingestor. +// +// This is a Tier 0 (Reflex) task: simple extraction, no reasoning, no JSON output. +// The 0.8B model is fast enough (<3s on CPU) and accurate enough for this job. +package pruner + +import ( + "context" + "fmt" + "strings" + "time" + "unicode/utf8" + + "github.com/SynapsesOS/synapses-intelligence/internal/llm" +) + +// maxInputChars is the maximum raw content size sent to the LLM. +// Matches the scout distiller's _DISTILL_MAX_CHARS constant (3000). +const maxInputChars = 3_000 + +// promptTemplate instructs the 0.8B model to extract technical content. +// Plain-text output (no JSON) keeps the task simple and maximises accuracy +// for small models. The caller uses the raw response directly. +const promptTemplate = `Extract only the core technical content from this web page text. +Remove navigation menus, advertisements, footers, cookie notices, and sidebars. +Return only the key technical paragraphs and information as plain text. Be concise. + +Text: +%s` + +// Pruner strips boilerplate from web page text using a small LLM. +type Pruner struct { + llm llm.LLMClient + timeout time.Duration +} + +// New creates a Pruner backed by the given LLM client. +// timeout is the per-request deadline; defaults to 10s if <= 0. +func New(client llm.LLMClient, timeout time.Duration) *Pruner { + if timeout <= 0 { + timeout = 10 * time.Second + } + return &Pruner{llm: client, timeout: timeout} +} + +// Prune extracts core technical content from raw web page text. +// Returns the pruned content, or the original content if the LLM call fails. +// The returned string is always non-empty if input was non-empty. +func (p *Pruner) Prune(ctx context.Context, content string) (string, error) { + content = strings.TrimSpace(content) + if content == "" { + return "", nil + } + + // Truncate to keep the prompt within limits for small models. + truncated := truncate(content, maxInputChars) + + ctx, cancel := context.WithTimeout(ctx, p.timeout) + defer cancel() + + prompt := fmt.Sprintf(promptTemplate, truncated) + result, err := p.llm.Generate(ctx, prompt) + if err != nil { + // Fail-silent: return original content so the caller can proceed. + return content, fmt.Errorf("pruner llm: %w", err) + } + + result = strings.TrimSpace(result) + if result == "" { + return content, nil // empty response — fall back to original + } + return result, nil +} + +// truncate caps the string at maxChars runes, appending "..." if truncated. +func truncate(s string, maxChars int) string { + if utf8.RuneCountInString(s) <= maxChars { + return s + } + runes := []rune(s) + return string(runes[:maxChars]) + "..." +} diff --git a/pkg/brain/brain.go b/pkg/brain/brain.go index 2e9645a..906e388 100644 --- a/pkg/brain/brain.go +++ b/pkg/brain/brain.go @@ -12,6 +12,7 @@ import ( "github.com/SynapsesOS/synapses-intelligence/internal/ingestor" "github.com/SynapsesOS/synapses-intelligence/internal/llm" "github.com/SynapsesOS/synapses-intelligence/internal/orchestrator" + "github.com/SynapsesOS/synapses-intelligence/internal/pruner" "github.com/SynapsesOS/synapses-intelligence/internal/sdlc" "github.com/SynapsesOS/synapses-intelligence/internal/store" ) @@ -78,6 +79,11 @@ type Brain interface { // If trigger is non-empty, only patterns with that trigger are returned. // limit caps the number of results (0 = default of 20). GetPatterns(trigger string, limit int) []PatternHint + + // Prune strips boilerplate (navigation, ads, footers) from raw web page text + // using the Tier 0 (0.8B) model. Returns cleaned technical content. + // Falls back to returning the original content if the LLM is unavailable. + Prune(ctx context.Context, content string) (string, error) } // impl is the production Brain backed by Ollama + SQLite. @@ -89,6 +95,7 @@ type impl struct { enricher *enricher.Enricher guardian *guardian.Guardian orchestrator *orchestrator.Orchestrator + pruner *pruner.Pruner sdlcMgr *sdlc.Manager builder *contextbuilder.Builder learner *contextbuilder.Learner @@ -102,16 +109,16 @@ func New(cfg config.BrainConfig) Brain { return &NullBrain{} } - // Primary model: used for enrichment and insights (7b default). - ollamaClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.Model, cfg.TimeoutMS) - - // Fast model: used for bulk ingest/summarization (1.5b default). - // Falls back to primary model if FastModel is unset. - fastModel := cfg.FastModel - if fastModel == "" { - fastModel = cfg.Model - } - fastClient := llm.NewOllamaClient(cfg.OllamaURL, fastModel, cfg.TimeoutMS) + // Tiered Nervous System: each task type uses the model best suited to its complexity. + // Thinking mode (Qwen3.5 /think prefix) is disabled for fast tiers and enabled for deep tiers. + // Tier 0 (Reflex) — ingest: fast summarization, no reasoning. Default: qwen3.5:0.8b. + ingestClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelIngest, cfg.TimeoutMS).WithThinking(false) + // Tier 1 (Sensory) — guardian: plain-English violation explanations. Default: qwen3.5:2b. + guardianClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelGuardian, cfg.TimeoutMS).WithThinking(false) + // Tier 2 (Specialist) — enricher: architectural insight + concerns. Default: qwen3.5:4b. + enrichClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelEnrich, cfg.TimeoutMS).WithThinking(true) + // Tier 3 (Architect) — orchestrator: multi-agent conflict resolution. Default: qwen3.5:9b. + orchestrateClient := llm.NewOllamaClient(cfg.OllamaURL, cfg.ModelOrchestrate, cfg.TimeoutMS).WithThinking(true) st, err := store.Open(cfg.DBPath) if err != nil { @@ -121,17 +128,18 @@ func New(cfg config.BrainConfig) Brain { timeout := time.Duration(cfg.TimeoutMS) * time.Millisecond - enr := enricher.New(ollamaClient, st, timeout) + enr := enricher.New(enrichClient, st, timeout) mgr := sdlc.NewManager(st) b := &impl{ cfg: cfg, - llm: ollamaClient, + llm: enrichClient, // primary client used for Available() / ModelName() store: st, - ingestor: ingestor.New(fastClient, st, timeout), + ingestor: ingestor.New(ingestClient, st, timeout), enricher: enr, - guardian: guardian.New(ollamaClient, st, timeout), - orchestrator: orchestrator.New(ollamaClient, timeout), + guardian: guardian.New(guardianClient, st, timeout), + orchestrator: orchestrator.New(orchestrateClient, timeout), + pruner: pruner.New(ingestClient, timeout), // Tier 0: 0.8B, same as ingest sdlcMgr: mgr, builder: contextbuilder.New(st, mgr, enr), learner: contextbuilder.NewLearner(st), @@ -229,6 +237,10 @@ func (b *impl) Coordinate(ctx context.Context, req CoordinateRequest) (Coordinat }, nil } +func (b *impl) Prune(ctx context.Context, content string) (string, error) { + return b.pruner.Prune(ctx, content) +} + func (b *impl) Summary(nodeID string) string { if b.store == nil { return "" diff --git a/pkg/brain/null.go b/pkg/brain/null.go index 5345b91..923926d 100644 --- a/pkg/brain/null.go +++ b/pkg/brain/null.go @@ -63,3 +63,6 @@ func (n *NullBrain) GetSDLCConfig() SDLCConfig { // GetPatterns returns nil — no patterns are stored when brain is disabled. func (n *NullBrain) GetPatterns(_ string, _ int) []PatternHint { return nil } + +// Prune returns the original content unchanged — no LLM is available. +func (n *NullBrain) Prune(_ context.Context, content string) (string, error) { return content, nil } diff --git a/server/server.go b/server/server.go index 36ec6ff..8de084f 100644 --- a/server/server.go +++ b/server/server.go @@ -27,7 +27,9 @@ type Server struct { } // New creates a Server that delegates to the given Brain. -func New(b brain.Brain, port int) *Server { +// timeoutMS is the configured LLM timeout; WriteTimeout is set to 2× this value +// so LLM handlers always have time to write their response after inference completes. +func New(b brain.Brain, port int, timeoutMS int) *Server { s := &Server{brain: b, port: port} mux := http.NewServeMux() mux.HandleFunc("GET /v1/health", s.handleHealth) @@ -36,6 +38,7 @@ func New(b brain.Brain, port int) *Server { mux.HandleFunc("POST /v1/enrich", s.handleEnrich) mux.HandleFunc("POST /v1/explain-violation", s.handleExplainViolation) mux.HandleFunc("POST /v1/coordinate", s.handleCoordinate) + mux.HandleFunc("POST /v1/prune", s.handlePrune) // v0.2.0 endpoints mux.HandleFunc("POST /v1/context-packet", s.handleContextPacket) @@ -45,11 +48,12 @@ func New(b brain.Brain, port int) *Server { mux.HandleFunc("POST /v1/decision", s.handleLogDecision) mux.HandleFunc("GET /v1/patterns", s.handleGetPatterns) + writeTimeout := time.Duration(timeoutMS*2) * time.Millisecond s.server = &http.Server{ Addr: fmt.Sprintf("127.0.0.1:%d", port), Handler: mux, ReadTimeout: 10 * time.Second, - WriteTimeout: 30 * time.Second, // LLM calls can take up to ~3s + WriteTimeout: writeTimeout, IdleTimeout: 60 * time.Second, } return s @@ -176,6 +180,32 @@ func (s *Server) handleCoordinate(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, resp) } +func (s *Server) handlePrune(w http.ResponseWriter, r *http.Request) { + var req struct { + Content string `json:"content"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + writeError(w, http.StatusBadRequest, "invalid request body: "+err.Error()) + return + } + if req.Content == "" { + writeError(w, http.StatusBadRequest, "content is required") + return + } + + pruned, err := s.brain.Prune(r.Context(), req.Content) + if err != nil { + // Non-fatal: return original content with a warning header. + pruned = req.Content + w.Header().Set("X-Prune-Warning", err.Error()) + } + writeJSON(w, http.StatusOK, map[string]interface{}{ + "pruned": pruned, + "original_length": len(req.Content), + "pruned_length": len(pruned), + }) +} + // --- v0.2.0 Handlers --- // handleContextPacket assembles a Context Packet for the calling agent.