From 839774616483c3dec00b4b981a951eaa94e1365d Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Sun, 22 Mar 2026 14:24:39 +0000 Subject: [PATCH] feat: improve FTS5 search foundations - Create src/search.ts with centralized ftsQuery/ftsQueryOr functions - Add stopword filtering (conservative list, preserves domain terms) - Drop single-char tokens (contraction artifacts) but keep 2-char+ terms - Implement AND-then-OR fallback: AND first for precision, OR when AND returns nothing - Fix knowledge search to use BM25 rank instead of updated_at DESC - Uses bm25() with column weights: title=6.0, content=2.0, category=3.0 - JOIN pattern instead of subquery for proper rank access - Add distillation_fts table (schema migration v7) - FTS5 on observations column with porter unicode61 tokenizer - Backfill existing data, sync triggers for INSERT/UPDATE/DELETE - Replace LIKE-based distillation search with FTS5 ranked search - Add 'too vague' handling in recall tool for all-stopword queries - Remove ftsQuery from temporal.ts (now in search.ts, no re-export) --- AGENTS.md | 9 +++ src/db.ts | 31 +++++++- src/ltm.ts | 72 +++++++++-------- src/reflect.ts | 84 +++++++++++++++----- src/search.ts | 174 ++++++++++++++++++++++++++++++++++++++++++ src/temporal.ts | 29 +++---- test/db.test.ts | 20 ++++- test/ltm.test.ts | 59 ++++++++++++++ test/search.test.ts | 134 ++++++++++++++++++++++++++++++++ test/temporal.test.ts | 7 +- 10 files changed, 552 insertions(+), 67 deletions(-) create mode 100644 src/search.ts create mode 100644 test/search.test.ts diff --git a/AGENTS.md b/AGENTS.md index 5b66e38..02d9014 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,6 +12,9 @@ * **Lore DB uses incremental auto\_vacuum to prevent free-page bloat**: Lore's SQLite DB uses incremental auto\_vacuum (schema version 3 migration) to prevent free-page bloat from deletions. The migration sets PRAGMA auto\_vacuum = INCREMENTAL then VACUUM outside a transaction. 
temporal\_messages is the primary storage consumer (~51MB); knowledge table is tiny. + +* **Lore search pipeline: FTS5 with AND-then-OR fallback and RRF fusion**: Lore's search overhaul (planned/in-progress) replaces three independent search systems with a unified pipeline in \`src/search.ts\`. Key design: \`ftsQuery()\` builds AND queries (primary), \`ftsQueryOr()\` builds OR queries (fallback only when AND returns zero results). Blanket OR was rejected empirically — it adds noise even with stopword filtering. Conservative stopword list excludes domain terms like 'handle', 'state', 'type'. FTS5 rank is negative (more negative = better); \`ORDER BY rank\` sorts best first. \`bm25()\` with column weights (title=6, content=2, category=3) verified working in Bun's SQLite. Recall tool uses Reciprocal Rank Fusion (k=60) across knowledge, temporal, and distillation sources. \`forSession()\` scoring uses OR (not AND-then-OR) because it's ranking all candidates, not searching for exact matches — BM25 naturally weights multi-term matches higher. + * **Lore temporal pruning runs after distillation and curation on session.idle**: In src/index.ts, session.idle awaits backgroundDistill and backgroundCurate sequentially before running temporal.prune(). Ordering is critical: pruning must not delete unprocessed messages. Pruning defaults: 120-day retention, 1GB max storage (in .lore.json under pruning.retention and pruning.maxStorage). These generous defaults were chosen because the system was new — earlier proposals of 7d/200MB were based on insufficient data. @@ -23,6 +26,9 @@ * **Curator prompt scoped to code-relevant knowledge only**: CURATOR\_SYSTEM in src/prompt.ts now explicitly excludes: general ecosystem knowledge available online, business strategy and marketing positioning, product pricing models, third-party tool details not needed for development, and personal contact information. 
This was added after the curator extracted entries about OpenWork integration strategy (including an email address), Lore Cloud pricing tiers, and AGENTS.md ecosystem facts — none of which help an agent write code. The curatorUser() function also appends guidance to prefer updating existing entries over creating new ones for the same concept, reducing duplicate creation. + +* **Lore plugin cannot use native Node addons — pure bun:sqlite only**: Lore is a Bun plugin library (\`main: 'src/index.ts'\`, Plugin type) running inside OpenCode's compiled Bun binary. It has no build step and cannot use native Node addons (no better-sqlite3, no node-llama-cpp, no sqlite-vec). Dependencies must be pure JS/TS or Bun built-ins. This rules out QMD as a library dependency (requires better-sqlite3 + node-llama-cpp + sqlite-vec). QMD's search patterns (BM25 + vector + RRF + reranking) are adapted for pure FTS5 instead. Vector/embedding search would need to use OpenCode's existing chat providers rather than local GGUF models. + ### Gotcha @@ -31,6 +37,9 @@ * **Lore auto-recovery can infinite-loop without re-entrancy guard**: Three v0.5.2 bugs causing excessive background LLM requests: (1) Auto-recovery loop — session.error handler injected recovery prompt → could overflow again → loop. Fix: recoveringSessions Set as re-entrancy guard. (2) Curator ran every idle — \`onIdle || afterTurns\` short-circuited (onIdle=true). Fix: \`||\` → \`&&\`. Lesson: boolean flag gating numeric threshold needs AND not OR. (3) shouldSkip() fell back to session.list() on unknown sessions. Fix: remove list fallback, cache in activeSessions. + +* **Lore knowledge FTS search was sorted by updated\_at, not BM25 relevance**: In \`ltm.search()\`, knowledge FTS results were ordered by \`k.updated\_at DESC\` instead of FTS5 BM25 rank — most recently edited won over most relevant. 
Fix: replace the \`WHERE k.rowid IN (SELECT rowid FROM knowledge\_fts ...)\` subquery pattern with a JOIN that exposes \`rank\`, then \`ORDER BY bm25(knowledge\_fts, 6.0, 2.0, 3.0)\`. Also: distillations had no FTS table at all (LIKE-only search), fixed by adding \`distillation\_fts\` in schema migration v7 with backfill and sync triggers. + * **Test DB isolation via LORE\_DB\_PATH and Bun test preload**: Lore test suite uses isolated temp DB via test/setup.ts preload (bunfig.toml). Preload sets LORE\_DB\_PATH to mkdtempSync path before any imports of src/db.ts; afterAll cleans up. src/db.ts checks LORE\_DB\_PATH first. agents-file.test.ts needs beforeEach cleanup for intra-file isolation and TEST\_UUIDS cleanup in afterAll (shared with ltm.test.ts). Individual test files don't need close() calls — preload handles DB lifecycle. diff --git a/src/db.ts b/src/db.ts index 561beb4..3970667 100644 --- a/src/db.ts +++ b/src/db.ts @@ -2,7 +2,7 @@ import { Database } from "bun:sqlite"; import { join, dirname } from "path"; import { mkdirSync } from "fs"; -const SCHEMA_VERSION = 6; +const SCHEMA_VERSION = 7; const MIGRATIONS: string[] = [ ` @@ -179,6 +179,35 @@ const MIGRATIONS: string[] = [ DROP INDEX IF EXISTS idx_temporal_distilled; DROP INDEX IF EXISTS idx_distillation_project; `, + ` + -- Version 7: FTS5 for distillations — enables ranked search instead of LIKE. 
+ CREATE VIRTUAL TABLE IF NOT EXISTS distillation_fts USING fts5( + observations, + content=distillations, + content_rowid=rowid, + tokenize='porter unicode61' + ); + + -- Backfill existing data (skip empty observations from schema v1→v2 migration) + INSERT INTO distillation_fts(rowid, observations) + SELECT rowid, observations FROM distillations WHERE observations != ''; + + -- Sync triggers + CREATE TRIGGER IF NOT EXISTS distillation_fts_insert AFTER INSERT ON distillations BEGIN + INSERT INTO distillation_fts(rowid, observations) VALUES (new.rowid, new.observations); + END; + + CREATE TRIGGER IF NOT EXISTS distillation_fts_delete AFTER DELETE ON distillations BEGIN + INSERT INTO distillation_fts(distillation_fts, rowid, observations) + VALUES('delete', old.rowid, old.observations); + END; + + CREATE TRIGGER IF NOT EXISTS distillation_fts_update AFTER UPDATE ON distillations BEGIN + INSERT INTO distillation_fts(distillation_fts, rowid, observations) + VALUES('delete', old.rowid, old.observations); + INSERT INTO distillation_fts(rowid, observations) VALUES (new.rowid, new.observations); + END; + `, ]; function dataDir() { diff --git a/src/ltm.ts b/src/ltm.ts index 5f46287..df76ac4 100644 --- a/src/ltm.ts +++ b/src/ltm.ts @@ -1,6 +1,6 @@ import { uuidv7 } from "uuidv7"; import { db, ensureProject } from "./db"; -import { ftsQuery } from "./temporal"; +import { ftsQuery, ftsQueryOr, EMPTY_QUERY } from "./search"; // ~3 chars per token — validated as best heuristic against real API data. function estimateTokens(text: string): number { @@ -364,6 +364,9 @@ function searchLike(input: { .all(...likeParams, input.limit) as KnowledgeEntry[]; } +/** BM25 column weights for knowledge_fts: title, content, category. */ +const FTS_WEIGHTS = { title: 6.0, content: 2.0, category: 3.0 }; + export function search(input: { query: string; projectPath?: string; @@ -371,37 +374,46 @@ export function search(input: { }): KnowledgeEntry[] { const limit = input.limit ?? 
20; const q = ftsQuery(input.query); - if (input.projectPath) { - const pid = ensureProject(input.projectPath); - try { - return db() - .query( - `SELECT k.* FROM knowledge k - WHERE k.rowid IN (SELECT rowid FROM knowledge_fts WHERE knowledge_fts MATCH ?) - AND (k.project_id = ? OR k.project_id IS NULL OR k.cross_project = 1) - AND k.confidence > 0.2 - ORDER BY k.updated_at DESC LIMIT ?`, - ) - .all(q, pid, limit) as KnowledgeEntry[]; - } catch { - return searchLike({ - query: input.query, - projectPath: input.projectPath, - limit, - }); - } - } + if (q === EMPTY_QUERY) return []; + + const pid = input.projectPath ? ensureProject(input.projectPath) : null; + + const ftsSQL = pid + ? `SELECT k.* FROM knowledge k + JOIN knowledge_fts f ON k.rowid = f.rowid + WHERE knowledge_fts MATCH ? + AND (k.project_id = ? OR k.project_id IS NULL OR k.cross_project = 1) + AND k.confidence > 0.2 + ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT ?` + : `SELECT k.* FROM knowledge k + JOIN knowledge_fts f ON k.rowid = f.rowid + WHERE knowledge_fts MATCH ? + AND k.confidence > 0.2 + ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT ?`; + + const { title, content, category } = FTS_WEIGHTS; + const ftsParams = pid + ? [q, pid, title, content, category, limit] + : [q, title, content, category, limit]; + try { - return db() - .query( - `SELECT k.* FROM knowledge k - WHERE k.rowid IN (SELECT rowid FROM knowledge_fts WHERE knowledge_fts MATCH ?) - AND k.confidence > 0.2 - ORDER BY k.updated_at DESC LIMIT ?`, - ) - .all(q, limit) as KnowledgeEntry[]; + const results = db().query(ftsSQL).all(...ftsParams) as KnowledgeEntry[]; + if (results.length) return results; + + // AND returned nothing — try OR fallback for broader recall + const qOr = ftsQueryOr(input.query); + if (qOr === EMPTY_QUERY) return []; + + const ftsParamsOr = pid + ? 
[qOr, pid, title, content, category, limit] + : [qOr, title, content, category, limit]; + return db().query(ftsSQL).all(...ftsParamsOr) as KnowledgeEntry[]; } catch { - return searchLike({ query: input.query, limit }); + return searchLike({ + query: input.query, + projectPath: input.projectPath, + limit, + }); } } diff --git a/src/reflect.ts b/src/reflect.ts index 5addfac..792c33e 100644 --- a/src/reflect.ts +++ b/src/reflect.ts @@ -3,6 +3,7 @@ import * as temporal from "./temporal"; import * as ltm from "./ltm"; import * as log from "./log"; import { db, ensureProject } from "./db"; +import { ftsQuery, ftsQueryOr, EMPTY_QUERY } from "./search"; import { serialize, inline, h, p, ul, lip, liph, t, root } from "./markdown"; type Distillation = { @@ -13,41 +14,83 @@ type Distillation = { session_id: string; }; -function searchDistillations(input: { - projectPath: string; +// LIKE-based fallback for when FTS5 fails unexpectedly on distillations. +function searchDistillationsLike(input: { + pid: string; query: string; sessionID?: string; - limit?: number; + limit: number; }): Distillation[] { - const pid = ensureProject(input.projectPath); - const limit = input.limit ?? 10; - // Search distillation narratives and facts with LIKE since we don't have FTS on them const terms = input.query .toLowerCase() .split(/\s+/) - .filter((t) => t.length > 2); + .filter((t) => t.length > 1); if (!terms.length) return []; - const conditions = terms .map(() => "LOWER(observations) LIKE ?") .join(" AND "); - const params: string[] = []; - for (const term of terms) { - params.push(`%${term}%`); - } - - const query = input.sessionID + const likeParams = terms.map((t) => `%${t}%`); + const sql = input.sessionID ? `SELECT id, observations, generation, created_at, session_id FROM distillations WHERE project_id = ? AND session_id = ? AND ${conditions} ORDER BY created_at DESC LIMIT ?` : `SELECT id, observations, generation, created_at, session_id FROM distillations WHERE project_id = ? 
AND ${conditions} ORDER BY created_at DESC LIMIT ?`; const allParams = input.sessionID - ? [pid, input.sessionID, ...params, limit] - : [pid, ...params, limit]; - + ? [input.pid, input.sessionID, ...likeParams, input.limit] + : [input.pid, ...likeParams, input.limit]; return db() - .query(query) + .query(sql) .all(...allParams) as Distillation[]; } +function searchDistillations(input: { + projectPath: string; + query: string; + sessionID?: string; + limit?: number; +}): Distillation[] { + const pid = ensureProject(input.projectPath); + const limit = input.limit ?? 10; + const q = ftsQuery(input.query); + if (q === EMPTY_QUERY) return []; + + const ftsSQL = input.sessionID + ? `SELECT d.id, d.observations, d.generation, d.created_at, d.session_id + FROM distillations d + JOIN distillation_fts f ON d.rowid = f.rowid + WHERE distillation_fts MATCH ? + AND d.project_id = ? AND d.session_id = ? + ORDER BY rank LIMIT ?` + : `SELECT d.id, d.observations, d.generation, d.created_at, d.session_id + FROM distillations d + JOIN distillation_fts f ON d.rowid = f.rowid + WHERE distillation_fts MATCH ? + AND d.project_id = ? + ORDER BY rank LIMIT ?`; + const params = input.sessionID + ? [q, pid, input.sessionID, limit] + : [q, pid, limit]; + + try { + const results = db().query(ftsSQL).all(...params) as Distillation[]; + if (results.length) return results; + + // AND returned nothing — try OR fallback + const qOr = ftsQueryOr(input.query); + if (qOr === EMPTY_QUERY) return []; + const paramsOr = input.sessionID + ? 
[qOr, pid, input.sessionID, limit] + : [qOr, pid, limit]; + return db().query(ftsSQL).all(...paramsOr) as Distillation[]; + } catch { + // FTS5 failed — fall back to LIKE search + return searchDistillationsLike({ + pid, + query: input.query, + sessionID: input.sessionID, + limit, + }); + } +} + function formatResults(input: { temporalResults: temporal.TemporalMessage[]; distillationResults: Distillation[]; @@ -115,6 +158,11 @@ export function createRecallTool(projectPath: string, knowledgeEnabled = true): const scope = args.scope ?? "all"; const sid = context.sessionID; + // If the query is all stopwords / single chars, short-circuit with guidance + if (ftsQuery(args.query) === EMPTY_QUERY) { + return "Query too vague — try using specific keywords, file names, or technical terms."; + } + let temporalResults: temporal.TemporalMessage[] = []; if (scope !== "knowledge") { try { diff --git a/src/search.ts b/src/search.ts new file mode 100644 index 0000000..548e426 --- /dev/null +++ b/src/search.ts @@ -0,0 +1,174 @@ +/** + * Centralized FTS5 search utilities for Lore. + * + * Provides query building, stopword filtering, and (Phase 2+) score fusion. + * All FTS5 search callers (ltm, temporal, reflect) import from here. + */ + +/** + * Curated stopword set for FTS5 queries. These are common English words that + * match broadly and dilute search precision when used with OR semantics. + * + * CRITICAL: OR without stopword filtering is catastrophic — "the OR for OR and" + * matches every document in the corpus. Stopwords MUST be filtered before + * building OR queries. + * + * This list is intentionally conservative: only includes words that are + * genuinely content-free. Domain terms like "handle", "state", "type" are + * NOT stopwords — they carry meaning in code/technical contexts. 
+ */ +export const STOPWORDS: ReadonlySet<string> = new Set([ + // Articles & determiners + "an", + "the", + "this", + "that", + "these", + "those", + "some", + "each", + "every", + // Pronouns, interrogatives & short function words + "he", + "it", + "me", + "my", + "we", + "us", + "or", + "am", + "they", + "them", + "their", + "there", + "here", + "what", + "which", + "where", + "when", + "whom", + // Auxiliary verbs & other short function words (content-free) + "is", + "be", + "do", + "no", + "so", + "if", + "as", + "at", + "by", + "in", + "of", + "on", + "to", + "up", + "are", + "was", + "has", + "had", + "not", + "but", + "can", + "did", + "for", + "got", + "let", + "may", + "our", + "its", + "nor", + "yet", + "how", + "all", + "any", + "too", + "own", + "out", + "why", + "who", + "few", + "have", + "been", + "were", + "will", + "would", + "could", + "should", + "does", + "being", + "also", + // Prepositions & conjunctions + "with", + "from", + "into", + "about", + "than", + "over", + "such", + "after", + "before", + "between", + // Adverbs (content-free) + "just", + "only", + "very", + "more", + "most", + "really", + "already", +]); + +/** + * The sentinel value returned when a query contains no meaningful terms after + * filtering. Callers should check for this and return a "query too vague" + * message instead of executing an FTS5 MATCH against it. + */ +export const EMPTY_QUERY = '""'; + +/** + * Filter raw query text into meaningful FTS5 tokens. + * + * Filtering (in order): + * 1. Strip non-word chars (punctuation, operators — prevents FTS5 injection) + * 2. Remove single-character tokens (contraction artifacts like "s", "t") + * 3. Remove stopwords + * + * If ALL words are filtered, returns an empty array. The caller decides + * what to do (typically returns a "query too vague" message). + * + * No general length filter — short but meaningful tokens like "DB", "CI", + * "IO", "PR" are preserved. Only single chars are dropped. 
+ */ +function filterTerms(raw: string): string[] { + const words = raw + .replace(/[^\w\s]/g, " ") + .split(/\s+/) + .filter(Boolean); + + return words.filter( + (w) => w.length > 1 && !STOPWORDS.has(w.toLowerCase()), + ); +} + +/** + * Build an FTS5 MATCH expression using AND semantics (implicit AND via space). + * + * Returns `""` (match-nothing sentinel) when no meaningful terms remain after + * filtering. Callers should check `q === EMPTY_QUERY` and handle accordingly. + */ +export function ftsQuery(raw: string): string { + const terms = filterTerms(raw); + if (!terms.length) return EMPTY_QUERY; + return terms.map((w) => `${w}*`).join(" "); +} + +/** + * Build an FTS5 MATCH expression using OR semantics. + * Same filtering as ftsQuery(), but joins terms with OR. + * Used as fallback when AND returns zero results. + */ +export function ftsQueryOr(raw: string): string { + const terms = filterTerms(raw); + if (!terms.length) return EMPTY_QUERY; + return terms.map((w) => `${w}*`).join(" OR "); +} diff --git a/src/temporal.ts b/src/temporal.ts index da71f24..5e018ea 100644 --- a/src/temporal.ts +++ b/src/temporal.ts @@ -1,4 +1,5 @@ import { db, ensureProject } from "./db"; +import { ftsQuery, ftsQueryOr, EMPTY_QUERY } from "./search"; import type { Message, Part } from "@opencode-ai/sdk"; // ~3 chars per token — validated as best heuristic against real API data. @@ -126,19 +127,6 @@ export function markDistilled(ids: string[]) { .run(...ids); } -// Sanitize a natural-language query for FTS5 MATCH. -// FTS5 treats punctuation as operators: - = NOT, . = column filter, " = phrase, etc. -// Strip everything except word chars and whitespace, split into tokens, append * for -// prefix matching. Exported so ltm.ts can reuse it instead of maintaining a duplicate. 
-export function ftsQuery(raw: string): string { - const words = raw - .replace(/[^\w\s]/g, " ") - .split(/\s+/) - .filter(Boolean); - if (!words.length) return '""'; // empty match-nothing sentinel - return words.map((w) => `${w}*`).join(" "); -} - // LIKE-based fallback for when FTS5 fails unexpectedly. function searchLike(input: { pid: string; @@ -173,6 +161,8 @@ export function search(input: { const pid = ensureProject(input.projectPath); const limit = input.limit ?? 20; const q = ftsQuery(input.query); + if (q === EMPTY_QUERY) return []; + const ftsSQL = input.sessionID ? `SELECT m.* FROM temporal_messages m JOIN temporal_fts f ON m.rowid = f.rowid @@ -186,9 +176,20 @@ export function search(input: { ? [q, pid, input.sessionID, limit] : [q, pid, limit]; try { - return db() + const results = db() .query(ftsSQL) .all(...params) as TemporalMessage[]; + if (results.length) return results; + + // AND returned nothing — try OR fallback for broader recall + const qOr = ftsQueryOr(input.query); + if (qOr === EMPTY_QUERY) return []; + const paramsOr = input.sessionID + ? 
[qOr, pid, input.sessionID, limit] + : [qOr, pid, limit]; + return db() + .query(ftsSQL) + .all(...paramsOr) as TemporalMessage[]; } catch { // FTS5 still choked (edge case) — fall back to LIKE search return searchLike({ diff --git a/test/db.test.ts b/test/db.test.ts index 1d3a61e..ad7df99 100644 --- a/test/db.test.ts +++ b/test/db.test.ts @@ -21,7 +21,25 @@ describe("db", () => { const row = db().query("SELECT version FROM schema_version").get() as { version: number; }; - expect(row.version).toBe(6); + expect(row.version).toBe(7); + }); + + test("distillation_fts virtual table exists", () => { + const tables = db() + .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name") + .all() as Array<{ name: string }>; + const names = tables.map((t) => t.name); + expect(names).toContain("distillation_fts"); + }); + + test("distillation_fts triggers exist for sync", () => { + const triggers = db() + .query("SELECT name FROM sqlite_master WHERE type='trigger' AND name LIKE 'distillation_fts_%' ORDER BY name") + .all() as Array<{ name: string }>; + const names = triggers.map((t) => t.name); + expect(names).toContain("distillation_fts_insert"); + expect(names).toContain("distillation_fts_delete"); + expect(names).toContain("distillation_fts_update"); }); test("compound indexes exist for common query patterns", () => { diff --git a/test/ltm.test.ts b/test/ltm.test.ts index 0483c0b..3322d45 100644 --- a/test/ltm.test.ts +++ b/test/ltm.test.ts @@ -127,6 +127,65 @@ describe("ltm", () => { }); expect(results.length).toBeGreaterThan(0); }); + + test("search returns empty for all-stopword queries", () => { + const results = ltm.search({ + query: "what is this", + projectPath: PROJECT, + }); + expect(results.length).toBe(0); + }); + + test("AND→OR fallback: finds entries when only some terms match", () => { + // Create an entry that matches "gradient" but not "xyznonexistent" + ltm.create({ + projectPath: PROJECT, + category: "architecture", + title: "Gradient context 
system", + content: "The gradient manages context window compression across layers", + scope: "project", + }); + + // AND query "gradient xyznonexistent" should fail, then OR fallback finds "gradient" + const results = ltm.search({ + query: "gradient xyznonexistent", + projectPath: PROJECT, + }); + expect(results.length).toBeGreaterThan(0); + expect(results[0].title).toContain("Gradient"); + }); + }); + + describe("search: BM25 ranking", () => { + const RANK_PROJECT = "/test/ltm/ranking"; + + test("title matches rank higher than content-only matches", () => { + // Entry with "database" in title — should rank higher + ltm.create({ + projectPath: RANK_PROJECT, + category: "architecture", + title: "Database migration strategy", + content: "Use incremental schema changes for all migrations", + scope: "project", + }); + + // Entry with "database" only in content — should rank lower + ltm.create({ + projectPath: RANK_PROJECT, + category: "pattern", + title: "Storage layer design", + content: "The database uses SQLite with WAL mode for concurrent reads", + scope: "project", + }); + + const results = ltm.search({ + query: "database", + projectPath: RANK_PROJECT, + }); + expect(results.length).toBeGreaterThanOrEqual(2); + // First result should be the one with "database" in the title (higher BM25 weight) + expect(results[0].title).toContain("Database"); + }); }); }); diff --git a/test/search.test.ts b/test/search.test.ts new file mode 100644 index 0000000..f8466db --- /dev/null +++ b/test/search.test.ts @@ -0,0 +1,134 @@ +import { describe, test, expect } from "bun:test"; +import { ftsQuery, ftsQueryOr, STOPWORDS, EMPTY_QUERY } from "../src/search"; + +describe("search", () => { + describe("ftsQuery (AND semantics)", () => { + test("plain words get prefix wildcard with implicit AND", () => { + expect(ftsQuery("OAuth PKCE flow")).toBe("OAuth* PKCE* flow*"); + }); + + test("hyphenated terms: dash stripped, not treated as NOT operator", () => { + 
expect(ftsQuery("opencode-nuum")).toBe("opencode* nuum*"); + expect(ftsQuery("three-tier")).toBe("three* tier*"); + }); + + test("dot in domain name: dot stripped, tokens preserved", () => { + expect(ftsQuery("sanity.io")).toBe("sanity* io*"); + }); + + test("other punctuation stripped", () => { + // "what's the fix?" → "what" is stopword, "s" is single char, "the" is stopword → only "fix" + expect(ftsQuery("what's the fix?")).toBe("fix*"); + }); + + test("empty string returns empty sentinel", () => { + expect(ftsQuery("")).toBe(EMPTY_QUERY); + }); + + test("punctuation-only returns empty sentinel", () => { + expect(ftsQuery("!@#$%^&*()")).toBe(EMPTY_QUERY); + }); + + test("single-character tokens are dropped", () => { + // "I" is single char, "a" is single char + expect(ftsQuery("I found a bug")).toBe("found* bug*"); + }); + + test("2-char tokens are preserved (DB, CI, IO, PR)", () => { + expect(ftsQuery("DB migration")).toBe("DB* migration*"); + expect(ftsQuery("CI pipeline")).toBe("CI* pipeline*"); + expect(ftsQuery("IO error")).toBe("IO* error*"); + expect(ftsQuery("PR review")).toBe("PR* review*"); + }); + + test("stopwords are removed", () => { + // "the" and "with" are stopwords + expect(ftsQuery("the database with indexes")).toBe("database* indexes*"); + }); + + test("all-stopword query returns empty sentinel", () => { + expect(ftsQuery("what is this")).toBe(EMPTY_QUERY); + expect(ftsQuery("the from with")).toBe(EMPTY_QUERY); + }); + + test("all single-char tokens returns empty sentinel", () => { + expect(ftsQuery("I a")).toBe(EMPTY_QUERY); + }); + + test("mixed stopwords and single chars returns empty sentinel", () => { + expect(ftsQuery("I have the")).toBe(EMPTY_QUERY); + }); + + test("preserves case of original tokens", () => { + // FTS5 handles case-insensitive matching internally via unicode61 tokenizer + expect(ftsQuery("SQLite FTS5")).toBe("SQLite* FTS5*"); + }); + + test("underscores preserved as word chars", () => { + 
expect(ftsQuery("my_variable")).toBe("my_variable*"); + }); + }); + + describe("ftsQueryOr (OR semantics)", () => { + test("plain words joined with OR", () => { + expect(ftsQueryOr("OAuth PKCE flow")).toBe("OAuth* OR PKCE* OR flow*"); + }); + + test("same filtering as ftsQuery", () => { + expect(ftsQueryOr("what's the fix?")).toBe("fix*"); + }); + + test("empty string returns empty sentinel", () => { + expect(ftsQueryOr("")).toBe(EMPTY_QUERY); + }); + + test("all-stopword query returns empty sentinel", () => { + expect(ftsQueryOr("what is this")).toBe(EMPTY_QUERY); + }); + + test("stopwords removed, remaining terms OR'd", () => { + expect(ftsQueryOr("the database with indexes")).toBe( + "database* OR indexes*", + ); + }); + + test("single term produces no OR", () => { + expect(ftsQueryOr("database")).toBe("database*"); + }); + }); + + describe("STOPWORDS", () => { + test("contains expected categories", () => { + // Articles + expect(STOPWORDS.has("the")).toBe(true); + expect(STOPWORDS.has("this")).toBe(true); + // Pronouns + expect(STOPWORDS.has("they")).toBe(true); + expect(STOPWORDS.has("what")).toBe(true); + // Common verbs + expect(STOPWORDS.has("have")).toBe(true); + expect(STOPWORDS.has("been")).toBe(true); + // Prepositions + expect(STOPWORDS.has("with")).toBe(true); + expect(STOPWORDS.has("from")).toBe(true); + // Adverbs + expect(STOPWORDS.has("just")).toBe(true); + expect(STOPWORDS.has("very")).toBe(true); + }); + + test("does NOT contain domain terms", () => { + expect(STOPWORDS.has("handle")).toBe(false); + expect(STOPWORDS.has("state")).toBe(false); + expect(STOPWORDS.has("type")).toBe(false); + expect(STOPWORDS.has("error")).toBe(false); + expect(STOPWORDS.has("function")).toBe(false); + expect(STOPWORDS.has("database")).toBe(false); + }); + }); + + describe("EMPTY_QUERY sentinel", () => { + test("is double-quoted empty string", () => { + expect(EMPTY_QUERY).toBe('""'); + }); + }); +}); diff --git a/test/temporal.test.ts b/test/temporal.test.ts index 
97ee850..d394514 100644 --- a/test/temporal.test.ts +++ b/test/temporal.test.ts @@ -1,7 +1,7 @@ import { describe, test, expect, beforeEach } from "bun:test"; import { db, ensureProject } from "../src/db"; import * as temporal from "../src/temporal"; -import { ftsQuery } from "../src/temporal"; +import { ftsQuery } from "../src/search"; import type { Message, Part } from "@opencode-ai/sdk"; const PROJECT = "/test/temporal/project"; @@ -292,8 +292,9 @@ describe("temporal", () => { expect(ftsQuery("sanity.io")).toBe("sanity* io*"); }); - test("other punctuation stripped", () => { - expect(ftsQuery("what's the fix?")).toBe("what* s* the* fix*"); + test("other punctuation stripped, stopwords and single chars removed", () => { + // "what" is stopword, "s" is single char, "the" is stopword — only "fix" survives + expect(ftsQuery("what's the fix?")).toBe("fix*"); }); test("empty string returns sentinel", () => {