diff --git a/src/ltm.ts b/src/ltm.ts index f3a7a57..59aba41 100644 --- a/src/ltm.ts +++ b/src/ltm.ts @@ -1,6 +1,6 @@ import { uuidv7 } from "uuidv7"; import { db, ensureProject } from "./db"; -import { ftsQuery, ftsQueryOr, EMPTY_QUERY } from "./search"; +import { ftsQuery, ftsQueryOr, EMPTY_QUERY, extractTopTerms } from "./search"; // ~3 chars per token — validated as best heuristic against real API data. function estimateTokens(text: string): number { @@ -153,6 +153,9 @@ export function forProject( type Scored = { entry: KnowledgeEntry; score: number }; +/** BM25 column weights for knowledge_fts: title, content, category. */ +const FTS_WEIGHTS = { title: 6.0, content: 2.0, category: 3.0 }; + /** Max entries per pool to include on first turn when no session context exists. */ const NO_CONTEXT_FALLBACK_CAP = 10; @@ -163,43 +166,53 @@ const NO_CONTEXT_FALLBACK_CAP = 10; const PROJECT_SAFETY_NET = 5; /** - * Score entries by term overlap with session context. - * Returns score = (fraction of topTerms matched) * entry.confidence. + * Score entries by FTS5 BM25 relevance to session context. + * + * Uses OR semantics (not AND-then-OR) because we're scoring ALL candidates + * for relevance ranking, not searching for exact matches. An entry that + * matches 1 of 40 terms should still get a (low) score, not be excluded. + * BM25 naturally weights entries matching more terms higher. + * + * Returns a Map of entry ID → normalized score (0–1). */ -function scoreEntries( - entries: KnowledgeEntry[], - topTerms: string[], -): Scored[] { - return entries.map((entry) => { - const haystack = - (entry.title + " " + entry.content).replace(/[^\w\s]/g, " ").toLowerCase(); - let hits = 0; - for (const term of topTerms) { - if (haystack.includes(term)) hits++; - } - const relevance = topTerms.length > 0 ? hits / topTerms.length : 0; - return { entry, score: relevance * entry.confidence }; - }); -} +function scoreEntriesFTS(sessionContext: string): Map { + const terms = extractTopTerms(sessionContext); + if (!terms.length) return new Map(); -/** - * Extract the top 30 meaningful terms (>3 chars) from text, sorted by frequency. - */ -function extractTopTerms(text: string): string[] { - const freq = text - .replace(/[^\w\s]/g, " ") - .toLowerCase() - .split(/\s+/) - .filter((w) => w.length > 3) - .reduce>((acc, w) => { - acc.set(w, (acc.get(w) ?? 0) + 1); - return acc; - }, new Map()); - - return [...freq.entries()] - .sort((a, b) => b[1] - a[1]) - .slice(0, 30) - .map(([w]) => w); + const q = terms.map((t) => `${t}*`).join(" OR "); + const { title, content, category } = FTS_WEIGHTS; + + try { + const results = db() + .query( + `SELECT k.id, bm25(knowledge_fts, ?, ?, ?) as rank + FROM knowledge k + JOIN knowledge_fts f ON k.rowid = f.rowid + WHERE knowledge_fts MATCH ? + AND k.confidence > 0.2`, + ) + .all(title, content, category, q) as Array<{ + id: string; + rank: number; + }>; + + if (!results.length) return new Map(); + + // Normalize: BM25 rank is negative (more negative = better). + // Convert to 0–1 where 1 = best match. + const ranks = results.map((r) => r.rank); + const minRank = Math.min(...ranks); + const maxRank = Math.max(...ranks); + const scoreMap = new Map(); + for (const r of results) { + const norm = + minRank === maxRank ? 1 : (maxRank - r.rank) / (maxRank - minRank); + scoreMap.set(r.id, norm); + } + return scoreMap; + } catch { + return new Map(); + } } /** @@ -279,10 +292,14 @@ export function forSession( let scoredCross: Scored[]; if (sessionContext.trim().length > 20) { - const topTerms = extractTopTerms(sessionContext); - - // Score project entries — include matched + safety net of top-N by confidence - const rawScored = scoreEntries(projectEntries, topTerms); + // Use FTS5 BM25 to score all knowledge entries against session context + const ftsScores = scoreEntriesFTS(sessionContext); + + // Score project entries: FTS relevance × confidence, with safety net + const rawScored: Scored[] = projectEntries.map((entry) => ({ + entry, + score: (ftsScores.get(entry.id) ?? 0) * entry.confidence, + })); const matched = rawScored.filter((s) => s.score > 0); const matchedIds = new Set(matched.map((s) => s.entry.id)); @@ -295,8 +312,13 @@ export function forSession( scoredProject = [...matched, ...safetyNet]; - // Score cross-project entries — only include entries with at least one term match - scoredCross = scoreEntries(crossEntries, topTerms).filter((s) => s.score > 0); + // Score cross-project entries — only include entries with FTS match + scoredCross = crossEntries + .filter((e) => ftsScores.has(e.id)) + .map((e) => ({ + entry: e, + score: (ftsScores.get(e.id) ?? 0) * e.confidence, + })); } else { // No session context — fall back to top entries by confidence, capped scoredProject = projectEntries @@ -364,9 +386,6 @@ function searchLike(input: { .all(...likeParams, input.limit) as KnowledgeEntry[]; } -/** BM25 column weights for knowledge_fts: title, content, category. */ -const FTS_WEIGHTS = { title: 6.0, content: 2.0, category: 3.0 }; - export function search(input: { query: string; projectPath?: string; diff --git a/src/search.ts b/src/search.ts index 1d879ca..e5ad559 100644 --- a/src/search.ts +++ b/src/search.ts @@ -173,6 +173,38 @@ export function ftsQueryOr(raw: string): string { return terms.map((w) => `${w}*`).join(" OR "); } +// --------------------------------------------------------------------------- +// Term extraction (Phase 3) +// --------------------------------------------------------------------------- + +/** + * Extract the top meaningful terms from text, sorted by frequency. + * + * Same filtering as ftsQuery: drops single chars + stopwords. + * No general length threshold — preserves short meaningful tokens like "DB", "CI". + * + * Used by forSession() to build session context queries for FTS5 scoring. + * + * @param text Raw text to extract terms from + * @param limit Max number of terms to return (default 40) + */ +export function extractTopTerms(text: string, limit = 40): string[] { + const freq = text + .replace(/[^\w\s]/g, " ") + .toLowerCase() + .split(/\s+/) + .filter((w) => w.length > 1 && !STOPWORDS.has(w)) + .reduce>((acc, w) => { + acc.set(w, (acc.get(w) ?? 0) + 1); + return acc; + }, new Map()); + + return [...freq.entries()] + .sort((a, b) => b[1] - a[1]) + .slice(0, limit) + .map(([w]) => w); +} + // --------------------------------------------------------------------------- // Score normalization & fusion (Phase 2) // --------------------------------------------------------------------------- diff --git a/test/search.test.ts b/test/search.test.ts index 6f9c856..12ee4a3 100644 --- a/test/search.test.ts +++ b/test/search.test.ts @@ -6,6 +6,7 @@ import { EMPTY_QUERY, normalizeRank, reciprocalRankFusion, + extractTopTerms, } from "../src/search"; describe("search", () => { @@ -258,4 +259,66 @@ describe("search", () => { expect(fused[0].score).toBeCloseTo(0.1, 4); }); }); + + describe("extractTopTerms", () => { + test("extracts terms sorted by frequency", () => { + const terms = extractTopTerms("database database database config config"); + expect(terms[0]).toBe("database"); + expect(terms[1]).toBe("config"); + }); + + test("filters stopwords", () => { + const terms = extractTopTerms("the database with the indexes from the table"); + expect(terms).toContain("database"); + expect(terms).toContain("indexes"); + expect(terms).toContain("table"); + expect(terms).not.toContain("the"); + expect(terms).not.toContain("with"); + expect(terms).not.toContain("from"); + }); + + test("filters single chars", () => { + const terms = extractTopTerms("I found a bug in x module"); + expect(terms).toContain("found"); + expect(terms).toContain("bug"); + expect(terms).toContain("module"); + expect(terms).not.toContain("I"); + expect(terms).not.toContain("a"); + expect(terms).not.toContain("x"); + }); + + test("preserves 2-char tokens like DB, CI, IO", () => { + const terms = extractTopTerms("check DB and CI pipeline for IO errors"); + expect(terms).toContain("db"); // lowercased + expect(terms).toContain("ci"); + expect(terms).toContain("io"); + }); + + test("respects limit parameter", () => { + const text = "alpha bravo charlie delta echo foxtrot golf hotel india juliet"; + const terms = extractTopTerms(text, 3); + expect(terms.length).toBe(3); + }); + + test("default limit is 40", () => { + // Generate 50 unique words + const words = Array.from({ length: 50 }, (_, i) => `word${i}`); + const text = words.join(" "); + const terms = extractTopTerms(text); + expect(terms.length).toBe(40); + }); + + test("returns empty for all-stopword text", () => { + const terms = extractTopTerms("the with from is at by in"); + expect(terms.length).toBe(0); + }); + + test("strips punctuation before processing", () => { + const terms = extractTopTerms("what's happening? database-migration!"); + expect(terms).toContain("happening"); + expect(terms).toContain("database"); + expect(terms).toContain("migration"); + expect(terms).not.toContain("what"); // stopword + }); + }); });