Skip to content

Commit 59b94de

Browse files
committed
feat(search): MiniSearch BM25 ranking with per-heading chunks
Add minisearch to search image, section splitting, mtime index cache. Fallback to legacy token ranker on empty hits; SEARCH_RANKER=legacy opt-out. Tests: node:test under docker/search; exclude path from vitest. Update PRD §5.8, §6, NR-RET-1. Made-with: Cursor
1 parent b2d7a7b commit 59b94de

File tree

8 files changed

+280
-19
lines changed

8 files changed

+280
-19
lines changed

docker/search/Dockerfile

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,19 @@ WORKDIR /workspace
1313

1414
COPY docker/search/entrypoint.sh /usr/local/bin/qwen-entrypoint
1515
COPY docker/search/qwen-search.sh /usr/local/bin/qwen-search
16-
COPY docker/search/http-server.mjs /opt/search/http-server.mjs
17-
COPY docker/search/knowledge-rank.mjs /opt/search/knowledge-rank.mjs
18-
COPY openapi/knowledge-v1.yaml /opt/search/openapi.yaml
19-
COPY knowledge /opt/search/knowledge
16+
17+
WORKDIR /opt/search
18+
COPY docker/search/package.json docker/search/package-lock.json ./
19+
RUN npm ci --omit=dev
20+
21+
COPY docker/search/http-server.mjs docker/search/knowledge-rank.mjs ./
22+
COPY openapi/knowledge-v1.yaml ./openapi.yaml
23+
COPY knowledge ./knowledge
2024

2125
RUN chmod +x /usr/local/bin/qwen-entrypoint /usr/local/bin/qwen-search /opt/search/http-server.mjs
2226

27+
WORKDIR /workspace
28+
2329
VOLUME ["/corpus"]
2430

2531
ENTRYPOINT ["qwen-entrypoint"]

docker/search/knowledge-rank.mjs

Lines changed: 183 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
/**
2-
* Rank local markdown corpus for public search (mirrors src/local-search.ts logic).
3-
* Uses Unicode-aware tokenization so Cyrillic queries match the corpus.
2+
* Rank local markdown corpus for public search.
3+
* Primary: MiniSearch (BM25-style full-text, per-heading chunks).
4+
* Fallback: legacy token heuristic (Unicode), same as src/local-search.ts.
45
*/
56
import { readdirSync, readFileSync, statSync } from "node:fs";
67
import { extname, join, relative } from "node:path";
8+
import MiniSearch from "minisearch";
79

810
const MAX_RESULTS = 5;
911
const MAX_KNOWLEDGE_FILE_BYTES = 2 * 1024 * 1024;
@@ -24,11 +26,18 @@ const STOP_WORDS = new Set([
2426
"with",
2527
]);
2628

27-
function tokenizeQuery(query) {
28-
return query
29+
/** @type {Map<string, { mtime: number, mini: import('minisearch').default, docs: Map<number, { file: string, section: string, content: string }> }>} */
const indexCache = new Map();

/**
 * Split text into lowercase Unicode word tokens (letters and digits only).
 * Keeps every non-empty token; stop-word and length filtering happen later.
 */
function unicodeTokenize(text) {
  const pieces = text.toLowerCase().split(/[^\p{L}\p{N}]+/u);
  return pieces.filter((piece) => piece.length > 0);
}

/**
 * Query tokenizer: Unicode tokens minus stop words and too-short tokens.
 */
function tokenizeQuery(query) {
  const tokens = unicodeTokenize(query);
  return tokens.filter((token) => !STOP_WORDS.has(token) && token.length >= MIN_TOKEN_LENGTH);
}
3342

3443
function countOccurrences(content, token) {
@@ -71,7 +80,33 @@ function extractSnippet(content, matchIndex) {
7180
return `${rawSnippet.slice(0, 217)}...`;
7281
}
7382

74-
function rankDocuments(query, documents) {
83+
/**
 * Pick a snippet for a ranked hit: centred on the earliest occurrence of any
 * query term in the section content. When the query reduces to no usable
 * terms, falls back to raw Unicode tokens, then to the start of the content.
 */
function snippetFromContent(content, query) {
  const lower = content.toLowerCase();
  const terms = tokenizeQuery(query);

  if (terms.length === 0) {
    // Query was all stop words / short tokens: try raw Unicode tokens instead.
    const rawTokens = unicodeTokenize(query).filter((t) => t.length >= 1);
    for (const token of rawTokens) {
      const at = lower.indexOf(token);
      if (at !== -1) {
        return extractSnippet(content, at);
      }
    }
    return extractSnippet(content, 0);
  }

  let earliest = -1;
  for (const term of terms) {
    const at = lower.indexOf(term);
    if (at === -1) {
      continue;
    }
    if (earliest === -1 || at < earliest) {
      earliest = at;
    }
  }
  return extractSnippet(content, earliest === -1 ? 0 : earliest);
}
108+
109+
function rankDocumentsLegacy(query, documents) {
75110
const tokens = tokenizeQuery(query);
76111
if (tokens.length === 0) {
77112
return [];
@@ -134,6 +169,25 @@ function walkKnowledgeTree(dir) {
134169
return files;
135170
}
136171

172+
/**
 * Compute a cheap change fingerprint for the corpus under rootDir.
 *
 * Combines the newest file mtime (ms) with the number of files, so the index
 * cache is invalidated not only when a file is touched but also when files
 * are removed or added with older mtimes — max-mtime alone misses both cases
 * and would serve a stale index. Callers treat the value as an opaque
 * nonzero token; returns 0 when the tree cannot be walked, which disables
 * caching upstream.
 */
function corpusMaxMtime(rootDir) {
  let newest = 0;
  let fileCount = 0;
  try {
    for (const path of walkKnowledgeTree(rootDir)) {
      try {
        const mtime = statSync(path).mtimeMs;
        fileCount += 1;
        if (mtime > newest) {
          newest = mtime;
        }
      } catch {
        /* unreadable entry: skip, matching previous best-effort behavior */
      }
    }
  } catch {
    return 0;
  }
  return newest + fileCount;
}
190+
137191
function loadKnowledgeDocuments(dir) {
138192
let paths;
139193
try {
@@ -156,16 +210,134 @@ function loadKnowledgeDocuments(dir) {
156210
}
157211

158212
/**
159-
* @param {string} query
160-
* @param {string} rootDir
161-
* @returns {ReadonlyArray<{ file: string, section: string, snippet: string }>}
213+
* Split a markdown file into heading-scoped sections for finer retrieval.
214+
* @param {string} relPath
215+
* @param {string} content
162216
*/
163-
export function rankKnowledgeForQuery(query, rootDir) {
217+
/**
 * Split a markdown file into heading-scoped sections for finer retrieval.
 *
 * Content before the first heading lands in an implicit "Overview" section;
 * empty sections are dropped. ATX headings (# .. ######) start a new section.
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
 * so shell-style comments in code examples no longer create bogus sections.
 *
 * @param {string} relPath - corpus-relative file path, copied onto each section
 * @param {string} content - raw markdown text
 * @returns {{ file: string, section: string, content: string }[]}
 */
export function splitIntoSections(relPath, content) {
  const lines = content.split(/\r?\n/);
  /** @type {{ file: string, section: string, content: string }[]} */
  const out = [];
  let sectionTitle = "Overview";
  /** @type {string[]} */
  let buf = [];
  let inFence = false;
  const flush = () => {
    const text = buf.join("\n").trim();
    if (text.length > 0) {
      out.push({ file: relPath, section: sectionTitle, content: text });
    }
    buf = [];
  };
  for (const line of lines) {
    // Toggle fenced-code state on ``` / ~~~ markers (info string allowed).
    if (/^\s*(```|~~~)/.test(line)) {
      inFence = !inFence;
      buf.push(line);
      continue;
    }
    const m = !inFence && /^#{1,6}\s+(.+)$/.exec(line);
    if (m) {
      flush();
      sectionTitle = m[1].trim();
      continue;
    }
    buf.push(line);
  }
  flush();
  return out;
}
243+
244+
/**
 * Flatten corpus documents into MiniSearch rows: one row per heading section,
 * keyed by a monotonically increasing numeric id.
 */
function buildSectionRows(documents) {
  /** @type {{ id: number, file: string, section: string, content: string }[]} */
  const rows = [];
  let nextId = 0;
  for (const doc of documents) {
    const sections = splitIntoSections(doc.file, doc.content);
    for (const { file, section, content } of sections) {
      rows.push({ id: nextId, file, section, content });
      nextId += 1;
    }
  }
  return rows;
}
255+
256+
/**
 * Return the MiniSearch index for rootDir, rebuilding it whenever the corpus
 * fingerprint changes. A fingerprint of 0 (empty or unreadable corpus)
 * disables caching, so every call re-reads the tree.
 */
function getOrBuildMiniIndex(rootDir) {
  const mtime = corpusMaxMtime(rootDir);
  const cacheable = mtime > 0;
  const cached = indexCache.get(rootDir);
  if (cacheable && cached?.mtime === mtime) {
    return cached;
  }

  const rows = buildSectionRows(loadKnowledgeDocuments(rootDir));

  // Plain id -> section map kept alongside the index for snippet extraction.
  /** @type {Map<number, { file: string, section: string, content: string }>} */
  const docs = new Map(
    rows.map(({ id, file, section, content }) => [id, { file, section, content }]),
  );

  const mini = new MiniSearch({
    fields: ["content", "section", "file"],
    storeFields: ["file", "section", "content"],
    idField: "id",
    // Unicode-aware tokenizer so Cyrillic queries and documents line up.
    tokenize: (string) => unicodeTokenize(string).filter((t) => t.length >= 1),
  });
  mini.addAll(rows);

  const entry = { mtime, mini, docs };
  if (cacheable) {
    indexCache.set(rootDir, entry);
  }
  return entry;
}
286+
287+
/**
 * Rank heading-level sections with MiniSearch and shape the top hits into the
 * public result format. Returns [] when the corpus holds no sections.
 */
function rankMiniSearch(query, rootDir) {
  const { mini, docs } = getOrBuildMiniIndex(rootDir);
  if (docs.size === 0) {
    return [];
  }

  const hits = mini.search(query, {
    prefix: true,
    fuzzy: 0.12,
    boost: { section: 2.2, file: 1.65, content: 1 },
  });

  /** @type {{ file: string, section: string, snippet: string }[]} */
  const results = [];
  for (const hit of hits.slice(0, MAX_RESULTS)) {
    const stored = docs.get(hit.id);
    if (stored === undefined) {
      continue;
    }
    const { file, section, content } = stored;
    results.push({ file, section, snippet: snippetFromContent(content, query) });
  }
  return results;
}
313+
314+
/**
 * Truncate legacy-ranker output to MAX_RESULTS and strip each entry down to
 * the public { file, section, snippet } shape.
 */
function mapLegacySlice(ranked) {
  const top = ranked.slice(0, MAX_RESULTS);
  return top.map(({ file, section, snippet }) => ({ file, section, snippet }));
}
321+
322+
/**
 * Public entry point: rank corpus sections for a search query.
 *
 * Default pipeline uses the MiniSearch index; when it yields no hits — or
 * when SEARCH_RANKER=legacy is set — the legacy token heuristic runs instead.
 * Fix: the corpus is now only read on the legacy paths. Previously every call
 * eagerly re-read all markdown files via loadKnowledgeDocuments even when the
 * cached MiniSearch index (which manages its own corpus reads) answered the
 * query.
 *
 * @param {string} query
 * @param {string} rootDir
 * @returns {ReadonlyArray<{ file: string, section: string, snippet: string }>}
 */
export function rankKnowledgeForQuery(query, rootDir) {
  const mode = (process.env.SEARCH_RANKER || "minisearch").toLowerCase();
  // Lazy legacy path: loads the corpus only when actually needed.
  const legacy = () =>
    mapLegacySlice(rankDocumentsLegacy(query, loadKnowledgeDocuments(rootDir)));

  if (mode === "legacy") {
    return legacy();
  }

  const miniHits = rankMiniSearch(query, rootDir);
  return miniHits.length > 0 ? miniHits : legacy();
}

docker/search/package-lock.json

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docker/search/package.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"name": "@spawndock/search-ranker",
3+
"private": true,
4+
"type": "module",
5+
"version": "0.0.0",
6+
"dependencies": {
7+
"minisearch": "^7.1.0"
8+
}
9+
}

docker/search/rank.test.mjs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Tests for the search ranker; run via `node --test` (npm run test:search-rank).
import assert from "node:assert/strict";
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import { describe, it, beforeEach, afterEach } from "node:test";
import { rankKnowledgeForQuery, splitIntoSections } from "./knowledge-rank.mjs";

describe("splitIntoSections", () => {
  it("chunks by headings", () => {
    const sections = splitIntoSections("g/x.md", "# A\nline\n\n## B\nother");
    assert.equal(sections.length, 2);
    const [first, second] = sections;
    assert.equal(first.section, "A");
    assert.ok(first.content.includes("line"));
    assert.equal(second.section, "B");
  });
});

describe("rankKnowledgeForQuery (MiniSearch)", () => {
  // Temp corpus root, recreated before every test and removed after.
  let root;

  beforeEach(() => {
    root = mkdtempSync(join(tmpdir(), "search-rank-"));
    mkdirSync(join(root, "guides"));
    const body = ["# Telegram Mini App", "Use WebApp SDK for TMA buttons.", "", "# Deploy", "Use spawn dock deploy."].join("\n");
    writeFileSync(join(root, "guides", "tma.md"), body);
  });

  afterEach(() => {
    rmSync(root, { recursive: true, force: true });
  });

  it("returns Telegram section for english query", () => {
    const results = rankKnowledgeForQuery("telegram WebApp SDK buttons", root);
    assert.ok(results.length >= 1);
    const hit = results.find((x) => x.file === "guides/tma.md" && x.section === "Telegram Mini App");
    assert.ok(hit !== undefined);
  });

  it("falls back to legacy when minisearch has no overlap", () => {
    // Neither ranker matches this token, so the result set is empty.
    const results = rankKnowledgeForQuery("zzzznomatch____", root);
    assert.equal(results.length, 0);
  });

  it("matches Cyrillic query against Cyrillic heading", () => {
    writeFileSync(
      join(root, "guides", "ru.md"),
      "# Как сделать TMA\nИспользуйте Telegram Mini App SDK.\n",
    );
    const results = rankKnowledgeForQuery("как сделать TMA", root);
    assert.ok(results.some((x) => x.file === "guides/ru.md" && x.section.includes("TMA")));
  });
});

docs/PRD-public-knowledge-search-service.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ rate_limit_tiers:
270270
| Компонент | Статус | Заметки |
271271
|-----------|--------|---------|
272272
| Монтирование корпуса **`/corpus`** (прод) и **`KNOWLEDGE_ROOT`** | **Done** | Образ также содержит снимок **`knowledge/`** в **`/opt/search/knowledge`** для запуска без volume. |
273-
| Ранжирование фрагментов перед вызовом LLM | **Done** | Эвристика по токенам запроса и `.md` (Unicode-токены для кириллицы); см. `docker/search/knowledge-rank.mjs`, зеркало логики в `src/local-search.ts`. |
273+
| Ранжирование фрагментов перед вызовом LLM | **Done** | По умолчанию **[MiniSearch](https://github.com/lucaong/minisearch)** (BM25-алгоритм) по секциям Markdown; при пустом hit — fallback на эвристику по токенам. `SEARCH_RANKER=legacy` — только эвристика. См. `docker/search/knowledge-rank.mjs`; зеркало эвристики в `src/local-search.ts`. |
274274
| Учёт **`locale`** в промпте | **Done** | Явные инструкции `ru` / `en` / авто по языку запроса. |
275275
| Поле **`sources`** | **Done** | Из JSON ответа модели; если пусто — fallback из ранжированных источников. |
276276
| Диагностика сбоев Qwen (**stdout/stderr** в **502**) | **Done** | Усечённые потоки в `message` для оператора. |
@@ -288,6 +288,7 @@ rate_limit_tiers:
288288
| `SEARCH_HTTP_PORT` / `QWEN_HTTP_PORT` | Порт HTTP listener (по умолчанию **8790**). |
289289
| `SEARCH_HTTP_BIND` / `QWEN_HTTP_BIND` | Bind address (по умолчанию **0.0.0.0**). |
290290
| `KNOWLEDGE_ROOT` | Корень Markdown-корпуса (**рекомендуется `/corpus`** в проде). |
291+
| `SEARCH_RANKER` | `minisearch` (по умолчанию) или `legacy` — только эвристика по токенам. |
291292
| `SEARCH_RATE_LIMIT_TIERS` | JSON override лимитов **free** / **basic** (см. §5.6). |
292293
| `API_TOKEN` | Общий секрет для **Bearer** и tier **basic** на **`search`**. |
293294
| `PROD_QWEN_OAUTH_CREDS` / `QWEN_OAUTH_CREDS_B64` | Base64 **oauth_creds** для Qwen CLI в контейнере; после смены секрета — **пересобрать/перезапустить** **`search`**. |
@@ -362,7 +363,7 @@ rate_limit_tiers:
362363

363364
| ID | Требование | Приоритет |
364365
|----|------------|-----------|
365-
| **NR-RET-1** | Оценить **FTS5 / BM25** или **сведение рангов (RRF)** для релевантности фрагментов корпуса вместо или вместе с текущей эвристикой; сохранить контракт API. | P2 |
366+
| **NR-RET-1** | ~~Оценить BM25~~ **Частично выполнено** (MiniSearch + секции Markdown). Далее при необходимости: **FTS5 / RRF / trigram**; контракт API без изменений. | P2 |
366367
| **NR-RET-2** | Опциональный **кэш** ответов по `(query нормализованный, locale, версия корпуса)` при неизменном корпусе — снижение стоимости Qwen и latency. | P3 |
367368
| **NR-OBS-1** | Метрики (**accepted/429/latency/502**) и точки интеграции с мониторингом хоста. | P2 |
368369
| **NR-HA-1** | При **>1 реплики** `search` — вынести дневные/минутные счётчики rate limit из in-memory (**Redis** и аналоги); см. §5.6.3. | P2 |
@@ -388,4 +389,4 @@ rate_limit_tiers:
388389

389390
---
390391

391-
*Document version: 1.5 — 2026-03-25 — §4/§6/§7 актуализированы; §5.8 пайплайн; §12 статус и backlog; §13 context-mode; §9 smoke-тесты; NR для retrieval/observability/HA.*
392+
*Document version: 1.6 — 2026-03-25 — §5.8 MiniSearch ranker; §6 `SEARCH_RANKER`; NR-RET-1 частично закрыт; vitest exclude `docker/search`.*

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"start": "node dist/index.js",
99
"dev": "tsx --env-file=.env src/index.ts",
1010
"test": "vitest run",
11+
"test:search-rank": "cd docker/search && npm ci --omit=dev && node --test rank.test.mjs",
1112
"smoke:knowledge": "node scripts/smoke-knowledge-search.mjs",
1213
"bot": "node --env-file=.env dist/bot/polling.js",
1314
"bot:dev": "tsx --env-file=.env src/bot/polling.ts",

vitest.config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@ import { defineConfig } from "vitest/config";
22

33
export default defineConfig({
44
test: {
5-
exclude: ["dist/**", "node_modules/**"],
5+
exclude: ["dist/**", "node_modules/**", "docker/search/**"],
66
},
77
});

0 commit comments

Comments
 (0)