From dc9cd4c9bd75e1f214ded27170ac6f7b1657af02 Mon Sep 17 00:00:00 2001 From: 0xMink Date: Wed, 11 Feb 2026 01:06:18 -0500 Subject: [PATCH] fix(code-index): preserve identifiers when splitting large nodes (#10715) When the code parser splits a node that exceeds MAX_BLOCK_CHARS, small children (export keyword, class/function name, etc.) fall below MIN_BLOCK_CHARS and are silently discarded. This causes function and class names to disappear from the search index. Add a split-local sibling accumulator that prepends small children to the next large sibling as a synthetic QueueItem, and appends trailing small children to the last large sibling. Only the split path is affected; main-loop discard behaviour for non-split nodes is unchanged. Closes #10715 --- .../processors/__tests__/parser.spec.ts | 382 ++++++++++++++++++ src/services/code-index/processors/parser.ts | 172 +++++++- 2 files changed, 535 insertions(+), 19 deletions(-) diff --git a/src/services/code-index/processors/__tests__/parser.spec.ts b/src/services/code-index/processors/__tests__/parser.spec.ts index 68b523c7dfb..75ea6746374 100644 --- a/src/services/code-index/processors/__tests__/parser.spec.ts +++ b/src/services/code-index/processors/__tests__/parser.spec.ts @@ -5,6 +5,7 @@ import { loadRequiredLanguageParsers } from "../../../tree-sitter/languageParser import { parseMarkdown } from "../../../tree-sitter/markdownParser" import { readFile } from "fs/promises" import { Node } from "web-tree-sitter" +import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../../constants" // Mock TelemetryService vi.mock("../../../../../packages/telemetry/src/TelemetryService", () => ({ @@ -243,6 +244,387 @@ describe("CodeParser", () => { }) }) + describe("split-path signature accumulation (#10715)", () => { + it("should preserve class/function identifier when splitting a large node", async () => { + // Simulate a large class node whose children are: + // - "export" (7 chars, < MIN_BLOCK_CHARS) + // - "class" (5 chars, < MIN_BLOCK_CHARS) + // - "TestParser" (10 chars, < MIN_BLOCK_CHARS) + // - class body (large, > MIN_BLOCK_CHARS) + // Total must exceed the split threshold to trigger the split path. + const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR + const bodyText = "x".repeat(Math.ceil(splitThreshold) + 100) + const mockCapture = { + node: { + text: `export class TestParser ${bodyText}`, + startPosition: { row: 0 }, + endPosition: { row: 10 }, + type: "class_declaration", + childForFieldName: vi.fn().mockReturnValue({ text: "TestParser" }), + children: [ + { + text: "export", + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "export", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: "class", + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "class", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: "TestParser", + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "identifier", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: bodyText, + startPosition: { row: 1 }, + endPosition: { row: 10 }, + type: "class_body", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + ], + }, + name: "definition.class", + } + + const totalText = mockCapture.node.text + expect(totalText.length).toBeGreaterThan(splitThreshold) + + mockLanguageParser.js.query.captures.mockReturnValue([mockCapture]) + const result = await parser["parseContent"]("test.js", totalText, "hash-sig") + + // At least one emitted chunk must contain the class name "TestParser" + const hasIdentifier = result.some((block) => block.content.includes("TestParser")) + expect(hasIdentifier).toBe(true) + // Verify whitespace separation: tokens should not be garbled together + const chunkWithId = result.find((block) => block.content.includes("TestParser"))! + expect(chunkWithId.content).not.toContain("exportclassTestParser") + expect(chunkWithId.content).toContain("export class TestParser") + // No chunks should be empty + result.forEach((block) => { + expect(block.content.length).toBeGreaterThan(0) + }) + }) + + it("should not contaminate across unrelated parent nodes", async () => { + // Two separate captures: a small standalone comment, then a large class. + // The comment should NOT appear inside the class's chunks. + const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR + const bodyText = "y".repeat(Math.ceil(splitThreshold) + 100) + const commentText = "// standalone comment that is small" + const classText = `export class Foo { ${bodyText} }` + + const commentCapture = { + node: { + text: commentText, + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "comment", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + name: "definition.comment", + } + + const classCapture = { + node: { + text: classText, + startPosition: { row: 2 }, + endPosition: { row: 20 }, + type: "class_declaration", + childForFieldName: vi.fn().mockReturnValue({ text: "Foo" }), + children: [ + { + text: "export", + startPosition: { row: 2 }, + endPosition: { row: 2 }, + type: "export", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: "class", + startPosition: { row: 2 }, + endPosition: { row: 2 }, + type: "class", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: "Foo", + startPosition: { row: 2 }, + endPosition: { row: 2 }, + type: "identifier", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: `{ ${bodyText} }`, + startPosition: { row: 2 }, + endPosition: { row: 20 }, + type: "class_body", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + ], + }, + name: "definition.class", + } + + mockLanguageParser.js.query.captures.mockReturnValue([commentCapture, classCapture]) + const result = await parser["parseContent"]("test.js", commentText + "\n\n" + classText, "hash-contam") + + // The comment is < MIN_BLOCK_CHARS so it should be discarded by the main loop + // (not accumulated -- accumulation only happens within split children). + // No class chunk should contain "standalone comment". + const classChunks = result.filter( + (block) => block.content.includes("Foo") || block.content.includes(bodyText.slice(0, 20)), + ) + classChunks.forEach((block) => { + expect(block.content).not.toContain("standalone comment") + }) + }) + + it("should allow recursive splitting of large children with no pending parts", async () => { + // A class_body child that is large and has its own children (methods). + // When there are no pending small siblings, the raw Node should be + // pushed so the main loop can recurse into the class_body's children. + const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR + const method1Body = "a".repeat(Math.ceil(splitThreshold / 2)) + const method2Body = "b".repeat(Math.ceil(splitThreshold / 2)) + const classBodyText = `{ method1() { ${method1Body} } method2() { ${method2Body} } }` + + const mockCapture = { + node: { + text: `class Deep ${classBodyText}`, + startPosition: { row: 0 }, + endPosition: { row: 20 }, + type: "class_declaration", + childForFieldName: vi.fn().mockReturnValue({ text: "Deep" }), + children: [ + { + text: "class", + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "class", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: "Deep", + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "identifier", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + // class_body is large, has its own children (two methods) + text: classBodyText, + startPosition: { row: 1 }, + endPosition: { row: 20 }, + type: "class_body", + childForFieldName: vi.fn().mockReturnValue(null), + children: [ + { + text: `method1() { ${method1Body} }`, + startPosition: { row: 2 }, + endPosition: { row: 10 }, + type: "method_definition", + childForFieldName: vi.fn().mockReturnValue({ text: "method1" }), + children: [], + }, + { + text: `method2() { ${method2Body} }`, + startPosition: { row: 11 }, + endPosition: { row: 19 }, + type: "method_definition", + childForFieldName: vi.fn().mockReturnValue({ text: "method2" }), + children: [], + }, + ], + }, + ], + }, + name: "definition.class", + } + + const totalText = mockCapture.node.text + expect(totalText.length).toBeGreaterThan(splitThreshold) + + mockLanguageParser.js.query.captures.mockReturnValue([mockCapture]) + const result = await parser["parseContent"]("test.js", totalText, "hash-recurse") + + // Both method bodies should appear in the output (proving recursive splitting worked) + const allContent = result.map((block) => block.content).join("") + expect(allContent).toContain(method1Body.slice(0, 50)) + expect(allContent).toContain(method2Body.slice(0, 50)) + + // The class keyword and identifier should be preserved via accumulation + const hasClassId = result.some((block) => block.content.includes("Deep")) + expect(hasClassId).toBe(true) + }) + + it("should preserve recursive splitting even with trailing small siblings", async () => { + // Edge case: a parent whose FIRST child is a large node with its own + // children (no preceding small siblings -> pushed as raw Node), followed + // by a trailing small sibling. Recursion must be preserved; the trailing + // token is dropped rather than converting the raw Node to a QueueItem. + const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR + const method1Body = "r".repeat(Math.ceil(splitThreshold / 2)) + const method2Body = "s".repeat(Math.ceil(splitThreshold / 2)) + const blockText = `{ method1() { ${method1Body} } method2() { ${method2Body} } }` + const tailToken = "/*END*/" + + const mockCapture = { + node: { + // Parent has only two children: one large block and one small tail + text: `${blockText} ${tailToken}`, + startPosition: { row: 0 }, + endPosition: { row: 25 }, + type: "statement_block", + childForFieldName: vi.fn().mockReturnValue(null), + children: [ + { + // Large child with its own children -- pushed as raw Node + // (no preceding small siblings, so no pending parts) + text: blockText, + startPosition: { row: 0 }, + endPosition: { row: 22 }, + type: "class_body", + childForFieldName: vi.fn().mockReturnValue(null), + children: [ + { + text: `method1() { ${method1Body} }`, + startPosition: { row: 2 }, + endPosition: { row: 10 }, + type: "method_definition", + childForFieldName: vi.fn().mockReturnValue({ text: "method1" }), + children: [], + }, + { + text: `method2() { ${method2Body} }`, + startPosition: { row: 11 }, + endPosition: { row: 20 }, + type: "method_definition", + childForFieldName: vi.fn().mockReturnValue({ text: "method2" }), + children: [], + }, + ], + }, + { + // Trailing small sibling + text: tailToken, + startPosition: { row: 25 }, + endPosition: { row: 25 }, + type: "comment", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + ], + }, + name: "definition.block", + } + + const totalText = mockCapture.node.text + expect(totalText.length).toBeGreaterThan(splitThreshold) + + mockLanguageParser.js.query.captures.mockReturnValue([mockCapture]) + const result = await parser["parseContent"]("test.js", totalText, "hash-combined") + + // Recursive splitting must still work: both method bodies should appear + const allContent = result.map((block) => block.content).join("") + expect(allContent).toContain(method1Body.slice(0, 50)) + expect(allContent).toContain(method2Body.slice(0, 50)) + + // The tail token is intentionally dropped to preserve recursion: + // converting the raw Node to a QueueItem would block recursive splitting. + // Assert it is absent from all output (not just "not standalone"). + expect(allContent).not.toContain(tailToken) + }) + + it("should append trailing small siblings to the last large sibling", async () => { + // A split node with: large body, then a small unique trailing token. + // Use a unique token that does NOT appear in the body text. + const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR + const bodyText = "z".repeat(Math.ceil(splitThreshold) + 100) + const tailToken = "/*TAIL*/" + + const parentCapture = { + node: { + text: `function bar() { ${bodyText} ${tailToken}`, + startPosition: { row: 0 }, + endPosition: { row: 10 }, + type: "function_declaration", + childForFieldName: vi.fn().mockReturnValue({ text: "bar" }), + children: [ + { + text: "function", + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "function", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: "bar", + startPosition: { row: 0 }, + endPosition: { row: 0 }, + type: "identifier", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: `{ ${bodyText} }`, + startPosition: { row: 1 }, + endPosition: { row: 9 }, + type: "statement_block", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + { + text: tailToken, + startPosition: { row: 10 }, + endPosition: { row: 10 }, + type: "comment", + childForFieldName: vi.fn().mockReturnValue(null), + children: [], + }, + ], + }, + name: "definition.function", + } + + const fullText = parentCapture.node.text + expect(fullText.length).toBeGreaterThan(splitThreshold) + // Verify the tail token is unique -- not in the body + expect(bodyText).not.toContain(tailToken) + + mockLanguageParser.js.query.captures.mockReturnValue([parentCapture]) + const result = await parser["parseContent"]("test.js", fullText, "hash-tail") + + // The tail token should be present in some chunk (appended to the last large sibling) + const allContent = result.map((block) => block.content).join("") + expect(allContent).toContain(tailToken) + + // There should be no chunk that is just the tail token by itself + const tinyChunks = result.filter((block) => block.content === tailToken) + expect(tinyChunks).toHaveLength(0) + }) + }) + describe("_chunkLeafNodeByLines", () => { it("should chunk leaf nodes by lines", async () => { const mockNode = { diff --git a/src/services/code-index/processors/parser.ts b/src/services/code-index/processors/parser.ts index 8611884adec..fb905b983e6 100644 --- a/src/services/code-index/processors/parser.ts +++ b/src/services/code-index/processors/parser.ts @@ -11,6 +11,34 @@ import { TelemetryService } from "@roo-code/telemetry" import { TelemetryEventName } from "@roo-code/types" import { sanitizeErrorMessage } from "../shared/validation-helpers" +/** + * Synthetic queue item used when expanding a large node's children. + * Allows accumulating small sibling text (e.g. signature tokens) into + * adjacent larger siblings without mutating tree-sitter Node objects. + */ +type QueueItem = { + kind: "queue_item" + text: string + startRow: number + endRow: number +} + +function isQueueItem(x: unknown): x is QueueItem { + return !!x && typeof x === "object" && (x as Record).kind === "queue_item" +} + +function getNodeText(n: Node | QueueItem): string { + return isQueueItem(n) ? n.text : (n.text ?? "") +} + +function getStartRow(n: Node | QueueItem): number { + return isQueueItem(n) ? n.startRow : n.startPosition.row +} + +function getEndRow(n: Node | QueueItem): number { + return isQueueItem(n) ? n.endRow : n.endPosition.row +} + /** * Implementation of the code parser interface */ @@ -170,23 +198,41 @@ export class CodeParser implements ICodeParser { const results: CodeBlock[] = [] // Process captures if not empty - const queue: Node[] = Array.from(captures).map((capture) => capture.node) + const queue: Array = Array.from(captures).map((capture) => capture.node) while (queue.length > 0) { - const currentNode = queue.shift()! - // const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error + const currentItem = queue.shift()! + const currentText = getNodeText(currentItem) // Check if the node meets the minimum character requirement - if (currentNode.text.length >= MIN_BLOCK_CHARS) { + if (currentText.length >= MIN_BLOCK_CHARS) { // If it also exceeds the maximum character limit, try to break it down - if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) { - if (currentNode.children.filter((child) => child !== null).length > 0) { - // If it has children, process them instead - queue.push(...currentNode.children.filter((child) => child !== null)) + if (currentText.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) { + // QueueItems are synthetic wrappers -- they have no children to split + if (!isQueueItem(currentItem)) { + const children = currentItem.children.filter((child) => child !== null) + if (children.length > 0) { + // Expand children with split-local accumulator to preserve + // small signature tokens (class name, export keyword, etc.) + this._expandChildrenWithAccumulator(children, queue) + continue + } + } + // Leaf node (or QueueItem too large): chunk by lines + if (isQueueItem(currentItem)) { + const lines = currentItem.text.split("\n") + const chunkedBlocks = this._chunkTextByLines( + lines, + filePath, + fileHash, + "accumulated_chunk", + seenSegmentHashes, + currentItem.startRow + 1, + ) + results.push(...chunkedBlocks) } else { - // If it's a leaf node, chunk it const chunkedBlocks = this._chunkLeafNodeByLines( - currentNode, + currentItem, filePath, fileHash, seenSegmentHashes, @@ -195,14 +241,22 @@ export class CodeParser implements ICodeParser { } } else { // Node meets min chars and is within max chars, create a block - const identifier = - currentNode.childForFieldName("name")?.text || - currentNode.children.find((c) => c?.type === "identifier")?.text || - null - const type = currentNode.type - const start_line = currentNode.startPosition.row + 1 - const end_line = currentNode.endPosition.row + 1 - const content = currentNode.text + let identifier: string | null = null + let type: string + + if (isQueueItem(currentItem)) { + type = "accumulated_chunk" + } else { + identifier = + currentItem.childForFieldName("name")?.text || + currentItem.children.find((c) => c?.type === "identifier")?.text || + null + type = currentItem.type + } + + const start_line = getStartRow(currentItem) + 1 + const end_line = getEndRow(currentItem) + 1 + const content = currentText const contentPreview = content.slice(0, 100) const segmentHash = createHash("sha256") .update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`) @@ -223,12 +277,92 @@ export class CodeParser implements ICodeParser { } } } - // Nodes smaller than minBlockChars are ignored + // Nodes smaller than minBlockChars are ignored (only in the main queue; + // small children from split nodes are accumulated by _expandChildrenWithAccumulator) } return results } + /** + * Expands a split node's children into the queue, accumulating adjacent + * small siblings (< MIN_BLOCK_CHARS) and prepending them to the next + * large sibling as a synthetic QueueItem. Trailing small siblings are + * appended to the last queued sibling to avoid emitting tiny chunks. + * + * This preserves class/function signature tokens (export, class keyword, + * identifier) that would otherwise be discarded by the MIN_BLOCK_CHARS + * threshold in the main loop. + */ + private _expandChildrenWithAccumulator(children: Node[], queue: Array): void { + const pendingParts: string[] = [] + let pendingStartRow: number | null = null + let pendingEndRow: number | null = null + let lastQueued: QueueItem | null = null + // Track the last raw Node pushed (and its queue index) so we can + // convert it to a QueueItem if trailing small siblings need appending. + let lastNodeIndex: number = -1 + + for (const child of children) { + const text = child.text ?? "" + const startRow = child.startPosition.row + const endRow = child.endPosition.row + + if (text.length < MIN_BLOCK_CHARS) { + pendingParts.push(text) + if (pendingStartRow === null) { + pendingStartRow = startRow + } + pendingEndRow = endRow + continue + } + + // Large child: prepend any accumulated pending text + if (pendingParts.length > 0) { + const combined: QueueItem = { + kind: "queue_item", + text: pendingParts.join(" ") + " " + text, + startRow: pendingStartRow!, + endRow: endRow, + } + queue.push(combined) + lastQueued = combined + lastNodeIndex = -1 + pendingParts.length = 0 + pendingStartRow = null + pendingEndRow = null + } else { + // No pending parts -- push the raw Node so the main loop + // can still recurse into its children for deeper splitting. + queue.push(child) + lastQueued = null + lastNodeIndex = queue.length - 1 + } + } + + // Tail handling: append remaining small siblings to last queued sibling + if (pendingParts.length > 0) { + const tail = pendingParts.join(" ") + if (lastQueued) { + lastQueued.text += " " + tail + lastQueued.endRow = pendingEndRow! + } else if (lastNodeIndex >= 0) { + // Last pushed was a raw Node. Converting it to a QueueItem + // would block recursive splitting of its children, so we + // drop the trailing tokens (typically closing delimiters + // like "}" with no semantic value for the search index). + } else { + // All children were small -- emit one unavoidable small wrapper + queue.push({ + kind: "queue_item", + text: tail, + startRow: pendingStartRow!, + endRow: pendingEndRow!, + }) + } + } + } + /** * Common helper function to chunk text by lines, avoiding tiny remainders. */