From dc9cd4c9bd75e1f214ded27170ac6f7b1657af02 Mon Sep 17 00:00:00 2001
From: 0xMink <dennis@dennismink.com>
Date: Wed, 11 Feb 2026 01:06:18 -0500
Subject: [PATCH] fix(code-index): preserve identifiers when splitting large
 nodes (#10715)

When the code parser splits a node that exceeds MAX_BLOCK_CHARS, small
children (export keyword, class/function name, etc.) fall below
MIN_BLOCK_CHARS and are silently discarded.  This causes function and
class names to disappear from the search index.

Add a split-local sibling accumulator that prepends small children to
the next large sibling as a synthetic QueueItem, and appends trailing
small children to the last large sibling.  Only the split path is
affected; main-loop discard behaviour for non-split nodes is unchanged.

Closes #10715
---
 .../processors/__tests__/parser.spec.ts       | 382 ++++++++++++++++++
 src/services/code-index/processors/parser.ts  | 172 +++++++-
 2 files changed, 535 insertions(+), 19 deletions(-)

diff --git a/src/services/code-index/processors/__tests__/parser.spec.ts b/src/services/code-index/processors/__tests__/parser.spec.ts
index 68b523c7dfb..75ea6746374 100644
--- a/src/services/code-index/processors/__tests__/parser.spec.ts
+++ b/src/services/code-index/processors/__tests__/parser.spec.ts
@@ -5,6 +5,7 @@ import { loadRequiredLanguageParsers } from "../../../tree-sitter/languageParser
 import { parseMarkdown } from "../../../tree-sitter/markdownParser"
 import { readFile } from "fs/promises"
 import { Node } from "web-tree-sitter"
+import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../../constants"
 
 // Mock TelemetryService
 vi.mock("../../../../../packages/telemetry/src/TelemetryService", () => ({
@@ -243,6 +244,387 @@ describe("CodeParser", () => {
 		})
 	})
 
+	describe("split-path signature accumulation (#10715)", () => {
+		it("should preserve class/function identifier when splitting a large node", async () => {
+			// Simulate a large class node whose children are:
+			// - "export" (7 chars, < MIN_BLOCK_CHARS)
+			// - "class" (5 chars, < MIN_BLOCK_CHARS)
+			// - "TestParser" (10 chars, < MIN_BLOCK_CHARS)
+			// - class body (large, > MIN_BLOCK_CHARS)
+			// Total must exceed the split threshold to trigger the split path.
+			const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
+			const bodyText = "x".repeat(Math.ceil(splitThreshold) + 100)
+			const mockCapture = {
+				node: {
+					text: `export class TestParser ${bodyText}`,
+					startPosition: { row: 0 },
+					endPosition: { row: 10 },
+					type: "class_declaration",
+					childForFieldName: vi.fn().mockReturnValue({ text: "TestParser" }),
+					children: [
+						{
+							text: "export",
+							startPosition: { row: 0 },
+							endPosition: { row: 0 },
+							type: "export",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: "class",
+							startPosition: { row: 0 },
+							endPosition: { row: 0 },
+							type: "class",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: "TestParser",
+							startPosition: { row: 0 },
+							endPosition: { row: 0 },
+							type: "identifier",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: bodyText,
+							startPosition: { row: 1 },
+							endPosition: { row: 10 },
+							type: "class_body",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+					],
+				},
+				name: "definition.class",
+			}
+
+			const totalText = mockCapture.node.text
+			expect(totalText.length).toBeGreaterThan(splitThreshold)
+
+			mockLanguageParser.js.query.captures.mockReturnValue([mockCapture])
+			const result = await parser["parseContent"]("test.js", totalText, "hash-sig")
+
+			// At least one emitted chunk must contain the class name "TestParser"
+			const hasIdentifier = result.some((block) => block.content.includes("TestParser"))
+			expect(hasIdentifier).toBe(true)
+			// Verify whitespace separation: tokens should not be garbled together
+			const chunkWithId = result.find((block) => block.content.includes("TestParser"))!
+			expect(chunkWithId.content).not.toContain("exportclassTestParser")
+			expect(chunkWithId.content).toContain("export class TestParser")
+			// No chunks should be empty
+			result.forEach((block) => {
+				expect(block.content.length).toBeGreaterThan(0)
+			})
+		})
+
+		it("should not contaminate across unrelated parent nodes", async () => {
+			// Two separate captures: a small standalone comment, then a large class.
+			// The comment should NOT appear inside the class's chunks.
+			const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
+			const bodyText = "y".repeat(Math.ceil(splitThreshold) + 100)
+			const commentText = "// standalone comment that is small"
+			const classText = `export class Foo { ${bodyText} }`
+
+			const commentCapture = {
+				node: {
+					text: commentText,
+					startPosition: { row: 0 },
+					endPosition: { row: 0 },
+					type: "comment",
+					childForFieldName: vi.fn().mockReturnValue(null),
+					children: [],
+				},
+				name: "definition.comment",
+			}
+
+			const classCapture = {
+				node: {
+					text: classText,
+					startPosition: { row: 2 },
+					endPosition: { row: 20 },
+					type: "class_declaration",
+					childForFieldName: vi.fn().mockReturnValue({ text: "Foo" }),
+					children: [
+						{
+							text: "export",
+							startPosition: { row: 2 },
+							endPosition: { row: 2 },
+							type: "export",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: "class",
+							startPosition: { row: 2 },
+							endPosition: { row: 2 },
+							type: "class",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: "Foo",
+							startPosition: { row: 2 },
+							endPosition: { row: 2 },
+							type: "identifier",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: `{ ${bodyText} }`,
+							startPosition: { row: 2 },
+							endPosition: { row: 20 },
+							type: "class_body",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+					],
+				},
+				name: "definition.class",
+			}
+
+			mockLanguageParser.js.query.captures.mockReturnValue([commentCapture, classCapture])
+			const result = await parser["parseContent"]("test.js", commentText + "\n\n" + classText, "hash-contam")
+
+			// The comment is < MIN_BLOCK_CHARS so it should be discarded by the main loop
+			// (not accumulated -- accumulation only happens within split children).
+			// No class chunk should contain "standalone comment".
+			const classChunks = result.filter(
+				(block) => block.content.includes("Foo") || block.content.includes(bodyText.slice(0, 20)),
+			)
+			classChunks.forEach((block) => {
+				expect(block.content).not.toContain("standalone comment")
+			})
+		})
+
+		it("should allow recursive splitting of large children with no pending parts", async () => {
+			// A class_body child that is large and has its own children (methods).
+			// When there are no pending small siblings, the raw Node should be
+			// pushed so the main loop can recurse into the class_body's children.
+			const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
+			const method1Body = "a".repeat(Math.ceil(splitThreshold / 2))
+			const method2Body = "b".repeat(Math.ceil(splitThreshold / 2))
+			const classBodyText = `{ method1() { ${method1Body} } method2() { ${method2Body} } }`
+
+			const mockCapture = {
+				node: {
+					text: `class Deep ${classBodyText}`,
+					startPosition: { row: 0 },
+					endPosition: { row: 20 },
+					type: "class_declaration",
+					childForFieldName: vi.fn().mockReturnValue({ text: "Deep" }),
+					children: [
+						{
+							text: "class",
+							startPosition: { row: 0 },
+							endPosition: { row: 0 },
+							type: "class",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: "Deep",
+							startPosition: { row: 0 },
+							endPosition: { row: 0 },
+							type: "identifier",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							// class_body is large, has its own children (two methods)
+							text: classBodyText,
+							startPosition: { row: 1 },
+							endPosition: { row: 20 },
+							type: "class_body",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [
+								{
+									text: `method1() { ${method1Body} }`,
+									startPosition: { row: 2 },
+									endPosition: { row: 10 },
+									type: "method_definition",
+									childForFieldName: vi.fn().mockReturnValue({ text: "method1" }),
+									children: [],
+								},
+								{
+									text: `method2() { ${method2Body} }`,
+									startPosition: { row: 11 },
+									endPosition: { row: 19 },
+									type: "method_definition",
+									childForFieldName: vi.fn().mockReturnValue({ text: "method2" }),
+									children: [],
+								},
+							],
+						},
+					],
+				},
+				name: "definition.class",
+			}
+
+			const totalText = mockCapture.node.text
+			expect(totalText.length).toBeGreaterThan(splitThreshold)
+
+			mockLanguageParser.js.query.captures.mockReturnValue([mockCapture])
+			const result = await parser["parseContent"]("test.js", totalText, "hash-recurse")
+
+			// Both method bodies should appear in the output (proving recursive splitting worked)
+			const allContent = result.map((block) => block.content).join("")
+			expect(allContent).toContain(method1Body.slice(0, 50))
+			expect(allContent).toContain(method2Body.slice(0, 50))
+
+			// The class keyword and identifier should be preserved via accumulation
+			const hasClassId = result.some((block) => block.content.includes("Deep"))
+			expect(hasClassId).toBe(true)
+		})
+
+		it("should preserve recursive splitting even with trailing small siblings", async () => {
+			// Edge case: a parent whose FIRST child is a large node with its own
+			// children (no preceding small siblings -> pushed as raw Node), followed
+			// by a trailing small sibling. Recursion must be preserved; the trailing
+			// token is dropped rather than converting the raw Node to a QueueItem.
+			const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
+			const method1Body = "r".repeat(Math.ceil(splitThreshold / 2))
+			const method2Body = "s".repeat(Math.ceil(splitThreshold / 2))
+			const blockText = `{ method1() { ${method1Body} } method2() { ${method2Body} } }`
+			const tailToken = "/*END*/"
+
+			const mockCapture = {
+				node: {
+					// Parent has only two children: one large block and one small tail
+					text: `${blockText} ${tailToken}`,
+					startPosition: { row: 0 },
+					endPosition: { row: 25 },
+					type: "statement_block",
+					childForFieldName: vi.fn().mockReturnValue(null),
+					children: [
+						{
+							// Large child with its own children -- pushed as raw Node
+							// (no preceding small siblings, so no pending parts)
+							text: blockText,
+							startPosition: { row: 0 },
+							endPosition: { row: 22 },
+							type: "class_body",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [
+								{
+									text: `method1() { ${method1Body} }`,
+									startPosition: { row: 2 },
+									endPosition: { row: 10 },
+									type: "method_definition",
+									childForFieldName: vi.fn().mockReturnValue({ text: "method1" }),
+									children: [],
+								},
+								{
+									text: `method2() { ${method2Body} }`,
+									startPosition: { row: 11 },
+									endPosition: { row: 20 },
+									type: "method_definition",
+									childForFieldName: vi.fn().mockReturnValue({ text: "method2" }),
+									children: [],
+								},
+							],
+						},
+						{
+							// Trailing small sibling
+							text: tailToken,
+							startPosition: { row: 25 },
+							endPosition: { row: 25 },
+							type: "comment",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+					],
+				},
+				name: "definition.block",
+			}
+
+			const totalText = mockCapture.node.text
+			expect(totalText.length).toBeGreaterThan(splitThreshold)
+
+			mockLanguageParser.js.query.captures.mockReturnValue([mockCapture])
+			const result = await parser["parseContent"]("test.js", totalText, "hash-combined")
+
+			// Recursive splitting must still work: both method bodies should appear
+			const allContent = result.map((block) => block.content).join("")
+			expect(allContent).toContain(method1Body.slice(0, 50))
+			expect(allContent).toContain(method2Body.slice(0, 50))
+
+			// The tail token is intentionally dropped to preserve recursion:
+			// converting the raw Node to a QueueItem would block recursive splitting.
+			// Assert it is absent from all output (not just "not standalone").
+			expect(allContent).not.toContain(tailToken)
+		})
+
+		it("should append trailing small siblings to the last large sibling", async () => {
+			// A split node with: large body, then a small unique trailing token.
+			// Use a unique token that does NOT appear in the body text.
+			const splitThreshold = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
+			const bodyText = "z".repeat(Math.ceil(splitThreshold) + 100)
+			const tailToken = "/*TAIL*/"
+
+			const parentCapture = {
+				node: {
+					text: `function bar() { ${bodyText} ${tailToken}`,
+					startPosition: { row: 0 },
+					endPosition: { row: 10 },
+					type: "function_declaration",
+					childForFieldName: vi.fn().mockReturnValue({ text: "bar" }),
+					children: [
+						{
+							text: "function",
+							startPosition: { row: 0 },
+							endPosition: { row: 0 },
+							type: "function",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: "bar",
+							startPosition: { row: 0 },
+							endPosition: { row: 0 },
+							type: "identifier",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: `{ ${bodyText} }`,
+							startPosition: { row: 1 },
+							endPosition: { row: 9 },
+							type: "statement_block",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+						{
+							text: tailToken,
+							startPosition: { row: 10 },
+							endPosition: { row: 10 },
+							type: "comment",
+							childForFieldName: vi.fn().mockReturnValue(null),
+							children: [],
+						},
+					],
+				},
+				name: "definition.function",
+			}
+
+			const fullText = parentCapture.node.text
+			expect(fullText.length).toBeGreaterThan(splitThreshold)
+			// Verify the tail token is unique -- not in the body
+			expect(bodyText).not.toContain(tailToken)
+
+			mockLanguageParser.js.query.captures.mockReturnValue([parentCapture])
+			const result = await parser["parseContent"]("test.js", fullText, "hash-tail")
+
+			// The tail token should be present in some chunk (appended to the last large sibling)
+			const allContent = result.map((block) => block.content).join("")
+			expect(allContent).toContain(tailToken)
+
+			// There should be no chunk that is just the tail token by itself
+			const tinyChunks = result.filter((block) => block.content === tailToken)
+			expect(tinyChunks).toHaveLength(0)
+		})
+	})
+
 	describe("_chunkLeafNodeByLines", () => {
 		it("should chunk leaf nodes by lines", async () => {
 			const mockNode = {
diff --git a/src/services/code-index/processors/parser.ts b/src/services/code-index/processors/parser.ts
index 8611884adec..fb905b983e6 100644
--- a/src/services/code-index/processors/parser.ts
+++ b/src/services/code-index/processors/parser.ts
@@ -11,6 +11,34 @@ import { TelemetryService } from "@roo-code/telemetry"
 import { TelemetryEventName } from "@roo-code/types"
 import { sanitizeErrorMessage } from "../shared/validation-helpers"
 
+/**
+ * Synthetic queue item used when expanding a large node's children.
+ * Allows accumulating small sibling text (e.g. signature tokens) into
+ * adjacent larger siblings without mutating tree-sitter Node objects.
+ */
+type QueueItem = {
+	kind: "queue_item"
+	text: string
+	startRow: number
+	endRow: number
+}
+
+function isQueueItem(x: unknown): x is QueueItem {
+	return !!x && typeof x === "object" && (x as Record<string, unknown>).kind === "queue_item"
+}
+
+function getNodeText(n: Node | QueueItem): string {
+	return isQueueItem(n) ? n.text : (n.text ?? "")
+}
+
+function getStartRow(n: Node | QueueItem): number {
+	return isQueueItem(n) ? n.startRow : n.startPosition.row
+}
+
+function getEndRow(n: Node | QueueItem): number {
+	return isQueueItem(n) ? n.endRow : n.endPosition.row
+}
+
 /**
  * Implementation of the code parser interface
  */
@@ -170,23 +198,41 @@ export class CodeParser implements ICodeParser {
 		const results: CodeBlock[] = []
 
 		// Process captures if not empty
-		const queue: Node[] = Array.from(captures).map((capture) => capture.node)
+		const queue: Array<Node | QueueItem> = Array.from(captures).map((capture) => capture.node)
 
 		while (queue.length > 0) {
-			const currentNode = queue.shift()!
-			// const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
+			const currentItem = queue.shift()!
+			const currentText = getNodeText(currentItem)
 
 			// Check if the node meets the minimum character requirement
-			if (currentNode.text.length >= MIN_BLOCK_CHARS) {
+			if (currentText.length >= MIN_BLOCK_CHARS) {
 				// If it also exceeds the maximum character limit, try to break it down
-				if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-					if (currentNode.children.filter((child) => child !== null).length > 0) {
-						// If it has children, process them instead
-						queue.push(...currentNode.children.filter((child) => child !== null))
+				if (currentText.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
+					// QueueItems are synthetic wrappers -- they have no children to split
+					if (!isQueueItem(currentItem)) {
+						const children = currentItem.children.filter((child) => child !== null)
+						if (children.length > 0) {
+							// Expand children with split-local accumulator to preserve
+							// small signature tokens (class name, export keyword, etc.)
+							this._expandChildrenWithAccumulator(children, queue)
+							continue
+						}
+					}
+					// Leaf node (or QueueItem too large): chunk by lines
+					if (isQueueItem(currentItem)) {
+						const lines = currentItem.text.split("\n")
+						const chunkedBlocks = this._chunkTextByLines(
+							lines,
+							filePath,
+							fileHash,
+							"accumulated_chunk",
+							seenSegmentHashes,
+							currentItem.startRow + 1,
+						)
+						results.push(...chunkedBlocks)
 					} else {
-						// If it's a leaf node, chunk it
 						const chunkedBlocks = this._chunkLeafNodeByLines(
-							currentNode,
+							currentItem,
 							filePath,
 							fileHash,
 							seenSegmentHashes,
@@ -195,14 +241,22 @@ export class CodeParser implements ICodeParser {
 					}
 				} else {
 					// Node meets min chars and is within max chars, create a block
-					const identifier =
-						currentNode.childForFieldName("name")?.text ||
-						currentNode.children.find((c) => c?.type === "identifier")?.text ||
-						null
-					const type = currentNode.type
-					const start_line = currentNode.startPosition.row + 1
-					const end_line = currentNode.endPosition.row + 1
-					const content = currentNode.text
+					let identifier: string | null = null
+					let type: string
+
+					if (isQueueItem(currentItem)) {
+						type = "accumulated_chunk"
+					} else {
+						identifier =
+							currentItem.childForFieldName("name")?.text ||
+							currentItem.children.find((c) => c?.type === "identifier")?.text ||
+							null
+						type = currentItem.type
+					}
+
+					const start_line = getStartRow(currentItem) + 1
+					const end_line = getEndRow(currentItem) + 1
+					const content = currentText
 					const contentPreview = content.slice(0, 100)
 					const segmentHash = createHash("sha256")
 						.update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
@@ -223,12 +277,92 @@ export class CodeParser implements ICodeParser {
 					}
 				}
 			}
-			// Nodes smaller than minBlockChars are ignored
+			// Nodes smaller than minBlockChars are ignored (only in the main queue;
+			// small children from split nodes are accumulated by _expandChildrenWithAccumulator)
 		}
 
 		return results
 	}
 
+	/**
+	 * Expands a split node's children into the queue, accumulating adjacent
+	 * small siblings (< MIN_BLOCK_CHARS) and prepending them to the next
+	 * large sibling as a synthetic QueueItem. Trailing small siblings are
+	 * appended to the last queued sibling to avoid emitting tiny chunks.
+	 *
+	 * This preserves class/function signature tokens (export, class keyword,
+	 * identifier) that would otherwise be discarded by the MIN_BLOCK_CHARS
+	 * threshold in the main loop.
+	 */
+	private _expandChildrenWithAccumulator(children: Node[], queue: Array<Node | QueueItem>): void {
+		const pendingParts: string[] = []
+		let pendingStartRow: number | null = null
+		let pendingEndRow: number | null = null
+		let lastQueued: QueueItem | null = null
+		// Track the last raw Node pushed (and its queue index) so we can
+		// convert it to a QueueItem if trailing small siblings need appending.
+		let lastNodeIndex: number = -1
+
+		for (const child of children) {
+			const text = child.text ?? ""
+			const startRow = child.startPosition.row
+			const endRow = child.endPosition.row
+
+			if (text.length < MIN_BLOCK_CHARS) {
+				pendingParts.push(text)
+				if (pendingStartRow === null) {
+					pendingStartRow = startRow
+				}
+				pendingEndRow = endRow
+				continue
+			}
+
+			// Large child: prepend any accumulated pending text
+			if (pendingParts.length > 0) {
+				const combined: QueueItem = {
+					kind: "queue_item",
+					text: pendingParts.join(" ") + " " + text,
+					startRow: pendingStartRow!,
+					endRow: endRow,
+				}
+				queue.push(combined)
+				lastQueued = combined
+				lastNodeIndex = -1
+				pendingParts.length = 0
+				pendingStartRow = null
+				pendingEndRow = null
+			} else {
+				// No pending parts -- push the raw Node so the main loop
+				// can still recurse into its children for deeper splitting.
+				queue.push(child)
+				lastQueued = null
+				lastNodeIndex = queue.length - 1
+			}
+		}
+
+		// Tail handling: append remaining small siblings to last queued sibling
+		if (pendingParts.length > 0) {
+			const tail = pendingParts.join(" ")
+			if (lastQueued) {
+				lastQueued.text += " " + tail
+				lastQueued.endRow = pendingEndRow!
+			} else if (lastNodeIndex >= 0) {
+				// Last pushed was a raw Node. Converting it to a QueueItem
+				// would block recursive splitting of its children, so we
+				// drop the trailing tokens (typically closing delimiters
+				// like "}" with no semantic value for the search index).
+			} else {
+				// All children were small -- emit one unavoidable small wrapper
+				queue.push({
+					kind: "queue_item",
+					text: tail,
+					startRow: pendingStartRow!,
+					endRow: pendingEndRow!,
+				})
+			}
+		}
+	}
+
 	/**
 	 * Common helper function to chunk text by lines, avoiding tiny remainders.
 	 */