From 3596160481f204bbc87f7728a6169944965b127a Mon Sep 17 00:00:00 2001
From: Alex Alecu <a.marian.alexandru@gmail.com>
Date: Tue, 31 Mar 2026 13:52:06 +0300
Subject: [PATCH 1/6] feat(proxy): add cache diagnostic utility for Anthropic
 requests

Add logCacheDiagnostics() that computes a structured diagnostic payload
for Anthropic chat_completions requests with tools. Logs prefix hash,
breakpoint position, message structure, and body hash to enable
detection of prefix drift causing cache misses.
---
 src/lib/providers/cache-debug.ts | 122 +++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 src/lib/providers/cache-debug.ts
diff --git a/src/lib/providers/cache-debug.ts b/src/lib/providers/cache-debug.ts
new file mode 100644
index 000000000..becd1072b
--- /dev/null
+++ b/src/lib/providers/cache-debug.ts
@@ -0,0 +1,122 @@
+import crypto from 'crypto';
+import type { GatewayRequest } from '@/lib/providers/openrouter/types';
+import { isAnthropicModel } from '@/lib/providers/anthropic';
+
+/**
+ * Logs a structured `[CacheDiag]` payload for Anthropic chat_completions
+ * requests that carry tools, to help debug cache hit/miss behavior.
+ *
+ * Call this AFTER all body mutations (tracking IDs, reasoning dedup,
+ * cache breakpoints, provider-specific logic) and BEFORE forwarding upstream.
+ *
+ * @param request - Parsed gateway request; non-chat_completions kinds are ignored.
+ * @param requestedModel - Model id; nothing is logged unless isAnthropicModel() accepts it.
+ * @param sessionId - Correlation id for the log entry; null is logged as '<none>'.
+ */
+export function logCacheDiagnostics(
+  request: GatewayRequest,
+  requestedModel: string,
+  sessionId: string | null
+) {
+  if (request.kind !== 'chat_completions') return;
+  if (!isAnthropicModel(requestedModel)) return;
+  const messages = request.body.messages;
+  if (!Array.isArray(messages) || messages.length === 0) return;
+  if (!request.body.tools?.length) return;
+
+  try {
+    // Find the last message whose cache_control was set by addCacheBreakpoints (scan from the end)
+    let breakpointIndex = -1;
+    let breakpointRole = '<none>';
+    let breakpointContentLength = 0;
+    for (let i = messages.length - 1; i >= 0; i--) {
+      const { role, content } = messages[i];
+      const carriesBreakpoint =
+        Array.isArray(content) &&
+        content.some(
+          (part: unknown) =>
+            typeof part === 'object' &&
+            part !== null &&
+            'cache_control' in part &&
+            typeof (part as Record<string, unknown>).cache_control === 'object' &&
+            (part as Record<string, unknown>).cache_control !== null &&
+            'type' in ((part as Record<string, unknown>).cache_control as Record<string, unknown>) &&
+            ((part as Record<string, unknown>).cache_control as Record<string, unknown>).type ===
+              'ephemeral'
+        );
+
+      // Record the length only for the message that actually holds the breakpoint,
+      // so a request without one reports contentLen 0 instead of a stale value.
+      if (carriesBreakpoint) {
+        breakpointIndex = i;
+        breakpointRole = role;
+        breakpointContentLength = JSON.stringify(content).length;
+        break;
+      }
+    }
+
+    // Message structure summary (content size is counted in UTF-8 bytes)
+    const roleCounts: Record<string, number> = {};
+    let totalContentBytes = 0;
+    for (const msg of messages) {
+      roleCounts[msg.role] = (roleCounts[msg.role] || 0) + 1;
+      const c = msg.content;
+      if (typeof c === 'string') {
+        totalContentBytes += Buffer.byteLength(c, 'utf8');
+      } else if (Array.isArray(c)) {
+        totalContentBytes += Buffer.byteLength(JSON.stringify(c), 'utf8');
+      }
+    }
+
+    // Count reasoning_details entries (residual after dedup)
+    let reasoningDetailCount = 0;
+    for (const msg of messages) {
+      if ('reasoning_details' in msg && Array.isArray(msg.reasoning_details)) {
+        reasoningDetailCount += msg.reasoning_details.length;
+      }
+    }
+
+    // Prefix hash: SHA256 of messages[0..breakpointIndex] serialized.
+    // This is the content that SHOULD be cached across consecutive requests.
+    // If this hash changes between requests in the same session, the cache misses.
+    // prefixBytes is the UTF-8 size of exactly the string that gets hashed.
+    let prefixHash = '<no-breakpoint>';
+    let prefixBytes = 0;
+    if (breakpointIndex >= 0) {
+      const prefix = messages.slice(0, breakpointIndex + 1);
+      const prefixJson = JSON.stringify(prefix);
+      prefixBytes = Buffer.byteLength(prefixJson, 'utf8');
+      prefixHash = crypto.createHash('sha256').update(prefixJson).digest('hex').slice(0, 16);
+    }
+
+    // Full body hash (for dedup / correlation)
+    const bodyJson = JSON.stringify(request.body);
+    const bodyBytes = Buffer.byteLength(bodyJson, 'utf8');
+    const bodyHash = crypto.createHash('sha256').update(bodyJson).digest('hex').slice(0, 16);
+
+    console.log(
+      `[CacheDiag]`,
+      JSON.stringify({
+        sessionId: sessionId ?? '<none>',
+        model: request.body.model,
+        msgCount: messages.length,
+        roles: roleCounts,
+        reasoningDetails: reasoningDetailCount,
+        breakpoint: {
+          index: breakpointIndex,
+          role: breakpointRole,
+          contentLen: breakpointContentLength,
+        },
+        promptCacheKey: 'prompt_cache_key' in request.body && !!request.body.prompt_cache_key,
+        prefixHash,
+        prefixBytes,
+        bodyHash,
+        bodyBytes,
+        totalContentBytes,
+      })
+    );
+  } catch (err) {
+    // Never let diagnostic logging break the request
+    console.warn('[CacheDiag] error:', err);
+  }
+}

From f3f54e552ed9d08e0735fde424c37c5e6726cb8f Mon Sep 17 00:00:00 2001
From: Alex Alecu <a.marian.alexandru@gmail.com>
Date: Tue, 31 Mar 2026 13:52:13 +0300
Subject: [PATCH 2/6] feat(proxy): call cache diagnostics before upstream
 request

Wire logCacheDiagnostics() into the gateway route handler after all body
mutations (tracking IDs, reasoning dedup, cache breakpoints) and before
forwarding upstream, so the [CacheDiag] log captures the exact request
state sent to the provider.
---
 src/app/api/openrouter/[...path]/route.ts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/app/api/openrouter/[...path]/route.ts b/src/app/api/openrouter/[...path]/route.ts
index 6b3497bac..3a1fc09a9 100644
--- a/src/app/api/openrouter/[...path]/route.ts
+++ b/src/app/api/openrouter/[...path]/route.ts
@@ -86,6 +86,7 @@ import {
   getMaxTokens,
   hasMiddleOutTransform,
 } from '@/lib/providers/openrouter/request-helpers';
+import { logCacheDiagnostics } from '@/lib/providers/cache-debug';
 
 export const maxDuration = 800;
 
@@ -481,6 +482,8 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
     userByok
   );
 
+  logCacheDiagnostics(requestBodyParsed, originalModelIdLowerCased, taskId ?? null);
+
   let response: Response;
   if (requestBodyParsed.kind === 'chat_completions' && provider.id === 'martian') {
     response = await grokCodeFastOptimizedRequest(

From 12aab23cbb5d02858889a2a493098b33b6614655 Mon Sep 17 00:00:00 2001
From: Alex Alecu <a.marian.alexandru@gmail.com>
Date: Tue, 31 Mar 2026 13:52:18 +0300
Subject: [PATCH 3/6] feat(proxy): add post-response cache token reconciliation
 logging

Add [CacheDiag:response] log blocks in processTokenData() for Anthropic
chat_completions with tools. Logs cache hit/write/input tokens and cost
from both generation lookup and inline-only paths, enabling correlation
with the pre-request [CacheDiag] prefix hash to diagnose cache misses.
---
 src/lib/processUsage.ts | 52 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/src/lib/processUsage.ts b/src/lib/processUsage.ts
index 641b6aa83..59127c6e4 100644
--- a/src/lib/processUsage.ts
+++ b/src/lib/processUsage.ts
@@ -820,6 +820,58 @@ async function processTokenData(
       );
     }
     usageStats = genStats;
+
+    // Log cache token reconciliation for Anthropic models with tools
+    if (
+      usageContext.api_kind === 'chat_completions' &&
+      usageContext.has_tools &&
+      usageContext.requested_model.startsWith('anthropic/')
+    ) {
+      console.log(
+        `[CacheDiag:response]`,
+        JSON.stringify({
+          sessionId: usageContext.session_id,
+          model: usageStats.model,
+          source: 'generation',
+          messageId: usageStats.messageId,
+          upstreamId: usageStats.upstream_id,
+          inputTokens: usageStats.inputTokens,
+          cacheHitTokens: usageStats.cacheHitTokens,
+          cacheWriteTokens: usageStats.cacheWriteTokens,
+          outputTokens: usageStats.outputTokens,
+          cost_mUsd: usageStats.cost_mUsd,
+          cacheDiscount_mUsd: usageStats.cacheDiscount_mUsd,
+          inferenceProvider: usageStats.inference_provider,
+        })
+      );
+    }
+  }
+
+  // Log inline-only usage for Anthropic models with tools (no generation data)
+  if (
+    !generation &&
+    usageContext.api_kind === 'chat_completions' &&
+    usageContext.has_tools &&
+    usageContext.requested_model.startsWith('anthropic/')
+  ) {
+    console.log(
+      `[CacheDiag:response]`,
+      JSON.stringify({
+        sessionId: usageContext.session_id,
+        model: usageStats.model,
+        source: 'inline',
+        messageId: usageStats.messageId,
+        upstreamId: usageStats.upstream_id,
+        inputTokens: usageStats.inputTokens,
+        cacheHitTokens: usageStats.cacheHitTokens,
+        cacheWriteTokens: usageStats.cacheWriteTokens,
+        outputTokens: usageStats.outputTokens,
+        cost_mUsd: usageStats.cost_mUsd,
+        cacheDiscount_mUsd: usageStats.cacheDiscount_mUsd ?? null,
+        inferenceProvider: usageStats.inference_provider,
+        generationLookupFailed: true,
+      })
+    );
   }
 
   if (usageStats.inputTokens - usageStats.cacheHitTokens > 100000)

From a6de1f7ae7f57788e96ce40a902557c6916e8ba2 Mon Sep 17 00:00:00 2001
From: Alex Alecu <a.marian.alexandru@gmail.com>
Date: Thu, 2 Apr 2026 10:53:39 +0300
Subject: [PATCH 4/6] feat(scripts): add cache diagnostic E2E test and fix
 server-only shim

Add test-cache-diag.ts script that makes multi-turn streaming requests
with realistic tools and system prompt to exercise the [CacheDiag] logging
path. Wire up the server-only shim in the script runner so CLI scripts
can import modules that transitively depend on server-only.
---
 src/lib/providers/cache-debug.ts          |   3 +-
 src/scripts/index.ts                      |  12 +
 src/scripts/openrouter/test-cache-diag.ts | 498 ++++++++++++++++++++++
 3 files changed, 512 insertions(+), 1 deletion(-)
 create mode 100644 src/scripts/openrouter/test-cache-diag.ts

diff --git a/src/lib/providers/cache-debug.ts b/src/lib/providers/cache-debug.ts
index becd1072b..285fdf5af 100644
--- a/src/lib/providers/cache-debug.ts
+++ b/src/lib/providers/cache-debug.ts
@@ -40,7 +40,8 @@ export function logCacheDiagnostics(
             'cache_control' in part &&
             typeof (part as Record<string, unknown>).cache_control === 'object' &&
             (part as Record<string, unknown>).cache_control !== null &&
-            'type' in ((part as Record<string, unknown>).cache_control as Record<string, unknown>) &&
+            'type' in
+              ((part as Record<string, unknown>).cache_control as Record<string, unknown>) &&
             ((part as Record<string, unknown>).cache_control as Record<string, unknown>).type ===
               'ephemeral'
         );
diff --git a/src/scripts/index.ts b/src/scripts/index.ts
index 88e873eb2..a8a52ec18 100644
--- a/src/scripts/index.ts
+++ b/src/scripts/index.ts
@@ -1,6 +1,18 @@
 // Load environment variables before any other imports
 import '../lib/load-env';
 
+// Shim 'server-only' for CLI scripts. Next.js strips this at build time, but
+// tsx/Node.js doesn't — pre-populate the require cache with an empty module so
+// transitive imports of 'server-only' (e.g. via config.server.ts) don't throw.
+import Module from 'node:module';
+const serverOnlyResolved = require.resolve('server-only');
+(Module as unknown as { _cache: Record<string, unknown> })._cache[serverOnlyResolved] = {
+  id: serverOnlyResolved,
+  filename: serverOnlyResolved,
+  loaded: true,
+  exports: {},
+};
+
 // get all folders in the src/scripts directory excluding './lib'
 import { readdirSync } from 'fs';
 import { join } from 'path';
diff --git a/src/scripts/openrouter/test-cache-diag.ts b/src/scripts/openrouter/test-cache-diag.ts
new file mode 100644
index 000000000..3b9609edb
--- /dev/null
+++ b/src/scripts/openrouter/test-cache-diag.ts
@@ -0,0 +1,498 @@
+import { eq } from 'drizzle-orm';
+import { createParser, type EventSourceMessage } from 'eventsource-parser';
+import { generateApiToken } from '@/lib/tokens';
+import { db } from '@/lib/drizzle';
+import { kilocode_users } from '@kilocode/db/schema';
+import { CLAUDE_OPUS_CURRENT_MODEL_ID } from '@/lib/providers/anthropic';
+
+type ChatMessage = {
+  role: 'system' | 'user' | 'assistant';
+  content: string;
+};
+
+type ToolDefinition = {
+  type: 'function';
+  function: {
+    name: string;
+    description: string;
+    parameters: Record<string, unknown>;
+  };
+};
+
+type TurnResult = {
+  turn: number;
+  messageCount: number;
+  elapsedMs: number;
+  responseText: string;
+  promptTokens: number | null;
+  completionTokens: number | null;
+  cachedTokens: number | null;
+};
+
+// Realistic tool definitions matching what the Kilo Code extension sends.
+// A real "Code" mode request has 14 tools; we include 8 representative ones
+// to push the prompt well past the 1,024-token minimum for Anthropic cache.
+const TOOL_DEFINITIONS: ToolDefinition[] = [
+  {
+    type: 'function',
+    function: {
+      name: 'read_file',
+      description:
+        'Read one or more files and return their contents with line numbers for diffing or discussion. Structure: { files: [{ path: "relative/path.ts" }] }. The "path" is required and relative to workspace. Supports text extraction from PDF and DOCX files, but may not handle other binary files properly.',
+      parameters: {
+        type: 'object',
+        properties: {
+          files: {
+            type: 'array',
+            description: 'List of files to read; request related files together when allowed',
+            items: {
+              type: 'object',
+              properties: { path: { type: 'string', description: 'Path to the file to read' } },
+              required: ['path'],
+              additionalProperties: false,
+            },
+            minItems: 1,
+          },
+        },
+        required: ['files'],
+        additionalProperties: false,
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'write_to_file',
+      description:
+        "Request to write content to a file. This tool is primarily used for creating new files or for scenarios where a complete rewrite of an existing file is intentionally required. If the file exists, it will be overwritten. If it doesn't exist, it will be created. This tool will automatically create any directories needed to write the file. ALWAYS provide the COMPLETE file content in your response. Partial updates or placeholders are STRICTLY FORBIDDEN.",
+      parameters: {
+        type: 'object',
+        properties: {
+          path: {
+            type: 'string',
+            description: 'Path to the file to write, relative to the workspace',
+          },
+          content: {
+            type: 'string',
+            description:
+              'Full contents that the file should contain with no omissions or line numbers',
+          },
+        },
+        required: ['path', 'content'],
+        additionalProperties: false,
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'apply_diff',
+      description:
+        'Apply precise, targeted modifications to an existing file using one or more search/replace blocks. This tool is for surgical edits only; the SEARCH block must exactly match the existing content, including whitespace and indentation. To make multiple targeted changes, provide multiple SEARCH/REPLACE blocks in the diff parameter. Use the read_file tool first if you are not confident in the exact content to search for.',
+      parameters: {
+        type: 'object',
+        properties: {
+          path: {
+            type: 'string',
+            description: 'The path of the file to modify, relative to the workspace.',
+          },
+          diff: {
+            type: 'string',
+            description:
+              'A string containing one or more search/replace blocks. Each block must follow this format:\n<<<<<<< SEARCH\n:start_line:[line_number]\n-------\n[exact content to find]\n=======\n[new content to replace with]\n>>>>>>> REPLACE',
+          },
+        },
+        required: ['path', 'diff'],
+        additionalProperties: false,
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'execute_command',
+      description:
+        "Request to execute a CLI command on the system. Use this when you need to perform system operations or run specific commands to accomplish any step in the user's task. You must tailor your command to the user's system and provide a clear explanation of what the command does. For command chaining, use the appropriate chaining syntax for the user's shell. Prefer to execute complex CLI commands over creating executable scripts.",
+      parameters: {
+        type: 'object',
+        properties: {
+          command: { type: 'string', description: 'Shell command to execute' },
+          cwd: { type: 'string', description: 'Optional working directory for the command' },
+        },
+        required: ['command', 'cwd'],
+        additionalProperties: false,
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'search_files',
+      description:
+        'Request to perform a regex search across files in a specified directory, providing context-rich results. This tool searches for patterns or specific content across multiple files, displaying each match with encapsulating context. Craft your regex patterns carefully to balance specificity and flexibility. Use this tool to find code patterns, TODO comments, function definitions, or any text-based information across the project.',
+      parameters: {
+        type: 'object',
+        properties: {
+          path: { type: 'string', description: 'Directory to search recursively' },
+          regex: { type: 'string', description: 'Rust-compatible regular expression pattern' },
+          file_pattern: {
+            type: 'string',
+            description: 'Optional glob to limit which files are searched',
+          },
+        },
+        required: ['path', 'regex', 'file_pattern'],
+        additionalProperties: false,
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'list_files',
+      description:
+        'Request to list files and directories within the specified directory. If recursive is true, it will list all files and directories recursively. If recursive is false or not provided, it will only list the top-level contents. Do not use this tool to confirm the existence of files you may have created.',
+      parameters: {
+        type: 'object',
+        properties: {
+          path: {
+            type: 'string',
+            description: 'Directory path to inspect, relative to the workspace',
+          },
+          recursive: { type: 'boolean', description: 'Set true to list contents recursively' },
+        },
+        required: ['path', 'recursive'],
+        additionalProperties: false,
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'codebase_search',
+      description:
+        "Find files most relevant to the search query using semantic search. Searches based on meaning rather than exact text matches. By default searches entire workspace. Reuse the user's exact wording unless there's a clear reason not to — their phrasing often helps semantic search. Queries MUST be in English. CRITICAL: For ANY exploration of code you haven't examined yet in this conversation, you MUST use this tool FIRST before any other search or file exploration tools.",
+      parameters: {
+        type: 'object',
+        properties: {
+          query: { type: 'string', description: 'Meaning-based search query' },
+          path: { type: 'string', description: 'Optional subdirectory to limit the search scope' },
+        },
+        required: ['query', 'path'],
+        additionalProperties: false,
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'attempt_completion',
+      description:
+        "After each tool use, the user will respond with the result of that tool use. Once you've received the results and can confirm that the task is complete, use this tool to present the result of your work to the user. IMPORTANT: This tool CANNOT be used until you've confirmed from the user that any previous tool uses were successful. Before using this tool, you must confirm that you've received successful results from the user for any previous tool uses.",
+      parameters: {
+        type: 'object',
+        properties: {
+          result: { type: 'string', description: 'Final result message to deliver to the user' },
+        },
+        required: ['result'],
+        additionalProperties: false,
+      },
+    },
+  },
+];
+
+const TURNS: { role: 'user'; content: string }[] = [
+  {
+    role: 'user',
+    content:
+      '<task>\nI need you to refactor the authentication module in src/auth/login.ts. The current implementation uses callbacks and I want it converted to async/await. Make sure all error handling is preserved and add proper TypeScript types for the return values.\n</task>\n\n<environment_details>\n# VSCode Visible Files\nsrc/auth/login.ts\n\n# VSCode Open Tabs\nsrc/auth/login.ts\nsrc/auth/types.ts\nsrc/auth/middleware.ts\n\n# Current Time\n2026-03-31T14:00:00.000Z\n</environment_details>',
+  },
+  {
+    role: 'user',
+    content:
+      'The refactored code looks good but I also need you to update the tests in src/auth/__tests__/login.test.ts to match the new async/await pattern. The tests currently use done() callbacks.',
+  },
+  {
+    role: 'user',
+    content:
+      'Now please also update the middleware in src/auth/middleware.ts to use the new async login function, and make sure the error handling middleware catches any rejected promises properly.',
+  },
+];
+
+// Realistic Kilo Code system prompt (abbreviated but representative in token count).
+const SYSTEM_MESSAGE: ChatMessage = {
+  role: 'system',
+  content: `You are Kilo Code, a highly skilled software engineer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
+
+====
+
+MARKDOWN RULES
+
+ALL responses MUST show ANY \`language construct\` OR filename reference as clickable, exactly as [\`filename OR language.declaration()\`](relative/file/path.ext:line); line is required for \`syntax\` and optional for filename links.
+
+====
+
+TOOL USE
+
+You have access to a set of tools that are executed upon the user's approval. Use the provider-native tool-calling mechanism. Do not include XML markup or examples.
+
+# Tool Use Guidelines
+
+1. Assess what information you already have and what information you need to proceed with the task.
+2. Choose the most appropriate tool based on the task and the tool descriptions provided. Assess if you need additional information to proceed, and which of the available tools would be most effective for gathering this information.
+3. If multiple actions are needed, you may use multiple tools in a single message when appropriate, or use tools iteratively across messages. Each tool use should be informed by the results of previous tool uses.
+4. After each tool use, the user will respond with the result of that tool use. This result will provide you with the necessary information to continue your task or make further decisions.
+
+====
+
+CAPABILITIES
+
+- You can read and analyze code in various programming languages, and can write clean, efficient, and well-documented code.
+- You can analyze project structures and suggest improvements for better organization and maintainability.
+- You can debug complex issues by analyzing error messages, stack traces, and application behavior.
+- You can provide refactoring suggestions to improve code quality, readability, and performance.
+- You can write and update unit tests, integration tests, and end-to-end tests.
+
+====
+
+RULES
+
+- Your current working directory is always the workspace root.
+- You cannot \`cd\` into a different directory to complete a task. You are stuck operating from the workspace root.
+- Do not use the ~ character or $HOME to refer to the home directory.
+- When using the search_files tool, craft your regex patterns carefully to balance specificity and flexibility.
+- When creating a new project, organize all new files within a dedicated project directory unless the user specifies otherwise.
+- Be sure to consider the type of project (e.g. Python, JavaScript, web application) when determining the appropriate structure and files to include.
+- When making changes to code, always consider the context in which the code is being used. Ensure your changes are compatible with the existing codebase.
+- Do not ask for more information than necessary. Use the tools provided to accomplish the user's request efficiently.
+- When you have completed the task, use the attempt_completion tool to present the result.
+- The user may provide feedback, which you can use to make improvements and try again.
+- IMPORTANT: You should NEVER end attempt_completion result with a question or request to engage in further conversation.`,
+};
+
+// Looks up the user row by google_user_email and returns an API token plus the server base URL.
+async function authenticateUser(email: string) {
+  const matches = await db
+    .select()
+    .from(kilocode_users)
+    .where(eq(kilocode_users.google_user_email, email))
+    .limit(1);
+  if (!matches?.length) {
+    throw new Error(`User with email "${email}" not found in the database`);
+  }
+
+  // An unset or empty NEXT_PUBLIC_BASE_URL falls back to the localhost default.
+  const baseUrl = process.env.NEXT_PUBLIC_BASE_URL || 'http://localhost:3000';
+  return { authToken: generateApiToken(matches[0]), baseUrl };
+}
+
+async function streamTurn(
+  baseUrl: string,
+  authToken: string,
+  model: string,
+  taskId: string,
+  messages: ChatMessage[]
+): Promise<TurnResult & { assistantMessage: ChatMessage }> {
+  const startTime = Date.now();
+
+  const response = await fetch(`${baseUrl}/api/openrouter/chat/completions`, {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${authToken}`,
+      'Content-Type': 'application/json',
+      'x-forwarded-for': '127.0.0.1',
+      'x-kilocode-taskid': taskId,
+    },
+    body: JSON.stringify({
+      model,
+      messages,
+      tools: TOOL_DEFINITIONS,
+      tool_choice: 'none',
+      stream: true,
+      max_tokens: 256,
+    }),
+  });
+
+  if (!response.ok) {
+    let errorDetail = `HTTP ${response.status} ${response.statusText}`;
+    try {
+      const body = await response.json();
+      errorDetail = body.error?.message || body.error || JSON.stringify(body);
+    } catch {
+      // keep the HTTP status message
+    }
+    throw new Error(`Request failed: ${errorDetail}`);
+  }
+
+  if (!response.body) {
+    throw new Error('Response body is null — expected a stream');
+  }
+
+  const contentParts: string[] = [];
+  let promptTokens: number | null = null;
+  let completionTokens: number | null = null;
+  let cachedTokens: number | null = null;
+
+  await new Promise<void>((resolve, reject) => {
+    const parser = createParser({
+      onEvent(event: EventSourceMessage) {
+        if (event.data === '[DONE]') {
+          resolve();
+          return;
+        }
+
+        try {
+          const chunk = JSON.parse(event.data);
+
+          // Content deltas
+          const delta = chunk.choices?.[0]?.delta;
+          if (delta?.content) {
+            contentParts.push(delta.content);
+          }
+
+          // Usage from final chunk (choices is empty, usage is present)
+          if (chunk.usage) {
+            promptTokens = chunk.usage.prompt_tokens ?? null;
+            completionTokens = chunk.usage.completion_tokens ?? null;
+            cachedTokens = chunk.usage.prompt_tokens_details?.cached_tokens ?? null;
+          }
+        } catch {
+          // skip malformed chunks
+        }
+      },
+    });
+
+    // body is checked for null above; the non-null path is guarded
+    const body = response.body;
+    if (!body) throw new Error('unreachable');
+    const reader = body.getReader();
+    const decoder = new TextDecoder();
+
+    function pump(): void {
+      reader
+        .read()
+        .then(({ done, value }) => {
+          if (done) {
+            resolve();
+            return;
+          }
+          parser.feed(decoder.decode(value, { stream: true }));
+          pump();
+        })
+        .catch(reject);
+    }
+
+    pump();
+  });
+
+  const elapsedMs = Date.now() - startTime;
+  const responseText = contentParts.join('');
+
+  return {
+    turn: 0, // filled in by caller
+    messageCount: messages.length,
+    elapsedMs,
+    responseText,
+    promptTokens,
+    completionTokens,
+    cachedTokens,
+    assistantMessage: { role: 'assistant', content: responseText },
+  };
+}
+
+function truncate(text: string, maxLen: number): string {
+  if (text.length <= maxLen) return text;
+  return text.slice(0, maxLen) + '...';
+}
+
+function cacheHitRate(promptTokens: number | null, cachedTokens: number | null): string {
+  if (promptTokens === null || cachedTokens === null || promptTokens === 0) return 'N/A';
+  return `${Math.round((cachedTokens / promptTokens) * 100)}%`;
+}
+
+function printSummary(results: TurnResult[]) {
+  console.log('\n=== Summary ===');
+  console.log('| Turn | Messages | Input  | Output | Cached | Hit Rate |');
+  console.log('|------|----------|--------|--------|--------|----------|');
+  for (const r of results) {
+    const input = r.promptTokens !== null ? String(r.promptTokens).padEnd(6) : 'N/A   ';
+    const output = r.completionTokens !== null ? String(r.completionTokens).padEnd(6) : 'N/A   ';
+    const cached = r.cachedTokens !== null ? String(r.cachedTokens).padEnd(6) : 'N/A   ';
+    const hitRate = cacheHitRate(r.promptTokens, r.cachedTokens).padEnd(8);
+    console.log(
+      `| ${r.turn}    | ${String(r.messageCount).padEnd(8)} | ${input} | ${output} | ${cached} | ${hitRate} |`
+    );
+  }
+
+  console.log('\nCheck server logs (pnpm dev terminal) for [CacheDiag] entries:');
+  console.log('  - [CacheDiag] shows prefixHash, breakpoint, promptCacheKey (pre-request)');
+  console.log('  - [CacheDiag:response] shows cacheHitTokens, cacheWriteTokens (post-response)');
+  console.log(
+    '\nNote: cacheWriteTokens are only visible server-side, not in the client SSE stream.'
+  );
+}
+
+/**
+ * Cache diagnostic E2E test script.
+ *
+ * Makes multi-turn streaming requests with tools to the local gateway,
+ * using the same taskId across turns to exercise the [CacheDiag] logging.
+ *
+ * @param email - google_user_email of a user in the local DB (required)
+ * @param model - OpenRouter model ID (optional, default: anthropic/claude-opus-4.6)
+ */
+export async function run(email: string, model?: string): Promise<void> {
+  if (!email) {
+    console.error('Error: email is required');
+    console.log('\nUsage: npx tsx src/scripts/index.ts openrouter test-cache-diag <email> [model]');
+    console.log('\nExamples:');
+    console.log('  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com');
+    console.log(
+      '  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com anthropic/claude-sonnet-4.5'
+    );
+    process.exit(1);
+  }
+
+  const resolvedModel = model || CLAUDE_OPUS_CURRENT_MODEL_ID;
+
+  try {
+    console.log('=== Cache Diagnostic E2E Test ===\n');
+
+    const { authToken, baseUrl } = await authenticateUser(email);
+    console.log(`Auth: ${email}`);
+    console.log(`Server: ${baseUrl}`);
+
+    const taskId = `cache-diag-${Date.now()}`;
+    console.log(`\n--- ${resolvedModel} ---`);
+    console.log(`Task ID: ${taskId}\n`);
+
+    const messages: ChatMessage[] = [SYSTEM_MESSAGE];
+    const results: TurnResult[] = [];
+
+    for (let i = 0; i < TURNS.length; i++) {
+      const turnNum = i + 1;
+      messages.push(TURNS[i]);
+
+      console.log(`Turn ${turnNum}/${TURNS.length}: ${messages.length} messages, sending...`);
+
+      const result = await streamTurn(baseUrl, authToken, resolvedModel, taskId, messages);
+      result.turn = turnNum;
+
+      console.log(
+        `  Response (${(result.elapsedMs / 1000).toFixed(1)}s): "${truncate(result.responseText, 50)}"`
+      );
+      console.log(
+        `  Usage: input=${result.promptTokens ?? 'N/A'} output=${result.completionTokens ?? 'N/A'} cached=${result.cachedTokens ?? 'N/A'}`
+      );
+      console.log(`  Cache hit rate: ${cacheHitRate(result.promptTokens, result.cachedTokens)}\n`);
+
+      // Append assistant response for next turn
+      messages.push(result.assistantMessage);
+      results.push(result);
+    }
+
+    printSummary(results);
+  } catch (error) {
+    console.error('\nError:', error instanceof Error ? error.message : String(error));
+    process.exit(1);
+  }
+}

From 21bcc44e4d6b0828df098fac48a40afdc405f95e Mon Sep 17 00:00:00 2001
From: Alex Alecu <a.marian.alexandru@gmail.com>
Date: Thu, 2 Apr 2026 11:36:43 +0300
Subject: [PATCH 5/6] feat(scripts): add provider comparison mode and cache
 write token tracking

Add 'compare' mode that runs the same test against both amazon-bedrock
and anthropic providers back-to-back. Extract cache_write_tokens from the
SSE stream (now available from OpenRouter). Accept optional provider arg
to force routing via provider.only.
---
 src/scripts/openrouter/test-cache-diag.ts | 138 +++++++++++++---------
 1 file changed, 85 insertions(+), 53 deletions(-)

diff --git a/src/scripts/openrouter/test-cache-diag.ts b/src/scripts/openrouter/test-cache-diag.ts
index 3b9609edb..5cdde1b10 100644
--- a/src/scripts/openrouter/test-cache-diag.ts
+++ b/src/scripts/openrouter/test-cache-diag.ts
@@ -27,6 +27,7 @@ type TurnResult = {
   promptTokens: number | null;
   completionTokens: number | null;
   cachedTokens: number | null;
+  cacheWriteTokens: number | null;
 };
 
 // Realistic tool definitions matching what the Kilo Code extension sends.
@@ -290,10 +291,23 @@ async function streamTurn(
   authToken: string,
   model: string,
   taskId: string,
-  messages: ChatMessage[]
+  messages: ChatMessage[],
+  providerOnly?: string
 ): Promise<TurnResult & { assistantMessage: ChatMessage }> {
   const startTime = Date.now();
 
+  const body: Record<string, unknown> = {
+    model,
+    messages,
+    tools: TOOL_DEFINITIONS,
+    tool_choice: 'none',
+    stream: true,
+    max_tokens: 256,
+  };
+  if (providerOnly) {
+    body.provider = { only: [providerOnly] };
+  }
+
   const response = await fetch(`${baseUrl}/api/openrouter/chat/completions`, {
     method: 'POST',
     headers: {
@@ -302,14 +316,7 @@ async function streamTurn(
       'x-forwarded-for': '127.0.0.1',
       'x-kilocode-taskid': taskId,
     },
-    body: JSON.stringify({
-      model,
-      messages,
-      tools: TOOL_DEFINITIONS,
-      tool_choice: 'none',
-      stream: true,
-      max_tokens: 256,
-    }),
+    body: JSON.stringify(body),
   });
 
   if (!response.ok) {
@@ -331,6 +338,7 @@ async function streamTurn(
   let promptTokens: number | null = null;
   let completionTokens: number | null = null;
   let cachedTokens: number | null = null;
+  let cacheWriteTokens: number | null = null;
 
   await new Promise<void>((resolve, reject) => {
     const parser = createParser({
@@ -354,6 +362,7 @@ async function streamTurn(
             promptTokens = chunk.usage.prompt_tokens ?? null;
             completionTokens = chunk.usage.completion_tokens ?? null;
             cachedTokens = chunk.usage.prompt_tokens_details?.cached_tokens ?? null;
+            cacheWriteTokens = chunk.usage.prompt_tokens_details?.cache_write_tokens ?? null;
           }
         } catch {
           // skip malformed chunks
@@ -395,6 +404,7 @@ async function streamTurn(
     promptTokens,
     completionTokens,
     cachedTokens,
+    cacheWriteTokens,
     assistantMessage: { role: 'assistant', content: responseText },
   };
 }
@@ -409,26 +419,60 @@ function cacheHitRate(promptTokens: number | null, cachedTokens: number | null):
   return `${Math.round((cachedTokens / promptTokens) * 100)}%`;
 }
 
-function printSummary(results: TurnResult[]) {
-  console.log('\n=== Summary ===');
-  console.log('| Turn | Messages | Input  | Output | Cached | Hit Rate |');
-  console.log('|------|----------|--------|--------|--------|----------|');
+function printSummary(results: TurnResult[], label?: string) {
+  console.log(`\n=== Summary${label ? ` (${label})` : ''} ===`);
+  console.log('| Turn | Msgs | Input | Output | Cache Write | Cache Read | Hit Rate |');
+  console.log('|------|------|-------|--------|-------------|------------|----------|');
   for (const r of results) {
-    const input = r.promptTokens !== null ? String(r.promptTokens).padEnd(6) : 'N/A   ';
+    const input = r.promptTokens !== null ? String(r.promptTokens).padEnd(5) : 'N/A  ';
     const output = r.completionTokens !== null ? String(r.completionTokens).padEnd(6) : 'N/A   ';
-    const cached = r.cachedTokens !== null ? String(r.cachedTokens).padEnd(6) : 'N/A   ';
+    const write =
+      r.cacheWriteTokens !== null ? String(r.cacheWriteTokens).padEnd(11) : 'N/A        ';
+    const read = r.cachedTokens !== null ? String(r.cachedTokens).padEnd(10) : 'N/A       ';
     const hitRate = cacheHitRate(r.promptTokens, r.cachedTokens).padEnd(8);
     console.log(
-      `| ${r.turn}    | ${String(r.messageCount).padEnd(8)} | ${input} | ${output} | ${cached} | ${hitRate} |`
+      `| ${r.turn}    | ${String(r.messageCount).padEnd(4)} | ${input} | ${output} | ${write} | ${read} | ${hitRate} |`
     );
   }
+}
+
+async function runSession(
+  baseUrl: string,
+  authToken: string,
+  model: string,
+  providerOnly?: string
+): Promise<TurnResult[]> {
+  const label = providerOnly ?? 'default routing';
+  const taskId = `cache-diag-${Date.now()}`;
+  console.log(`\n--- ${model} via ${label} ---`);
+  console.log(`Task ID: ${taskId}\n`);
 
-  console.log('\nCheck server logs (pnpm dev terminal) for [CacheDiag] entries:');
-  console.log('  - [CacheDiag] shows prefixHash, breakpoint, promptCacheKey (pre-request)');
-  console.log('  - [CacheDiag:response] shows cacheHitTokens, cacheWriteTokens (post-response)');
-  console.log(
-    '\nNote: cacheWriteTokens are only visible server-side, not in the client SSE stream.'
-  );
+  const messages: ChatMessage[] = [SYSTEM_MESSAGE];
+  const results: TurnResult[] = [];
+
+  for (let i = 0; i < TURNS.length; i++) {
+    const turnNum = i + 1;
+    messages.push(TURNS[i]);
+
+    console.log(`Turn ${turnNum}/${TURNS.length}: ${messages.length} messages, sending...`);
+
+    const result = await streamTurn(baseUrl, authToken, model, taskId, messages, providerOnly);
+    result.turn = turnNum;
+
+    console.log(
+      `  Response (${(result.elapsedMs / 1000).toFixed(1)}s): "${truncate(result.responseText, 50)}"`
+    );
+    console.log(
+      `  Usage: input=${result.promptTokens ?? 'N/A'} output=${result.completionTokens ?? 'N/A'} cached_read=${result.cachedTokens ?? 'N/A'} cached_write=${result.cacheWriteTokens ?? 'N/A'}`
+    );
+    console.log(`  Cache hit rate: ${cacheHitRate(result.promptTokens, result.cachedTokens)}\n`);
+
+    messages.push(result.assistantMessage);
+    results.push(result);
+  }
+
+  printSummary(results, label);
+  return results;
 }
 
 /**
@@ -439,15 +483,23 @@ function printSummary(results: TurnResult[]) {
  *
  * @param email - google_user_email of a user in the local DB (required)
  * @param model - OpenRouter model ID (optional, default: anthropic/claude-opus-4.6)
+ * @param provider - OpenRouter provider to force via provider.only (optional,
+ *                   e.g. "amazon-bedrock" or "anthropic"). Omit to use default routing.
+ *                   Use "compare" to run both amazon-bedrock and anthropic back-to-back.
  */
-export async function run(email: string, model?: string): Promise<void> {
+export async function run(email: string, model?: string, provider?: string): Promise<void> {
   if (!email) {
     console.error('Error: email is required');
-    console.log('\nUsage: npx tsx src/scripts/index.ts openrouter test-cache-diag <email> [model]');
+    console.log(
+      '\nUsage: npx tsx src/scripts/index.ts openrouter test-cache-diag <email> [model] [provider]'
+    );
     console.log('\nExamples:');
     console.log('  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com');
     console.log(
-      '  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com anthropic/claude-sonnet-4.5'
+      '  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com anthropic/claude-opus-4.6 anthropic'
+    );
+    console.log(
+      '  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com anthropic/claude-opus-4.6 compare'
     );
     process.exit(1);
   }
@@ -461,36 +513,16 @@ export async function run(email: string, model?: string): Promise<void> {
     console.log(`Auth: ${email}`);
     console.log(`Server: ${baseUrl}`);
 
-    const taskId = `cache-diag-${Date.now()}`;
-    console.log(`\n--- ${resolvedModel} ---`);
-    console.log(`Task ID: ${taskId}\n`);
-
-    const messages: ChatMessage[] = [SYSTEM_MESSAGE];
-    const results: TurnResult[] = [];
-
-    for (let i = 0; i < TURNS.length; i++) {
-      const turnNum = i + 1;
-      messages.push(TURNS[i]);
-
-      console.log(`Turn ${turnNum}/${TURNS.length}: ${messages.length} messages, sending...`);
-
-      const result = await streamTurn(baseUrl, authToken, resolvedModel, taskId, messages);
-      result.turn = turnNum;
-
-      console.log(
-        `  Response (${(result.elapsedMs / 1000).toFixed(1)}s): "${truncate(result.responseText, 50)}"`
-      );
-      console.log(
-        `  Usage: input=${result.promptTokens ?? 'N/A'} output=${result.completionTokens ?? 'N/A'} cached=${result.cachedTokens ?? 'N/A'}`
-      );
-      console.log(`  Cache hit rate: ${cacheHitRate(result.promptTokens, result.cachedTokens)}\n`);
-
-      // Append assistant response for next turn
-      messages.push(result.assistantMessage);
-      results.push(result);
+    if (provider === 'compare') {
+      const providers = ['amazon-bedrock', 'anthropic'] as const;
+      for (const p of providers) {
+        await runSession(baseUrl, authToken, resolvedModel, p);
+      }
+    } else {
+      await runSession(baseUrl, authToken, resolvedModel, provider);
     }
 
-    printSummary(results);
+    console.log('\nCheck server logs for [CacheDiag] and [CacheDiag:response] entries.');
   } catch (error) {
     console.error('\nError:', error instanceof Error ? error.message : String(error));
     process.exit(1);

From 83e109b92e498554106d82e8f02491b8401953ce Mon Sep 17 00:00:00 2001
From: Alex Alecu <a.marian.alexandru@gmail.com>
Date: Thu, 2 Apr 2026 15:45:32 +0300
Subject: [PATCH 6/6] fix(scripts): rewrite cache diag harness to match
 production request shape
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The old harness used tool_choice:none and synthetic messages, never
producing tool-result follow-ups. This kept prompts below Opus's
4096-token cache minimum and prevented the breakpoint from landing on
a tool message, whereas in production prompts normally exceed that
minimum and the breakpoint normally lands on a tool message.

Rewrite to use a realistic fixture (system prompt + 14 tools from
no_tool_request.json), tool_choice:auto with a local tool executor,
and provider.order instead of provider.only.
---
 src/scripts/openrouter/test-cache-diag.ts | 833 ++++++++++++++--------
 1 file changed, 541 insertions(+), 292 deletions(-)

diff --git a/src/scripts/openrouter/test-cache-diag.ts b/src/scripts/openrouter/test-cache-diag.ts
index 5cdde1b10..e43249de8 100644
--- a/src/scripts/openrouter/test-cache-diag.ts
+++ b/src/scripts/openrouter/test-cache-diag.ts
@@ -1,3 +1,6 @@
+import { execFileSync } from 'node:child_process';
+import { readdirSync, readFileSync, statSync } from 'node:fs';
+import path from 'node:path';
 import { eq } from 'drizzle-orm';
 import { createParser, type EventSourceMessage } from 'eventsource-parser';
 import { generateApiToken } from '@/lib/tokens';
@@ -5,23 +8,57 @@ import { db } from '@/lib/drizzle';
 import { kilocode_users } from '@kilocode/db/schema';
 import { CLAUDE_OPUS_CURRENT_MODEL_ID } from '@/lib/providers/anthropic';
 
-type ChatMessage = {
+type ToolDefinition = {
+  type: 'function';
+  function: {
+    name: string;
+    description: string;
+    parameters: Record<string, unknown>;
+  };
+};
+
+type TextMessage = {
   role: 'system' | 'user' | 'assistant';
   content: string;
 };
 
-type ToolDefinition = {
+type ToolCall = {
+  id: string;
   type: 'function';
   function: {
     name: string;
-    description: string;
-    parameters: Record<string, unknown>;
+    arguments: string;
   };
 };
 
-type TurnResult = {
+type AssistantToolCallMessage = {
+  role: 'assistant';
+  content: string;
+  tool_calls: ToolCall[];
+};
+
+type ToolMessage = {
+  role: 'tool';
+  tool_call_id: string;
+  content: string;
+};
+
+type RequestMessage = TextMessage | AssistantToolCallMessage | ToolMessage;
+
+type FixtureTemplate = {
+  systemMessage: string;
+  tools: ToolDefinition[];
+};
+
+type RequestPhase = 'user' | 'tool-followup';
+
+type RequestResult = {
+  requestNumber: number;
   turn: number;
+  phase: RequestPhase;
   messageCount: number;
+  lastRole: RequestMessage['role'];
+  toolCalls: number;
   elapsedMs: number;
   responseText: string;
   promptTokens: number | null;
@@ -30,245 +67,83 @@ type TurnResult = {
   cacheWriteTokens: number | null;
 };
 
-// Realistic tool definitions matching what the Kilo Code extension sends.
-// A real "Code" mode request has 14 tools; we include 8 representative ones
-// to push the prompt well past the 1,024-token minimum for Anthropic cache.
-const TOOL_DEFINITIONS: ToolDefinition[] = [
-  {
-    type: 'function',
-    function: {
-      name: 'read_file',
-      description:
-        'Read one or more files and return their contents with line numbers for diffing or discussion. Structure: { files: [{ path: "relative/path.ts" }] }. The "path" is required and relative to workspace. Supports text extraction from PDF and DOCX files, but may not handle other binary files properly.',
-      parameters: {
-        type: 'object',
-        properties: {
-          files: {
-            type: 'array',
-            description: 'List of files to read; request related files together when allowed',
-            items: {
-              type: 'object',
-              properties: { path: { type: 'string', description: 'Path to the file to read' } },
-              required: ['path'],
-              additionalProperties: false,
-            },
-            minItems: 1,
-          },
-        },
-        required: ['files'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'write_to_file',
-      description:
-        "Request to write content to a file. This tool is primarily used for creating new files or for scenarios where a complete rewrite of an existing file is intentionally required. If the file exists, it will be overwritten. If it doesn't exist, it will be created. This tool will automatically create any directories needed to write the file. ALWAYS provide the COMPLETE file content in your response. Partial updates or placeholders are STRICTLY FORBIDDEN.",
-      parameters: {
-        type: 'object',
-        properties: {
-          path: {
-            type: 'string',
-            description: 'Path to the file to write, relative to the workspace',
-          },
-          content: {
-            type: 'string',
-            description:
-              'Full contents that the file should contain with no omissions or line numbers',
-          },
-        },
-        required: ['path', 'content'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'apply_diff',
-      description:
-        'Apply precise, targeted modifications to an existing file using one or more search/replace blocks. This tool is for surgical edits only; the SEARCH block must exactly match the existing content, including whitespace and indentation. To make multiple targeted changes, provide multiple SEARCH/REPLACE blocks in the diff parameter. Use the read_file tool first if you are not confident in the exact content to search for.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: {
-            type: 'string',
-            description: 'The path of the file to modify, relative to the workspace.',
-          },
-          diff: {
-            type: 'string',
-            description:
-              'A string containing one or more search/replace blocks. Each block must follow this format:\n<<<<<<< SEARCH\n:start_line:[line_number]\n-------\n[exact content to find]\n=======\n[new content to replace with]\n>>>>>>> REPLACE',
-          },
-        },
-        required: ['path', 'diff'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'execute_command',
-      description:
-        "Request to execute a CLI command on the system. Use this when you need to perform system operations or run specific commands to accomplish any step in the user's task. You must tailor your command to the user's system and provide a clear explanation of what the command does. For command chaining, use the appropriate chaining syntax for the user's shell. Prefer to execute complex CLI commands over creating executable scripts.",
-      parameters: {
-        type: 'object',
-        properties: {
-          command: { type: 'string', description: 'Shell command to execute' },
-          cwd: { type: 'string', description: 'Optional working directory for the command' },
-        },
-        required: ['command', 'cwd'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'search_files',
-      description:
-        'Request to perform a regex search across files in a specified directory, providing context-rich results. This tool searches for patterns or specific content across multiple files, displaying each match with encapsulating context. Craft your regex patterns carefully to balance specificity and flexibility. Use this tool to find code patterns, TODO comments, function definitions, or any text-based information across the project.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: { type: 'string', description: 'Directory to search recursively' },
-          regex: { type: 'string', description: 'Rust-compatible regular expression pattern' },
-          file_pattern: {
-            type: 'string',
-            description: 'Optional glob to limit which files are searched',
-          },
-        },
-        required: ['path', 'regex', 'file_pattern'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'list_files',
-      description:
-        'Request to list files and directories within the specified directory. If recursive is true, it will list all files and directories recursively. If recursive is false or not provided, it will only list the top-level contents. Do not use this tool to confirm the existence of files you may have created.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: {
-            type: 'string',
-            description: 'Directory path to inspect, relative to the workspace',
-          },
-          recursive: { type: 'boolean', description: 'Set true to list contents recursively' },
-        },
-        required: ['path', 'recursive'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'codebase_search',
-      description:
-        "Find files most relevant to the search query using semantic search. Searches based on meaning rather than exact text matches. By default searches entire workspace. Reuse the user's exact wording unless there's a clear reason not to — their phrasing often helps semantic search. Queries MUST be in English. CRITICAL: For ANY exploration of code you haven't examined yet in this conversation, you MUST use this tool FIRST before any other search or file exploration tools.",
-      parameters: {
-        type: 'object',
-        properties: {
-          query: { type: 'string', description: 'Meaning-based search query' },
-          path: { type: 'string', description: 'Optional subdirectory to limit the search scope' },
-        },
-        required: ['query', 'path'],
-        additionalProperties: false,
-      },
-    },
-  },
-  {
-    type: 'function',
-    function: {
-      name: 'attempt_completion',
-      description:
-        "After each tool use, the user will respond with the result of that tool use. Once you've received the results and can confirm that the task is complete, use this tool to present the result of your work to the user. IMPORTANT: This tool CANNOT be used until you've confirmed from the user that any previous tool uses were successful. Before using this tool, you must confirm that you've received successful results from the user for any previous tool uses.",
-      parameters: {
-        type: 'object',
-        properties: {
-          result: { type: 'string', description: 'Final result message to deliver to the user' },
-        },
-        required: ['result'],
-        additionalProperties: false,
-      },
-    },
-  },
-];
+type StreamStepResult = Omit<
+  RequestResult,
+  'requestNumber' | 'turn' | 'phase' | 'messageCount' | 'lastRole' | 'toolCalls'
+> & {
+  assistantMessage: TextMessage | AssistantToolCallMessage;
+  toolCallsPayload: ToolCall[];
+};
+
+const WORKSPACE_ROOT = process.cwd();
+const FIXTURE = loadFixtureTemplate();
 
-const TURNS: { role: 'user'; content: string }[] = [
+const SYSTEM_MESSAGE: TextMessage = {
+  role: 'system',
+  content: FIXTURE.systemMessage,
+};
+
+const TOOL_DEFINITIONS = FIXTURE.tools;
+
+const TURNS = [
   {
     role: 'user',
     content:
-      '<task>\nI need you to refactor the authentication module in src/auth/login.ts. The current implementation uses callbacks and I want it converted to async/await. Make sure all error handling is preserved and add proper TypeScript types for the return values.\n</task>\n\n<environment_details>\n# VSCode Visible Files\nsrc/auth/login.ts\n\n# VSCode Open Tabs\nsrc/auth/login.ts\nsrc/auth/types.ts\nsrc/auth/middleware.ts\n\n# Current Time\n2026-03-31T14:00:00.000Z\n</environment_details>',
+      'Use only read-only tools for this test. First call list_files on "src/lib/providers" and then read_file on "src/lib/providers/openrouter/request-helpers.ts" and "src/lib/providerHash.ts". After the tool results are returned, explain how cache breakpoints and prompt cache keys are applied. Do not use attempt_completion or ask_followup_question.',
   },
   {
     role: 'user',
     content:
-      'The refactored code looks good but I also need you to update the tests in src/auth/__tests__/login.test.ts to match the new async/await pattern. The tests currently use done() callbacks.',
+      'Use search_files with regex "cache_write_tokens|cache_hit_tokens" under "src/lib" and then read_file on "src/lib/processUsage.ts". After the tool results are returned, explain how cache reads and writes are recorded. Do not use attempt_completion or ask_followup_question.',
   },
   {
     role: 'user',
     content:
-      'Now please also update the middleware in src/auth/middleware.ts to use the new async login function, and make sure the error handling middleware catches any rejected promises properly.',
+      'Use search_files with regex "CacheDiag" under "src/lib" and then read_file on "src/lib/providers/cache-debug.ts". After the tool results are returned, describe which request fields are most important for diagnosing cache misses. Do not use attempt_completion or ask_followup_question.',
   },
-];
-
-// Realistic Kilo Code system prompt (abbreviated but representative in token count).
-const SYSTEM_MESSAGE: ChatMessage = {
-  role: 'system',
-  content: `You are Kilo Code, a highly skilled software engineer with extensive knowledge in many programming languages, frameworks, design patterns, and best practices.
+] satisfies Array<{ role: 'user'; content: string }>;
 
-====
-
-MARKDOWN RULES
-
-ALL responses MUST show ANY \`language construct\` OR filename reference as clickable, exactly as [\`filename OR language.declaration()\`](relative/file/path.ext:line); line is required for \`syntax\` and optional for filename links.
-
-====
-
-TOOL USE
-
-You have access to a set of tools that are executed upon the user's approval. Use the provider-native tool-calling mechanism. Do not include XML markup or examples.
-
-# Tool Use Guidelines
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value);
+}
 
-1. Assess what information you already have and what information you need to proceed with the task.
-2. Choose the most appropriate tool based on the task and the tool descriptions provided. Assess if you need additional information to proceed, and which of the available tools would be most effective for gathering this information.
-3. If multiple actions are needed, you may use multiple tools in a single message when appropriate, or use tools iteratively across messages. Each tool use should be informed by the results of previous tool uses.
-4. After each tool use, the user will respond with the result of that tool use. This result will provide you with the necessary information to continue your task or make further decisions.
+function isToolDefinition(value: unknown): value is ToolDefinition {
+  if (!isRecord(value)) return false;
+  if (value.type !== 'function') return false;
+  if (!isRecord(value.function)) return false;
+  return (
+    typeof value.function.name === 'string' &&
+    typeof value.function.description === 'string' &&
+    isRecord(value.function.parameters)
+  );
+}
 
-====
+function loadFixtureTemplate(): FixtureTemplate {
+  const fixturePath = path.resolve(WORKSPACE_ROOT, 'src/lib/utils/testdata/no_tool_request.json');
+  const raw = JSON.parse(readFileSync(fixturePath, 'utf8'));
 
-CAPABILITIES
+  if (!isRecord(raw) || !Array.isArray(raw.messages) || !Array.isArray(raw.tools)) {
+    throw new Error('Realistic request fixture is malformed');
+  }
 
-- You can read and analyze code in various programming languages, and can write clean, efficient, and well-documented code.
-- You can analyze project structures and suggest improvements for better organization and maintainability.
-- You can debug complex issues by analyzing error messages, stack traces, and application behavior.
-- You can provide refactoring suggestions to improve code quality, readability, and performance.
-- You can write and update unit tests, integration tests, and end-to-end tests.
+  const systemEntry = raw.messages.find(
+    message => isRecord(message) && message.role === 'system' && typeof message.content === 'string'
+  );
 
-====
+  if (!systemEntry || typeof systemEntry.content !== 'string') {
+    throw new Error('Fixture is missing a string system message');
+  }
 
-RULES
+  const tools = raw.tools.filter(isToolDefinition);
+  if (tools.length === 0) {
+    throw new Error('Fixture is missing tool definitions');
+  }
 
-- Your current working directory is always the workspace root.
-- You cannot \`cd\` into a different directory to complete a task. You are stuck operating from the workspace root.
-- Do not use the ~ character or $HOME to refer to the home directory.
-- When using the search_files tool, craft your regex patterns carefully to balance specificity and flexibility.
-- When creating a new project, organize all new files within a dedicated project directory unless the user specifies otherwise.
-- Be sure to consider the type of project (e.g. Python, JavaScript, web application) when determining the appropriate structure and files to include.
-- When making changes to code, always consider the context in which the code is being used. Ensure your changes are compatible with the existing codebase.
-- Do not ask for more information than necessary. Use the tools provided to accomplish the user's request efficiently.
-- When you have completed the task, use the attempt_completion tool to present the result.
-- The user may provide feedback, which you can use to make improvements and try again.
-- IMPORTANT: You should NEVER end attempt_completion result with a question or request to engage in further conversation.`,
-};
+  return {
+    systemMessage: systemEntry.content,
+    tools,
+  };
+}
 
 async function authenticateUser(email: string) {
   const user = await db
@@ -286,26 +161,67 @@ async function authenticateUser(email: string) {
   return { authToken, baseUrl };
 }
 
-async function streamTurn(
+function buildProviderPreference(preferredProvider: string | undefined) {
+  if (!preferredProvider) {
+    return undefined;
+  }
+
+  const fallbackProvider = preferredProvider === 'anthropic' ? 'amazon-bedrock' : 'anthropic';
+  return { order: [preferredProvider, fallbackProvider] };
+}
+
+function appendToolCallDelta(toolCalls: Map<number, ToolCall>, partialCall: unknown) {
+  if (!isRecord(partialCall) || typeof partialCall.index !== 'number') {
+    return;
+  }
+
+  const current: ToolCall = toolCalls.get(partialCall.index) ?? {
+    id: `cache-diag-tool-${partialCall.index}`,
+    type: 'function',
+    function: {
+      name: '',
+      arguments: '',
+    },
+  };
+
+  if (typeof partialCall.id === 'string' && partialCall.id.length > 0) {
+    current.id = partialCall.id;
+  }
+
+  if (isRecord(partialCall.function)) {
+    if (typeof partialCall.function.name === 'string') {
+      current.function.name += partialCall.function.name;
+    }
+    if (typeof partialCall.function.arguments === 'string') {
+      current.function.arguments += partialCall.function.arguments;
+    }
+  }
+
+  toolCalls.set(partialCall.index, current);
+}
+
+async function streamStep(
   baseUrl: string,
   authToken: string,
   model: string,
   taskId: string,
-  messages: ChatMessage[],
-  providerOnly?: string
-): Promise<TurnResult & { assistantMessage: ChatMessage }> {
+  messages: RequestMessage[],
+  preferredProvider?: string
+): Promise<StreamStepResult> {
   const startTime = Date.now();
 
   const body: Record<string, unknown> = {
     model,
     messages,
     tools: TOOL_DEFINITIONS,
-    tool_choice: 'none',
+    tool_choice: 'auto',
     stream: true,
-    max_tokens: 256,
+    max_tokens: 512,
   };
-  if (providerOnly) {
-    body.provider = { only: [providerOnly] };
+
+  const providerPreference = buildProviderPreference(preferredProvider);
+  if (providerPreference) {
+    body.provider = providerPreference;
   }
 
   const response = await fetch(`${baseUrl}/api/openrouter/chat/completions`, {
@@ -322,10 +238,10 @@ async function streamTurn(
   if (!response.ok) {
     let errorDetail = `HTTP ${response.status} ${response.statusText}`;
     try {
-      const body = await response.json();
-      errorDetail = body.error?.message || body.error || JSON.stringify(body);
+      const errorBody = await response.json();
+      errorDetail = errorBody.error?.message || errorBody.error || JSON.stringify(errorBody);
     } catch {
-      // keep the HTTP status message
+      // Keep the HTTP status message when the error body is not JSON.
     }
     throw new Error(`Request failed: ${errorDetail}`);
   }
@@ -335,6 +251,7 @@ async function streamTurn(
   }
 
   const contentParts: string[] = [];
+  const toolCalls = new Map<number, ToolCall>();
   let promptTokens: number | null = null;
   let completionTokens: number | null = null;
   let cachedTokens: number | null = null;
@@ -350,30 +267,54 @@ async function streamTurn(
 
         try {
           const chunk = JSON.parse(event.data);
+          const choice = Array.isArray(chunk.choices) ? chunk.choices[0] : null;
+          const delta = isRecord(choice) && isRecord(choice.delta) ? choice.delta : null;
 
-          // Content deltas
-          const delta = chunk.choices?.[0]?.delta;
-          if (delta?.content) {
+          if (delta && typeof delta.content === 'string') {
             contentParts.push(delta.content);
           }
 
-          // Usage from final chunk (choices is empty, usage is present)
-          if (chunk.usage) {
-            promptTokens = chunk.usage.prompt_tokens ?? null;
-            completionTokens = chunk.usage.completion_tokens ?? null;
-            cachedTokens = chunk.usage.prompt_tokens_details?.cached_tokens ?? null;
-            cacheWriteTokens = chunk.usage.prompt_tokens_details?.cache_write_tokens ?? null;
+          if (delta && Array.isArray(delta.tool_calls)) {
+            for (const partialCall of delta.tool_calls) {
+              appendToolCallDelta(toolCalls, partialCall);
+            }
+          }
+
+          if (isRecord(chunk.usage)) {
+            promptTokens =
+              typeof chunk.usage.prompt_tokens === 'number' ? chunk.usage.prompt_tokens : null;
+            completionTokens =
+              typeof chunk.usage.completion_tokens === 'number'
+                ? chunk.usage.completion_tokens
+                : null;
+
+            const promptTokenDetails = isRecord(chunk.usage.prompt_tokens_details)
+              ? chunk.usage.prompt_tokens_details
+              : null;
+
+            cachedTokens =
+              promptTokenDetails && typeof promptTokenDetails.cached_tokens === 'number'
+                ? promptTokenDetails.cached_tokens
+                : null;
+
+            cacheWriteTokens =
+              promptTokenDetails && typeof promptTokenDetails.cache_write_tokens === 'number'
+                ? promptTokenDetails.cache_write_tokens
+                : null;
           }
         } catch {
-          // skip malformed chunks
+          // Skip malformed chunks.
         }
       },
     });
 
-    // body is checked for null above; the non-null path is guarded
-    const body = response.body;
-    if (!body) throw new Error('unreachable');
-    const reader = body.getReader();
+    const bodyStream = response.body;
+    if (!bodyStream) {
+      reject(new Error('Response body unexpectedly missing during streaming'));
+      return;
+    }
+
+    const reader = bodyStream.getReader();
     const decoder = new TextDecoder();
 
     function pump(): void {
@@ -384,6 +325,7 @@ async function streamTurn(
             resolve();
             return;
           }
+
           parser.feed(decoder.decode(value, { stream: true }));
           pump();
         })
@@ -395,20 +337,251 @@ async function streamTurn(
 
   const elapsedMs = Date.now() - startTime;
   const responseText = contentParts.join('');
+  const toolCallsPayload = Array.from(toolCalls.entries())
+    .sort((left, right) => left[0] - right[0])
+    .map(([, toolCall]) => toolCall);
+
+  const assistantMessage: TextMessage | AssistantToolCallMessage =
+    toolCallsPayload.length > 0
+      ? {
+          role: 'assistant',
+          content: responseText,
+          tool_calls: toolCallsPayload,
+        }
+      : {
+          role: 'assistant',
+          content: responseText,
+        };
 
   return {
-    turn: 0, // filled in by caller
-    messageCount: messages.length,
     elapsedMs,
     responseText,
     promptTokens,
     completionTokens,
     cachedTokens,
     cacheWriteTokens,
-    assistantMessage: { role: 'assistant', content: responseText },
+    assistantMessage,
+    toolCallsPayload,
   };
 }
 
+function parseJsonObject(jsonText: string): Record<string, unknown> | null {
+  try {
+    const parsed = JSON.parse(jsonText);
+    return isRecord(parsed) ? parsed : null; // any value isRecord rejects becomes null
+  } catch {
+    return null; // JSON.parse threw: unparseable text is reported the same way
+  }
+}
+
+function toPosixPath(filePath: string): string {
+  return filePath.split(path.sep).join('/'); // swap every path.sep for '/'
+}
+
+function numberLines(text: string, maxLines = 160): string {
+  const allLines = text.split('\n'); // splits on '\n' only; a trailing newline adds an empty entry
+  const visibleLines = allLines.slice(0, maxLines); // keep at most maxLines entries
+  const rendered = visibleLines.map((line, index) => `${index + 1}: ${line}`).join('\n');
+  if (visibleLines.length === allLines.length) {
+    return rendered; // nothing was cut off, so no marker is appended
+  }
+  return `${rendered}\n...truncated after ${maxLines} lines...`;
+}
+
+function resolveWorkspacePath(relativePath: string): string {
+  return path.resolve(WORKSPACE_ROOT, relativePath); // NOTE(review): may resolve outside the root
+}
+
+function readFilesTool(args: Record<string, unknown>): string {
+  if (!Array.isArray(args.files)) {
+    return JSON.stringify({ error: 'read_file requires a files array' });
+  }
+  // Reads at most the first four entries; a failed read is reported inline instead of thrown.
+  const outputs: string[] = [];
+  for (const fileEntry of args.files.slice(0, 4)) {
+    if (!isRecord(fileEntry) || typeof fileEntry.path !== 'string') {
+      continue; // entries without a string path are dropped silently
+    }
+
+    const requestedPath = fileEntry.path;
+    try {
+      const absolutePath = resolveWorkspacePath(requestedPath);
+      const content = readFileSync(absolutePath, 'utf8');
+      outputs.push(`FILE: ${requestedPath}\n${numberLines(content)}`);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      outputs.push(`FILE: ${requestedPath}\nERROR: ${message}`);
+    }
+  }
+
+  if (outputs.length === 0) {
+    return JSON.stringify({ error: 'No readable files were provided' }); // no usable entry
+  }
+
+  return outputs.join('\n\n'); // sections are separated by one blank line
+}
+
+function listFilesRecursive(rootPath: string, recursive: boolean): string[] {
+  const entries: string[] = [];
+  const maxEntries = 120; // hard cap on the number of returned entries
+  // Depth-first walk; each directory's children are visited in localeCompare name order.
+  function walk(currentAbsolutePath: string, currentRelativePath: string) {
+    const directoryEntries = readdirSync(currentAbsolutePath, { withFileTypes: true }).sort(
+      (a, b) => a.name.localeCompare(b.name)
+    );
+
+    for (const directoryEntry of directoryEntries) {
+      if (entries.length >= maxEntries) {
+        return; // cap reached; enclosing walk calls hit this same check on their next entry
+      }
+
+      const nextRelativePath = currentRelativePath
+        ? `${currentRelativePath}/${directoryEntry.name}` // always '/'-joined, not path.sep
+        : directoryEntry.name;
+
+      if (directoryEntry.isDirectory()) {
+        entries.push(`${nextRelativePath}/`); // trailing '/' marks a directory
+        if (recursive) {
+          walk(path.join(currentAbsolutePath, directoryEntry.name), nextRelativePath);
+        }
+      } else {
+        entries.push(nextRelativePath);
+      }
+    }
+  }
+
+  walk(rootPath, ''); // '' keeps top-level names free of a leading '/'
+  return entries;
+}
+
+function listFilesTool(args: Record<string, unknown>): string {
+  const requestedPath = typeof args.path === 'string' ? args.path : '.';
+  const recursive = typeof args.recursive === 'boolean' ? args.recursive : false;
+  // Failures come back as a JSON { error } string; success is a newline-separated listing.
+  try {
+    const absolutePath = resolveWorkspacePath(requestedPath);
+    const stats = statSync(absolutePath); // throws for a missing path; handled by the catch below
+    if (!stats.isDirectory()) {
+      return JSON.stringify({ error: `${requestedPath} is not a directory` });
+    }
+
+    const entries = listFilesRecursive(absolutePath, recursive);
+    if (entries.length === 0) {
+      return `${requestedPath}: <empty>`;
+    }
+
+    return entries.join('\n');
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    return JSON.stringify({ error: message });
+  }
+}
+
+function searchFilesTool(args: Record<string, unknown>): string {
+  const searchPath = typeof args.path === 'string' ? args.path : '.';
+  const regex = typeof args.regex === 'string' ? args.regex : null;
+  const filePattern = typeof args.file_pattern === 'string' ? args.file_pattern : '*';
+  // Runs `rg` over the resolved path and returns its trimmed stdout, or 'No matches found'.
+  if (!regex) {
+    return JSON.stringify({ error: 'search_files requires a regex string' });
+  }
+  // '-e' + '--' keep a regex that starts with '-' from being parsed as an rg option.
+  try {
+    const absolutePath = resolveWorkspacePath(searchPath);
+    const output = execFileSync(
+      'rg',
+      ['-n', '-m', '20', '--glob', filePattern, '-e', regex, '--', absolutePath],
+      {
+        cwd: WORKSPACE_ROOT,
+        encoding: 'utf8',
+      }
+    );
+
+    return output.trim() || 'No matches found';
+  } catch (error) {
+    if (isRecord(error) && typeof error.stdout === 'string' && error.stdout.trim()) {
+      return error.stdout.trim(); // a failed run may still carry captured stdout
+    }
+    return 'No matches found'; // also covers every other failure, e.g. rg not being runnable
+  }
+}
+
+function codebaseSearchTool(args: Record<string, unknown>): string {
+  const query = typeof args.query === 'string' ? args.query.toLowerCase() : '';
+  const likelyMatches = [
+    'src/lib/providers/openrouter/request-helpers.ts',
+    'src/lib/providerHash.ts',
+    'src/lib/processUsage.ts',
+    'src/lib/providers/cache-debug.ts',
+    'src/app/api/openrouter/[...path]/route.ts',
+  ].filter(filePath => filePath.toLowerCase().includes(query) || query.includes('cache'));
+  // No real search runs: a fixed list is filtered; '' or any 'cache' query keeps all of it.
+  if (likelyMatches.length === 0) {
+    return JSON.stringify({ query, matches: ['src/lib/processUsage.ts'] }); // fixed fallback
+  }
+
+  return JSON.stringify({ query, matches: likelyMatches });
+}
+
+function fetchInstructionsTool(): string {
+  return [
+    'Use the provided read-only tool results to answer directly.',
+    'Do not call ask_followup_question or attempt_completion in this cache diagnostic harness.',
+    'Keep answers concise and grounded in the tool results already returned.',
+  ].join('\n'); // fixed text, one instruction per line
+}
+
+function withHarnessNote(output: string): string {
+  return [
+    output, // the tool result goes first, unchanged
+    '', // blank separator line
+    'CACHE DIAGNOSTIC HARNESS: Prefer answering directly with the tool results already returned instead of calling more tools.',
+  ].join('\n'); // result, blank line, then the fixed harness note
+}
+
+function executeToolCall(toolCall: ToolCall): string {
+  const toolName = toolCall.function.name;
+  const args = parseJsonObject(toolCall.function.arguments) ?? {}; // null result -> {}
+  // Every branch wraps its output in withHarnessNote; unrecognized tools get a stub JSON reply.
+  switch (toolName) {
+    case 'read_file':
+      return withHarnessNote(readFilesTool(args));
+    case 'list_files':
+      return withHarnessNote(listFilesTool(args));
+    case 'search_files':
+      return withHarnessNote(searchFilesTool(args));
+    case 'codebase_search':
+      return withHarnessNote(codebaseSearchTool(args));
+    case 'fetch_instructions':
+      return withHarnessNote(fetchInstructionsTool());
+    case 'ask_followup_question':
+      return withHarnessNote(
+        'Follow-up questions are disabled in this cache diagnostic test. Continue with the existing context and answer directly after using read-only tools.'
+      );
+    case 'attempt_completion':
+      return withHarnessNote(
+        'attempt_completion is disabled in this cache diagnostic test. Provide the final answer in a normal assistant response instead.'
+      );
+    default:
+      return withHarnessNote(
+        JSON.stringify({
+          ok: true,
+          tool: toolName,
+          note: 'Tool execution is stubbed in the cache diagnostic harness.',
+          arguments: args, // echoes the parsed arguments back to the caller
+        })
+      );
+  }
+}
+
+function executeToolCalls(toolCalls: ToolCall[]): ToolMessage[] {
+  return toolCalls.map(toolCall => ({
+    role: 'tool',
+    tool_call_id: toolCall.id, // carries the originating call's id on the reply
+    content: executeToolCall(toolCall),
+  })); // one tool message per call, in input order
+}
+
 function truncate(text: string, maxLen: number): string {
   if (text.length <= maxLen) return text;
   return text.slice(0, maxLen) + '...';
@@ -419,19 +592,27 @@ function cacheHitRate(promptTokens: number | null, cachedTokens: number | null):
   return `${Math.round((cachedTokens / promptTokens) * 100)}%`;
 }
 
-function printSummary(results: TurnResult[], label?: string) {
+function printSummary(results: RequestResult[], label?: string) {
   console.log(`\n=== Summary${label ? ` (${label})` : ''} ===`);
-  console.log('| Turn | Msgs | Input | Output | Cache Write | Cache Read | Hit Rate |');
-  console.log('|------|------|-------|--------|-------------|------------|----------|');
-  for (const r of results) {
-    const input = r.promptTokens !== null ? String(r.promptTokens).padEnd(5) : 'N/A  ';
-    const output = r.completionTokens !== null ? String(r.completionTokens).padEnd(6) : 'N/A   ';
+  console.log(
+    '| Req | Turn | Phase         | Msgs | Last | ToolCalls | Input | Output | Cache Write | Cache Read | Hit Rate |'
+  );
+  console.log(
+    '|-----|------|---------------|------|------|-----------|-------|--------|-------------|------------|----------|'
+  );
+
+  for (const result of results) {
+    const input = result.promptTokens !== null ? String(result.promptTokens).padEnd(5) : 'N/A  ';
+    const output =
+      result.completionTokens !== null ? String(result.completionTokens).padEnd(6) : 'N/A   ';
     const write =
-      r.cacheWriteTokens !== null ? String(r.cacheWriteTokens).padEnd(11) : 'N/A        ';
-    const read = r.cachedTokens !== null ? String(r.cachedTokens).padEnd(10) : 'N/A       ';
-    const hitRate = cacheHitRate(r.promptTokens, r.cachedTokens).padEnd(8);
+      result.cacheWriteTokens !== null ? String(result.cacheWriteTokens).padEnd(11) : 'N/A        ';
+    const read =
+      result.cachedTokens !== null ? String(result.cachedTokens).padEnd(10) : 'N/A       ';
+    const hitRate = cacheHitRate(result.promptTokens, result.cachedTokens).padEnd(8);
+
     console.log(
-      `| ${r.turn}    | ${String(r.messageCount).padEnd(4)} | ${input} | ${output} | ${write} | ${read} | ${hitRate} |`
+      `| ${String(result.requestNumber).padEnd(3)} | ${String(result.turn).padEnd(4)} | ${result.phase.padEnd(13)} | ${String(result.messageCount).padEnd(4)} | ${result.lastRole.padEnd(4)} | ${String(result.toolCalls).padEnd(9)} | ${input} | ${output} | ${write} | ${read} | ${hitRate} |`
     );
   }
 }
@@ -440,52 +621,119 @@ async function runSession(
   baseUrl: string,
   authToken: string,
   model: string,
-  providerOnly?: string
-): Promise<TurnResult[]> {
-  const label = providerOnly ?? 'default routing';
+  preferredProvider?: string
+): Promise<RequestResult[]> {
+  const label = preferredProvider ? `preferred ${preferredProvider}` : 'default routing';
   const taskId = `cache-diag-${Date.now()}`;
+  const sessionSystemMessage: TextMessage = {
+    role: 'system',
+    content: `CACHE_DIAG_RUN_ID: ${taskId}\n${SYSTEM_MESSAGE.content}`,
+  };
+
   console.log(`\n--- ${model} via ${label} ---`);
-  console.log(`Task ID: ${taskId}\n`);
+  console.log(`Task ID: ${taskId}`);
+  console.log(`Fixture: realistic system prompt + ${TOOL_DEFINITIONS.length} tools`);
+
+  const messages: RequestMessage[] = [sessionSystemMessage];
+  const results: RequestResult[] = [];
+
+  for (let turnIndex = 0; turnIndex < TURNS.length; turnIndex++) {
+    const turnNumber = turnIndex + 1;
+    messages.push(TURNS[turnIndex]);
+    let toolRound = 0;
+    let sawToolCallForTurn = false;
+
+    while (true) {
+      toolRound += 1;
+      if (toolRound > 6) {
+        throw new Error(`Exceeded tool loop limit on turn ${turnNumber}`);
+      }
 
-  const messages: ChatMessage[] = [SYSTEM_MESSAGE];
-  const results: TurnResult[] = [];
+      const lastMessage = messages.at(-1);
+      const phase: RequestPhase = lastMessage?.role === 'tool' ? 'tool-followup' : 'user';
+
+      console.log(
+        `\nTurn ${turnNumber}, ${phase}, request ${results.length + 1}: ${messages.length} messages, last=${lastMessage?.role ?? 'unknown'}, sending...`
+      );
+
+      const step = await streamStep(baseUrl, authToken, model, taskId, messages, preferredProvider);
+      const result: RequestResult = {
+        requestNumber: results.length + 1,
+        turn: turnNumber,
+        phase,
+        messageCount: messages.length,
+        lastRole: lastMessage?.role ?? 'user',
+        toolCalls: step.toolCallsPayload.length,
+        elapsedMs: step.elapsedMs,
+        responseText: step.responseText,
+        promptTokens: step.promptTokens,
+        completionTokens: step.completionTokens,
+        cachedTokens: step.cachedTokens,
+        cacheWriteTokens: step.cacheWriteTokens,
+      };
+
+      console.log(
+        `  Response (${(result.elapsedMs / 1000).toFixed(1)}s): "${truncate(result.responseText, 80)}"`
+      );
+      console.log(
+        `  Usage: input=${result.promptTokens ?? 'N/A'} output=${result.completionTokens ?? 'N/A'} cached_read=${result.cachedTokens ?? 'N/A'} cached_write=${result.cacheWriteTokens ?? 'N/A'} tool_calls=${result.toolCalls}`
+      );
+      console.log(`  Cache hit rate: ${cacheHitRate(result.promptTokens, result.cachedTokens)}`);
+
+      results.push(result);
+
+      if (step.toolCallsPayload.length === 0) {
+        if (!sawToolCallForTurn) {
+          throw new Error(
+            `Turn ${turnNumber} completed without any tool calls. The repro harness requires tool_result follow-up requests to mirror production caching.`
+          );
+        }
+        messages.push({ role: 'assistant', content: step.responseText });
+        break;
+      }
 
-  for (let i = 0; i < TURNS.length; i++) {
-    const turnNum = i + 1;
-    messages.push(TURNS[i]);
+      if (phase === 'tool-followup' && toolRound >= 2) {
+        const fallbackAssistantText =
+          step.responseText ||
+          'Tool loop truncated after the cache-bearing follow-up request for diagnostic stability.';
+        console.log('  Truncating additional tool loop after cache-bearing follow-up request.');
+        messages.push({ role: 'assistant', content: fallbackAssistantText });
+        break;
+      }
 
-    console.log(`Turn ${turnNum}/${TURNS.length}: ${messages.length} messages, sending...`);
+      sawToolCallForTurn = true;
+      messages.push(step.assistantMessage);
+      const toolMessages = executeToolCalls(step.toolCallsPayload);
+      messages.push(...toolMessages);
+    }
+  }
 
-    const result = await streamTurn(baseUrl, authToken, model, taskId, messages, providerOnly);
-    result.turn = turnNum;
+  printSummary(results, label);
 
-    console.log(
-      `  Response (${(result.elapsedMs / 1000).toFixed(1)}s): "${truncate(result.responseText, 50)}"`
-    );
-    console.log(
-      `  Usage: input=${result.promptTokens ?? 'N/A'} output=${result.completionTokens ?? 'N/A'} cached_read=${result.cachedTokens ?? 'N/A'} cached_write=${result.cacheWriteTokens ?? 'N/A'}`
-    );
-    console.log(`  Cache hit rate: ${cacheHitRate(result.promptTokens, result.cachedTokens)}\n`);
+  const hasCacheHit = results.some(result => (result.cachedTokens ?? 0) > 0);
+  const shouldRequireCacheHit = preferredProvider !== 'anthropic';
 
-    messages.push(result.assistantMessage);
-    results.push(result);
+  if (shouldRequireCacheHit && !hasCacheHit) {
+    throw new Error(`No cache hits observed for ${label}`);
   }
 
-  printSummary(results, label);
   return results;
 }
 
 /**
  * Cache diagnostic E2E test script.
  *
- * Makes multi-turn streaming requests with tools to the local gateway,
- * using the same taskId across turns to exercise the [CacheDiag] logging.
+ * Uses a realistic Kilo request fixture and a production-shaped tool loop so the
+ * last cacheable message regularly becomes a tool result, matching the high-hit
+ * production sessions. This avoids the old false-negative harness that used
+ * `tool_choice: "none"`, never produced tool messages, and stayed below Opus's
+ * cacheable token threshold.
  *
  * @param email - google_user_email of a user in the local DB (required)
  * @param model - OpenRouter model ID (optional, default: anthropic/claude-opus-4.6)
- * @param provider - OpenRouter provider to force via provider.only (optional,
- *                   e.g. "amazon-bedrock" or "anthropic"). Omit to use default routing.
- *                   Use "compare" to run both amazon-bedrock and anthropic back-to-back.
+ * @param provider - Preferred OpenRouter inference provider (optional).
+ *                   Use "amazon-bedrock" or "anthropic".
+ *                   Use "compare" to run both preferences back-to-back.
  */
 export async function run(email: string, model?: string, provider?: string): Promise<void> {
   if (!email) {
@@ -496,7 +744,7 @@ export async function run(email: string, model?: string, provider?: string): Pro
     console.log('\nExamples:');
     console.log('  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com');
     console.log(
-      '  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com anthropic/claude-opus-4.6 anthropic'
+      '  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com anthropic/claude-opus-4.6 amazon-bedrock'
     );
     console.log(
       '  npx tsx src/scripts/index.ts openrouter test-cache-diag user@example.com anthropic/claude-opus-4.6 compare'
@@ -512,11 +760,12 @@ export async function run(email: string, model?: string, provider?: string): Pro
     const { authToken, baseUrl } = await authenticateUser(email);
     console.log(`Auth: ${email}`);
     console.log(`Server: ${baseUrl}`);
+    console.log(`Workspace: ${toPosixPath(WORKSPACE_ROOT)}`);
 
     if (provider === 'compare') {
-      const providers = ['amazon-bedrock', 'anthropic'] as const;
-      for (const p of providers) {
-        await runSession(baseUrl, authToken, resolvedModel, p);
+      const providers: Array<'amazon-bedrock' | 'anthropic'> = ['amazon-bedrock', 'anthropic'];
+      for (const preferredProvider of providers) {
+        await runSession(baseUrl, authToken, resolvedModel, preferredProvider);
       }
     } else {
       await runSession(baseUrl, authToken, resolvedModel, provider);