|
| 1 | +/* eslint-disable @typescript-eslint/no-var-requires */ |
| 2 | +const fs = require('fs-extra'); |
| 3 | +const path = require('path'); |
| 4 | +const matter = require('gray-matter'); |
| 5 | + |
| 6 | +const PLUGIN_NAME = 'docusaurus-plugin-llms-txt'; |
| 7 | + |
| 8 | +/** |
| 9 | + * Cleans markdown content for LLM consumption. |
| 10 | + * Removes MDX-specific syntax, JSX components, and other non-standard markdown. |
| 11 | + */ |
/**
 * Cleans markdown content for LLM consumption.
 * Removes MDX-specific syntax (import/export statements, JSX components,
 * `{expression}` blocks), converts Docusaurus admonitions to blockquotes, and
 * strips HTML comments. Fenced code blocks (``` ... ```) are preserved
 * verbatim so code samples containing braces or capitalized tags are not
 * corrupted by the cleanup regexes.
 *
 * @param {string} content - Raw markdown/MDX body (front matter already removed).
 * @returns {string} Cleaned, trimmed markdown suitable for plain-text consumers.
 */
function cleanMarkdownForLlm(content) {
  // Applies the MDX/JSX cleanup rules to a single prose (non-code-fence) segment.
  const cleanProse = (text) => {
    let cleaned = text;

    // Remove import statements
    cleaned = cleaned.replace(/^import\s+.*?(?:from\s+)?['"].*?['"];?\s*$/gm, '');

    // Remove export statements
    cleaned = cleaned.replace(/^export\s+(?:default\s+)?.*?;?\s*$/gm, '');

    // Remove JSX self-closing components like <Component />
    cleaned = cleaned.replace(/<[A-Z][a-zA-Z0-9]*\s*[^>]*\/>/g, '');

    // Remove JSX opening and closing tags with content
    cleaned = cleaned.replace(/<[A-Z][a-zA-Z0-9]*[^>]*>[\s\S]*?<\/[A-Z][a-zA-Z0-9]*>/g, '');

    // Remove remaining JSX tags
    cleaned = cleaned.replace(/<[A-Z][a-zA-Z0-9]*[^>]*>/g, '');
    cleaned = cleaned.replace(/<\/[A-Z][a-zA-Z0-9]*>/g, '');

    // Remove MDX expressions {expression}
    cleaned = cleaned.replace(/\{[^}]+\}/g, '');

    // Clean up Docusaurus admonitions - convert to blockquotes
    cleaned = cleaned.replace(
      /^:::\s*(note|tip|info|warning|danger|caution)(?:\s+(.+?))?$/gm,
      (_, type, title) => {
        const capitalizedType = type.charAt(0).toUpperCase() + type.slice(1);
        return title ? `> **${capitalizedType}: ${title}**` : `> **${capitalizedType}**`;
      },
    );
    cleaned = cleaned.replace(/^:::$/gm, '');

    // Remove HTML comments
    cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, '');

    return cleaned;
  };

  // Split on fenced code blocks. The capture group keeps the fences in the
  // result: odd indices are the fences themselves and pass through untouched.
  const cleaned = content
    .split(/(```[\s\S]*?```)/)
    .map((segment, index) => (index % 2 === 1 ? segment : cleanProse(segment)))
    .join('');

  // Collapse runs of 3+ newlines down to a single blank line.
  return cleaned.replace(/\n{3,}/g, '\n\n').trim();
}
| 52 | + |
| 53 | +/** |
| 54 | + * Generates plain text content from a markdown source file. |
| 55 | + */ |
/**
 * Generates plain text content from a markdown source file.
 * The output is: front-matter title (as an H1), then the front-matter
 * description, then the cleaned markdown body.
 *
 * @param {string} filePath - Absolute path to the .md/.mdx source file.
 * @returns {Promise<string|null>} The assembled text, or null if the file does not exist.
 */
async function generatePageContent(filePath) {
  const exists = await fs.pathExists(filePath);
  if (!exists) {
    return null;
  }

  const raw = await fs.readFile(filePath, 'utf-8');
  const parsed = matter(raw);
  const body = cleanMarkdownForLlm(parsed.content);

  const parts = [];

  // Prefer the explicit title; fall back to the sidebar label.
  const heading = parsed.data.title || parsed.data.sidebar_label;
  if (heading) {
    parts.push(`# ${heading}\n\n`);
  }

  if (parsed.data.description) {
    parts.push(`${parsed.data.description}\n\n`);
  }

  parts.push(body);
  return parts.join('');
}
| 80 | + |
| 81 | +/** |
| 82 | + * Recursively collects all doc routes with source file paths. |
| 83 | + */ |
/**
 * Recursively collects all doc routes that carry a source file path, in
 * preorder (a parent route appears before its children).
 *
 * @param {Array<object>} routes - Docusaurus route tree.
 * @returns {Array<{path: string, sourceFilePath: string}>}
 */
function collectDocRoutes(routes) {
  const visit = (route) => {
    const own = route.metadata && route.metadata.sourceFilePath
      ? [{ path: route.path, sourceFilePath: route.metadata.sourceFilePath }]
      : [];
    const children = route.routes ? route.routes.flatMap(visit) : [];
    return [...own, ...children];
  };

  return routes.flatMap(visit);
}
| 104 | + |
| 105 | +/** |
| 106 | + * Generates the root llms.txt content with a page index. |
| 107 | + */ |
/**
 * Generates the root llms.txt content with a page index: site title, an
 * optional description (or tagline), then one section per top-level doc
 * group, each listing its pages with links to their per-page llms.txt.
 *
 * @param {object} siteConfig - Docusaurus site config (`url`, `title`, `tagline`).
 * @param {Array<{path: string, title: string, description?: string}>} items - Indexed pages.
 * @param {string} [siteDescription] - Optional description overriding the tagline.
 * @returns {string} The llms.txt document as a single string.
 */
function generateRootLlmsTxt(siteConfig, items, siteDescription) {
  const baseUrl = siteConfig.url;
  const out = [`# ${siteConfig.title}`, ''];

  const blurb = siteDescription || siteConfig.tagline;
  if (blurb) {
    out.push(blurb, '');
  }

  out.push('## Documentation Pages', '');

  // Bucket pages by their first one-or-two path segments; Map preserves
  // insertion order, so sections appear in the order pages were given.
  const sections = items.reduce((acc, item) => {
    const segments = item.path.split('/').filter(Boolean);
    const key = segments.length > 1 ? `/${segments[0]}/${segments[1]}` : `/${segments[0] || ''}`;
    if (!acc.has(key)) {
      acc.set(key, []);
    }
    acc.get(key).push(item);
    return acc;
  }, new Map());

  for (const [sectionPath, sectionItems] of sections) {
    out.push(`### ${sectionPath}`, '');
    for (const { path: pagePath, title, description } of sectionItems) {
      const url = `${baseUrl}${pagePath}`;
      out.push(`- [${title}](${url}): ${url}/llms.txt`);
      if (description) {
        out.push(`  ${description}`);
      }
    }
    out.push('');
  }

  out.push('---', '', '> This file follows the llms.txt standard. See: https://llmstxt.org/');

  return out.join('\n');
}
| 159 | + |
| 160 | +/** |
| 161 | + * @param {import('@docusaurus/types').LoadContext} context |
| 162 | + * @param {object} options |
| 163 | + */ |
| 164 | +module.exports = function pluginLlmsTxt(context, options = {}) { |
| 165 | + const { siteDescription } = options; |
| 166 | + |
| 167 | + return { |
| 168 | + name: PLUGIN_NAME, |
| 169 | + |
| 170 | + async postBuild({ siteConfig, routes, outDir, siteDir }) { |
| 171 | + const docRoutes = collectDocRoutes(routes); |
| 172 | + |
| 173 | + if (docRoutes.length === 0) { |
| 174 | + console.warn(`[${PLUGIN_NAME}] No doc routes with source files found. Falling back to docs/ directory scan.`); |
| 175 | + } |
| 176 | + |
| 177 | + // Collect all doc files from the docs/ directory as a reliable source |
| 178 | + const docsDir = path.join(siteDir, 'docs'); |
| 179 | + const allDocFiles = []; |
| 180 | + |
| 181 | + async function scanDir(dir, relativePath = '') { |
| 182 | + const entries = await fs.readdir(dir, { withFileTypes: true }); |
| 183 | + for (const entry of entries) { |
| 184 | + const fullPath = path.join(dir, entry.name); |
| 185 | + const relPath = path.join(relativePath, entry.name); |
| 186 | + |
| 187 | + if (entry.isDirectory()) { |
| 188 | + await scanDir(fullPath, relPath); |
| 189 | + } else if (entry.name.endsWith('.md') || entry.name.endsWith('.mdx')) { |
| 190 | + allDocFiles.push({ fullPath, relativePath: relPath }); |
| 191 | + } |
| 192 | + } |
| 193 | + } |
| 194 | + |
| 195 | + if (await fs.pathExists(docsDir)) { |
| 196 | + await scanDir(docsDir); |
| 197 | + } |
| 198 | + |
| 199 | + console.log(`[${PLUGIN_NAME}] Found ${allDocFiles.length} doc files to process.`); |
| 200 | + |
| 201 | + const items = []; |
| 202 | + let successCount = 0; |
| 203 | + |
| 204 | + // Generate per-page llms.txt files |
| 205 | + await Promise.all( |
| 206 | + allDocFiles.map(async ({ fullPath, relativePath }) => { |
| 207 | + try { |
| 208 | + const content = await generatePageContent(fullPath); |
| 209 | + if (!content) return; |
| 210 | + |
| 211 | + const fileContent = await fs.readFile(fullPath, 'utf-8'); |
| 212 | + const { data: frontMatter } = matter(fileContent); |
| 213 | + |
| 214 | + // Determine the URL path for this doc |
| 215 | + // e.g. docs/journeys/journey-builder.md -> /docs/journeys/journey-builder |
| 216 | + let urlPath = relativePath |
| 217 | + .replace(/\.mdx?$/, '') |
| 218 | + .replace(/\\/g, '/'); |
| 219 | + |
| 220 | + // Handle index files (intro.md or index.md at directory level) |
| 221 | + if (urlPath.endsWith('/intro')) { |
| 222 | + // Keep as-is, Docusaurus maps these to the directory path or /intro |
| 223 | + } |
| 224 | + |
| 225 | + const docPath = `/docs/${urlPath}`; |
| 226 | + |
| 227 | + // Write llms.txt for this page |
| 228 | + const outputDir = path.join(outDir, docPath); |
| 229 | + const outputPath = path.join(outputDir, 'llms.txt'); |
| 230 | + |
| 231 | + await fs.ensureDir(outputDir); |
| 232 | + await fs.writeFile(outputPath, content, 'utf-8'); |
| 233 | + successCount++; |
| 234 | + |
| 235 | + // Collect metadata for root index |
| 236 | + const title = frontMatter.title || frontMatter.sidebar_label || urlPath.split('/').pop(); |
| 237 | + items.push({ |
| 238 | + path: docPath, |
| 239 | + title, |
| 240 | + description: frontMatter.description, |
| 241 | + }); |
| 242 | + } catch (err) { |
| 243 | + console.error(`[${PLUGIN_NAME}] Failed to process ${relativePath}:`, err.message); |
| 244 | + } |
| 245 | + }), |
| 246 | + ); |
| 247 | + |
| 248 | + console.log(`[${PLUGIN_NAME}] Generated ${successCount} per-page llms.txt files.`); |
| 249 | + |
| 250 | + // Sort items by path |
| 251 | + items.sort((a, b) => a.path.localeCompare(b.path)); |
| 252 | + |
| 253 | + // Generate root llms.txt |
| 254 | + try { |
| 255 | + const rootContent = generateRootLlmsTxt(siteConfig, items, siteDescription); |
| 256 | + const rootPath = path.join(outDir, 'llms.txt'); |
| 257 | + await fs.writeFile(rootPath, rootContent, 'utf-8'); |
| 258 | + console.log(`[${PLUGIN_NAME}] Generated root llms.txt with ${items.length} entries.`); |
| 259 | + } catch (err) { |
| 260 | + console.error(`[${PLUGIN_NAME}] Failed to generate root llms.txt:`, err.message); |
| 261 | + throw err; |
| 262 | + } |
| 263 | + |
| 264 | + // Generate llms-full.txt with all docs concatenated |
| 265 | + try { |
| 266 | + const fullLines = []; |
| 267 | + fullLines.push(`# ${siteConfig.title} - Complete Documentation`); |
| 268 | + fullLines.push(''); |
| 269 | + |
| 270 | + if (siteDescription) { |
| 271 | + fullLines.push(siteDescription); |
| 272 | + } else if (siteConfig.tagline) { |
| 273 | + fullLines.push(siteConfig.tagline); |
| 274 | + } |
| 275 | + fullLines.push(''); |
| 276 | + fullLines.push('---'); |
| 277 | + fullLines.push(''); |
| 278 | + |
| 279 | + for (const { fullPath, relativePath } of allDocFiles.sort((a, b) => a.relativePath.localeCompare(b.relativePath))) { |
| 280 | + const content = await generatePageContent(fullPath); |
| 281 | + if (content) { |
| 282 | + fullLines.push(content); |
| 283 | + fullLines.push(''); |
| 284 | + fullLines.push('---'); |
| 285 | + fullLines.push(''); |
| 286 | + } |
| 287 | + } |
| 288 | + |
| 289 | + const fullPath = path.join(outDir, 'llms-full.txt'); |
| 290 | + await fs.writeFile(fullPath, fullLines.join('\n'), 'utf-8'); |
| 291 | + console.log(`[${PLUGIN_NAME}] Generated llms-full.txt.`); |
| 292 | + } catch (err) { |
| 293 | + console.error(`[${PLUGIN_NAME}] Failed to generate llms-full.txt:`, err.message); |
| 294 | + } |
| 295 | + }, |
| 296 | + }; |
| 297 | +}; |
0 commit comments