Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ temp*
.secrets/

# Generated documentation files
src/llms.txt
src/llms-full.txt
site/llms-full.txt
dj_local_conf.json
Expand Down
3 changes: 2 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,13 @@ services:
# LIVE mode: install datajoint and notebook dependencies for interactive development
pip install -e /datajoint-python
pip install scikit-image pooch
python scripts/gen_llms_full.py
mkdocs serve --config-file ./mkdocs.yaml -a 0.0.0.0:8000
elif echo "$${MODE}" | grep -i build &>/dev/null; then
# BUILD mode: build static site from pre-executed notebooks
# Install datajoint-python for mkdocstrings (needs to import for API docs)
pip install -e /datajoint-python
# Generate llms-full.txt with current git info
# Generate llms.txt and llms-full.txt
python scripts/gen_llms_full.py
mkdocs build --config-file ./mkdocs.yaml
elif echo "$${MODE}" | grep -i execute_pg &>/dev/null; then
Expand Down
126 changes: 120 additions & 6 deletions scripts/gen_llms_full.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
#!/usr/bin/env python3
"""
Generate llms-full.txt from documentation sources.
Generate llms.txt and llms-full.txt from documentation sources.

This script concatenates all markdown documentation into a single file
optimized for LLM consumption.
- llms.txt: Index with links derived from mkdocs.yaml nav
- llms-full.txt: Complete documentation concatenated for LLM consumption

The generated file is NOT committed to git - it's auto-generated during
the build process with current version metadata.
Both files are auto-generated during the build process.
"""

import json
import re
import subprocess
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Filesystem layout, anchored two directory levels above this script.
PROJECT_DIR = Path(__file__).parent.parent
# Documentation sources; both generated files are written here.
DOCS_DIR = PROJECT_DIR.joinpath("src")
# MkDocs configuration whose ``nav`` drives the llms.txt index.
MKDOCS_FILE = PROJECT_DIR.joinpath("mkdocs.yaml")
# Complete documentation concatenated into a single file.
OUTPUT_FILE = DOCS_DIR.joinpath("llms-full.txt")
# Link index derived from the MkDocs nav.
OUTPUT_INDEX = DOCS_DIR.joinpath("llms.txt")

# Sections in order of importance
SECTIONS = [
Expand Down Expand Up @@ -115,6 +120,114 @@ def get_doc_files(directory: Path) -> list[Path]:
return sorted(files)


def source_path_to_url(path: str) -> str:
    """Map a nav source path onto the URL MkDocs serves it at.

    Assumes ``use_directory_urls`` is enabled (the MkDocs default), so each
    page is published as a directory:

        about/whats-new-2.md                      -> /about/whats-new-2/
        tutorials/basics/01-first-pipeline.ipynb  -> /tutorials/basics/01-first-pipeline/
        index.md                                  -> /
        section/index.md                          -> /section/

    Args:
        path: Source path as written in the ``nav`` section of mkdocs.yaml.

    Returns:
        Site-absolute URL with a leading and a trailing slash.
    """
    # Drop a trailing .md / .ipynb extension, if present.
    stem = re.sub(r"\.(md|ipynb)$", "", path)
    # A directory's index page is served at the directory itself.
    stem = re.sub(r"/index$", "", stem)

    if stem == "index":
        # The top-level index page is the site root.
        return "/"

    # Paths that already end in "/" (e.g. "api/") must not gain a second one.
    trailing = "" if stem.endswith("/") else "/"
    return f"/{stem}{trailing}"


def extract_nav_entries(nav, section_path=""):
    """Recursively extract (title, url) pairs from a mkdocs nav structure.

    Args:
        nav: A nav node: a list of nodes, a ``{title: target}`` dict whose
            target is either a path string or a list of child nodes, or a
            bare path string. Any other value yields no entries.
        section_path: Title of the enclosing section. It is threaded through
            the recursion but does not affect the result; kept so existing
            callers that pass it continue to work.

    Returns:
        A flat list of ``(title, url)`` tuples in nav order. ``title`` is
        None for bare paths that carry no title. External links are omitted.
    """
    # Only genuine web URLs are external. A bare "http" prefix test would
    # also drop local pages such as "http-api.md".
    external_prefixes = ("http://", "https://")

    entries = []
    if isinstance(nav, list):
        for item in nav:
            entries.extend(extract_nav_entries(item, section_path))
    elif isinstance(nav, dict):
        for title, target in nav.items():
            if isinstance(target, str):
                # Leaf node: "Title: path.md" or an external URL.
                if target.startswith(external_prefixes):
                    continue  # skip external links
                entries.append((title, source_path_to_url(target)))
            elif isinstance(target, list):
                # Section with children; its title becomes the section path.
                entries.extend(extract_nav_entries(target, title))
    elif isinstance(nav, str):
        # Bare path without a title (e.g. index pages).
        if not nav.startswith(external_prefixes):
            entries.append((None, source_path_to_url(nav)))
    return entries


def load_mkdocs_nav():
    """Load the mkdocs.yaml configuration that contains the nav.

    mkdocs.yaml contains !!python/name tags that standard YAML loaders
    can't resolve without the material theme installed. A private loader
    subclass resolves every ``!!python/*`` tag to None so the rest of the
    file can be parsed safely.

    Returns:
        The parsed configuration (callers read its ``"nav"`` key), or an
        empty dict when the file has no content.
    """

    class _IgnorePythonTagsLoader(yaml.SafeLoader):
        """SafeLoader variant that maps ``!!python/*`` tags to None."""

    # Register on the subclass: add_multi_constructor modifies the class it
    # is called on, and yaml.SafeLoader itself must stay strict for any
    # other YAML parsing done in this process.
    _IgnorePythonTagsLoader.add_multi_constructor(
        "tag:yaml.org,2002:python/",
        lambda loader, suffix, node: None,
    )
    with open(MKDOCS_FILE, "r", encoding="utf-8") as f:
        config = yaml.load(f, Loader=_IgnorePythonTagsLoader)
    # An empty YAML document parses to None; normalise so callers can .get().
    return config if config is not None else {}


def generate_llms_txt():
    """Generate the llms.txt index from the mkdocs.yaml nav.

    Each top-level nav item is a dict like ``{"Concepts": [...]}``:

    - A top-level leaf (``Title: path.md``) becomes a single link, except
      for ``index.md`` and external URLs, which are skipped.
    - A top-level section becomes a ``## Section`` heading followed by one
      link per titled page found anywhere beneath it.

    The result is written to ``OUTPUT_INDEX`` as UTF-8 and a one-line
    summary is printed.
    """
    # Tolerate an empty config file and a bare ``nav:`` key (both parse to None).
    mkdocs_config = load_mkdocs_nav() or {}
    nav = mkdocs_config.get("nav") or []

    # Only genuine web URLs are external. A bare "http" prefix test would
    # also drop local pages such as "http-api.md".
    external_prefixes = ("http://", "https://")

    lines = [
        "# DataJoint Documentation",
        "",
        "> DataJoint is a Python framework for building scientific data pipelines "
        "with automated computation, integrity constraints, and seamless integration "
        "of relational databases with object storage.",
        "",
        "> For the complete documentation in a single file, see [/llms-full.txt](/llms-full.txt)",
        "",
    ]

    for nav_item in nav:
        if not isinstance(nav_item, dict):
            continue
        for section_name, section_content in nav_item.items():
            if isinstance(section_content, str):
                # Skip "Home: index.md" but keep other top-level leaves.
                if section_content == "index.md" or section_content.startswith(external_prefixes):
                    continue
                url = source_path_to_url(section_content)
                lines.append(f"- [{section_name}]({url})")
                lines.append("")
            elif isinstance(section_content, list):
                lines.append(f"## {section_name}")
                lines.append("")
                # Untitled entries (bare paths) have no link text, so omit them.
                lines.extend(
                    f"- [{title}]({url})"
                    for title, url in extract_nav_entries(section_content)
                    if title
                )
                lines.append("")

    content = "\n".join(lines) + "\n"
    with open(OUTPUT_INDEX, "w", encoding="utf-8") as f:
        f.write(content)

    # Report the encoded size: len(content) counts characters, not bytes.
    size = len(content.encode("utf-8"))
    print(f"Generated {OUTPUT_INDEX} ({size:,} bytes)")


def generate_llms_full():
"""Generate the llms-full.txt file."""
# Get current git info for version metadata
Expand Down Expand Up @@ -153,4 +266,5 @@ def generate_llms_full():


if __name__ == "__main__":
    # Build the nav-derived index first, then the concatenated full document.
    for generate in (generate_llms_txt, generate_llms_full):
        generate()
87 changes: 0 additions & 87 deletions src/llms.txt

This file was deleted.

Loading