From 75f5d6ffb43c24e5f3ebec6f2351930ed18d53d8 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Wed, 8 Apr 2026 12:28:41 -0500 Subject: [PATCH] fix: generate llms.txt from mkdocs nav to fix dead links llms.txt was a hand-maintained static file with source file paths (.md, .ipynb) as links. MkDocs serves pages at /path/ (no extension), so all links were 404s on the deployed site. Now llms.txt is auto-generated alongside llms-full.txt by gen_llms_full.py, which reads the mkdocs.yaml nav and converts source paths to deployed URLs. The static llms.txt is removed from git tracking. Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 1 + docker-compose.yaml | 3 +- scripts/gen_llms_full.py | 126 +++++++++++++++++++++++++++++++++++++-- src/llms.txt | 87 --------------------------- 4 files changed, 123 insertions(+), 94 deletions(-) delete mode 100644 src/llms.txt diff --git a/.gitignore b/.gitignore index d393cfaa..53bcaa85 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ temp* .secrets/ # Generated documentation files +src/llms.txt src/llms-full.txt site/llms-full.txt dj_local_conf.json diff --git a/docker-compose.yaml b/docker-compose.yaml index 2c5531b2..3310e7cc 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -84,12 +84,13 @@ services: # LIVE mode: install datajoint and notebook dependencies for interactive development pip install -e /datajoint-python pip install scikit-image pooch + python scripts/gen_llms_full.py mkdocs serve --config-file ./mkdocs.yaml -a 0.0.0.0:8000 elif echo "$${MODE}" | grep -i build &>/dev/null; then # BUILD mode: build static site from pre-executed notebooks # Install datajoint-python for mkdocstrings (needs to import for API docs) pip install -e /datajoint-python - # Generate llms-full.txt with current git info + # Generate llms.txt and llms-full.txt python scripts/gen_llms_full.py mkdocs build --config-file ./mkdocs.yaml elif echo "$${MODE}" | grep -i execute_pg &>/dev/null; then diff --git a/scripts/gen_llms_full.py b/scripts/gen_llms_full.py index a8d11616..0eab4f21 100644 --- a/scripts/gen_llms_full.py +++ b/scripts/gen_llms_full.py @@ -1,22 +1,27 @@ #!/usr/bin/env python3 """ -Generate llms-full.txt from documentation sources. +Generate llms.txt and llms-full.txt from documentation sources. -This script concatenates all markdown documentation into a single file -optimized for LLM consumption. +- llms.txt: Index with links derived from mkdocs.yaml nav +- llms-full.txt: Complete documentation concatenated for LLM consumption -The generated file is NOT committed to git - it's auto-generated during -the build process with current version metadata. +Both files are auto-generated during the build process. """ import json +import re import subprocess from datetime import datetime, timezone from pathlib import Path +import yaml + # Documentation root -DOCS_DIR = Path(__file__).parent.parent / "src" +PROJECT_DIR = Path(__file__).parent.parent +DOCS_DIR = PROJECT_DIR / "src" +MKDOCS_FILE = PROJECT_DIR / "mkdocs.yaml" OUTPUT_FILE = DOCS_DIR / "llms-full.txt" +OUTPUT_INDEX = DOCS_DIR / "llms.txt" # Sections in order of importance SECTIONS = [ @@ -115,6 +120,114 @@ def get_doc_files(directory: Path) -> list[Path]: return sorted(files) +def source_path_to_url(path: str) -> str: + """Convert a source file path to a deployed MkDocs URL. + + MkDocs with use_directory_urls=true (default) serves: + about/whats-new-2.md -> /about/whats-new-2/ + tutorials/basics/01-first-pipeline.ipynb -> /tutorials/basics/01-first-pipeline/ + index.md -> / + section/index.md -> /section/ + """ + # Strip file extension + url = re.sub(r"\.(md|ipynb)$", "", path) + # index pages -> parent directory + url = re.sub(r"/index$", "", url) + if url == "index": + return "/" + # Avoid double slash for paths like "api/" + if url.endswith("/"): + return f"/{url}" + return f"/{url}/" + + +def extract_nav_entries(nav, section_path=""): + """Recursively extract (title, url) pairs from mkdocs nav structure.""" + entries = [] + if isinstance(nav, list): + for item in nav: + entries.extend(extract_nav_entries(item, section_path)) + elif isinstance(nav, dict): + for key, value in nav.items(): + if isinstance(value, str): + # Leaf node: "Title: path.md" or external URL + if value.startswith("http"): + continue # skip external links + url = source_path_to_url(value) + entries.append((key, url)) + elif isinstance(value, list): + # Section with children + entries.extend(extract_nav_entries(value, key)) + elif isinstance(nav, str): + # Bare path without title (e.g., index pages) + if not nav.startswith("http"): + url = source_path_to_url(nav) + entries.append((None, url)) + return entries + + +def load_mkdocs_nav(): + """Load just the nav section from mkdocs.yaml. + + mkdocs.yaml contains !!python/name tags that standard YAML loaders + can't resolve without the material theme installed. We add a custom + constructor that ignores these tags. + """ + loader = yaml.SafeLoader + # Handle !!python/name and !!python/object tags by returning None + loader.add_multi_constructor( + "tag:yaml.org,2002:python/", + lambda loader, suffix, node: None, + ) + with open(MKDOCS_FILE, "r") as f: + return yaml.load(f, Loader=loader) + + +def generate_llms_txt(): + """Generate llms.txt index from mkdocs.yaml nav.""" + mkdocs_config = load_mkdocs_nav() + + nav = mkdocs_config.get("nav", []) + + # Map top-level nav sections to llms.txt sections + # Each top-level nav item is a dict like {"Concepts": [...]} + lines = [ + "# DataJoint Documentation", + "", + "> DataJoint is a Python framework for building scientific data pipelines " + "with automated computation, integrity constraints, and seamless integration " + "of relational databases with object storage.", + "", + "> For the complete documentation in a single file, see [/llms-full.txt](/llms-full.txt)", + "", + ] + + for nav_item in nav: + if isinstance(nav_item, dict): + for section_name, section_content in nav_item.items(): + if isinstance(section_content, str): + # Skip "Home: index.md" but keep other top-level leaves + if section_content == "index.md" or section_content.startswith("http"): + continue + url = source_path_to_url(section_content) + lines.append(f"- [{section_name}]({url})") + lines.append("") + elif isinstance(section_content, list): + lines.append(f"## {section_name}") + lines.append("") + entries = extract_nav_entries(section_content) + for title, url in entries: + if title: + lines.append(f"- [{title}]({url})") + lines.append("") + + content = "\n".join(lines) + "\n" + with open(OUTPUT_INDEX, "w", encoding="utf-8") as f: + f.write(content) + + print(f"Generated {OUTPUT_INDEX} ({len(content):,} bytes)") + + def generate_llms_full(): """Generate the llms-full.txt file.""" # Get current git info for version metadata @@ -153,4 +266,5 @@ def generate_llms_full(): if __name__ == "__main__": + generate_llms_txt() generate_llms_full() diff --git a/src/llms.txt b/src/llms.txt deleted file mode 100644 index 68935fd5..00000000 --- a/src/llms.txt +++ /dev/null @@ -1,87 +0,0 @@ -# DataJoint Documentation - -> DataJoint is a Python framework for building scientific data pipelines with automated computation, integrity constraints, and seamless integration of relational databases with object storage. This documentation covers DataJoint 2.0. - -> For the complete documentation in a single file, see [/llms-full.txt](/llms-full.txt) - -## Concepts - -- [What's New in 2.0](/about/whats-new-2.md): Major changes and new features in DataJoint 2.0 -- [Relational Workflow Model](/explanation/relational-workflow-model.md): Core data model concepts -- [Entity Integrity](/explanation/entity-integrity.md): How DataJoint ensures data consistency -- [Normalization](/explanation/normalization.md): Database normalization principles -- [Query Algebra](/explanation/query-algebra.md): Operators for combining and filtering data -- [Type System](/explanation/type-system.md): Data types and codecs -- [Computation Model](/explanation/computation-model.md): Automated computation with populate() -- [Custom Codecs](/explanation/custom-codecs.md): Extending the type system - -## Tutorials (Basics) - -- [First Pipeline](/tutorials/basics/01-first-pipeline.ipynb): Your first DataJoint pipeline -- [Schema Design](/tutorials/basics/02-schema-design.ipynb): Tables, keys, and relationships -- [Data Entry](/tutorials/basics/03-data-entry.ipynb): Inserting and managing data -- [Queries](/tutorials/basics/04-queries.ipynb): Operators and fetching results -- [Computation](/tutorials/basics/05-computation.ipynb): Imported and Computed tables -- [Object Storage](/tutorials/basics/06-object-storage.ipynb): Blobs, attachments, and object storage - -## Tutorials (Examples) - -- [University Database](/tutorials/examples/university.ipynb): Complete example with students, courses, grades -- [Fractal Pipeline](/tutorials/examples/fractal-pipeline.ipynb): Iterative computation patterns -- [Blob Detection](/tutorials/examples/blob-detection.ipynb): Image processing with automated computation - -## Tutorials (Domain) - -- [Calcium Imaging](/tutorials/domain/calcium-imaging/calcium-imaging.ipynb): Import TIFF movies, segment cells, extract traces -- [Electrophysiology](/tutorials/domain/electrophysiology/electrophysiology.ipynb): Import recordings, detect spikes, extract waveforms -- [Allen CCF](/tutorials/domain/allen-ccf/allen-ccf.ipynb): Brain atlas with hierarchical region ontology - -## Tutorials (Advanced) - -- [SQL Comparison](/tutorials/advanced/sql-comparison.ipynb): DataJoint for SQL users -- [JSON Data Type](/tutorials/advanced/json-type.ipynb): Semi-structured data in tables -- [Distributed Computing](/tutorials/advanced/distributed.ipynb): Multi-process and cluster workflows -- [Custom Codecs](/tutorials/advanced/custom-codecs.ipynb): Extending the type system - -## How-To Guides - -- [Installation](/how-to/installation.md): Install DataJoint -- [Configure Database](/how-to/configure-database.md): Database connection setup -- [Configure Object Storage](/how-to/configure-storage.md): S3 and filesystem storage -- [Define Tables](/how-to/define-tables.md): Table definitions -- [Model Relationships](/how-to/model-relationships.ipynb): Foreign keys and dependencies -- [Design Primary Keys](/how-to/design-primary-keys.md): Key selection best practices -- [Insert Data](/how-to/insert-data.md): Adding data to tables -- [Query Data](/how-to/query-data.md): Filtering and combining tables -- [Fetch Results](/how-to/fetch-results.md): Retrieving data -- [Delete Data](/how-to/delete-data.md): Removing data safely -- [Run Computations](/how-to/run-computations.md): Using populate() -- [Distributed Computing](/how-to/distributed-computing.md): Multi-worker execution -- [Handle Errors](/how-to/handle-errors.md): Error handling in pipelines -- [Monitor Progress](/how-to/monitor-progress.md): Tracking computation status -- [Use Object Storage](/how-to/use-object-storage.md): Object storage integration -- [Create Custom Codecs](/how-to/create-custom-codec.md): Custom data types -- [Manage Large Data](/how-to/manage-large-data.md): Scaling strategies -- [Migrate to v2.0](/how-to/migrate-to-v20.md): Upgrading from DataJoint 0.14 -- [Alter Tables](/how-to/alter-tables.md): Schema modifications -- [Backup and Restore](/how-to/backup-restore.md): Data backup strategies - -## Reference - -- [Table Declaration](/reference/specs/table-declaration.md): Table definition syntax -- [Query Algebra](/reference/specs/query-algebra.md): Query operator reference -- [Data Manipulation](/reference/specs/data-manipulation.md): Insert, update, delete operations -- [Primary Keys](/reference/specs/primary-keys.md): Key constraints specification -- [Semantic Matching](/reference/specs/semantic-matching.md): Join and restriction rules -- [Type System](/reference/specs/type-system.md): Data type specification -- [Codec API](/reference/specs/codec-api.md): Custom codec interface -- [AutoPopulate](/reference/specs/autopopulate.md): Computation engine specification -- [Fetch API](/reference/specs/fetch-api.md): Data retrieval methods -- [Job Metadata](/reference/specs/job-metadata.md): Job management tables - -## Optional - -- [API Reference](/api/): Auto-generated Python API documentation -- [Configuration](/reference/configuration.md): All configuration options -- [Definition Syntax](/reference/definition-syntax.md): Table definition grammar -- [Errors](/reference/errors.md): Error types and handling