#!/usr/bin/env python3
"""
validate_diagram_anchors.py (EDUENG-613)

For doc files that contain remote_include tags pulling from
cockroachdb/generated-diagrams grammar_svg, fetches each referenced diagram
HTML and verifies that every sql-grammar.html#ANCHOR link inside it resolves
against stmt_block.html on the same branch.

This is the exact failure that blocked production builds on 2026-01-29:
  show_statement_hints.html referenced sql-grammar.html#opt_with_show_hints_options
  but that anchor did not yet exist in stmt_block.html on release-26.1.

Usage:
  # Check specific files (e.g. changed files in a PR):
  python .github/scripts/validate_diagram_anchors.py file1.md file2.md ...

  # Full scan:
  python .github/scripts/validate_diagram_anchors.py

Exit codes:
  0  all checks passed
  1  one or more broken anchors found
  2  fatal error (versions.csv not found, or a GitHub HTTP error other
     than 404 such as rate limiting)

Environment:
  GITHUB_TOKEN    Optional. Raises GitHub API rate limit from 60 to 5000 req/hr.
  GITHUB_ACTIONS  Set automatically in CI. Enables pr-comment.md output.
"""

import base64
import csv
import json
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from collections.abc import Iterable, Mapping
from html.parser import HTMLParser
from pathlib import Path
from typing import Optional

GENERATED_DIAGRAMS_REPO = "cockroachdb/generated-diagrams"
GITHUB_API_BASE = "https://api.github.com"
VERSIONS_CSV = Path("src/current/_data/versions.csv")
DOCS_ROOT = Path("src/current")

# {% remote_include https://raw.githubusercontent.com/cockroachdb/generated-diagrams/
#    {{ page.release_info.crdb_branch_name }}/grammar_svg/show_statement_hints.html %}
REMOTE_INCLUDE_RE = re.compile(
    r"\{%-?\s*remote_include\s+"
    r"https://raw\.githubusercontent\.com/cockroachdb/generated-diagrams/"
    r"\{\{[^}]*crdb_branch_name[^}]*\}\}/grammar_svg/"
    r"([\w.-]+\.html)"
    r"\s*-?%\}"
)

# href="sql-grammar.html#opt_with_show_hints_options"
ANCHOR_REF_RE = re.compile(r'href=["\']sql-grammar\.html#([^"\']+)["\']')


# ---------------------------------------------------------------------------
# HTTP
# ---------------------------------------------------------------------------

def _fetch_github_content(repo: str, path: str, ref: str) -> Optional[str]:
    """Fetch a file from GitHub using the Contents API.

    The REST endpoint is used so that GITHUB_TOKEN, when set, is sent as a
    bearer token. When the response carries no inline base64 payload (the
    large-file case), the file is fetched from the ``download_url`` field
    instead; that second request is sent without the token.

    Args:
        repo: ``owner/name`` of the repository.
        path: File path inside the repository.
        ref: Branch, tag or commit to read from.

    Returns:
        The decoded file text, or ``None`` when the file does not exist
        (HTTP 404), the response is not a single-file object, or the request
        failed for a non-HTTP reason (a warning is printed in that case).

    Raises:
        urllib.error.HTTPError: For any HTTP status other than 404, so that
            rate limiting or server errors are not mistaken for "not found".
    """
    encoded_ref = urllib.parse.quote(ref, safe="")
    encoded_path = urllib.parse.quote(path, safe="/")
    url = (
        f"{GITHUB_API_BASE}/repos/{repo}/contents/{encoded_path}"
        f"?ref={encoded_ref}"
    )

    req = urllib.request.Request(url)
    req.add_header("Accept", "application/vnd.github+json")
    req.add_header("X-GitHub-Api-Version", "2022-11-28")
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        req.add_header("Authorization", f"Bearer {token}")

    try:
        with urllib.request.urlopen(req, timeout=20) as resp:
            data = json.loads(resp.read().decode())

        # A directory path yields a JSON list, not a file object.
        if not isinstance(data, dict):
            return None

        # Normal case: inline base64 payload.
        if data.get("encoding") == "base64" and data.get("content"):
            return base64.b64decode(data["content"].encode()).decode(
                "utf-8", errors="replace"
            )

        # Large file: no inline payload, fall back to the raw download_url.
        download_url = data.get("download_url")
        if download_url:
            with urllib.request.urlopen(download_url, timeout=20) as raw_resp:
                return raw_resp.read().decode("utf-8", errors="replace")

        return None
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            return None
        raise
    except Exception as exc:
        # Deliberately best-effort: timeouts, connection resets and malformed
        # payloads are reported and treated as "could not fetch".
        print(f"  Warning: fetch {repo}/{path}@{ref} failed: {exc}", file=sys.stderr)
        return None


# ---------------------------------------------------------------------------
# Cached lookups
# ---------------------------------------------------------------------------

# branch -> set of ids found in stmt_block.html (None when the fetch failed).
_stmt_block_cache: dict[str, Optional[set]] = {}


class _IDCollector(HTMLParser):
    """Collects all id= attribute values from an HTML document."""

    def __init__(self) -> None:
        super().__init__()
        self.ids: set[str] = set()

    def handle_starttag(
        self, tag: str, attrs: list[tuple[str, Optional[str]]]
    ) -> None:
        for name, value in attrs:
            if name == "id" and value:
                self.ids.add(value)


def get_stmt_block_anchors(branch: str) -> Optional[set]:
    """Return all id= values in stmt_block.html for the given branch.

    Results (including a failed fetch, stored as ``None``) are cached per
    branch for the lifetime of the process.
    """
    if branch not in _stmt_block_cache:
        content = _fetch_github_content(
            GENERATED_DIAGRAMS_REPO, "grammar_svg/stmt_block.html", branch
        )
        if content is None:
            _stmt_block_cache[branch] = None
        else:
            collector = _IDCollector()
            collector.feed(content)
            _stmt_block_cache[branch] = collector.ids
    return _stmt_block_cache[branch]


# ---------------------------------------------------------------------------
# Parsing helpers
# ---------------------------------------------------------------------------

def parse_version_rows(
    rows: Iterable[Mapping[str, Optional[str]]]
) -> dict[str, str]:
    """Build {major_version: crdb_branch_name} from versions.csv rows.

    Rows with a blank version, a blank branch, or the placeholder branch
    ``N/A`` are skipped. A cell may be ``None``: csv.DictReader fills the
    missing trailing cells of a short row with ``None``, which must not
    crash the scan.
    """
    result: dict[str, str] = {}
    for row in rows:
        version = (row.get("major_version") or "").strip()
        branch = (row.get("crdb_branch_name") or "").strip()
        if version and branch and branch != "N/A":
            result[version] = branch
    return result


def load_versions_csv() -> dict[str, str]:
    """Return {major_version: crdb_branch_name} for all valid rows.

    Exits the process with status 2 when versions.csv is not found relative
    to the current working directory.
    """
    if not VERSIONS_CSV.exists():
        print(f"Error: {VERSIONS_CSV} not found. Run from the repo root.", file=sys.stderr)
        sys.exit(2)
    # utf-8-sig reads plain UTF-8 too, and strips a BOM that would otherwise
    # become part of the first header name.
    with open(VERSIONS_CSV, newline="", encoding="utf-8-sig") as f:
        return parse_version_rows(csv.DictReader(f))


def version_from_path(path: Path) -> Optional[str]:
    """Return the first path component shaped like ``vNN.N``, or ``None``."""
    for part in path.parts:
        if re.match(r"^v\d+\.\d+$", part):
            return part
    return None


def scan_files(files: list[Path]) -> dict[tuple[str, str], list[Path]]:
    """
    Scan markdown files for SQL diagram remote_include tags.
    Returns {(version, diagram_filename): [source_paths]}.

    Files that are not markdown, do not exist, cannot be read, or have no
    version component in their path are skipped.
    """
    result: dict[tuple[str, str], list[Path]] = {}
    for path in files:
        if path.suffix not in (".md", ".markdown") or not path.exists():
            continue
        version = version_from_path(path)
        if not version:
            continue
        try:
            content = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        for m in REMOTE_INCLUDE_RE.finditer(content):
            key = (version, m.group(1))
            result.setdefault(key, []).append(path)
    return result


def find_missing_anchors(diagram_html: str, known_anchors: set) -> list[str]:
    """Return the sql-grammar.html anchors in *diagram_html* that are unknown.

    Each missing anchor is listed once, in first-seen order, even when the
    diagram links to it several times.
    """
    unique_refs = dict.fromkeys(ANCHOR_REF_RE.findall(diagram_html))
    return [anchor for anchor in unique_refs if anchor not in known_anchors]


# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------

def run_checks(
    diagram_includes: dict[tuple[str, str], list[Path]],
    version_to_branch: dict[str, str],
) -> list[dict]:
    """Check every referenced diagram against its branch's stmt_block.html.

    Args:
        diagram_includes: Output of scan_files().
        version_to_branch: Output of load_versions_csv().

    Returns:
        One dict per (diagram, branch, missing anchor) with the keys
        ``diagram``, ``branch``, ``anchor``, ``source_files`` and ``message``.
        Diagrams or branches that cannot be fetched are reported on stdout
        and skipped.
    """
    failures: list[dict] = []

    # Group by branch to share stmt_block.html fetches.
    branch_to_pairs: dict[str, list[tuple[str, str, list[Path]]]] = {}
    unmapped_versions: set[str] = set()
    for (version, diagram), source_files in diagram_includes.items():
        branch = version_to_branch.get(version)
        if not branch:
            unmapped_versions.add(version)
            continue
        branch_to_pairs.setdefault(branch, []).append((version, diagram, source_files))

    # Make skipped versions visible instead of dropping them silently.
    for version in sorted(unmapped_versions):
        print(
            f"  Warning: no crdb_branch_name for {version} in {VERSIONS_CSV}; "
            "its diagrams were not checked.",
            file=sys.stderr,
        )

    for branch, pairs in sorted(branch_to_pairs.items()):
        print(f"  Branch {branch}:")
        print("    Fetching stmt_block.html ...", end=" ", flush=True)
        known_anchors = get_stmt_block_anchors(branch)
        if known_anchors is None:
            print("NOT FOUND — skipping this branch")
            continue
        print(f"{len(known_anchors)} anchors")

        # Sort on (version, diagram) only so the Path lists are never compared.
        for _version, diagram, source_files in sorted(pairs, key=lambda p: p[:2]):
            content = _fetch_github_content(
                GENERATED_DIAGRAMS_REPO, f"grammar_svg/{diagram}", branch
            )
            if content is None:
                print(f"    {diagram}: NOT FOUND in generated-diagrams (skipping)")
                continue

            missing = find_missing_anchors(content, known_anchors)
            if not missing:
                ref_count = len(ANCHOR_REF_RE.findall(content))
                print(f"    {diagram}: OK ({ref_count} anchor ref(s))")
                continue

            print(f"    {diagram}: {len(missing)} MISSING anchor(s)")
            for anchor in missing:
                failures.append({
                    "diagram": diagram,
                    "branch": branch,
                    "anchor": anchor,
                    "source_files": [str(f) for f in source_files],
                    "message": (
                        f"Diagram {diagram!r} on {branch!r} links to "
                        f"sql-grammar.html#{anchor}, "
                        f"but that anchor is absent from stmt_block.html."
                    ),
                })

    return failures


# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------

def format_comment(failures: list[dict]) -> str:
    """Render the check result as Markdown for a PR comment / step summary."""
    if not failures:
        return (
            "## Diagram Anchor Check: Passed\n\n"
            "All `sql-grammar.html#anchor` references in SQL diagram files "
            "resolve correctly against `stmt_block.html`."
        )

    lines = [
        "## Diagram Anchor Check: Failed",
        "",
        f"Found **{len(failures)}** broken anchor(s) that will cause docs build failures.",
        "",
        "> **Context**: [EDUENG-613](https://cockroachlabs.atlassian.net/browse/EDUENG-613) — "
        "same failure mode as 2026-01-29 (`opt_with_show_hints_options` missing from `stmt_block.html`).",
        "",
    ]
    for f in failures:
        lines.append(
            f"- **`{f['diagram']}`** on `{f['branch']}` "
            f"→ missing anchor `#{f['anchor']}`"
        )
        for s in f["source_files"]:
            # Two-space indent so Markdown nests this under the bullet above.
            lines.append(f"  - referenced by `{s}`")

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Scan the requested files (or the whole docs tree) and exit 0/1/2."""
    version_to_branch = load_versions_csv()

    if len(sys.argv) > 1:
        files = [Path(a) for a in sys.argv[1:]]
    else:
        files = list(DOCS_ROOT.rglob("*.md"))

    print(f"Scanning {len(files)} file(s) for SQL diagram remote_include tags...")
    diagram_includes = scan_files(files)
    print(f"Found {len(diagram_includes)} unique (version, diagram) pair(s).\n")

    try:
        failures = run_checks(diagram_includes, version_to_branch)
    except urllib.error.HTTPError as exc:
        # Rate limiting / server errors are infrastructure problems, not
        # broken anchors: report them with the "fatal error" exit code.
        print(
            f"Error: GitHub request failed: HTTP {exc.code} {exc.reason}",
            file=sys.stderr,
        )
        sys.exit(2)

    comment = format_comment(failures)
    if os.environ.get("GITHUB_ACTIONS"):
        summary = os.environ.get("GITHUB_STEP_SUMMARY")
        if summary:
            Path(summary).write_text(comment, encoding="utf-8")
        Path("pr-comment.md").write_text(comment, encoding="utf-8")

    if failures:
        print("\n--- Issues ---", file=sys.stderr)
        for f in failures:
            print(f"  {f['message']}", file=sys.stderr)
            for s in f["source_files"]:
                print(f"    referenced by: {s}", file=sys.stderr)
        print(f"\nTotal: {len(failures)} broken anchor(s).", file=sys.stderr)
        sys.exit(1)
    else:
        print("\nAll diagram anchor checks passed.")
        sys.exit(0)


if __name__ == "__main__":
    main()
name: Validate Diagram Anchors

# EDUENG-613
# For doc files with remote_include tags pulling from generated-diagrams
# grammar_svg, fetches each diagram HTML and verifies that every
# sql-grammar.html#ANCHOR reference resolves against stmt_block.html on the
# same branch.
#
# This is the exact failure that blocked production on 2026-01-29:
#   show_statement_hints.html referenced sql-grammar.html#opt_with_show_hints_options
#   but that anchor did not yet exist in stmt_block.html on release-26.1.

on:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - 'src/current/**/*.md'
  schedule:
    # Daily at 07:15 UTC.
    - cron: '15 7 * * *'
  workflow_dispatch:

# Least privilege for GITHUB_TOKEN: read the checkout, write PR comments and
# the tracking issue. Without this block the comment/issue steps fail when
# the repository default token permission is read-only.
permissions:
  contents: read
  issues: write
  pull-requests: write

jobs:
  validate-diagram-anchors:
    name: Check SQL diagram anchors against stmt_block.html
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      # NOTE(review): this pin is the v35 line of tj-actions/changed-files;
      # confirm it against the project's security advisories and move to a
      # current, SHA-pinned release if needed.
      - name: Get changed files (PR only)
        id: changed-files
        if: github.event_name == 'pull_request'
        uses: tj-actions/changed-files@cc08e170f4447237bcaf8acaacfa615b9cb86612 # v35
        with:
          files: 'src/current/**/*.md'
          separator: ' '

      # PR: only scan changed .md files for speed.
      - name: Run anchor check (PR — changed files)
        id: validate-pr
        if: github.event_name == 'pull_request' && steps.changed-files.outputs.any_changed == 'true'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_ACTIONS: 'true'
          # File names come from the PR and are untrusted: pass them through
          # the environment instead of expanding them into the script text.
          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        run: |
          # Split on whitespace into an array: no globbing, no re-evaluation.
          read -r -a changed_files <<< "$ALL_CHANGED_FILES"
          python .github/scripts/validate_diagram_anchors.py "${changed_files[@]}"
        continue-on-error: true

      # Scheduled / manual: full scan.
      - name: Run anchor check (full scan)
        id: validate-full
        if: github.event_name != 'pull_request'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GITHUB_ACTIONS: 'true'
        run: python .github/scripts/validate_diagram_anchors.py
        continue-on-error: true

      - name: Post PR comment
        if: github.event_name == 'pull_request' && steps.changed-files.outputs.any_changed == 'true'
        # A comment failure (e.g. a read-only token) must not hide the actual
        # check result, which is enforced by the "Fail on PR issues" step.
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const fs = require('fs');
            // Hidden marker identifying this workflow's comment. It must be
            // non-empty: String.prototype.includes('') is true for every
            // comment, which would overwrite unrelated bot comments.
            const marker = '<!-- validate-diagram-anchors -->';

            let body = marker + '\n';
            try {
              body += fs.readFileSync('pr-comment.md', 'utf8');
            } catch {
              body += '### Diagram Anchor Check\n\nCheck ran but could not generate a detailed report.';
            }

            // Paginate so an existing comment beyond the first page is found.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100,
            });

            const existing = comments.find(
              c => c.user && c.user.type === 'Bot' && c.body && c.body.includes(marker)
            );

            if (existing) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: existing.id,
                body,
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body,
              });
            }

      - name: Fail on PR issues
        if: >
          github.event_name == 'pull_request' &&
          steps.changed-files.outputs.any_changed == 'true' &&
          steps.validate-pr.outcome == 'failure'
        run: |
          echo "Diagram anchor check failed. See the PR comment for details."
          exit 1

      - name: Open or update tracking issue (scheduled failure)
        if: github.event_name != 'pull_request' && steps.validate-full.outcome == 'failure'
        uses: actions/github-script@v7
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const fs = require('fs');
            const runUrl = `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`;

            let detail = '';
            try {
              detail = fs.readFileSync('pr-comment.md', 'utf8');
            } catch {
              detail = `Check failed. See [workflow run](${runUrl}) for details.`;
            }

            const label = 'sql-diagram-validation';
            const { data: listed } = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'open',
              labels: label,
            });
            // The issues endpoint also returns pull requests; ignore them.
            const issues = listed.filter(i => !i.pull_request);

            if (issues.length === 0) {
              await github.rest.issues.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                title: 'Diagram anchor check failure (automated)',
                body: [
                  'Opened automatically by the nightly diagram anchor workflow.',
                  '',
                  detail,
                  '',
                  `[Workflow run](${runUrl})`,
                ].join('\n'),
                labels: [label],
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issues[0].number,
                body: `**Nightly update** — [run ${{ github.run_id }}](${runUrl}):\n\n${detail}`,
              });
            }