diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fd37a70..dfa8504 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,6 +11,21 @@ concurrency: cancel-in-progress: true jobs: + codeql: + name: CodeQL analysis + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + steps: + - uses: actions/checkout@v5 + - uses: github/codeql-action/init@v4 + with: + languages: go + - uses: github/codeql-action/autobuild@v4 + - uses: github/codeql-action/analyze@v4 + pr-title: name: Validate PR title if: github.event_name == 'pull_request' @@ -62,6 +77,12 @@ jobs: run: make benchmark-schema-validate - name: Run statistical quality gate run: make ci-benchmark-quality-check + - name: Validate publication disclaimer policy + run: make report-disclaimer-check + - name: Validate methodology changelog policy + run: make methodology-changelog-check + - name: Validate README/report publication sync + run: make publication-sync-check - name: Upload benchmark quality summary uses: actions/upload-artifact@v4 with: diff --git a/METHODOLOGY.md b/METHODOLOGY.md index 3efddca..4932a7f 100644 --- a/METHODOLOGY.md +++ b/METHODOLOGY.md @@ -46,6 +46,28 @@ - quality summary: `results/latest/benchmark-quality-summary.json` - optional tool artifacts: `results/latest/tooling/benchstat/*.txt` +## Methodology changelog policy + +### Update rules + +- update this changelog whenever benchmark process, tooling, schema, thresholds, runtime constraints, or interpretation rules change +- classify each entry as `comparability-impacting` or `non-comparability-impacting` +- for `comparability-impacting` changes, include migration notes and baseline reset guidance +- do not publish new benchmark claims without a corresponding changelog entry when methodology or version changed + +### Entry format + +Use one row per change with required fields: + +`version | date (UTC) | change_type | summary | comparability_impact | 
required_action` + +### Changelog + +| version | date (UTC) | change_type | summary | comparability_impact | required_action | +|---|---|---|---|---|---| +| 1.1.0 | 2026-02-07 | policy | Added publication fairness disclaimer template and README/report sync policy checks | comparability-impacting | Rebaseline external comparisons and reference this version in publication notes | +| 1.0.0 | 2026-02-05 | baseline | Established parity-gated benchmark workflow, schema validation, and quality gates | comparability-impacting | Treat pre-1.0 outputs as non-comparable to current policy | + ## Interpretation guidance - treat parity failures as correctness blockers, not performance regressions diff --git a/Makefile b/Makefile index 4584c5b..94c0571 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ GOPATH ?= $(shell $(GO) env GOPATH) GO_PATCH_COVER ?= $(GOPATH)/bin/go-patch-cover MODULES = $(shell find . -type f -name "go.mod" -not -path "*/.*/*" -not -path "*/vendor/*" -exec dirname {} \;) -.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-coverage test-patch-coverage tools parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check +.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-coverage test-patch-coverage tools parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check 
ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check report-disclaimer-check methodology-changelog-check publication-sync-check benchmark: bash scripts/run-all.sh @@ -100,6 +100,15 @@ benchmark-benchstat-check: ci-benchmark-quality-check: $(PYTHON) scripts/benchmark-quality-check.py ci-check +report-disclaimer-check: + $(PYTHON) scripts/publication-policy-check.py report-disclaimer-check + +methodology-changelog-check: + $(PYTHON) scripts/publication-policy-check.py methodology-changelog-check + +publication-sync-check: + $(PYTHON) scripts/publication-policy-check.py publication-sync-check + workflow-concurrency-check: $(PYTHON) scripts/workflow-policy-check.py concurrency-check diff --git a/README.md b/README.md index b5d1201..b50d7c6 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,18 @@ benchmarks/ - fixture contract is source-of-truth for expected API behavior - matcher changes require fixture updates and design doc updates +## Publication policy + +- latest-results source of truth: `results/latest/summary.json` and `results/latest/report.md` +- report and summary are generated from `results/latest/raw/*.json` via `python3 scripts/generate-report.py` +- README must not publish standalone benchmark numbers; publication references must point to generated artifacts + +### Fairness disclaimer (publication-wide) + +- Language-vs-framework caveat: cross-language comparisons include runtime/ecosystem effects and are not framework-only deltas +- Cross-language interpretation must be treated as directional evidence, not absolute winner claims +- Parity failures invalidate performance interpretation until correctness is restored + ## Documentation - `docs/design/002-api-parity-contract.md` - parity contract rationale diff --git a/docs/guides/benchmark-workflow.md b/docs/guides/benchmark-workflow.md index 2519eb6..9f050e7 100644 --- a/docs/guides/benchmark-workflow.md +++ b/docs/guides/benchmark-workflow.md @@ -78,6 +78,9 @@ 
#!/usr/bin/env python3
"""Publication policy gates for benchmark reports.

Each sub-command validates that a required piece of publication policy text
is present in the repository and exits non-zero (via ``SystemExit``) with a
``<check-name> failed: ...`` message when it is not:

* ``report-disclaimer-check``     - fairness disclaimer sections exist in the
  report generator and, when present, in the generated report.
* ``methodology-changelog-check`` - METHODOLOGY.md carries the changelog
  policy and at least one ``comparability-impacting`` changelog row.
* ``publication-sync-check``      - README, report generator and generated
  report share the same caveats.
"""
import sys
from pathlib import Path


ROOT = Path(__file__).resolve().parent.parent
REPORT = ROOT / "results" / "latest" / "report.md"
REPORT_GENERATOR = ROOT / "scripts" / "generate-report.py"
METHODOLOGY = ROOT / "METHODOLOGY.md"
README = ROOT / "README.md"

# Exact cell value that marks a changelog row as comparability-impacting.
IMPACTING_LABEL = "comparability-impacting"


def report_content() -> str:
    """Return the text of the generated report, or "" when it does not exist."""
    if REPORT.exists():
        return REPORT.read_text(encoding="utf-8")
    return ""


def generator_template_content(check_name: str = "report-disclaimer-check") -> str:
    """Return the source text of the report generator script.

    Args:
        check_name: Name of the calling check, used as the prefix of the
            failure message so the failing gate is reported correctly.

    Raises:
        SystemExit: If the generator script is missing.
    """
    if not REPORT_GENERATOR.exists():
        raise SystemExit(f"{check_name} failed: missing generator at {REPORT_GENERATOR}")
    return REPORT_GENERATOR.read_text(encoding="utf-8")


def disclaimer_check() -> None:
    """Validate that the fairness disclaimer sections are published.

    The generator template must always contain every required token; the
    generated report is only checked when it exists.

    Raises:
        SystemExit: If a required token is missing from either source.
    """
    report_present = REPORT.exists()
    content = report_content()
    template = generator_template_content()
    required = [
        "## Fairness Disclaimer",
        "Language-vs-framework caveat",
        "## Anti-Misinterpretation Guidance",
        "cross-language",
    ]
    for token in required:
        if token not in template:
            raise SystemExit(f"report-disclaimer-check failed: missing '{token}' in scripts/generate-report.py")
        if report_present and token not in content:
            raise SystemExit(f"report-disclaimer-check failed: missing '{token}' in results/latest/report.md")
    source = "report + generator" if report_present else "generator template"
    print(f"report-disclaimer-check: validated disclaimer sections via {source}")


def has_comparability_impacting_entry(content: str) -> bool:
    """Return True if *content* has a table row with an impacting cell.

    A row qualifies only when one of its cells is exactly
    ``comparability-impacting`` (surrounding whitespace and backticks are
    ignored). A plain substring test is not enough, because
    ``non-comparability-impacting`` contains the label as a substring.
    """
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line.startswith("|"):
            continue
        cells = [cell.strip().strip("`") for cell in line.strip("|").split("|")]
        if IMPACTING_LABEL in cells:
            return True
    return False


def changelog_check() -> None:
    """Validate the methodology changelog policy in METHODOLOGY.md.

    Raises:
        SystemExit: If METHODOLOGY.md is missing, lacks a required policy
            section, or has no comparability-impacting changelog row.
    """
    if not METHODOLOGY.exists():
        raise SystemExit(f"methodology-changelog-check failed: missing {METHODOLOGY}")

    content = METHODOLOGY.read_text(encoding="utf-8")
    required = [
        "## Methodology changelog policy",
        "### Update rules",
        "### Entry format",
        "### Changelog",
        "comparability-impacting",
        "| version | date (UTC) | change_type | summary | comparability_impact | required_action |",
    ]
    for token in required:
        if token not in content:
            raise SystemExit(f"methodology-changelog-check failed: missing '{token}' in METHODOLOGY.md")

    if not has_comparability_impacting_entry(content):
        raise SystemExit(
            "methodology-changelog-check failed: changelog requires at least one comparability-impacting entry"
        )
    print("methodology-changelog-check: validated changelog policy and comparability entries")


def publication_sync_check() -> None:
    """Validate that README and report publish the same caveats.

    README policy lines are matched case-sensitively; the shared caveats are
    matched case-insensitively (casefolded) across README, the generator
    template and, when it exists, the generated report.

    Raises:
        SystemExit: If README.md or the generator is missing, or any required
            text is absent from one of the sources.
    """
    if not README.exists():
        raise SystemExit(f"publication-sync-check failed: missing {README}")

    readme = README.read_text(encoding="utf-8")
    report_present = REPORT.exists()
    report = report_content()
    # Pass the check name so a missing generator is attributed to this gate.
    template = generator_template_content("publication-sync-check")
    readme_folded = readme.casefold()
    template_folded = template.casefold()
    report_folded = report.casefold()

    readme_required = [
        "## Publication policy",
        "latest-results source of truth: `results/latest/summary.json` and `results/latest/report.md`",
        "README must not publish standalone benchmark numbers",
    ]
    for token in readme_required:
        if token not in readme:
            raise SystemExit(f"publication-sync-check failed: missing '{token}' in README.md")

    shared_caveats = [
        "Language-vs-framework caveat",
        "cross-language",
        "Parity failures invalidate performance interpretation",
    ]
    for token in shared_caveats:
        token_folded = token.casefold()
        if token_folded not in readme_folded:
            raise SystemExit(f"publication-sync-check failed: missing caveat '{token}' in README.md")
        if token_folded not in template_folded:
            raise SystemExit(f"publication-sync-check failed: missing caveat '{token}' in scripts/generate-report.py")
        if report_present and token_folded not in report_folded:
            raise SystemExit(f"publication-sync-check failed: missing caveat '{token}' in results/latest/report.md")

    report_source = "report + generator" if report_present else "generator template"
    print(f"publication-sync-check: validated README/report caveat sync via {report_source}")


def main() -> None:
    """Run the check named by the first CLI argument.

    Defaults to ``report-disclaimer-check`` when no argument is given.

    Raises:
        SystemExit: With a usage message for an unknown command, or with the
            failure message of the check that was run.
    """
    checks = {
        "report-disclaimer-check": disclaimer_check,
        "methodology-changelog-check": changelog_check,
        "publication-sync-check": publication_sync_check,
    }
    command = sys.argv[1] if len(sys.argv) > 1 else "report-disclaimer-check"
    check = checks.get(command)
    if check is None:
        raise SystemExit(
            "usage: publication-policy-check.py [report-disclaimer-check|methodology-changelog-check|publication-sync-check]"
        )
    check()


if __name__ == "__main__":
    main()