diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8bbd11e..a27d8d9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,6 +67,8 @@ jobs: run: make setup-dev-env-ci-scripts - name: Run non-network script unit tests run: make test-scripts + - name: Validate marker debt policy + run: make todo-debt-check - name: Run benchmark script smoke run: bash scripts/run-all.sh - name: Generate report from raw results diff --git a/METHODOLOGY.md b/METHODOLOGY.md index a34f8ef..779bc0d 100644 --- a/METHODOLOGY.md +++ b/METHODOLOGY.md @@ -22,7 +22,7 @@ - shell scripts in `scripts/` for orchestration - `hyperfine` benchmark engine (optional via `BENCH_ENGINE=hyperfine`) - `benchstat` statistical comparison for quality gates -- policy file: `stats-policy.json` (`stats-policy.yaml` accepted for backward compatibility) +- policy file: `stats-policy.json` - Python 3 report and normalization tooling in `scripts/` ## Baseline benchmark profile diff --git a/Makefile b/Makefile index 65d62b2..108ce94 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ GOPATH ?= $(shell $(GO) env GOPATH) GO_PATCH_COVER ?= $(GOPATH)/bin/go-patch-cover MODULES = $(shell find . 
-type f -name "go.mod" -not -path "*/.*/*" -not -path "*/vendor/*" -exec dirname {} \;) -.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-go test-python test-shell test-scripts test-coverage test-coverage-go test-coverage-python test-patch-coverage tools setup-dev-env setup-dev-env-ci setup-dev-env-ci-scripts parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check report-disclaimer-check methodology-changelog-check publication-sync-check +.PHONY: benchmark benchmark-modkit benchmark-nestjs benchmark-baseline benchmark-wire benchmark-fx benchmark-do report test test-go test-python test-shell test-scripts test-coverage test-coverage-go test-coverage-python test-patch-coverage tools setup-dev-env setup-dev-env-ci setup-dev-env-ci-scripts parity-check parity-check-modkit parity-check-nestjs benchmark-fingerprint-check benchmark-limits-check benchmark-manifest-check benchmark-raw-schema-check benchmark-summary-schema-check benchmark-schema-validate benchmark-stats-check benchmark-variance-check benchmark-benchstat-check ci-benchmark-quality-check workflow-concurrency-check workflow-budget-check workflow-inputs-check todo-debt-check report-disclaimer-check methodology-changelog-check publication-sync-check benchmark: bash scripts/run-all.sh @@ -169,3 +169,6 @@ workflow-budget-check: workflow-inputs-check: $(PYTHON) scripts/workflow-policy-check.py inputs-check + +todo-debt-check: + $(PYTHON) scripts/todo-debt-check.py diff --git a/docs/architecture.md b/docs/architecture.md index deb4663..9bb9515 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -38,9 +38,9 
@@ results/latest/ benchmark outputs and generated report 2. Run parity checks per target 3. Run load benchmarks for parity-passing targets (`legacy` engine or `hyperfine`) 4. Normalize and save raw outputs -5. Run policy quality gates (`stats-policy.json` + benchstat) -6. Build `summary.json` -7. Generate `report.md` +5. Build `summary.json` and generate `report.md` from raw outputs +6. Validate result schemas for generated artifacts +7. Run policy quality gates (`stats-policy.json` + benchstat) and publication checks ## Failure model diff --git a/docs/design/003-benchmark-statistics-oss-migration.md b/docs/design/003-benchmark-statistics-oss-migration.md index 75ecc7c..46066dd 100644 --- a/docs/design/003-benchmark-statistics-oss-migration.md +++ b/docs/design/003-benchmark-statistics-oss-migration.md @@ -12,7 +12,7 @@ Replace custom statistical processing logic with OSS benchmark/statistics toolin In: 1. Integrate `hyperfine` as the benchmark measurement engine. 2. Integrate `benchstat` for statistical pass/fail checks. -3. Add a versioned policy file (`stats-policy.yaml`) for thresholds and rules. +3. Add a versioned policy file (`stats-policy.json`) for thresholds and rules. 4. Keep `results/latest/*` artifacts stable for summary/report consumers. 5. Update docs to reflect the new measurement and quality gate model. @@ -32,8 +32,8 @@ Out: 1. **Parity Gate (existing behavior):** health check -> parity check per target. 2. **Measurement:** run benchmark samples using `hyperfine`. 3. **Normalization:** transform tool-native output into repo raw schema. -4. **Quality Gate:** run `benchstat`-based policy checks. -5. **Reporting:** generate `summary.json` and `report.md` from normalized artifacts. +4. **Reporting:** generate `summary.json` and `report.md` from normalized artifacts. +5. **Quality Gate:** run policy checks (`benchstat`, variance thresholds, publication checks) on generated artifacts. ### 4.2. What Remains Custom 1. 
Framework matrix orchestration and target routing. @@ -46,7 +46,7 @@ Out: ## 5. Policy Design -### 5.1. `stats-policy.yaml` (single source of truth) +### 5.1. `stats-policy.json` (single source of truth) Policy fields: - significance (`alpha`), default `0.05` - minimum run count per target @@ -82,7 +82,7 @@ CI keeps `make ci-benchmark-quality-check` as the primary gate and: ## 8. Migration Plan ### Phase A: Policy + Interfaces -1. Add `stats-policy.yaml`. +1. Add `stats-policy.json`. 2. Define normalized schema compatibility contract. 3. Add adapter interfaces without changing default execution path. diff --git a/docs/guides/benchmark-workflow.md b/docs/guides/benchmark-workflow.md index 4edaf38..dc0878f 100644 --- a/docs/guides/benchmark-workflow.md +++ b/docs/guides/benchmark-workflow.md @@ -78,12 +78,13 @@ make benchmark-stats-check make benchmark-variance-check make benchmark-benchstat-check make ci-benchmark-quality-check +make todo-debt-check make report-disclaimer-check make methodology-changelog-check make publication-sync-check ``` -Quality thresholds and required metrics are versioned in `stats-policy.json` (with `stats-policy.yaml` backward-compatibility support). +Quality thresholds and required metrics are versioned in `stats-policy.json`. 
#!/usr/bin/env python3
# Marker debt policy check.
#
# Scans first-party scripts, docs and workflow files (plus the top-level
# Makefile) for leftover debt-marker words and exits non-zero, listing the
# offending lines, when any are found. Intended to be run from CI via
# `make todo-debt-check`.
#
# The marker words are never spelled out verbatim in this file because the
# file lives in a scanned directory and would otherwise flag itself.
from __future__ import annotations

import re
from collections.abc import Iterator
from pathlib import Path


# Repository root: two levels up from this file (<root>/scripts/<this file>).
ROOT = Path(__file__).resolve().parent.parent

# Directories that are walked recursively.
CHECK_DIRS = [
    ROOT / "scripts",
    ROOT / "docs",
    ROOT / ".github",
]

# Individual files that are always checked, regardless of suffix.
CHECK_FILES = [
    ROOT / "Makefile",
]

# Only files with these suffixes are inspected inside CHECK_DIRS.
INCLUDE_SUFFIXES = {
    ".py",
    ".sh",
    ".md",
    ".yml",
    ".yaml",
}

# Maximum number of offending lines echoed in the failure message.
PREVIEW_LIMIT = 20


def build_marker_pattern() -> re.Pattern[str]:
    """Return a compiled regex matching any debt marker as a whole word.

    Matching is case-sensitive. The marker words are assembled from
    fragments so that this source file does not contain them literally.
    """
    markers = ["TO" + "DO", "FIX" + "ME", "HA" + "CK", "X" * 3]
    # A single backslash inside the raw string gives the regex word-boundary
    # assertion. A doubled backslash would instead require a literal
    # backslash followed by "b" in the scanned text, so nothing would ever
    # be reported.
    return re.compile(r"\b(" + "|".join(markers) + r")\b")


def iter_candidate_files() -> Iterator[Path]:
    """Yield every file that must be scanned.

    Files under CHECK_DIRS are yielded in sorted order per directory and
    only when their suffix is in INCLUDE_SUFFIXES; missing directories are
    skipped. Entries of CHECK_FILES are yielded afterwards when they exist
    as regular files.
    """
    for directory in CHECK_DIRS:
        if not directory.exists():
            continue
        for path in sorted(directory.rglob("*")):
            if path.is_file() and path.suffix in INCLUDE_SUFFIXES:
                yield path

    for path in CHECK_FILES:
        # is_file() is already False for a path that does not exist.
        if path.is_file():
            yield path


def _find_violations(pattern: re.Pattern[str]) -> list[tuple[Path, int, str]]:
    """Collect (relative path, 1-based line number, stripped line) per hit."""
    violations: list[tuple[Path, int, str]] = []
    for path in iter_candidate_files():
        rel = path.relative_to(ROOT)
        # errors="replace" keeps the scan going on files that are not valid UTF-8.
        text = path.read_text(encoding="utf-8", errors="replace")
        violations.extend(
            (rel, line_no, line.strip())
            for line_no, line in enumerate(text.splitlines(), start=1)
            if pattern.search(line)
        )
    return violations


def main() -> None:
    """Run the check.

    Raises:
        SystemExit: with a human-readable message (non-zero exit status)
            when at least one marker is found.
    """
    violations = _find_violations(build_marker_pattern())

    if violations:
        preview = "\n".join(
            f"- {path}:{line_no}: {line}"
            for path, line_no, line in violations[:PREVIEW_LIMIT]
        )
        hidden = len(violations) - PREVIEW_LIMIT
        extra = f"\n... and {hidden} more" if hidden > 0 else ""
        raise SystemExit(
            "todo-debt-check failed: first-party marker debt detected\n"
            "Remove marker text from first-party scripts/docs/workflows before merge.\n"
            f"{preview}{extra}"
        )

    print("todo-debt-check: no marker debt detected in first-party scripts/docs/workflows")


if __name__ == "__main__":
    main()