diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e45accc --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +/.idea/ +/.DS +.DS_Store +**/.DS_Store + +# Python caches and local build artifacts +__pycache__/ +*.py[cod] +*.egg-info/ +.pytest_cache/ + +# Local datasets and generated source extracts +/data/source_files/real/ +/data/source_files/e2e/ +/data/source_files/v3_smoke/ diff --git a/AGENTS.md b/AGENTS.md index 9732cc7..58c35a2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,268 +1,63 @@ # AGENTS.md -This repository contains a **data import and transformation pipeline** for UK open datasets. -Its purpose is to produce a reproducible, versioned derived dataset: - -UPRN → postcode → inferred street name → confidence score - -This file defines behavioural rules, quality standards, and documentation requirements -for any agent contributing to this project. - -The priority is **accuracy, provenance, and reproducibility**. - ---- - -## 1. Core Principles - -### 1.1 No Guessing -If a dataset field, schema, release identifier, or licence detail is unknown: -- Mark it as **Unknown** -- Add validation logic -- Document the assumption explicitly - -Never silently assume structure based on “typical” formats. - ---- - -### 1.2 Reproducibility First -The pipeline must be: -- Deterministic -- Rebuildable from raw inputs -- Fully traceable to dataset release identifiers - -If the same inputs are used, outputs must be identical. - -No hidden state. -No environment-dependent logic. -No implicit defaults. - ---- - -### 1.3 Raw Data is Sacred -- Raw imports are immutable. -- Transformations must not mutate raw tables. -- Derived outputs must be rebuildable from raw + release metadata. - -If you need to correct something, rebuild it — do not patch it. 
- ---- - -### 1.4 Provenance is Mandatory -Every derived dataset must clearly record: -- Source dataset release identifiers -- Method used -- Computation timestamp - -If provenance is not recorded, the output is invalid. - ---- - -### 1.5 Explicit Limitations -Street inference is: -- Heuristic -- Distance-based -- Non-authoritative - -Documentation must clearly state this. -Do not imply authoritative delivery-level correctness. - ---- - -## 2. Documentation Requirements - -Every meaningful change must include documentation updates. - -At minimum: - -### 2.1 Dataset Documentation -Maintain a living document describing: -- Each dataset -- Where it is obtained -- Licence type -- Required fields -- Known limitations -- Known schema quirks - -If a dataset changes, update the documentation immediately. - ---- - -### 2.2 Data Model Documentation -Maintain clear documentation for: -- Raw tables -- Core tables -- Derived tables -- Metrics tables - -Include: -- Field definitions -- Data types -- Constraints -- Semantic meaning - -No column should exist without documented purpose. - ---- - -### 2.3 Transform Documentation -For each transformation layer, document: -- Inputs -- Outputs -- Assumptions -- Failure modes -- Determinism guarantees - -If logic changes (e.g., confidence thresholds), update documentation and record the change rationale. - ---- - -### 2.4 Metrics Documentation -Define: -- What each metric measures -- How it is calculated -- Why it exists -- Expected ranges - -Metrics are part of product quality, not optional extras. - ---- - -## 3. Quality Standards - -### 3.1 Deterministic Behaviour -- Stable ordering in queries -- Explicit tie-breaking rules -- No reliance on implicit database ordering - -### 3.2 Observability -Each pipeline run must: -- Log row counts per stage -- Log join coverage percentages -- Log resolution percentages -- Log distance percentiles - -Silent processing is not acceptable. 
- ---- - -### 3.3 Fail Fast -If: -- Required columns are missing -- Geometry is invalid -- Coordinate reference systems are inconsistent - -The pipeline must fail clearly. - -Partial silent success is worse than failure. - ---- - -### 3.4 Schema Validation -Before processing: -- Validate required fields exist -- Validate types where possible -- Record dataset release metadata - -Do not infer schema dynamically without documentation. - ---- - -### 3.5 No Scope Drift -This repository is a **pipeline**, not: -- An API -- A serving layer -- An analytics platform -- A proprietary dataset reconstruction engine - -Keep scope disciplined. - ---- - -## 4. Testing Expectations - -Agents must ensure: - -- Normalisation logic is tested. -- Derived outputs are deterministic. -- Schema validation works. -- Metrics calculations are stable. -- Small fixture datasets validate spatial inference logic. - -Tests must: -- Use synthetic or reduced fixture data. -- Not depend on downloading live datasets. - ---- - -## 5. Change Management - -Any change to: -- Confidence scoring -- Search radius -- Join logic -- Normalisation rules -- Spatial reference systems - -Must include: - -1. Rationale -2. Before/after metrics comparison -3. Determinism confirmation -4. Documentation update - ---- - -## 6. What Must Never Be Implemented Here - -- Address enumeration features -- Proprietary dataset integration -- Undocumented inference layers -- Hidden optimisation logic -- Behaviour designed for ambiguous or non-transparent use cases - -This pipeline exists to: -- Normalise open data -- Join open data -- Derive transparent street-level inference -- Record quality metrics - -Nothing more. - ---- - -## 7. Communication Standards - -Pull requests must: - -- State the problem being solved -- Describe the solution -- Document assumptions -- Include metric impact -- Confirm reproducibility - -Avoid vague language such as: -- “Seems to work” -- “Probably correct” -- “Should be fine” - -Be precise. 
- ---- - -## 8. Decision Rule - -If a proposed change: -- Reduces transparency, -- Obscures provenance, -- Makes outputs less reproducible, -- Or introduces implicit assumptions, - -It should not be merged. - -Clarity over cleverness. -Traceability over speed. -Correctness over convenience. - ---- - -End of AGENTS.md \ No newline at end of file +Purpose: this file is the agent entrypoint for this repository. +Use it as a roadmap to the docs, then execute work with strict reproducibility and provenance. + +## 1. Start Here (Required Reading Order) +1. `docs/README.md` +2. `docs/agent/start-here.md` +3. `docs/architecture/README.md` +4. `docs/spec/pipeline_v3/spec.md` +5. `docs/spec/pipeline_v3/data_model.md` +6. `docs/spec/pipeline_v3/canonicalisation.md` + +If behavior in code differs from spec, treat it as a defect and document the delta. + +## 2. Documentation Roadmap +- V3 product/behavior spec: `docs/spec/pipeline_v3/spec.md` +- V3 schema and table contracts: `docs/spec/pipeline_v3/data_model.md` +- Determinism and canonical rules: `docs/spec/pipeline_v3/canonicalisation.md` +- Source acquisition + licensing context: `docs/spec/data_sources.md` +- Agent onboarding: `docs/agent/start-here.md` +- Codebase map: `docs/agent/codebase-map.md` +- Operational runbook (ingest/build/publish): `docs/agent/runbook.md` +- Dataset lineage pages: `docs/architecture/datasets/README.md` +- Stage/pass pages: `docs/architecture/stages/README.md` +- Legacy phase docs (historical only): `docs/spec/phase_1/`, `docs/spec/phase_2-open-names/` + +## 3. Non-Negotiable Engineering Rules +- No guessing: unknown fields/semantics must be marked unknown and validated explicitly. +- Reproducibility first: same inputs must produce same outputs. +- Raw data is immutable: never mutate raw source snapshots. +- Provenance is mandatory: derived records must trace to source run(s) and method. +- Deterministic execution: stable ordering + explicit tie-breaks only. 
+- Fail fast on schema/geometry/CRS issues. +- This repo is a pipeline only; do not add API-serving scope here. + +## 4. Change Requirements +For meaningful behavior changes (join logic, scoring, normalization, radius/thresholds, CRS, pass semantics): +1. Update spec/docs in `docs/` in the same change. +2. Never place absolute local filesystem paths in docs; use repository-relative paths. +3. State rationale. +4. Provide before/after metrics or counts where applicable. +5. Confirm determinism impact. +6. Add/adjust tests (fixture-based; no live-download dependency). + +This rule is strict: agents must always keep documentation in step with code changes. + +## 5. Commit Standards +- Commit at logical checkpoints whenever it makes sense. +- Prefer atomic commits grouped by concern (schema, ingest, transforms, tests, docs). +- Use Conventional Commits for every commit message (`type(scope): summary`). + +## 6. Decision Rule +If a change reduces transparency, obscures provenance, weakens reproducibility, or introduces hidden assumptions, do not merge it. + +Clarity over cleverness. Traceability over speed. Correctness over convenience. + +## 7. Scoped Agent Guides +- Docs scope: `docs/AGENTS.md` +- Pipeline scope: `pipeline/AGENTS.md` +- Runtime code scope: `pipeline/src/pipeline/AGENTS.md` +- Test scope: `tests/AGENTS.md` +- Data/manifest scope: `data/AGENTS.md` diff --git a/data/AGENTS.md b/data/AGENTS.md new file mode 100644 index 0000000..e5e74f5 --- /dev/null +++ b/data/AGENTS.md @@ -0,0 +1,17 @@ +# data/AGENTS.md + +## Scope +Manifests and local source-file conventions under `data/`. + +## Critical Rule +Manifest/source contract changes must be reflected in docs (`docs/spec/...` and `docs/architecture/...`) and code (`pipeline/src/pipeline/manifest.py`, `pipeline/config/source_schema.yaml`) together. 
+ +## Conventions +- source manifests live under `data/manifests/` +- keep source naming aligned with `pipeline/src/pipeline/manifest.py` +- avoid absolute local paths in documentation; manifests may contain absolute file paths for runtime only +- update bundle manifests when source keys change + +## Useful References +- source acquisition: `docs/spec/data_sources.md` +- architecture dataset pages: `docs/architecture/datasets/` diff --git a/data/manifests/e2e/onsud_manifest.json b/data/manifests/e2e/onsud_manifest.json new file mode 100644 index 0000000..f511616 --- /dev/null +++ b/data/manifests/e2e/onsud_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "onsud", + "release_id": "2026-Q1-E2E-P2", + "source_url": "https://example.local/onsud-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/onsud_sample.csv", + "expected_sha256": "dfe6e4bc4d4405edc6463fcb1b55929f867d8e7b9907afb92e893a9f8911033f", + "format": "csv", + "column_map": { + "uprn": "ONS_UPRN", + "postcode": "ONS_POSTCODE", + "postcode_unit_easting": "PC_UNIT_E", + "postcode_unit_northing": "PC_UNIT_N" + } +} diff --git a/data/manifests/e2e/open_names_manifest.json b/data/manifests/e2e/open_names_manifest.json new file mode 100644 index 0000000..53bd559 --- /dev/null +++ b/data/manifests/e2e/open_names_manifest.json @@ -0,0 +1,19 @@ +{ + "dataset_key": "open_names", + "release_id": "2026-Q1-E2E-P2", + "source_url": "https://example.local/open-names-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/open_names_sample.csv", + "expected_sha256": "b4ca5267fe2a4d7fefe68eca48e3df1bddc8e19d8ac7c99be0293ae2a5e39dac", + "format": "csv", + "column_map": { + "entry_id": "ON_ID", + "name1": "NAME1", + "name1_lang": "NAME1_LANG", + "name2": "NAME2", + "local_type": "LOCAL_TYPE", + "geometry_x": "GEOM_X", + "geometry_y": "GEOM_Y", + "postcode_district": "PC_DISTRICT" + } +} diff --git 
a/data/manifests/e2e/open_roads_manifest.json b/data/manifests/e2e/open_roads_manifest.json new file mode 100644 index 0000000..95fb851 --- /dev/null +++ b/data/manifests/e2e/open_roads_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "open_roads", + "release_id": "2026-Q1-E2E-P2", + "source_url": "https://example.local/open-roads-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/open_roads_sample.geojson", + "expected_sha256": "15a37f6743b873e6bb6bdcb03980cc2e532126d6262344cf0b60b1611c74ba4b", + "format": "geojson", + "layer_name": "open_roads_sample", + "expected_srid": 27700, + "column_map": { + "source_id": "src_id", + "name_display": "road_name" + } +} diff --git a/data/manifests/e2e/open_uprn_manifest.json b/data/manifests/e2e/open_uprn_manifest.json new file mode 100644 index 0000000..7edefd5 --- /dev/null +++ b/data/manifests/e2e/open_uprn_manifest.json @@ -0,0 +1,16 @@ +{ + "dataset_key": "open_uprn", + "release_id": "2026-Q1-E2E-P2", + "source_url": "https://example.local/open-uprn-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/open_uprn_sample.csv", + "expected_sha256": "7b40b6398d8db405d3078a1c12a9368c02b46b72de3ee663ab3403b82c89b2c9", + "format": "csv", + "column_map": { + "uprn": "UPRN_REF", + "latitude": "LAT", + "longitude": "LON", + "easting": "EASTING", + "northing": "NORTHING" + } +} diff --git a/data/manifests/real/onsud_manifest.json b/data/manifests/real/onsud_manifest.json new file mode 100644 index 0000000..a68ff6e --- /dev/null +++ b/data/manifests/real/onsud_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "onsud", + "release_id": "ONSUD_NOV_2025", + "source_url": "https://www.arcgis.com/sharing/rest/content/items/9beb2361978146f8ac85da18d21ee266/data", + "licence": "https://www.ons.gov.uk/methodology/geography/licences", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/onsud/ONSUD_NOV_2025_GB_COMBINED.csv", + 
"expected_sha256": "ef7f0c29e4a1439309f50e16eb20ecd3120c16bd6c7bbaf6e07b61e5a3e27b7e", + "format": "csv", + "column_map": { + "uprn": "UPRN", + "postcode": "PCDS", + "postcode_unit_easting": "GRIDGB1E", + "postcode_unit_northing": "GRIDGB1N" + } +} diff --git a/data/manifests/real/open_names_manifest.json b/data/manifests/real/open_names_manifest.json new file mode 100644 index 0000000..e65d5bc --- /dev/null +++ b/data/manifests/real/open_names_manifest.json @@ -0,0 +1,19 @@ +{ + "dataset_key": "open_names", + "release_id": "open_names_unknown_20260220_aefc8ad3", + "source_url": "https://api.os.uk/downloads/v1/products/OpenNames/downloads?area=GB&format=CSV&redirect", + "licence": "http://os.uk/opendata/licence", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_names/opname_gb_combined.csv", + "expected_sha256": "aefc8ad337e23f1ba7debab45243c2dc2302a4aa95ef7c86d80daaa65f535e05", + "format": "csv", + "column_map": { + "entry_id": "ID", + "name1": "NAME1", + "name1_lang": "NAME1_LANG", + "name2": "NAME2", + "local_type": "LOCAL_TYPE", + "geometry_x": "GEOMETRY_X", + "geometry_y": "GEOMETRY_Y", + "postcode_district": "POSTCODE_DISTRICT" + } +} diff --git a/data/manifests/real/open_roads_manifest.json b/data/manifests/real/open_roads_manifest.json new file mode 100644 index 0000000..d4656ae --- /dev/null +++ b/data/manifests/real/open_roads_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "open_roads", + "release_id": "open_roads_unknown_20260220_ebbaaaff", + "source_url": "https://api.os.uk/downloads/v1/products/OpenRoads/downloads?area=GB&format=GeoPackage&redirect", + "licence": "http://os.uk/opendata/licence", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_roads/Data/oproad_gb.gpkg", + "expected_sha256": "25cfcf41ce89d49a1714798b25db93d4100b98ff0b07ec6debd20b602c01cc22", + "format": "gpkg", + "layer_name": "road_link", + "expected_srid": 27700, + "column_map": { + "source_id": "id", + "name_display": "name_1" + } 
+} diff --git a/data/manifests/real/open_uprn_manifest.json b/data/manifests/real/open_uprn_manifest.json new file mode 100644 index 0000000..b592839 --- /dev/null +++ b/data/manifests/real/open_uprn_manifest.json @@ -0,0 +1,16 @@ +{ + "dataset_key": "open_uprn", + "release_id": "osopenuprn_202602", + "source_url": "https://api.os.uk/downloads/v1/products/OpenUPRN/downloads?area=GB&format=CSV&redirect", + "licence": "http://os.uk/opendata/licence", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_uprn/osopenuprn_202602.csv", + "expected_sha256": "69156b9fd66c9195dd23e0aa81f20136c0a55c408b27cd729fe79ed3d0afc911", + "format": "csv", + "column_map": { + "uprn": "UPRN", + "latitude": "LATITUDE", + "longitude": "LONGITUDE", + "easting": "X_COORDINATE", + "northing": "Y_COORDINATE" + } +} diff --git a/data/manifests/real_v3/gb_core_bundle_manifest.json b/data/manifests/real_v3/gb_core_bundle_manifest.json new file mode 100644 index 0000000..a47a415 --- /dev/null +++ b/data/manifests/real_v3/gb_core_bundle_manifest.json @@ -0,0 +1,12 @@ +{ + "build_profile": "gb_core", + "source_runs": { + "onspd": "2b9a865a-9579-4bad-8e91-2a84a8796d47", + "os_open_usrn": "a72298d8-3681-4cdb-8c5c-93d220f270a2", + "os_open_names": "371385d1-149b-4e70-a8e0-b2e0cccfc4b2", + "os_open_roads": "7e013cc2-57b8-4b4c-8679-c45ba52a40bd", + "os_open_uprn": "c6d801dc-591c-421d-a88e-a9bbc19353eb", + "os_open_lids": "0a2cbe07-11af-419d-9f24-5703d9f1faa7", + "nsul": "1b333010-45aa-47bc-a0ba-72646181a153" + } +} diff --git a/data/manifests/real_v3/nsul_manifest.json b/data/manifests/real_v3/nsul_manifest.json new file mode 100644 index 0000000..9ef5573 --- /dev/null +++ b/data/manifests/real_v3/nsul_manifest.json @@ -0,0 +1,87 @@ +{ + "source_name": "nsul", + "source_version": "NSUL_JUL_2025_EPOCH_119", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://www.arcgis.com/sharing/rest/content/items/0c5c2c6202464ae280da1a79c14ccca1/data", + "processing_git_sha": 
"ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "nsul_nsul_jul_2025_ee", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_EE.csv", + "sha256": "5751a9ec254c317203003dfea4a00ba6bbbf81fea51417edb042f97843bf43e9", + "size_bytes": 1232171370, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_em", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_EM.csv", + "sha256": "438f34adbc54438f44111c7bbdbb35be8bbc2283b787327cfae8190adb0c6bb5", + "size_bytes": 956700215, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_ln", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_LN.csv", + "sha256": "0c526371248ecebbef860325062c5b9b55a63f25c3cf0f911403f2e531ea456d", + "size_bytes": 1639724787, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_ne", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_NE.csv", + "sha256": "e38300351fa95ebce52d59c11896af6496e7bf60c599a667c0598cae2f39cd62", + "size_bytes": 538777804, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_nw", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_NW.csv", + "sha256": "6f6ee1412d111844ae793de42951ae154a427426c4e9ce7ba29f34568f52ffe6", + "size_bytes": 1447417131, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_sc", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_SC.csv", + "sha256": "11b326f012f4b3968373d4e9aa725b8064b9123c85ca6f5230529b0582eb8d3e", + "size_bytes": 1185165031, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_se", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_SE.csv", + "sha256": "e928ef90776c3faf05ec05f9ed1dbe2d8f010fcabcb4f7f0d3fe677c602e2d65", + 
"size_bytes": 1753125936, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_sw", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_SW.csv", + "sha256": "f7fcb0edbf24fafbc193e1a732a51e6e8c28eefcffbc5a9e74b1c56357f96a3a", + "size_bytes": 1185083958, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_wa", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_WA.csv", + "sha256": "dbdc2878aca7e61a21fbaaf9f6dc9b5b3b9294b842e2d5f564f9641fa1be39fe", + "size_bytes": 664426866, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_wm", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_WM.csv", + "sha256": "c3f0f7d515b271031f9e2568618dfb2187d4a73d75958700fb803f77751652a2", + "size_bytes": 1115129412, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_yh", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_YH.csv", + "sha256": "9993542006b9b85967f07899a1b0006ebc6dd1d4dc788b8f0eb033efe64d1565", + "size_bytes": 1146747103, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/onspd_manifest.json b/data/manifests/real_v3/onspd_manifest.json new file mode 100644 index 0000000..6343bef --- /dev/null +++ b/data/manifests/real_v3/onspd_manifest.json @@ -0,0 +1,17 @@ +{ + "source_name": "onspd", + "source_version": "ONSPD_NOV_2025", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://www.arcgis.com/sharing/rest/content/items/3635ca7f69df4733af27caf86473ffa1/data", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "onspd_uk_csv", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/onspd/Data/ONSPD_NOV_2025_UK.csv", + "sha256": "d4b54fc4c192495dcb33d4559f225237a46af7428edcb648b2fbb76bf4e9bfe8", + "size_bytes": 1448136855, + 
"format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/os_open_lids_manifest.json b/data/manifests/real_v3/os_open_lids_manifest.json new file mode 100644 index 0000000..3959a6a --- /dev/null +++ b/data/manifests/real_v3/os_open_lids_manifest.json @@ -0,0 +1,24 @@ +{ + "source_name": "os_open_lids", + "source_version": "lids_2026_02", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/LIDS/downloads?area=GB&format=CSV", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "lids_toid_usrn", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/lids/Road_TOID_Street_USRN_10.csv", + "sha256": "54c03a54c7da5b3e8b13316b3f9357f34f562a57c7cd3d37dfbe2c4e17454462", + "size_bytes": 181488324, + "format": "csv" + }, + { + "file_role": "lids_uprn_usrn", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/lids/BLPU_UPRN_Street_USRN_11.csv", + "sha256": "1243cd8fce256275491050071200c88d72fc1de2380593f3356f72bba5079fec", + "size_bytes": 5061603216, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/os_open_names_manifest.json b/data/manifests/real_v3/os_open_names_manifest.json new file mode 100644 index 0000000..6e5d691 --- /dev/null +++ b/data/manifests/real_v3/os_open_names_manifest.json @@ -0,0 +1,17 @@ +{ + "source_name": "os_open_names", + "source_version": "opname_gb_202602", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenNames/downloads?area=GB&format=CSV&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_names_csv", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_names/opname_gb_combined.csv", + "sha256": 
"aefc8ad337e23f1ba7debab45243c2dc2302a4aa95ef7c86d80daaa65f535e05", + "size_bytes": 1802306070, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/os_open_roads_manifest.json b/data/manifests/real_v3/os_open_roads_manifest.json new file mode 100644 index 0000000..4b4421a --- /dev/null +++ b/data/manifests/real_v3/os_open_roads_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_roads", + "source_version": "oproad_gb_202510", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenRoads/downloads?area=GB&format=GeoPackage&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_roads_road_link_gpkg", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_roads/Data/oproad_gb.gpkg", + "sha256": "25cfcf41ce89d49a1714798b25db93d4100b98ff0b07ec6debd20b602c01cc22", + "size_bytes": 2133966848, + "format": "gpkg", + "layer_name": "road_link" + } + ] +} diff --git a/data/manifests/real_v3/os_open_uprn_manifest.json b/data/manifests/real_v3/os_open_uprn_manifest.json new file mode 100644 index 0000000..366029e --- /dev/null +++ b/data/manifests/real_v3/os_open_uprn_manifest.json @@ -0,0 +1,17 @@ +{ + "source_name": "os_open_uprn", + "source_version": "osopenuprn_202602", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenUPRN/downloads?area=GB&format=CSV&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_uprn_csv", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_uprn/osopenuprn_202602.csv", + "sha256": "69156b9fd66c9195dd23e0aa81f20136c0a55c408b27cd729fe79ed3d0afc911", + "size_bytes": 2262268626, + "format": "csv" + } + ] +} diff --git 
a/data/manifests/real_v3/os_open_usrn_manifest.json b/data/manifests/real_v3/os_open_usrn_manifest.json new file mode 100644 index 0000000..68e96ec --- /dev/null +++ b/data/manifests/real_v3/os_open_usrn_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_usrn", + "source_version": "osopenusrn_202602", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenUSRN/downloads?area=GB&format=GeoPackage&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_usrn_gpkg", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_usrn/osopenusrn_202602.gpkg", + "sha256": "fdd6af3efa38ad116d4c7cf436291c4094f654d6f2aaeb218c7bdd862356828e", + "size_bytes": 1005912064, + "format": "gpkg", + "layer_name": "openUSRN" + } + ] +} diff --git a/data/manifests/v3_smoke/gb_core_bundle_manifest.json b/data/manifests/v3_smoke/gb_core_bundle_manifest.json new file mode 100644 index 0000000..d9d9c10 --- /dev/null +++ b/data/manifests/v3_smoke/gb_core_bundle_manifest.json @@ -0,0 +1,12 @@ +{ + "build_profile": "gb_core", + "source_runs": { + "onspd": "a49f5198-1b1f-4cf5-b9f2-82b450aa9f73", + "os_open_usrn": "60bd0c02-e110-4bfe-b96d-3bebc14516b8", + "os_open_names": "420f6591-24ba-42b8-8a13-3658c9ef0c02", + "os_open_roads": "95119ff1-33cc-4341-b21a-97df73853ac5", + "os_open_uprn": "4a2bef4c-9adc-4427-a11e-e65104b7e86a", + "os_open_lids": "6ccc48d1-80e4-4336-a985-7c720781c9fb", + "nsul": "0f24c6e9-ba4c-4d63-baf0-388bbda197b6" + } +} diff --git a/data/manifests/v3_smoke/nsul_manifest.json b/data/manifests/v3_smoke/nsul_manifest.json new file mode 100644 index 0000000..dcea756 --- /dev/null +++ b/data/manifests/v3_smoke/nsul_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "nsul", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": 
"local://v3_smoke/nsul", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/nsul.csv", + "sha256": "430ef8e55d638274af125d86b7d1a0502b5f67ce19ee143aa502ca856751f7fb", + "size_bytes": 30, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/onspd_manifest.json b/data/manifests/v3_smoke/onspd_manifest.json new file mode 100644 index 0000000..fbf430e --- /dev/null +++ b/data/manifests/v3_smoke/onspd_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "onspd", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/onspd", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/onspd.csv", + "sha256": "c255cdcf57adb5f5531c0622d0bda81cb4166ccddd7b41f148737a752279fb8b", + "size_bytes": 186, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_lids_manifest.json b/data/manifests/v3_smoke/os_open_lids_manifest.json new file mode 100644 index 0000000..fc56cda --- /dev/null +++ b/data/manifests/v3_smoke/os_open_lids_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_lids", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_lids", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_lids.csv", + "sha256": 
"bc31799d901c014741b671578b26344cc4b7008e5264926eac0a667de1eaa78f", + "size_bytes": 84, + "format": "csv", + "row_count_expected": 2 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_names_manifest.json b/data/manifests/v3_smoke/os_open_names_manifest.json new file mode 100644 index 0000000..6ef905a --- /dev/null +++ b/data/manifests/v3_smoke/os_open_names_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_names", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_names", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_names.csv", + "sha256": "f4440114144336b2fbbd6d6955086820616b7a880c620941a79e74b83fb80499", + "size_bytes": 81, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_roads_manifest.json b/data/manifests/v3_smoke/os_open_roads_manifest.json new file mode 100644 index 0000000..44a6aae --- /dev/null +++ b/data/manifests/v3_smoke/os_open_roads_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_roads", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_roads", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_roads.csv", + "sha256": "449935b3e8d6b4b0809c16bd2fb0630acd696a52e0745b902f81b465a6e4b39f", + "size_bytes": 95, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_uprn_manifest.json b/data/manifests/v3_smoke/os_open_uprn_manifest.json new file mode 100644 index 
0000000..8fb7d90 --- /dev/null +++ b/data/manifests/v3_smoke/os_open_uprn_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_uprn", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_uprn", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_uprn.csv", + "sha256": "430ef8e55d638274af125d86b7d1a0502b5f67ce19ee143aa502ca856751f7fb", + "size_bytes": 30, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_usrn_manifest.json b/data/manifests/v3_smoke/os_open_usrn_manifest.json new file mode 100644 index 0000000..a1d8941 --- /dev/null +++ b/data/manifests/v3_smoke/os_open_usrn_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_usrn", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_usrn", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_usrn.csv", + "sha256": "2c36d2763845f5197e378ea28586cd2f22d65c795847c7a2c543899ba2eb278c", + "size_bytes": 84, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/docs/AGENTS.md b/docs/AGENTS.md new file mode 100644 index 0000000..ac070b6 --- /dev/null +++ b/docs/AGENTS.md @@ -0,0 +1,22 @@ +# docs/AGENTS.md + +## Scope +Documentation standards and navigation for everything under `docs/`. + +## Critical Rule +Any code behavior change must update relevant docs in the same change set. Documentation is not optional follow-up work. 
+ +## Path Rule +Never use absolute local filesystem paths in docs. Use repository-relative paths only. + +## Navigation +- Docs index: `docs/README.md` +- Architecture map: `docs/architecture/README.md` +- V3 spec authority: `docs/spec/pipeline_v3/` +- Source acquisition details: `docs/spec/data_sources.md` + +## Update Expectations +When editing docs: +- keep links valid and cross-linked +- update related index pages if new docs are added +- keep wording deterministic and implementation-aligned diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..aff4f2c --- /dev/null +++ b/docs/README.md @@ -0,0 +1,25 @@ +# Documentation Index + +This docs tree is organized for fast agent onboarding and precise implementation. + +## Read Order +1. `docs/agent/start-here.md` +2. `docs/architecture/README.md` +3. `docs/agent/codebase-map.md` +4. `docs/agent/runbook.md` +5. `docs/spec/pipeline_v3/spec.md` +6. `docs/spec/pipeline_v3/data_model.md` +7. `docs/spec/pipeline_v3/canonicalisation.md` + +## Sections +- Agent docs: `docs/agent/` +- Architecture deep-dive: `docs/architecture/` +- V3 authoritative spec: `docs/spec/pipeline_v3/` +- Source acquisition and licensing context: `docs/spec/data_sources.md` +- Legacy phase docs (historical reference): + - `docs/spec/phase_1/` + - `docs/spec/phase_2-open-names/` + +## Rule +When behavior changes, update both code and the relevant doc in this tree in the same PR. +Never use absolute local filesystem paths in docs. 
diff --git a/docs/agent/codebase-map.md b/docs/agent/codebase-map.md new file mode 100644 index 0000000..fa16dcd --- /dev/null +++ b/docs/agent/codebase-map.md @@ -0,0 +1,30 @@ +# Codebase Map + +## Main Runtime Modules +- CLI: `pipeline/src/pipeline/cli.py` +- Manifest parsing/validation: `pipeline/src/pipeline/manifest.py` +- Ingest workflows (raw ingestion): `pipeline/src/pipeline/ingest/workflows.py` +- Build workflows (pass execution/finalization/publish): `pipeline/src/pipeline/build/workflows.py` +- DB migrations runner: `pipeline/src/pipeline/db/migrations.py` +- Normalization utilities: `pipeline/src/pipeline/util/normalise.py` + +## SQL and Config +- SQL migrations: `pipeline/sql/migrations/` +- Source schema mapping config: `pipeline/config/source_schema.yaml` +- Frequency weights config: `pipeline/config/frequency_weights.yaml` +- Normalization config: `pipeline/config/normalisation.yaml` + +## Manifests and Data Inputs +- Real manifests: `data/manifests/real_v3/` +- Smoke manifests: `data/manifests/v3_smoke/` +- Local source files: `data/source_files/` + +## Tests +- Test suite root: `tests/` +- Focus on deterministic behavior, schema validation, provenance contracts, and pass semantics. + +## Documentation Cross-links +- Architecture index: `docs/architecture/README.md` +- Dataset lineage pages: `docs/architecture/datasets/README.md` +- Stage/pass pages: `docs/architecture/stages/README.md` +- Spec authority: `docs/spec/pipeline_v3/` diff --git a/docs/agent/runbook.md b/docs/agent/runbook.md new file mode 100644 index 0000000..c87bb40 --- /dev/null +++ b/docs/agent/runbook.md @@ -0,0 +1,44 @@ +# Operational Runbook + +## 1) Migrate +```bash +pipeline --dsn "dbname=postcodes_v3" db migrate +``` + +## 2) Ingest Sources +```bash +pipeline --dsn "dbname=postcodes_v3" ingest source --manifest /path/to/source_manifest.json +``` +Repeat for each source in the target profile. 
+ +## 3) Create Bundle +```bash +pipeline --dsn "dbname=postcodes_v3" bundle create --manifest /path/to/bundle_manifest.json +``` + +## 4) Build +```bash +pipeline --dsn "dbname=postcodes_v3" build run --bundle-id <bundle_id> [--rebuild|--resume] +``` +Use `--resume` only for the same bundle/run lineage. + +## 5) Verify +```bash +pipeline --dsn "dbname=postcodes_v3" build verify --build-run-id <build_run_id> +``` + +## 6) Publish +```bash +pipeline --dsn "dbname=postcodes_v3" build publish --build-run-id <build_run_id> --actor <actor> +``` + +## Observability Queries +- Build status: `meta.build_run` +- Pass checkpoints: `meta.build_pass_checkpoint` +- Ingest provenance: `meta.ingest_run`, `meta.ingest_run_file` + +## Failure Policy +- Fail fast on schema/field mismatches. +- Do not patch raw data; fix logic/mapping and rebuild. +- Record behavior changes in `docs/spec/pipeline_v3/` docs in the same PR. +- Keep architecture docs in sync: `docs/architecture/`. diff --git a/docs/agent/start-here.md b/docs/agent/start-here.md new file mode 100644 index 0000000..fee6277 --- /dev/null +++ b/docs/agent/start-here.md @@ -0,0 +1,36 @@ +# Agent Start Here + +## Objective +Produce deterministic, replayable postcode/street outputs from open-source ingests with strict provenance. + +## Golden Path +1. Read V3 spec docs: + - `docs/architecture/README.md` + - `docs/architecture/relationships-overview.md` + - `docs/architecture/datasets/README.md` + - `docs/architecture/stages/README.md` + - `docs/spec/pipeline_v3/spec.md` + - `docs/spec/pipeline_v3/data_model.md` + - `docs/spec/pipeline_v3/canonicalisation.md` +2. Validate local runtime assumptions: + - DB migrations applied + - Manifest source names and schema mappings align with actual raw payload fields +3. 
Run in this sequence: + - `pipeline db migrate` + - `pipeline ingest source --manifest <source_manifest.json>` (repeat by source) + - `pipeline bundle create --manifest <bundle_manifest.json>` + - `pipeline build run --bundle-id <bundle_id> [--rebuild|--resume]` + - `pipeline build verify --build-run-id <build_run_id>` + - `pipeline build publish --build-run-id <build_run_id> --actor <actor>` + +## Critical Contracts +- Raw layer is immutable. +- `derived.postcode_street_candidates` is append-only evidence. +- Pass 3 promotion is insert-only with lineage links. +- Probability normalization is exact by formula and stored with deterministic residual correction. + +## When Unsure +- Prefer explicit failure over implicit behavior. +- Capture unknowns in docs + validation. +- Keep all time and ordering deterministic. +- Keep documentation in step with any behavior change in the same workstream. diff --git a/docs/architecture/README.md b/docs/architecture/README.md new file mode 100644 index 0000000..6fa7ed3 --- /dev/null +++ b/docs/architecture/README.md @@ -0,0 +1,24 @@ +# Pipeline Architecture Docs + +This section explains how datasets relate to each other, how data moves through ingest/stage/build passes, and what value is added at each step. 
+ +## Quick Links +- Relationship map + Mermaid system diagram: [`relationships-overview.md`](relationships-overview.md) +- End-to-end value by pass: [`value-added-by-stage.md`](value-added-by-stage.md) +- Dataset index: [`datasets/README.md`](datasets/README.md) +- Stage/pass index: [`stages/README.md`](stages/README.md) + +## Authoritative Contracts +- Behavioral spec: [`../spec/pipeline_v3/spec.md`](../spec/pipeline_v3/spec.md) +- Data model: [`../spec/pipeline_v3/data_model.md`](../spec/pipeline_v3/data_model.md) +- Canonicalisation/determinism: [`../spec/pipeline_v3/canonicalisation.md`](../spec/pipeline_v3/canonicalisation.md) + +## Reading Order (Fastest Onboarding) +1. [`relationships-overview.md`](relationships-overview.md) +2. [`datasets/README.md`](datasets/README.md) +3. 
[`stages/README.md`](stages/README.md) +4. [`value-added-by-stage.md`](value-added-by-stage.md) + +## Scope Note +- Legacy docs under `docs/spec/phase_1/` and `docs/spec/phase_2-open-names/` are historical. +- For new implementation work, default to V3 docs and this architecture section. diff --git a/docs/architecture/datasets/README.md b/docs/architecture/datasets/README.md new file mode 100644 index 0000000..fa6e80d --- /dev/null +++ b/docs/architecture/datasets/README.md @@ -0,0 +1,23 @@ +# Dataset Lineage Index + +Each page documents one dataset from raw ingestion through stage normalisation and downstream transformations. + +## Core GB Datasets +- ONSPD: [`onspd.md`](onspd.md) +- OS Open USRN: [`os_open_usrn.md`](os_open_usrn.md) +- OS Open Names: [`os_open_names.md`](os_open_names.md) +- OS Open Roads: [`os_open_roads.md`](os_open_roads.md) +- OS Open UPRN: [`os_open_uprn.md`](os_open_uprn.md) +- OS Open LIDS: [`os_open_lids.md`](os_open_lids.md) +- NSUL: [`nsul.md`](nsul.md) + +## Optional/Extended Sources +- PPD: [`ppd.md`](ppd.md) +- OSNI Gazetteer: [`osni_gazetteer.md`](osni_gazetteer.md) +- DfI Highway: [`dfi_highway.md`](dfi_highway.md) + +## Cross-links +- Relationship map: [`../relationships-overview.md`](../relationships-overview.md) +- Pass index: [`../stages/README.md`](../stages/README.md) +- Value added by stage: [`../value-added-by-stage.md`](../value-added-by-stage.md) +- Data model contract: [`../../spec/pipeline_v3/data_model.md`](../../spec/pipeline_v3/data_model.md) diff --git a/docs/architecture/datasets/dfi_highway.md b/docs/architecture/datasets/dfi_highway.md new file mode 100644 index 0000000..ba83be1 --- /dev/null +++ b/docs/architecture/datasets/dfi_highway.md @@ -0,0 +1,19 @@ +# DfI Highway Dataset Lineage (Optional NI) + +## Role In The Graph +DfI Highway contributes NI spatial road-segment fallback evidence. 
+ +## Ingest Contract +- Source key: `dfi_highway` +- Raw table: `raw.dfi_highway_row` +- Stage table: `stage.dfi_road_segment` + +## Downstream Transformations +- Pass 6 emits `spatial_dfi_highway` candidates. + +## Value Added +- Adds NI fallback coverage where direct NI evidence is absent. + +## Related Docs +- Pass 6 details: [`../stages/6_ni_candidates.md`](../stages/6_ni_candidates.md) +- NI confidence constraints: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/nsul.md b/docs/architecture/datasets/nsul.md new file mode 100644 index 0000000..5edf627 --- /dev/null +++ b/docs/architecture/datasets/nsul.md @@ -0,0 +1,27 @@ +# NSUL Dataset Lineage + +## Role In The Graph +NSUL provides UPRN->postcode relationships used with LIDS UPRN->USRN links to generate high-confidence street evidence. + +## Ingest Contract +- Source key: `nsul` +- Raw table: `raw.nsul_row` +- Stage table: `stage.nsul_uprn_postcode` + +## Stage Normalisation +- Normalised fields: + - `uprn` + - `postcode_norm` + +## Downstream Transformations +- Pass 4 joins NSUL and LIDS on UPRN, then aggregates postcode/USRN pairs. +- Output candidate type: `uprn_usrn` (high confidence). + +## Value Added +- Adds postcode side of the UPRN linkage chain. +- Enables frequency-like reinforcement based on property counts. + +## Related Docs +- Pass 4 details: [`../stages/4_uprn_reinforcement.md`](../stages/4_uprn_reinforcement.md) +- Open UPRN context: [`os_open_uprn.md`](os_open_uprn.md) +- LIDS context: [`os_open_lids.md`](os_open_lids.md) diff --git a/docs/architecture/datasets/onspd.md b/docs/architecture/datasets/onspd.md new file mode 100644 index 0000000..75a9bd8 --- /dev/null +++ b/docs/architecture/datasets/onspd.md @@ -0,0 +1,38 @@ +# ONSPD Dataset Lineage + +## Role In The Graph +ONSPD is the definitive postcode backbone. It validates postcode existence and contributes canonical postcode metadata used by all later joins. 
+ +## Ingest Contract +- Source key: `onspd` +- Raw table: `raw.onspd_row` +- Manifest mapping source: `pipeline/config/source_schema.yaml` +- Primary pass usage: Pass `1_onspd_backbone` + +## Stage Normalisation +- Stage table: `stage.onspd_postcode` +- Main fields: + - `postcode_norm`, `postcode_display` + - `status`, `lat`, `lon`, `easting`, `northing` + - `country_iso2`, `country_iso3`, `subdivision_code` + - `post_town`, `locality` (when present in source payload) + - `street_enrichment_available` +- Limitations: + - `post_town` and `locality` are passthrough attributes only. + - If a source release omits these fields, `stage.onspd_postcode` and downstream outputs retain `NULL`. + +## Downstream Transformations +- Pass 1 writes: + - `core.postcodes` + - `core.postcodes_meta` +- Used by passes 3/4/5/6/7 for postcode validation and join gating. + +## Value Added +- Converts raw postcode records into canonical and display-safe forms. +- Centralizes country/subdivision context for profile-specific behavior. +- Prevents downstream candidate generation for invalid/unresolvable postcodes. + +## Related Docs +- Pass 1 details: [`../stages/1_onspd_backbone.md`](../stages/1_onspd_backbone.md) +- Canonical postcode rules: [`../../spec/pipeline_v3/canonicalisation.md`](../../spec/pipeline_v3/canonicalisation.md) +- Relationship map: [`../relationships-overview.md`](../relationships-overview.md) diff --git a/docs/architecture/datasets/os_open_lids.md b/docs/architecture/datasets/os_open_lids.md new file mode 100644 index 0000000..1fdd9ed --- /dev/null +++ b/docs/architecture/datasets/os_open_lids.md @@ -0,0 +1,32 @@ +# OS Open LIDS Dataset Lineage + +## Role In The Graph +LIDS is the identifier bridge dataset. It resolves relationships between TOID/UPRN and USRN. 
+ +## Ingest Contract +- Source key: `os_open_lids` +- Raw table: `raw.os_open_lids_row` +- Stage tables: + - `stage.open_lids_toid_usrn` + - `stage.open_lids_uprn_usrn` +- Stage checkpoint metric: + - `stage.open_lids_relation_count` + +## Stage Normalisation +- Generic identifier pairs are normalised in-query (`id_1`, `id_2`, `relation_type`) and not persisted as a large intermediate table. +- Relation typing is explicit (`toid_usrn` or `uprn_usrn`) after deterministic inference. +- Typed rows are materialised directly into dedicated stage tables for downstream joins. + +## Downstream Transformations +- Pass 2: helps infer missing canonical USRN names from Open Names TOIDs. +- Pass 3: confirms TOID-based Open Names evidence, generating `open_lids_toid_usrn` candidates. +- Pass 4: contributes UPRN->USRN links for high-confidence `uprn_usrn` candidates. + +## Value Added +- Supplies the key bridge between feature identifiers and canonical street identifiers. +- Converts generic identifier pairs into explicit typed relationships for deterministic joins. + +## Related Docs +- Pass 0b staging details: [`../stages/0b_stage_normalisation.md`](../stages/0b_stage_normalisation.md) +- Pass 3 details: [`../stages/3_open_names_candidates.md`](../stages/3_open_names_candidates.md) +- Pass 4 details: [`../stages/4_uprn_reinforcement.md`](../stages/4_uprn_reinforcement.md) diff --git a/docs/architecture/datasets/os_open_names.md b/docs/architecture/datasets/os_open_names.md new file mode 100644 index 0000000..8a6313e --- /dev/null +++ b/docs/architecture/datasets/os_open_names.md @@ -0,0 +1,32 @@ +# OS Open Names Dataset Lineage + +## Role In The Graph +Open Names contributes named road features and optional TOID references, creating medium-confidence street evidence by postcode and enabling TOID-confirmed promotion. 
+ +## Ingest Contract +- Source key: `os_open_names` +- Raw table: `raw.os_open_names_row` +- Stage table: `stage.open_names_road_feature` +- Primary pass usage: Pass `3_open_names_candidates` + +## Stage Normalisation +- Normalised fields include: + - `feature_id` + - `toid` (when present) + - `street_name_raw`, `street_name_casefolded` + - `postcode_norm` (when available) +- Road/transport filtering is applied during staging. + +## Downstream Transformations +- Pass 3 inserts `names_postcode_feature` candidates. +- Pass 3 appends `open_lids_toid_usrn` candidates when TOID resolves via LIDS. +- Pass 3 records append-only lineage in `derived.postcode_street_candidate_lineage`. + +## Value Added +- Adds broad coverage of named road features. +- Supplies structured evidence that can be upgraded to high confidence with TOID confirmation. + +## Related Docs +- Pass 3 details: [`../stages/3_open_names_candidates.md`](../stages/3_open_names_candidates.md) +- LIDS bridge: [`os_open_lids.md`](os_open_lids.md) +- Candidate immutability contract: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/os_open_roads.md b/docs/architecture/datasets/os_open_roads.md new file mode 100644 index 0000000..dd99312 --- /dev/null +++ b/docs/architecture/datasets/os_open_roads.md @@ -0,0 +1,28 @@ +# OS Open Roads Dataset Lineage + +## Role In The Graph +Open Roads provides fallback street evidence where stronger candidate types do not exist. + +## Ingest Contract +- Source key: `os_open_roads` +- Raw table: `raw.os_open_roads_row` +- Stage table: `stage.open_roads_segment` +- Primary pass usage: Pass `5_gb_spatial_fallback` + +## Stage Normalisation +- Normalised fields: + - `segment_id`, `road_id` + - `road_name`, `road_name_casefolded` + - optional `usrn` + - optional `postcode_norm` + +## Downstream Transformations +- Pass 5 emits `spatial_os_open_roads` low-confidence candidates only for postcodes without high-confidence evidence. 
+ +## Value Added +- Improves coverage without overriding stronger evidence. +- Preserves confidence transparency by explicitly tagging fallback provenance. + +## Related Docs +- Pass 5 details: [`../stages/5_gb_spatial_fallback.md`](../stages/5_gb_spatial_fallback.md) +- Confidence model: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/os_open_uprn.md b/docs/architecture/datasets/os_open_uprn.md new file mode 100644 index 0000000..a6cd063 --- /dev/null +++ b/docs/architecture/datasets/os_open_uprn.md @@ -0,0 +1,29 @@ +# OS Open UPRN Dataset Lineage + +## Role In The Graph +Open UPRN contributes property-level identity used with NSUL and LIDS to create high-confidence postcode/USRN evidence. + +## Ingest Contract +- Source key: `os_open_uprn` +- Raw table: `raw.os_open_uprn_row` +- Stage table: `stage.uprn_point` +- Primary pass usage: indirect, via pass `4_uprn_reinforcement` + +## Stage Normalisation +- Normalised fields: + - `uprn` + - optional `postcode_norm` + +## Downstream Transformations +- Combined with: + - `stage.nsul_uprn_postcode` (UPRN->postcode) + - `stage.open_lids_uprn_usrn` (UPRN->USRN) +- Pass 4 aggregates evidence into `uprn_usrn` high-confidence candidates. + +## Value Added +- Supports strongest GB candidate type by linking property-level and street-level identifiers. + +## Related Docs +- Pass 4 details: [`../stages/4_uprn_reinforcement.md`](../stages/4_uprn_reinforcement.md) +- NSUL linkage: [`nsul.md`](nsul.md) +- LIDS linkage: [`os_open_lids.md`](os_open_lids.md) diff --git a/docs/architecture/datasets/os_open_usrn.md b/docs/architecture/datasets/os_open_usrn.md new file mode 100644 index 0000000..b2f29c0 --- /dev/null +++ b/docs/architecture/datasets/os_open_usrn.md @@ -0,0 +1,31 @@ +# OS Open USRN Dataset Lineage + +## Role In The Graph +OS Open USRN defines canonical street identity (`USRN`) and street naming used as the final street key in outputs. 
+ +## Ingest Contract +- Source key: `os_open_usrn` +- Raw table: `raw.os_open_usrn_row` +- Stage output: `stage.streets_usrn_input` +- Primary pass usage: Pass `2_gb_canonical_streets` + +## Stage Normalisation +- Core normalised fields: + - `usrn` + - `street_name` + - `street_name_casefolded` + - class/status metadata (when available) + +## Downstream Transformations +- Pass 2 writes `core.streets_usrn`. +- If direct USRN names are sparse, pass 2 infers missing USRN names from Open Names + LIDS TOID bridges. +- Passes 3/4/7 use `core.streets_usrn` for canonical name matching. + +## Value Added +- Provides a stable street key for provenance and de-duplication. +- Anchors candidate evidence to canonical street names. + +## Related Docs +- Pass 2 details: [`../stages/2_gb_canonical_streets.md`](../stages/2_gb_canonical_streets.md) +- Open Names linkage: [`os_open_names.md`](os_open_names.md) +- LIDS bridge: [`os_open_lids.md`](os_open_lids.md) diff --git a/docs/architecture/datasets/osni_gazetteer.md b/docs/architecture/datasets/osni_gazetteer.md new file mode 100644 index 0000000..ace39ae --- /dev/null +++ b/docs/architecture/datasets/osni_gazetteer.md @@ -0,0 +1,19 @@ +# OSNI Gazetteer Dataset Lineage (Optional NI) + +## Role In The Graph +OSNI Gazetteer is NI-specific street evidence input for NI-enabled profiles. + +## Ingest Contract +- Source key: `osni_gazetteer` +- Raw table: `raw.osni_gazetteer_row` +- Stage table: `stage.osni_street_point` + +## Downstream Transformations +- Pass 6 emits `osni_gazetteer_direct` candidates. + +## Value Added +- Extends NI street evidence coverage under explicit NI confidence constraints. 
+ +## Related Docs +- Pass 6 details: [`../stages/6_ni_candidates.md`](../stages/6_ni_candidates.md) +- Candidate type rules: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/ppd.md b/docs/architecture/datasets/ppd.md new file mode 100644 index 0000000..ec56152 --- /dev/null +++ b/docs/architecture/datasets/ppd.md @@ -0,0 +1,30 @@ +# PPD Dataset Lineage (Optional) + +## Role In The Graph +PPD is a gap-fill source for lower-confidence address-derived street evidence. + +## Ingest Contract +- Source key: `ppd` +- Raw table: `raw.ppd_row` +- Stage table: `stage.ppd_parsed_address` +- Bundle rule: may include multiple ingest runs (baseline + updates), applied in deterministic ingest-time order. + +## Stage Normalisation +- Normalised fields: + - `row_hash` + - `postcode_norm` + - `house_number` + - `street_token_raw`, `street_token_casefolded` + +## Downstream Transformations +- Pass 7 performs token matching against canonical streets. +- Generates `ppd_parse_matched` or `ppd_parse_unmatched` candidate types. +- Used as additive gap-fill only; does not override stronger evidence. + +## Value Added +- Improves coverage where core spatial joins have sparse evidence. +- Preserves confidence transparency through explicit low/none-like candidate typing. 
+ +## Related Docs +- Pass 7 details: [`../stages/7_ppd_gap_fill.md`](../stages/7_ppd_gap_fill.md) +- PPD baseline/update rule: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/relationships-overview.md b/docs/architecture/relationships-overview.md new file mode 100644 index 0000000..b2184af --- /dev/null +++ b/docs/architecture/relationships-overview.md @@ -0,0 +1,205 @@ +# Dataset Relationship Overview + +## Core Graph + +```text +ONSPD -> core.postcodes +OS Open USRN -> core.streets_usrn +OS Open Names + ONSPD -> candidates (names_postcode_feature) +OS Open Names + LIDS (TOID->USRN) -> candidates (open_lids_toid_usrn) +OS Open UPRN + NSUL + LIDS (UPRN->USRN) -> candidates (uprn_usrn) +OS Open Roads + core.postcodes -> fallback candidates (spatial_os_open_roads) +Optional: PPD -> gap-fill candidates (ppd_parse_*) +All candidates + weights -> derived.postcode_streets_final +Final + provenance joins -> api projections +``` + +## Mermaid Diagram + +```mermaid +flowchart TB + subgraph S["Source Datasets"] + ONSPD["ONSPD"] + USRN["OS Open USRN"] + NAMES["OS Open Names"] + LIDS["OS Open LIDS"] + UPRN["OS Open UPRN"] + NSUL["NSUL"] + ROADS["OS Open Roads"] + OSNI["OSNI Gazetteer"] + DFI["DfI Highway"] + PPD["HM Land Registry PPD"] + end + + subgraph META["Meta"] + IR["meta.ingest_run"] + IRF["meta.ingest_run_file"] + BB["meta.build_bundle"] + BBS["meta.build_bundle_source"] + BR["meta.build_run"] + BPC["meta.build_pass_checkpoint"] + CH["meta.canonical_hash"] + PUB["meta.dataset_publication"] + end + + subgraph RAW["Raw"] + R_ONSPD["raw.onspd_row"] + R_USRN["raw.os_open_usrn_row"] + R_NAMES["raw.os_open_names_row"] + R_LIDS["raw.os_open_lids_row"] + R_UPRN["raw.os_open_uprn_row"] + R_NSUL["raw.nsul_row"] + R_ROADS["raw.os_open_roads_row"] + R_OSNI["raw.osni_gazetteer_row"] + R_DFI["raw.dfi_highway_row"] + R_PPD["raw.ppd_row"] + end + + subgraph STAGE["Stage"] + S_ONSPD["stage.onspd_postcode"] + 
S_USRN["stage.streets_usrn_input"] + S_NAMES["stage.open_names_road_feature"] + S_LIDS_TOID["stage.open_lids_toid_usrn"] + S_LIDS_UPRN["stage.open_lids_uprn_usrn"] + S_UPRN["stage.uprn_point"] + S_NSUL["stage.nsul_uprn_postcode"] + S_ROADS["stage.open_roads_segment"] + S_OSNI["stage.osni_street_point"] + S_DFI["stage.dfi_road_segment"] + S_PPD["stage.ppd_parsed_address"] + end + + subgraph CORE["Core"] + C_POST["core.postcodes"] + C_META["core.postcodes_meta"] + C_STREETS["core.streets_usrn"] + end + + subgraph DERIVED["Derived"] + CAND["derived.postcode_street_candidates"] + LIN["derived.postcode_street_candidate_lineage"] + FINAL["derived.postcode_streets_final"] + FINAL_CAND["derived.postcode_streets_final_candidate"] + FINAL_SRC["derived.postcode_streets_final_source"] + end + + subgraph INTERNAL["Internal"] + UNIT["internal.unit_index"] + end + + subgraph API["API Projections"] + API_STREET_V["api.postcode_street_lookup__"] + API_POST_V["api.postcode_lookup__"] + API_STREET["api.postcode_street_lookup (view)"] + API_POST["api.postcode_lookup (view)"] + end + + ONSPD --> IR + USRN --> IR + NAMES --> IR + LIDS --> IR + UPRN --> IR + NSUL --> IR + ROADS --> IR + OSNI --> IR + DFI --> IR + PPD --> IR + + IR --> IRF + IR --> BBS + BB --> BBS + BB --> BR + BR --> BPC + BR --> CH + + ONSPD --> R_ONSPD + USRN --> R_USRN + NAMES --> R_NAMES + LIDS --> R_LIDS + UPRN --> R_UPRN + NSUL --> R_NSUL + ROADS --> R_ROADS + OSNI -. optional .-> R_OSNI + DFI -. optional .-> R_DFI + PPD -. optional .-> R_PPD + + R_ONSPD --> S_ONSPD + R_USRN --> S_USRN + R_NAMES --> S_NAMES + R_LIDS --> S_LIDS_TOID + R_LIDS --> S_LIDS_UPRN + R_UPRN --> S_UPRN + R_NSUL --> S_NSUL + R_ROADS --> S_ROADS + R_OSNI -. optional .-> S_OSNI + R_DFI -. optional .-> S_DFI + R_PPD -. 
optional .-> S_PPD + + S_ONSPD --> C_POST + S_ONSPD --> C_META + S_USRN --> C_STREETS + S_NAMES --> C_STREETS + S_LIDS_TOID --> C_STREETS + + C_POST --> CAND + C_STREETS --> CAND + S_NAMES --> CAND + S_LIDS_TOID --> CAND + S_NSUL --> CAND + S_LIDS_UPRN --> CAND + S_ROADS --> CAND + S_OSNI -. optional .-> CAND + S_DFI -. optional .-> CAND + S_PPD -. optional .-> CAND + + S_PPD -. optional .-> UNIT + + CAND --> LIN + CAND --> FINAL + C_POST --> FINAL + C_STREETS --> FINAL + + FINAL --> FINAL_CAND + CAND --> FINAL_CAND + FINAL --> FINAL_SRC + IR --> FINAL_SRC + + FINAL --> API_STREET_V + FINAL --> API_POST_V + FINAL_SRC --> API_STREET_V + FINAL_SRC --> API_POST_V + BR --> API_STREET_V + BR --> API_POST_V + API_STREET_V --> API_STREET + API_POST_V --> API_POST + BR --> PUB +``` + +## Relationship Types +- Validation relationship: + - ONSPD validates and normalises postcode existence and country/subdivision context. +- Canonical street relationship: + - USRN is the canonical street key (`core.streets_usrn`). +- Direct semantic relationship: + - Open Names road features link to postcodes and sometimes TOIDs. +- Identifier bridge relationship: + - LIDS resolves `TOID -> USRN` and `UPRN -> USRN`. +- Property density relationship: + - NSUL ties UPRN to postcode, enabling postcode/USRN aggregation with LIDS. +- Spatial fallback relationship: + - Open Roads provides low-confidence fallback where high-confidence evidence is absent. 
+ +## Where Each Relationship Is Materialised +- Raw snapshots: `raw.*` +- Typed normalisation: `stage.*` +- Canonical entities: `core.postcodes`, `core.streets_usrn` +- Evidence graph: `derived.postcode_street_candidates`, `derived.postcode_street_candidate_lineage` +- Final resolved output: `derived.postcode_streets_final` +- Provenance joins: `derived.postcode_streets_final_candidate`, `derived.postcode_streets_final_source` +- API shapes: `api.postcode_street_lookup__`, `api.postcode_lookup__` + +## Related Docs +- Pass-by-pass detail: [`stages/README.md`](stages/README.md) +- Dataset-specific lineage: [`datasets/README.md`](datasets/README.md) +- Value added by stage: [`value-added-by-stage.md`](value-added-by-stage.md) +- Spec authority: [`../spec/pipeline_v3/spec.md`](../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/stages/0a_raw_ingest.md b/docs/architecture/stages/0a_raw_ingest.md new file mode 100644 index 0000000..f73d9fa --- /dev/null +++ b/docs/architecture/stages/0a_raw_ingest.md @@ -0,0 +1,19 @@ +# Pass 0a: Raw Ingest Validation + +## Purpose +Validate bundle sources exist and have non-zero ingest metadata row counts before transformations. 
+ +## Inputs +- `meta.build_bundle_source` +- `meta.ingest_run` + +## Outputs +- pass checkpoint `0a_raw_ingest` with per-source row count summary + +## Value Added +- fast fail for missing/empty source runs +- deterministic baseline counts for observability +- explicit replay contract: raw snapshots are rebuild caches backed by archived source files + file hashes + +## Related +- Bundle contract: [`../../spec/pipeline_v3/data_model.md`](../../spec/pipeline_v3/data_model.md) diff --git a/docs/architecture/stages/0b_stage_normalisation.md b/docs/architecture/stages/0b_stage_normalisation.md new file mode 100644 index 0000000..81da7ce --- /dev/null +++ b/docs/architecture/stages/0b_stage_normalisation.md @@ -0,0 +1,40 @@ +# Pass 0b: Stage Normalisation + +## Purpose +Transform raw payloads into typed/stable stage contracts consumed by later passes. + +## Inputs +- `raw.*` tables selected by bundle ingest runs +- `pipeline/config/source_schema.yaml` + +## Outputs +- `stage.onspd_postcode` +- `stage.streets_usrn_input` +- `stage.open_names_road_feature` +- `stage.open_roads_segment` +- `stage.uprn_point` +- `stage.open_lids_toid_usrn` +- `stage.open_lids_uprn_usrn` +- `stage.nsul_uprn_postcode` +- optional NI/PPD stage tables +- checkpoint metric: `stage.open_lids_relation_count` + +## Determinism/Validation +- required mapped fields validated per source +- heavy-volume sources (`os_open_uprn`, `os_open_lids`, `nsul`) use set-based SQL transforms +- explicit relation typing for LIDS (`toid_usrn`, `uprn_usrn`) +- pass-local `work_mem` is raised for large sort/dedupe transforms to reduce temp-file spill +- `(ingest_run_id, source_row_num)` indexes support deterministic replay/debug and source-row traceability +- `stage.*` tables are `UNLOGGED` to reduce write amplification; they are rebuildable from `raw.*` +- pass start truncates all `stage.*` tables to prevent historical-row/index accumulation across build runs +- final `ANALYZE` refreshes planner stats for all 
`stage.*` relations before Pass 1+ +- `raw.*` tables are `UNLOGGED` in this development profile; authoritative replay comes from archived source files + `meta.ingest_run_file` + +## Value Added +- converts heterogeneous schemas into deterministic internal contracts +- removes Python row-loop bottlenecks for the largest source feeds +- surfaces schema drift early + +## Related +- Dataset pages: [`../datasets/README.md`](../datasets/README.md) +- Determinism rules: [`../../spec/pipeline_v3/canonicalisation.md`](../../spec/pipeline_v3/canonicalisation.md) diff --git a/docs/architecture/stages/1_onspd_backbone.md b/docs/architecture/stages/1_onspd_backbone.md new file mode 100644 index 0000000..c83721c --- /dev/null +++ b/docs/architecture/stages/1_onspd_backbone.md @@ -0,0 +1,23 @@ +# Pass 1: ONSPD Backbone + +## Purpose +Build canonical postcode entities from staged ONSPD rows. + +## Inputs +- `stage.onspd_postcode` + +## Outputs +- `core.postcodes` +- `core.postcodes_meta` + +## Execution Notes +- set-based insert ordered by canonical postcode normalization key +- post-insert `ANALYZE` keeps downstream join planning stable (Pass 3/4/5) +- `post_town` and `locality` are copied from `stage.onspd_postcode` into `core.postcodes` and `core.postcodes_meta` without inference. + +## Value Added +- authoritative postcode backbone +- unified geographic/admin context for subsequent joins + +## Related +- Dataset: [`../datasets/onspd.md`](../datasets/onspd.md) diff --git a/docs/architecture/stages/2_gb_canonical_streets.md b/docs/architecture/stages/2_gb_canonical_streets.md new file mode 100644 index 0000000..25bea79 --- /dev/null +++ b/docs/architecture/stages/2_gb_canonical_streets.md @@ -0,0 +1,26 @@ +# Pass 2: GB Canonical Streets + +## Purpose +Build `core.streets_usrn` as canonical street dictionary keyed by USRN. 
+ +## Inputs +- `stage.streets_usrn_input` +- `stage.open_names_road_feature` + `stage.open_lids_toid_usrn` (for inferred fallback names) + +## Outputs +- `core.streets_usrn` + +## Execution Shape +- set-based direct insert from `stage.streets_usrn_input` +- inferred path pre-aggregates Open Names by `toid` before joining to LIDS to reduce join volume +- set-based inferred insert (Open Names + LIDS) for USRNs not already present +- inferred name ranking uses deterministic tie-breaks by evidence count then casefolded/name lexical order +- stage join indexes on `stage.open_names_road_feature(build_run_id, toid)` and `(build_run_id, postcode_norm)` support Pass 2/3 joins +- post-pass `ANALYZE` keeps `core.streets_usrn` statistics current for Pass 4 joins + +## Value Added +- canonical USRN street name layer +- inferred USRN naming where direct USRN names are missing + +## Related +- Datasets: [`../datasets/os_open_usrn.md`](../datasets/os_open_usrn.md), [`../datasets/os_open_names.md`](../datasets/os_open_names.md), [`../datasets/os_open_lids.md`](../datasets/os_open_lids.md) diff --git a/docs/architecture/stages/3_open_names_candidates.md b/docs/architecture/stages/3_open_names_candidates.md new file mode 100644 index 0000000..5440c83 --- /dev/null +++ b/docs/architecture/stages/3_open_names_candidates.md @@ -0,0 +1,26 @@ +# Pass 3: Open Names Candidates + +## Purpose +Create medium-confidence street candidates and append-only TOID-confirmed promotions. 
+ +## Inputs +- `stage.open_names_road_feature` +- `stage.open_lids_toid_usrn` +- `core.postcodes` + +## Outputs +- base candidates: `candidate_type=names_postcode_feature` +- promoted candidates: `candidate_type=open_lids_toid_usrn` +- lineage: `derived.postcode_street_candidate_lineage` + +## Contract +- candidate table is immutable evidence +- promotions are insert-only; parent rows are never mutated + +## Value Added +- broad named-road evidence +- high-confidence confirmation via TOID->USRN bridge + +## Related +- Spec contract: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) +- Dataset pages: [`../datasets/os_open_names.md`](../datasets/os_open_names.md), [`../datasets/os_open_lids.md`](../datasets/os_open_lids.md) diff --git a/docs/architecture/stages/4_uprn_reinforcement.md b/docs/architecture/stages/4_uprn_reinforcement.md new file mode 100644 index 0000000..ec39d7e --- /dev/null +++ b/docs/architecture/stages/4_uprn_reinforcement.md @@ -0,0 +1,27 @@ +# Pass 4: UPRN Reinforcement + +## Purpose +Generate high-confidence `uprn_usrn` candidates by aggregating property-level evidence. 
+ +## Inputs +- `stage.nsul_uprn_postcode` +- `stage.open_lids_uprn_usrn` +- `core.postcodes` +- `core.streets_usrn` + +## Outputs +- `derived.postcode_street_candidates` rows (`candidate_type=uprn_usrn`, `confidence=high`) + +## Execution Shape +- pass-local `work_mem` is raised to avoid temp-file-heavy plans on large NSUL/LIDS joins +- UPRN evidence is aggregated by `(postcode_norm, usrn)` from `stage.nsul_uprn_postcode` + `stage.open_lids_uprn_usrn` +- `ingest_run_id` provenance is sourced directly from `meta.build_bundle_source` (`os_open_lids`) instead of per-row aggregation +- postcode resolution joins via `stage.onspd_postcode.postcode_norm -> postcode_display` before `core.postcodes` join +- deterministic candidate insertion order remains `postcode`, then `usrn` + +## Value Added +- strongest GB evidence class from UPRN-linked observations +- frequency signal (`uprn_count`) for ranking/probability + +## Related +- Datasets: [`../datasets/os_open_uprn.md`](../datasets/os_open_uprn.md), [`../datasets/nsul.md`](../datasets/nsul.md), [`../datasets/os_open_lids.md`](../datasets/os_open_lids.md) diff --git a/docs/architecture/stages/5_gb_spatial_fallback.md b/docs/architecture/stages/5_gb_spatial_fallback.md new file mode 100644 index 0000000..d9e182e --- /dev/null +++ b/docs/architecture/stages/5_gb_spatial_fallback.md @@ -0,0 +1,18 @@ +# Pass 5: GB Spatial Fallback + +## Purpose +Add low-confidence fallback candidates for postcodes lacking high-confidence evidence. 
+ +## Inputs +- `stage.open_roads_segment` +- `core.postcodes` +- existing candidates + +## Outputs +- `derived.postcode_street_candidates` rows (`candidate_type=spatial_os_open_roads`, `confidence=low`) + +## Value Added +- coverage recovery with explicit low-confidence tagging + +## Related +- Dataset: [`../datasets/os_open_roads.md`](../datasets/os_open_roads.md) diff --git a/docs/architecture/stages/6_ni_candidates.md b/docs/architecture/stages/6_ni_candidates.md new file mode 100644 index 0000000..3eee9f8 --- /dev/null +++ b/docs/architecture/stages/6_ni_candidates.md @@ -0,0 +1,19 @@ +# Pass 6: NI Candidates (Profile-Dependent) + +## Purpose +Generate NI-specific candidate types when NI sources are present. + +## Inputs +- `stage.osni_street_point` +- `stage.dfi_road_segment` +- `core.postcodes` + +## Outputs +- `osni_gazetteer_direct` candidates +- `spatial_dfi_highway` candidates + +## Value Added +- extends NI coverage under explicit confidence constraints + +## Related +- Datasets: [`../datasets/osni_gazetteer.md`](../datasets/osni_gazetteer.md), [`../datasets/dfi_highway.md`](../datasets/dfi_highway.md) diff --git a/docs/architecture/stages/7_ppd_gap_fill.md b/docs/architecture/stages/7_ppd_gap_fill.md new file mode 100644 index 0000000..5812bf6 --- /dev/null +++ b/docs/architecture/stages/7_ppd_gap_fill.md @@ -0,0 +1,19 @@ +# Pass 7: PPD Gap Fill (Profile-Dependent) + +## Purpose +Add lower-confidence candidates from parsed transactional/self-reported addresses. 
+ +## Inputs +- `stage.ppd_parsed_address` +- `core.streets_usrn` + +## Outputs +- `derived.postcode_street_candidates` (`ppd_parse_matched` / `ppd_parse_unmatched`) +- `internal.unit_index` + +## Value Added +- gap filling without overriding stronger spatial evidence +- expanded coverage with transparent provenance tags + +## Related +- Dataset: [`../datasets/ppd.md`](../datasets/ppd.md) diff --git a/docs/architecture/stages/8_finalisation.md b/docs/architecture/stages/8_finalisation.md new file mode 100644 index 0000000..bca72f7 --- /dev/null +++ b/docs/architecture/stages/8_finalisation.md @@ -0,0 +1,29 @@ +# Pass 8: Finalisation + +## Purpose +Resolve candidate evidence into final postcode/street outputs and materialized API projections. + +## Inputs +- `derived.postcode_street_candidates` +- frequency weights config + +## Outputs +- `derived.postcode_streets_final` +- `derived.postcode_streets_final_candidate` +- `derived.postcode_streets_final_source` +- versioned API tables: + - `api.postcode_street_lookup__` + - `api.postcode_lookup__` + +## Deterministic Probability +- exact formula normalization by postcode total weight +- fixed-scale rounding + deterministic residual correction to rank 1 street +- set-based SQL materialisation for `final`, `final_candidate`, and `final_source` joins (no per-row query loops) + +## Value Added +- converts evidence graph into stable product outputs +- provides reproducible hashes and publishable API projections + +## Related +- Probability contract: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) +- Canonicalisation: [`../../spec/pipeline_v3/canonicalisation.md`](../../spec/pipeline_v3/canonicalisation.md) diff --git a/docs/architecture/stages/README.md b/docs/architecture/stages/README.md new file mode 100644 index 0000000..d60ea25 --- /dev/null +++ b/docs/architecture/stages/README.md @@ -0,0 +1,19 @@ +# Stage/Pass Documentation Index + +## Build Passes +1. [`0a_raw_ingest.md`](0a_raw_ingest.md) +2. 
[`0b_stage_normalisation.md`](0b_stage_normalisation.md) +3. [`1_onspd_backbone.md`](1_onspd_backbone.md) +4. [`2_gb_canonical_streets.md`](2_gb_canonical_streets.md) +5. [`3_open_names_candidates.md`](3_open_names_candidates.md) +6. [`4_uprn_reinforcement.md`](4_uprn_reinforcement.md) +7. [`5_gb_spatial_fallback.md`](5_gb_spatial_fallback.md) +8. [`6_ni_candidates.md`](6_ni_candidates.md) +9. [`7_ppd_gap_fill.md`](7_ppd_gap_fill.md) +10. [`8_finalisation.md`](8_finalisation.md) + +## Cross-links +- Dataset lineage index: [`../datasets/README.md`](../datasets/README.md) +- Relationship overview: [`../relationships-overview.md`](../relationships-overview.md) +- Value-added summary: [`../value-added-by-stage.md`](../value-added-by-stage.md) +- Spec authority: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/value-added-by-stage.md b/docs/architecture/value-added-by-stage.md new file mode 100644 index 0000000..5793e5d --- /dev/null +++ b/docs/architecture/value-added-by-stage.md @@ -0,0 +1,80 @@ +# Value Added By Stage + +This page explains what new product value is created at each pass, not just what tables are written. + +## Pass 0a: Raw Ingest Validation +- Inputs: bundle-selected ingest runs (`meta.ingest_run`). +- Output: validated source presence and row-count baseline. +- Value added: + - confirms build bundle completeness before transformations + - establishes reproducible volume expectations per source + +## Pass 0b: Stage Normalisation +- Inputs: immutable raw payloads (`raw.*`) + schema mapping config. +- Output: typed/normalized rows in `stage.*`. +- Value added: + - converts heterogeneous source schemas to deterministic internal contracts + - enforces required-field gates before downstream joins + - materialises `LIDS` relation typing (`id_1`, `id_2`, `relation_type`) + +## Pass 1: ONSPD Backbone +- Inputs: `stage.onspd_postcode`. +- Output: `core.postcodes`, `core.postcodes_meta`. 
+- Value added: + - creates authoritative postcode validation layer + - provides postcode centroid/admin metadata context for all later joins + +## Pass 2: Canonical Streets (USRN) +- Inputs: `stage.streets_usrn_input`, `stage.open_names_road_feature`, `stage.open_lids_toid_usrn`. +- Output: `core.streets_usrn`. +- Value added: + - produces canonical USRN-keyed street dictionary + - fills gaps by inferring USRN names from Open Names + LIDS TOID mapping when direct names are absent + +## Pass 3: Open Names Candidates +- Inputs: `stage.open_names_road_feature`, `stage.open_lids_toid_usrn`, `core.*`. +- Output: `derived.postcode_street_candidates` + lineage rows. +- Value added: + - creates medium-confidence postcode/street evidence from named features + - upgrades TOID-confirmed evidence via append-only promotion (`open_lids_toid_usrn`) + - preserves full evidence chain (immutable parent + promoted child + lineage) + +## Pass 4: UPRN Reinforcement +- Inputs: `stage.nsul_uprn_postcode`, `stage.open_lids_uprn_usrn`, `core.*`. +- Output: high-confidence `uprn_usrn` candidates. +- Value added: + - adds strong evidence using property-level frequency aggregation + - ties street inference to observed property density per postcode + +## Pass 5: GB Spatial Fallback +- Inputs: `stage.open_roads_segment`, `core.postcodes`, current candidates. +- Output: low-confidence `spatial_os_open_roads` candidates. +- Value added: + - closes obvious holes where no high-confidence candidate exists + - improves postcode coverage while preserving confidence transparency + +## Pass 6: NI Candidates (Optional Profile) +- Inputs: `stage.osni_street_point`, `stage.dfi_road_segment`, `core.postcodes`. +- Output: NI-specific candidate types. +- Value added: + - extends coverage for NI builds with explicitly capped confidence + +## Pass 7: PPD Gap Fill (Optional Profile) +- Inputs: `stage.ppd_parsed_address`, `core.streets_usrn`. +- Output: `ppd_parse_*` candidates, `internal.unit_index`. 
+- Value added: + - uses transactional/self-reported evidence to fill gaps only + - never overrides stronger core spatial evidence + +## Pass 8: Finalisation +- Inputs: all candidates + weights config. +- Output: final tables + API versioned projections + deterministic hashes. +- Value added: + - resolves competing evidence into ranked final street outputs + - computes exact probabilities with deterministic rounding correction + - produces API-ready materialisations with relational provenance backing + +## Cross-links +- Stage details: [`stages/README.md`](stages/README.md) +- Dataset lineage pages: [`datasets/README.md`](datasets/README.md) +- Probability contract: [`../spec/pipeline_v3/spec.md`](../spec/pipeline_v3/spec.md) diff --git a/docs/spec/data_sources.md b/docs/spec/data_sources.md new file mode 100644 index 0000000..2a7d1f0 --- /dev/null +++ b/docs/spec/data_sources.md @@ -0,0 +1,345 @@ +# Data Sources + +This document is the authoritative procedure for obtaining the latest source files for pipeline builds (Phase 1 + Phase 2): + +- ONSUD +- OS Open UPRN +- OS Open Roads +- OS Open Names + +Rules: + +- Do not guess dataset structure. +- Persist release metadata and checksums in manifests. +- Verify hashes before ingest. +- If a source does not provide an official release identifier, record that as `Unknown` and derive a deterministic local release token from retrieval date + published hash. + +## 0. 
Source Registry + +| Dataset | Official discovery endpoint | Download endpoint type | Licence handling | Update frequency | +|---|---|---|---|---| +| ONSUD | `https://geoportal.statistics.gov.uk/api/search/v1/collections/dataset/items?q=ONSUD_LATEST&limit=20` | ArcGIS item `/data` download | Read from ArcGIS item `licenseInfo` and linked ONS licence page | Published as periodic ONSUD releases (item description states ~6-week cadence) | +| OS Open UPRN | `https://api.os.uk/downloads/v1/products/OpenUPRN/downloads` | OS Downloads API artifact URL (`redirect`) | Open OS licence terms from product metadata/docs | Official cadence is not enforced in pipeline; derive from published artifact metadata | +| OS Open Roads | `https://api.os.uk/downloads/v1/products/OpenRoads/downloads` | OS Downloads API artifact URL (`redirect`) | Open OS licence terms from product metadata/docs | Official cadence is not enforced in pipeline; derive from published artifact metadata | +| OS Open Names | `https://api.os.uk/downloads/v1/products/OpenNames/downloads` | OS Downloads API artifact URL (`redirect`) | Open OS licence terms from product metadata/docs | Approximately six-monthly; derive concrete release from artifact metadata | + +If licence text or cadence cannot be confirmed at ingest time, record `Unknown` and retain the raw metadata response used for the run. + +## 1. Prerequisites + +Required tools: + +- `curl` +- `python3` +- `shasum` +- `unzip` +- `ogrinfo` (Open Roads inspection) + +Suggested directories: + +```bash +mkdir -p data/source_files/real data/manifests/real +``` + +## 2. 
ONSUD (Latest) + +### 2.1 Discover latest ONSUD item + +Use the Open Geography Portal search API: + +```bash +curl -s 'https://geoportal.statistics.gov.uk/api/search/v1/collections/dataset/items?q=ONSUD_LATEST&limit=20' > /tmp/onsud_search.json +python3 - <<'PY' +import json +from pathlib import Path + +payload = json.loads(Path('/tmp/onsud_search.json').read_text()) +for feature in payload.get('features', []): + p = feature.get('properties', {}) + if p.get('title') == 'ONSUD_LATEST': + print(feature['id']) + break +else: + raise SystemExit('Could not find ONSUD_LATEST item id') +PY +``` + +Store the printed item id as `ONSUD_ITEM_ID`. + +### 2.2 Retrieve metadata and download file + +```bash +ONSUD_ITEM_ID='' +curl -s "https://www.arcgis.com/sharing/rest/content/items/${ONSUD_ITEM_ID}?f=json" > /tmp/onsud_item.json +python3 - <<'PY' +import json +from pathlib import Path +item = json.loads(Path('/tmp/onsud_item.json').read_text()) +print('name=', item.get('name')) +print('size=', item.get('size')) +print('modified=', item.get('modified')) +PY + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/ONSUD_LATEST.zip \ + "https://www.arcgis.com/sharing/rest/content/items/${ONSUD_ITEM_ID}/data" +``` + +### 2.3 Verify and unpack + +```bash +shasum -a 256 data/source_files/real/ONSUD_LATEST.zip +unzip -l data/source_files/real/ONSUD_LATEST.zip | head -n 50 +unzip -o data/source_files/real/ONSUD_LATEST.zip -d data/source_files/real/onsud +find data/source_files/real/onsud -type f -name '*.csv' | sort +``` + +Required ONSUD manifest mappings (explicit, no guessing): + +- `uprn` +- `postcode` +- `postcode_unit_easting` +- `postcode_unit_northing` + +## 3. 
OS Open UPRN (Latest CSV) + +### 3.1 Discover latest downloadable CSV artifact + +```bash +curl -s 'https://api.os.uk/downloads/v1/products/OpenUPRN/downloads' > /tmp/open_uprn_downloads.json +python3 - <<'PY' +import json +from pathlib import Path + +items = json.loads(Path('/tmp/open_uprn_downloads.json').read_text()) +match = next( + i for i in items + if i.get('area') == 'GB' and i.get('format') == 'CSV' +) +print('url=', match['url']) +print('fileName=', match['fileName']) +print('md5=', match['md5']) +print('size=', match['size']) +PY +``` + +### 3.2 Download and verify + +```bash +OPEN_UPRN_URL="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_uprn_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'CSV') +print(match['url']) +PY +)" + +OPEN_UPRN_MD5="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_uprn_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'CSV') +print(match['md5']) +PY +)" + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/open_uprn_latest_csv.zip \ + "${OPEN_UPRN_URL}" + +md5 data/source_files/real/open_uprn_latest_csv.zip +echo "Expected md5: ${OPEN_UPRN_MD5}" +``` + +### 3.3 Unpack and inspect columns + +```bash +unzip -o data/source_files/real/open_uprn_latest_csv.zip -d data/source_files/real/open_uprn +OPEN_UPRN_CSV="$(find data/source_files/real/open_uprn -type f -name '*.csv' | head -n 1)" +echo "${OPEN_UPRN_CSV}" +head -n 1 "${OPEN_UPRN_CSV}" +``` + +Map these required fields explicitly in manifest: + +- `uprn` +- `latitude` +- `longitude` +- `easting` +- `northing` + +## 4. 
OS Open Roads (Latest GeoPackage) + +### 4.1 Discover latest GeoPackage artifact + +```bash +curl -s 'https://api.os.uk/downloads/v1/products/OpenRoads/downloads' > /tmp/open_roads_downloads.json +python3 - <<'PY' +import json +from pathlib import Path + +items = json.loads(Path('/tmp/open_roads_downloads.json').read_text()) +match = next( + i for i in items + if i.get('area') == 'GB' and i.get('format') == 'GeoPackage' +) +print('url=', match['url']) +print('fileName=', match['fileName']) +print('md5=', match['md5']) +print('size=', match['size']) +PY +``` + +### 4.2 Download and verify + +```bash +OPEN_ROADS_URL="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_roads_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'GeoPackage') +print(match['url']) +PY +)" + +OPEN_ROADS_MD5="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_roads_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'GeoPackage') +print(match['md5']) +PY +)" + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/open_roads_latest_gpkg.zip \ + "${OPEN_ROADS_URL}" + +md5 data/source_files/real/open_roads_latest_gpkg.zip +echo "Expected md5: ${OPEN_ROADS_MD5}" +``` + +### 4.3 Unpack and inspect layer/fields + +```bash +unzip -o data/source_files/real/open_roads_latest_gpkg.zip -d data/source_files/real/open_roads +OPEN_ROADS_GPKG="$(find data/source_files/real/open_roads -type f -name '*.gpkg' | head -n 1)" +echo "${OPEN_ROADS_GPKG}" +ogrinfo "${OPEN_ROADS_GPKG}" +``` + +Select the layer used for named road segments and inspect schema: + +```bash +ogrinfo -so "${OPEN_ROADS_GPKG}" '' +``` + +Map these required fields explicitly in manifest: + +- `source_id` (stable source identifier column) +- `name_display` (road name display column) + +Do not assume field names without inspecting the 
exact release file. + +## 5. OS Open Names (Latest CSV) + +### 5.1 Discover latest CSV artifact + +```bash +curl -s 'https://api.os.uk/downloads/v1/products/OpenNames/downloads' > /tmp/open_names_downloads.json +python3 - <<'PY' +import json +from pathlib import Path + +items = json.loads(Path('/tmp/open_names_downloads.json').read_text()) +match = next(i for i in items if i.get('format') == 'CSV') +print('url=', match['url']) +print('fileName=', match['fileName']) +print('md5=', match['md5']) +print('size=', match['size']) +PY +``` + +### 5.2 Download and verify + +```bash +OPEN_NAMES_URL="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_names_downloads.json').read_text()) +match = next(i for i in items if i.get('format') == 'CSV') +print(match['url']) +PY +)" + +OPEN_NAMES_MD5="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_names_downloads.json').read_text()) +match = next(i for i in items if i.get('format') == 'CSV') +print(match['md5']) +PY +)" + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/open_names_latest_csv.zip \ + "${OPEN_NAMES_URL}" + +md5 data/source_files/real/open_names_latest_csv.zip +echo "Expected md5: ${OPEN_NAMES_MD5}" +``` + +### 5.3 Unpack and inspect columns + +```bash +unzip -o data/source_files/real/open_names_latest_csv.zip -d data/source_files/real/open_names +OPEN_NAMES_CSV="$(find data/source_files/real/open_names -type f -name '*.csv' | head -n 1)" +echo "${OPEN_NAMES_CSV}" +head -n 1 "${OPEN_NAMES_CSV}" +``` + +Required Open Names manifest mappings: + +- `entry_id` +- `name1` +- `name1_lang` +- `name2` +- `local_type` +- `geometry_x` +- `geometry_y` +- `postcode_district` + +Ingest supports CSV only for Open Names. + +## 6. Release ID Rules + +Use official release identifiers where provided in source metadata. + +If not provided: + +- Record official identifier as `Unknown` in notes. 
+- Use deterministic local `release_id` for manifests: + - `<dataset_key>-unknown-<YYYYMMDD>-<hash_prefix>` +- Example: + - `open_roads-unknown-20260220-ebbaaaff` + +This preserves reproducibility while avoiding guessed semantic versions. + +## 7. Manifest Preparation Checklist + +For each dataset manifest: + +1. `dataset_key` is correct (`onsud`, `open_uprn`, `open_roads`, `open_names`) +2. `release_id` follows rules above +3. `source_url` is the exact URL used to download +4. `file_path` points to the local extracted file used for ingest +5. `expected_sha256` equals `shasum -a 256 <file_path>` +6. `column_map` is explicit and validated from inspected headers/layers +7. For Open Roads, `layer_name` is set and validated via `ogrinfo` + +## 8. Operational Notes + +- File sizes are large (hundreds of MB to >1 GB). Use a stable network connection. +- Prefer re-runnable shell scripts over manual ad-hoc commands. +- Keep downloaded archives and manifests together under `data/source_files/real` and `data/manifests/real` for auditability. diff --git a/docs/spec/phase_1/changes.md b/docs/spec/phase_1/changes.md new file mode 100644 index 0000000..bb3d5fe --- /dev/null +++ b/docs/spec/phase_1/changes.md @@ -0,0 +1,87 @@ +## 2026-02-20 — CHG-0001 + +What changed: +- Replaced nearest-road candidate ordering in Phase 1 derived build from GiST KNN (`<->`) to deterministic `ST_DWithin + ST_Distance + segment_id`. + +Why it changed: +- Full-scale real dataset runs reproduced PostgreSQL/PostGIS runtime failure: `index returned tuples in wrong order` during KNN nearest-neighbor evaluation. +- Failure was reproduced in isolation against dedicated `iso_knn` tables and a single point fixture, and persisted after `REINDEX`. + +Before behavior: +- `ORDER BY point_geom <-> road_geom, segment_id ASC` with no explicit `ST_DWithin` in the lateral nearest-road selector. 
+ +After behavior: +- `WHERE ST_DWithin(point_geom, road_geom, radius_m)` +- `ORDER BY ST_Distance(point_geom, road_geom) ASC, segment_id ASC` + +Observed / expected metric impact: +- Expected semantic output: unchanged for rows with a nearest named segment within radius. +- Expected operational impact: slower runtime in dense urban areas due to full distance ordering of all candidates within the radius. + +Determinism confirmation: +- Tie-break remains stable and explicit (`distance`, then `segment_id`). + +Spec update confirmation: +- Updated `docs/spec/phase_1/spec.md` to reflect runtime query contract and dated change note. + +## 2026-02-20 — CHG-0002 + +What changed: +- Added stage checkpoint persistence in `meta.release_set_stage_checkpoint`. +- Added resumable build mode: `pipeline run phase1 --resume`. + +Why it changed: +- Full-scale builds can fail late in the process (for example, during derived spatial inference) after long-running successful stages. +- Restarting from zero is operationally expensive and slows deterministic troubleshooting. + +Before behavior: +- `pipeline run phase1` behaved as a single-shot build with no persisted stage boundary checkpoints. +- Failure required a full rerun from the first build step. + +After behavior: +- Successful stage completions are persisted per `release_set_id`. +- `pipeline run phase1 --resume` skips completed stages and continues from the first incomplete stage. +- Default non-resume run on a `created` release set performs a clean rebuild by dropping release tables and clearing checkpoints. +- `--resume` and `--rebuild` are mutually exclusive. + +Observed / expected metric impact: +- No semantic change to output rows or metrics for a successful end-to-end run. +- Operational improvement: failure recovery time reduced by avoiding recomputation of completed stages. + +Determinism confirmation: +- Stage checkpoints only skip previously completed deterministic stages for the same `release_set_id`. 
+- Rebuild path remains explicit via `--rebuild`. + +Spec update confirmation: +- Updated `docs/spec/phase_1/spec.md` with resume CLI and checkpoint table contract. + +## 2026-02-20 — CHG-0003 + +What changed: +- Refined resume checkpoints from coarse phase-level boundaries to table-level build boundaries. + +Why it changed: +- Long-running builds need restart points between individual core/derived table builds, not only between aggregate phases. + +Before behavior: +- Core build was checkpointed once after all core tables were built. +- Derived build was checkpointed once after derived phase completion. + +After behavior: +- Checkpoints are now written after each table build: + - `core_uprn_postcode_built` + - `core_uprn_point_built` + - `core_road_segment_built` + - `derived_uprn_street_spatial_built` +- Existing legacy checkpoint names (`core_built`, `derived_built`) remain accepted by constraint for backward compatibility. + +Observed / expected metric impact: +- No change to output semantics or metric values. +- Operational improvement: finer-grained resume points reduce rebuild time after late-stage failures. + +Determinism confirmation: +- Each checkpoint still marks completion of deterministic SQL steps for one release set. +- Resume continues by skipping only completed table-level checkpoints. + +Spec update confirmation: +- Updated `docs/spec/phase_1/spec.md` to lock table-level checkpoint behavior. diff --git a/docs/spec/phase_1/name_norm.md b/docs/spec/phase_1/name_norm.md new file mode 100644 index 0000000..9bb741e --- /dev/null +++ b/docs/spec/phase_1/name_norm.md @@ -0,0 +1,26 @@ +# Phase 1 `name_norm` Specification + +Status: Locked for Phase 1. + +`name_norm` exists for deterministic grouping and hashing within the Phase 1 pipeline. +It is intentionally minimal and must not include linguistic expansion rules. + +## Rules (in order) + +1. Convert to uppercase. +2. Trim leading and trailing whitespace. +3. 
Collapse all internal whitespace runs to a single space. +4. Remove these punctuation characters exactly: + - `.` + - `,` + - `'` + - `-` + +## Explicitly Out of Scope + +- Abbreviation expansion (for example `ST` -> `STREET`). +- Language-aware equivalence. +- Article removal. +- Any fuzzy matching. + +If any of the above is required, it belongs to a separate Phase 2 normalization function. diff --git a/docs/spec/phase_1/spec.md b/docs/spec/phase_1/spec.md index 284b53a..1e683eb 100644 --- a/docs/spec/phase_1/spec.md +++ b/docs/spec/phase_1/spec.md @@ -1,264 +1,298 @@ # Implementation Spec -Open Data Street Inference Import Pipeline (MVP) +Open Data Street Inference Import Pipeline (Phase 1) Datasets: ONSUD + OS Open UPRN + OS Open Roads -Purpose: Build a reproducible, versioned import and transformation pipeline that produces a UPRN to inferred street mapping using only open datasets. - ## 1. Objectives Primary objective: -- Ingest 3 open datasets -- Normalise and join them -- Produce a deterministic, rebuildable derived table: - - `UPRN -> postcode -> nearest named road -> confidence score` - -Secondary objectives: -- Full dataset versioning and provenance -- Deterministic rebuilds -- Metrics collection for quality auditing -- No mutation of raw data after ingest - -Non-goals: -- No address enumeration -- No PAF or AddressBase -- No PPD or EPC -- No serving layer - -## 2. Technology Stack - -Language: -- Python 3.11+ - -Database: -- PostgreSQL + PostGIS extension - -Core libraries: -- psycopg (database) -- SQLAlchemy (optional) -- pandas (CSV handling) -- geopandas (if needed for geometry) -- shapely -- pyproj -- click or argparse (CLI) - -Notes: -- Spatial processing must be delegated to PostGIS where possible -- Avoid loading large geometries into Python memory - -## 3. 
Directory Structure - -- `pipeline/` - - `pyproject.toml` - - `src/` - - `cli.py` - - `config.py` - - `datasets/` - - `onsud.py` - - `open_uprn.py` - - `open_roads.py` - - `ingest/` - - `raw_load.py` - - `transform/` - - `build_core.py` - - `build_derived.py` - - `metrics.py` - - `util/` - - `normalise.py` - - `hashing.py` - - `sql/` - - `schema.sql` - - `indexes.sql` -- `data/` - - `raw/` - - `onsud/` - - `open_uprn/` - - `open_roads/` - -## 4. Dataset Specifications - -## 4.1 ONSUD (ONS UPRN Directory) - -Purpose: -- UPRN to postcode backbone - -Required fields: -- UPRN -- Postcode (unit level) - -Import requirements: -- Load full dataset -- Do not filter rows during raw ingest -- Preserve all original columns - -Derived extraction (`core.uprn_postcode`): -- `uprn` (bigint primary key) -- `postcode_norm` (text) -- `onsud_release_id` (text) - -## 4.2 OS Open UPRN - -Purpose: -- UPRN to coordinates - -Required fields: -- UPRN -- Easting -- Northing -- Latitude -- Longitude - -Derived extraction (`core.uprn_point`): -- `uprn` (bigint primary key) -- `geom` (Point, SRID 4326) -- `lat` -- `lon` -- `easting` -- `northing` -- `open_uprn_release_id` - -## 4.3 OS Open Roads - -Purpose: -- Named road geometries for nearest-neighbour inference - -Required: -- Geometry (LineString or MultiLineString) -- Road name field - -Derived extraction (`core.road_segment`): -- `segment_id` (bigserial primary key) -- `name_display` (text nullable) -- `name_norm` (text nullable) -- `geom` (geometry) -- `open_roads_release_id` - -## 5. Versioning and Provenance - -Table: `meta.dataset_release` - -Fields: -- `dataset_key` (`onsud|open_uprn|open_roads`) -- `release_id` -- `source_url` -- `sha256` -- `retrieved_at` -- `licence` -- `file_path` +- Ingest three open datasets. +- Build deterministic core and derived tables. +- Produce: `UPRN -> postcode -> nearest named road -> confidence score`. + +Quality objectives: +- Deterministic rebuilds from identical inputs. 
+- Full provenance by dataset release identifiers. +- Programmatic gate checks at every stage. +- No hidden state and no implicit defaults. + +### 1.1 Change Note + +- 2026-02-20: Nearest-road implementation switched from GiST KNN (`<->`) to deterministic `ST_DWithin + ST_Distance` ordering due to a reproducible PostgreSQL/PostGIS runtime failure (`index returned tuples in wrong order`) on national-scale data. +- 2026-02-20: Added explicit Phase 1 resume checkpoints (`meta.release_set_stage_checkpoint`) and `pipeline run phase1 --resume` stage restart semantics. + +## 2. Scope and Non-goals -Table: `meta.pipeline_run` +Phase 1 includes: +- Dataset ingest and registration. +- Core table construction. +- Spatial nearest named road inference. +- Distance-based confidence scoring. +- Metrics and canonical hash persistence. -Fields: -- `run_id` (uuid) -- `started_at` -- `finished_at` -- `status` -- `release_map` (json) -- `log_path` +Phase 1 excludes: +- PPD, EPC, LLM logic. +- API/serving layer. +- Enumeration endpoints. +- NI data integration. + +## 3. Operational Model + +The workflow is explicit and separated: +- Ingest commands populate ingest-layer tables only. +- Build command populates release-schema core/derived tables only. +- Activate command repoints stable views only. + +### 3.1 CLI Contract + +- `pipeline db migrate` +- `pipeline ingest onsud --manifest <manifest_path>` +- `pipeline ingest open-uprn --manifest <manifest_path>` +- `pipeline ingest open-roads --manifest <manifest_path>` +- `pipeline release create --onsud-release <release_id> --open-uprn-release <release_id> --open-roads-release <release_id>` +- `pipeline run phase1 --release-set-id <release_set_id> [--resume] [--rebuild]` +- `pipeline release activate --release-set-id <release_set_id> --actor <actor>` Rules: -- Every derived build references exact `release_id` values -- Raw data is immutable -- Rebuilds must be deterministic +- `pipeline run phase1` never performs ingest work. +- If release set status is `built` and `--rebuild` is absent, `run phase1` is a no-op. 
+- `--resume` continues only incomplete build stages for that release set using persisted stage checkpoints. +- Without `--resume`, a `created` release set starts as a clean build (drop/recreate release tables, clear checkpoints). +- `--resume` and `--rebuild` are mutually exclusive. +- Checkpoints are written after each table build boundary (not only after aggregate phases). + +## 4. Data Model + +## 4.1 `meta` schema + +### `meta.dataset_release` +Required fields include: +- `dataset_key`, `release_id` (composite key) +- `source_url`, `licence`, `file_path` +- `expected_sha256`, `actual_sha256` +- `retrieved_at`, `manifest_json` +- `source_row_count`, `loaded_row_count` (CSV datasets) +- `source_feature_count`, `loaded_feature_count` (Open Roads) +- `source_layer_name`, `srid_confirmed` + +### `meta.pipeline_run` +Tracks run start/end, status, stage, release set linkage. + +### `meta.release_set_stage_checkpoint` +Persisted build checkpoints for resumable Phase 1 runs: +- `release_set_id`, `stage_name` (composite key) +- `run_id`, `completed_at` + +Allowed `stage_name` values: +- `release_tables_created` +- `core_uprn_postcode_built` +- `core_uprn_point_built` +- `core_road_segment_built` +- `derived_uprn_street_spatial_built` +- `metrics_stored` +- `canonical_hashes_stored` +- `release_marked_built` + +### `meta.release_set` +- `release_set_id` +- `onsud_release_id`, `open_uprn_release_id`, `open_roads_release_id` +- `physical_schema`, `status` +- Hard uniqueness constraint: + - `UNIQUE (onsud_release_id, open_uprn_release_id, open_roads_release_id)` + +### `meta.release_activation_log` +Audit record for view promotion actions (`who`, `when`, `from`, `to`). + +### `meta.dataset_metrics` +Metric key-value records linked to `run_id` and `release_set_id`. 
+ +### `meta.canonical_hash` +- `release_set_id` +- `object_name` +- `projection` (ordered JSON array of columns) +- `row_count` +- `sha256` +- `computed_at` +- `run_id` + +Primary key: +- `(release_set_id, object_name, run_id)` + +Allowed `object_name` values are locked: +- `core_uprn_postcode` +- `core_uprn_point` +- `core_road_segment` +- `derived_uprn_street_spatial` + +## 4.2 `raw` schema + +### `raw.onsud_row` +- `dataset_key`, `release_id`, `source_row_num` +- `uprn`, `postcode` +- `extras_jsonb` + +### `raw.open_uprn_row` +- `dataset_key`, `release_id`, `source_row_num` +- `uprn`, `latitude`, `longitude`, `easting`, `northing` +- `extras_jsonb` + +## 4.3 `stage` schema + +### `stage.open_roads_segment` (locked contract) +Required columns: +- `dataset_key text not null` (must be `open_roads`) +- `release_id text not null` +- `segment_id bigint not null` (ingest-generated, deterministic within release) +- `name_display text` +- `name_norm text` +- `geom_bng geometry(MultiLineString,27700) not null` + +Required constraints: +- `CHECK (dataset_key = 'open_roads')` +- `UNIQUE (release_id, segment_id)` +- `NOT NULL release_id` + +Required indexes: +- btree on `(release_id)` +- GiST on `geom_bng` + +Build linkage rule: +- `pipeline run phase1` must read only rows where: + - `stage.open_roads_segment.release_id = meta.release_set.open_roads_release_id` + +## 4.4 Versioned physical schemas and stable views + +For each release set, build into `rs_` tables: +- `core_uprn_postcode` +- `core_uprn_point` +- `core_road_segment` +- `derived_uprn_street_spatial` + +Stable consumer views: +- `core.uprn_postcode` +- `core.uprn_point` +- `core.road_segment` +- `derived.uprn_street_spatial` + +Only `pipeline release activate` may repoint these views. + +## 5. Ingest Rules + +General ingest rules: +- Manifest-driven field mapping; no schema guessing. +- SHA256 mismatch: hard fail. +- Duplicate `(dataset_key, release_id)` with different hash: hard fail. 
+- Duplicate `(dataset_key, release_id)` with same hash: no-op with clear log. +- Missing required mapped columns: hard fail. + +Open Roads ingest rule: +- `pipeline ingest open-roads` handles loading into `stage.open_roads_segment` and persists source/loaded feature counts. + +## 6. Spatial Inference Rules + +- Metric spatial ops use BNG only (`SRID 27700`). +- Distance calculations never use WGS84 geometry. +- Validity gate validates `geom_bng` specifically. + +Nearest-road query contract: +- Candidate roads are filtered with `ST_DWithin(point, road, radius_m)`. +- Candidate ordering is `ST_Distance(point, road) ASC, segment_id ASC`. +- GiST KNN operator (`<->`) is not used in Phase 1 runtime queries. + +Deterministic tie-breaking: +1. Distance ascending. +2. `segment_id` ascending. + +Confidence score bands are fixed: +- `<=15m => 0.70` +- `<=30m => 0.55` +- `<=60m => 0.40` +- `<=150m => 0.25` +- `>150m => 0.00` + +No named road within radius: +- `method = 'none_within_radius'` +- `confidence_score = 0.00` -## 6. Normalisation Rules +## 7. Normalisation Rules `postcode_norm`: - Uppercase - Remove spaces - Remove non-alphanumeric -`street_norm`: +`name_norm` (Phase 1 minimal and frozen): - Uppercase -- Trim -- Collapse whitespace -- Preserve original name in `street_display` - -`UPRN`: -- Cast to bigint after validation - -## 7. Ingest Workflow - -Step 1: Register dataset release -- Compute SHA256 of archive -- Insert into `meta.dataset_release` - -Step 2: Load raw table -- Use `COPY` for CSV -- Use `ogr2ogr` or PostGIS loader for shapefiles -- Record row counts - -Step 3: Build core tables -- Extract required fields -- Apply normalisation -- Create indexes - -Step 4: Validate joins -- Count UPRNs in ONSUD -- Count matching UPRNs in Open UPRN -- Report coverage percentage - -## 8. Street Inference Algorithm (Phase 1) - -Goal: -- Assign nearest named road to each UPRN - -Process: -1. Join `core.uprn_postcode` with `core.uprn_point` -2. 
For each UPRN with coordinates: - - Use PostGIS KNN operator (`<->`) to find nearest road segment - - Filter to segments with non-null `name_display` - - Compute `ST_Distance` in metres -3. Apply search radius threshold (default `150m`) -4. Assign: - - `street_display` - - `street_norm` - - `distance_m` - - `method = 'open_roads_nearest'` - - `confidence_score` (distance-based banding) -5. Insert into `derived.uprn_street_spatial` - -Confidence bands: -- `<= 15m` -> `0.70` -- `<= 30m` -> `0.55` -- `<= 60m` -> `0.40` -- `<= 150m` -> `0.25` -- `> 150m` -> `0.00` - -If no named road within radius: -- `street_display = NULL` -- `confidence_score = 0.00` -- `method = 'none_within_radius'` +- Trim leading/trailing whitespace +- Collapse internal whitespace to single spaces +- Strip punctuation: `.` `,` `'` `-` + +## 8. Metrics Definitions + +Set definitions are fixed: +- `total_uprns_onsud` = count of non-null UPRNs in `raw.onsud_row` for the release +- `uprns_with_coordinates` = count of distinct UPRNs present in both release core postcode and core point tables +- `uprns_resolved_named_road` = count of UPRNs in derived table where `method='open_roads_nearest'` + +Formulas are fixed: +- `coordinate_coverage_pct = uprns_with_coordinates / total_uprns_onsud * 100` +- `resolution_pct = uprns_resolved_named_road / uprns_with_coordinates * 100` + +## 9. Gate Criteria (Programmatic) + +Registration gate: +- dataset release row exists with expected hash values. + +CSV ingest gate: +- source data-row count (header excluded) equals loaded row count exactly. + +Open Roads gate: +- source feature count equals loaded feature count. +- all `geom_bng` valid. +- `SRID = 27700`. + +Loaded feature count query is locked: -## 9. 
Metrics Collection +```sql +SELECT COUNT(*) AS loaded_feature_count +FROM stage.open_roads_segment +WHERE release_id = :open_roads_release_id; +``` -Compute after build: -- Total UPRNs (ONSUD) -- UPRNs with coordinates -- Coordinate coverage percentage -- UPRNs resolved to named road -- Resolution percentage -- Distance percentiles (P50, P90, P99) +For PostgreSQL/psycopg execution, the parameter style may be adapted, but the logical query must be identical. -Insert into `meta.dataset_metrics` +Core gate: +- core table row counts are non-zero. +- join coverage is computed and logged. -## 10. CLI Contract +Spatial gate: +- resolution percent and distance P50/P90/P99 logged. +- no NULL `method` values. -- `pipeline ingest onsud --release-id --file ` -- `pipeline ingest open-uprn --release-id --file ` -- `pipeline ingest open-roads --release-id --file ` +Metrics gate: +- all mandatory metric keys persisted for the run. -- `pipeline build core --onsud --open-uprn --open-roads ` +Activation gate: +- stable views point to new release set after activation. +- activation log row exists. -- `pipeline build derived street-spatial --onsud --open-uprn --open-roads ` +## 10. Canonical Hash Rules -- `pipeline metrics compute --onsud --open-uprn --open-roads ` +- Canonical hash ordering uses stable key ordering only: + - `core_uprn_postcode`: `ORDER BY uprn ASC` + - `core_uprn_point`: `ORDER BY uprn ASC` + - `core_road_segment`: `ORDER BY segment_id ASC` + - `derived_uprn_street_spatial`: `ORDER BY uprn ASC` +- Never rely on text-collation ordering for canonical hash row order. +- Projection definition used for each hash must be persisted. -## 11. Acceptance Criteria +## 11. 
Test Requirements -- Rebuild produces identical row counts and deterministic output -- 95%+ of UPRNs with coords resolve to some named road (subject to dataset reality) -- All tables indexed appropriately -- No raw data mutation after import -- All outputs traceable to `release_id` values \ No newline at end of file +Required tests include: +- Two Open Roads releases in staging do not mix by `release_id`. +- Duplicate `(release_id, segment_id)` in staging fails. +- `loaded_feature_count` is sourced from locked stage query and persisted in metadata. +- Deterministic tie-break fixture for equal-distance roads. +- Activation rollback safety test for failed transaction. +- Reproducibility test: same inputs produce same canonical hashes. diff --git a/docs/spec/phase_2-open-names/changes.md b/docs/spec/phase_2-open-names/changes.md new file mode 100644 index 0000000..588945d --- /dev/null +++ b/docs/spec/phase_2-open-names/changes.md @@ -0,0 +1,97 @@ +## 2026-02-20 — CHG-0001 + +What changed: +- Locked Voronoi hull buffer as code-level constant: + - `pipeline.config.VORONOI_HULL_BUFFER_M = 20000.0` +- Added explicit Voronoi SQL contract requiring bound parameter usage for `hull_buffer_m`. + +Why it changed: +- Prevent silent drift in Voronoi clipping behavior that would invalidate deterministic outputs and canonical hashes. + +Before behavior: +- Buffer value existed only in planning/docs context and could be inlined ad hoc in SQL. + +After behavior: +- Buffer value is governed by a named constant in code. +- SQL contract uses parameter binding for buffer application. +- Governance requirements are explicit for any constant change. + +Observed / expected metric impact: +- No immediate metric change at lock time. +- Future constant changes are expected to affect enumeration coverage and must be measured. + +Determinism confirmation: +- Fixed constant + fixed seed set yields stable clipped geometry. +- Determinism validated through contract tests. 
+ +Spec update confirmation: +- Updated `docs/spec/phase_2-open-names/spec.md` and added `docs/spec/phase_2-open-names/voronoi_method.md`. + +## 2026-02-20 — CHG-0002 + +What changed: +- Implemented full Phase 2 build flow: + - `pipeline run phase2-open-names` + - mandatory Open Names release on `pipeline release create` + - checkpointed/resumable stage sequence through warnings and canonical hashes +- Added Phase 2 core/derived release-schema tables: + - `core_open_names_entry` + - `core_postcode_unit_seed` + - Phase 2 shape of `derived_uprn_street_spatial` + - `derived_postcode_street` +- Added activation warning gate with auditable acknowledgement (`--ack-warnings`). +- Added Phase 2 metrics and canonical hash set. +- Added Phase 2 E2E fixture scripts. + +Why it changed: +- Move from Phase 1 baseline to two-source reconciliation and postcode street enumeration while retaining deterministic, resumable operations. + +Before behavior: +- Build/runtime supported Phase 1 only. +- No Open Names build stages, no warnings gate, no enumeration output. + +After behavior: +- Phase 2 pipeline runs end-to-end with explicit checkpoints and activation gating. + +Observed / expected metric impact: +- New metrics expose corroboration, replacement, disagreement, and enumeration coverage. + +Determinism confirmation: +- Canonical hashes extended to Phase 2 objects. +- Ordering rules for hash projections use deterministic keys / `COLLATE "C"` where text sorting is required. 
+ +Spec update confirmation: +- Updated: + - `docs/spec/phase_2-open-names/spec.md` + - `docs/spec/phase_2-open-names/prd.md` + +## 2026-02-20 — CHG-0003 + +What changed: +- Replaced hard-fail duplicate-seed rule in `core_postcode_unit_seed` with deterministic representative seed derivation per postcode unit: + - `AVG(postcode_unit_easting::numeric)` + - `AVG(postcode_unit_northing::numeric)` +- Added diagnostics metrics: + - `postcode_unit_seed_multi_coord_count` + - `postcode_unit_seed_max_distinct_coords` + +Why it changed: +- Real ONSUD release `ONSUD_NOV_2025` contains widespread multi-coordinate postcode units, so hard-failing on duplicates blocks all national builds. + +Before behavior: +- Build failed on first postcode unit with >1 distinct seed coordinate pair. + +After behavior: +- Build derives one deterministic seed per postcode unit and proceeds. +- Multiplicity is measured and persisted as quality diagnostics. + +Observed / expected metric impact: +- Build completion becomes possible on real national data. +- New metrics expose seed multiplicity scale for monitoring. + +Determinism confirmation: +- Numeric-average aggregation is deterministic for fixed input rows. +- No row-order-dependent seed selection is used. + +Spec update confirmation: +- Updated `docs/spec/phase_2-open-names/spec.md`. diff --git a/docs/spec/phase_2-open-names/prd.md b/docs/spec/phase_2-open-names/prd.md index 811300b..23594a7 100644 --- a/docs/spec/phase_2-open-names/prd.md +++ b/docs/spec/phase_2-open-names/prd.md @@ -1,375 +1,100 @@ -# PRD: OS Open Names Integration +# PRD: Phase 2 Open Names Augmentation -**Status:** Proposed -**Phase:** 1.5 — post Phase 1 pipeline stable -**Author:** Jamie -**Date:** February 2026 +**Phase:** 2 +**Date:** February 20, 2026 ---- +## Goal -## Problem +Phase 2 adds Open Names as a second signal to Phase 1 street inference and adds postcode-level street enumeration. 
-The Phase 1 street inference pipeline assigns a street name to every UPRN by finding the nearest named road segment in OS Open Roads within a 150m radius. This approach works well but has two known failure modes that Open Names can directly address. +Deliverables: -**Failure mode 1 — road numbers instead of street names.** -OS Open Roads sometimes labels road segments with their road number rather than their colloquial name. A UPRN on the A40 through Acton receives `A40` as its street name from Open Roads. The street residents know is `Western Avenue`. This produces technically correct but practically useless street names for any UPRN on a numbered road. +1. Improve UPRN street assignment quality with two-source reconciliation. +2. Persist transparent provenance and disagreement signals. +3. Build deterministic postcode street listings for unit/sector/district/area. +4. Keep the build resumable and operationally auditable at national scale. -**Failure mode 2 — unverified single-source inference.** -The current pipeline has one signal per UPRN. There is no way to distinguish a confident correct inference from a confident wrong inference — the confidence score reflects distance to road geometry only, not whether the name is actually right. A UPRN 12m from a road segment gets a 0.70 confidence score whether or not that road is the street the UPRN is addressed to. +## Product Decisions (Locked) -Open Names is a purpose-built street and place gazetteer. It carries street names as primary facts rather than geometry labels. Adding it as a second signal addresses both problems. +- Open Names is mandatory in Phase 2 release creation. +- Ingest is explicit; `run phase2-open-names` does no ingest. +- Build is checkpointed and resumable. +- Warning acknowledgement is required for activation when disagreement rate is high. +- Voronoi clipping uses code constant `VORONOI_HULL_BUFFER_M` via SQL parameter binding. ---- +## User Outcomes -## Goals +Pipeline operators can: -1. 
Detect and replace road number labels (`A40`, `B1234`) with colloquial street names from Open Names where available. -2. Introduce a corroboration signal — a boolean indicating that Open Roads and Open Names agree on the street name for a given UPRN. -3. Store both source names in the pipeline output so the resolution method is transparent and auditable. -4. Do not degrade Phase 1 street inference results. Open Names is augmentation, not replacement. Where Open Names has no entry, Phase 1 behaviour is unchanged. -5. Remain entirely on OGL data. No new licences. +- ingest four datasets explicitly +- create a release set deterministically +- run/repair builds with `--resume` +- inspect warnings before activation +- activate only with explicit acknowledgement when required ---- +Downstream consumers can: -## Non-goals - -- Replacing the spatial KNN inference approach. Open Roads geometry remains the primary mechanism. -- Full address completion or delivery-point resolution. This is still street-level inference. -- Northern Ireland. Deferred pending CRS confirmation gate, same as all NI work. -- Abbreviation expansion or cross-source string equivalence matching. That is Phase 2 `street_equivalence_norm` work, not this. - ---- - -## Data source - -**OS Open Names** -Publisher: Ordnance Survey -Licence: OGL v3.0 -Format: CSV (zipped) or GeoPackage -Coverage: Great Britain -Update frequency: Approximately six-monthly -URL: `osdatahub.os.uk/downloads/open/OpenNames` - -Relevant fields: - -| Field | Description | -|-------|-------------| -| `ID` | Unique identifier for the named place entry | -| `NAMES_URI` | URI identifier | -| `NAME1` | Primary name | -| `NAME1_LANG` | Language of NAME1 (blank = English, `wel` = Welsh) | -| `NAME2` | Secondary name (e.g. Welsh form where NAME1 is English) | -| `TYPE` | Top-level type: `transportNetwork`, `populatedPlace`, etc. | -| `LOCAL_TYPE` | Specific type: `Road`, `Named Road`, `Numbered Road`, `Street`, etc. 
| -| `GEOMETRY_X` | Easting (BNG, SRID 27700) | -| `GEOMETRY_Y` | Northing (BNG, SRID 27700) | -| `POSTCODE_DISTRICT` | Postcode district associated with this entry | -| `POPULATED_PLACE` | Associated settlement name | -| `DISTRICT_BOROUGH` | Administrative district | -| `COUNTY_UNITARY` | County or unitary authority | - -For street inference, filter to entries where `LOCAL_TYPE` IN (`Road`, `Named Road`, `Street`). Exclude `Numbered Road` entries — these are the road number labels you are trying to replace, not the names you want. - ---- - -## Schema changes - -### New ingest table - -Add to the manifest-driven ingest system. Open Names is a first-class dataset with its own `dataset_key`, release tracking, SHA256 verification, and `meta.dataset_release` row. - -```sql --- raw.open_names_row -CREATE TABLE raw.open_names_row ( - id bigserial PRIMARY KEY, - dataset_key text NOT NULL, - release_id text NOT NULL, - source_row_num bigint NOT NULL, - entry_id text, - name1 text, - name1_lang text, - name2 text, - type text, - local_type text, - geometry_x double precision, - geometry_y double precision, - postcode_district text, - populated_place text, - extras_jsonb jsonb, - FOREIGN KEY (dataset_key, release_id) - REFERENCES meta.dataset_release (dataset_key, release_id) -); -``` - -### New core table - -Add `open_names_entry` to the versioned physical schema per release set, alongside `uprn_postcode`, `uprn_point`, and `road_segment`. 
- -```sql --- rs_.open_names_entry -CREATE TABLE open_names_entry ( - id bigserial PRIMARY KEY, - entry_id text NOT NULL, - name_display text NOT NULL, - name_norm text NOT NULL, - name2_display text, - name2_norm text, - local_type text NOT NULL, - geom_bng geometry(Point, 27700) NOT NULL, - postcode_district text, - populated_place text -); - -CREATE INDEX ON open_names_entry USING GIST (geom_bng); -CREATE INDEX ON open_names_entry (name_norm); -CREATE INDEX ON open_names_entry (postcode_district); -``` - -Only rows where `local_type IN ('Road', 'Named Road', 'Street')` are loaded into this table. All other Open Names entry types are discarded at load time. This is enforced in the loader, not via a view filter. - -### Changes to `uprn_street_spatial` - -The following fields are added. All are nullable — absence means Open Names had no entry for this UPRN. - -```sql --- New columns on rs_.uprn_street_spatial - -street_open_roads text, -- raw name from Open Roads segment (was: street_display) -street_open_names text, -- raw name from nearest Open Names entry (nullable) -street_display text, -- final resolved display name -street_norm text, -- name_norm applied to street_display -name_source text, -- 'open_roads' | 'open_names' | 'corroborated' -corroborated boolean, -- true if Open Roads and Open Names agree -open_names_distance_m double precision -- distance from UPRN to Open Names entry point (nullable) -``` - -`street_display` and `street_norm` remain the stable consumer-facing fields. The resolution method is transparent via `name_source` and both raw source values are preserved. 
- -### Release set manifest - -`meta.release_set` gains a fourth release ID column: - -```sql -ALTER TABLE meta.release_set - ADD COLUMN open_names_release_id text REFERENCES meta.dataset_release(release_id); -``` - -The unique constraint is updated: - -```sql -ALTER TABLE meta.release_set - DROP CONSTRAINT uq_release_set_inputs; - -ALTER TABLE meta.release_set - ADD CONSTRAINT uq_release_set_inputs - UNIQUE (onsud_release_id, open_uprn_release_id, open_roads_release_id, open_names_release_id); -``` - -Open Names is optional for backwards compatibility with Phase 1 release sets — the column is nullable. A release set with `open_names_release_id IS NULL` behaves exactly as Phase 1 and produces `street_open_names = NULL`, `name_source = 'open_roads'`, `corroborated = false` throughout. - ---- - -## CLI changes - -New ingest command: - -``` -pipeline ingest open-names --manifest -``` - -Follows the same pattern as existing ingest commands. Manifest schema is identical. - -`pipeline release-set create` gains an optional argument: - -``` -pipeline release-set create \ - --onsud-release \ - --open-uprn-release \ - --open-roads-release \ - --open-names-release # optional -``` - -`pipeline build derived street-spatial` is updated to use Open Names if the release set includes it. If `open_names_release_id` is null, behaviour is identical to Phase 1. - -`pipeline run phase1` is unchanged. Open Names augmentation runs as part of the spatial inference stage when present — it is not a separate pipeline stage. - ---- - -## Resolution logic - -The reconciliation logic runs per UPRN after the Open Roads KNN join. It is deterministic and documented here as the executable specification. - -### Step 1 — Open Roads result +- read final street name + source-specific street names +- inspect corroboration/disagreement indicators +- query streets by postcode hierarchy level -Proceed as Phase 1. For each UPRN, find the nearest named road segment in Open Roads within 150m. 
Assign `street_open_roads` and `confidence_score` per existing confidence bands. If no road within 150m, set `method = 'none_within_radius'` and skip to output with no Open Names lookup. +## Core Behaviour -### Step 2 — Open Names lookup +### UPRN reconciliation -For each UPRN that has an Open Roads result, find the nearest `open_names_entry` within a search radius. +For each UPRN with an Open Roads match: -Search radius for Open Names lookup: **200m**. This is wider than the Open Roads radius because Open Names representative points are centroids of named streets, not road edge geometry. A street centroid may be further from a UPRN at the end of the street than the road segment edge is. +- nearest Open Names in range is considered +- numbered-road labels can be replaced by Open Names names +- corroboration is recorded when normalized names agree +- disagreements are preserved, not auto-corrected -```sql -SELECT - entry_id, - name_display, - name_norm, - ST_Distance(geom_bng, $uprn_geom_bng) AS distance_m -FROM open_names_entry -WHERE ST_DWithin(geom_bng, $uprn_geom_bng, 200) -ORDER BY geom_bng <-> $uprn_geom_bng, entry_id -LIMIT 1; -``` +For UPRNs without Open Roads match: -If no Open Names entry within 200m, set `street_open_names = NULL` and proceed to output with `name_source = 'open_roads'`. 
+- method remains unresolved (`none_within_radius`) +- Open Names distance/name fields are null -### Step 3 — Numbered road detection +### Enumeration -Check whether the Open Roads name matches the numbered road pattern: +`derived_postcode_street` is a single normalized table keyed by: -```python -import re -NUMBERED_ROAD = re.compile(r'^[AaBbMm]\d+', re.IGNORECASE) +- `(postcode_level, postcode_value_norm, entry_id)` -is_numbered = bool(NUMBERED_ROAD.match(street_open_roads_norm)) -``` +Association methods: -### Step 4 — Reconciliation +- `district_direct` +- `spatial_voronoi` -``` -IF street_open_names IS NULL: - street_display = street_open_roads - name_source = 'open_roads' - corroborated = false +Deterministic precedence: -ELSE IF is_numbered AND street_open_names IS NOT NULL: - street_display = street_open_names - name_source = 'open_names' - corroborated = false - -- numbered road replaced by Open Names name +- `district_direct` wins over `spatial_voronoi` -ELSE IF name_norm(street_open_roads) == name_norm(street_open_names): - street_display = street_open_roads -- prefer Open Roads form for display - name_source = 'corroborated' - corroborated = true +## Quality and Operations -ELSE: - -- genuine disagreement between named sources - street_display = street_open_roads -- Open Roads is primary - name_source = 'open_roads' - corroborated = false - -- disagreement is visible via street_open_names != NULL and corroborated = false -``` +Mandatory metrics include: -Genuine disagreements are not resolved automatically. They are preserved in the output and flagged for the LLM review pipeline (Phase 2). The consumer sees the Open Roads name. The disagreement is visible and auditable. +- Phase 1 coverage and distance metrics +- corroboration/replacement/disagreement metrics +- postcode-units with/without streets -### Step 5 — Output +Activation gate: -Apply `name_norm` to `street_display` to produce `street_norm`. Populate all fields. 
`computed_at` timestamp as per Phase 1. +- if `disagreement_pct > 5%`, warning must be acknowledged ---- +Determinism checks: -## Confidence model +- canonical hashes stored for all Phase 2 core/derived objects +- same inputs must reproduce identical hashes -The distance-based confidence score from Phase 1 is unchanged. Open Names corroboration is expressed separately as a boolean, not folded into the numeric score. This keeps the two signals independent and interpretable. - -A consumer who wants to weight corroborated results more highly can do so in their own logic. The API will expose `corroborated` as a field on the street enrichment response. - -The only confidence score modification: if `name_source = 'open_names'` (numbered road replacement), the confidence score is set to the Open Names distance band rather than the Open Roads distance band: - -``` -open_names_distance_m ≤ 50m → 0.70 -open_names_distance_m ≤ 100m → 0.55 -open_names_distance_m ≤ 200m → 0.40 -``` - -These bands are wider than Open Roads bands because Open Names point geometry is a centroid, not an edge. This is documented in the confidence model specification. 
- ---- - -## New metrics - -Add to `meta.dataset_metrics` for any release set that includes Open Names: - -| Metric key | Description | -|------------|-------------| -| `open_names_entries_loaded` | Total Open Names entries loaded into `open_names_entry` | -| `uprns_corroborated` | UPRNs where Open Roads and Open Names agree | -| `corroboration_pct` | `uprns_corroborated` / `uprns_resolved_named_road` × 100 | -| `uprns_numbered_road_replaced` | UPRNs where Open Roads number was replaced by Open Names name | -| `numbered_road_replacement_pct` | `uprns_numbered_road_replaced` / total resolved UPRNs × 100 | -| `uprns_genuine_disagreement` | UPRNs where both sources present but names differ | -| `disagreement_pct` | `uprns_genuine_disagreement` / total resolved UPRNs × 100 | - -The `disagreement_pct` metric is the key quality signal for this feature. A high disagreement rate indicates either a data quality issue or a flaw in the reconciliation logic and should trigger investigation before the release set is activated. - ---- - -## Gate criteria - -| Gate | Pass condition | -|------|---------------| -| Open Names ingest | `ogrinfo`-reported feature count for selected entries equals loaded `open_names_entry` count. Both counts recorded. | -| Open Names content | `open_names_entry` contains only `local_type IN ('Road', 'Named Road', 'Street')` rows. Verify with count query. | -| Spatial inference | All existing Phase 1 gate criteria pass unchanged. | -| New metrics | All new metric keys present in `meta.dataset_metrics` for release sets with Open Names. | -| Disagreement rate | `disagreement_pct` logged. If > 5%, emit a warning and require explicit confirmation before activation. Not a hard block but must be acknowledged. | -| Backwards compatibility | A release set with `open_names_release_id = NULL` produces output identical to Phase 1 canonical hashes. Verified by test. 
| - ---- - -## Test cases - -**Numbered road replacement:** -UPRN fixture with nearest Open Roads segment = `A40`. Open Names entry `Western Avenue` within 200m. Expected output: `street_display = 'Western Avenue'`, `name_source = 'open_names'`, `corroborated = false`. - -**Corroboration:** -UPRN fixture with Open Roads = `HIGH STREET` and Open Names nearest entry = `High Street`. After `name_norm` both resolve to `HIGH STREET`. Expected: `corroborated = true`, `name_source = 'corroborated'`, `street_display = 'HIGH STREET'`. - -**Genuine disagreement:** -UPRN fixture where Open Roads = `BACK LANE` and Open Names nearest entry = `STATION ROAD`. Neither is a numbered road. Expected: `street_display = 'BACK LANE'`, `name_source = 'open_roads'`, `corroborated = false`, `street_open_names = 'Station Road'`. - -**No Open Names entry:** -UPRN fixture where no Open Names entry within 200m. Expected: `street_open_names = NULL`, `name_source = 'open_roads'`, `corroborated = false`. Confidence score unchanged from Phase 1. - -**No Open Roads result:** -UPRN fixture with `method = 'none_within_radius'`. Open Names lookup does not run. All Open Names fields NULL. - -**Backwards compatibility:** -Release set with `open_names_release_id = NULL`. Full pipeline run. Canonical hashes must match equivalent Phase 1 release set hashes exactly. - -**Welsh name:** -UPRN fixture in a Welsh postcode where Open Names entry has `NAME1_LANG = 'wel'`. Verify `name_display` preserves Welsh form. Verify `name_norm` applies the same rules as English names (uppercase, trim, collapse whitespace, explicit punctuation strip). No translation or substitution. 
- ---- - -## Documentation deliverables - -- `/Users/jamie/code/postcod.es/docs/spec/phase_1/open_names.md` — this PRD, updated to reflect final implementation decisions -- `/Users/jamie/code/postcod.es/docs/spec/phase_1/name_norm.md` — updated to confirm `name_norm` rules apply identically to Open Names and Open Roads names -- `/Users/jamie/code/postcod.es/docs/spec/phase_1/confidence_model.md` — updated to document Open Names distance bands and the numbered road replacement case -- `/Users/jamie/code/postcod.es/docs/spec/data_sources.md` — Open Names added as a data source with licence, URL, update frequency, and field mapping - ---- - -## Out of scope - -- `street_equivalence_norm` for cross-source matching. Phase 2. -- LLM-assisted review of genuine disagreements. Phase 2. -- Welsh/English name equivalence resolution. Phase 2. -- Any use of Open Names `populatedPlace`, `districtBorough`, or `countyUnitary` fields for administrative geography. ONSUD already provides this more reliably. -- Open Names entries with `LOCAL_TYPE` other than `Road`, `Named Road`, `Street`. Locality names, settlements, and water features are out of scope for street inference. - ---- - -## Open questions - -**Search radius for Open Names lookup.** 200m is proposed based on the characteristic that Open Names points are street centroids rather than road edges. This may need tuning after first data run. The metric `open_names_distance_m` is stored precisely so the radius can be evaluated empirically and adjusted in a subsequent release. - -**Multiple Open Names entries within radius.** The spec takes the single nearest entry. It is possible that two named streets are equidistant from a UPRN and the nearest Open Names entry is the wrong one. For Phase 1.5 this is acceptable — the tie-breaking rule (distance then `entry_id`) is deterministic. If the disagreement rate metric surfaces this as a significant problem, a future release can introduce a candidate set approach. 
+## Non-goals -**Open Names update frequency.** OS publishes Open Names approximately every six months. This is less frequent than the Open Roads update cadence. A release set that updates Open Roads without updating Open Names will use a stale Open Names release. This is valid — the pipeline explicitly tracks which Open Names release is in each release set — but it means corroboration rates may drift between releases if the underlying data diverges. Monitor via `disagreement_pct` metric. +- API work +- NI support +- LLM adjudication +- heavy multilingual equivalence logic ---- +## References -*This PRD describes the intended behaviour. Any deviation during implementation must be recorded in `changes.md` with a rationale.* \ No newline at end of file +- `docs/spec/phase_2-open-names/spec.md` +- `docs/spec/phase_2-open-names/voronoi_method.md` +- `docs/spec/phase_2-open-names/changes.md` diff --git a/docs/spec/phase_2-open-names/spec.md b/docs/spec/phase_2-open-names/spec.md new file mode 100644 index 0000000..3a8e422 --- /dev/null +++ b/docs/spec/phase_2-open-names/spec.md @@ -0,0 +1,269 @@ +# Software Requirements Specification +## Phase 2: Open Names Augmentation + Postcode Street Enumeration + +**Document ID:** SRS-PIPELINE-002 +**Version:** 1.1 +**Date:** February 20, 2026 + +## 1. Scope Lock + +Phase 2 is a controlled extension of the existing pipeline. + +- Ingest remains explicit and manifest-driven. +- Build remains deterministic and resumable. +- Activation remains explicit and transactional. +- No serving/API work is included. + +Phase 2 is not optional in release composition: + +- `pipeline release create` requires all four releases: + - `onsud_release_id` + - `open_uprn_release_id` + - `open_roads_release_id` + - `open_names_release_id` + +## 2. 
CLI Contract + +- `pipeline ingest open-names --manifest <manifest_path>` +- `pipeline run phase2-open-names --release-set-id <release_set_id> [--rebuild] [--resume] [--open-roads-radius-m <metres>] [--open-names-radius-m <metres>]` +- `pipeline release activate --release-set-id <release_set_id> --actor <actor> [--ack-warnings]` + +Rules: + +- `pipeline run phase2-open-names` performs no ingest work. +- `--rebuild` and `--resume` are mutually exclusive. +- If a release set is already `built`/`active`, run is a no-op unless `--rebuild` is set. + +## 3. Build Checkpoints (Locked Order) + +1. `release_tables_created` +2. `core_uprn_postcode_built` +3. `core_uprn_point_built` +4. `core_road_segment_built` +5. `core_open_names_entry_built` +6. `core_postcode_unit_seed_built` +7. `derived_uprn_street_spatial_built` +8. `derived_postcode_street_built` +9. `metrics_stored` +10. `warnings_stored` +11. `canonical_hashes_stored` +12. `release_marked_built` + +`warnings_stored` is intentionally before `canonical_hashes_stored`. + +## 4. Core Data Model + +Physical tables are built in the release set's physical schema (`rs_*`). + +### 4.1 `core_open_names_entry` + +- `entry_id text primary key` +- `name_display text` +- `name_norm text` +- `name2_display text` +- `name2_norm text` +- `name1_lang text` +- `local_type text not null` +- `postcode_district_norm text` +- `geom_bng geometry(Point,27700) not null` +- `open_names_release_id text not null` + +### 4.2 `core_postcode_unit_seed` + +Built from ONSUD postcode-unit grid references. + +- `postcode_unit_norm text primary key` +- `postcode_sector_norm text` +- `postcode_district_norm text` +- `postcode_area_norm text` +- `easting double precision not null` +- `northing double precision not null` +- `geom_bng geometry(Point,27700) not null` +- `onsud_release_id text not null` + +Hard failures: + +- Missing seed for any postcode unit present in `core_uprn_postcode` + +Seed selection rule (locked): + +- ONSUD can contain multiple coordinate pairs per postcode unit in real releases.
+- `core_postcode_unit_seed` derives one deterministic representative seed per postcode unit using: + - `AVG(postcode_unit_easting::numeric)` + - `AVG(postcode_unit_northing::numeric)` +- This is deterministic, auditable, and avoids non-reproducible row-choice behavior. +- Diagnostic metrics track seed multiplicity (`postcode_unit_seed_multi_coord_count`, `postcode_unit_seed_max_distinct_coords`). + +## 5. UPRN Street Reconciliation (`derived_uprn_street_spatial`) + +### 5.1 Inputs and Tie-Breaking + +- Open Roads nearest lookup radius default: `150m` +- Open Names nearest lookup radius default: `200m` +- Open Roads tie-break: distance ascending, `segment_id` ascending +- Open Names tie-break: distance ascending, `entry_id` ascending +- KNN operator `<->` is not used; deterministic `ST_Distance` ordering is used. + +Road-number detection regex is locked: + +- `^(A|B|M)[0-9]{1,4}[A-Z]?$` +- Applied to stripped token (`upper`, non-alphanumeric removed) + +### 5.2 Reconciliation Rules + +For rows with `method = 'open_roads_nearest'`: + +1. No Open Names in range: keep Open Roads. +2. Open Roads is numbered road and Open Names in range: replace with Open Names. +3. Both present and `name_norm` equal: mark corroborated. +4. Both present and differ: keep Open Roads. + +For rows with `method = 'none_within_radius'`: + +- No Open Names lookup is used. +- Open Names fields are null. + +### 5.3 Output Columns (Phase 2) + +- `street_open_roads` +- `street_open_names` +- `street_display` +- `street_norm` +- `open_roads_distance_m` +- `open_names_distance_m` +- `confidence_score` +- `method` +- `name_source` +- `corroborated` +- `sources text[]` (fixed order provenance) + +`open_names_distance_m` semantics are locked: + +- Distance to nearest in-range Open Names entry +- `NULL` when no Open Names entry exists within `open_names_radius_m` + +### 5.4 Provenance Order (`sources`) + +Always populated in this exact order: + +1. `onsud:<onsud_release_id>` +2. `open_uprn:<open_uprn_release_id>` +3. `open_roads:<open_roads_release_id>` +4. 
`open_names:<open_names_release_id>` + +## 6. Postcode Street Enumeration (`derived_postcode_street`) + +Single normalized table: + +- `postcode_level text` in (`unit`,`sector`,`district`,`area`) +- `postcode_value_norm text` +- `entry_id text` +- `name_display text` +- `name_norm text` +- `name2_display text` +- `name2_norm text` +- `association_method text` in (`district_direct`,`spatial_voronoi`) +- `sources text[]` +- release IDs + +Primary key: + +- `(postcode_level, postcode_value_norm, entry_id)` + +Lookup index: + +- `(postcode_level, postcode_value_norm)` + +### 6.1 Association Contract + +Two candidate methods are generated: + +- `district_direct`: `core_open_names_entry.postcode_district_norm = core_postcode_unit_seed.postcode_district_norm` +- `spatial_voronoi`: Open Names point contained by postcode-unit Voronoi cell + +De-dup precedence is locked: + +1. `district_direct` +2. `spatial_voronoi` + +### 6.2 Voronoi Contract + +- Seed source: `core_postcode_unit_seed.geom_bng` +- Clipping: convex hull of seeds buffered by `pipeline.config.VORONOI_HULL_BUFFER_M` +- Buffer value is parameter-bound SQL (`hull_buffer_m`), never inlined +- Each postcode unit seed must map to exactly one Voronoi cell, else hard fail + +Reference: `docs/spec/phase_2-open-names/voronoi_method.md` + +## 7. Metrics (Mandatory) + +Phase 1 metrics remain mandatory plus Phase 2 keys.
+ +Additional keys: + +- `open_names_entries_loaded` +- `uprns_corroborated` +- `corroboration_pct` +- `uprns_numbered_road_replaced` +- `numbered_road_replacement_pct` +- `uprns_genuine_disagreement` +- `disagreement_pct` +- `postcode_units_with_streets` +- `postcode_units_without_streets` +- `open_names_search_radius_m` +- `total_resolved` + +Formula lock: + +- `total_uprns_onsud = COUNT(non-null uprn in raw.onsud_row for onsud release)` +- `uprns_with_coordinates = COUNT(DISTINCT uprn in core_uprn_postcode ∩ core_uprn_point)` +- `total_resolved = COUNT(*) WHERE method='open_roads_nearest'` +- `coordinate_coverage_pct = uprns_with_coordinates / total_uprns_onsud * 100` +- `resolution_pct = total_resolved / uprns_with_coordinates * 100` +- `corroboration_pct = uprns_corroborated / total_resolved * 100` +- `numbered_road_replacement_pct = uprns_numbered_road_replaced / total_resolved * 100` +- `disagreement_pct = uprns_genuine_disagreement / total_resolved * 100` + +## 8. Warning + Activation Gate + +If `disagreement_pct > 5.0`: + +- Persist warning row in `meta.pipeline_run_warning` +- Set `requires_ack = true` +- Activation is blocked until acknowledged + +Acknowledgement path: + +- `pipeline release activate ... --ack-warnings` +- Writes `acknowledged_by` and `acknowledged_at` + +## 9. Canonical Hash Contract + +Stored in `meta.canonical_hash` per run. + +Phase 2 object names: + +- `core_uprn_postcode` +- `core_uprn_point` +- `core_road_segment` +- `core_open_names_entry` +- `core_postcode_unit_seed` +- `derived_uprn_street_spatial` +- `derived_postcode_street` + +Hash rules: + +- Explicit projection list stored as ordered JSON array +- Operational timestamps excluded +- Deterministic key ordering only (numeric keys or `COLLATE "C"` for text keys) + +## 10. 
Out of Scope + +- API serving design +- NI dataset onboarding +- LLM disagreement adjudication +- Cross-source linguistic equivalence (`street_equivalence_norm`) + +Any behaviour change to this contract must be recorded in: + +- `docs/spec/phase_2-open-names/changes.md` diff --git a/docs/spec/phase_2-open-names/voronoi_method.md b/docs/spec/phase_2-open-names/voronoi_method.md new file mode 100644 index 0000000..6a2ee25 --- /dev/null +++ b/docs/spec/phase_2-open-names/voronoi_method.md @@ -0,0 +1,33 @@ +# Voronoi Method (Phase 2) + +This document locks the Voronoi clipping contract used by postcode street enumeration. + +## Locked Constant + +- `pipeline.config.VORONOI_HULL_BUFFER_M = 20000.0` + +This value is behavior-defining and hash-impacting. It must not be changed silently. + +## SQL Contract + +Voronoi clipping uses PostGIS-native operations with a bound parameter: + +- `ST_ConvexHull(ST_Collect(seed_geom_bng))` +- `ST_Buffer(..., %(hull_buffer_m)s)` +- `ST_VoronoiPolygons(..., (SELECT gb_clip_geom ...))` + +`hull_buffer_m` is bound at runtime. The buffer must not be inlined as a SQL literal. + +## Governance + +Any change to `VORONOI_HULL_BUFFER_M` requires all of the following before implementation: + +1. Prior entry in `docs/spec/phase_2-open-names/changes.md` +2. Canonical hash re-baseline for affected objects +3. Before/after metric comparison for enumeration coverage + +## Determinism Notes + +- Seed inputs are ordered deterministically. +- Voronoi clipping geometry is deterministic for a fixed seed set and `VORONOI_HULL_BUFFER_M`. +- Runtime ties in downstream spatial joins must have explicit tie-breakers. diff --git a/docs/spec/pipeline_v3/canonicalisation.md b/docs/spec/pipeline_v3/canonicalisation.md new file mode 100644 index 0000000..913bfdf --- /dev/null +++ b/docs/spec/pipeline_v3/canonicalisation.md @@ -0,0 +1,52 @@ +# Pipeline V3 Canonicalisation and Determinism Rules + +## Postcode Normalisation + +1. Uppercase. +2. 
Remove non-alphanumeric characters. +3. Require minimum structure for UK postcode canonical form. +4. Store display form with single space before final three characters. + +## Street Name Normalisation + +1. Trim whitespace. +2. Unicode NFKC normalisation. +3. Uppercase canonical form. +4. Strip configured punctuation. +5. Collapse internal whitespace. +6. Apply configured token aliases deterministically. + +## Null and Empty Handling + +- Empty strings map to null. +- Null and empty-string duplicates are forbidden semantically. + +## Probability and Rounding + +- Base probability uses exact formula from the main spec. +- Store to fixed precision `numeric(6,4)`. +- Apply residual correction to deterministic rank 1 row per postcode. + +## Deterministic Ranking Keys + +Probability ranking (descending) uses: +1. unrounded probability desc +2. confidence rank desc (`high` > `medium` > `low` > `none`) +3. canonical street name `COLLATE "C"` asc +4. USRN asc (nulls last) + +## JSON and Array Ordering + +- `streets_json` is materialised with deterministic ordered aggregation. +- API projection source arrays are ordered lexicographically by: + - `source_name` + - `ingest_run_id` + - `candidate_type` + +## Timezone + +All metadata timestamps are UTC. + +## Cross-links +- Pass finalisation behavior: [`../../architecture/stages/8_finalisation.md`](../../architecture/stages/8_finalisation.md) +- Value-added summary: [`../../architecture/value-added-by-stage.md`](../../architecture/value-added-by-stage.md) diff --git a/docs/spec/pipeline_v3/data_model.md b/docs/spec/pipeline_v3/data_model.md new file mode 100644 index 0000000..46bb129 --- /dev/null +++ b/docs/spec/pipeline_v3/data_model.md @@ -0,0 +1,89 @@ +# Pipeline V3 Data Model + +## Meta Layer + +### `meta.ingest_run` +Tracks ingest runs by source and release. + +### `meta.ingest_run_file` +Child rows for multi-file ingest provenance. + +### `meta.build_bundle` +Deterministic source selection envelope by profile. 
+ +### `meta.build_bundle_source` +Source-to-ingest-run links for each bundle. + +Bundle rule: +- most sources have exactly one ingest run per bundle +- `ppd` may have multiple ingest runs (baseline + yearly/monthly updates) + +### `meta.build_run` +Execution record for a bundle build. + +### `meta.build_pass_checkpoint` +Per-pass completion checkpoints. + +### `meta.canonical_hash` +Deterministic object hashes per build run. + +### `meta.dataset_publication` +Published dataset pointer log. + +## Raw and Stage Layers + +- `raw.*` holds immutable source snapshots for the active build cache. +- `raw.*` tables are `UNLOGGED` for ingest speed and are rebuildable from archived source files + `meta.ingest_run_file`. +- `stage.*` holds typed, normalised rows that build passes consume. +- `stage.*` is treated as transient workspace and is truncated at Pass `0b` start. + +## Core Layer + +- `core.postcodes` +- `core.postcodes_meta` +- `core.streets_usrn` + +### `core.postcodes` key columns + +- canonical postcode identity and geo/admin context: + - `postcode`, `status`, `lat`, `lon`, `easting`, `northing` + - `country_iso2`, `country_iso3`, `subdivision_code` + - `post_town`, `locality` (passthrough when present in staged ONSPD rows) + +## Derived Layer + +### `derived.postcode_street_candidates` +Append-only evidence table. + +Required contract: +- insert-only table +- no update/delete of evidence rows +- candidate rows include `source_name`, `ingest_run_id`, `produced_build_run_id` + +### `derived.postcode_street_candidate_lineage` +Promotion lineage mapping parent evidence rows to child evidence rows. + +### `derived.postcode_streets_final` +One row per final postcode-street record. + +### `derived.postcode_streets_final_candidate` +Relational link from final record to all contributing candidate rows. + +### `derived.postcode_streets_final_source` +Relational source summary by final record. + +## Internal Layer + +### `internal.unit_index` +Disambiguation-only table. 
Never exposed to API reader role. + +## API Projection Layer + +- `api.postcode_street_lookup__<dataset_version>` +- `api.postcode_lookup__<dataset_version>` +- stable views: `api.postcode_street_lookup`, `api.postcode_lookup` + +## Cross-links +- Architecture relationships: [`../../architecture/relationships-overview.md`](../../architecture/relationships-overview.md) +- Dataset lineage pages: [`../../architecture/datasets/README.md`](../../architecture/datasets/README.md) +- Stage/pass pages: [`../../architecture/stages/README.md`](../../architecture/stages/README.md) diff --git a/docs/spec/pipeline_v3/spec.md b/docs/spec/pipeline_v3/spec.md new file mode 100644 index 0000000..3af758a --- /dev/null +++ b/docs/spec/pipeline_v3/spec.md @@ -0,0 +1,109 @@ +# Pipeline V3 Specification + +## 1. Scope + +Pipeline V3 is a raw-first, deterministic, replayable build pipeline for postcode and street intelligence. + +Key properties: +- all source ingests are archived with file-level hashes +- all build outputs are reproducible from a bundle of ingest runs +- all derived records have relational provenance links +- API projections are versioned and published by atomic view switch + +## 2. Pass Sequence + +Build pass order is fixed: +1. `0a_raw_ingest` +2. `0b_stage_normalisation` +3. `1_onspd_backbone` +4. `2_gb_canonical_streets` +5. `3_open_names_candidates` +6. `4_uprn_reinforcement` +7. `5_gb_spatial_fallback` +8. `6_ni_candidates` +9. `7_ppd_gap_fill` +10. `8_finalisation` + +### 2.1 PPD Baseline + Updates Rule + +- The 4.2GB PPD full baseline is ingested once and retained. +- Subsequent yearly and monthly PPD update files are ingested as additional PPD runs. +- A build bundle may include multiple PPD ingest runs: + - one baseline run + - zero or more yearly/monthly update runs +- Stage normalisation applies PPD runs in deterministic ingest timestamp order. +- Non-PPD sources remain single-run-per-source within a bundle.
+- Build profile naming keeps PPD independent from NI: + - `gb_core`: GB core only + - `gb_core_ppd`: GB core + PPD + - `core_ni`: GB core + NI (without PPD) + +## 3. Candidate Evidence Contract + +`derived.postcode_street_candidates` is an immutable evidence log. + +### 3.1 Pass 3 Promotion Semantics (Append-Only) + +- `names_postcode_feature` candidates are immutable evidence rows. +- TOID confirmation creates a new `open_lids_toid_usrn` candidate row. +- Promotion lineage is recorded in `derived.postcode_street_candidate_lineage`. +- Existing candidate rows are never updated for `candidate_type`, `confidence`, `usrn`, or `evidence_ref`. + +## 4. Confidence and Candidate Types + +Candidate type enum: +- `names_postcode_feature` +- `open_lids_toid_usrn` +- `uprn_usrn` +- `spatial_os_open_roads` +- `osni_gazetteer_direct` +- `spatial_dfi_highway` +- `ppd_parse_matched` +- `ppd_parse_unmatched` + +Confidence enum: +- `high` +- `medium` +- `low` +- `none` + +NI confidence cap: +- NI candidate types cannot exceed `medium` in this release. + +## 5. Frequency and Probability + +### 5.1 Probability Formula (Exact) + +- `weighted_score(postcode, street) = sum(candidate_weight for contributing candidates)`. +- `total_weight(postcode) = sum(weighted_score(postcode, *))`. +- `probability(postcode, street) = weighted_score(postcode, street) / total_weight(postcode)`. + +### 5.2 Storage Rule + +- Probabilities are rounded to fixed scale (`numeric(6,4)`). +- Deterministic residual correction is applied to rank 1 street so stored probabilities sum to exactly `1.0000` per postcode. +- Builds fail if `total_weight(postcode) <= 0` for any postcode with final rows. + +## 6. Publish Contract + +- Build writes versioned physical API tables only: + - `api.postcode_lookup__<dataset_version>` + - `api.postcode_street_lookup__<dataset_version>` +- Publish updates stable views in one transaction: + - `api.postcode_lookup` + - `api.postcode_street_lookup` +- Publication metadata is persisted transactionally.
+- Publish rollback leaves previous published version untouched. + +## 7. Provenance + +Final outputs use relational provenance: +- `derived.postcode_streets_final_candidate` +- `derived.postcode_streets_final_source` + +Arrays and JSON payloads are projection-only conveniences in `api.*` tables/views. + +## 8. Architecture Cross-links +- Architecture index: [`../../architecture/README.md`](../../architecture/README.md) +- Dataset lineage pages: [`../../architecture/datasets/README.md`](../../architecture/datasets/README.md) +- Stage/pass pages: [`../../architecture/stages/README.md`](../../architecture/stages/README.md) diff --git a/pipeline/AGENTS.md b/pipeline/AGENTS.md new file mode 100644 index 0000000..af2e43d --- /dev/null +++ b/pipeline/AGENTS.md @@ -0,0 +1,21 @@ +# pipeline/AGENTS.md + +## Scope +Implementation guidance for `pipeline/` (configs, SQL migrations, runtime modules). + +## Critical Rule +If code in `pipeline/` changes behavior, update matching docs in `docs/spec/pipeline_v3/` and `docs/architecture/` in the same commit series. + +## Fast Navigation +- CLI: `pipeline/src/pipeline/cli.py` +- Build logic: `pipeline/src/pipeline/build/workflows.py` +- Ingest logic: `pipeline/src/pipeline/ingest/workflows.py` +- Manifest contracts: `pipeline/src/pipeline/manifest.py` +- Migrations: `pipeline/sql/migrations/` +- Runtime configs: `pipeline/config/` + +## Change Checklist +- migration required? +- manifest/source schema mappings updated? +- determinism/canonicalisation still valid? +- tests/docs updated together? 
diff --git a/pipeline/config/frequency_weights.yaml b/pipeline/config/frequency_weights.yaml new file mode 100644 index 0000000..b24dd0a --- /dev/null +++ b/pipeline/config/frequency_weights.yaml @@ -0,0 +1,12 @@ +{ + "weights": { + "names_postcode_feature": 0.6, + "open_lids_toid_usrn": 0.9, + "uprn_usrn": 1.0, + "spatial_os_open_roads": 0.3, + "osni_gazetteer_direct": 0.6, + "spatial_dfi_highway": 0.3, + "ppd_parse_matched": 0.4, + "ppd_parse_unmatched": 0.2 + } +} diff --git a/pipeline/config/normalisation.yaml b/pipeline/config/normalisation.yaml new file mode 100644 index 0000000..554b45e --- /dev/null +++ b/pipeline/config/normalisation.yaml @@ -0,0 +1,15 @@ +{ + "alias_map": { + "ST": "STREET", + "RD": "ROAD", + "AVE": "AVENUE", + "DR": "DRIVE", + "LN": "LANE", + "CL": "CLOSE", + "PL": "PLACE", + "CT": "COURT", + "SQ": "SQUARE", + "GDNS": "GARDENS" + }, + "strip_punctuation": ".,'-" +} diff --git a/pipeline/config/source_schema.yaml b/pipeline/config/source_schema.yaml new file mode 100644 index 0000000..a2e80ce --- /dev/null +++ b/pipeline/config/source_schema.yaml @@ -0,0 +1,96 @@ +{ + "sources": { + "onspd": { + "required_fields": [ + "postcode", + "lat", + "lon", + "easting", + "northing", + "subdivision_code" + ], + "field_map": { + "postcode": "pcds", + "status": "doterm", + "lat": "lat", + "lon": "long", + "easting": "east1m", + "northing": "north1m", + "country_iso2": "ctry25cd", + "country_iso3": "ctry25cd", + "subdivision_code": "ctry25cd", + "post_town": "potown", + "locality": "locality" + } + }, + "os_open_usrn": { + "required_fields": ["usrn"], + "field_map": { + "usrn": "usrn", + "street_class": "street_type" + } + }, + "os_open_names": { + "required_fields": ["feature_id", "street_name"], + "field_map": { + "feature_id": "ID", + "toid": "RELATED_SPATIAL_OBJECT", + "street_name": "NAME1", + "local_type": "LOCAL_TYPE" + } + }, + "os_open_roads": { + "required_fields": ["segment_id", "road_name"], + "field_map": { + "segment_id": "id", + 
"road_id": "road_name_toid", + "road_name": "name_1" + } + }, + "os_open_uprn": { + "required_fields": ["uprn"], + "field_map": { + "uprn": "UPRN" + } + }, + "os_open_lids": { + "required_fields": ["id_1", "id_2"], + "field_map": { + "id_1": "IDENTIFIER_1", + "id_2": "IDENTIFIER_2" + } + }, + "nsul": { + "required_fields": ["uprn", "postcode"], + "field_map": { + "uprn": "UPRN", + "postcode": "PCDS" + } + }, + "osni_gazetteer": { + "required_fields": ["feature_id", "street_name"], + "field_map": { + "feature_id": "feature_id", + "postcode": "postcode", + "street_name": "street_name" + } + }, + "dfi_highway": { + "required_fields": ["segment_id", "street_name"], + "field_map": { + "segment_id": "segment_id", + "postcode": "postcode", + "street_name": "street_name" + } + }, + "ppd": { + "required_fields": ["row_hash", "postcode", "street", "house_number"], + "field_map": { + "row_hash": "row_hash", + "postcode": "postcode", + "street": "street", + "house_number": "house_number" + } + } + } +} diff --git a/pipeline/pyproject.toml b/pipeline/pyproject.toml new file mode 100644 index 0000000..e548ffe --- /dev/null +++ b/pipeline/pyproject.toml @@ -0,0 +1,21 @@ +[project] +name = "postcodes-pipeline" +version = "0.1.0" +description = "Open-data import and transformation pipeline (Phase 1 + Phase 2 Open Names)" +requires-python = ">=3.11" +dependencies = [ + "psycopg[binary]>=3.2,<4", +] + +[project.scripts] +pipeline = "pipeline.cli:main" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] + +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/pipeline/sql/migrations/0001_phase1_foundation.sql b/pipeline/sql/migrations/0001_phase1_foundation.sql new file mode 100644 index 0000000..5fc3db5 --- /dev/null +++ b/pipeline/sql/migrations/0001_phase1_foundation.sql @@ -0,0 +1,142 @@ +BEGIN; + +CREATE EXTENSION IF NOT EXISTS postgis; + +CREATE SCHEMA IF NOT EXISTS meta; 
+CREATE SCHEMA IF NOT EXISTS raw; +CREATE SCHEMA IF NOT EXISTS stage; +CREATE SCHEMA IF NOT EXISTS core; +CREATE SCHEMA IF NOT EXISTS derived; + +CREATE TABLE IF NOT EXISTS meta.dataset_release ( + dataset_key text NOT NULL, + release_id text NOT NULL, + source_url text NOT NULL, + licence text NOT NULL, + file_path text NOT NULL, + expected_sha256 text NOT NULL, + actual_sha256 text NOT NULL, + retrieved_at timestamptz NOT NULL, + manifest_json jsonb NOT NULL DEFAULT '{}'::jsonb, + source_row_count bigint, + loaded_row_count bigint, + source_feature_count bigint, + loaded_feature_count bigint, + source_layer_name text, + srid_confirmed integer, + PRIMARY KEY (dataset_key, release_id), + CHECK (dataset_key IN ('onsud', 'open_uprn', 'open_roads')), + CHECK (expected_sha256 ~ '^[0-9a-fA-F]{64}$'), + CHECK (actual_sha256 ~ '^[0-9a-fA-F]{64}$') +); + +CREATE TABLE IF NOT EXISTS meta.pipeline_run ( + run_id uuid PRIMARY KEY, + release_set_id uuid, + started_at timestamptz NOT NULL, + finished_at timestamptz, + status text NOT NULL, + stage text NOT NULL, + error_text text, + CHECK (status IN ('started', 'built', 'active', 'failed')) +); + +CREATE TABLE IF NOT EXISTS meta.release_set ( + release_set_id uuid PRIMARY KEY, + onsud_release_id text NOT NULL, + open_uprn_release_id text NOT NULL, + open_roads_release_id text NOT NULL, + physical_schema text NOT NULL, + status text NOT NULL, + created_at timestamptz NOT NULL, + built_at timestamptz, + activated_at timestamptz, + CONSTRAINT uq_release_set_inputs + UNIQUE (onsud_release_id, open_uprn_release_id, open_roads_release_id), + CHECK (status IN ('created', 'built', 'active', 'inactive')) +); + +CREATE TABLE IF NOT EXISTS meta.release_activation_log ( + activation_id bigserial PRIMARY KEY, + previous_release_set_id uuid, + release_set_id uuid NOT NULL, + actor text NOT NULL, + activated_at timestamptz NOT NULL, + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) +); + +CREATE TABLE IF NOT EXISTS 
meta.dataset_metrics ( + run_id uuid NOT NULL, + release_set_id uuid NOT NULL, + metric_key text NOT NULL, + metric_value numeric NOT NULL, + metric_unit text NOT NULL, + computed_at timestamptz NOT NULL, + PRIMARY KEY (run_id, metric_key), + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) +); + +CREATE TABLE IF NOT EXISTS meta.canonical_hash ( + release_set_id uuid NOT NULL, + object_name text NOT NULL, + projection jsonb NOT NULL, + row_count bigint NOT NULL, + sha256 text NOT NULL, + computed_at timestamptz NOT NULL, + run_id uuid NOT NULL, + PRIMARY KEY (release_set_id, object_name, run_id), + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id), + CHECK (sha256 ~ '^[0-9a-fA-F]{64}$') +); + +CREATE TABLE IF NOT EXISTS raw.onsud_row ( + id bigserial PRIMARY KEY, + dataset_key text NOT NULL, + release_id text NOT NULL, + source_row_num bigint NOT NULL, + uprn bigint, + postcode text, + extras_jsonb jsonb, + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +CREATE TABLE IF NOT EXISTS raw.open_uprn_row ( + id bigserial PRIMARY KEY, + dataset_key text NOT NULL, + release_id text NOT NULL, + source_row_num bigint NOT NULL, + uprn bigint, + latitude double precision, + longitude double precision, + easting double precision, + northing double precision, + extras_jsonb jsonb, + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +-- Ingest must normalize incoming LineString geometries to MultiLineString. 
+CREATE TABLE IF NOT EXISTS stage.open_roads_segment ( + dataset_key text NOT NULL, + release_id text NOT NULL, + segment_id bigint NOT NULL, + name_display text, + name_norm text, + geom_bng geometry(MultiLineString,27700) NOT NULL, + CHECK (dataset_key = 'open_roads'), + UNIQUE (release_id, segment_id), + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +CREATE INDEX IF NOT EXISTS idx_stage_open_roads_segment_release_id + ON stage.open_roads_segment (release_id); + +CREATE INDEX IF NOT EXISTS idx_stage_open_roads_segment_geom_bng + ON stage.open_roads_segment USING gist (geom_bng); + +COMMIT; diff --git a/pipeline/sql/migrations/0002_phase1_resume_checkpoints.sql b/pipeline/sql/migrations/0002_phase1_resume_checkpoints.sql new file mode 100644 index 0000000..5bd16c9 --- /dev/null +++ b/pipeline/sql/migrations/0002_phase1_resume_checkpoints.sql @@ -0,0 +1,25 @@ +BEGIN; + +CREATE TABLE IF NOT EXISTS meta.release_set_stage_checkpoint ( + release_set_id uuid NOT NULL, + stage_name text NOT NULL, + run_id uuid NOT NULL, + completed_at timestamptz NOT NULL, + PRIMARY KEY (release_set_id, stage_name), + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) + ON DELETE CASCADE, + CHECK (stage_name IN ( + 'release_tables_created', + 'core_built', + 'derived_built', + 'metrics_stored', + 'canonical_hashes_stored', + 'release_marked_built' + )) +); + +CREATE INDEX IF NOT EXISTS idx_release_set_stage_checkpoint_release_set + ON meta.release_set_stage_checkpoint (release_set_id, completed_at DESC); + +COMMIT; diff --git a/pipeline/sql/migrations/0003_phase1_table_level_checkpoints.sql b/pipeline/sql/migrations/0003_phase1_table_level_checkpoints.sql new file mode 100644 index 0000000..c36fbf5 --- /dev/null +++ b/pipeline/sql/migrations/0003_phase1_table_level_checkpoints.sql @@ -0,0 +1,23 @@ +BEGIN; + +ALTER TABLE meta.release_set_stage_checkpoint + DROP CONSTRAINT IF EXISTS 
release_set_stage_checkpoint_stage_name_check; + +ALTER TABLE meta.release_set_stage_checkpoint + ADD CONSTRAINT release_set_stage_checkpoint_stage_name_check + CHECK ( + stage_name IN ( + 'release_tables_created', + 'core_uprn_postcode_built', + 'core_uprn_point_built', + 'core_road_segment_built', + 'derived_uprn_street_spatial_built', + 'metrics_stored', + 'canonical_hashes_stored', + 'release_marked_built', + 'core_built', + 'derived_built' + ) + ); + +COMMIT; diff --git a/pipeline/sql/migrations/0004_phase2_open_names_foundation.sql b/pipeline/sql/migrations/0004_phase2_open_names_foundation.sql new file mode 100644 index 0000000..b5dffc5 --- /dev/null +++ b/pipeline/sql/migrations/0004_phase2_open_names_foundation.sql @@ -0,0 +1,136 @@ +BEGIN; + +-- Phase boundary cutover: purge prior release-set lifecycle state and rs_* schemas. +DO $$ +DECLARE + schema_row record; +BEGIN + FOR schema_row IN + SELECT nspname + FROM pg_namespace + WHERE nspname LIKE 'rs_%' + LOOP + EXECUTE format('DROP SCHEMA IF EXISTS %I CASCADE', schema_row.nspname); + END LOOP; +END +$$; + +TRUNCATE TABLE meta.release_activation_log RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.release_set_stage_checkpoint RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.canonical_hash RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.dataset_metrics RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.pipeline_run RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.release_set RESTART IDENTITY CASCADE; + +ALTER TABLE meta.dataset_release + DROP CONSTRAINT IF EXISTS dataset_release_dataset_key_check; +ALTER TABLE meta.dataset_release + DROP CONSTRAINT IF EXISTS ck_dataset_release_dataset_key; +ALTER TABLE meta.dataset_release + ADD CONSTRAINT ck_dataset_release_dataset_key + CHECK (dataset_key IN ('onsud', 'open_uprn', 'open_roads', 'open_names')); + +ALTER TABLE meta.release_set + ADD COLUMN IF NOT EXISTS open_names_release_id text; +ALTER TABLE meta.release_set + ALTER COLUMN open_names_release_id SET NOT NULL; + 
+ALTER TABLE meta.release_set + DROP CONSTRAINT IF EXISTS uq_release_set_inputs; +ALTER TABLE meta.release_set + DROP CONSTRAINT IF EXISTS uq_release_set_inputs_phase2; +ALTER TABLE meta.release_set + ADD CONSTRAINT uq_release_set_inputs_phase2 + UNIQUE ( + onsud_release_id, + open_uprn_release_id, + open_roads_release_id, + open_names_release_id + ); + +ALTER TABLE meta.release_set_stage_checkpoint + DROP CONSTRAINT IF EXISTS release_set_stage_checkpoint_stage_name_check; +ALTER TABLE meta.release_set_stage_checkpoint + ADD CONSTRAINT release_set_stage_checkpoint_stage_name_check + CHECK ( + stage_name IN ( + 'release_tables_created', + 'core_uprn_postcode_built', + 'core_uprn_point_built', + 'core_road_segment_built', + 'core_open_names_entry_built', + 'core_postcode_unit_seed_built', + 'derived_uprn_street_spatial_built', + 'derived_postcode_street_built', + 'metrics_stored', + 'warnings_stored', + 'canonical_hashes_stored', + 'release_marked_built', + 'core_built', + 'derived_built' + ) + ); + +CREATE TABLE IF NOT EXISTS meta.pipeline_run_warning ( + warning_id bigserial PRIMARY KEY, + run_id uuid NOT NULL, + release_set_id uuid NOT NULL, + warning_code text NOT NULL, + metric_key text NOT NULL, + metric_value numeric NOT NULL, + threshold_value numeric NOT NULL, + requires_ack boolean NOT NULL, + acknowledged_by text, + acknowledged_at timestamptz, + created_at timestamptz NOT NULL DEFAULT now(), + UNIQUE (run_id, warning_code), + FOREIGN KEY (run_id) + REFERENCES meta.pipeline_run (run_id) + ON DELETE CASCADE, + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) + ON DELETE CASCADE +); + +CREATE INDEX IF NOT EXISTS idx_pipeline_run_warning_release_set + ON meta.pipeline_run_warning (release_set_id, requires_ack, acknowledged_at); + +ALTER TABLE raw.onsud_row + ADD COLUMN IF NOT EXISTS postcode_unit_easting double precision; +ALTER TABLE raw.onsud_row + ADD COLUMN IF NOT EXISTS postcode_unit_northing double precision; + +CREATE TABLE 
IF NOT EXISTS raw.open_names_row ( + id bigserial PRIMARY KEY, + dataset_key text NOT NULL, + release_id text NOT NULL, + source_row_num bigint NOT NULL, + entry_id text NOT NULL, + name1_display text, + name1_lang text, + name1_norm text, + name2_display text, + name2_norm text, + local_type text NOT NULL, + postcode_district_norm text, + easting double precision, + northing double precision, + geom_bng geometry(Point,27700) NOT NULL, + extras_jsonb jsonb, + CHECK (dataset_key = 'open_names'), + CHECK (local_type IN ('Road', 'Named Road', 'Street')), + UNIQUE (release_id, entry_id), + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +/* NOTE(review): release_id is already the leading column of the UNIQUE (release_id, entry_id) index and of idx_raw_open_names_row_release_district, so this single-column index looks redundant; confirm before keeping it. */ +CREATE INDEX IF NOT EXISTS idx_raw_open_names_row_release_id + ON raw.open_names_row (release_id); + +CREATE INDEX IF NOT EXISTS idx_raw_open_names_row_release_district + ON raw.open_names_row (release_id, postcode_district_norm); + +/* GiST index on the point geometry column. */ +CREATE INDEX IF NOT EXISTS idx_raw_open_names_row_geom_bng + ON raw.open_names_row USING gist (geom_bng); + +COMMIT; diff --git a/pipeline/sql/migrations/0005_v3_cutover_foundation.sql b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql new file mode 100644 index 0000000..13cf430 --- /dev/null +++ b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql @@ -0,0 +1,545 @@ +BEGIN; + +/* Make sure every schema used by this migration exists. */ +CREATE SCHEMA IF NOT EXISTS meta; +CREATE SCHEMA IF NOT EXISTS raw; +CREATE SCHEMA IF NOT EXISTS stage; +CREATE SCHEMA IF NOT EXISTS core; +CREATE SCHEMA IF NOT EXISTS derived; +CREATE SCHEMA IF NOT EXISTS internal; +CREATE SCHEMA IF NOT EXISTS api; + +-- Hard cutover: remove legacy release-run lifecycle objects.
+/* CASCADE also drops dependent objects such as foreign keys and views that reference these tables. */ +DROP TABLE IF EXISTS meta.release_activation_log CASCADE; +DROP TABLE IF EXISTS meta.release_set_stage_checkpoint CASCADE; +DROP TABLE IF EXISTS meta.pipeline_run_warning CASCADE; +DROP TABLE IF EXISTS meta.dataset_metrics CASCADE; +DROP TABLE IF EXISTS meta.canonical_hash CASCADE; +DROP TABLE IF EXISTS meta.pipeline_run CASCADE; +DROP TABLE IF EXISTS meta.release_set CASCADE; +DROP TABLE IF EXISTS meta.dataset_release CASCADE; + +/* NOTE(review): DROP VIEW raises an error when the named object exists but is a table rather than a view; confirm these legacy core/derived objects were created as views. */ +DROP VIEW IF EXISTS core.uprn_postcode CASCADE; +DROP VIEW IF EXISTS core.uprn_point CASCADE; +DROP VIEW IF EXISTS core.road_segment CASCADE; +DROP VIEW IF EXISTS core.open_names_entry CASCADE; +DROP VIEW IF EXISTS core.postcode_unit_seed CASCADE; +DROP VIEW IF EXISTS derived.uprn_street_spatial CASCADE; +DROP VIEW IF EXISTS derived.postcode_street CASCADE; + +/* One row per ingested source file set; the UNIQUE constraint rejects a second row for the same (source_name, source_version, file_set_sha256). NOTE(review): processing_git_sha accepts lowercase hex only, while the sha256 checks also accept uppercase; confirm the difference is intended. */ +CREATE TABLE IF NOT EXISTS meta.ingest_run ( + run_id uuid PRIMARY KEY, + source_name text NOT NULL, + source_version text NOT NULL, + retrieved_at_utc timestamptz NOT NULL, + source_url text, + processing_git_sha char(40) NOT NULL, + record_count bigint, + notes text, + file_set_sha256 char(64) NOT NULL, + created_at_utc timestamptz NOT NULL DEFAULT now(), + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )), + CHECK (processing_git_sha ~ '^[0-9a-f]{40}$'), + CHECK (file_set_sha256 ~ '^[0-9a-fA-F]{64}$'), + UNIQUE (source_name, source_version, file_set_sha256) +); + +/* Per-file details for an ingest run; rows are deleted together with the run. */ +CREATE TABLE IF NOT EXISTS meta.ingest_run_file ( + file_id bigserial PRIMARY KEY, + ingest_run_id uuid NOT NULL, + file_role text NOT NULL, + filename text NOT NULL, + layer_name text NOT NULL DEFAULT '', + sha256 char(64) NOT NULL, + size_bytes bigint NOT NULL, + row_count bigint, + format text NOT NULL, + FOREIGN KEY (ingest_run_id) + REFERENCES meta.ingest_run (run_id) + ON DELETE CASCADE, + CHECK (sha256 ~ '^[0-9a-fA-F]{64}$'), + CHECK (size_bytes >= 0), + UNIQUE (ingest_run_id,
file_role, filename, layer_name) +); + +/* A build bundle: a build profile, a 64-hex-character bundle hash and a lifecycle status. */ +CREATE TABLE IF NOT EXISTS meta.build_bundle ( + bundle_id uuid PRIMARY KEY, + build_profile text NOT NULL, + bundle_hash char(64) NOT NULL, + status text NOT NULL, + created_at_utc timestamptz NOT NULL DEFAULT now(), + CHECK (build_profile IN ('gb_core', 'gb_core_ppd', 'core_ni')), + CHECK (bundle_hash ~ '^[0-9a-fA-F]{64}$'), + CHECK (status IN ('created', 'built', 'failed', 'published')) +); + +/* Links a bundle to the ingest runs it uses. Deleting the bundle deletes its links; the ingest_run foreign key has no cascade, so an ingest run that is still linked cannot be deleted. */ +CREATE TABLE IF NOT EXISTS meta.build_bundle_source ( + bundle_id uuid NOT NULL, + source_name text NOT NULL, + ingest_run_id uuid NOT NULL, + PRIMARY KEY (bundle_id, source_name, ingest_run_id), + FOREIGN KEY (bundle_id) + REFERENCES meta.build_bundle (bundle_id) + ON DELETE CASCADE, + FOREIGN KEY (ingest_run_id) + REFERENCES meta.ingest_run (run_id), + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )) +); + +/* NOTE(review): (bundle_id, source_name) is a leading prefix of the primary key above, so the primary-key index can already serve these lookups; confirm this extra index is needed. */ +CREATE INDEX IF NOT EXISTS idx_build_bundle_source_bundle_source + ON meta.build_bundle_source (bundle_id, source_name); + +/* One row per build execution of a bundle; deleted together with the bundle. */ +CREATE TABLE IF NOT EXISTS meta.build_run ( + build_run_id uuid PRIMARY KEY, + bundle_id uuid NOT NULL, + dataset_version text NOT NULL, + status text NOT NULL, + current_pass text NOT NULL, + error_text text, + started_at_utc timestamptz NOT NULL DEFAULT now(), + finished_at_utc timestamptz, + FOREIGN KEY (bundle_id) + REFERENCES meta.build_bundle (bundle_id) + ON DELETE CASCADE, + CHECK (status IN ('started', 'built', 'failed', 'published')) +); + +/* Completion record per (build_run_id, pass_name); pass_name is restricted to the fixed list below. */ +CREATE TABLE IF NOT EXISTS meta.build_pass_checkpoint ( + build_run_id uuid NOT NULL, + pass_name text NOT NULL, + completed_at_utc timestamptz NOT NULL DEFAULT now(), + row_count_summary_json jsonb NOT NULL DEFAULT '{}'::jsonb, + PRIMARY KEY (build_run_id, pass_name), + FOREIGN KEY (build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE, + CHECK (pass_name IN ( + '0a_raw_ingest', +
'0b_stage_normalisation', + '1_onspd_backbone', + '2_gb_canonical_streets', + '3_open_names_candidates', + '4_uprn_reinforcement', + '5_gb_spatial_fallback', + '6_ni_candidates', + '7_ppd_gap_fill', + '8_finalisation' + )) +); + +/* One stored hash per (build_run_id, object_name); deleted together with the build run. */ +CREATE TABLE IF NOT EXISTS meta.canonical_hash ( + build_run_id uuid NOT NULL, + object_name text NOT NULL, + projection jsonb NOT NULL, + row_count bigint NOT NULL, + sha256 char(64) NOT NULL, + computed_at_utc timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (build_run_id, object_name), + FOREIGN KEY (build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE, + CHECK (sha256 ~ '^[0-9a-fA-F]{64}$') +); + +/* Publication record: one row per dataset_version, and at most one publication per build run (build_run_id is UNIQUE). */ +CREATE TABLE IF NOT EXISTS meta.dataset_publication ( + dataset_version text PRIMARY KEY, + build_run_id uuid NOT NULL UNIQUE, + published_at_utc timestamptz NOT NULL DEFAULT now(), + published_by text NOT NULL, + lookup_table_name text NOT NULL, + street_lookup_table_name text NOT NULL, + publish_txid bigint NOT NULL, + FOREIGN KEY (build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE +); + +/* Raw source rows: one JSON payload per source row, deleted together with the ingest run that loaded them. */ +CREATE TABLE IF NOT EXISTS raw.onspd_row ( + id bigserial PRIMARY KEY, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id) ON DELETE CASCADE, + source_row_num bigint NOT NULL, + payload_jsonb jsonb NOT NULL +); +/* The remaining raw tables use the same layout but are declared explicitly: CREATE TABLE (LIKE raw.onspd_row INCLUDING ALL) never copies foreign keys, and it copies the id default verbatim, so every copy would lack the ingest_run reference and draw ids from the raw.onspd_row sequence. */ +DO $$ +DECLARE + raw_table text; +BEGIN + FOREACH raw_table IN ARRAY ARRAY[ + 'os_open_usrn_row', + 'os_open_names_row', + 'os_open_roads_row', + 'os_open_uprn_row', + 'os_open_lids_row', + 'nsul_row', + 'osni_gazetteer_row', + 'dfi_highway_row', + 'ppd_row' + ] + LOOP + EXECUTE format( + 'CREATE TABLE IF NOT EXISTS raw.%I (id bigserial PRIMARY KEY, ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id) ON DELETE CASCADE, source_row_num bigint NOT NULL, payload_jsonb jsonb NOT NULL)', + raw_table + ); + END LOOP; +END +$$; +
+/* Index each raw table by ingest run. */ +CREATE INDEX IF NOT EXISTS idx_raw_onspd_run_id ON raw.onspd_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_usrn_run_id ON raw.os_open_usrn_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_names_run_id ON raw.os_open_names_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_roads_run_id ON raw.os_open_roads_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_uprn_run_id ON raw.os_open_uprn_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_lids_run_id ON raw.os_open_lids_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_nsul_run_id ON raw.nsul_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_osni_run_id ON raw.osni_gazetteer_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_dfi_run_id ON raw.dfi_highway_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_ppd_run_id ON raw.ppd_row (ingest_run_id); + +/* Stage tables are keyed by build_run_id plus a natural key and are deleted together with their build run; the ingest-run reference on each has no cascade. */ +CREATE TABLE IF NOT EXISTS stage.onspd_postcode ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + postcode_norm text NOT NULL, + postcode_display text NOT NULL, + status text NOT NULL, + lat numeric(9,6), + lon numeric(9,6), + easting integer, + northing integer, + country_iso2 char(2) NOT NULL, + country_iso3 char(3) NOT NULL, + subdivision_code text, + post_town text, + locality text, + street_enrichment_available boolean NOT NULL, + onspd_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, postcode_norm) +); + +CREATE TABLE IF NOT EXISTS stage.streets_usrn_input ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + usrn bigint NOT NULL, + street_name text NOT NULL, + street_name_casefolded text NOT NULL, + street_class text, + street_status text, + usrn_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, usrn) +); + +CREATE TABLE IF NOT EXISTS
stage.open_names_road_feature ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + feature_id text NOT NULL, + toid text, + postcode_norm text, + street_name_raw text NOT NULL, + street_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, feature_id) +); + +/* Every stage table below follows the same pattern: keyed by build_run_id plus a natural key, deleted together with its build run, and carrying the ingest run that supplied the row (no cascade on that reference). */ +CREATE TABLE IF NOT EXISTS stage.open_roads_segment ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + segment_id text NOT NULL, + road_id text, + postcode_norm text, + usrn bigint, + road_name text NOT NULL, + road_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, segment_id) +); + +CREATE TABLE IF NOT EXISTS stage.uprn_point ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + postcode_norm text, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn) +); + +CREATE TABLE IF NOT EXISTS stage.open_lids_toid_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + toid text NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, toid, usrn) +); + +CREATE TABLE IF NOT EXISTS stage.open_lids_uprn_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn, usrn) +); + +CREATE TABLE IF NOT EXISTS stage.nsul_uprn_postcode ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + postcode_norm text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn, postcode_norm) +); +
+/* The stage tables below are keyed by build_run_id plus a natural key and are deleted together with their build run. */ +CREATE TABLE IF NOT EXISTS stage.osni_street_point ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + feature_id text NOT NULL, + postcode_norm text, + street_name_raw text NOT NULL, + street_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, feature_id) +); + +CREATE TABLE IF NOT EXISTS stage.dfi_road_segment ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + segment_id text NOT NULL, + postcode_norm text, + street_name_raw text NOT NULL, + street_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, segment_id) +); + +CREATE TABLE IF NOT EXISTS stage.ppd_parsed_address ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + row_hash text NOT NULL, + postcode_norm text NOT NULL, + house_number text, + street_token_raw text NOT NULL, + street_token_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, row_hash) +); + +/* Core postcode table, one row per (produced_build_run_id, postcode); multi_street defaults to false. */ +CREATE TABLE IF NOT EXISTS core.postcodes ( + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + postcode text NOT NULL, + status text NOT NULL, + lat numeric(9,6), + lon numeric(9,6), + easting integer, + northing integer, + country_iso2 char(2) NOT NULL, + country_iso3 char(3) NOT NULL, + subdivision_code text, + post_town text, + locality text, + street_enrichment_available boolean NOT NULL, + multi_street boolean NOT NULL DEFAULT false, + onspd_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (produced_build_run_id, postcode) +); + +/* JSON metadata attached one-to-one to a core.postcodes row and deleted together with it. */ +CREATE TABLE IF NOT EXISTS core.postcodes_meta ( + produced_build_run_id uuid NOT NULL, + postcode text NOT NULL, + meta_jsonb jsonb NOT NULL DEFAULT '{}'::jsonb, + onspd_run_id uuid NOT NULL REFERENCES
meta.ingest_run (run_id), + PRIMARY KEY (produced_build_run_id, postcode), + FOREIGN KEY (produced_build_run_id, postcode) + REFERENCES core.postcodes (produced_build_run_id, postcode) + ON DELETE CASCADE +); + +/* Core street table, one row per (produced_build_run_id, usrn); deleted together with its build run. */ +CREATE TABLE IF NOT EXISTS core.streets_usrn ( + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + usrn bigint NOT NULL, + street_name text NOT NULL, + street_name_casefolded text NOT NULL, + street_class text, + street_status text, + usrn_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (produced_build_run_id, usrn) +); + +/* Street candidates per postcode and build run; candidate_type, confidence and source_name are each restricted to a fixed list. NOTE(review): the (produced_build_run_id, usrn) foreign key uses ON DELETE SET NULL, which by default nulls every referencing column, yet produced_build_run_id is NOT NULL and updates to this table are rejected by the append-only trigger created later in this migration; confirm whether SET NULL (usrn) or a different action is intended. */ +CREATE TABLE IF NOT EXISTS derived.postcode_street_candidates ( + candidate_id bigserial PRIMARY KEY, + produced_build_run_id uuid NOT NULL, + postcode text NOT NULL, + street_name_raw text NOT NULL, + street_name_canonical text NOT NULL, + usrn bigint, + candidate_type text NOT NULL, + confidence text NOT NULL, + evidence_ref text NOT NULL, + source_name text NOT NULL, + ingest_run_id uuid NOT NULL, + evidence_json jsonb, + created_at_utc timestamptz NOT NULL DEFAULT now(), + FOREIGN KEY (produced_build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE, + FOREIGN KEY (produced_build_run_id, postcode) + REFERENCES core.postcodes (produced_build_run_id, postcode) + ON DELETE CASCADE, + FOREIGN KEY (produced_build_run_id, usrn) + REFERENCES core.streets_usrn (produced_build_run_id, usrn) + ON DELETE SET NULL, + FOREIGN KEY (ingest_run_id) + REFERENCES meta.ingest_run (run_id), + CHECK (candidate_type IN ( + 'names_postcode_feature', + 'open_lids_toid_usrn', + 'uprn_usrn', + 'spatial_os_open_roads', + 'osni_gazetteer_direct', + 'spatial_dfi_highway', + 'ppd_parse_matched', + 'ppd_parse_unmatched' + )), + CHECK (confidence IN ('high', 'medium', 'low', 'none')), + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )) +);
+ +/* Lookup indexes on candidates by (build run, postcode) and by (build run, usrn). */ +CREATE INDEX IF NOT EXISTS idx_candidate_run_postcode + ON derived.postcode_street_candidates (produced_build_run_id, postcode); +CREATE INDEX IF NOT EXISTS idx_candidate_run_usrn + ON derived.postcode_street_candidates (produced_build_run_id, usrn); + +/* Parent/child relations between candidates; a row is removed when either candidate or the build run is deleted. */ +CREATE TABLE IF NOT EXISTS derived.postcode_street_candidate_lineage ( + parent_candidate_id bigint NOT NULL REFERENCES derived.postcode_street_candidates (candidate_id) ON DELETE CASCADE, + child_candidate_id bigint NOT NULL REFERENCES derived.postcode_street_candidates (candidate_id) ON DELETE CASCADE, + relation_type text NOT NULL, + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + PRIMARY KEY (parent_candidate_id, child_candidate_id, relation_type) +); + +/* Final street rows, unique per (build run, postcode, street_name); probability is constrained to the range 0..1. NOTE(review): as on the candidates table, the composite ON DELETE SET NULL would also null produced_build_run_id, which is NOT NULL; confirm the intended delete action. */ +CREATE TABLE IF NOT EXISTS derived.postcode_streets_final ( + final_id bigserial PRIMARY KEY, + produced_build_run_id uuid NOT NULL, + postcode text NOT NULL, + street_name text NOT NULL, + usrn bigint, + confidence text NOT NULL, + frequency_score numeric(10,4) NOT NULL, + probability numeric(6,4) NOT NULL, + created_at_utc timestamptz NOT NULL DEFAULT now(), + FOREIGN KEY (produced_build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE, + FOREIGN KEY (produced_build_run_id, postcode) + REFERENCES core.postcodes (produced_build_run_id, postcode) + ON DELETE CASCADE, + FOREIGN KEY (produced_build_run_id, usrn) + REFERENCES core.streets_usrn (produced_build_run_id, usrn) + ON DELETE SET NULL, + CHECK (confidence IN ('high', 'medium', 'low', 'none')), + CHECK (probability >= 0 AND probability <= 1), + UNIQUE (produced_build_run_id, postcode, street_name) +); + +/* Links each final row to the candidates behind it, with a rank per link. */ +CREATE TABLE IF NOT EXISTS derived.postcode_streets_final_candidate ( + final_id bigint NOT NULL REFERENCES derived.postcode_streets_final (final_id) ON DELETE CASCADE, + candidate_id bigint NOT NULL REFERENCES derived.postcode_street_candidates (candidate_id) ON DELETE CASCADE, + produced_build_run_id uuid NOT NULL REFERENCES
meta.build_run (build_run_id) ON DELETE CASCADE, + link_rank integer NOT NULL, + PRIMARY KEY (final_id, candidate_id) +); + +/* Per-source contribution to a final row, keyed by (final_id, source_name, ingest_run_id, candidate_type). */ +CREATE TABLE IF NOT EXISTS derived.postcode_streets_final_source ( + final_id bigint NOT NULL REFERENCES derived.postcode_streets_final (final_id) ON DELETE CASCADE, + source_name text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + candidate_type text NOT NULL, + contribution_weight numeric(10,4) NOT NULL, + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + PRIMARY KEY (final_id, source_name, ingest_run_id, candidate_type), + CHECK (candidate_type IN ( + 'names_postcode_feature', + 'open_lids_toid_usrn', + 'uprn_usrn', + 'spatial_os_open_roads', + 'osni_gazetteer_direct', + 'spatial_dfi_highway', + 'ppd_parse_matched', + 'ppd_parse_unmatched' + )) +); + +/* Unit-level index in the internal schema, looked up by (build run, postcode, house_number). */ +CREATE TABLE IF NOT EXISTS internal.unit_index ( + index_id bigserial PRIMARY KEY, + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + postcode text NOT NULL, + house_number text NOT NULL, + street_name text NOT NULL, + usrn bigint, + confidence text NOT NULL, + source_type text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + created_at_utc timestamptz NOT NULL DEFAULT now(), + CHECK (confidence IN ('high', 'medium', 'low', 'none')) +); + +CREATE INDEX IF NOT EXISTS idx_unit_index_lookup + ON internal.unit_index (produced_build_run_id, postcode, house_number); + +/* Trigger function that always raises, naming the rejected operation (TG_OP). */ +CREATE OR REPLACE FUNCTION derived.reject_candidate_mutation() +RETURNS trigger +LANGUAGE plpgsql +AS $$ +BEGIN + RAISE EXCEPTION 'derived.postcode_street_candidates is append-only; % is not allowed', TG_OP; +END; +$$; + +/* Install the append-only guard for UPDATE and DELETE. NOTE(review): row-level triggers also fire for rows removed by ON DELETE CASCADE, so deleting a parent build run or postcode that still has candidates would be rejected as well; confirm that is intended. */ +DROP TRIGGER IF EXISTS trg_candidate_no_update ON derived.postcode_street_candidates; +CREATE TRIGGER trg_candidate_no_update +BEFORE UPDATE ON derived.postcode_street_candidates +FOR EACH ROW EXECUTE FUNCTION derived.reject_candidate_mutation(); + +DROP
TRIGGER IF EXISTS trg_candidate_no_delete ON derived.postcode_street_candidates; +CREATE TRIGGER trg_candidate_no_delete +BEFORE DELETE ON derived.postcode_street_candidates +FOR EACH ROW EXECUTE FUNCTION derived.reject_candidate_mutation(); + +/* Create the three roles only when missing (CREATE ROLE has no IF NOT EXISTS form). NOTE(review): pipeline_writer receives no privileges in this migration; confirm its grants are applied elsewhere. */ +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'pipeline_writer') THEN + CREATE ROLE pipeline_writer; + END IF; + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'api_reader') THEN + CREATE ROLE api_reader; + END IF; + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'audit_reader') THEN + CREATE ROLE audit_reader; + END IF; +END +$$; + +/* Remove default PUBLIC access to the internal schema and its existing tables. */ +REVOKE ALL ON SCHEMA internal FROM PUBLIC; +REVOKE ALL ON ALL TABLES IN SCHEMA internal FROM PUBLIC; + +/* GRANT ... ON ALL TABLES IN SCHEMA covers only tables that exist when it runs; tables created in api later need their own grant or default privileges. */ +GRANT USAGE ON SCHEMA api TO api_reader; +GRANT SELECT ON ALL TABLES IN SCHEMA api TO api_reader; + +/* Read-only audit access to the provenance tables and the derived evidence tables. */ +GRANT USAGE ON SCHEMA meta TO audit_reader; +GRANT SELECT ON meta.ingest_run, meta.ingest_run_file, meta.build_bundle, meta.build_bundle_source, + meta.build_run, meta.build_pass_checkpoint, meta.canonical_hash, meta.dataset_publication + TO audit_reader; +GRANT USAGE ON SCHEMA derived TO audit_reader; +GRANT SELECT ON derived.postcode_street_candidates, + derived.postcode_street_candidate_lineage, + derived.postcode_streets_final, + derived.postcode_streets_final_candidate, + derived.postcode_streets_final_source + TO audit_reader; + +COMMIT; diff --git a/pipeline/sql/migrations/0006_v3_open_roads_stage_compat.sql b/pipeline/sql/migrations/0006_v3_open_roads_stage_compat.sql new file mode 100644 index 0000000..6cea5e7 --- /dev/null +++ b/pipeline/sql/migrations/0006_v3_open_roads_stage_compat.sql @@ -0,0 +1,29 @@ +BEGIN; + +/* Drop stage.open_roads_segment only when it still has a release_id column, so the CREATE TABLE IF NOT EXISTS below can rebuild it with the build_run_id layout. */ +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'stage' + AND table_name = 'open_roads_segment' + AND column_name = 'release_id' + ) THEN + DROP TABLE stage.open_roads_segment CASCADE; + END IF; +END +$$; + +CREATE TABLE IF NOT EXISTS stage.open_roads_segment ( + build_run_id uuid NOT NULL
REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + segment_id text NOT NULL, + road_id text, + postcode_norm text, + usrn bigint, + road_name text NOT NULL, + road_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, segment_id) +); + +COMMIT; diff --git a/pipeline/sql/migrations/0007_v3_rename_os_open_lids.sql b/pipeline/sql/migrations/0007_v3_rename_os_open_lids.sql new file mode 100644 index 0000000..20df5c2 --- /dev/null +++ b/pipeline/sql/migrations/0007_v3_rename_os_open_lids.sql @@ -0,0 +1,71 @@ +BEGIN; + +/* Drop the source_name CHECK on each table first so the rename below is not rejected; the checks are re-added afterwards. */ +ALTER TABLE meta.ingest_run + DROP CONSTRAINT IF EXISTS ingest_run_source_name_check; + +ALTER TABLE meta.build_bundle_source + DROP CONSTRAINT IF EXISTS build_bundle_source_source_name_check; + +ALTER TABLE derived.postcode_street_candidates + DROP CONSTRAINT IF EXISTS postcode_street_candidates_source_name_check; + +/* Rewrite the legacy source name 'os_open_linked_identifiers' to 'os_open_lids'. */ +UPDATE meta.ingest_run +SET source_name = 'os_open_lids' +WHERE source_name = 'os_open_linked_identifiers'; + +UPDATE meta.build_bundle_source +SET source_name = 'os_open_lids' +WHERE source_name = 'os_open_linked_identifiers'; + +/* Only the user-defined append-only trigger has to be bypassed for this UPDATE. TRIGGER USER leaves the internal foreign-key triggers enabled and, unlike TRIGGER ALL, does not require superuser privileges. */ +ALTER TABLE derived.postcode_street_candidates DISABLE TRIGGER USER; +UPDATE derived.postcode_street_candidates +SET source_name = 'os_open_lids' +WHERE source_name = 'os_open_linked_identifiers'; +ALTER TABLE derived.postcode_street_candidates ENABLE TRIGGER USER; + +/* Re-add the source_name CHECK constraints with the renamed value. */ +ALTER TABLE meta.ingest_run + ADD CONSTRAINT ingest_run_source_name_check + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )); + +ALTER TABLE meta.build_bundle_source + ADD CONSTRAINT build_bundle_source_source_name_check + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )); + +ALTER TABLE
derived.postcode_street_candidates + ADD CONSTRAINT postcode_street_candidates_source_name_check + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )); + +COMMIT; diff --git a/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql b/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql new file mode 100644 index 0000000..c0e2b66 --- /dev/null +++ b/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql @@ -0,0 +1,16 @@ +BEGIN; + +/* Generic identifier-pair stage table; relation_type is limited to 'toid_usrn' or 'uprn_usrn'. */ +CREATE TABLE IF NOT EXISTS stage.open_lids_pair ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + id_1 text NOT NULL, + id_2 text NOT NULL, + relation_type text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, id_1, id_2, relation_type), + CHECK (relation_type IN ('toid_usrn', 'uprn_usrn')) +); + +CREATE INDEX IF NOT EXISTS idx_stage_open_lids_pair_run_relation + ON stage.open_lids_pair (build_run_id, relation_type); + +COMMIT; diff --git a/pipeline/sql/migrations/0009_v3_rename_raw_os_open_lids_table.sql b/pipeline/sql/migrations/0009_v3_rename_raw_os_open_lids_table.sql new file mode 100644 index 0000000..be5cd6b --- /dev/null +++ b/pipeline/sql/migrations/0009_v3_rename_raw_os_open_lids_table.sql @@ -0,0 +1,24 @@ +BEGIN; + +/* Rename the legacy raw table only when the old name exists and the new name does not, so the migration is a no-op on databases that already use the new name. */ +DO $$ +BEGIN + IF to_regclass('raw.os_open_linked_identifiers_row') IS NOT NULL + AND to_regclass('raw.os_open_lids_row') IS NULL THEN + ALTER TABLE raw.os_open_linked_identifiers_row + RENAME TO os_open_lids_row; + END IF; +END $$; + +/* Same guarded rename for the legacy ingest-run index. */ +DO $$ +BEGIN + IF to_regclass('raw.idx_raw_oli_run_id') IS NOT NULL + AND to_regclass('raw.idx_raw_os_open_lids_run_id') IS NULL THEN + ALTER INDEX raw.idx_raw_oli_run_id + RENAME TO idx_raw_os_open_lids_run_id; + END IF; +END $$; + +CREATE INDEX IF NOT EXISTS idx_raw_os_open_lids_run_id + ON raw.os_open_lids_row (ingest_run_id); + +COMMIT; diff --git
a/pipeline/sql/migrations/0010_v3_open_lids_naming.sql b/pipeline/sql/migrations/0010_v3_open_lids_naming.sql new file mode 100644 index 0000000..067c86e --- /dev/null +++ b/pipeline/sql/migrations/0010_v3_open_lids_naming.sql @@ -0,0 +1,115 @@ +BEGIN; + +/* Rename each legacy stage.oli_* table to its open_lids_* name, but only when the old name exists and the new one does not. */ +DO $$ +BEGIN + IF to_regclass('stage.oli_identifier_pair') IS NOT NULL + AND to_regclass('stage.open_lids_pair') IS NULL THEN + ALTER TABLE stage.oli_identifier_pair + RENAME TO open_lids_pair; + END IF; + + IF to_regclass('stage.oli_toid_usrn') IS NOT NULL + AND to_regclass('stage.open_lids_toid_usrn') IS NULL THEN + ALTER TABLE stage.oli_toid_usrn + RENAME TO open_lids_toid_usrn; + END IF; + + IF to_regclass('stage.oli_uprn_usrn') IS NOT NULL + AND to_regclass('stage.open_lids_uprn_usrn') IS NULL THEN + ALTER TABLE stage.oli_uprn_usrn + RENAME TO open_lids_uprn_usrn; + END IF; +END $$; + +/* Renaming a table does not rename its indexes, so the primary-key indexes and the relation index are renamed with the same guards. */ +DO $$ +BEGIN + IF to_regclass('stage.oli_identifier_pair_pkey') IS NOT NULL + AND to_regclass('stage.open_lids_pair_pkey') IS NULL THEN + ALTER INDEX stage.oli_identifier_pair_pkey + RENAME TO open_lids_pair_pkey; + END IF; + + IF to_regclass('stage.oli_toid_usrn_pkey') IS NOT NULL + AND to_regclass('stage.open_lids_toid_usrn_pkey') IS NULL THEN + ALTER INDEX stage.oli_toid_usrn_pkey + RENAME TO open_lids_toid_usrn_pkey; + END IF; + + IF to_regclass('stage.oli_uprn_usrn_pkey') IS NOT NULL + AND to_regclass('stage.open_lids_uprn_usrn_pkey') IS NULL THEN + ALTER INDEX stage.oli_uprn_usrn_pkey + RENAME TO open_lids_uprn_usrn_pkey; + END IF; + + IF to_regclass('stage.idx_stage_oli_identifier_pair_run_relation') IS NOT NULL + AND to_regclass('stage.idx_stage_open_lids_pair_run_relation') IS NULL THEN + ALTER INDEX stage.idx_stage_oli_identifier_pair_run_relation + RENAME TO idx_stage_open_lids_pair_run_relation; + END IF; +END $$; + +/* Create the table under its new name when neither the old nor the new name existed. */ +CREATE TABLE IF NOT EXISTS stage.open_lids_pair ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + id_1 text NOT NULL, + id_2 text NOT NULL, + relation_type
text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, id_1, id_2, relation_type), + CHECK (relation_type IN ('toid_usrn', 'uprn_usrn')) +); + +CREATE INDEX IF NOT EXISTS idx_stage_open_lids_pair_run_relation + ON stage.open_lids_pair (build_run_id, relation_type); + +/* Create the two typed link tables under their new names when they do not exist yet. */ +CREATE TABLE IF NOT EXISTS stage.open_lids_toid_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + toid text NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, toid, usrn) +); + +CREATE TABLE IF NOT EXISTS stage.open_lids_uprn_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn, usrn) +); + +/* Drop both candidate_type CHECK constraints so the legacy value 'oli_toid_usrn' can be rewritten; both are re-added after the rewrite. */ +ALTER TABLE derived.postcode_street_candidates + DROP CONSTRAINT IF EXISTS postcode_street_candidates_candidate_type_check; + +ALTER TABLE derived.postcode_streets_final_source + DROP CONSTRAINT IF EXISTS postcode_streets_final_source_candidate_type_check; + +/* Bypass only the user-defined append-only trigger while the candidate rows are rewritten. */ +ALTER TABLE derived.postcode_street_candidates DISABLE TRIGGER USER; + +UPDATE derived.postcode_street_candidates +SET candidate_type = 'open_lids_toid_usrn' +WHERE candidate_type = 'oli_toid_usrn'; + +UPDATE derived.postcode_street_candidates +SET evidence_ref = regexp_replace(evidence_ref, '^oli:', 'open_lids:') +WHERE evidence_ref LIKE 'oli:%'; + +ALTER TABLE derived.postcode_street_candidates ENABLE TRIGGER USER; + +/* Restore the candidate_type CHECK on the candidates table. It was dropped above, and without this statement the column would stay unconstrained after the migration. */ +ALTER TABLE derived.postcode_street_candidates + ADD CONSTRAINT postcode_street_candidates_candidate_type_check + CHECK (candidate_type IN ( + 'names_postcode_feature', + 'open_lids_toid_usrn', + 'uprn_usrn', + 'spatial_os_open_roads', + 'osni_gazetteer_direct', + 'spatial_dfi_highway', + 'ppd_parse_matched', + 'ppd_parse_unmatched' + )); + +UPDATE derived.postcode_streets_final_source +SET candidate_type = 'open_lids_toid_usrn' +WHERE candidate_type = 'oli_toid_usrn'; + +ALTER TABLE derived.postcode_streets_final_source + ADD CONSTRAINT postcode_streets_final_source_candidate_type_check + CHECK (candidate_type IN ( + 'names_postcode_feature', + 'open_lids_toid_usrn', + 'uprn_usrn', + 'spatial_os_open_roads', +
'osni_gazetteer_direct', + 'spatial_dfi_highway', + 'ppd_parse_matched', + 'ppd_parse_unmatched' + )); + +COMMIT; diff --git a/pipeline/sql/migrations/0011_v3_raw_run_rownum_indexes.sql b/pipeline/sql/migrations/0011_v3_raw_run_rownum_indexes.sql new file mode 100644 index 0000000..0c55d40 --- /dev/null +++ b/pipeline/sql/migrations/0011_v3_raw_run_rownum_indexes.sql @@ -0,0 +1,35 @@ +BEGIN; + +/* Replace each single-column ingest_run_id index with a composite (ingest_run_id, source_row_num) index on the same raw table; the composite index still serves lookups by ingest_run_id alone. */ +DROP INDEX IF EXISTS raw.idx_raw_onspd_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_usrn_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_names_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_roads_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_uprn_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_lids_run_id; +DROP INDEX IF EXISTS raw.idx_raw_nsul_run_id; +DROP INDEX IF EXISTS raw.idx_raw_osni_run_id; +DROP INDEX IF EXISTS raw.idx_raw_dfi_run_id; +DROP INDEX IF EXISTS raw.idx_raw_ppd_run_id; + +CREATE INDEX IF NOT EXISTS idx_raw_onspd_run_rownum + ON raw.onspd_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_usrn_run_rownum + ON raw.os_open_usrn_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_names_run_rownum + ON raw.os_open_names_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_roads_run_rownum + ON raw.os_open_roads_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_uprn_run_rownum + ON raw.os_open_uprn_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_lids_run_rownum + ON raw.os_open_lids_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_nsul_run_rownum + ON raw.nsul_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_osni_run_rownum + ON raw.osni_gazetteer_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_dfi_run_rownum + ON raw.dfi_highway_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_ppd_run_rownum + ON
raw.ppd_row (ingest_run_id, source_row_num); + +COMMIT; diff --git a/pipeline/sql/migrations/0012_v3_stage_tables_unlogged.sql b/pipeline/sql/migrations/0012_v3_stage_tables_unlogged.sql new file mode 100644 index 0000000..e9faeca --- /dev/null +++ b/pipeline/sql/migrations/0012_v3_stage_tables_unlogged.sql @@ -0,0 +1,16 @@ +BEGIN; + +/* Switch every stage table to UNLOGGED. Unlogged tables skip write-ahead logging, so PostgreSQL truncates them after a crash or unclean shutdown and does not replicate them to standbys. */ +ALTER TABLE stage.onspd_postcode SET UNLOGGED; +ALTER TABLE stage.streets_usrn_input SET UNLOGGED; +ALTER TABLE stage.open_names_road_feature SET UNLOGGED; +ALTER TABLE stage.open_roads_segment SET UNLOGGED; +ALTER TABLE stage.uprn_point SET UNLOGGED; +ALTER TABLE stage.open_lids_toid_usrn SET UNLOGGED; +ALTER TABLE stage.open_lids_uprn_usrn SET UNLOGGED; +ALTER TABLE stage.open_lids_pair SET UNLOGGED; +ALTER TABLE stage.nsul_uprn_postcode SET UNLOGGED; +ALTER TABLE stage.osni_street_point SET UNLOGGED; +ALTER TABLE stage.dfi_road_segment SET UNLOGGED; +ALTER TABLE stage.ppd_parsed_address SET UNLOGGED; + +COMMIT; diff --git a/pipeline/sql/migrations/0013_v3_raw_tables_unlogged.sql b/pipeline/sql/migrations/0013_v3_raw_tables_unlogged.sql new file mode 100644 index 0000000..4ea3d13 --- /dev/null +++ b/pipeline/sql/migrations/0013_v3_raw_tables_unlogged.sql @@ -0,0 +1,14 @@ +BEGIN; + +/* Same switch for every raw table. NOTE(review): after a crash these tables come back empty while their meta.ingest_run rows survive, so raw ingests would have to be re-run; confirm that trade-off is acceptable for raw data. */ +ALTER TABLE raw.onspd_row SET UNLOGGED; +ALTER TABLE raw.os_open_usrn_row SET UNLOGGED; +ALTER TABLE raw.os_open_names_row SET UNLOGGED; +ALTER TABLE raw.os_open_roads_row SET UNLOGGED; +ALTER TABLE raw.os_open_uprn_row SET UNLOGGED; +ALTER TABLE raw.os_open_lids_row SET UNLOGGED; +ALTER TABLE raw.nsul_row SET UNLOGGED; +ALTER TABLE raw.osni_gazetteer_row SET UNLOGGED; +ALTER TABLE raw.dfi_highway_row SET UNLOGGED; +ALTER TABLE raw.ppd_row SET UNLOGGED; + +COMMIT; diff --git a/pipeline/sql/migrations/0014_v3_stage_join_indexes.sql b/pipeline/sql/migrations/0014_v3_stage_join_indexes.sql new file mode 100644 index 0000000..946ebdb --- /dev/null +++ b/pipeline/sql/migrations/0014_v3_stage_join_indexes.sql @@ -0,0 +1,11 @@ +BEGIN; + +CREATE INDEX IF NOT
EXISTS idx_stage_open_names_run_toid + ON stage.open_names_road_feature (build_run_id, toid) + WHERE toid IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_stage_open_names_run_postcode + ON stage.open_names_road_feature (build_run_id, postcode_norm) + WHERE postcode_norm IS NOT NULL; + +COMMIT; diff --git a/pipeline/src/pipeline/AGENTS.md b/pipeline/src/pipeline/AGENTS.md new file mode 100644 index 0000000..5ea0d2b --- /dev/null +++ b/pipeline/src/pipeline/AGENTS.md @@ -0,0 +1,25 @@ +# pipeline/src/pipeline/AGENTS.md + +## Scope +Python runtime internals for ingest/build/verify/publish. + +## Critical Rule +Keep docs in lockstep with behavior changes. Update: +- `docs/spec/pipeline_v3/spec.md` +- `docs/spec/pipeline_v3/data_model.md` +- `docs/spec/pipeline_v3/canonicalisation.md` +- relevant pages under `docs/architecture/` + +## Module Map +- `cli.py`: command surface and run flow +- `manifest.py`: source/bundle manifest validation +- `ingest/workflows.py`: raw ingest into `raw.*` +- `build/workflows.py`: pass execution 0a..8, provenance, finalisation, publish +- `db/migrations.py`: migration execution +- `util/normalise.py`: canonical text/postcode normalization + +## Common Pitfalls +- nondeterministic ordering in SQL +- mutating append-only evidence tables +- schema mapping drift between manifests/raw payload fields +- config changes without docs/test updates diff --git a/pipeline/src/pipeline/__init__.py b/pipeline/src/pipeline/__init__.py new file mode 100644 index 0000000..06154e5 --- /dev/null +++ b/pipeline/src/pipeline/__init__.py @@ -0,0 +1 @@ +"""Phase 1 pipeline package.""" diff --git a/pipeline/src/pipeline/build/__init__.py b/pipeline/src/pipeline/build/__init__.py new file mode 100644 index 0000000..90fb536 --- /dev/null +++ b/pipeline/src/pipeline/build/__init__.py @@ -0,0 +1 @@ +"""Build lifecycle workflows for release sets.""" diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py new file mode 100644 index 
0000000..b4d3e73 --- /dev/null +++ b/pipeline/src/pipeline/build/workflows.py @@ -0,0 +1,3284 @@ +"""Build bundle, pass execution, verification, and publish workflows for Pipeline V3.""" + +from __future__ import annotations + +import hashlib +import json +import re +import uuid +from dataclasses import dataclass +from decimal import Decimal +from pathlib import Path +from typing import Any + +import psycopg +from psycopg import sql +from psycopg.types.json import Jsonb + +from pipeline.config import ( + frequency_weights_config_path, + source_schema_config_path, +) +from pipeline.manifest import BUILD_PROFILES, BuildBundleManifest +from pipeline.util.normalise import postcode_display, postcode_norm, street_casefold + + +class BuildError(RuntimeError): + """Raised for build lifecycle errors.""" + + +@dataclass(frozen=True) +class BuildBundleResult: + bundle_id: str + status: str + bundle_hash: str + + +@dataclass(frozen=True) +class BuildRunResult: + build_run_id: str + status: str + dataset_version: str + message: str + + +@dataclass(frozen=True) +class VerifyResult: + build_run_id: str + status: str + object_hashes: dict[str, str] + + +@dataclass(frozen=True) +class PublishResult: + build_run_id: str + dataset_version: str + status: str + + +PASS_ORDER = ( + "0a_raw_ingest", + "0b_stage_normalisation", + "1_onspd_backbone", + "2_gb_canonical_streets", + "3_open_names_candidates", + "4_uprn_reinforcement", + "5_gb_spatial_fallback", + "6_ni_candidates", + "7_ppd_gap_fill", + "8_finalisation", +) + +RAW_TABLE_BY_SOURCE = { + "onspd": "raw.onspd_row", + "os_open_usrn": "raw.os_open_usrn_row", + "os_open_names": "raw.os_open_names_row", + "os_open_roads": "raw.os_open_roads_row", + "os_open_uprn": "raw.os_open_uprn_row", + "os_open_lids": "raw.os_open_lids_row", + "nsul": "raw.nsul_row", + "osni_gazetteer": "raw.osni_gazetteer_row", + "dfi_highway": "raw.dfi_highway_row", + "ppd": "raw.ppd_row", +} + +CANDIDATE_TYPES = ( + "names_postcode_feature", + 
"open_lids_toid_usrn", + "uprn_usrn", + "spatial_os_open_roads", + "osni_gazetteer_direct", + "spatial_dfi_highway", + "ppd_parse_matched", + "ppd_parse_unmatched", +) + + +def _load_json_config(path: Path) -> dict[str, Any]: + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise BuildError(f"Invalid JSON config: {path}") from exc + if not isinstance(payload, dict): + raise BuildError(f"Config root must be object: {path}") + return payload + + +def _schema_config() -> dict[str, Any]: + return _load_json_config(source_schema_config_path()) + + +def _weight_config() -> dict[str, Decimal]: + payload = _load_json_config(frequency_weights_config_path()) + raw_weights = payload.get("weights") + if not isinstance(raw_weights, dict): + raise BuildError("frequency_weights config must contain object key 'weights'") + + parsed: dict[str, Decimal] = {} + for key, value in raw_weights.items(): + if not isinstance(key, str): + raise BuildError("frequency weight keys must be strings") + try: + weight = Decimal(str(value)) + except Exception as exc: # pragma: no cover + raise BuildError(f"Invalid frequency weight for {key}: {value}") from exc + parsed[key] = weight + + missing = sorted(set(CANDIDATE_TYPES) - set(parsed.keys())) + if missing: + raise BuildError( + "frequency_weights missing candidate types: " + ", ".join(missing) + ) + + for candidate_type, weight in parsed.items(): + if weight <= Decimal("0"): + raise BuildError( + f"frequency weight must be > 0 for candidate_type={candidate_type}; got {weight}" + ) + + unknown = sorted(set(parsed.keys()) - set(CANDIDATE_TYPES)) + if unknown: + raise BuildError( + "frequency_weights has unknown candidate types: " + ", ".join(unknown) + ) + + return {candidate_type: parsed[candidate_type] for candidate_type in CANDIDATE_TYPES} + + +def _bundle_hash(build_profile: str, source_runs: dict[str, tuple[str, ...]]) -> str: + normalized_source_runs = { + source_name: sorted(run_ids) + 
for source_name, run_ids in source_runs.items() + } + payload = { + "build_profile": build_profile, + "source_runs": { + key: normalized_source_runs[key] for key in sorted(normalized_source_runs.keys()) + }, + } + encoded = json.dumps(payload, separators=(",", ":"), ensure_ascii=True).encode("utf-8") + return hashlib.sha256(encoded).hexdigest() + + +def _dataset_version_from_bundle_hash(bundle_hash: str) -> str: + return f"v3_{bundle_hash[:12]}" + + +def _safe_version_suffix(dataset_version: str) -> str: + suffix = re.sub(r"[^A-Za-z0-9_]", "_", dataset_version) + return suffix or "v3" + + +def create_build_bundle(conn: psycopg.Connection, manifest: BuildBundleManifest) -> BuildBundleResult: + bundle_hash = _bundle_hash(manifest.build_profile, manifest.source_runs) + + with conn.cursor() as cur: + cur.execute( + """ + SELECT bundle_id + FROM meta.build_bundle + WHERE build_profile = %s + AND bundle_hash = %s + """, + (manifest.build_profile, bundle_hash), + ) + existing = cur.fetchone() + if existing is not None: + return BuildBundleResult( + bundle_id=str(existing[0]), + status="existing", + bundle_hash=bundle_hash, + ) + + required_sources = BUILD_PROFILES[manifest.build_profile] + missing = sorted(required_sources - set(manifest.source_runs.keys())) + if missing: + raise BuildError( + "Bundle manifest missing required sources: " + ", ".join(missing) + ) + + with conn.cursor() as cur: + for source_name in sorted(required_sources): + run_ids = manifest.source_runs[source_name] + if source_name == "ppd": + if len(run_ids) == 0: + raise BuildError("Bundle must include at least one ppd ingest run") + else: + if len(run_ids) != 1: + raise BuildError( + f"Source {source_name} must map to exactly one ingest run in a bundle" + ) + + for run_id in run_ids: + cur.execute( + """ + SELECT source_name + FROM meta.ingest_run + WHERE run_id = %s + """, + (run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Unknown ingest_run_id for source {source_name}: 
{run_id}") + if row[0] != source_name: + raise BuildError( + f"Ingest run/source mismatch: source={source_name} run_id={run_id} row_source={row[0]}" + ) + + bundle_id = str(uuid.uuid4()) + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO meta.build_bundle ( + bundle_id, + build_profile, + bundle_hash, + status, + created_at_utc + ) VALUES (%s, %s, %s, 'created', now()) + """, + (bundle_id, manifest.build_profile, bundle_hash), + ) + + for source_name, run_ids in manifest.source_runs.items(): + for ingest_run_id in run_ids: + cur.execute( + """ + INSERT INTO meta.build_bundle_source ( + bundle_id, + source_name, + ingest_run_id + ) VALUES (%s, %s, %s) + """, + (bundle_id, source_name, ingest_run_id), + ) + + return BuildBundleResult(bundle_id=bundle_id, status="created", bundle_hash=bundle_hash) + + +def _load_bundle( + conn: psycopg.Connection, + bundle_id: str, +) -> tuple[str, str, str, dict[str, tuple[str, ...]]]: + with conn.cursor() as cur: + cur.execute( + """ + SELECT build_profile, bundle_hash, status + FROM meta.build_bundle + WHERE bundle_id = %s + FOR UPDATE + """, + (bundle_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Bundle not found: {bundle_id}") + build_profile, bundle_hash, status = row + + cur.execute( + """ + SELECT source_name, ingest_run_id::text + FROM meta.build_bundle_source + WHERE bundle_id = %s + """, + (bundle_id,), + ) + source_rows = cur.fetchall() + + source_runs_map: dict[str, list[str]] = {} + for source_name, ingest_run_id in source_rows: + source_runs_map.setdefault(source_name, []).append(ingest_run_id) + + source_runs: dict[str, tuple[str, ...]] = { + source_name: tuple(sorted(run_ids)) + for source_name, run_ids in source_runs_map.items() + } + + required = BUILD_PROFILES[build_profile] + missing = sorted(required - set(source_runs.keys())) + if missing: + raise BuildError( + f"Bundle {bundle_id} missing required sources for profile {build_profile}: {', '.join(missing)}" + ) + + return 
build_profile, bundle_hash, status, source_runs + + +def _latest_resumable_run(conn: psycopg.Connection, bundle_id: str) -> tuple[str, str] | None: + with conn.cursor() as cur: + cur.execute( + """ + SELECT build_run_id::text, dataset_version + FROM meta.build_run + WHERE bundle_id = %s + AND status IN ('started', 'failed') + ORDER BY started_at_utc DESC + LIMIT 1 + """, + (bundle_id,), + ) + row = cur.fetchone() + if row is None: + return None + return row[0], row[1] + + +def _load_completed_passes(conn: psycopg.Connection, build_run_id: str) -> set[str]: + with conn.cursor() as cur: + cur.execute( + """ + SELECT pass_name + FROM meta.build_pass_checkpoint + WHERE build_run_id = %s + """, + (build_run_id,), + ) + return {row[0] for row in cur.fetchall()} + + +def _single_source_run(source_runs: dict[str, tuple[str, ...]], source_name: str) -> str: + run_ids = source_runs.get(source_name, ()) + if len(run_ids) != 1: + raise BuildError( + f"Source {source_name} requires exactly one ingest run in bundle; found {len(run_ids)}" + ) + return run_ids[0] + + +def _ordered_run_ids(conn: psycopg.Connection, run_ids: tuple[str, ...]) -> tuple[str, ...]: + if not run_ids: + return () + with conn.cursor() as cur: + cur.execute( + """ + SELECT run_id::text + FROM meta.ingest_run + WHERE run_id = ANY(%s::uuid[]) + ORDER BY retrieved_at_utc ASC, run_id ASC + """, + (list(run_ids),), + ) + ordered = tuple(row[0] for row in cur.fetchall()) + if len(ordered) != len(run_ids): + raise BuildError("One or more ingest run IDs could not be resolved for ordered execution") + return ordered + + +def _create_build_run(conn: psycopg.Connection, bundle_id: str, dataset_version: str) -> str: + build_run_id = str(uuid.uuid4()) + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO meta.build_run ( + build_run_id, + bundle_id, + dataset_version, + status, + current_pass, + started_at_utc + ) VALUES (%s, %s, %s, 'started', 'initialising', now()) + """, + (build_run_id, bundle_id, 
dataset_version), + ) + return build_run_id + + +def _set_build_run_pass(conn: psycopg.Connection, build_run_id: str, pass_name: str) -> None: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE meta.build_run + SET current_pass = %s + WHERE build_run_id = %s + """, + (pass_name, build_run_id), + ) + + +def _mark_pass_checkpoint( + conn: psycopg.Connection, + build_run_id: str, + pass_name: str, + row_count_summary: dict[str, int], +) -> None: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO meta.build_pass_checkpoint ( + build_run_id, + pass_name, + completed_at_utc, + row_count_summary_json + ) VALUES (%s, %s, now(), %s) + ON CONFLICT (build_run_id, pass_name) + DO UPDATE SET + completed_at_utc = EXCLUDED.completed_at_utc, + row_count_summary_json = EXCLUDED.row_count_summary_json + """, + (build_run_id, pass_name, Jsonb(row_count_summary)), + ) + + +def _mark_build_failed(conn: psycopg.Connection, build_run_id: str, current_pass: str, error_text: str) -> None: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE meta.build_run + SET status = 'failed', + current_pass = %s, + error_text = %s, + finished_at_utc = now() + WHERE build_run_id = %s + """, + (current_pass, error_text, build_run_id), + ) + + +def _mark_build_built(conn: psycopg.Connection, bundle_id: str, build_run_id: str) -> None: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE meta.build_run + SET status = 'built', + current_pass = 'complete', + finished_at_utc = now(), + error_text = NULL + WHERE build_run_id = %s + """, + (build_run_id,), + ) + cur.execute( + """ + UPDATE meta.build_bundle + SET status = 'built' + WHERE bundle_id = %s + """, + (bundle_id,), + ) + + +RAW_FETCH_BATCH_SIZE = 5000 +STAGE_INSERT_BATCH_SIZE = 5000 + + +def _iter_validated_raw_rows( + conn: psycopg.Connection, + *, + source_name: str, + raw_table: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +): + schema_name, table_name = 
raw_table.split(".", 1) + cursor_name = f"stage_raw_{table_name}_{uuid.uuid4().hex[:8]}" + with conn.cursor(name=cursor_name) as cur: + cur.itersize = RAW_FETCH_BATCH_SIZE + cur.execute( + sql.SQL( + """ + SELECT payload_jsonb + FROM {}.{} + WHERE ingest_run_id = %s + ORDER BY source_row_num ASC + """ + ).format(sql.Identifier(schema_name), sql.Identifier(table_name)), + (ingest_run_id,), + ) + first = cur.fetchone() + if first is None: + raise BuildError(f"Raw source is empty for {source_name}; cannot stage-normalise") + + first_row = first[0] + _assert_required_mapped_fields_present( + source_name=source_name, + sample_row=first_row, + field_map=field_map, + required_fields=required_fields, + ) + yield first_row + + while True: + chunk = cur.fetchmany(RAW_FETCH_BATCH_SIZE) + if not chunk: + break + for row in chunk: + yield row[0] + + +def _mapped_fields_for_source(schema_config: dict[str, Any], source_name: str) -> tuple[dict[str, str], tuple[str, ...]]: + sources = schema_config.get("sources") + if not isinstance(sources, dict): + raise BuildError("source_schema.yaml missing object key 'sources'") + + source_cfg = sources.get(source_name) + if not isinstance(source_cfg, dict): + raise BuildError(f"source_schema.yaml missing source block: {source_name}") + + field_map_raw = source_cfg.get("field_map") + required_raw = source_cfg.get("required_fields") + if not isinstance(field_map_raw, dict): + raise BuildError(f"source_schema.yaml source {source_name} missing field_map object") + if not isinstance(required_raw, list): + raise BuildError(f"source_schema.yaml source {source_name} missing required_fields list") + + field_map: dict[str, str] = {} + for key, value in field_map_raw.items(): + if not isinstance(key, str) or not isinstance(value, str): + raise BuildError(f"source_schema field_map for {source_name} must be string:string") + field_map[key] = value + + required_fields = [] + for item in required_raw: + if not isinstance(item, str): + raise 
BuildError(f"source_schema required_fields for {source_name} must be strings") + if item not in field_map: + raise BuildError( + f"source_schema required field '{item}' missing from field_map for {source_name}" + ) + required_fields.append(item) + + return field_map, tuple(required_fields) + + +def _assert_required_mapped_fields_present( + source_name: str, + sample_row: dict[str, Any], + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> None: + sample_keys = set(sample_row.keys()) + missing = [] + for key in required_fields: + candidates = _field_name_candidates(field_map, key) + if not any(candidate in sample_keys for candidate in candidates): + missing.append("/".join(candidates)) + if missing: + raise BuildError( + f"Schema mapping unresolved for {source_name}; missing mapped fields in raw rows: " + + ", ".join(sorted(missing)) + ) + + +def _field_name_candidates(field_map: dict[str, str], logical_key: str) -> tuple[str, ...]: + names: list[str] = [] + mapped = field_map.get(logical_key) + if mapped: + names.append(mapped) + names.append(logical_key) + legacy_aliases = { + "id_1": ("identifier_1", "left_id"), + "id_2": ("identifier_2", "right_id"), + "identifier_1": ("id_1", "left_id"), + "identifier_2": ("id_2", "right_id"), + "left_id": ("id_1", "identifier_1"), + "right_id": ("id_2", "identifier_2"), + } + aliases = legacy_aliases.get(logical_key, ()) + names.extend(aliases) + + expanded: list[str] = [] + for name in names: + expanded.append(name) + expanded.append(name.lower()) + expanded.append(name.upper()) + + deduped: list[str] = [] + seen: set[str] = set() + for name in expanded: + if name not in seen: + deduped.append(name) + seen.add(name) + return tuple(deduped) + + +def _field_value(row: dict[str, Any], field_map: dict[str, str], logical_key: str) -> Any: + for candidate in _field_name_candidates(field_map, logical_key): + if candidate in row: + return row.get(candidate) + return None + + +def _validated_raw_sample_row( + conn: 
psycopg.Connection, + *, + source_name: str, + raw_table: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> dict[str, Any]: + schema_name, table_name = raw_table.split(".", 1) + with conn.cursor() as cur: + cur.execute( + sql.SQL( + """ + SELECT payload_jsonb + FROM {}.{} + WHERE ingest_run_id = %s + ORDER BY source_row_num ASC + LIMIT 1 + """ + ).format(sql.Identifier(schema_name), sql.Identifier(table_name)), + (ingest_run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Raw source is empty for {source_name}; cannot stage-normalise") + + sample_row = row[0] + _assert_required_mapped_fields_present( + source_name=source_name, + sample_row=sample_row, + field_map=field_map, + required_fields=required_fields, + ) + return sample_row + + +def _json_text_from_candidates(payload_expr: sql.SQL, candidates: tuple[str, ...]) -> sql.SQL: + if len(candidates) == 0: + return sql.SQL("NULL") + lookups = [ + sql.SQL("{} ->> {}").format(payload_expr, sql.Literal(candidate)) + for candidate in candidates + ] + return sql.SQL("COALESCE({})").format(sql.SQL(", ").join(lookups)) + + +def _json_text_for_field(payload_expr: sql.SQL, field_map: dict[str, str], logical_key: str) -> sql.SQL: + return _json_text_from_candidates(payload_expr, _field_name_candidates(field_map, logical_key)) + + +def _schema_insert_rows( + conn: psycopg.Connection, + query: sql.SQL, + rows: list[tuple[Any, ...]], +) -> int: + if not rows: + return 0 + with conn.cursor() as cur: + cur.executemany(query, rows) + return len(rows) + + +def _flush_stage_batch( + conn: psycopg.Connection, + query: sql.SQL, + payload: list[tuple[Any, ...]], +) -> int: + if not payload: + return 0 + inserted = _schema_insert_rows(conn, query, payload) + payload.clear() + return inserted + + +STAGE_TABLES = ( + "stage.ppd_parsed_address", + "stage.dfi_road_segment", + "stage.osni_street_point", + "stage.nsul_uprn_postcode", + "stage.open_lids_uprn_usrn", + 
"stage.open_lids_toid_usrn", + "stage.open_lids_pair", + "stage.uprn_point", + "stage.open_roads_segment", + "stage.open_names_road_feature", + "stage.streets_usrn_input", + "stage.onspd_postcode", +) + + +def _analyze_relations(conn: psycopg.Connection, relations: tuple[str, ...]) -> None: + with conn.cursor() as cur: + for relation in relations: + schema_name, table_name = relation.split(".", 1) + cur.execute( + sql.SQL("ANALYZE {}.{}").format( + sql.Identifier(schema_name), + sql.Identifier(table_name), + ) + ) + + +def _assert_no_other_started_build(conn: psycopg.Connection, build_run_id: str) -> None: + with conn.cursor() as cur: + cur.execute( + """ + SELECT build_run_id::text, current_pass + FROM meta.build_run + WHERE status = 'started' + AND build_run_id <> %s + ORDER BY started_at_utc ASC + LIMIT 1 + """, + (build_run_id,), + ) + row = cur.fetchone() + + if row is not None: + other_run_id, current_pass = row + raise BuildError( + "Stage truncate is unsafe while another build is in status=started; " + f"other_build_run_id={other_run_id} other_current_pass={current_pass}" + ) + + +def _stage_cleanup(conn: psycopg.Connection, build_run_id: str) -> None: + _assert_no_other_started_build(conn, build_run_id) + table_identifiers = [] + for table in STAGE_TABLES: + schema_name, table_name = table.split(".", 1) + table_identifiers.append( + sql.SQL("{}.{}").format( + sql.Identifier(schema_name), + sql.Identifier(table_name), + ) + ) + + with conn.cursor() as cur: + # Stage tables are transient build artifacts; truncation keeps runtime stable + # across rebuilds by preventing historical-row/index accumulation. + cur.execute( + sql.SQL("TRUNCATE TABLE {}").format(sql.SQL(", ").join(table_identifiers)) + ) + + +def _pass_0a_raw_ingest( + conn: psycopg.Connection, + build_run_id: str, + source_runs: dict[str, tuple[str, ...]], +) -> dict[str, int]: + del build_run_id # Pass 0a validates bundle/run metadata only. 
+ counts: dict[str, int] = {} + with conn.cursor() as cur: + for source_name, run_ids in sorted(source_runs.items()): + total_row_count = 0 + for ingest_run_id in run_ids: + cur.execute( + """ + SELECT source_name, record_count + FROM meta.ingest_run + WHERE run_id = %s + """, + (ingest_run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError( + f"Pass 0a failed: ingest run missing in metadata source={source_name} run={ingest_run_id}" + ) + row_source_name, record_count = row + if row_source_name != source_name: + raise BuildError( + "Pass 0a failed: ingest run/source mismatch " + f"bundle_source={source_name} run_source={row_source_name} run={ingest_run_id}" + ) + row_count = int(record_count or 0) + if row_count <= 0: + raise BuildError( + "Pass 0a failed: source has no recorded rows for " + f"source={source_name} run={ingest_run_id}" + ) + total_row_count += row_count + counts[source_name] = total_row_count + return counts + + +def _country_enrichment_available(country_iso2: str, subdivision_code: str | None) -> bool: + if subdivision_code in {"GB-ENG", "GB-SCT", "GB-WLS", "GB-NIR"}: + return True + if country_iso2 == "GB": + return True + return False + + +def _onspd_country_mapping(value: str | None) -> tuple[str, str, str | None]: + code = (value or "").strip().upper() + mapping = { + "E92000001": ("GB", "GBR", "GB-ENG"), + "S92000003": ("GB", "GBR", "GB-SCT"), + "W92000004": ("GB", "GBR", "GB-WLS"), + "N92000002": ("GB", "GBR", "GB-NIR"), + } + if code in mapping: + return mapping[code] + if code in {"GB", "GBR"}: + return "GB", "GBR", None + return "GB", "GBR", None + + +def _normalise_onspd_status(value: str | None) -> str: + raw = (value or "").strip() + if raw == "": + return "active" + lowered = raw.lower() + if lowered in {"active", "terminated"}: + return lowered + return "terminated" + + +def _populate_stage_onspd( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: 
tuple[str, ...], +) -> int: + insert_sql = sql.SQL( + """ + INSERT INTO stage.onspd_postcode ( + build_run_id, + postcode_norm, + postcode_display, + status, + lat, + lon, + easting, + northing, + country_iso2, + country_iso3, + subdivision_code, + post_town, + locality, + street_enrichment_available, + onspd_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """ + ) + + payload: list[tuple[Any, ...]] = [] + inserted = 0 + for row in _iter_validated_raw_rows( + conn, + source_name="onspd", + raw_table="raw.onspd_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + postcode_raw = _field_value(row, field_map, "postcode") + postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) + postcode_d = postcode_display(str(postcode_raw) if postcode_raw is not None else None) + if postcode_n is None or postcode_d is None: + continue + + status_key = field_map.get("status") + status = _normalise_onspd_status( + str(row.get(status_key)) if status_key and row.get(status_key) is not None else None + ) + + country_key = field_map.get("subdivision_code") or field_map.get("country_iso2") + mapped_country_value = ( + str(row.get(country_key)) if country_key and row.get(country_key) is not None else None + ) + country_iso2, country_iso3, subdivision_code = _onspd_country_mapping(mapped_country_value) + + lat_raw = _field_value(row, field_map, "lat") + lon_raw = _field_value(row, field_map, "lon") + easting_raw = _field_value(row, field_map, "easting") + northing_raw = _field_value(row, field_map, "northing") + + lat: Decimal | None + lon: Decimal | None + try: + lat = Decimal(str(lat_raw)).quantize(Decimal("0.000001")) if lat_raw not in (None, "") else None + lon = Decimal(str(lon_raw)).quantize(Decimal("0.000001")) if lon_raw not in (None, "") else None + except Exception: + lat = None + lon = None + + try: + easting = int(float(easting_raw)) if easting_raw not in (None, "") else 
None + northing = int(float(northing_raw)) if northing_raw not in (None, "") else None + except Exception: + easting = None + northing = None + + post_town_raw = _field_value(row, field_map, "post_town") + locality_raw = _field_value(row, field_map, "locality") + + payload.append( + ( + build_run_id, + postcode_n, + postcode_d, + status, + lat, + lon, + easting, + northing, + country_iso2, + country_iso3, + subdivision_code, + str(post_town_raw).strip().upper() if post_town_raw not in (None, "") else None, + str(locality_raw).strip().upper() if locality_raw not in (None, "") else None, + _country_enrichment_available(country_iso2, subdivision_code), + ingest_run_id, + ) + ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) + + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted + + +def _populate_stage_usrn( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + insert_sql = sql.SQL( + """ + INSERT INTO stage.streets_usrn_input ( + build_run_id, + usrn, + street_name, + street_name_casefolded, + street_class, + street_status, + usrn_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, usrn) + DO UPDATE SET + street_name = EXCLUDED.street_name, + street_name_casefolded = EXCLUDED.street_name_casefolded, + street_class = EXCLUDED.street_class, + street_status = EXCLUDED.street_status, + usrn_run_id = EXCLUDED.usrn_run_id + """ + ) + + payload: list[tuple[Any, ...]] = [] + inserted = 0 + street_name_key = field_map.get("street_name") + street_class_key = field_map.get("street_class") + street_status_key = field_map.get("street_status") + for row in _iter_validated_raw_rows( + conn, + source_name="os_open_usrn", + raw_table="raw.os_open_usrn_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + usrn_raw = _field_value(row, 
field_map, "usrn") + name_raw = row.get(street_name_key) if street_name_key else None + if usrn_raw in (None, "") or name_raw in (None, ""): + continue + try: + usrn = int(usrn_raw) + except Exception: + continue + street_name = str(name_raw).strip() + folded = street_casefold(street_name) + if not street_name or folded is None: + continue + + payload.append( + ( + build_run_id, + usrn, + street_name, + folded, + str(row.get(street_class_key)).strip() if street_class_key and row.get(street_class_key) not in (None, "") else None, + str(row.get(street_status_key)).strip() if street_status_key and row.get(street_status_key) not in (None, "") else None, + ingest_run_id, + ) + ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) + + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted + + +def _populate_stage_open_names( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + insert_sql = sql.SQL( + """ + INSERT INTO stage.open_names_road_feature ( + build_run_id, + feature_id, + toid, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, feature_id) + DO UPDATE SET + toid = EXCLUDED.toid, + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) + + payload: list[tuple[Any, ...]] = [] + inserted = 0 + toid_key = field_map.get("toid") + postcode_key = field_map.get("postcode") + local_type_key = field_map.get("local_type") + for row in _iter_validated_raw_rows( + conn, + source_name="os_open_names", + raw_table="raw.os_open_names_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + feature_id_raw = _field_value(row, 
field_map, "feature_id") + street_raw = _field_value(row, field_map, "street_name") + postcode_raw = row.get(postcode_key) if postcode_key else None + toid_raw = row.get(toid_key) if toid_key else None + if feature_id_raw in (None, "") or street_raw in (None, ""): + continue + + local_type = str(row.get(local_type_key)).strip().lower() if local_type_key and row.get(local_type_key) not in (None, "") else "" + if local_type and "road" not in local_type and "transport" not in local_type: + continue + + folded = street_casefold(str(street_raw)) + postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) + if folded is None: + continue + + payload.append( + ( + build_run_id, + str(feature_id_raw).strip(), + str(toid_raw).strip() if toid_raw not in (None, "") else None, + postcode_n, + str(street_raw).strip(), + folded, + ingest_run_id, + ) + ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) + + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted + + +def _populate_stage_open_roads( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + insert_sql = sql.SQL( + """ + INSERT INTO stage.open_roads_segment ( + build_run_id, + segment_id, + road_id, + postcode_norm, + usrn, + road_name, + road_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, segment_id) + DO UPDATE SET + road_id = EXCLUDED.road_id, + postcode_norm = EXCLUDED.postcode_norm, + usrn = EXCLUDED.usrn, + road_name = EXCLUDED.road_name, + road_name_casefolded = EXCLUDED.road_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) + + payload: list[tuple[Any, ...]] = [] + inserted = 0 + postcode_key = field_map.get("postcode") + usrn_key = field_map.get("usrn") + road_id_key = field_map.get("road_id") + for row in _iter_validated_raw_rows( + conn, + 
def _populate_stage_open_uprn(
    conn: psycopg.Connection,
    build_run_id: str,
    ingest_run_id: str,
    field_map: dict[str, str],
    required_fields: tuple[str, ...],
) -> int:
    """Stage one row per UPRN from the raw OS Open UPRN snapshot.

    The whole transformation runs as a single SQL statement:

    * the UPRN and postcode are pulled out of ``payload_jsonb`` using the
      source column names held in ``field_map``;
    * rows whose trimmed UPRN is not purely digits are discarded;
    * the postcode is upper-cased with every non-alphanumeric character
      removed (an empty result becomes NULL);
    * when a UPRN appears more than once, the row with the highest
      ``source_row_num`` is kept;
    * the result is upserted into ``stage.uprn_point``.

    Args:
        conn: Open psycopg connection.
        build_run_id: Build run the staged rows are written under.
        ingest_run_id: Ingest run whose raw rows are read.
        field_map: Logical field name -> raw payload key.
        required_fields: Passed through to ``_validated_raw_sample_row``.

    Returns:
        The cursor ``rowcount`` reported for the upsert.
    """
    # NOTE(review): presumably raises when the raw snapshot lacks the mapped
    # required fields - confirm against _validated_raw_sample_row.
    _validated_raw_sample_row(
        conn,
        source_name="os_open_uprn",
        raw_table="raw.os_open_uprn_row",
        ingest_run_id=ingest_run_id,
        field_map=field_map,
        required_fields=required_fields,
    )

    jsonb_column = sql.SQL("r.payload_jsonb")
    statement = sql.SQL(
        """
        WITH extracted AS (
            SELECT
                r.source_row_num,
                btrim({uprn_expr}) AS uprn_text,
                btrim({postcode_expr}) AS postcode_text
            FROM raw.os_open_uprn_row AS r
            WHERE r.ingest_run_id = %s
        ),
        filtered AS (
            SELECT
                source_row_num,
                uprn_text::bigint AS uprn,
                NULLIF(
                    upper(regexp_replace(COALESCE(postcode_text, ''), '[^A-Za-z0-9]', '', 'g')),
                    ''
                ) AS postcode_norm
            FROM extracted
            WHERE uprn_text IS NOT NULL
              AND uprn_text <> ''
              AND uprn_text ~ '^[0-9]+$'
        ),
        deduped AS (
            SELECT DISTINCT ON (uprn)
                uprn,
                postcode_norm
            FROM filtered
            ORDER BY uprn ASC, source_row_num DESC
        )
        INSERT INTO stage.uprn_point (
            build_run_id,
            uprn,
            postcode_norm,
            ingest_run_id
        )
        SELECT
            %s,
            d.uprn,
            d.postcode_norm,
            %s
        FROM deduped AS d
        ON CONFLICT (build_run_id, uprn)
        DO UPDATE SET
            postcode_norm = EXCLUDED.postcode_norm,
            ingest_run_id = EXCLUDED.ingest_run_id
        """
    ).format(
        uprn_expr=_json_text_for_field(jsonb_column, field_map, "uprn"),
        postcode_expr=_json_text_for_field(jsonb_column, field_map, "postcode"),
    )
    # Placeholder order: raw-row filter, then the two constant INSERT columns.
    bind_values = (ingest_run_id, build_run_id, ingest_run_id)

    with conn.cursor() as cur:
        cur.execute(statement, bind_values)
        affected = cur.rowcount
    return int(affected)
right_id <> '') AS right_present, + (lower(COALESCE(left_id, '')) LIKE 'osgb%%') AS left_is_toid, + (lower(COALESCE(right_id, '')) LIKE 'osgb%%') AS right_is_toid, + (COALESCE(left_id, '') ~ '^[0-9]+$') AS left_is_digits, + (COALESCE(right_id, '') ~ '^[0-9]+$') AS right_is_digits + FROM extracted + ), + resolved AS MATERIALIZED ( + SELECT + CASE + WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN 'toid_usrn' + WHEN relation_hint IN ('uprn_usrn', 'uprn->usrn', 'uprn_usrn_link') THEN 'uprn_usrn' + WHEN left_is_toid AND right_is_digits THEN 'toid_usrn' + WHEN right_is_toid AND left_is_digits THEN 'toid_usrn' + WHEN left_is_digits AND right_is_digits THEN 'uprn_usrn' + ELSE NULL + END AS relation_type, + CASE + WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN left_id + WHEN relation_hint IN ('uprn_usrn', 'uprn->usrn', 'uprn_usrn_link') THEN left_id + WHEN left_is_toid AND right_is_digits THEN left_id + WHEN right_is_toid AND left_is_digits THEN right_id + WHEN left_is_digits AND right_is_digits AND length(right_id) > 8 AND length(left_id) <= 8 THEN right_id + ELSE left_id + END AS id_1, + CASE + WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN right_id + WHEN relation_hint IN ('uprn_usrn', 'uprn->usrn', 'uprn_usrn_link') THEN right_id + WHEN left_is_toid AND right_is_digits THEN right_id + WHEN right_is_toid AND left_is_digits THEN left_id + WHEN left_is_digits AND right_is_digits AND length(right_id) > 8 AND length(left_id) <= 8 THEN left_id + ELSE right_id + END AS id_2 + FROM prepared + WHERE left_present AND right_present + ), + ins_toid AS ( + INSERT INTO stage.open_lids_toid_usrn ( + build_run_id, + toid, + usrn, + ingest_run_id + ) + SELECT + %s, + resolved.id_1, + resolved.id_2::bigint, + %s + FROM resolved + WHERE resolved.relation_type = 'toid_usrn' + AND resolved.id_2 ~ '^[0-9]+$' + ON CONFLICT (build_run_id, toid, usrn) + DO NOTHING + RETURNING 1 + ), + ins_uprn AS ( + INSERT INTO 
def _populate_stage_nsul(
    conn: psycopg.Connection,
    build_run_id: str,
    ingest_run_id: str,
    field_map: dict[str, str],
    required_fields: tuple[str, ...],
) -> int:
    """Stage distinct (UPRN, postcode) pairs from the raw NSUL snapshot.

    A single SQL statement extracts the UPRN and postcode from
    ``payload_jsonb`` (via the keys in ``field_map``), keeps only rows whose
    trimmed UPRN is all digits, normalises the postcode to upper-case
    alphanumerics, drops rows whose normalised postcode is NULL, and inserts
    the distinct pairs into ``stage.nsul_uprn_postcode``. Pairs already
    present for this build run are left untouched (``DO NOTHING``).

    Args:
        conn: Open psycopg connection.
        build_run_id: Build run the staged rows are written under.
        ingest_run_id: Ingest run whose raw rows are read.
        field_map: Logical field name -> raw payload key.
        required_fields: Passed through to ``_validated_raw_sample_row``.

    Returns:
        The cursor ``rowcount`` reported for the insert.
    """
    # NOTE(review): presumably raises when the raw snapshot lacks the mapped
    # required fields - confirm against _validated_raw_sample_row.
    _validated_raw_sample_row(
        conn,
        source_name="nsul",
        raw_table="raw.nsul_row",
        ingest_run_id=ingest_run_id,
        field_map=field_map,
        required_fields=required_fields,
    )

    jsonb_column = sql.SQL("r.payload_jsonb")
    statement = sql.SQL(
        """
        WITH extracted AS (
            SELECT
                btrim({uprn_expr}) AS uprn_text,
                btrim({postcode_expr}) AS postcode_text
            FROM raw.nsul_row AS r
            WHERE r.ingest_run_id = %s
        ),
        normalized AS (
            SELECT DISTINCT
                uprn_text::bigint AS uprn,
                NULLIF(
                    upper(regexp_replace(COALESCE(postcode_text, ''), '[^A-Za-z0-9]', '', 'g')),
                    ''
                ) AS postcode_norm
            FROM extracted
            WHERE uprn_text IS NOT NULL
              AND uprn_text <> ''
              AND uprn_text ~ '^[0-9]+$'
        )
        INSERT INTO stage.nsul_uprn_postcode (
            build_run_id,
            uprn,
            postcode_norm,
            ingest_run_id
        )
        SELECT
            %s,
            n.uprn,
            n.postcode_norm,
            %s
        FROM normalized AS n
        WHERE n.postcode_norm IS NOT NULL
        ON CONFLICT (build_run_id, uprn, postcode_norm)
        DO NOTHING
        """
    ).format(
        uprn_expr=_json_text_for_field(jsonb_column, field_map, "uprn"),
        postcode_expr=_json_text_for_field(jsonb_column, field_map, "postcode"),
    )
    # Placeholder order: raw-row filter, then the two constant INSERT columns.
    bind_values = (ingest_run_id, build_run_id, ingest_run_id)

    with conn.cursor() as cur:
        cur.execute(statement, bind_values)
        affected = cur.rowcount
    return int(affected)
payload.append( + ( + build_run_id, + str(feature_id_raw).strip(), + postcode_n, + str(street_raw).strip(), + folded, + ingest_run_id, + ) + ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) + + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted + + +def _populate_stage_dfi( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + insert_sql = sql.SQL( + """ + INSERT INTO stage.dfi_road_segment ( + build_run_id, + segment_id, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, segment_id) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) + + payload: list[tuple[Any, ...]] = [] + inserted = 0 + postcode_key = field_map.get("postcode") + for row in _iter_validated_raw_rows( + conn, + source_name="dfi_highway", + raw_table="raw.dfi_highway_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + segment_id_raw = _field_value(row, field_map, "segment_id") + street_raw = _field_value(row, field_map, "street_name") + if segment_id_raw in (None, "") or street_raw in (None, ""): + continue + + folded = street_casefold(str(street_raw)) + if folded is None: + continue + postcode_n = postcode_norm(str(row.get(postcode_key)) if postcode_key and row.get(postcode_key) not in (None, "") else None) + + payload.append( + ( + build_run_id, + str(segment_id_raw).strip(), + postcode_n, + str(street_raw).strip(), + folded, + ingest_run_id, + ) + ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) + + inserted += _flush_stage_batch(conn, insert_sql, payload) + 
def _populate_stage_ppd(
    conn: psycopg.Connection,
    build_run_id: str,
    ingest_run_id: str,
    field_map: dict[str, str],
    required_fields: tuple[str, ...],
) -> int:
    """Stage parsed Price Paid Data addresses for one ingest run.

    Raw rows are streamed from ``raw.ppd_row``. A row is staged only when its
    row hash, postcode and street are all non-blank and both the normalised
    postcode and the casefolded street are not ``None``. Accepted rows are
    upserted into ``stage.ppd_parsed_address`` in batches of
    ``STAGE_INSERT_BATCH_SIZE``.

    Args:
        conn: Open psycopg connection.
        build_run_id: Build run the staged rows are written under.
        ingest_run_id: Ingest run whose raw rows are read.
        field_map: Logical field name -> raw payload key.
        required_fields: Passed through to ``_iter_validated_raw_rows``.

    Returns:
        Sum of the values returned by ``_flush_stage_batch``.
    """
    upsert_statement = sql.SQL(
        """
        INSERT INTO stage.ppd_parsed_address (
            build_run_id,
            row_hash,
            postcode_norm,
            house_number,
            street_token_raw,
            street_token_casefolded,
            ingest_run_id
        ) VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (build_run_id, row_hash)
        DO UPDATE SET
            postcode_norm = EXCLUDED.postcode_norm,
            house_number = EXCLUDED.house_number,
            street_token_raw = EXCLUDED.street_token_raw,
            street_token_casefolded = EXCLUDED.street_token_casefolded,
            ingest_run_id = EXCLUDED.ingest_run_id
        """
    )

    blank = (None, "")
    pending: list[tuple[Any, ...]] = []
    staged_total = 0

    raw_rows = _iter_validated_raw_rows(
        conn,
        source_name="ppd",
        raw_table="raw.ppd_row",
        ingest_run_id=ingest_run_id,
        field_map=field_map,
        required_fields=required_fields,
    )
    for raw_row in raw_rows:
        row_hash_value, postcode_value, street_value, house_number_value = (
            _field_value(raw_row, field_map, logical_name)
            for logical_name in ("row_hash", "postcode", "street", "house_number")
        )

        # The house number is optional; the other three are mandatory.
        if any(value in blank for value in (row_hash_value, postcode_value, street_value)):
            continue

        normalised_postcode = postcode_norm(str(postcode_value))
        folded_street = street_casefold(str(street_value))
        if normalised_postcode is None or folded_street is None:
            continue

        house_number = None if house_number_value in blank else str(house_number_value).strip()
        pending.append(
            (
                build_run_id,
                str(row_hash_value).strip(),
                normalised_postcode,
                house_number,
                str(street_value).strip(),
                folded_street,
                ingest_run_id,
            )
        )

        # NOTE(review): _flush_stage_batch presumably empties `pending` in
        # place after writing it - confirm against the helper.
        if len(pending) >= STAGE_INSERT_BATCH_SIZE:
            staged_total += _flush_stage_batch(conn, upsert_statement, pending)

    # Write whatever is left over from the last partial batch.
    staged_total += _flush_stage_batch(conn, upsert_statement, pending)
    return staged_total
...]], +) -> dict[str, int]: + with conn.cursor() as cur: + # Pass 0b executes large sort/dedupe operations on raw snapshots. + # Raising work_mem here avoids repeated temp-file spill on default settings. + cur.execute("SET LOCAL work_mem = '256MB'") + + _stage_cleanup(conn, build_run_id) + schema_config = _schema_config() + + counts: dict[str, int] = {} + + if "onspd" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "onspd") + ingest_run_id = _single_source_run(source_runs, "onspd") + counts["stage.onspd_postcode"] = _populate_stage_onspd( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_usrn" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_usrn") + ingest_run_id = _single_source_run(source_runs, "os_open_usrn") + counts["stage.streets_usrn_input"] = _populate_stage_usrn( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_names" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_names") + ingest_run_id = _single_source_run(source_runs, "os_open_names") + counts["stage.open_names_road_feature"] = _populate_stage_open_names( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_roads" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_roads") + ingest_run_id = _single_source_run(source_runs, "os_open_roads") + counts["stage.open_roads_segment"] = _populate_stage_open_roads( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_uprn" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_uprn") + ingest_run_id = _single_source_run(source_runs, "os_open_uprn") + counts["stage.uprn_point"] = _populate_stage_open_uprn( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_lids" in source_runs: + 
def _clear_run_outputs(conn: psycopg.Connection, build_run_id: str) -> None:
    """Delete every output row previously produced for ``build_run_id``.

    Rows are removed from the internal/derived/core output tables (all keyed
    by ``produced_build_run_id``) and then from the ``meta.canonical_hash`` and
    ``meta.build_pass_checkpoint`` bookkeeping tables (keyed by
    ``build_run_id``).

    Args:
        conn: Open psycopg connection; the deletes run on a single cursor.
        build_run_id: Build run whose outputs are cleared.
    """
    # Deletion order is preserved from the original implementation.
    # NOTE(review): it looks like dependants are listed before the tables they
    # reference - confirm against the schema's foreign keys before reordering.
    run_scoped_tables = (
        ("internal", "unit_index"),
        ("derived", "postcode_streets_final_source"),
        ("derived", "postcode_streets_final_candidate"),
        ("derived", "postcode_street_candidate_lineage"),
        ("derived", "postcode_streets_final"),
        ("derived", "postcode_street_candidates"),
        ("core", "postcodes_meta"),
        ("core", "streets_usrn"),
        ("core", "postcodes"),
    )

    # Every table above uses the same run column; the earlier special case for
    # core.postcodes_meta assigned the identical name and has been removed.
    delete_template = sql.SQL("DELETE FROM {}.{} WHERE {} = %s")
    run_column = sql.Identifier("produced_build_run_id")

    with conn.cursor() as cur:
        for schema_name, table_name in run_scoped_tables:
            cur.execute(
                delete_template.format(
                    sql.Identifier(schema_name),
                    sql.Identifier(table_name),
                    run_column,
                ),
                (build_run_id,),
            )

        cur.execute("DELETE FROM meta.canonical_hash WHERE build_run_id = %s", (build_run_id,))
        cur.execute("DELETE FROM meta.build_pass_checkpoint WHERE build_run_id = %s", (build_run_id,))
def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]:
    """Populate ``core.streets_usrn`` for a build run.

    Two sources feed the table:

    1. *Direct* rows copied from ``stage.streets_usrn_input``.
    2. *Inferred* rows for USRNs that have no direct row. Open Names features
       are linked to USRNs through ``stage.open_lids_toid_usrn`` (by TOID); for
       each USRN the name with the highest evidence count wins, ties broken by
       casefolded name and then raw name in ``"C"`` collation. Inferred rows
       carry NULL ``street_class`` / ``street_status``.

    Both temp tables are dropped before being created, matching
    ``_pass_8_finalisation``, so the pass can be invoked again inside the same
    transaction without failing on "relation already exists".

    Args:
        conn: Open psycopg connection.
        build_run_id: Build run being produced.

    Returns:
        ``{"core.streets_usrn": direct + inferred row count}``.
    """
    params = {"build_run_id": build_run_id}

    with conn.cursor() as cur:
        # 1. Direct USRN rows.
        cur.execute(
            """
            INSERT INTO core.streets_usrn (
                produced_build_run_id,
                usrn,
                street_name,
                street_name_casefolded,
                street_class,
                street_status,
                usrn_run_id
            )
            SELECT
                %(build_run_id)s,
                s.usrn,
                s.street_name,
                s.street_name_casefolded,
                s.street_class,
                s.street_status,
                s.usrn_run_id
            FROM stage.streets_usrn_input AS s
            WHERE s.build_run_id = %(build_run_id)s
            ORDER BY s.usrn ASC
            """,
            params,
        )
        inserted_direct = int(cur.rowcount)

        # 2a. Per-TOID name counts from Open Names.
        cur.execute("DROP TABLE IF EXISTS pg_temp.tmp_open_names_toid_counts")
        cur.execute(
            """
            CREATE TEMP TABLE tmp_open_names_toid_counts
            ON COMMIT DROP AS
            SELECT
                n.toid,
                n.street_name_raw AS street_name,
                n.street_name_casefolded,
                COUNT(*)::bigint AS feature_count
            FROM stage.open_names_road_feature AS n
            WHERE n.build_run_id = %(build_run_id)s
              AND n.toid IS NOT NULL
            GROUP BY n.toid, n.street_name_raw, n.street_name_casefolded
            """,
            params,
        )
        cur.execute(
            """
            CREATE INDEX idx_tmp_open_names_toid_counts_toid
                ON tmp_open_names_toid_counts (toid)
            """
        )

        # 2b. Roll the TOID counts up to (USRN, name) evidence counts.
        cur.execute("DROP TABLE IF EXISTS pg_temp.tmp_inferred_name_counts")
        cur.execute(
            """
            CREATE TEMP TABLE tmp_inferred_name_counts
            ON COMMIT DROP AS
            SELECT
                lids.usrn,
                n.street_name,
                n.street_name_casefolded,
                SUM(n.feature_count)::bigint AS evidence_count,
                (ARRAY_AGG(lids.ingest_run_id ORDER BY lids.ingest_run_id::text ASC))[1] AS usrn_run_id
            FROM tmp_open_names_toid_counts AS n
            JOIN stage.open_lids_toid_usrn AS lids
              ON lids.build_run_id = %(build_run_id)s
             AND lids.toid = n.toid
            GROUP BY lids.usrn, n.street_name, n.street_name_casefolded
            """,
            params,
        )
        cur.execute(
            """
            CREATE INDEX idx_tmp_inferred_name_counts_usrn
                ON tmp_inferred_name_counts (
                    usrn,
                    evidence_count DESC,
                    street_name_casefolded,
                    street_name
                )
            """
        )

        # 2c. Best name per USRN, inserted only where no direct row exists.
        cur.execute(
            """
            WITH inferred_usrn AS (
                SELECT
                    usrn,
                    street_name,
                    street_name_casefolded,
                    NULL::text AS street_class,
                    NULL::text AS street_status,
                    usrn_run_id
                FROM (
                    SELECT
                        usrn,
                        street_name,
                        street_name_casefolded,
                        usrn_run_id,
                        ROW_NUMBER() OVER (
                            PARTITION BY usrn
                            ORDER BY evidence_count DESC,
                                     street_name_casefolded COLLATE "C" ASC,
                                     street_name COLLATE "C" ASC
                        ) AS rn
                    FROM tmp_inferred_name_counts
                ) AS ranked
                WHERE rn = 1
            )
            INSERT INTO core.streets_usrn (
                produced_build_run_id,
                usrn,
                street_name,
                street_name_casefolded,
                street_class,
                street_status,
                usrn_run_id
            )
            SELECT
                %(build_run_id)s,
                inferred.usrn,
                inferred.street_name,
                inferred.street_name_casefolded,
                inferred.street_class,
                inferred.street_status,
                inferred.usrn_run_id
            FROM inferred_usrn AS inferred
            WHERE NOT EXISTS (
                SELECT 1
                FROM core.streets_usrn AS direct
                WHERE direct.produced_build_run_id = %(build_run_id)s
                  AND direct.usrn = inferred.usrn
            )
            ORDER BY inferred.usrn ASC
            """,
            params,
        )
        inserted_inferred = int(cur.rowcount)

    _analyze_relations(conn, ("core.streets_usrn",))
    return {"core.streets_usrn": inserted_direct + inserted_inferred}
postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + p.postcode, + n.street_name_raw, + n.street_name_casefolded, + NULL, + 'names_postcode_feature', + 'medium', + 'open_names:feature:' || n.feature_id, + 'os_open_names', + n.ingest_run_id, + jsonb_build_object('feature_id', n.feature_id, 'toid', n.toid) + FROM stage.open_names_road_feature AS n + JOIN core.postcodes AS p + ON p.produced_build_run_id = %s + AND replace(p.postcode, ' ', '') = n.postcode_norm + WHERE n.build_run_id = %s + ORDER BY n.feature_id COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id), + ) + base_inserted = cur.rowcount + + promotions_inserted = 0 + lineage_inserted = 0 + + with conn.cursor() as cur: + cur.execute( + """ + SELECT + parent.candidate_id, + parent.postcode, + parent.street_name_raw, + parent.street_name_canonical, + parent.evidence_json ->> 'toid' AS toid, + lids.usrn, + lids.ingest_run_id + FROM derived.postcode_street_candidates AS parent + JOIN stage.open_lids_toid_usrn AS lids + ON lids.build_run_id = parent.produced_build_run_id + AND lids.toid = parent.evidence_json ->> 'toid' + WHERE parent.produced_build_run_id = %s + AND parent.candidate_type = 'names_postcode_feature' + AND parent.evidence_json ->> 'toid' IS NOT NULL + ORDER BY parent.candidate_id ASC, lids.usrn ASC + """, + (build_run_id,), + ) + promotion_rows = cur.fetchall() + + with conn.cursor() as cur: + for ( + parent_candidate_id, + postcode, + street_name_raw, + street_name_canonical, + toid, + usrn, + open_lids_run_id, + ) in promotion_rows: + cur.execute( + """ + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) VALUES (%s, %s, %s, %s, %s, 'open_lids_toid_usrn', 'high', %s, 
def _pass_4_uprn_reinforcement(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]:
    """Add high-confidence ``uprn_usrn`` candidates backed by UPRN counts.

    NSUL (UPRN -> postcode) is joined to Open LIDS (UPRN -> USRN) to count
    UPRNs per (postcode, USRN) pair. Each pair that resolves to a postcode in
    ``core.postcodes`` and a street in ``core.streets_usrn`` becomes one
    candidate row whose evidence records the UPRN count.

    The candidate's ``ingest_run_id`` is the ``os_open_lids`` ingest run of
    the build's bundle (the first one by text ordering if several exist).

    Args:
        conn: Open psycopg connection.
        build_run_id: Build run being produced.

    Returns:
        ``{"derived.postcode_street_candidates_uprn_usrn": inserted rows}``.

    Raises:
        BuildError: If the bundle has no ``os_open_lids`` ingest run.
    """
    lids_run_lookup = """
        SELECT bbs.ingest_run_id
        FROM meta.build_run AS br
        JOIN meta.build_bundle_source AS bbs
          ON bbs.bundle_id = br.bundle_id
        WHERE br.build_run_id = %s
          AND bbs.source_name = 'os_open_lids'
        ORDER BY bbs.ingest_run_id::text ASC
        LIMIT 1
    """
    candidate_insert = """
        WITH aggregate_pairs AS (
            SELECT
                nsul.postcode_norm,
                lids.usrn,
                COUNT(*)::bigint AS uprn_count
            FROM stage.nsul_uprn_postcode AS nsul
            JOIN stage.open_lids_uprn_usrn AS lids
              ON lids.build_run_id = nsul.build_run_id
             AND lids.uprn = nsul.uprn
            WHERE nsul.build_run_id = %s
            GROUP BY nsul.postcode_norm, lids.usrn
        )
        INSERT INTO derived.postcode_street_candidates (
            produced_build_run_id,
            postcode,
            street_name_raw,
            street_name_canonical,
            usrn,
            candidate_type,
            confidence,
            evidence_ref,
            source_name,
            ingest_run_id,
            evidence_json
        )
        SELECT
            %s,
            p.postcode,
            s.street_name,
            s.street_name_casefolded,
            a.usrn,
            'uprn_usrn',
            'high',
            'open_lids:uprn_usrn:' || a.uprn_count::text || '_uprns',
            'os_open_lids',
            %s,
            jsonb_build_object('uprn_count', a.uprn_count)
        FROM aggregate_pairs AS a
        JOIN stage.onspd_postcode AS sp
          ON sp.build_run_id = %s
         AND sp.postcode_norm = a.postcode_norm
        JOIN core.postcodes AS p
          ON p.produced_build_run_id = %s
         AND p.postcode = sp.postcode_display
        JOIN core.streets_usrn AS s
          ON s.produced_build_run_id = %s
         AND s.usrn = a.usrn
        ORDER BY p.postcode COLLATE "C" ASC, a.usrn ASC
    """

    with conn.cursor() as cur:
        # The aggregation below sorts/hashes large staged sets.
        cur.execute("SET LOCAL work_mem = '256MB'")

        cur.execute(lids_run_lookup, (build_run_id,))
        lookup_row = cur.fetchone()
        lids_run_id = None if lookup_row is None else lookup_row[0]
        if lids_run_id is None:
            raise BuildError(
                "Pass 4 failed: missing os_open_lids ingest run for build bundle "
                f"build_run_id={build_run_id}"
            )

        # Placeholder order follows the statement text: CTE filter,
        # produced_build_run_id, ingest_run_id, then the three join filters.
        cur.execute(
            candidate_insert,
            (
                build_run_id,
                build_run_id,
                lids_run_id,
                build_run_id,
                build_run_id,
                build_run_id,
            ),
        )
        inserted_rows = cur.rowcount

    return {"derived.postcode_street_candidates_uprn_usrn": int(inserted_rows)}
%s + AND r.postcode_norm = g.postcode_norm + ) + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + rs.postcode, + rs.road_name, + rs.road_name_casefolded, + rs.usrn, + 'spatial_os_open_roads', + 'low', + 'spatial:os_open_roads:' || rs.segment_id || ':fallback', + 'os_open_roads', + rs.ingest_run_id, + jsonb_build_object('segment_id', rs.segment_id) + FROM ranked_segments AS rs + WHERE rs.rn = 1 + ORDER BY rs.postcode COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id), + ) + inserted = cur.rowcount + + return {"derived.postcode_street_candidates_spatial_os_open_roads": int(inserted)} + + +def _pass_6_ni_candidates(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + p.postcode, + n.street_name_raw, + n.street_name_casefolded, + NULL, + 'osni_gazetteer_direct', + 'medium', + 'osni_gazetteer:feature:' || n.feature_id, + 'osni_gazetteer', + n.ingest_run_id, + jsonb_build_object('feature_id', n.feature_id) + FROM stage.osni_street_point AS n + JOIN core.postcodes AS p + ON p.produced_build_run_id = %s + AND replace(p.postcode, ' ', '') = n.postcode_norm + WHERE n.build_run_id = %s + AND p.subdivision_code = 'GB-NIR' + ORDER BY n.feature_id COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id), + ) + direct_inserted = cur.rowcount + + cur.execute( + """ + WITH ni_without_candidates AS ( + SELECT p.postcode, replace(p.postcode, ' ', '') AS postcode_norm + FROM core.postcodes AS p + WHERE p.produced_build_run_id = %s + AND 
def _pass_7_ppd_gap_fill(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]:
    """Add PPD-derived street candidates and populate ``internal.unit_index``.

    Staged PPD addresses are joined to ``core.postcodes`` on the space-stripped
    postcode and left-joined to ``core.streets_usrn`` on the casefolded street
    name. Rows with a USRN match become ``ppd_parse_matched`` / ``medium``
    candidates; rows without become ``ppd_parse_unmatched`` / ``low``.
    The same join then feeds ``internal.unit_index`` (one row per joined PPD
    address, blank house numbers stored as ``''``).

    Change rationale: the canonical street name of an unmatched row is now the
    staged ``street_token_casefolded`` value rather than
    ``upper(street_token_raw)``. Every other candidate pass writes the staged
    ``*_casefolded`` column as ``street_name_canonical``, and that column is
    also the join key against ``core.streets_usrn`` here, so using it keeps
    PPD candidates in the same canonical form as the rest.

    Args:
        conn: Open psycopg connection.
        build_run_id: Build run being produced.

    Returns:
        Row counts keyed ``derived.postcode_street_candidates_ppd`` and
        ``internal.unit_index``.
    """
    # Both statements bind the build run four times, in statement order:
    # postcode join, street join, stage filter, produced_build_run_id.
    run_params = (build_run_id, build_run_id, build_run_id, build_run_id)

    # NOTE(review): the street join is on name only, so a PPD row appears to
    # match every USRN in the build sharing that casefolded name (e.g. a common
    # street name) - confirm whether this fan-out is intended.
    with conn.cursor() as cur:
        cur.execute(
            """
            WITH matched AS (
                SELECT
                    c.postcode,
                    p.house_number,
                    p.street_token_raw,
                    p.street_token_casefolded,
                    p.ingest_run_id,
                    s.usrn,
                    s.street_name,
                    s.street_name_casefolded
                FROM stage.ppd_parsed_address AS p
                JOIN core.postcodes AS c
                  ON c.produced_build_run_id = %s
                 AND replace(c.postcode, ' ', '') = p.postcode_norm
                LEFT JOIN core.streets_usrn AS s
                  ON s.produced_build_run_id = %s
                 AND s.street_name_casefolded = p.street_token_casefolded
                WHERE p.build_run_id = %s
            )
            INSERT INTO derived.postcode_street_candidates (
                produced_build_run_id,
                postcode,
                street_name_raw,
                street_name_canonical,
                usrn,
                candidate_type,
                confidence,
                evidence_ref,
                source_name,
                ingest_run_id,
                evidence_json
            )
            SELECT
                %s,
                m.postcode,
                m.street_token_raw,
                COALESCE(m.street_name_casefolded, m.street_token_casefolded),
                m.usrn,
                CASE WHEN m.usrn IS NULL THEN 'ppd_parse_unmatched' ELSE 'ppd_parse_matched' END,
                CASE WHEN m.usrn IS NULL THEN 'low' ELSE 'medium' END,
                'ppd:row:' || md5(m.postcode || '|' || COALESCE(m.house_number, '') || '|' || m.street_token_raw),
                'ppd',
                m.ingest_run_id,
                jsonb_build_object('house_number', m.house_number)
            FROM matched AS m
            ORDER BY m.postcode COLLATE "C" ASC
            """,
            run_params,
        )
        candidate_inserted = cur.rowcount

        cur.execute(
            """
            WITH matched AS (
                SELECT
                    c.postcode,
                    p.house_number,
                    p.ingest_run_id,
                    s.usrn,
                    COALESCE(s.street_name, p.street_token_raw) AS street_name,
                    CASE WHEN s.usrn IS NULL THEN 'low' ELSE 'medium' END AS confidence,
                    CASE WHEN s.usrn IS NULL THEN 'ppd_parse_unmatched' ELSE 'ppd_parse_matched' END AS source_type
                FROM stage.ppd_parsed_address AS p
                JOIN core.postcodes AS c
                  ON c.produced_build_run_id = %s
                 AND replace(c.postcode, ' ', '') = p.postcode_norm
                LEFT JOIN core.streets_usrn AS s
                  ON s.produced_build_run_id = %s
                 AND s.street_name_casefolded = p.street_token_casefolded
                WHERE p.build_run_id = %s
            )
            INSERT INTO internal.unit_index (
                produced_build_run_id,
                postcode,
                house_number,
                street_name,
                usrn,
                confidence,
                source_type,
                ingest_run_id
            )
            SELECT
                %s,
                postcode,
                COALESCE(house_number, ''),
                street_name,
                usrn,
                confidence,
                source_type,
                ingest_run_id
            FROM matched
            ORDER BY postcode COLLATE "C" ASC
            """,
            run_params,
        )
        unit_index_inserted = cur.rowcount

    return {
        "derived.postcode_street_candidates_ppd": int(candidate_inserted),
        "internal.unit_index": int(unit_index_inserted),
    }
canonical_street_name, candidate_id) + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_tmp_weighted_candidates_source + ON tmp_weighted_candidates ( + postcode, + canonical_street_name, + source_name, + ingest_run_id, + candidate_type + ) + """ + ) + + cur.execute("DROP TABLE IF EXISTS pg_temp.tmp_final_scored") + cur.execute( + """ + CREATE TEMP TABLE tmp_final_scored AS + WITH grouped AS ( + SELECT + postcode, + canonical_street_name, + MIN(usrn) AS usrn, + SUM(weight) AS weighted_score, + MAX(conf_rank) AS conf_rank + FROM tmp_weighted_candidates + GROUP BY postcode, canonical_street_name + ), + totals AS ( + SELECT postcode, SUM(weighted_score) AS total_weight + FROM grouped + GROUP BY postcode + ), + scored AS ( + SELECT + g.postcode, + g.canonical_street_name, + g.usrn, + g.weighted_score, + g.conf_rank, + (g.weighted_score / t.total_weight) AS raw_probability + FROM grouped AS g + JOIN totals AS t + ON t.postcode = g.postcode + ), + rounded AS ( + SELECT + s.*, + ROUND(s.raw_probability::numeric, 4) AS rounded_probability, + ROW_NUMBER() OVER ( + PARTITION BY s.postcode + ORDER BY + s.raw_probability DESC, + s.conf_rank DESC, + s.canonical_street_name COLLATE "C" ASC, + s.usrn ASC NULLS LAST + ) AS rn, + SUM(ROUND(s.raw_probability::numeric, 4)) OVER ( + PARTITION BY s.postcode + ) AS rounded_sum + FROM scored AS s + ) + SELECT + postcode, + canonical_street_name, + usrn, + ROUND(weighted_score::numeric, 4) AS frequency_score, + CASE conf_rank + WHEN 3 THEN 'high' + WHEN 2 THEN 'medium' + WHEN 1 THEN 'low' + ELSE 'none' + END AS confidence, + CASE + WHEN rn = 1 + THEN ROUND((rounded_probability + (1.0000 - rounded_sum))::numeric, 4) + ELSE rounded_probability + END AS final_probability, + rn + FROM rounded + ORDER BY postcode COLLATE "C" ASC, rn ASC + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_tmp_final_scored_street + ON tmp_final_scored (postcode, canonical_street_name) + """ + ) + + cur.execute("DROP TABLE IF EXISTS 
pg_temp.tmp_final_inserted") + cur.execute( + """ + CREATE TEMP TABLE tmp_final_inserted ( + final_id bigint PRIMARY KEY, + postcode text NOT NULL, + canonical_street_name text NOT NULL + ) ON COMMIT DROP + """ + ) + cur.execute( + """ + WITH inserted AS ( + INSERT INTO derived.postcode_streets_final ( + produced_build_run_id, + postcode, + street_name, + usrn, + confidence, + frequency_score, + probability + ) + SELECT + %s, + fs.postcode, + fs.canonical_street_name, + fs.usrn, + fs.confidence, + fs.frequency_score, + fs.final_probability + FROM tmp_final_scored AS fs + ORDER BY fs.postcode COLLATE "C" ASC, fs.rn ASC + RETURNING final_id, postcode, street_name + ) + INSERT INTO tmp_final_inserted (final_id, postcode, canonical_street_name) + SELECT final_id, postcode, street_name + FROM inserted + """, + (build_run_id,), + ) + inserted_final = int(cur.rowcount) + + cur.execute( + """ + INSERT INTO derived.postcode_streets_final_candidate ( + final_id, + candidate_id, + produced_build_run_id, + link_rank + ) + SELECT + fi.final_id, + wc.candidate_id, + %s, + ROW_NUMBER() OVER ( + PARTITION BY fi.final_id + ORDER BY wc.candidate_id ASC + ) AS link_rank + FROM tmp_final_inserted AS fi + JOIN tmp_weighted_candidates AS wc + ON wc.postcode = fi.postcode + AND wc.canonical_street_name = fi.canonical_street_name + ORDER BY fi.final_id ASC, wc.candidate_id ASC + """, + (build_run_id,), + ) + inserted_final_candidate = int(cur.rowcount) + + cur.execute( + """ + INSERT INTO derived.postcode_streets_final_source ( + final_id, + source_name, + ingest_run_id, + candidate_type, + contribution_weight, + produced_build_run_id + ) + SELECT + fi.final_id, + wc.source_name, + wc.ingest_run_id, + wc.candidate_type, + ROUND(SUM(wc.weight)::numeric, 4) AS contribution_weight, + %s + FROM tmp_final_inserted AS fi + JOIN tmp_weighted_candidates AS wc + ON wc.postcode = fi.postcode + AND wc.canonical_street_name = fi.canonical_street_name + GROUP BY + fi.final_id, + wc.source_name, + 
wc.ingest_run_id, + wc.candidate_type + ORDER BY + fi.final_id ASC, + wc.source_name COLLATE "C" ASC, + wc.ingest_run_id::text ASC, + wc.candidate_type COLLATE "C" ASC + """, + (build_run_id,), + ) + inserted_final_source = int(cur.rowcount) + + cur.execute( + """ + UPDATE core.postcodes + SET multi_street = false + WHERE produced_build_run_id = %s + """, + (build_run_id,), + ) + cur.execute( + """ + WITH counts AS ( + SELECT postcode, COUNT(*) AS street_count + FROM derived.postcode_streets_final + WHERE produced_build_run_id = %s + GROUP BY postcode + ) + UPDATE core.postcodes AS p + SET multi_street = (c.street_count > 1) + FROM counts AS c + WHERE p.produced_build_run_id = %s + AND p.postcode = c.postcode + """, + (build_run_id, build_run_id), + ) + + projection_counts = _create_api_projection_tables(conn, build_run_id, dataset_version) + + return { + "derived.postcode_streets_final": inserted_final, + "derived.postcode_streets_final_candidate": inserted_final_candidate, + "derived.postcode_streets_final_source": inserted_final_source, + **projection_counts, + } + + +def _create_api_projection_tables( + conn: psycopg.Connection, + build_run_id: str, + dataset_version: str, +) -> dict[str, int]: + suffix = _safe_version_suffix(dataset_version) + street_table_name = f"postcode_street_lookup__{suffix}" + lookup_table_name = f"postcode_lookup__{suffix}" + + street_ident = sql.Identifier(street_table_name) + lookup_ident = sql.Identifier(lookup_table_name) + + with conn.cursor() as cur: + cur.execute(sql.SQL("DROP TABLE IF EXISTS api.{} CASCADE").format(street_ident)) + cur.execute( + sql.SQL( + """ + CREATE TABLE api.{} AS + SELECT + f.postcode, + f.street_name, + f.usrn, + f.confidence, + f.frequency_score, + f.probability, + %s::text AS dataset_version, + f.produced_build_run_id + FROM derived.postcode_streets_final AS f + WHERE f.produced_build_run_id = %s + ORDER BY + f.postcode COLLATE "C" ASC, + f.probability DESC, + f.street_name COLLATE "C" ASC, + f.usrn 
ASC NULLS LAST + """ + ).format(street_ident), + (dataset_version, build_run_id), + ) + + cur.execute(sql.SQL("DROP TABLE IF EXISTS api.{} CASCADE").format(lookup_ident)) + cur.execute( + sql.SQL( + """ + CREATE TABLE api.{} AS + WITH street_rows AS ( + SELECT + s.postcode, + jsonb_agg( + jsonb_build_object( + 'name', s.street_name, + 'confidence', s.confidence, + 'probability', s.probability, + 'usrn', s.usrn + ) + ORDER BY + s.probability DESC, + CASE s.confidence + WHEN 'high' THEN 3 + WHEN 'medium' THEN 2 + WHEN 'low' THEN 1 + ELSE 0 + END DESC, + s.street_name COLLATE "C" ASC, + s.usrn ASC NULLS LAST + ) AS streets_json + FROM api.{} AS s + GROUP BY s.postcode + ), + source_rows AS ( + SELECT + dedup.postcode, + array_agg(dedup.source_name ORDER BY dedup.source_name COLLATE "C") AS sources + FROM ( + SELECT DISTINCT + f.postcode, + fs.source_name + FROM derived.postcode_streets_final AS f + JOIN derived.postcode_streets_final_source AS fs + ON fs.final_id = f.final_id + WHERE f.produced_build_run_id = %s + ) AS dedup + GROUP BY dedup.postcode + ) + SELECT + p.postcode, + p.status, + p.country_iso2, + p.country_iso3, + p.subdivision_code, + p.post_town, + p.locality, + p.lat, + p.lon, + p.easting, + p.northing, + p.street_enrichment_available, + p.multi_street, + COALESCE(sr.streets_json, '[]'::jsonb) AS streets_json, + COALESCE(src.sources, ARRAY['onspd']::text[]) AS sources, + %s::text AS dataset_version, + p.produced_build_run_id + FROM core.postcodes AS p + LEFT JOIN street_rows AS sr + ON sr.postcode = p.postcode + LEFT JOIN source_rows AS src + ON src.postcode = p.postcode + WHERE p.produced_build_run_id = %s + ORDER BY p.postcode COLLATE "C" ASC + """ + ).format(lookup_ident, street_ident), + (build_run_id, dataset_version, build_run_id), + ) + + cur.execute(sql.SQL("SELECT COUNT(*) FROM api.{}").format(street_ident)) + street_count = int(cur.fetchone()[0]) + cur.execute(sql.SQL("SELECT COUNT(*) FROM api.{}").format(lookup_ident)) + lookup_count = 
int(cur.fetchone()[0]) + + return { + f"api.{street_table_name}": street_count, + f"api.{lookup_table_name}": lookup_count, + } + + +def _pass_handler( + pass_name: str, +): + handlers = { + "0a_raw_ingest": _pass_0a_raw_ingest, + "0b_stage_normalisation": _pass_0b_stage_normalisation, + "1_onspd_backbone": _pass_1_onspd_backbone, + "2_gb_canonical_streets": _pass_2_gb_canonical_streets, + "3_open_names_candidates": _pass_3_open_names_candidates, + "4_uprn_reinforcement": _pass_4_uprn_reinforcement, + "5_gb_spatial_fallback": _pass_5_gb_spatial_fallback, + "6_ni_candidates": _pass_6_ni_candidates, + "7_ppd_gap_fill": _pass_7_ppd_gap_fill, + "8_finalisation": _pass_8_finalisation, + } + return handlers[pass_name] + + +def run_build( + conn: psycopg.Connection, + bundle_id: str, + rebuild: bool, + resume: bool, +) -> BuildRunResult: + if rebuild and resume: + raise BuildError("--rebuild and --resume cannot be used together") + + build_profile, bundle_hash, _bundle_status, source_runs = _load_bundle(conn, bundle_id) + required = BUILD_PROFILES[build_profile] + missing = sorted(required - set(source_runs.keys())) + if missing: + raise BuildError( + f"Bundle {bundle_id} missing required sources: {', '.join(missing)}" + ) + for source_name in required: + run_ids = source_runs.get(source_name, ()) + if source_name == "ppd": + if len(run_ids) == 0: + raise BuildError("Bundle must include at least one ppd ingest run") + else: + if len(run_ids) != 1: + raise BuildError( + f"Bundle source {source_name} must include exactly one ingest run" + ) + + with conn.cursor() as cur: + # Stage tables are rebuildable; disabling per-transaction fsync waits reduces + # pass runtime without changing deterministic outputs. 
+ cur.execute("SET synchronous_commit TO off") + + if resume: + resumable = _latest_resumable_run(conn, bundle_id) + if resumable is None: + raise BuildError(f"No resumable run found for bundle {bundle_id}") + build_run_id, dataset_version = resumable + completed_passes = _load_completed_passes(conn, build_run_id) + else: + dataset_version = _dataset_version_from_bundle_hash(bundle_hash) + build_run_id = _create_build_run(conn, bundle_id, dataset_version) + completed_passes = set() + if rebuild: + _clear_run_outputs(conn, build_run_id) + conn.commit() + + try: + for pass_name in PASS_ORDER: + if pass_name in completed_passes: + continue + + _set_build_run_pass(conn, build_run_id, pass_name) + + handler = _pass_handler(pass_name) + if pass_name in {"0a_raw_ingest", "0b_stage_normalisation"}: + row_count_summary = handler(conn, build_run_id, source_runs) + elif pass_name == "8_finalisation": + row_count_summary = handler(conn, build_run_id, dataset_version) + else: + row_count_summary = handler(conn, build_run_id) + + _mark_pass_checkpoint(conn, build_run_id, pass_name, row_count_summary) + conn.commit() + + _mark_build_built(conn, bundle_id, build_run_id) + conn.commit() + return BuildRunResult( + build_run_id=build_run_id, + status="built", + dataset_version=dataset_version, + message="Build completed successfully", + ) + except Exception as exc: + conn.rollback() + try: + _mark_build_failed(conn, build_run_id, pass_name, str(exc)) + conn.commit() + except Exception: + conn.rollback() + raise + + +def _canonical_hash_query( + conn: psycopg.Connection, + query_sql: sql.SQL, + params: tuple[Any, ...] 
= (), +) -> tuple[int, str]: + digest = hashlib.sha256() + row_count = 0 + + cursor_name = f"canon_{uuid.uuid4().hex[:12]}" + with conn.cursor(name=cursor_name) as cur: + cur.execute(query_sql, params) + for row in cur: + row_count += 1 + normalized = [] + for value in row: + if isinstance(value, Decimal): + normalized.append(str(value)) + else: + normalized.append(value) + digest.update( + json.dumps(normalized, separators=(",", ":"), ensure_ascii=True, default=str).encode("utf-8") + ) + digest.update(b"\n") + + return row_count, digest.hexdigest() + + +def verify_build(conn: psycopg.Connection, build_run_id: str) -> VerifyResult: + with conn.cursor() as cur: + cur.execute( + """ + SELECT dataset_version, status + FROM meta.build_run + WHERE build_run_id = %s + """, + (build_run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Build run not found: {build_run_id}") + dataset_version, status = row + if status not in {"built", "published"}: + raise BuildError(f"Build run {build_run_id} must be built before verify (status={status})") + + with conn.cursor() as cur: + cur.execute( + """ + SELECT postcode, SUM(probability)::numeric(10,4) AS prob_sum + FROM derived.postcode_streets_final + WHERE produced_build_run_id = %s + GROUP BY postcode + HAVING SUM(probability)::numeric(10,4) <> 1.0000 + LIMIT 1 + """, + (build_run_id,), + ) + bad = cur.fetchone() + if bad is not None: + raise BuildError( + f"Probability sum check failed for postcode={bad[0]} sum={bad[1]}" + ) + + suffix = _safe_version_suffix(dataset_version) + street_table = f"api.postcode_street_lookup__{suffix}" + lookup_table = f"api.postcode_lookup__{suffix}" + + specs = [ + ( + "derived_postcode_streets_final", + sql.SQL( + """ + SELECT postcode, street_name, usrn, confidence, frequency_score, probability + FROM derived.postcode_streets_final + WHERE produced_build_run_id = %s + ORDER BY postcode COLLATE "C" ASC, street_name COLLATE "C" ASC, usrn ASC NULLS LAST + """ + ), + 
(build_run_id,), + ), + ( + "api_postcode_street_lookup", + sql.SQL( + """ + SELECT postcode, street_name, usrn, confidence, frequency_score, probability, dataset_version + FROM api.{} + ORDER BY postcode COLLATE "C" ASC, street_name COLLATE "C" ASC, usrn ASC NULLS LAST + """ + ).format(sql.Identifier(f"postcode_street_lookup__{suffix}")), + (), + ), + ( + "api_postcode_lookup", + sql.SQL( + """ + SELECT postcode, status, country_iso2, country_iso3, subdivision_code, + post_town, locality, lat, lon, easting, northing, + street_enrichment_available, multi_street, streets_json::text, + sources::text, dataset_version + FROM api.{} + ORDER BY postcode COLLATE "C" ASC + """ + ).format(sql.Identifier(f"postcode_lookup__{suffix}")), + (), + ), + ] + + object_hashes: dict[str, str] = {} + with conn.cursor() as cur: + cur.execute( + """ + SELECT to_regclass(%s), to_regclass(%s) + """, + (street_table, lookup_table), + ) + street_regclass, lookup_regclass = cur.fetchone() + if street_regclass is None or lookup_regclass is None: + raise BuildError( + f"API projection tables not found for dataset_version={dataset_version}; expected {street_table} and {lookup_table}" + ) + + with conn.cursor() as cur: + cur.execute("DELETE FROM meta.canonical_hash WHERE build_run_id = %s", (build_run_id,)) + + for object_name, query_sql, params in specs: + row_count, sha256_digest = _canonical_hash_query(conn, query_sql, params) + object_hashes[object_name] = sha256_digest + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO meta.canonical_hash ( + build_run_id, + object_name, + projection, + row_count, + sha256, + computed_at_utc + ) VALUES (%s, %s, %s, %s, %s, now()) + """, + ( + build_run_id, + object_name, + Jsonb({"ordering": "deterministic"}), + row_count, + sha256_digest, + ), + ) + + return VerifyResult(build_run_id=build_run_id, status="verified", object_hashes=object_hashes) + + +def publish_build(conn: psycopg.Connection, build_run_id: str, actor: str) -> PublishResult: + 
with conn.cursor() as cur: + cur.execute( + """ + SELECT bundle_id, dataset_version, status + FROM meta.build_run + WHERE build_run_id = %s + FOR UPDATE + """, + (build_run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Build run not found: {build_run_id}") + bundle_id, dataset_version, status = row + if status not in {"built", "published"}: + raise BuildError(f"Build run {build_run_id} must be built before publish (status={status})") + + suffix = _safe_version_suffix(dataset_version) + lookup_table_name = f"postcode_lookup__{suffix}" + street_lookup_table_name = f"postcode_street_lookup__{suffix}" + + with conn.cursor() as cur: + cur.execute("SELECT to_regclass(%s), to_regclass(%s)", ( + f"api.{lookup_table_name}", + f"api.{street_lookup_table_name}", + )) + lookup_regclass, street_regclass = cur.fetchone() + if lookup_regclass is None or street_regclass is None: + raise BuildError( + "Cannot publish: versioned api tables are missing for dataset_version=" + f"{dataset_version}" + ) + + with conn.cursor() as cur: + cur.execute( + sql.SQL("CREATE OR REPLACE VIEW api.postcode_lookup AS SELECT * FROM api.{}").format( + sql.Identifier(lookup_table_name) + ) + ) + cur.execute( + sql.SQL( + "CREATE OR REPLACE VIEW api.postcode_street_lookup AS SELECT * FROM api.{}" + ).format(sql.Identifier(street_lookup_table_name)) + ) + + cur.execute("SELECT txid_current()") + publish_txid = int(cur.fetchone()[0]) + + cur.execute( + """ + INSERT INTO meta.dataset_publication ( + dataset_version, + build_run_id, + published_at_utc, + published_by, + lookup_table_name, + street_lookup_table_name, + publish_txid + ) VALUES (%s, %s, now(), %s, %s, %s, %s) + ON CONFLICT (dataset_version) + DO UPDATE SET + build_run_id = EXCLUDED.build_run_id, + published_at_utc = EXCLUDED.published_at_utc, + published_by = EXCLUDED.published_by, + lookup_table_name = EXCLUDED.lookup_table_name, + street_lookup_table_name = EXCLUDED.street_lookup_table_name, + publish_txid = 
EXCLUDED.publish_txid + """, + ( + dataset_version, + build_run_id, + actor, + f"api.{lookup_table_name}", + f"api.{street_lookup_table_name}", + publish_txid, + ), + ) + + cur.execute( + """ + UPDATE meta.build_run + SET status = 'published', + current_pass = 'published', + finished_at_utc = COALESCE(finished_at_utc, now()) + WHERE build_run_id = %s + """, + (build_run_id,), + ) + + cur.execute( + """ + UPDATE meta.build_bundle + SET status = 'published' + WHERE bundle_id = %s + """, + (bundle_id,), + ) + + return PublishResult(build_run_id=build_run_id, dataset_version=dataset_version, status="published") diff --git a/pipeline/src/pipeline/cli.py b/pipeline/src/pipeline/cli.py new file mode 100644 index 0000000..631c34c --- /dev/null +++ b/pipeline/src/pipeline/cli.py @@ -0,0 +1,169 @@ +"""CLI entrypoint for Pipeline V3 lifecycle commands.""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +from pipeline.build.workflows import ( + BuildError, + create_build_bundle, + publish_build, + run_build, + verify_build, +) +from pipeline.config import default_dsn, migrations_dir +from pipeline.db.connection import connect +from pipeline.db.migrations import apply_migrations +from pipeline.ingest.workflows import IngestError, ingest_source +from pipeline.manifest import ManifestError, load_bundle_manifest, load_source_manifest + + +def _parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="pipeline") + parser.add_argument("--dsn", default=default_dsn(), help="PostgreSQL DSN") + + subparsers = parser.add_subparsers(dest="command", required=True) + + db_parser = subparsers.add_parser("db", help="Database operations") + db_subparsers = db_parser.add_subparsers(dest="db_command", required=True) + db_subparsers.add_parser("migrate", help="Apply SQL migrations") + + ingest_parser = subparsers.add_parser("ingest", help="Source ingest operations") + ingest_subparsers = 
ingest_parser.add_subparsers(dest="ingest_command", required=True) + source_parser = ingest_subparsers.add_parser("source", help="Ingest a source manifest") + source_parser.add_argument("--manifest", required=True, type=Path) + + bundle_parser = subparsers.add_parser("bundle", help="Bundle lifecycle") + bundle_subparsers = bundle_parser.add_subparsers(dest="bundle_command", required=True) + bundle_create_parser = bundle_subparsers.add_parser("create", help="Create build bundle") + bundle_create_parser.add_argument("--manifest", required=True, type=Path) + + build_parser = subparsers.add_parser("build", help="Build lifecycle") + build_subparsers = build_parser.add_subparsers(dest="build_command", required=True) + + build_run_parser = build_subparsers.add_parser("run", help="Run build passes") + build_run_parser.add_argument("--bundle-id", required=True) + build_run_parser.add_argument("--rebuild", action="store_true") + build_run_parser.add_argument("--resume", action="store_true") + + build_verify_parser = build_subparsers.add_parser("verify", help="Verify build outputs") + build_verify_parser.add_argument("--build-run-id", required=True) + + build_publish_parser = build_subparsers.add_parser("publish", help="Publish verified build") + build_publish_parser.add_argument("--build-run-id", required=True) + build_publish_parser.add_argument("--actor", required=True) + + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = _parser() + args = parser.parse_args(argv) + + try: + if args.command == "db" and args.db_command == "migrate": + applied = apply_migrations(args.dsn, migrations_dir()) + print(json.dumps({"status": "ok", "migrations_applied": applied})) + return 0 + + if args.command == "ingest" and args.ingest_command == "source": + manifest = load_source_manifest(args.manifest) + with connect(args.dsn) as conn: + result = ingest_source(conn, manifest) + conn.commit() + print( + json.dumps( + { + "status": result.status, + "source_name": 
result.source_name, + "ingest_run_id": result.run_id, + "files_loaded": result.files_loaded, + "rows_loaded": result.rows_loaded, + } + ) + ) + return 0 + + if args.command == "bundle" and args.bundle_command == "create": + manifest = load_bundle_manifest(args.manifest) + with connect(args.dsn) as conn: + result = create_build_bundle(conn, manifest) + conn.commit() + print( + json.dumps( + { + "status": result.status, + "bundle_id": result.bundle_id, + "bundle_hash": result.bundle_hash, + } + ) + ) + return 0 + + if args.command == "build" and args.build_command == "run": + with connect(args.dsn) as conn: + result = run_build( + conn, + bundle_id=args.bundle_id, + rebuild=args.rebuild, + resume=args.resume, + ) + conn.commit() + print( + json.dumps( + { + "status": result.status, + "build_run_id": result.build_run_id, + "dataset_version": result.dataset_version, + "message": result.message, + } + ) + ) + return 0 + + if args.command == "build" and args.build_command == "verify": + with connect(args.dsn) as conn: + result = verify_build(conn, build_run_id=args.build_run_id) + conn.commit() + print( + json.dumps( + { + "status": result.status, + "build_run_id": result.build_run_id, + "object_hashes": result.object_hashes, + } + ) + ) + return 0 + + if args.command == "build" and args.build_command == "publish": + with connect(args.dsn) as conn: + result = publish_build( + conn, + build_run_id=args.build_run_id, + actor=args.actor, + ) + conn.commit() + print( + json.dumps( + { + "status": result.status, + "build_run_id": result.build_run_id, + "dataset_version": result.dataset_version, + } + ) + ) + return 0 + + parser.print_help(sys.stderr) + return 2 + except (ManifestError, IngestError, BuildError, RuntimeError) as exc: + print(json.dumps({"status": "error", "error": str(exc)}), file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pipeline/src/pipeline/config.py b/pipeline/src/pipeline/config.py new file mode 100644 
index 0000000..e051aa3 --- /dev/null +++ b/pipeline/src/pipeline/config.py @@ -0,0 +1,33 @@ +"""Runtime configuration for Pipeline V3.""" + +from __future__ import annotations + +import os +from pathlib import Path + +PROBABILITY_SCALE = 4 +PROBABILITY_NUMERIC_TYPE = "numeric(6,4)" + + +def default_dsn() -> str: + return os.getenv("PIPELINE_DSN", "dbname=postcodes_v3") + + +def repo_root() -> Path: + return Path(__file__).resolve().parents[3] + + +def migrations_dir() -> Path: + return repo_root() / "pipeline" / "sql" / "migrations" + + +def source_schema_config_path() -> Path: + return repo_root() / "pipeline" / "config" / "source_schema.yaml" + + +def frequency_weights_config_path() -> Path: + return repo_root() / "pipeline" / "config" / "frequency_weights.yaml" + + +def normalisation_config_path() -> Path: + return repo_root() / "pipeline" / "config" / "normalisation.yaml" diff --git a/pipeline/src/pipeline/contracts/__init__.py b/pipeline/src/pipeline/contracts/__init__.py new file mode 100644 index 0000000..6373885 --- /dev/null +++ b/pipeline/src/pipeline/contracts/__init__.py @@ -0,0 +1 @@ +"""SQL contracts locked by the Phase 1 specification.""" diff --git a/pipeline/src/pipeline/contracts/open_roads.py b/pipeline/src/pipeline/contracts/open_roads.py new file mode 100644 index 0000000..3abc84b --- /dev/null +++ b/pipeline/src/pipeline/contracts/open_roads.py @@ -0,0 +1,62 @@ +"""Locked SQL contracts for Open Roads staging and build linkage.""" + +OPEN_ROADS_STAGE_TABLE = "stage.open_roads_segment" + +ALLOWED_CANONICAL_HASH_OBJECT_NAMES = ( + "core_uprn_postcode", + "core_uprn_point", + "core_road_segment", + "derived_uprn_street_spatial", +) + +ALLOWED_CANONICAL_HASH_OBJECT_NAMES_PHASE2 = ( + "core_uprn_postcode", + "core_uprn_point", + "core_road_segment", + "core_open_names_entry", + "core_postcode_unit_seed", + "derived_uprn_street_spatial", + "derived_postcode_street", +) + +OPEN_ROADS_LOADED_FEATURE_COUNT_SQL = ( + "SELECT COUNT(*) AS 
loaded_feature_count " + "FROM stage.open_roads_segment " + "WHERE release_id = %(release_id)s;" +) + +OPEN_ROADS_PERSIST_LOADED_FEATURE_COUNT_SQL = ( + "UPDATE meta.dataset_release " + "SET loaded_feature_count = (" + "SELECT COUNT(*) " + "FROM stage.open_roads_segment " + "WHERE release_id = %(release_id)s" + ") " + "WHERE dataset_key = 'open_roads' " + "AND release_id = %(release_id)s;" +) + +OPEN_ROADS_BUILD_LINKAGE_SQL = ( + "SELECT s.segment_id, s.name_display, s.name_norm, s.geom_bng " + "FROM stage.open_roads_segment AS s " + "WHERE s.release_id = %(open_roads_release_id)s " + "ORDER BY s.segment_id ASC;" +) + + +def loaded_feature_count_query() -> str: + """Return the locked gate query for loaded Open Roads features.""" + + return OPEN_ROADS_LOADED_FEATURE_COUNT_SQL + + +def persist_loaded_feature_count_query() -> str: + """Return the locked query that writes loaded feature counts into metadata.""" + + return OPEN_ROADS_PERSIST_LOADED_FEATURE_COUNT_SQL + + +def build_linkage_query() -> str: + """Return the locked stage-to-build linkage query filtered by release_id.""" + + return OPEN_ROADS_BUILD_LINKAGE_SQL diff --git a/pipeline/src/pipeline/contracts/voronoi.py b/pipeline/src/pipeline/contracts/voronoi.py new file mode 100644 index 0000000..36d811e --- /dev/null +++ b/pipeline/src/pipeline/contracts/voronoi.py @@ -0,0 +1,95 @@ +"""Locked SQL contracts for Phase 2 Voronoi clipping and enumeration.""" + +from __future__ import annotations + +from pipeline.config import VORONOI_HULL_BUFFER_M + +# Phase 2 lock: no inline literals for the buffer. Runtime must bind +# `hull_buffer_m` explicitly so the governing constant is always traceable. 
+VORONOI_CLIP_EXPR_SQL = ( + "ST_Buffer(ST_ConvexHull(ST_Collect(seed_geom_bng)), %(hull_buffer_m)s)" +) + +VORONOI_CLIP_GEOMETRY_SQL_TEMPLATE = """ +WITH seed_points AS ( + {seed_points_sql} +), +clip_geom AS ( + SELECT + ST_SetSRID({clip_expr}, 27700) AS gb_clip_geom + FROM seed_points +) +SELECT gb_clip_geom +FROM clip_geom; +""".strip() + +VORONOI_CELL_CTE_SQL_TEMPLATE = """ +WITH seed_points AS ( + {seed_points_sql} +), +clip_geom AS ( + SELECT + ST_SetSRID({clip_expr}, 27700) AS gb_clip_geom + FROM seed_points +), +cell_geoms AS ( + SELECT + (ST_Dump( + ST_VoronoiPolygons( + ST_Collect(seed_geom_bng), + 0.0, + (SELECT gb_clip_geom FROM clip_geom) + ) + )).geom AS cell_geom + FROM seed_points +) +""".strip() + +VORONOI_CELL_SQL_TEMPLATE = """ +{cell_cte_sql} +SELECT cell_geom +FROM cell_geoms; +""".strip() + + +def voronoi_sql_params(hull_buffer_m: float | None = None) -> dict[str, float]: + """Return the bound parameter dict for Voronoi clipping SQL.""" + + value = VORONOI_HULL_BUFFER_M if hull_buffer_m is None else float(hull_buffer_m) + if value <= 0: + raise ValueError("hull_buffer_m must be greater than zero") + return {"hull_buffer_m": value} + + +def render_voronoi_clip_geometry_sql(seed_points_sql: str) -> str: + """Render SQL that computes the clipped Voronoi boundary geometry.""" + + if not seed_points_sql.strip(): + raise ValueError("seed_points_sql must be non-empty") + return VORONOI_CLIP_GEOMETRY_SQL_TEMPLATE.format( + seed_points_sql=seed_points_sql.strip(), + clip_expr=VORONOI_CLIP_EXPR_SQL, + ) + + +def render_voronoi_cell_sql(seed_points_sql: str) -> str: + """Render SQL that computes clipped Voronoi cell polygons.""" + + if not seed_points_sql.strip(): + raise ValueError("seed_points_sql must be non-empty") + cell_cte_sql = VORONOI_CELL_CTE_SQL_TEMPLATE.format( + seed_points_sql=seed_points_sql.strip(), + clip_expr=VORONOI_CLIP_EXPR_SQL, + ) + return VORONOI_CELL_SQL_TEMPLATE.format(cell_cte_sql=cell_cte_sql) + + +def 
render_voronoi_cell_cte_sql(seed_points_sql: str) -> str: + """Render SQL CTE block for building clipped Voronoi cells.""" + + if not seed_points_sql.strip(): + raise ValueError("seed_points_sql must be non-empty") + return VORONOI_CELL_CTE_SQL_TEMPLATE.format( + seed_points_sql=seed_points_sql.strip(), + clip_expr=VORONOI_CLIP_EXPR_SQL, + ) diff --git a/pipeline/src/pipeline/db/__init__.py b/pipeline/src/pipeline/db/__init__.py new file mode 100644 index 0000000..3c4b3c0 --- /dev/null +++ b/pipeline/src/pipeline/db/__init__.py @@ -0,0 +1 @@ +"""Database helpers for the pipeline.""" diff --git a/pipeline/src/pipeline/db/connection.py b/pipeline/src/pipeline/db/connection.py new file mode 100644 index 0000000..1d46064 --- /dev/null +++ b/pipeline/src/pipeline/db/connection.py @@ -0,0 +1,17 @@ +"""Database connection helpers.""" + +from __future__ import annotations + +from contextlib import contextmanager +from typing import Iterator + +import psycopg + + +@contextmanager +def connect(dsn: str) -> Iterator[psycopg.Connection]: + conn = psycopg.connect(dsn) + try: + yield conn + finally: + conn.close() diff --git a/pipeline/src/pipeline/db/migrations.py b/pipeline/src/pipeline/db/migrations.py new file mode 100644 index 0000000..51abe2f --- /dev/null +++ b/pipeline/src/pipeline/db/migrations.py @@ -0,0 +1,69 @@ +"""Simple SQL migration runner for pipeline schemas.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, List + + +@dataclass(frozen=True) +class Migration: + version: str + path: Path + + +def discover_migrations(migrations_dir: Path) -> List[Migration]: + """Return sorted migration files based on numeric filename prefix.""" + + migrations: List[Migration] = [] + for path in sorted(migrations_dir.glob("*.sql")): + version = path.stem.split("_", 1)[0] + migrations.append(Migration(version=version, path=path)) + return migrations + + +def _read_sql(path: Path) -> str: + return 
def apply_migrations(dsn: str, migrations_dir: Path) -> int:
    """Apply unapplied migrations in filename order.

    Requires psycopg at runtime, but keeps import optional for environments
    where only static checks are needed.

    Args:
        dsn: Connection string passed to ``psycopg.connect``.
        migrations_dir: Directory handed to ``discover_migrations``.

    Returns:
        The number of migration files executed by this call.

    Raises:
        RuntimeError: If psycopg cannot be imported.
    """

    try:
        import psycopg  # type: ignore
    except ImportError as exc:  # pragma: no cover - import-path safety
        raise RuntimeError("psycopg is required to run migrations") from exc

    migrations = discover_migrations(migrations_dir)
    applied_count = 0

    # NOTE(review): there is no explicit commit here; presumably the psycopg
    # connection context manager commits on clean exit and rolls back on
    # error, making the whole batch a single transaction — confirm.
    with psycopg.connect(dsn) as conn:
        with conn.cursor() as cur:
            # Bootstrap the bookkeeping schema/table so a fresh database works.
            cur.execute("CREATE SCHEMA IF NOT EXISTS meta")
            cur.execute(
                """
                CREATE TABLE IF NOT EXISTS meta.schema_migration (
                    version text PRIMARY KEY,
                    applied_at timestamptz NOT NULL DEFAULT now()
                )
                """
            )
            # Snapshot of already-applied versions, read once before the loop.
            cur.execute("SELECT version FROM meta.schema_migration")
            applied_versions = {row[0] for row in cur.fetchall()}

            for migration in migrations:
                if migration.version in applied_versions:
                    continue
                # Run the whole file as one statement batch, then record it.
                cur.execute(_read_sql(migration.path))
                cur.execute(
                    "INSERT INTO meta.schema_migration (version) VALUES (%s)",
                    (migration.version,),
                )
                applied_count += 1

    return applied_count
psycopg.types.json import Jsonb + +from pipeline.manifest import SourceFileManifest, SourceIngestManifest +from pipeline.util.hashing import sha256_file + + +class IngestError(RuntimeError): + """Raised when source ingest fails.""" + + +@dataclass(frozen=True) +class IngestResult: + source_name: str + run_id: str + status: str + files_loaded: int + rows_loaded: int + + +RAW_TABLE_BY_SOURCE = { + "onspd": "raw.onspd_row", + "os_open_usrn": "raw.os_open_usrn_row", + "os_open_names": "raw.os_open_names_row", + "os_open_roads": "raw.os_open_roads_row", + "os_open_uprn": "raw.os_open_uprn_row", + "os_open_lids": "raw.os_open_lids_row", + "nsul": "raw.nsul_row", + "osni_gazetteer": "raw.osni_gazetteer_row", + "dfi_highway": "raw.dfi_highway_row", + "ppd": "raw.ppd_row", +} + + +CSV_INSERT_BATCH_SIZE = 5_000 + + +def _file_set_hash(files: tuple[SourceFileManifest, ...]) -> str: + payload = [ + { + "file_role": file.file_role, + "path": str(file.file_path), + "sha256": file.sha256, + "size_bytes": file.size_bytes, + "format": file.format, + "layer_name": file.layer_name, + } + for file in sorted(files, key=lambda item: (item.file_role, str(item.file_path), item.layer_name)) + ] + encoded = json.dumps(payload, ensure_ascii=True, separators=(",", ":")).encode("utf-8") + import hashlib + + return hashlib.sha256(encoded).hexdigest() + + +def _iter_rows_from_csv(path: Path) -> Iterator[dict[str, Any]]: + with path.open("r", encoding="utf-8-sig", newline="") as handle: + reader = csv.DictReader(handle) + if reader.fieldnames is None: + raise IngestError(f"CSV file is missing header row: {path}") + for row in reader: + yield {str(key): value for key, value in row.items()} + + +def _iter_rows_from_geojson(path: Path) -> Iterator[dict[str, Any]]: + payload = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise IngestError(f"GeoJSON root must be object: {path}") + + features = payload.get("features") + if not isinstance(features, list): + raise 
IngestError(f"GeoJSON features missing or invalid: {path}") + + for feature in features: + if not isinstance(feature, dict): + continue + props = feature.get("properties") + row: dict[str, Any] = {} + if isinstance(props, dict): + row.update({str(key): value for key, value in props.items()}) + geometry = feature.get("geometry") + row["__geometry"] = geometry + yield row + + +def _iter_rows_from_json(path: Path) -> Iterator[dict[str, Any]]: + payload = json.loads(path.read_text(encoding="utf-8")) + if isinstance(payload, list): + for item in payload: + if isinstance(item, dict): + yield {str(key): value for key, value in item.items()} + return + if isinstance(payload, dict): + yield {str(key): value for key, value in payload.items()} + return + raise IngestError(f"Unsupported JSON payload shape: {path}") + + +def _iter_rows_from_gpkg(path: Path, layer_name: str) -> Iterator[dict[str, Any]]: + if not layer_name: + raise IngestError(f"GeoPackage manifest must set layer_name: {path}") + + quoted_layer = '"' + layer_name.replace('"', '""') + '"' + conn = sqlite3.connect(f"file:{path}?mode=ro", uri=True) + try: + cur = conn.execute( + """ + SELECT 1 + FROM sqlite_master + WHERE type IN ('table', 'view') + AND name = ? + LIMIT 1 + """, + (layer_name,), + ) + if cur.fetchone() is None: + raise IngestError( + f"GeoPackage layer '{layer_name}' not found in {path}" + ) + + row_cur = conn.execute(f"SELECT * FROM {quoted_layer}") + col_names = [desc[0] for desc in row_cur.description] + for values in row_cur: + row: dict[str, Any] = {} + for index, column_name in enumerate(col_names): + value = values[index] + if isinstance(value, (bytes, bytearray, memoryview)): + # Keep raw binary columns JSON-safe while preserving source bytes. 
+ value = bytes(value).hex() + row[str(column_name)] = value + yield row + finally: + conn.close() + + +def _iter_rows(file_manifest: SourceFileManifest) -> Iterator[dict[str, Any]]: + file_format = file_manifest.format.lower() + if file_format == "csv": + return _iter_rows_from_csv(file_manifest.file_path) + if file_format in {"geojson", "json"}: + return _iter_rows_from_geojson(file_manifest.file_path) + if file_format == "json_array": + return _iter_rows_from_json(file_manifest.file_path) + if file_format in {"gpkg", "geopackage"}: + return _iter_rows_from_gpkg(file_manifest.file_path, file_manifest.layer_name) + raise IngestError(f"Unsupported file format '{file_manifest.format}' for {file_manifest.file_path}") + + +def _table_ident(qualified_table: str) -> tuple[sql.Identifier, sql.Identifier]: + schema_name, table_name = qualified_table.split(".", 1) + return sql.Identifier(schema_name), sql.Identifier(table_name) + + +def _insert_raw_rows( + conn: psycopg.Connection, + qualified_table: str, + ingest_run_id: str, + rows: Iterable[dict[str, Any]], +) -> int: + schema_ident, table_ident = _table_ident(qualified_table) + insert_sql = sql.SQL( + """ + INSERT INTO {}.{} ( + ingest_run_id, + source_row_num, + payload_jsonb + ) VALUES (%s, %s, %s) + """ + ).format(schema_ident, table_ident) + + total_loaded = 0 + batch: list[tuple[str, int, Jsonb]] = [] + with conn.cursor() as cur: + for row_num, row in enumerate(rows, start=1): + batch.append((ingest_run_id, row_num, Jsonb(row))) + if len(batch) >= CSV_INSERT_BATCH_SIZE: + cur.executemany(insert_sql, batch) + total_loaded += len(batch) + batch.clear() + if batch: + cur.executemany(insert_sql, batch) + total_loaded += len(batch) + batch.clear() + + return total_loaded + + +def _analyze_raw_table(conn: psycopg.Connection, qualified_table: str) -> None: + schema_ident, table_ident = _table_ident(qualified_table) + with conn.cursor() as cur: + cur.execute( + sql.SQL("ANALYZE {}.{}").format(schema_ident, table_ident), + 
def ingest_source(conn: psycopg.Connection, manifest: SourceIngestManifest) -> IngestResult:
    """Load every file of a source manifest into its raw table.

    Steps, in order:
      1. Hash the manifest's file set and return a ``"noop"`` result if a
         run with the same source name, version and file-set hash exists.
      2. Insert a ``meta.ingest_run`` row with ``record_count`` 0.
      3. For each file: verify sha256 and size against the manifest, insert
         its rows into the raw table, check the optional expected row count,
         and record the file in ``meta.ingest_run_file``.
      4. Update ``record_count`` with the total and ANALYZE the raw table.

    This function never commits or rolls back ``conn``.
    NOTE(review): a failure part-way through therefore leaves the run row
    and partial raw rows pending in the caller's transaction — confirm the
    caller rolls back on ``IngestError``.

    Args:
        conn: Open psycopg connection used for all reads and writes.
        manifest: Parsed source manifest describing the files to load.

    Returns:
        An ``IngestResult`` with status ``"noop"`` (nothing loaded, existing
        run id) or ``"ingested"`` (new run id, file and row totals).

    Raises:
        IngestError: On sha256, size or expected-row-count mismatch.
        KeyError: If ``manifest.source_name`` has no raw table mapping.
    """
    file_set_sha256 = _file_set_hash(manifest.files)
    existing = _existing_ingest_run(
        conn,
        source_name=manifest.source_name,
        source_version=manifest.source_version,
        file_set_sha256=file_set_sha256,
    )
    if existing is not None:
        # Same source/version/file set was already ingested: report, load nothing.
        return IngestResult(
            source_name=manifest.source_name,
            run_id=existing,
            status="noop",
            files_loaded=0,
            rows_loaded=0,
        )

    raw_table = RAW_TABLE_BY_SOURCE[manifest.source_name]

    run_id = str(uuid.uuid4())
    with conn.cursor() as cur:
        # record_count starts at 0 and is corrected once all files are loaded.
        cur.execute(
            """
            INSERT INTO meta.ingest_run (
                run_id,
                source_name,
                source_version,
                retrieved_at_utc,
                source_url,
                processing_git_sha,
                record_count,
                notes,
                file_set_sha256
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            """,
            (
                run_id,
                manifest.source_name,
                manifest.source_version,
                manifest.retrieved_at_utc,
                manifest.source_url,
                manifest.processing_git_sha,
                0,
                manifest.notes,
                file_set_sha256,
            ),
        )

    total_rows = 0
    for file_manifest in manifest.files:
        # Integrity checks run before any row of this file is inserted.
        actual_sha = sha256_file(file_manifest.file_path)
        if actual_sha != file_manifest.sha256:
            raise IngestError(
                "SHA256 mismatch for source file: "
                f"path={file_manifest.file_path} expected={file_manifest.sha256} actual={actual_sha}"
            )

        actual_size = file_manifest.file_path.stat().st_size
        if actual_size != file_manifest.size_bytes:
            raise IngestError(
                f"size_bytes mismatch for {file_manifest.file_path}: "
                f"expected={file_manifest.size_bytes} actual={actual_size}"
            )

        # NOTE(review): _insert_raw_rows numbers rows from 1 for each call, so
        # source_row_num restarts per file within one run — confirm the raw
        # table schema tolerates repeated (ingest_run_id, source_row_num).
        rows = _iter_rows(file_manifest)
        loaded_rows = _insert_raw_rows(conn, raw_table, run_id, rows)

        if file_manifest.row_count_expected is not None and loaded_rows != file_manifest.row_count_expected:
            raise IngestError(
                f"row_count_expected mismatch for {file_manifest.file_path}: "
                f"expected={file_manifest.row_count_expected} loaded={loaded_rows}"
            )

        total_rows += loaded_rows

        with conn.cursor() as cur:
            # Record the verified (not merely declared) hash, size and row count.
            cur.execute(
                """
                INSERT INTO meta.ingest_run_file (
                    ingest_run_id,
                    file_role,
                    filename,
                    layer_name,
                    sha256,
                    size_bytes,
                    row_count,
                    format
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """,
                (
                    run_id,
                    file_manifest.file_role,
                    str(file_manifest.file_path),
                    file_manifest.layer_name,
                    actual_sha,
                    actual_size,
                    loaded_rows,
                    file_manifest.format,
                ),
            )

    with conn.cursor() as cur:
        cur.execute(
            """
            UPDATE meta.ingest_run
            SET record_count = %s
            WHERE run_id = %s
            """,
            (total_rows, run_id),
        )

    # Refresh planner statistics after the bulk load.
    _analyze_raw_table(conn, raw_table)

    return IngestResult(
        source_name=manifest.source_name,
        run_id=run_id,
        status="ingested",
        files_loaded=len(manifest.files),
        rows_loaded=total_rows,
    )
{ + "gb_core": { + "onspd", + "os_open_usrn", + "os_open_names", + "os_open_roads", + "os_open_uprn", + "os_open_lids", + "nsul", + }, + "gb_core_ppd": { + "onspd", + "os_open_usrn", + "os_open_names", + "os_open_roads", + "os_open_uprn", + "os_open_lids", + "nsul", + "ppd", + }, + "core_ni": { + "onspd", + "os_open_usrn", + "os_open_names", + "os_open_roads", + "os_open_uprn", + "os_open_lids", + "nsul", + "osni_gazetteer", + "dfi_highway", + }, +} + +SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$") +GIT_SHA_RE = re.compile(r"^[0-9a-f]{40}$") + + +@dataclass(frozen=True) +class SourceFileManifest: + file_role: str + file_path: Path + sha256: str + size_bytes: int + format: str + layer_name: str + row_count_expected: int | None + + +@dataclass(frozen=True) +class SourceIngestManifest: + source_name: str + source_version: str + retrieved_at_utc: datetime + source_url: str | None + processing_git_sha: str + notes: str | None + files: tuple[SourceFileManifest, ...] + raw: dict[str, Any] + + +@dataclass(frozen=True) +class BuildBundleManifest: + build_profile: str + source_runs: dict[str, tuple[str, ...]] + raw: dict[str, Any] + + +def _load_json(path: Path) -> dict[str, Any]: + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise ManifestError(f"Invalid JSON manifest: {path}") from exc + if not isinstance(payload, dict): + raise ManifestError(f"Manifest root must be an object: {path}") + return payload + + +def _require_string(payload: dict[str, Any], key: str) -> str: + value = payload.get(key) + if not isinstance(value, str) or not value.strip(): + raise ManifestError(f"Manifest field '{key}' must be a non-empty string") + return value.strip() + + +def _parse_optional_string(payload: dict[str, Any], key: str) -> str | None: + value = payload.get(key) + if value is None: + return None + if not isinstance(value, str): + raise ManifestError(f"Manifest field '{key}' must be a string when present") + text = value.strip() 
def _parse_file_entry(entry: dict[str, Any]) -> SourceFileManifest:
    """Validate one ``files[]`` manifest entry and build its dataclass.

    The file path is user-expanded and resolved, and must point at an
    existing regular file. The sha256 is stored lower-cased. A missing
    ``layer_name`` becomes the empty string.

    Args:
        entry: Decoded JSON object for a single file.

    Returns:
        The validated ``SourceFileManifest``.

    Raises:
        ManifestError: If the entry is not an object, a required field is
            missing or malformed, or the file does not exist.
    """
    if not isinstance(entry, dict):
        raise ManifestError("Each files[] entry must be an object")

    file_role = _require_string(entry, "file_role")
    file_path_value = _require_string(entry, "file_path")
    file_path = Path(file_path_value).expanduser().resolve()
    if not file_path.exists() or not file_path.is_file():
        raise ManifestError(f"Manifest file_path does not exist: {file_path}")

    sha256 = _require_string(entry, "sha256")
    if SHA256_RE.match(sha256) is None:
        raise ManifestError("files[].sha256 must be 64 hex chars")

    size_bytes = entry.get("size_bytes")
    # bool is a subclass of int, so JSON true/false would otherwise be
    # accepted as 1/0; reject it explicitly.
    if isinstance(size_bytes, bool) or not isinstance(size_bytes, int) or size_bytes < 0:
        raise ManifestError("files[].size_bytes must be an integer >= 0")

    format_value = _require_string(entry, "format")
    layer_name = _parse_optional_string(entry, "layer_name") or ""

    row_count_expected = entry.get("row_count_expected")
    if row_count_expected is not None and (
        isinstance(row_count_expected, bool)
        or not isinstance(row_count_expected, int)
        or row_count_expected < 0
    ):
        raise ManifestError("files[].row_count_expected must be integer >= 0 when present")

    return SourceFileManifest(
        file_role=file_role,
        file_path=file_path,
        sha256=sha256.lower(),
        size_bytes=size_bytes,
        format=format_value,
        layer_name=layer_name,
        row_count_expected=row_count_expected,
    )
SourceIngestManifest: + payload = _load_json(path) + + source_name = _require_string(payload, "source_name") + if source_name not in SOURCE_NAMES: + raise ManifestError(f"Invalid source_name '{source_name}'") + + source_version = _require_string(payload, "source_version") + retrieved_raw = _require_string(payload, "retrieved_at_utc") + retrieved_at_utc = _parse_utc_datetime(retrieved_raw, "retrieved_at_utc") + + source_url = _parse_optional_string(payload, "source_url") + processing_git_sha = _require_string(payload, "processing_git_sha") + if GIT_SHA_RE.match(processing_git_sha) is None: + raise ManifestError("processing_git_sha must be 40 lowercase hex chars") + + notes = _parse_optional_string(payload, "notes") + + files_raw = payload.get("files") + if not isinstance(files_raw, list) or not files_raw: + raise ManifestError("Manifest files must be a non-empty array") + + files = tuple(_parse_file_entry(entry) for entry in files_raw) + + return SourceIngestManifest( + source_name=source_name, + source_version=source_version, + retrieved_at_utc=retrieved_at_utc, + source_url=source_url, + processing_git_sha=processing_git_sha, + notes=notes, + files=files, + raw=payload, + ) + + +def load_bundle_manifest(path: Path) -> BuildBundleManifest: + payload = _load_json(path) + + build_profile = _require_string(payload, "build_profile") + if build_profile not in BUILD_PROFILES: + raise ManifestError(f"Invalid build_profile '{build_profile}'") + + source_runs_raw = payload.get("source_runs") + if not isinstance(source_runs_raw, dict): + raise ManifestError("Bundle manifest source_runs must be an object") + + source_runs: dict[str, tuple[str, ...]] = {} + for source_name, run_ids_raw in source_runs_raw.items(): + if source_name not in SOURCE_NAMES: + raise ManifestError(f"Unknown source in source_runs: {source_name}") + run_ids: tuple[str, ...] 
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in binary mode in 1 MiB blocks, so memory use stays
    bounded regardless of file size.
    """
    hasher = hashlib.sha256()
    block_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()
def postcode_norm(value: str | None) -> str | None:
    """Return the postcode upper-cased with every non-alphanumeric removed.

    ``None`` in gives ``None`` out, as does any input with no ASCII letters
    or digits left after cleaning.
    """
    if value is None:
        return None
    compact = re.sub(r"[^A-Za-z0-9]", "", value)
    return compact.upper() or None


def postcode_display(value: str | None) -> str | None:
    """Return the normalised postcode with a space before its last 3 chars.

    Normalised values of three characters or fewer are returned unsplit;
    inputs that normalise to nothing give ``None``.
    """
    compact = postcode_norm(value)
    if compact is None or len(compact) <= 3:
        return compact
    outward, inward = compact[:-3], compact[-3:]
    return f"{outward} {inward}"
[alias_map.get(token, token) for token in text.split(" ")] + canonical = " ".join(tokens).strip() + return canonical or None + diff --git a/scripts/obtain_phase1_e2e_sources.sh b/scripts/obtain_phase1_e2e_sources.sh new file mode 100755 index 0000000..e68f889 --- /dev/null +++ b/scripts/obtain_phase1_e2e_sources.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +"$SCRIPT_DIR/obtain_phase2_e2e_sources.sh" diff --git a/scripts/obtain_phase2_e2e_sources.sh b/scripts/obtain_phase2_e2e_sources.sh new file mode 100755 index 0000000..766999f --- /dev/null +++ b/scripts/obtain_phase2_e2e_sources.sh @@ -0,0 +1,169 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)" +SOURCE_DIR="$ROOT_DIR/data/source_files/e2e" +MANIFEST_DIR="$ROOT_DIR/data/manifests/e2e" +RELEASE_ID="2026-Q1-E2E-P2" + +mkdir -p "$SOURCE_DIR" "$MANIFEST_DIR" + +cat > "$SOURCE_DIR/onsud_sample.csv" <<'CSV' +ONS_UPRN,ONS_POSTCODE,PC_UNIT_E,PC_UNIT_N,OA_CODE +1001,SW1A 2AA,530268.167,179640.532,E001 +1002,SW1A 2AB,530343.656,179675.849,E002 +1003,SW1A 2AC,530236.700,179784.382,E003 +1004,SW1A 2AD,530165.000,179894.000,E004 +CSV + +cat > "$SOURCE_DIR/open_uprn_sample.csv" <<'CSV' +UPRN_REF,LAT,LON,EASTING,NORTHING,UPRN_STATUS +1001,51.5007,-0.1246,530268.167,179640.532,ACTIVE +1002,51.5010,-0.1235,530343.656,179675.849,ACTIVE +1003,51.5020,-0.1250,530236.700,179784.382,ACTIVE +1005,51.5030,-0.1260,530165.000,179894.000,ACTIVE +CSV + +cat > "$SOURCE_DIR/open_roads_sample.geojson" <<'GEOJSON' +{ + "type": "FeatureCollection", + "name": "open_roads_sample", + "features": [ + { + "type": "Feature", + "properties": { + "SRC_ID": 10, + "ROAD_NAME": "Parliament Street" + }, + "geometry": { + "type": "LineString", + "coordinates": [ + [-0.1253, 51.5001], + [-0.1232, 51.5014] + ] + } + }, + { + "type": "Feature", + "properties": { + "SRC_ID": 20, + "ROAD_NAME": "Bridge Street" + }, + "geometry": { + "type": "LineString", + 
"coordinates": [ + [-0.1244, 51.5016], + [-0.1231, 51.5024] + ] + } + }, + { + "type": "Feature", + "properties": { + "SRC_ID": 30, + "ROAD_NAME": "A40" + }, + "geometry": { + "type": "LineString", + "coordinates": [ + [-0.1262, 51.5018], + [-0.1248, 51.5025] + ] + } + } + ] +} +GEOJSON + +cat > "$SOURCE_DIR/open_names_sample.csv" <<'CSV' +ON_ID,NAME1,NAME1_LANG,NAME2,LOCAL_TYPE,GEOM_X,GEOM_Y,PC_DISTRICT +E1001,Parliament Street,,,Road,530280.000,179650.000,SW1A +E1002,Bridge Street,,,Street,530330.000,179690.000,SW1A +E1003,Western Avenue,,,Road,530225.000,179790.000,SW1A +E1004,Charing Cross,,,PopulatedPlace,530200.000,179700.000,SW1A +CSV + +onsud_sha="$(shasum -a 256 "$SOURCE_DIR/onsud_sample.csv" | awk '{print $1}')" +open_uprn_sha="$(shasum -a 256 "$SOURCE_DIR/open_uprn_sample.csv" | awk '{print $1}')" +open_roads_sha="$(shasum -a 256 "$SOURCE_DIR/open_roads_sample.geojson" | awk '{print $1}')" +open_names_sha="$(shasum -a 256 "$SOURCE_DIR/open_names_sample.csv" | awk '{print $1}')" + +cat > "$MANIFEST_DIR/onsud_manifest.json" < "$MANIFEST_DIR/open_uprn_manifest.json" < "$MANIFEST_DIR/open_roads_manifest.json" < "$MANIFEST_DIR/open_names_manifest.json" </dev/null + +python -m pipeline.cli --dsn "$DSN" db migrate + +./scripts/obtain_phase2_e2e_sources.sh + +python -m pipeline.cli --dsn "$DSN" ingest onsud --manifest data/manifests/e2e/onsud_manifest.json +python -m pipeline.cli --dsn "$DSN" ingest open-uprn --manifest data/manifests/e2e/open_uprn_manifest.json +python -m pipeline.cli --dsn "$DSN" ingest open-roads --manifest data/manifests/e2e/open_roads_manifest.json +python -m pipeline.cli --dsn "$DSN" ingest open-names --manifest data/manifests/e2e/open_names_manifest.json + +release_json="$(python -m pipeline.cli --dsn "$DSN" release create \ + --onsud-release "$RELEASE_ID" \ + --open-uprn-release "$RELEASE_ID" \ + --open-roads-release "$RELEASE_ID" \ + --open-names-release "$RELEASE_ID")" +release_set_id="$(python - <<'PY' "$release_json" +import json 
+import sys +print(json.loads(sys.argv[1])["release_set_id"]) +PY +)" + +run_one="$(python -m pipeline.cli --dsn "$DSN" run phase2-open-names --release-set-id "$release_set_id" --rebuild)" +run_one_id="$(python - <<'PY' "$run_one" +import json +import sys +print(json.loads(sys.argv[1])["run_id"]) +PY +)" + +run_two="$(python -m pipeline.cli --dsn "$DSN" run phase2-open-names --release-set-id "$release_set_id" --rebuild)" +run_two_id="$(python - <<'PY' "$run_two" +import json +import sys +print(json.loads(sys.argv[1])["run_id"]) +PY +)" + +python -m pipeline.cli --dsn "$DSN" release activate --release-set-id "$release_set_id" --actor "e2e-script" --ack-warnings + +psql "$DSN" -v ON_ERROR_STOP=1 < Path: + handle = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".json", delete=False) + try: + json.dump(payload, handle) + handle.flush() + return Path(handle.name) + finally: + handle.close() + + def test_bundle_allows_multiple_ppd_runs(self) -> None: + payload = { + "build_profile": "gb_core_ppd", + "source_runs": { + "onspd": "11111111-1111-1111-1111-111111111111", + "os_open_usrn": "22222222-2222-2222-2222-222222222222", + "os_open_names": "33333333-3333-3333-3333-333333333333", + "os_open_roads": "44444444-4444-4444-4444-444444444444", + "os_open_uprn": "55555555-5555-5555-5555-555555555555", + "os_open_lids": "66666666-6666-6666-6666-666666666666", + "nsul": "77777777-7777-7777-7777-777777777777", + "ppd": [ + "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", + ], + }, + } + path = self._write_manifest(payload) + manifest = load_bundle_manifest(path) + self.assertEqual( + ( + "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", + ), + manifest.source_runs["ppd"], + ) + + def test_bundle_rejects_empty_source_run_list(self) -> None: + payload = { + "build_profile": "gb_core", + "source_runs": { + "onspd": [], + "os_open_usrn": "22222222-2222-2222-2222-222222222222", + "os_open_names": 
"33333333-3333-3333-3333-333333333333", + "os_open_roads": "44444444-4444-4444-4444-444444444444", + "os_open_uprn": "55555555-5555-5555-5555-555555555555", + "os_open_lids": "66666666-6666-6666-6666-666666666666", + "nsul": "77777777-7777-7777-7777-777777777777", + }, + } + path = self._write_manifest(payload) + with self.assertRaises(ManifestError): + load_bundle_manifest(path) + + def test_gb_core_ppd_does_not_require_ni_sources(self) -> None: + payload = { + "build_profile": "gb_core_ppd", + "source_runs": { + "onspd": "11111111-1111-1111-1111-111111111111", + "os_open_usrn": "22222222-2222-2222-2222-222222222222", + "os_open_names": "33333333-3333-3333-3333-333333333333", + "os_open_roads": "44444444-4444-4444-4444-444444444444", + "os_open_uprn": "55555555-5555-5555-5555-555555555555", + "os_open_lids": "66666666-6666-6666-6666-666666666666", + "nsul": "77777777-7777-7777-7777-777777777777", + "ppd": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + }, + } + path = self._write_manifest(payload) + manifest = load_bundle_manifest(path) + self.assertEqual(("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",), manifest.source_runs["ppd"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_candidate_immutability_db_contract.py b/tests/test_candidate_immutability_db_contract.py new file mode 100644 index 0000000..68ddc36 --- /dev/null +++ b/tests/test_candidate_immutability_db_contract.py @@ -0,0 +1,21 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +MIGRATION = ROOT / "pipeline" / "sql" / "migrations" / "0005_v3_cutover_foundation.sql" + + +class CandidateImmutabilityDbContractTests(unittest.TestCase): + def test_candidate_trigger_rejects_update_and_delete(self) -> None: + text = MIGRATION.read_text(encoding="utf-8") + self.assertIn("CREATE OR REPLACE FUNCTION derived.reject_candidate_mutation", text) + self.assertIn("append-only", text) + self.assertIn("CREATE TRIGGER trg_candidate_no_update", text) + 
self.assertIn("BEFORE UPDATE ON derived.postcode_street_candidates", text) + self.assertIn("CREATE TRIGGER trg_candidate_no_delete", text) + self.assertIn("BEFORE DELETE ON derived.postcode_street_candidates", text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_cli_v3_contract.py b/tests/test_cli_v3_contract.py new file mode 100644 index 0000000..fc88283 --- /dev/null +++ b/tests/test_cli_v3_contract.py @@ -0,0 +1,21 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +CLI = ROOT / "pipeline" / "src" / "pipeline" / "cli.py" + + +class CliV3ContractTests(unittest.TestCase): + def test_cli_has_v3_commands(self) -> None: + text = CLI.read_text(encoding="utf-8") + self.assertIn('add_parser("bundle"', text) + self.assertIn('add_parser("build"', text) + self.assertIn('add_parser("source"', text) + self.assertIn('add_parser("run"', text) + self.assertIn('add_parser("verify"', text) + self.assertIn('add_parser("publish"', text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_docs_v3_contract.py b/tests/test_docs_v3_contract.py new file mode 100644 index 0000000..d713273 --- /dev/null +++ b/tests/test_docs_v3_contract.py @@ -0,0 +1,20 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +SPEC = ROOT / "docs" / "spec" / "pipeline_v3" / "spec.md" + + +class DocsV3ContractTests(unittest.TestCase): + def test_spec_locks_append_only_promotion_and_exact_probability(self) -> None: + text = SPEC.read_text(encoding="utf-8") + self.assertIn("Pass 3 Promotion Semantics (Append-Only)", text) + self.assertIn("immutable evidence rows", text) + self.assertIn("Probability Formula (Exact)", text) + self.assertIn("probability(postcode, street) = weighted_score(postcode, street) / total_weight(postcode)", text) + self.assertNotIn("~1.0", text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_onspd_optional_admin_fields_contract.py 
b/tests/test_onspd_optional_admin_fields_contract.py new file mode 100644 index 0000000..ced71db --- /dev/null +++ b/tests/test_onspd_optional_admin_fields_contract.py @@ -0,0 +1,27 @@ +import json +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +SOURCE_SCHEMA = ROOT / "pipeline" / "config" / "source_schema.yaml" +WORKFLOWS = ROOT / "pipeline" / "src" / "pipeline" / "build" / "workflows.py" + + +class OnspdOptionalAdminFieldsContractTests(unittest.TestCase): + def test_onspd_field_map_includes_post_town_and_locality(self) -> None: + payload = json.loads(SOURCE_SCHEMA.read_text(encoding="utf-8")) + field_map = payload["sources"]["onspd"]["field_map"] + self.assertIn("post_town", field_map) + self.assertIn("locality", field_map) + self.assertTrue(str(field_map["post_town"]).strip()) + self.assertTrue(str(field_map["locality"]).strip()) + + def test_stage_loader_resolves_post_town_and_locality_via_field_candidates(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn('post_town_raw = _field_value(row, field_map, "post_town")', text) + self.assertIn('locality_raw = _field_value(row, field_map, "locality")', text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_pass3_append_only_promotion.py b/tests/test_pass3_append_only_promotion.py new file mode 100644 index 0000000..90236f6 --- /dev/null +++ b/tests/test_pass3_append_only_promotion.py @@ -0,0 +1,24 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +WORKFLOWS = ROOT / "pipeline" / "src" / "pipeline" / "build" / "workflows.py" + + +class Pass3AppendOnlyPromotionTests(unittest.TestCase): + def test_pass3_inserts_promoted_rows_and_lineage(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn("def _pass_3_open_names_candidates", text) + self.assertIn("INSERT INTO derived.postcode_street_candidates", text) + self.assertIn("'open_lids_toid_usrn'", text) + 
class ProbabilityExactNormalizationTests(unittest.TestCase):
    """Textual contract checks on the probability SQL inside ``workflows.py``.

    Every test reads the workflows source from the module-level ``WORKFLOWS``
    path and asserts that a specific SQL fragment appears in it verbatim.
    """

    def test_probability_uses_explicit_denominator(self) -> None:
        """The raw probability must be written as score divided by total weight."""
        expected_fragment = "(g.weighted_score / t.total_weight) AS raw_probability"
        workflow_source = WORKFLOWS.read_text(encoding="utf-8")
        self.assertIn(expected_fragment, workflow_source)

    def test_probability_residual_correction_applied_to_rank_one(self) -> None:
        """The rank-one branch and the residual expression must both be present."""
        workflow_source = WORKFLOWS.read_text(encoding="utf-8")
        for expected_fragment in ("WHEN rn = 1", "(1.0000 - rounded_sum)"):
            self.assertIn(expected_fragment, workflow_source)

    def test_verify_requires_exact_sum_one(self) -> None:
        """The verification query must reject sums that differ from 1.0000."""
        expected_fragment = "HAVING SUM(probability)::numeric(10,4) <> 1.0000"
        workflow_source = WORKFLOWS.read_text(encoding="utf-8")
        self.assertIn(expected_fragment, workflow_source)
def _candidate_types_from_workflows() -> tuple[str, ...]:
    """Return the ``CANDIDATE_TYPES`` tuple declared in ``workflows.py``.

    The workflows module is parsed with ``ast`` rather than imported, so the
    constant is read without executing the module. Only top-level statements
    are inspected. Both plain assignments (``CANDIDATE_TYPES = (...)``) and
    annotated assignments (``CANDIDATE_TYPES: tuple[str, ...] = (...)``) are
    recognised.

    Returns:
        The tuple's items, each converted with ``str``.

    Raises:
        AssertionError: if no top-level assignment binds ``CANDIDATE_TYPES``
            to a tuple literal.
        ValueError: propagated from ``ast.literal_eval`` when the assigned
            expression is not a literal.
    """
    text = WORKFLOWS.read_text(encoding="utf-8")
    tree = ast.parse(text)
    for node in tree.body:
        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, ast.AnnAssign) and node.value is not None:
            # An annotated assignment has exactly one target; a bare
            # annotation without a value carries nothing to evaluate.
            targets = [node.target]
        else:
            continue
        for target in targets:
            if isinstance(target, ast.Name) and target.id == "CANDIDATE_TYPES":
                value = ast.literal_eval(node.value)
                if isinstance(value, tuple):
                    return tuple(str(item) for item in value)
    raise AssertionError("CANDIDATE_TYPES constant not found in workflows.py")