From e405af7fa5cb176d5cba0b38e99ede3e262afe58 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Fri, 20 Feb 2026 23:27:22 +0000 Subject: [PATCH 01/17] docs(spec): add source acquisition and phase documentation updates --- docs/spec/data_sources.md | 345 ++++++++++++ docs/spec/phase_1/changes.md | 87 +++ docs/spec/phase_1/name_norm.md | 26 + docs/spec/phase_1/spec.md | 502 ++++++++++-------- docs/spec/phase_2-open-names/changes.md | 97 ++++ docs/spec/phase_2-open-names/prd.md | 403 +++----------- docs/spec/phase_2-open-names/spec.md | 269 ++++++++++ .../spec/phase_2-open-names/voronoi_method.md | 33 ++ 8 files changed, 1189 insertions(+), 573 deletions(-) create mode 100644 docs/spec/data_sources.md create mode 100644 docs/spec/phase_1/changes.md create mode 100644 docs/spec/phase_1/name_norm.md create mode 100644 docs/spec/phase_2-open-names/changes.md create mode 100644 docs/spec/phase_2-open-names/spec.md create mode 100644 docs/spec/phase_2-open-names/voronoi_method.md diff --git a/docs/spec/data_sources.md b/docs/spec/data_sources.md new file mode 100644 index 0000000..2a7d1f0 --- /dev/null +++ b/docs/spec/data_sources.md @@ -0,0 +1,345 @@ +# Data Sources + +This document is the authoritative procedure for obtaining the latest source files for pipeline builds (Phase 1 + Phase 2): + +- ONSUD +- OS Open UPRN +- OS Open Roads +- OS Open Names + +Rules: + +- Do not guess dataset structure. +- Persist release metadata and checksums in manifests. +- Verify hashes before ingest. +- If a source does not provide an official release identifier, record that as `Unknown` and derive a deterministic local release token from retrieval date + published hash. + +## 0. 
Source Registry + +| Dataset | Official discovery endpoint | Download endpoint type | Licence handling | Update frequency | +|---|---|---|---|---| +| ONSUD | `https://geoportal.statistics.gov.uk/api/search/v1/collections/dataset/items?q=ONSUD_LATEST&limit=20` | ArcGIS item `/data` download | Read from ArcGIS item `licenseInfo` and linked ONS licence page | Published as periodic ONSUD releases (item description states ~6-week cadence) | +| OS Open UPRN | `https://api.os.uk/downloads/v1/products/OpenUPRN/downloads` | OS Downloads API artifact URL (`redirect`) | Open OS licence terms from product metadata/docs | Official cadence is not enforced in pipeline; derive from published artifact metadata | +| OS Open Roads | `https://api.os.uk/downloads/v1/products/OpenRoads/downloads` | OS Downloads API artifact URL (`redirect`) | Open OS licence terms from product metadata/docs | Official cadence is not enforced in pipeline; derive from published artifact metadata | +| OS Open Names | `https://api.os.uk/downloads/v1/products/OpenNames/downloads` | OS Downloads API artifact URL (`redirect`) | Open OS licence terms from product metadata/docs | Approximately six-monthly; derive concrete release from artifact metadata | + +If licence text or cadence cannot be confirmed at ingest time, record `Unknown` and retain the raw metadata response used for the run. + +## 1. Prerequisites + +Required tools: + +- `curl` +- `python3` +- `shasum` +- `unzip` +- `ogrinfo` (Open Roads inspection) + +Suggested directories: + +```bash +mkdir -p data/source_files/real data/manifests/real +``` + +## 2. 
ONSUD (Latest) + +### 2.1 Discover latest ONSUD item + +Use the Open Geography Portal search API: + +```bash +curl -s 'https://geoportal.statistics.gov.uk/api/search/v1/collections/dataset/items?q=ONSUD_LATEST&limit=20' > /tmp/onsud_search.json +python3 - <<'PY' +import json +from pathlib import Path + +payload = json.loads(Path('/tmp/onsud_search.json').read_text()) +for feature in payload.get('features', []): + p = feature.get('properties', {}) + if p.get('title') == 'ONSUD_LATEST': + print(feature['id']) + break +else: + raise SystemExit('Could not find ONSUD_LATEST item id') +PY +``` + +Store the printed item id as `ONSUD_ITEM_ID`. + +### 2.2 Retrieve metadata and download file + +```bash +ONSUD_ITEM_ID='' +curl -s "https://www.arcgis.com/sharing/rest/content/items/${ONSUD_ITEM_ID}?f=json" > /tmp/onsud_item.json +python3 - <<'PY' +import json +from pathlib import Path +item = json.loads(Path('/tmp/onsud_item.json').read_text()) +print('name=', item.get('name')) +print('size=', item.get('size')) +print('modified=', item.get('modified')) +PY + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/ONSUD_LATEST.zip \ + "https://www.arcgis.com/sharing/rest/content/items/${ONSUD_ITEM_ID}/data" +``` + +### 2.3 Verify and unpack + +```bash +shasum -a 256 data/source_files/real/ONSUD_LATEST.zip +unzip -l data/source_files/real/ONSUD_LATEST.zip | head -n 50 +unzip -o data/source_files/real/ONSUD_LATEST.zip -d data/source_files/real/onsud +find data/source_files/real/onsud -type f -name '*.csv' | sort +``` + +Required ONSUD manifest mappings (explicit, no guessing): + +- `uprn` +- `postcode` +- `postcode_unit_easting` +- `postcode_unit_northing` + +## 3. 
OS Open UPRN (Latest CSV) + +### 3.1 Discover latest downloadable CSV artifact + +```bash +curl -s 'https://api.os.uk/downloads/v1/products/OpenUPRN/downloads' > /tmp/open_uprn_downloads.json +python3 - <<'PY' +import json +from pathlib import Path + +items = json.loads(Path('/tmp/open_uprn_downloads.json').read_text()) +match = next( + i for i in items + if i.get('area') == 'GB' and i.get('format') == 'CSV' +) +print('url=', match['url']) +print('fileName=', match['fileName']) +print('md5=', match['md5']) +print('size=', match['size']) +PY +``` + +### 3.2 Download and verify + +```bash +OPEN_UPRN_URL="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_uprn_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'CSV') +print(match['url']) +PY +)" + +OPEN_UPRN_MD5="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_uprn_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'CSV') +print(match['md5']) +PY +)" + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/open_uprn_latest_csv.zip \ + "${OPEN_UPRN_URL}" + +md5 data/source_files/real/open_uprn_latest_csv.zip +echo "Expected md5: ${OPEN_UPRN_MD5}" +``` + +### 3.3 Unpack and inspect columns + +```bash +unzip -o data/source_files/real/open_uprn_latest_csv.zip -d data/source_files/real/open_uprn +OPEN_UPRN_CSV="$(find data/source_files/real/open_uprn -type f -name '*.csv' | head -n 1)" +echo "${OPEN_UPRN_CSV}" +head -n 1 "${OPEN_UPRN_CSV}" +``` + +Map these required fields explicitly in manifest: + +- `uprn` +- `latitude` +- `longitude` +- `easting` +- `northing` + +## 4. 
OS Open Roads (Latest GeoPackage) + +### 4.1 Discover latest GeoPackage artifact + +```bash +curl -s 'https://api.os.uk/downloads/v1/products/OpenRoads/downloads' > /tmp/open_roads_downloads.json +python3 - <<'PY' +import json +from pathlib import Path + +items = json.loads(Path('/tmp/open_roads_downloads.json').read_text()) +match = next( + i for i in items + if i.get('area') == 'GB' and i.get('format') == 'GeoPackage' +) +print('url=', match['url']) +print('fileName=', match['fileName']) +print('md5=', match['md5']) +print('size=', match['size']) +PY +``` + +### 4.2 Download and verify + +```bash +OPEN_ROADS_URL="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_roads_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'GeoPackage') +print(match['url']) +PY +)" + +OPEN_ROADS_MD5="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_roads_downloads.json').read_text()) +match = next(i for i in items if i.get('area') == 'GB' and i.get('format') == 'GeoPackage') +print(match['md5']) +PY +)" + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/open_roads_latest_gpkg.zip \ + "${OPEN_ROADS_URL}" + +md5 data/source_files/real/open_roads_latest_gpkg.zip +echo "Expected md5: ${OPEN_ROADS_MD5}" +``` + +### 4.3 Unpack and inspect layer/fields + +```bash +unzip -o data/source_files/real/open_roads_latest_gpkg.zip -d data/source_files/real/open_roads +OPEN_ROADS_GPKG="$(find data/source_files/real/open_roads -type f -name '*.gpkg' | head -n 1)" +echo "${OPEN_ROADS_GPKG}" +ogrinfo "${OPEN_ROADS_GPKG}" +``` + +Select the layer used for named road segments and inspect schema: + +```bash +ogrinfo -so "${OPEN_ROADS_GPKG}" '' +``` + +Map these required fields explicitly in manifest: + +- `source_id` (stable source identifier column) +- `name_display` (road name display column) + +Do not assume field names without inspecting the 
exact release file. + +## 5. OS Open Names (Latest CSV) + +### 5.1 Discover latest CSV artifact + +```bash +curl -s 'https://api.os.uk/downloads/v1/products/OpenNames/downloads' > /tmp/open_names_downloads.json +python3 - <<'PY' +import json +from pathlib import Path + +items = json.loads(Path('/tmp/open_names_downloads.json').read_text()) +match = next(i for i in items if i.get('format') == 'CSV') +print('url=', match['url']) +print('fileName=', match['fileName']) +print('md5=', match['md5']) +print('size=', match['size']) +PY +``` + +### 5.2 Download and verify + +```bash +OPEN_NAMES_URL="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_names_downloads.json').read_text()) +match = next(i for i in items if i.get('format') == 'CSV') +print(match['url']) +PY +)" + +OPEN_NAMES_MD5="$(python3 - <<'PY' +import json +from pathlib import Path +items = json.loads(Path('/tmp/open_names_downloads.json').read_text()) +match = next(i for i in items if i.get('format') == 'CSV') +print(match['md5']) +PY +)" + +curl -fL --retry 3 --retry-delay 2 \ + -o data/source_files/real/open_names_latest_csv.zip \ + "${OPEN_NAMES_URL}" + +md5 data/source_files/real/open_names_latest_csv.zip +echo "Expected md5: ${OPEN_NAMES_MD5}" +``` + +### 5.3 Unpack and inspect columns + +```bash +unzip -o data/source_files/real/open_names_latest_csv.zip -d data/source_files/real/open_names +OPEN_NAMES_CSV="$(find data/source_files/real/open_names -type f -name '*.csv' | head -n 1)" +echo "${OPEN_NAMES_CSV}" +head -n 1 "${OPEN_NAMES_CSV}" +``` + +Required Open Names manifest mappings: + +- `entry_id` +- `name1` +- `name1_lang` +- `name2` +- `local_type` +- `geometry_x` +- `geometry_y` +- `postcode_district` + +Ingest supports CSV only for Open Names. + +## 6. Release ID Rules + +Use official release identifiers where provided in source metadata. + +If not provided: + +- Record official identifier as `Unknown` in notes. 
+- Use deterministic local `release_id` for manifests: + - `-unknown--` +- Example: + - `open_roads-unknown-20260220-ebbaaaff` + +This preserves reproducibility while avoiding guessed semantic versions. + +## 7. Manifest Preparation Checklist + +For each dataset manifest: + +1. `dataset_key` is correct (`onsud`, `open_uprn`, `open_roads`, `open_names`) +2. `release_id` follows rules above +3. `source_url` is the exact URL used to download +4. `file_path` points to the local extracted file used for ingest +5. `expected_sha256` equals `shasum -a 256 ` +6. `column_map` is explicit and validated from inspected headers/layers +7. For Open Roads, `layer_name` is set and validated via `ogrinfo` + +## 8. Operational Notes + +- File sizes are large (hundreds of MB to >1 GB). Use a stable network connection. +- Prefer re-runnable shell scripts over manual ad-hoc commands. +- Keep downloaded archives and manifests together under `data/source_files/real` and `data/manifests/real` for auditability. diff --git a/docs/spec/phase_1/changes.md b/docs/spec/phase_1/changes.md new file mode 100644 index 0000000..1c3149b --- /dev/null +++ b/docs/spec/phase_1/changes.md @@ -0,0 +1,87 @@ +## 2026-02-20 — CHG-0001 + +What changed: +- Replaced nearest-road candidate ordering in Phase 1 derived build from GiST KNN (`<->`) to deterministic `ST_DWithin + ST_Distance + segment_id`. + +Why it changed: +- Full-scale real dataset runs reproduced PostgreSQL/PostGIS runtime failure: `index returned tuples in wrong order` during KNN nearest-neighbor evaluation. +- Failure was reproduced in isolation against dedicated `iso_knn` tables and a single point fixture, and persisted after `REINDEX`. + +Before behavior: +- `ORDER BY point_geom <-> road_geom, segment_id ASC` with no explicit `ST_DWithin` in the lateral nearest-road selector. 
+ +After behavior: +- `WHERE ST_DWithin(point_geom, road_geom, radius_m)` +- `ORDER BY ST_Distance(point_geom, road_geom) ASC, segment_id ASC` + +Observed / expected metric impact: +- Expected semantic output: unchanged for rows with a nearest named segment within radius. +- Expected operational impact: slower runtime in dense urban areas due full distance ordering within radius candidates. + +Determinism confirmation: +- Tie-break remains stable and explicit (`distance`, then `segment_id`). + +Spec update confirmation: +- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_1/spec.md` to reflect runtime query contract and dated change note. + +## 2026-02-20 — CHG-0002 + +What changed: +- Added stage checkpoint persistence in `meta.release_set_stage_checkpoint`. +- Added resumable build mode: `pipeline run phase1 --resume`. + +Why it changed: +- Full-scale builds can fail late in the process (for example, during derived spatial inference) after long-running successful stages. +- Restarting from zero is operationally expensive and slows deterministic troubleshooting. + +Before behavior: +- `pipeline run phase1` behaved as a single-shot build with no persisted stage boundary checkpoints. +- Failure required a full rerun from the first build step. + +After behavior: +- Successful stage completions are persisted per `release_set_id`. +- `pipeline run phase1 --resume` skips completed stages and continues from the first incomplete stage. +- Default non-resume run on a `created` release set performs a clean rebuild by dropping release tables and clearing checkpoints. +- `--resume` and `--rebuild` are mutually exclusive. + +Observed / expected metric impact: +- No semantic change to output rows or metrics for a successful end-to-end run. +- Operational improvement: failure recovery time reduced by avoiding recomputation of completed stages. + +Determinism confirmation: +- Stage checkpoints only skip previously completed deterministic stages for the same `release_set_id`. 
+- Rebuild path remains explicit via `--rebuild`. + +Spec update confirmation: +- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_1/spec.md` with resume CLI and checkpoint table contract. + +## 2026-02-20 — CHG-0003 + +What changed: +- Refined resume checkpoints from coarse phase-level boundaries to table-level build boundaries. + +Why it changed: +- Long-running builds need restart points between individual core/derived table builds, not only between aggregate phases. + +Before behavior: +- Core build was checkpointed once after all core tables were built. +- Derived build was checkpointed once after derived phase completion. + +After behavior: +- Checkpoints are now written after each table build: + - `core_uprn_postcode_built` + - `core_uprn_point_built` + - `core_road_segment_built` + - `derived_uprn_street_spatial_built` +- Existing legacy checkpoint names (`core_built`, `derived_built`) remain accepted by constraint for backward compatibility. + +Observed / expected metric impact: +- No change to output semantics or metric values. +- Operational improvement: finer-grained resume points reduce rebuild time after late-stage failures. + +Determinism confirmation: +- Each checkpoint still marks completion of deterministic SQL steps for one release set. +- Resume continues by skipping only completed table-level checkpoints. + +Spec update confirmation: +- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_1/spec.md` to lock table-level checkpoint behavior. diff --git a/docs/spec/phase_1/name_norm.md b/docs/spec/phase_1/name_norm.md new file mode 100644 index 0000000..9bb741e --- /dev/null +++ b/docs/spec/phase_1/name_norm.md @@ -0,0 +1,26 @@ +# Phase 1 `name_norm` Specification + +Status: Locked for Phase 1. + +`name_norm` exists for deterministic grouping and hashing within the Phase 1 pipeline. +It is intentionally minimal and must not include linguistic expansion rules. + +## Rules (in order) + +1. Convert to uppercase. +2. 
Trim leading and trailing whitespace. +3. Collapse all internal whitespace runs to a single space. +4. Remove these punctuation characters exactly: + - `.` + - `,` + - `'` + - `-` + +## Explicitly Out of Scope + +- Abbreviation expansion (for example `ST` -> `STREET`). +- Language-aware equivalence. +- Article removal. +- Any fuzzy matching. + +If any of the above is required, it belongs to a separate Phase 2 normalization function. diff --git a/docs/spec/phase_1/spec.md b/docs/spec/phase_1/spec.md index 284b53a..1e683eb 100644 --- a/docs/spec/phase_1/spec.md +++ b/docs/spec/phase_1/spec.md @@ -1,264 +1,298 @@ # Implementation Spec -Open Data Street Inference Import Pipeline (MVP) +Open Data Street Inference Import Pipeline (Phase 1) Datasets: ONSUD + OS Open UPRN + OS Open Roads -Purpose: Build a reproducible, versioned import and transformation pipeline that produces a UPRN to inferred street mapping using only open datasets. - ## 1. Objectives Primary objective: -- Ingest 3 open datasets -- Normalise and join them -- Produce a deterministic, rebuildable derived table: - - `UPRN -> postcode -> nearest named road -> confidence score` - -Secondary objectives: -- Full dataset versioning and provenance -- Deterministic rebuilds -- Metrics collection for quality auditing -- No mutation of raw data after ingest - -Non-goals: -- No address enumeration -- No PAF or AddressBase -- No PPD or EPC -- No serving layer - -## 2. Technology Stack - -Language: -- Python 3.11+ - -Database: -- PostgreSQL + PostGIS extension - -Core libraries: -- psycopg (database) -- SQLAlchemy (optional) -- pandas (CSV handling) -- geopandas (if needed for geometry) -- shapely -- pyproj -- click or argparse (CLI) - -Notes: -- Spatial processing must be delegated to PostGIS where possible -- Avoid loading large geometries into Python memory - -## 3. 
Directory Structure - -- `pipeline/` - - `pyproject.toml` - - `src/` - - `cli.py` - - `config.py` - - `datasets/` - - `onsud.py` - - `open_uprn.py` - - `open_roads.py` - - `ingest/` - - `raw_load.py` - - `transform/` - - `build_core.py` - - `build_derived.py` - - `metrics.py` - - `util/` - - `normalise.py` - - `hashing.py` - - `sql/` - - `schema.sql` - - `indexes.sql` -- `data/` - - `raw/` - - `onsud/` - - `open_uprn/` - - `open_roads/` - -## 4. Dataset Specifications - -## 4.1 ONSUD (ONS UPRN Directory) - -Purpose: -- UPRN to postcode backbone - -Required fields: -- UPRN -- Postcode (unit level) - -Import requirements: -- Load full dataset -- Do not filter rows during raw ingest -- Preserve all original columns - -Derived extraction (`core.uprn_postcode`): -- `uprn` (bigint primary key) -- `postcode_norm` (text) -- `onsud_release_id` (text) - -## 4.2 OS Open UPRN - -Purpose: -- UPRN to coordinates - -Required fields: -- UPRN -- Easting -- Northing -- Latitude -- Longitude - -Derived extraction (`core.uprn_point`): -- `uprn` (bigint primary key) -- `geom` (Point, SRID 4326) -- `lat` -- `lon` -- `easting` -- `northing` -- `open_uprn_release_id` - -## 4.3 OS Open Roads - -Purpose: -- Named road geometries for nearest-neighbour inference - -Required: -- Geometry (LineString or MultiLineString) -- Road name field - -Derived extraction (`core.road_segment`): -- `segment_id` (bigserial primary key) -- `name_display` (text nullable) -- `name_norm` (text nullable) -- `geom` (geometry) -- `open_roads_release_id` - -## 5. Versioning and Provenance - -Table: `meta.dataset_release` - -Fields: -- `dataset_key` (`onsud|open_uprn|open_roads`) -- `release_id` -- `source_url` -- `sha256` -- `retrieved_at` -- `licence` -- `file_path` +- Ingest three open datasets. +- Build deterministic core and derived tables. +- Produce: `UPRN -> postcode -> nearest named road -> confidence score`. + +Quality objectives: +- Deterministic rebuilds from identical inputs. 
+- Full provenance by dataset release identifiers. +- Programmatic gate checks at every stage. +- No hidden state and no implicit defaults. + +### 1.1 Change Note + +- 2026-02-20: Nearest-road implementation switched from GiST KNN (`<->`) to deterministic `ST_DWithin + ST_Distance` ordering due reproducible PostgreSQL/PostGIS runtime failure (`index returned tuples in wrong order`) on national-scale data. +- 2026-02-20: Added explicit Phase 1 resume checkpoints (`meta.release_set_stage_checkpoint`) and `pipeline run phase1 --resume` stage restart semantics. + +## 2. Scope and Non-goals -Table: `meta.pipeline_run` +Phase 1 includes: +- Dataset ingest and registration. +- Core table construction. +- Spatial nearest named road inference. +- Distance-based confidence scoring. +- Metrics and canonical hash persistence. -Fields: -- `run_id` (uuid) -- `started_at` -- `finished_at` -- `status` -- `release_map` (json) -- `log_path` +Phase 1 excludes: +- PPD, EPC, LLM logic. +- API/serving layer. +- Enumeration endpoints. +- NI data integration. + +## 3. Operational Model + +The workflow is explicit and separated: +- Ingest commands populate ingest-layer tables only. +- Build command populates release-schema core/derived tables only. +- Activate command repoints stable views only. + +### 3.1 CLI Contract + +- `pipeline db migrate` +- `pipeline ingest onsud --manifest ` +- `pipeline ingest open-uprn --manifest ` +- `pipeline ingest open-roads --manifest ` +- `pipeline release create --onsud-release --open-uprn-release --open-roads-release ` +- `pipeline run phase1 --release-set-id [--resume] [--rebuild]` +- `pipeline release activate --release-set-id --actor ` Rules: -- Every derived build references exact `release_id` values -- Raw data is immutable -- Rebuilds must be deterministic +- `pipeline run phase1` never performs ingest work. +- If release set status is `built` and `--rebuild` is absent, `run phase1` is a no-op. 
+- `--resume` continues only incomplete build stages for that release set using persisted stage checkpoints. +- Without `--resume`, a `created` release set starts as a clean build (drop/recreate release tables, clear checkpoints). +- `--resume` and `--rebuild` are mutually exclusive. +- Checkpoints are written after each table build boundary (not only after aggregate phases). + +## 4. Data Model + +## 4.1 `meta` schema + +### `meta.dataset_release` +Required fields include: +- `dataset_key`, `release_id` (composite key) +- `source_url`, `licence`, `file_path` +- `expected_sha256`, `actual_sha256` +- `retrieved_at`, `manifest_json` +- `source_row_count`, `loaded_row_count` (CSV datasets) +- `source_feature_count`, `loaded_feature_count` (Open Roads) +- `source_layer_name`, `srid_confirmed` + +### `meta.pipeline_run` +Tracks run start/end, status, stage, release set linkage. + +### `meta.release_set_stage_checkpoint` +Persisted build checkpoints for resumable Phase 1 runs: +- `release_set_id`, `stage_name` (composite key) +- `run_id`, `completed_at` + +Allowed `stage_name` values: +- `release_tables_created` +- `core_uprn_postcode_built` +- `core_uprn_point_built` +- `core_road_segment_built` +- `derived_uprn_street_spatial_built` +- `metrics_stored` +- `canonical_hashes_stored` +- `release_marked_built` + +### `meta.release_set` +- `release_set_id` +- `onsud_release_id`, `open_uprn_release_id`, `open_roads_release_id` +- `physical_schema`, `status` +- Hard uniqueness constraint: + - `UNIQUE (onsud_release_id, open_uprn_release_id, open_roads_release_id)` + +### `meta.release_activation_log` +Audit record for view promotion actions (`who`, `when`, `from`, `to`). + +### `meta.dataset_metrics` +Metric key-value records linked to `run_id` and `release_set_id`. 
+ +### `meta.canonical_hash` +- `release_set_id` +- `object_name` +- `projection` (ordered JSON array of columns) +- `row_count` +- `sha256` +- `computed_at` +- `run_id` + +Primary key: +- `(release_set_id, object_name, run_id)` + +Allowed `object_name` values are locked: +- `core_uprn_postcode` +- `core_uprn_point` +- `core_road_segment` +- `derived_uprn_street_spatial` + +## 4.2 `raw` schema + +### `raw.onsud_row` +- `dataset_key`, `release_id`, `source_row_num` +- `uprn`, `postcode` +- `extras_jsonb` + +### `raw.open_uprn_row` +- `dataset_key`, `release_id`, `source_row_num` +- `uprn`, `latitude`, `longitude`, `easting`, `northing` +- `extras_jsonb` + +## 4.3 `stage` schema + +### `stage.open_roads_segment` (locked contract) +Required columns: +- `dataset_key text not null` (must be `open_roads`) +- `release_id text not null` +- `segment_id bigint not null` (ingest-generated, deterministic within release) +- `name_display text` +- `name_norm text` +- `geom_bng geometry(MultiLineString,27700) not null` + +Required constraints: +- `CHECK (dataset_key = 'open_roads')` +- `UNIQUE (release_id, segment_id)` +- `NOT NULL release_id` + +Required indexes: +- btree on `(release_id)` +- GiST on `geom_bng` + +Build linkage rule: +- `pipeline run phase1` must read only rows where: + - `stage.open_roads_segment.release_id = meta.release_set.open_roads_release_id` + +## 4.4 Versioned physical schemas and stable views + +For each release set, build into `rs_` tables: +- `core_uprn_postcode` +- `core_uprn_point` +- `core_road_segment` +- `derived_uprn_street_spatial` + +Stable consumer views: +- `core.uprn_postcode` +- `core.uprn_point` +- `core.road_segment` +- `derived.uprn_street_spatial` + +Only `pipeline release activate` may repoint these views. + +## 5. Ingest Rules + +General ingest rules: +- Manifest-driven field mapping; no schema guessing. +- SHA256 mismatch: hard fail. +- Duplicate `(dataset_key, release_id)` with different hash: hard fail. 
+- Duplicate `(dataset_key, release_id)` with same hash: no-op with clear log. +- Missing required mapped columns: hard fail. + +Open Roads ingest rule: +- `pipeline ingest open-roads` handles loading into `stage.open_roads_segment` and persists source/loaded feature counts. + +## 6. Spatial Inference Rules + +- Metric spatial ops use BNG only (`SRID 27700`). +- Distance calculations never use WGS84 geometry. +- Validity gate validates `geom_bng` specifically. + +Nearest-road query contract: +- Candidate roads are filtered with `ST_DWithin(point, road, radius_m)`. +- Candidate ordering is `ST_Distance(point, road) ASC, segment_id ASC`. +- GiST KNN operator (`<->`) is not used in Phase 1 runtime queries. + +Deterministic tie-breaking: +1. Distance ascending. +2. `segment_id` ascending. + +Confidence score bands are fixed: +- `<=15m => 0.70` +- `<=30m => 0.55` +- `<=60m => 0.40` +- `<=150m => 0.25` +- `>150m => 0.00` + +No named road within radius: +- `method = 'none_within_radius'` +- `confidence_score = 0.00` -## 6. Normalisation Rules +## 7. Normalisation Rules `postcode_norm`: - Uppercase - Remove spaces - Remove non-alphanumeric -`street_norm`: +`name_norm` (Phase 1 minimal and frozen): - Uppercase -- Trim -- Collapse whitespace -- Preserve original name in `street_display` - -`UPRN`: -- Cast to bigint after validation - -## 7. Ingest Workflow - -Step 1: Register dataset release -- Compute SHA256 of archive -- Insert into `meta.dataset_release` - -Step 2: Load raw table -- Use `COPY` for CSV -- Use `ogr2ogr` or PostGIS loader for shapefiles -- Record row counts - -Step 3: Build core tables -- Extract required fields -- Apply normalisation -- Create indexes - -Step 4: Validate joins -- Count UPRNs in ONSUD -- Count matching UPRNs in Open UPRN -- Report coverage percentage - -## 8. Street Inference Algorithm (Phase 1) - -Goal: -- Assign nearest named road to each UPRN - -Process: -1. Join `core.uprn_postcode` with `core.uprn_point` -2. 
For each UPRN with coordinates: - - Use PostGIS KNN operator (`<->`) to find nearest road segment - - Filter to segments with non-null `name_display` - - Compute `ST_Distance` in metres -3. Apply search radius threshold (default `150m`) -4. Assign: - - `street_display` - - `street_norm` - - `distance_m` - - `method = 'open_roads_nearest'` - - `confidence_score` (distance-based banding) -5. Insert into `derived.uprn_street_spatial` - -Confidence bands: -- `<= 15m` -> `0.70` -- `<= 30m` -> `0.55` -- `<= 60m` -> `0.40` -- `<= 150m` -> `0.25` -- `> 150m` -> `0.00` - -If no named road within radius: -- `street_display = NULL` -- `confidence_score = 0.00` -- `method = 'none_within_radius'` +- Trim leading/trailing whitespace +- Collapse internal whitespace to single spaces +- Strip punctuation: `.` `,` `'` `-` + +## 8. Metrics Definitions + +Set definitions are fixed: +- `total_uprns_onsud` = count of non-null UPRNs in `raw.onsud_row` for the release +- `uprns_with_coordinates` = count of distinct UPRNs present in both release core postcode and core point tables +- `uprns_resolved_named_road` = count of UPRNs in derived table where `method='open_roads_nearest'` + +Formulas are fixed: +- `coordinate_coverage_pct = uprns_with_coordinates / total_uprns_onsud * 100` +- `resolution_pct = uprns_resolved_named_road / uprns_with_coordinates * 100` + +## 9. Gate Criteria (Programmatic) + +Registration gate: +- dataset release row exists with expected hash values. + +CSV ingest gate: +- source data-row count (header excluded) equals loaded row count exactly. + +Open Roads gate: +- source feature count equals loaded feature count. +- all `geom_bng` valid. +- `SRID = 27700`. + +Loaded feature count query is locked: -## 9. 
Metrics Collection +```sql +SELECT COUNT(*) AS loaded_feature_count +FROM stage.open_roads_segment +WHERE release_id = :open_roads_release_id; +``` -Compute after build: -- Total UPRNs (ONSUD) -- UPRNs with coordinates -- Coordinate coverage percentage -- UPRNs resolved to named road -- Resolution percentage -- Distance percentiles (P50, P90, P99) +For PostgreSQL/psycopg execution, the parameter style may be adapted, but the logical query must be identical. -Insert into `meta.dataset_metrics` +Core gate: +- core table row counts are non-zero. +- join coverage is computed and logged. -## 10. CLI Contract +Spatial gate: +- resolution percent and distance P50/P90/P99 logged. +- no NULL `method` values. -- `pipeline ingest onsud --release-id --file ` -- `pipeline ingest open-uprn --release-id --file ` -- `pipeline ingest open-roads --release-id --file ` +Metrics gate: +- all mandatory metric keys persisted for the run. -- `pipeline build core --onsud --open-uprn --open-roads ` +Activation gate: +- stable views point to new release set after activation. +- activation log row exists. -- `pipeline build derived street-spatial --onsud --open-uprn --open-roads ` +## 10. Canonical Hash Rules -- `pipeline metrics compute --onsud --open-uprn --open-roads ` +- Canonical hash ordering uses stable key ordering only: + - `core_uprn_postcode`: `ORDER BY uprn ASC` + - `core_uprn_point`: `ORDER BY uprn ASC` + - `core_road_segment`: `ORDER BY segment_id ASC` + - `derived_uprn_street_spatial`: `ORDER BY uprn ASC` +- Never rely on text-collation ordering for canonical hash row order. +- Projection definition used for each hash must be persisted. -## 11. Acceptance Criteria +## 11. 
Test Requirements -- Rebuild produces identical row counts and deterministic output -- 95%+ of UPRNs with coords resolve to some named road (subject to dataset reality) -- All tables indexed appropriately -- No raw data mutation after import -- All outputs traceable to `release_id` values \ No newline at end of file +Required tests include: +- Two Open Roads releases in staging do not mix by `release_id`. +- Duplicate `(release_id, segment_id)` in staging fails. +- `loaded_feature_count` is sourced from locked stage query and persisted in metadata. +- Deterministic tie-break fixture for equal-distance roads. +- Activation rollback safety test for failed transaction. +- Reproducibility test: same inputs produce same canonical hashes. diff --git a/docs/spec/phase_2-open-names/changes.md b/docs/spec/phase_2-open-names/changes.md new file mode 100644 index 0000000..c2955d2 --- /dev/null +++ b/docs/spec/phase_2-open-names/changes.md @@ -0,0 +1,97 @@ +## 2026-02-20 — CHG-0001 + +What changed: +- Locked Voronoi hull buffer as code-level constant: + - `pipeline.config.VORONOI_HULL_BUFFER_M = 20000.0` +- Added explicit Voronoi SQL contract requiring bound parameter usage for `hull_buffer_m`. + +Why it changed: +- Prevent silent drift in Voronoi clipping behavior that would invalidate deterministic outputs and canonical hashes. + +Before behavior: +- Buffer value existed only in planning/docs context and could be inlined ad hoc in SQL. + +After behavior: +- Buffer value is governed by a named constant in code. +- SQL contract uses parameter binding for buffer application. +- Governance requirements are explicit for any constant change. + +Observed / expected metric impact: +- No immediate metric change at lock time. +- Future constant changes are expected to affect enumeration coverage and must be measured. + +Determinism confirmation: +- Fixed constant + fixed seed set yields stable clipped geometry. +- Determinism validated through contract tests. 
+ +Spec update confirmation: +- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/spec.md` and added `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/voronoi_method.md`. + +## 2026-02-20 — CHG-0002 + +What changed: +- Implemented full Phase 2 build flow: + - `pipeline run phase2-open-names` + - mandatory Open Names release on `pipeline release create` + - checkpointed/resumable stage sequence through warnings and canonical hashes +- Added Phase 2 core/derived release-schema tables: + - `core_open_names_entry` + - `core_postcode_unit_seed` + - Phase 2 shape of `derived_uprn_street_spatial` + - `derived_postcode_street` +- Added activation warning gate with auditable acknowledgement (`--ack-warnings`). +- Added Phase 2 metrics and canonical hash set. +- Added Phase 2 E2E fixture scripts. + +Why it changed: +- Move from Phase 1 baseline to two-source reconciliation and postcode street enumeration while retaining deterministic, resumable operations. + +Before behavior: +- Build/runtime supported Phase 1 only. +- No Open Names build stages, no warnings gate, no enumeration output. + +After behavior: +- Phase 2 pipeline runs end-to-end with explicit checkpoints and activation gating. + +Observed / expected metric impact: +- New metrics expose corroboration, replacement, disagreement, and enumeration coverage. + +Determinism confirmation: +- Canonical hashes extended to Phase 2 objects. +- Ordering rules for hash projections use deterministic keys / `COLLATE "C"` where text sorting is required. 
+ +Spec update confirmation: +- Updated: + - `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/spec.md` + - `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/prd.md` + +## 2026-02-20 — CHG-0003 + +What changed: +- Replaced hard-fail duplicate-seed rule in `core_postcode_unit_seed` with deterministic representative seed derivation per postcode unit: + - `AVG(postcode_unit_easting::numeric)` + - `AVG(postcode_unit_northing::numeric)` +- Added diagnostics metrics: + - `postcode_unit_seed_multi_coord_count` + - `postcode_unit_seed_max_distinct_coords` + +Why it changed: +- Real ONSUD release `ONSUD_NOV_2025` contains widespread multi-coordinate postcode units, so hard-failing duplicates blocks all national builds. + +Before behavior: +- Build failed on first postcode unit with >1 distinct seed coordinate pair. + +After behavior: +- Build derives one deterministic seed per postcode unit and proceeds. +- Multiplicity is measured and persisted as quality diagnostics. + +Observed / expected metric impact: +- Build completion becomes possible on real national data. +- New metrics expose seed multiplicity scale for monitoring. + +Determinism confirmation: +- Numeric-average aggregation is deterministic for fixed input rows. +- No row-order-dependent seed selection is used. + +Spec update confirmation: +- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/spec.md`. 
diff --git a/docs/spec/phase_2-open-names/prd.md b/docs/spec/phase_2-open-names/prd.md index 811300b..23594a7 100644 --- a/docs/spec/phase_2-open-names/prd.md +++ b/docs/spec/phase_2-open-names/prd.md @@ -1,375 +1,100 @@ -# PRD: OS Open Names Integration +# PRD: Phase 2 Open Names Augmentation -**Status:** Proposed -**Phase:** 1.5 — post Phase 1 pipeline stable -**Author:** Jamie -**Date:** February 2026 +**Phase:** 2 +**Date:** February 20, 2026 ---- +## Goal -## Problem +Phase 2 adds Open Names as a second signal to Phase 1 street inference and adds postcode-level street enumeration. -The Phase 1 street inference pipeline assigns a street name to every UPRN by finding the nearest named road segment in OS Open Roads within a 150m radius. This approach works well but has two known failure modes that Open Names can directly address. +Deliverables: -**Failure mode 1 — road numbers instead of street names.** -OS Open Roads sometimes labels road segments with their road number rather than their colloquial name. A UPRN on the A40 through Acton receives `A40` as its street name from Open Roads. The street residents know is `Western Avenue`. This produces technically correct but practically useless street names for any UPRN on a numbered road. +1. Improve UPRN street assignment quality with two-source reconciliation. +2. Persist transparent provenance and disagreement signals. +3. Build deterministic postcode street listings for unit/sector/district/area. +4. Keep the build resumable and operationally auditable at national scale. -**Failure mode 2 — unverified single-source inference.** -The current pipeline has one signal per UPRN. There is no way to distinguish a confident correct inference from a confident wrong inference — the confidence score reflects distance to road geometry only, not whether the name is actually right. A UPRN 12m from a road segment gets a 0.70 confidence score whether or not that road is the street the UPRN is addressed to. 
+## Product Decisions (Locked) -Open Names is a purpose-built street and place gazetteer. It carries street names as primary facts rather than geometry labels. Adding it as a second signal addresses both problems. +- Open Names is mandatory in Phase 2 release creation. +- Ingest is explicit; `run phase2-open-names` does no ingest. +- Build is checkpointed and resumable. +- Warning acknowledgement is required for activation when disagreement rate is high. +- Voronoi clipping uses code constant `VORONOI_HULL_BUFFER_M` via SQL parameter binding. ---- +## User Outcomes -## Goals +Pipeline operators can: -1. Detect and replace road number labels (`A40`, `B1234`) with colloquial street names from Open Names where available. -2. Introduce a corroboration signal — a boolean indicating that Open Roads and Open Names agree on the street name for a given UPRN. -3. Store both source names in the pipeline output so the resolution method is transparent and auditable. -4. Do not degrade Phase 1 street inference results. Open Names is augmentation, not replacement. Where Open Names has no entry, Phase 1 behaviour is unchanged. -5. Remain entirely on OGL data. No new licences. +- ingest four datasets explicitly +- create a release set deterministically +- run/repair builds with `--resume` +- inspect warnings before activation +- activate only with explicit acknowledgement when required ---- +Downstream consumers can: -## Non-goals - -- Replacing the spatial KNN inference approach. Open Roads geometry remains the primary mechanism. -- Full address completion or delivery-point resolution. This is still street-level inference. -- Northern Ireland. Deferred pending CRS confirmation gate, same as all NI work. -- Abbreviation expansion or cross-source string equivalence matching. That is Phase 2 `street_equivalence_norm` work, not this. 
- ---- - -## Data source - -**OS Open Names** -Publisher: Ordnance Survey -Licence: OGL v3.0 -Format: CSV (zipped) or GeoPackage -Coverage: Great Britain -Update frequency: Approximately six-monthly -URL: `osdatahub.os.uk/downloads/open/OpenNames` - -Relevant fields: - -| Field | Description | -|-------|-------------| -| `ID` | Unique identifier for the named place entry | -| `NAMES_URI` | URI identifier | -| `NAME1` | Primary name | -| `NAME1_LANG` | Language of NAME1 (blank = English, `wel` = Welsh) | -| `NAME2` | Secondary name (e.g. Welsh form where NAME1 is English) | -| `TYPE` | Top-level type: `transportNetwork`, `populatedPlace`, etc. | -| `LOCAL_TYPE` | Specific type: `Road`, `Named Road`, `Numbered Road`, `Street`, etc. | -| `GEOMETRY_X` | Easting (BNG, SRID 27700) | -| `GEOMETRY_Y` | Northing (BNG, SRID 27700) | -| `POSTCODE_DISTRICT` | Postcode district associated with this entry | -| `POPULATED_PLACE` | Associated settlement name | -| `DISTRICT_BOROUGH` | Administrative district | -| `COUNTY_UNITARY` | County or unitary authority | - -For street inference, filter to entries where `LOCAL_TYPE` IN (`Road`, `Named Road`, `Street`). Exclude `Numbered Road` entries — these are the road number labels you are trying to replace, not the names you want. - ---- - -## Schema changes - -### New ingest table - -Add to the manifest-driven ingest system. Open Names is a first-class dataset with its own `dataset_key`, release tracking, SHA256 verification, and `meta.dataset_release` row. 
- -```sql --- raw.open_names_row -CREATE TABLE raw.open_names_row ( - id bigserial PRIMARY KEY, - dataset_key text NOT NULL, - release_id text NOT NULL, - source_row_num bigint NOT NULL, - entry_id text, - name1 text, - name1_lang text, - name2 text, - type text, - local_type text, - geometry_x double precision, - geometry_y double precision, - postcode_district text, - populated_place text, - extras_jsonb jsonb, - FOREIGN KEY (dataset_key, release_id) - REFERENCES meta.dataset_release (dataset_key, release_id) -); -``` - -### New core table - -Add `open_names_entry` to the versioned physical schema per release set, alongside `uprn_postcode`, `uprn_point`, and `road_segment`. - -```sql --- rs_.open_names_entry -CREATE TABLE open_names_entry ( - id bigserial PRIMARY KEY, - entry_id text NOT NULL, - name_display text NOT NULL, - name_norm text NOT NULL, - name2_display text, - name2_norm text, - local_type text NOT NULL, - geom_bng geometry(Point, 27700) NOT NULL, - postcode_district text, - populated_place text -); - -CREATE INDEX ON open_names_entry USING GIST (geom_bng); -CREATE INDEX ON open_names_entry (name_norm); -CREATE INDEX ON open_names_entry (postcode_district); -``` - -Only rows where `local_type IN ('Road', 'Named Road', 'Street')` are loaded into this table. All other Open Names entry types are discarded at load time. This is enforced in the loader, not via a view filter. - -### Changes to `uprn_street_spatial` - -The following fields are added. All are nullable — absence means Open Names had no entry for this UPRN. 
- -```sql --- New columns on rs_.uprn_street_spatial - -street_open_roads text, -- raw name from Open Roads segment (was: street_display) -street_open_names text, -- raw name from nearest Open Names entry (nullable) -street_display text, -- final resolved display name -street_norm text, -- name_norm applied to street_display -name_source text, -- 'open_roads' | 'open_names' | 'corroborated' -corroborated boolean, -- true if Open Roads and Open Names agree -open_names_distance_m double precision -- distance from UPRN to Open Names entry point (nullable) -``` - -`street_display` and `street_norm` remain the stable consumer-facing fields. The resolution method is transparent via `name_source` and both raw source values are preserved. - -### Release set manifest - -`meta.release_set` gains a fourth release ID column: - -```sql -ALTER TABLE meta.release_set - ADD COLUMN open_names_release_id text REFERENCES meta.dataset_release(release_id); -``` - -The unique constraint is updated: - -```sql -ALTER TABLE meta.release_set - DROP CONSTRAINT uq_release_set_inputs; - -ALTER TABLE meta.release_set - ADD CONSTRAINT uq_release_set_inputs - UNIQUE (onsud_release_id, open_uprn_release_id, open_roads_release_id, open_names_release_id); -``` - -Open Names is optional for backwards compatibility with Phase 1 release sets — the column is nullable. A release set with `open_names_release_id IS NULL` behaves exactly as Phase 1 and produces `street_open_names = NULL`, `name_source = 'open_roads'`, `corroborated = false` throughout. - ---- - -## CLI changes - -New ingest command: - -``` -pipeline ingest open-names --manifest -``` - -Follows the same pattern as existing ingest commands. Manifest schema is identical. 
- -`pipeline release-set create` gains an optional argument: - -``` -pipeline release-set create \ - --onsud-release \ - --open-uprn-release \ - --open-roads-release \ - --open-names-release # optional -``` - -`pipeline build derived street-spatial` is updated to use Open Names if the release set includes it. If `open_names_release_id` is null, behaviour is identical to Phase 1. - -`pipeline run phase1` is unchanged. Open Names augmentation runs as part of the spatial inference stage when present — it is not a separate pipeline stage. - ---- - -## Resolution logic - -The reconciliation logic runs per UPRN after the Open Roads KNN join. It is deterministic and documented here as the executable specification. - -### Step 1 — Open Roads result +- read final street name + source-specific street names +- inspect corroboration/disagreement indicators +- query streets by postcode hierarchy level -Proceed as Phase 1. For each UPRN, find the nearest named road segment in Open Roads within 150m. Assign `street_open_roads` and `confidence_score` per existing confidence bands. If no road within 150m, set `method = 'none_within_radius'` and skip to output with no Open Names lookup. +## Core Behaviour -### Step 2 — Open Names lookup +### UPRN reconciliation -For each UPRN that has an Open Roads result, find the nearest `open_names_entry` within a search radius. +For each UPRN with an Open Roads match: -Search radius for Open Names lookup: **200m**. This is wider than the Open Roads radius because Open Names representative points are centroids of named streets, not road edge geometry. A street centroid may be further from a UPRN at the end of the street than the road segment edge is. 
+- nearest Open Names in range is considered +- numbered-road labels can be replaced by Open Names names +- corroboration is recorded when normalized names agree +- disagreements are preserved, not auto-corrected -```sql -SELECT - entry_id, - name_display, - name_norm, - ST_Distance(geom_bng, $uprn_geom_bng) AS distance_m -FROM open_names_entry -WHERE ST_DWithin(geom_bng, $uprn_geom_bng, 200) -ORDER BY geom_bng <-> $uprn_geom_bng, entry_id -LIMIT 1; -``` +For UPRNs without Open Roads match: -If no Open Names entry within 200m, set `street_open_names = NULL` and proceed to output with `name_source = 'open_roads'`. +- method remains unresolved (`none_within_radius`) +- Open Names distance/name fields are null -### Step 3 — Numbered road detection +### Enumeration -Check whether the Open Roads name matches the numbered road pattern: +`derived_postcode_street` is a single normalized table keyed by: -```python -import re -NUMBERED_ROAD = re.compile(r'^[AaBbMm]\d+', re.IGNORECASE) +- `(postcode_level, postcode_value_norm, entry_id)` -is_numbered = bool(NUMBERED_ROAD.match(street_open_roads_norm)) -``` +Association methods: -### Step 4 — Reconciliation +- `district_direct` +- `spatial_voronoi` -``` -IF street_open_names IS NULL: - street_display = street_open_roads - name_source = 'open_roads' - corroborated = false +Deterministic precedence: -ELSE IF is_numbered AND street_open_names IS NOT NULL: - street_display = street_open_names - name_source = 'open_names' - corroborated = false - -- numbered road replaced by Open Names name +- `district_direct` wins over `spatial_voronoi` -ELSE IF name_norm(street_open_roads) == name_norm(street_open_names): - street_display = street_open_roads -- prefer Open Roads form for display - name_source = 'corroborated' - corroborated = true +## Quality and Operations -ELSE: - -- genuine disagreement between named sources - street_display = street_open_roads -- Open Roads is primary - name_source = 'open_roads' - corroborated = false - -- 
disagreement is visible via street_open_names != NULL and corroborated = false -``` +Mandatory metrics include: -Genuine disagreements are not resolved automatically. They are preserved in the output and flagged for the LLM review pipeline (Phase 2). The consumer sees the Open Roads name. The disagreement is visible and auditable. +- Phase 1 coverage and distance metrics +- corroboration/replacement/disagreement metrics +- postcode-units with/without streets -### Step 5 — Output +Activation gate: -Apply `name_norm` to `street_display` to produce `street_norm`. Populate all fields. `computed_at` timestamp as per Phase 1. +- if `disagreement_pct > 5%`, warning must be acknowledged ---- +Determinism checks: -## Confidence model +- canonical hashes stored for all Phase 2 core/derived objects +- same inputs must reproduce identical hashes -The distance-based confidence score from Phase 1 is unchanged. Open Names corroboration is expressed separately as a boolean, not folded into the numeric score. This keeps the two signals independent and interpretable. - -A consumer who wants to weight corroborated results more highly can do so in their own logic. The API will expose `corroborated` as a field on the street enrichment response. - -The only confidence score modification: if `name_source = 'open_names'` (numbered road replacement), the confidence score is set to the Open Names distance band rather than the Open Roads distance band: - -``` -open_names_distance_m ≤ 50m → 0.70 -open_names_distance_m ≤ 100m → 0.55 -open_names_distance_m ≤ 200m → 0.40 -``` - -These bands are wider than Open Roads bands because Open Names point geometry is a centroid, not an edge. This is documented in the confidence model specification. 
- ---- - -## New metrics - -Add to `meta.dataset_metrics` for any release set that includes Open Names: - -| Metric key | Description | -|------------|-------------| -| `open_names_entries_loaded` | Total Open Names entries loaded into `open_names_entry` | -| `uprns_corroborated` | UPRNs where Open Roads and Open Names agree | -| `corroboration_pct` | `uprns_corroborated` / `uprns_resolved_named_road` × 100 | -| `uprns_numbered_road_replaced` | UPRNs where Open Roads number was replaced by Open Names name | -| `numbered_road_replacement_pct` | `uprns_numbered_road_replaced` / total resolved UPRNs × 100 | -| `uprns_genuine_disagreement` | UPRNs where both sources present but names differ | -| `disagreement_pct` | `uprns_genuine_disagreement` / total resolved UPRNs × 100 | - -The `disagreement_pct` metric is the key quality signal for this feature. A high disagreement rate indicates either a data quality issue or a flaw in the reconciliation logic and should trigger investigation before the release set is activated. - ---- - -## Gate criteria - -| Gate | Pass condition | -|------|---------------| -| Open Names ingest | `ogrinfo`-reported feature count for selected entries equals loaded `open_names_entry` count. Both counts recorded. | -| Open Names content | `open_names_entry` contains only `local_type IN ('Road', 'Named Road', 'Street')` rows. Verify with count query. | -| Spatial inference | All existing Phase 1 gate criteria pass unchanged. | -| New metrics | All new metric keys present in `meta.dataset_metrics` for release sets with Open Names. | -| Disagreement rate | `disagreement_pct` logged. If > 5%, emit a warning and require explicit confirmation before activation. Not a hard block but must be acknowledged. | -| Backwards compatibility | A release set with `open_names_release_id = NULL` produces output identical to Phase 1 canonical hashes. Verified by test. 
| - ---- - -## Test cases - -**Numbered road replacement:** -UPRN fixture with nearest Open Roads segment = `A40`. Open Names entry `Western Avenue` within 200m. Expected output: `street_display = 'Western Avenue'`, `name_source = 'open_names'`, `corroborated = false`. - -**Corroboration:** -UPRN fixture with Open Roads = `HIGH STREET` and Open Names nearest entry = `High Street`. After `name_norm` both resolve to `HIGH STREET`. Expected: `corroborated = true`, `name_source = 'corroborated'`, `street_display = 'HIGH STREET'`. - -**Genuine disagreement:** -UPRN fixture where Open Roads = `BACK LANE` and Open Names nearest entry = `STATION ROAD`. Neither is a numbered road. Expected: `street_display = 'BACK LANE'`, `name_source = 'open_roads'`, `corroborated = false`, `street_open_names = 'Station Road'`. - -**No Open Names entry:** -UPRN fixture where no Open Names entry within 200m. Expected: `street_open_names = NULL`, `name_source = 'open_roads'`, `corroborated = false`. Confidence score unchanged from Phase 1. - -**No Open Roads result:** -UPRN fixture with `method = 'none_within_radius'`. Open Names lookup does not run. All Open Names fields NULL. - -**Backwards compatibility:** -Release set with `open_names_release_id = NULL`. Full pipeline run. Canonical hashes must match equivalent Phase 1 release set hashes exactly. - -**Welsh name:** -UPRN fixture in a Welsh postcode where Open Names entry has `NAME1_LANG = 'wel'`. Verify `name_display` preserves Welsh form. Verify `name_norm` applies the same rules as English names (uppercase, trim, collapse whitespace, explicit punctuation strip). No translation or substitution. 
- ---- - -## Documentation deliverables - -- `/Users/jamie/code/postcod.es/docs/spec/phase_1/open_names.md` — this PRD, updated to reflect final implementation decisions -- `/Users/jamie/code/postcod.es/docs/spec/phase_1/name_norm.md` — updated to confirm `name_norm` rules apply identically to Open Names and Open Roads names -- `/Users/jamie/code/postcod.es/docs/spec/phase_1/confidence_model.md` — updated to document Open Names distance bands and the numbered road replacement case -- `/Users/jamie/code/postcod.es/docs/spec/data_sources.md` — Open Names added as a data source with licence, URL, update frequency, and field mapping - ---- - -## Out of scope - -- `street_equivalence_norm` for cross-source matching. Phase 2. -- LLM-assisted review of genuine disagreements. Phase 2. -- Welsh/English name equivalence resolution. Phase 2. -- Any use of Open Names `populatedPlace`, `districtBorough`, or `countyUnitary` fields for administrative geography. ONSUD already provides this more reliably. -- Open Names entries with `LOCAL_TYPE` other than `Road`, `Named Road`, `Street`. Locality names, settlements, and water features are out of scope for street inference. - ---- - -## Open questions - -**Search radius for Open Names lookup.** 200m is proposed based on the characteristic that Open Names points are street centroids rather than road edges. This may need tuning after first data run. The metric `open_names_distance_m` is stored precisely so the radius can be evaluated empirically and adjusted in a subsequent release. - -**Multiple Open Names entries within radius.** The spec takes the single nearest entry. It is possible that two named streets are equidistant from a UPRN and the nearest Open Names entry is the wrong one. For Phase 1.5 this is acceptable — the tie-breaking rule (distance then `entry_id`) is deterministic. If the disagreement rate metric surfaces this as a significant problem, a future release can introduce a candidate set approach. 
+## Non-goals -**Open Names update frequency.** OS publishes Open Names approximately every six months. This is less frequent than the Open Roads update cadence. A release set that updates Open Roads without updating Open Names will use a stale Open Names release. This is valid — the pipeline explicitly tracks which Open Names release is in each release set — but it means corroboration rates may drift between releases if the underlying data diverges. Monitor via `disagreement_pct` metric. +- API work +- NI support +- LLM adjudication +- heavy multilingual equivalence logic ---- +## References -*This PRD describes the intended behaviour. Any deviation during implementation must be recorded in `changes.md` with a rationale.* \ No newline at end of file +- `docs/spec/phase_2-open-names/spec.md` +- `docs/spec/phase_2-open-names/voronoi_method.md` +- `docs/spec/phase_2-open-names/changes.md` diff --git a/docs/spec/phase_2-open-names/spec.md b/docs/spec/phase_2-open-names/spec.md new file mode 100644 index 0000000..3a8e422 --- /dev/null +++ b/docs/spec/phase_2-open-names/spec.md @@ -0,0 +1,269 @@ +# Software Requirements Specification +## Phase 2: Open Names Augmentation + Postcode Street Enumeration + +**Document ID:** SRS-PIPELINE-002 +**Version:** 1.1 +**Date:** February 20, 2026 + +## 1. Scope Lock + +Phase 2 is a controlled extension of the existing pipeline. + +- Ingest remains explicit and manifest-driven. +- Build remains deterministic and resumable. +- Activation remains explicit and transactional. +- No serving/API work is included. + +Phase 2 is not optional in release composition: + +- `pipeline release create` requires all four releases: + - `onsud_release_id` + - `open_uprn_release_id` + - `open_roads_release_id` + - `open_names_release_id` + +## 2. 
CLI Contract + +- `pipeline ingest open-names --manifest ` +- `pipeline run phase2-open-names --release-set-id [--rebuild] [--resume] [--open-roads-radius-m ] [--open-names-radius-m ]` +- `pipeline release activate --release-set-id --actor [--ack-warnings]` + +Rules: + +- `pipeline run phase2-open-names` performs no ingest work. +- `--rebuild` and `--resume` are mutually exclusive. +- If a release set is already `built`/`active`, run is a no-op unless `--rebuild` is set. + +## 3. Build Checkpoints (Locked Order) + +1. `release_tables_created` +2. `core_uprn_postcode_built` +3. `core_uprn_point_built` +4. `core_road_segment_built` +5. `core_open_names_entry_built` +6. `core_postcode_unit_seed_built` +7. `derived_uprn_street_spatial_built` +8. `derived_postcode_street_built` +9. `metrics_stored` +10. `warnings_stored` +11. `canonical_hashes_stored` +12. `release_marked_built` + +`warnings_stored` is intentionally before `canonical_hashes_stored`. + +## 4. Core Data Model + +Physical tables are built in `rs_`. + +### 4.1 `core_open_names_entry` + +- `entry_id text primary key` +- `name_display text` +- `name_norm text` +- `name2_display text` +- `name2_norm text` +- `name1_lang text` +- `local_type text not null` +- `postcode_district_norm text` +- `geom_bng geometry(Point,27700) not null` +- `open_names_release_id text not null` + +### 4.2 `core_postcode_unit_seed` + +Built from ONSUD postcode-unit grid references. + +- `postcode_unit_norm text primary key` +- `postcode_sector_norm text` +- `postcode_district_norm text` +- `postcode_area_norm text` +- `easting double precision not null` +- `northing double precision not null` +- `geom_bng geometry(Point,27700) not null` +- `onsud_release_id text not null` + +Hard failures: + +- Missing seed for any postcode unit present in `core_uprn_postcode` + +Seed selection rule (locked): + +- ONSUD can contain multiple coordinate pairs per postcode unit in real releases. 
+- `core_postcode_unit_seed` derives one deterministic representative seed per postcode unit using: + - `AVG(postcode_unit_easting::numeric)` + - `AVG(postcode_unit_northing::numeric)` +- This is deterministic, auditable, and avoids non-reproducible row-choice behavior. +- Diagnostic metrics track seed multiplicity (`postcode_unit_seed_multi_coord_count`, `postcode_unit_seed_max_distinct_coords`). + +## 5. UPRN Street Reconciliation (`derived_uprn_street_spatial`) + +### 5.1 Inputs and Tie-Breaking + +- Open Roads nearest lookup radius default: `150m` +- Open Names nearest lookup radius default: `200m` +- Open Roads tie-break: distance ascending, `segment_id` ascending +- Open Names tie-break: distance ascending, `entry_id` ascending +- KNN operator `<->` is not used; deterministic `ST_Distance` ordering is used. + +Road-number detection regex is locked: + +- `^(A|B|M)[0-9]{1,4}[A-Z]?$` +- Applied to stripped token (`upper`, non-alphanumeric removed) + +### 5.2 Reconciliation Rules + +For rows with `method = 'open_roads_nearest'`: + +1. No Open Names in range: keep Open Roads. +2. Open Roads is numbered road and Open Names in range: replace with Open Names. +3. Both present and `name_norm` equal: mark corroborated. +4. Both present and differ: keep Open Roads. + +For rows with `method = 'none_within_radius'`: + +- No Open Names lookup is used. +- Open Names fields are null. + +### 5.3 Output Columns (Phase 2) + +- `street_open_roads` +- `street_open_names` +- `street_display` +- `street_norm` +- `open_roads_distance_m` +- `open_names_distance_m` +- `confidence_score` +- `method` +- `name_source` +- `corroborated` +- `sources text[]` (fixed order provenance) + +`open_names_distance_m` semantics are locked: + +- Distance to nearest in-range Open Names entry +- `NULL` when no Open Names entry exists within `open_names_radius_m` + +### 5.4 Provenance Order (`sources`) + +Always populated in this exact order: + +1. `onsud:` +2. `open_uprn:` +3. `open_roads:` +4. 
`open_names:` + +## 6. Postcode Street Enumeration (`derived_postcode_street`) + +Single normalized table: + +- `postcode_level text` in (`unit`,`sector`,`district`,`area`) +- `postcode_value_norm text` +- `entry_id text` +- `name_display text` +- `name_norm text` +- `name2_display text` +- `name2_norm text` +- `association_method text` in (`district_direct`,`spatial_voronoi`) +- `sources text[]` +- release IDs + +Primary key: + +- `(postcode_level, postcode_value_norm, entry_id)` + +Lookup index: + +- `(postcode_level, postcode_value_norm)` + +### 6.1 Association Contract + +Two candidate methods are generated: + +- `district_direct`: `core_open_names_entry.postcode_district_norm = core_postcode_unit_seed.postcode_district_norm` +- `spatial_voronoi`: Open Names point contained by postcode-unit Voronoi cell + +De-dup precedence is locked: + +1. `district_direct` +2. `spatial_voronoi` + +### 6.2 Voronoi Contract + +- Seed source: `core_postcode_unit_seed.geom_bng` +- Clipping: convex hull of seeds buffered by `pipeline.config.VORONOI_HULL_BUFFER_M` +- Buffer value is parameter-bound SQL (`hull_buffer_m`), never inlined +- Each postcode unit seed must map to exactly one Voronoi cell, else hard fail + +Reference: `docs/spec/phase_2-open-names/voronoi_method.md` + +## 7. Metrics (Mandatory) + +Phase 1 metrics remain mandatory plus Phase 2 keys. 
+ +Additional keys: + +- `open_names_entries_loaded` +- `uprns_corroborated` +- `corroboration_pct` +- `uprns_numbered_road_replaced` +- `numbered_road_replacement_pct` +- `uprns_genuine_disagreement` +- `disagreement_pct` +- `postcode_units_with_streets` +- `postcode_units_without_streets` +- `open_names_search_radius_m` +- `total_resolved` + +Formula lock: + +- `total_uprns_onsud = COUNT(non-null uprn in raw.onsud_row for onsud release)` +- `uprns_with_coordinates = COUNT(DISTINCT uprn in core_uprn_postcode ∩ core_uprn_point)` +- `total_resolved = COUNT(*) WHERE method='open_roads_nearest'` +- `coordinate_coverage_pct = uprns_with_coordinates / total_uprns_onsud * 100` +- `resolution_pct = total_resolved / uprns_with_coordinates * 100` +- `corroboration_pct = uprns_corroborated / total_resolved * 100` +- `numbered_road_replacement_pct = uprns_numbered_road_replaced / total_resolved * 100` +- `disagreement_pct = uprns_genuine_disagreement / total_resolved * 100` + +## 8. Warning + Activation Gate + +If `disagreement_pct > 5.0`: + +- Persist warning row in `meta.pipeline_run_warning` +- Set `requires_ack = true` +- Activation is blocked until acknowledged + +Acknowledgement path: + +- `pipeline release activate ... --ack-warnings` +- Writes `acknowledged_by` and `acknowledged_at` + +## 9. Canonical Hash Contract + +Stored in `meta.canonical_hash` per run. + +Phase 2 object names: + +- `core_uprn_postcode` +- `core_uprn_point` +- `core_road_segment` +- `core_open_names_entry` +- `core_postcode_unit_seed` +- `derived_uprn_street_spatial` +- `derived_postcode_street` + +Hash rules: + +- Explicit projection list stored as ordered JSON array +- Operational timestamps excluded +- Deterministic key ordering only (numeric keys or `COLLATE "C"` for text keys) + +## 10. 
Out of Scope + +- API serving design +- NI dataset onboarding +- LLM disagreement adjudication +- Cross-source linguistic equivalence (`street_equivalence_norm`) + +Any behaviour change to this contract must be recorded in: + +- `docs/spec/phase_2-open-names/changes.md` diff --git a/docs/spec/phase_2-open-names/voronoi_method.md b/docs/spec/phase_2-open-names/voronoi_method.md new file mode 100644 index 0000000..6a2ee25 --- /dev/null +++ b/docs/spec/phase_2-open-names/voronoi_method.md @@ -0,0 +1,33 @@ +# Voronoi Method (Phase 2) + +This document locks the Voronoi clipping contract used by postcode street enumeration. + +## Locked Constant + +- `pipeline.config.VORONOI_HULL_BUFFER_M = 20000.0` + +This value is behavior-defining and hash-impacting. It must not be changed silently. + +## SQL Contract + +Voronoi clipping uses PostGIS-native operations with a bound parameter: + +- `ST_ConvexHull(ST_Collect(seed_geom_bng))` +- `ST_Buffer(..., %(hull_buffer_m)s)` +- `ST_VoronoiPolygons(..., (SELECT gb_clip_geom ...))` + +`hull_buffer_m` is bound at runtime. The buffer must not be inlined as a SQL literal. + +## Governance + +Any change to `VORONOI_HULL_BUFFER_M` requires all of the following before implementation: + +1. Prior entry in `docs/spec/phase_2-open-names/changes.md` +2. Canonical hash re-baseline for affected objects +3. Before/after metric comparison for enumeration coverage + +## Determinism Notes + +- Seed inputs are ordered deterministically. +- Voronoi clipping geometry is deterministic for a fixed seed set and `VORONOI_HULL_BUFFER_M`. +- Runtime ties in downstream spatial joins must have explicit tie-breakers. 
From 3cb8239ef8c4176e08705c124abc60dceeaa53c5 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Fri, 20 Feb 2026 23:27:29 +0000 Subject: [PATCH 02/17] feat(pipeline): implement deterministic ingest/build runtime and migrations --- data/manifests/e2e/onsud_manifest.json | 15 + data/manifests/e2e/open_names_manifest.json | 19 + data/manifests/e2e/open_roads_manifest.json | 15 + data/manifests/e2e/open_uprn_manifest.json | 16 + data/manifests/real/onsud_manifest.json | 15 + data/manifests/real/open_names_manifest.json | 19 + data/manifests/real/open_roads_manifest.json | 15 + data/manifests/real/open_uprn_manifest.json | 16 + data/manifests/real_v3/nsul_manifest.json | 87 + data/manifests/real_v3/onspd_manifest.json | 17 + .../os_open_linked_identifiers_manifest.json | 24 + .../real_v3/os_open_names_manifest.json | 17 + .../real_v3/os_open_roads_manifest.json | 18 + .../real_v3/os_open_uprn_manifest.json | 17 + .../real_v3/os_open_usrn_manifest.json | 18 + .../v3_smoke/gb_core_bundle_manifest.json | 12 + data/manifests/v3_smoke/nsul_manifest.json | 18 + data/manifests/v3_smoke/onspd_manifest.json | 18 + .../os_open_linked_identifiers_manifest.json | 18 + .../v3_smoke/os_open_names_manifest.json | 18 + .../v3_smoke/os_open_roads_manifest.json | 18 + .../v3_smoke/os_open_uprn_manifest.json | 18 + .../v3_smoke/os_open_usrn_manifest.json | 18 + pipeline/config/frequency_weights.yaml | 12 + pipeline/config/normalisation.yaml | 15 + pipeline/config/source_schema.yaml | 107 + pipeline/pyproject.toml | 21 + .../sql/migrations/0001_phase1_foundation.sql | 142 + .../0002_phase1_resume_checkpoints.sql | 25 + .../0003_phase1_table_level_checkpoints.sql | 23 + .../0004_phase2_open_names_foundation.sql | 136 + .../migrations/0005_v3_cutover_foundation.sql | 545 ++++ .../0006_v3_open_roads_stage_compat.sql | 29 + pipeline/src/pipeline/__init__.py | 1 + pipeline/src/pipeline/build/__init__.py | 1 + pipeline/src/pipeline/build/workflows.py | 2697 +++++++++++++++++ 
pipeline/src/pipeline/cli.py | 169 ++ pipeline/src/pipeline/config.py | 33 + pipeline/src/pipeline/contracts/__init__.py | 1 + pipeline/src/pipeline/contracts/open_roads.py | 62 + pipeline/src/pipeline/contracts/voronoi.py | 95 + pipeline/src/pipeline/db/__init__.py | 1 + pipeline/src/pipeline/db/connection.py | 17 + pipeline/src/pipeline/db/migrations.py | 69 + pipeline/src/pipeline/ingest/__init__.py | 1 + pipeline/src/pipeline/ingest/workflows.py | 339 +++ pipeline/src/pipeline/manifest.py | 272 ++ pipeline/src/pipeline/util/hashing.py | 14 + pipeline/src/pipeline/util/normalise.py | 75 + scripts/obtain_phase1_e2e_sources.sh | 5 + scripts/obtain_phase2_e2e_sources.sh | 169 ++ scripts/run_phase1_e2e.sh | 5 + scripts/run_phase2_e2e.sh | 74 + 53 files changed, 5621 insertions(+) create mode 100644 data/manifests/e2e/onsud_manifest.json create mode 100644 data/manifests/e2e/open_names_manifest.json create mode 100644 data/manifests/e2e/open_roads_manifest.json create mode 100644 data/manifests/e2e/open_uprn_manifest.json create mode 100644 data/manifests/real/onsud_manifest.json create mode 100644 data/manifests/real/open_names_manifest.json create mode 100644 data/manifests/real/open_roads_manifest.json create mode 100644 data/manifests/real/open_uprn_manifest.json create mode 100644 data/manifests/real_v3/nsul_manifest.json create mode 100644 data/manifests/real_v3/onspd_manifest.json create mode 100644 data/manifests/real_v3/os_open_linked_identifiers_manifest.json create mode 100644 data/manifests/real_v3/os_open_names_manifest.json create mode 100644 data/manifests/real_v3/os_open_roads_manifest.json create mode 100644 data/manifests/real_v3/os_open_uprn_manifest.json create mode 100644 data/manifests/real_v3/os_open_usrn_manifest.json create mode 100644 data/manifests/v3_smoke/gb_core_bundle_manifest.json create mode 100644 data/manifests/v3_smoke/nsul_manifest.json create mode 100644 data/manifests/v3_smoke/onspd_manifest.json create mode 100644 
data/manifests/v3_smoke/os_open_linked_identifiers_manifest.json create mode 100644 data/manifests/v3_smoke/os_open_names_manifest.json create mode 100644 data/manifests/v3_smoke/os_open_roads_manifest.json create mode 100644 data/manifests/v3_smoke/os_open_uprn_manifest.json create mode 100644 data/manifests/v3_smoke/os_open_usrn_manifest.json create mode 100644 pipeline/config/frequency_weights.yaml create mode 100644 pipeline/config/normalisation.yaml create mode 100644 pipeline/config/source_schema.yaml create mode 100644 pipeline/pyproject.toml create mode 100644 pipeline/sql/migrations/0001_phase1_foundation.sql create mode 100644 pipeline/sql/migrations/0002_phase1_resume_checkpoints.sql create mode 100644 pipeline/sql/migrations/0003_phase1_table_level_checkpoints.sql create mode 100644 pipeline/sql/migrations/0004_phase2_open_names_foundation.sql create mode 100644 pipeline/sql/migrations/0005_v3_cutover_foundation.sql create mode 100644 pipeline/sql/migrations/0006_v3_open_roads_stage_compat.sql create mode 100644 pipeline/src/pipeline/__init__.py create mode 100644 pipeline/src/pipeline/build/__init__.py create mode 100644 pipeline/src/pipeline/build/workflows.py create mode 100644 pipeline/src/pipeline/cli.py create mode 100644 pipeline/src/pipeline/config.py create mode 100644 pipeline/src/pipeline/contracts/__init__.py create mode 100644 pipeline/src/pipeline/contracts/open_roads.py create mode 100644 pipeline/src/pipeline/contracts/voronoi.py create mode 100644 pipeline/src/pipeline/db/__init__.py create mode 100644 pipeline/src/pipeline/db/connection.py create mode 100644 pipeline/src/pipeline/db/migrations.py create mode 100644 pipeline/src/pipeline/ingest/__init__.py create mode 100644 pipeline/src/pipeline/ingest/workflows.py create mode 100644 pipeline/src/pipeline/manifest.py create mode 100644 pipeline/src/pipeline/util/hashing.py create mode 100644 pipeline/src/pipeline/util/normalise.py create mode 100755 scripts/obtain_phase1_e2e_sources.sh 
create mode 100755 scripts/obtain_phase2_e2e_sources.sh create mode 100755 scripts/run_phase1_e2e.sh create mode 100755 scripts/run_phase2_e2e.sh diff --git a/data/manifests/e2e/onsud_manifest.json b/data/manifests/e2e/onsud_manifest.json new file mode 100644 index 0000000..f511616 --- /dev/null +++ b/data/manifests/e2e/onsud_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "onsud", + "release_id": "2026-Q1-E2E-P2", + "source_url": "https://example.local/onsud-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/onsud_sample.csv", + "expected_sha256": "dfe6e4bc4d4405edc6463fcb1b55929f867d8e7b9907afb92e893a9f8911033f", + "format": "csv", + "column_map": { + "uprn": "ONS_UPRN", + "postcode": "ONS_POSTCODE", + "postcode_unit_easting": "PC_UNIT_E", + "postcode_unit_northing": "PC_UNIT_N" + } +} diff --git a/data/manifests/e2e/open_names_manifest.json b/data/manifests/e2e/open_names_manifest.json new file mode 100644 index 0000000..53bd559 --- /dev/null +++ b/data/manifests/e2e/open_names_manifest.json @@ -0,0 +1,19 @@ +{ + "dataset_key": "open_names", + "release_id": "2026-Q1-E2E-P2", + "source_url": "https://example.local/open-names-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/open_names_sample.csv", + "expected_sha256": "b4ca5267fe2a4d7fefe68eca48e3df1bddc8e19d8ac7c99be0293ae2a5e39dac", + "format": "csv", + "column_map": { + "entry_id": "ON_ID", + "name1": "NAME1", + "name1_lang": "NAME1_LANG", + "name2": "NAME2", + "local_type": "LOCAL_TYPE", + "geometry_x": "GEOM_X", + "geometry_y": "GEOM_Y", + "postcode_district": "PC_DISTRICT" + } +} diff --git a/data/manifests/e2e/open_roads_manifest.json b/data/manifests/e2e/open_roads_manifest.json new file mode 100644 index 0000000..95fb851 --- /dev/null +++ b/data/manifests/e2e/open_roads_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "open_roads", + "release_id": "2026-Q1-E2E-P2", + "source_url": 
"https://example.local/open-roads-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/open_roads_sample.geojson", + "expected_sha256": "15a37f6743b873e6bb6bdcb03980cc2e532126d6262344cf0b60b1611c74ba4b", + "format": "geojson", + "layer_name": "open_roads_sample", + "expected_srid": 27700, + "column_map": { + "source_id": "src_id", + "name_display": "road_name" + } +} diff --git a/data/manifests/e2e/open_uprn_manifest.json b/data/manifests/e2e/open_uprn_manifest.json new file mode 100644 index 0000000..7edefd5 --- /dev/null +++ b/data/manifests/e2e/open_uprn_manifest.json @@ -0,0 +1,16 @@ +{ + "dataset_key": "open_uprn", + "release_id": "2026-Q1-E2E-P2", + "source_url": "https://example.local/open-uprn-sample", + "licence": "OGL v3.0", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/e2e/open_uprn_sample.csv", + "expected_sha256": "7b40b6398d8db405d3078a1c12a9368c02b46b72de3ee663ab3403b82c89b2c9", + "format": "csv", + "column_map": { + "uprn": "UPRN_REF", + "latitude": "LAT", + "longitude": "LON", + "easting": "EASTING", + "northing": "NORTHING" + } +} diff --git a/data/manifests/real/onsud_manifest.json b/data/manifests/real/onsud_manifest.json new file mode 100644 index 0000000..a68ff6e --- /dev/null +++ b/data/manifests/real/onsud_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "onsud", + "release_id": "ONSUD_NOV_2025", + "source_url": "https://www.arcgis.com/sharing/rest/content/items/9beb2361978146f8ac85da18d21ee266/data", + "licence": "https://www.ons.gov.uk/methodology/geography/licences", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/onsud/ONSUD_NOV_2025_GB_COMBINED.csv", + "expected_sha256": "ef7f0c29e4a1439309f50e16eb20ecd3120c16bd6c7bbaf6e07b61e5a3e27b7e", + "format": "csv", + "column_map": { + "uprn": "UPRN", + "postcode": "PCDS", + "postcode_unit_easting": "GRIDGB1E", + "postcode_unit_northing": "GRIDGB1N" + } +} diff --git 
a/data/manifests/real/open_names_manifest.json b/data/manifests/real/open_names_manifest.json new file mode 100644 index 0000000..e65d5bc --- /dev/null +++ b/data/manifests/real/open_names_manifest.json @@ -0,0 +1,19 @@ +{ + "dataset_key": "open_names", + "release_id": "open_names_unknown_20260220_aefc8ad3", + "source_url": "https://api.os.uk/downloads/v1/products/OpenNames/downloads?area=GB&format=CSV&redirect", + "licence": "http://os.uk/opendata/licence", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_names/opname_gb_combined.csv", + "expected_sha256": "aefc8ad337e23f1ba7debab45243c2dc2302a4aa95ef7c86d80daaa65f535e05", + "format": "csv", + "column_map": { + "entry_id": "ID", + "name1": "NAME1", + "name1_lang": "NAME1_LANG", + "name2": "NAME2", + "local_type": "LOCAL_TYPE", + "geometry_x": "GEOMETRY_X", + "geometry_y": "GEOMETRY_Y", + "postcode_district": "POSTCODE_DISTRICT" + } +} diff --git a/data/manifests/real/open_roads_manifest.json b/data/manifests/real/open_roads_manifest.json new file mode 100644 index 0000000..d4656ae --- /dev/null +++ b/data/manifests/real/open_roads_manifest.json @@ -0,0 +1,15 @@ +{ + "dataset_key": "open_roads", + "release_id": "open_roads_unknown_20260220_ebbaaaff", + "source_url": "https://api.os.uk/downloads/v1/products/OpenRoads/downloads?area=GB&format=GeoPackage&redirect", + "licence": "http://os.uk/opendata/licence", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_roads/Data/oproad_gb.gpkg", + "expected_sha256": "25cfcf41ce89d49a1714798b25db93d4100b98ff0b07ec6debd20b602c01cc22", + "format": "gpkg", + "layer_name": "road_link", + "expected_srid": 27700, + "column_map": { + "source_id": "id", + "name_display": "name_1" + } +} diff --git a/data/manifests/real/open_uprn_manifest.json b/data/manifests/real/open_uprn_manifest.json new file mode 100644 index 0000000..b592839 --- /dev/null +++ b/data/manifests/real/open_uprn_manifest.json @@ -0,0 +1,16 @@ +{ + "dataset_key": 
"open_uprn", + "release_id": "osopenuprn_202602", + "source_url": "https://api.os.uk/downloads/v1/products/OpenUPRN/downloads?area=GB&format=CSV&redirect", + "licence": "http://os.uk/opendata/licence", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_uprn/osopenuprn_202602.csv", + "expected_sha256": "69156b9fd66c9195dd23e0aa81f20136c0a55c408b27cd729fe79ed3d0afc911", + "format": "csv", + "column_map": { + "uprn": "UPRN", + "latitude": "LATITUDE", + "longitude": "LONGITUDE", + "easting": "X_COORDINATE", + "northing": "Y_COORDINATE" + } +} diff --git a/data/manifests/real_v3/nsul_manifest.json b/data/manifests/real_v3/nsul_manifest.json new file mode 100644 index 0000000..9ef5573 --- /dev/null +++ b/data/manifests/real_v3/nsul_manifest.json @@ -0,0 +1,87 @@ +{ + "source_name": "nsul", + "source_version": "NSUL_JUL_2025_EPOCH_119", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://www.arcgis.com/sharing/rest/content/items/0c5c2c6202464ae280da1a79c14ccca1/data", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "nsul_nsul_jul_2025_ee", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_EE.csv", + "sha256": "5751a9ec254c317203003dfea4a00ba6bbbf81fea51417edb042f97843bf43e9", + "size_bytes": 1232171370, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_em", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_EM.csv", + "sha256": "438f34adbc54438f44111c7bbdbb35be8bbc2283b787327cfae8190adb0c6bb5", + "size_bytes": 956700215, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_ln", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_LN.csv", + "sha256": "0c526371248ecebbef860325062c5b9b55a63f25c3cf0f911403f2e531ea456d", + "size_bytes": 1639724787, + "format": "csv" + }, + { + 
"file_role": "nsul_nsul_jul_2025_ne", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_NE.csv", + "sha256": "e38300351fa95ebce52d59c11896af6496e7bf60c599a667c0598cae2f39cd62", + "size_bytes": 538777804, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_nw", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_NW.csv", + "sha256": "6f6ee1412d111844ae793de42951ae154a427426c4e9ce7ba29f34568f52ffe6", + "size_bytes": 1447417131, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_sc", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_SC.csv", + "sha256": "11b326f012f4b3968373d4e9aa725b8064b9123c85ca6f5230529b0582eb8d3e", + "size_bytes": 1185165031, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_se", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_SE.csv", + "sha256": "e928ef90776c3faf05ec05f9ed1dbe2d8f010fcabcb4f7f0d3fe677c602e2d65", + "size_bytes": 1753125936, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_sw", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_SW.csv", + "sha256": "f7fcb0edbf24fafbc193e1a732a51e6e8c28eefcffbc5a9e74b1c56357f96a3a", + "size_bytes": 1185083958, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_wa", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_WA.csv", + "sha256": "dbdc2878aca7e61a21fbaaf9f6dc9b5b3b9294b842e2d5f564f9641fa1be39fe", + "size_bytes": 664426866, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_wm", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_WM.csv", + "sha256": "c3f0f7d515b271031f9e2568618dfb2187d4a73d75958700fb803f77751652a2", + "size_bytes": 1115129412, + "format": "csv" + }, + { + "file_role": "nsul_nsul_jul_2025_yh", + "file_path": 
"/Users/jamie/code/postcod.es/data/source_files/real/nsul/Data/NSUL_JUL_2025_YH.csv", + "sha256": "9993542006b9b85967f07899a1b0006ebc6dd1d4dc788b8f0eb033efe64d1565", + "size_bytes": 1146747103, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/onspd_manifest.json b/data/manifests/real_v3/onspd_manifest.json new file mode 100644 index 0000000..6343bef --- /dev/null +++ b/data/manifests/real_v3/onspd_manifest.json @@ -0,0 +1,17 @@ +{ + "source_name": "onspd", + "source_version": "ONSPD_NOV_2025", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://www.arcgis.com/sharing/rest/content/items/3635ca7f69df4733af27caf86473ffa1/data", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "onspd_uk_csv", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/onspd/Data/ONSPD_NOV_2025_UK.csv", + "sha256": "d4b54fc4c192495dcb33d4559f225237a46af7428edcb648b2fbb76bf4e9bfe8", + "size_bytes": 1448136855, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/os_open_linked_identifiers_manifest.json b/data/manifests/real_v3/os_open_linked_identifiers_manifest.json new file mode 100644 index 0000000..dc25907 --- /dev/null +++ b/data/manifests/real_v3/os_open_linked_identifiers_manifest.json @@ -0,0 +1,24 @@ +{ + "source_name": "os_open_linked_identifiers", + "source_version": "lids_2026_02", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/LIDS/downloads?area=GB&format=CSV", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "lids_toid_usrn", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/lids/Road_TOID_Street_USRN_10.csv", + "sha256": "54c03a54c7da5b3e8b13316b3f9357f34f562a57c7cd3d37dfbe2c4e17454462", + "size_bytes": 181488324, + "format": 
"csv" + }, + { + "file_role": "lids_uprn_usrn", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/lids/BLPU_UPRN_Street_USRN_11.csv", + "sha256": "1243cd8fce256275491050071200c88d72fc1de2380593f3356f72bba5079fec", + "size_bytes": 5061603216, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/os_open_names_manifest.json b/data/manifests/real_v3/os_open_names_manifest.json new file mode 100644 index 0000000..6e5d691 --- /dev/null +++ b/data/manifests/real_v3/os_open_names_manifest.json @@ -0,0 +1,17 @@ +{ + "source_name": "os_open_names", + "source_version": "opname_gb_202602", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenNames/downloads?area=GB&format=CSV&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_names_csv", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_names/opname_gb_combined.csv", + "sha256": "aefc8ad337e23f1ba7debab45243c2dc2302a4aa95ef7c86d80daaa65f535e05", + "size_bytes": 1802306070, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/os_open_roads_manifest.json b/data/manifests/real_v3/os_open_roads_manifest.json new file mode 100644 index 0000000..4b4421a --- /dev/null +++ b/data/manifests/real_v3/os_open_roads_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_roads", + "source_version": "oproad_gb_202510", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenRoads/downloads?area=GB&format=GeoPackage&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_roads_road_link_gpkg", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_roads/Data/oproad_gb.gpkg", + "sha256": 
"25cfcf41ce89d49a1714798b25db93d4100b98ff0b07ec6debd20b602c01cc22", + "size_bytes": 2133966848, + "format": "gpkg", + "layer_name": "road_link" + } + ] +} diff --git a/data/manifests/real_v3/os_open_uprn_manifest.json b/data/manifests/real_v3/os_open_uprn_manifest.json new file mode 100644 index 0000000..366029e --- /dev/null +++ b/data/manifests/real_v3/os_open_uprn_manifest.json @@ -0,0 +1,17 @@ +{ + "source_name": "os_open_uprn", + "source_version": "osopenuprn_202602", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenUPRN/downloads?area=GB&format=CSV&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_uprn_csv", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_uprn/osopenuprn_202602.csv", + "sha256": "69156b9fd66c9195dd23e0aa81f20136c0a55c408b27cd729fe79ed3d0afc911", + "size_bytes": 2262268626, + "format": "csv" + } + ] +} diff --git a/data/manifests/real_v3/os_open_usrn_manifest.json b/data/manifests/real_v3/os_open_usrn_manifest.json new file mode 100644 index 0000000..68e96ec --- /dev/null +++ b/data/manifests/real_v3/os_open_usrn_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_usrn", + "source_version": "osopenusrn_202602", + "retrieved_at_utc": "2026-02-20T22:41:35Z", + "source_url": "https://api.os.uk/downloads/v1/products/OpenUSRN/downloads?area=GB&format=GeoPackage&redirect", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Real source ingest manifest generated locally.", + "files": [ + { + "file_role": "open_usrn_gpkg", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/real/open_usrn/osopenusrn_202602.gpkg", + "sha256": "fdd6af3efa38ad116d4c7cf436291c4094f654d6f2aaeb218c7bdd862356828e", + "size_bytes": 1005912064, + "format": "gpkg", + "layer_name": "openUSRN" + } + ] +} diff --git 
a/data/manifests/v3_smoke/gb_core_bundle_manifest.json b/data/manifests/v3_smoke/gb_core_bundle_manifest.json new file mode 100644 index 0000000..e975cc5 --- /dev/null +++ b/data/manifests/v3_smoke/gb_core_bundle_manifest.json @@ -0,0 +1,12 @@ +{ + "build_profile": "gb_core", + "source_runs": { + "onspd": "a49f5198-1b1f-4cf5-b9f2-82b450aa9f73", + "os_open_usrn": "60bd0c02-e110-4bfe-b96d-3bebc14516b8", + "os_open_names": "420f6591-24ba-42b8-8a13-3658c9ef0c02", + "os_open_roads": "95119ff1-33cc-4341-b21a-97df73853ac5", + "os_open_uprn": "4a2bef4c-9adc-4427-a11e-e65104b7e86a", + "os_open_linked_identifiers": "6ccc48d1-80e4-4336-a985-7c720781c9fb", + "nsul": "0f24c6e9-ba4c-4d63-baf0-388bbda197b6" + } +} diff --git a/data/manifests/v3_smoke/nsul_manifest.json b/data/manifests/v3_smoke/nsul_manifest.json new file mode 100644 index 0000000..dcea756 --- /dev/null +++ b/data/manifests/v3_smoke/nsul_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "nsul", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/nsul", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/nsul.csv", + "sha256": "430ef8e55d638274af125d86b7d1a0502b5f67ce19ee143aa502ca856751f7fb", + "size_bytes": 30, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/onspd_manifest.json b/data/manifests/v3_smoke/onspd_manifest.json new file mode 100644 index 0000000..fbf430e --- /dev/null +++ b/data/manifests/v3_smoke/onspd_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "onspd", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/onspd", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke 
dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/onspd.csv", + "sha256": "c255cdcf57adb5f5531c0622d0bda81cb4166ccddd7b41f148737a752279fb8b", + "size_bytes": 186, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_linked_identifiers_manifest.json b/data/manifests/v3_smoke/os_open_linked_identifiers_manifest.json new file mode 100644 index 0000000..f1c0ae1 --- /dev/null +++ b/data/manifests/v3_smoke/os_open_linked_identifiers_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_linked_identifiers", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_linked_identifiers", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_linked_identifiers.csv", + "sha256": "bc31799d901c014741b671578b26344cc4b7008e5264926eac0a667de1eaa78f", + "size_bytes": 84, + "format": "csv", + "row_count_expected": 2 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_names_manifest.json b/data/manifests/v3_smoke/os_open_names_manifest.json new file mode 100644 index 0000000..6ef905a --- /dev/null +++ b/data/manifests/v3_smoke/os_open_names_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_names", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_names", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_names.csv", + "sha256": 
"f4440114144336b2fbbd6d6955086820616b7a880c620941a79e74b83fb80499", + "size_bytes": 81, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_roads_manifest.json b/data/manifests/v3_smoke/os_open_roads_manifest.json new file mode 100644 index 0000000..44a6aae --- /dev/null +++ b/data/manifests/v3_smoke/os_open_roads_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_roads", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_roads", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_roads.csv", + "sha256": "449935b3e8d6b4b0809c16bd2fb0630acd696a52e0745b902f81b465a6e4b39f", + "size_bytes": 95, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_uprn_manifest.json b/data/manifests/v3_smoke/os_open_uprn_manifest.json new file mode 100644 index 0000000..8fb7d90 --- /dev/null +++ b/data/manifests/v3_smoke/os_open_uprn_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_uprn", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_uprn", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_uprn.csv", + "sha256": "430ef8e55d638274af125d86b7d1a0502b5f67ce19ee143aa502ca856751f7fb", + "size_bytes": 30, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/data/manifests/v3_smoke/os_open_usrn_manifest.json b/data/manifests/v3_smoke/os_open_usrn_manifest.json new file mode 100644 index 
0000000..a1d8941 --- /dev/null +++ b/data/manifests/v3_smoke/os_open_usrn_manifest.json @@ -0,0 +1,18 @@ +{ + "source_name": "os_open_usrn", + "source_version": "v3_smoke_2026_02_20", + "retrieved_at_utc": "2026-02-20T22:19:05Z", + "source_url": "local://v3_smoke/os_open_usrn", + "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", + "notes": "Synthetic smoke dataset for V3 ingest/build validation", + "files": [ + { + "file_role": "primary", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_usrn.csv", + "sha256": "2c36d2763845f5197e378ea28586cd2f22d65c795847c7a2c543899ba2eb278c", + "size_bytes": 84, + "format": "csv", + "row_count_expected": 1 + } + ] +} diff --git a/pipeline/config/frequency_weights.yaml b/pipeline/config/frequency_weights.yaml new file mode 100644 index 0000000..7fc5c63 --- /dev/null +++ b/pipeline/config/frequency_weights.yaml @@ -0,0 +1,12 @@ +{ + "weights": { + "names_postcode_feature": 0.6, + "oli_toid_usrn": 0.9, + "uprn_usrn": 1.0, + "spatial_os_open_roads": 0.3, + "osni_gazetteer_direct": 0.6, + "spatial_dfi_highway": 0.3, + "ppd_parse_matched": 0.4, + "ppd_parse_unmatched": 0.2 + } +} diff --git a/pipeline/config/normalisation.yaml b/pipeline/config/normalisation.yaml new file mode 100644 index 0000000..554b45e --- /dev/null +++ b/pipeline/config/normalisation.yaml @@ -0,0 +1,15 @@ +{ + "alias_map": { + "ST": "STREET", + "RD": "ROAD", + "AVE": "AVENUE", + "DR": "DRIVE", + "LN": "LANE", + "CL": "CLOSE", + "PL": "PLACE", + "CT": "COURT", + "SQ": "SQUARE", + "GDNS": "GARDENS" + }, + "strip_punctuation": ".,'-" +} diff --git a/pipeline/config/source_schema.yaml b/pipeline/config/source_schema.yaml new file mode 100644 index 0000000..b0477e0 --- /dev/null +++ b/pipeline/config/source_schema.yaml @@ -0,0 +1,107 @@ +{ + "sources": { + "onspd": { + "required_fields": [ + "postcode", + "status", + "lat", + "lon", + "easting", + "northing", + "country_iso2", + "country_iso3", + "subdivision_code", + 
"post_town", + "locality" + ], + "field_map": { + "postcode": "postcode", + "status": "status", + "lat": "lat", + "lon": "lon", + "easting": "easting", + "northing": "northing", + "country_iso2": "country_iso2", + "country_iso3": "country_iso3", + "subdivision_code": "subdivision_code", + "post_town": "post_town", + "locality": "locality" + } + }, + "os_open_usrn": { + "required_fields": ["usrn", "street_name"], + "field_map": { + "usrn": "usrn", + "street_name": "street_name", + "street_class": "street_class", + "street_status": "street_status" + } + }, + "os_open_names": { + "required_fields": ["feature_id", "street_name", "postcode"], + "field_map": { + "feature_id": "feature_id", + "toid": "toid", + "street_name": "street_name", + "postcode": "postcode" + } + }, + "os_open_roads": { + "required_fields": ["segment_id", "road_name"], + "field_map": { + "segment_id": "segment_id", + "road_id": "road_id", + "usrn": "usrn", + "postcode": "postcode", + "road_name": "road_name" + } + }, + "os_open_uprn": { + "required_fields": ["uprn"], + "field_map": { + "uprn": "uprn", + "postcode": "postcode" + } + }, + "os_open_linked_identifiers": { + "required_fields": ["relation_type", "left_id", "right_id"], + "field_map": { + "relation_type": "relation_type", + "left_id": "left_id", + "right_id": "right_id" + } + }, + "nsul": { + "required_fields": ["uprn", "postcode"], + "field_map": { + "uprn": "uprn", + "postcode": "postcode" + } + }, + "osni_gazetteer": { + "required_fields": ["feature_id", "street_name"], + "field_map": { + "feature_id": "feature_id", + "postcode": "postcode", + "street_name": "street_name" + } + }, + "dfi_highway": { + "required_fields": ["segment_id", "street_name"], + "field_map": { + "segment_id": "segment_id", + "postcode": "postcode", + "street_name": "street_name" + } + }, + "ppd": { + "required_fields": ["row_hash", "postcode", "street", "house_number"], + "field_map": { + "row_hash": "row_hash", + "postcode": "postcode", + "street": "street", + 
"house_number": "house_number" + } + } + } +} diff --git a/pipeline/pyproject.toml b/pipeline/pyproject.toml new file mode 100644 index 0000000..e548ffe --- /dev/null +++ b/pipeline/pyproject.toml @@ -0,0 +1,21 @@ +[project] +name = "postcodes-pipeline" +version = "0.1.0" +description = "Open-data import and transformation pipeline (Phase 1 + Phase 2 Open Names)" +requires-python = ">=3.11" +dependencies = [ + "psycopg[binary]>=3.2,<4", +] + +[project.scripts] +pipeline = "pipeline.cli:main" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] + +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/pipeline/sql/migrations/0001_phase1_foundation.sql b/pipeline/sql/migrations/0001_phase1_foundation.sql new file mode 100644 index 0000000..5fc3db5 --- /dev/null +++ b/pipeline/sql/migrations/0001_phase1_foundation.sql @@ -0,0 +1,142 @@ +BEGIN; + +CREATE EXTENSION IF NOT EXISTS postgis; + +CREATE SCHEMA IF NOT EXISTS meta; +CREATE SCHEMA IF NOT EXISTS raw; +CREATE SCHEMA IF NOT EXISTS stage; +CREATE SCHEMA IF NOT EXISTS core; +CREATE SCHEMA IF NOT EXISTS derived; + +CREATE TABLE IF NOT EXISTS meta.dataset_release ( + dataset_key text NOT NULL, + release_id text NOT NULL, + source_url text NOT NULL, + licence text NOT NULL, + file_path text NOT NULL, + expected_sha256 text NOT NULL, + actual_sha256 text NOT NULL, + retrieved_at timestamptz NOT NULL, + manifest_json jsonb NOT NULL DEFAULT '{}'::jsonb, + source_row_count bigint, + loaded_row_count bigint, + source_feature_count bigint, + loaded_feature_count bigint, + source_layer_name text, + srid_confirmed integer, + PRIMARY KEY (dataset_key, release_id), + CHECK (dataset_key IN ('onsud', 'open_uprn', 'open_roads')), + CHECK (expected_sha256 ~ '^[0-9a-fA-F]{64}$'), + CHECK (actual_sha256 ~ '^[0-9a-fA-F]{64}$') +); + +CREATE TABLE IF NOT EXISTS meta.pipeline_run ( + run_id uuid PRIMARY KEY, + release_set_id uuid, + 
started_at timestamptz NOT NULL, + finished_at timestamptz, + status text NOT NULL, + stage text NOT NULL, + error_text text, + CHECK (status IN ('started', 'built', 'active', 'failed')) +); + +CREATE TABLE IF NOT EXISTS meta.release_set ( + release_set_id uuid PRIMARY KEY, + onsud_release_id text NOT NULL, + open_uprn_release_id text NOT NULL, + open_roads_release_id text NOT NULL, + physical_schema text NOT NULL, + status text NOT NULL, + created_at timestamptz NOT NULL, + built_at timestamptz, + activated_at timestamptz, + CONSTRAINT uq_release_set_inputs + UNIQUE (onsud_release_id, open_uprn_release_id, open_roads_release_id), + CHECK (status IN ('created', 'built', 'active', 'inactive')) +); + +CREATE TABLE IF NOT EXISTS meta.release_activation_log ( + activation_id bigserial PRIMARY KEY, + previous_release_set_id uuid, + release_set_id uuid NOT NULL, + actor text NOT NULL, + activated_at timestamptz NOT NULL, + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) +); + +CREATE TABLE IF NOT EXISTS meta.dataset_metrics ( + run_id uuid NOT NULL, + release_set_id uuid NOT NULL, + metric_key text NOT NULL, + metric_value numeric NOT NULL, + metric_unit text NOT NULL, + computed_at timestamptz NOT NULL, + PRIMARY KEY (run_id, metric_key), + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) +); + +CREATE TABLE IF NOT EXISTS meta.canonical_hash ( + release_set_id uuid NOT NULL, + object_name text NOT NULL, + projection jsonb NOT NULL, + row_count bigint NOT NULL, + sha256 text NOT NULL, + computed_at timestamptz NOT NULL, + run_id uuid NOT NULL, + PRIMARY KEY (release_set_id, object_name, run_id), + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id), + CHECK (sha256 ~ '^[0-9a-fA-F]{64}$') +); + +CREATE TABLE IF NOT EXISTS raw.onsud_row ( + id bigserial PRIMARY KEY, + dataset_key text NOT NULL, + release_id text NOT NULL, + source_row_num bigint NOT NULL, + uprn bigint, + postcode text, + 
extras_jsonb jsonb, + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +CREATE TABLE IF NOT EXISTS raw.open_uprn_row ( + id bigserial PRIMARY KEY, + dataset_key text NOT NULL, + release_id text NOT NULL, + source_row_num bigint NOT NULL, + uprn bigint, + latitude double precision, + longitude double precision, + easting double precision, + northing double precision, + extras_jsonb jsonb, + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +-- Ingest must normalize incoming LineString geometries to MultiLineString. +CREATE TABLE IF NOT EXISTS stage.open_roads_segment ( + dataset_key text NOT NULL, + release_id text NOT NULL, + segment_id bigint NOT NULL, + name_display text, + name_norm text, + geom_bng geometry(MultiLineString,27700) NOT NULL, + CHECK (dataset_key = 'open_roads'), + UNIQUE (release_id, segment_id), + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +CREATE INDEX IF NOT EXISTS idx_stage_open_roads_segment_release_id + ON stage.open_roads_segment (release_id); + +CREATE INDEX IF NOT EXISTS idx_stage_open_roads_segment_geom_bng + ON stage.open_roads_segment USING gist (geom_bng); + +COMMIT; diff --git a/pipeline/sql/migrations/0002_phase1_resume_checkpoints.sql b/pipeline/sql/migrations/0002_phase1_resume_checkpoints.sql new file mode 100644 index 0000000..5bd16c9 --- /dev/null +++ b/pipeline/sql/migrations/0002_phase1_resume_checkpoints.sql @@ -0,0 +1,25 @@ +BEGIN; + +CREATE TABLE IF NOT EXISTS meta.release_set_stage_checkpoint ( + release_set_id uuid NOT NULL, + stage_name text NOT NULL, + run_id uuid NOT NULL, + completed_at timestamptz NOT NULL, + PRIMARY KEY (release_set_id, stage_name), + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) + ON DELETE CASCADE, + CHECK (stage_name IN ( + 'release_tables_created', + 'core_built', + 'derived_built', + 
'metrics_stored', + 'canonical_hashes_stored', + 'release_marked_built' + )) +); + +CREATE INDEX IF NOT EXISTS idx_release_set_stage_checkpoint_release_set + ON meta.release_set_stage_checkpoint (release_set_id, completed_at DESC); + +COMMIT; diff --git a/pipeline/sql/migrations/0003_phase1_table_level_checkpoints.sql b/pipeline/sql/migrations/0003_phase1_table_level_checkpoints.sql new file mode 100644 index 0000000..c36fbf5 --- /dev/null +++ b/pipeline/sql/migrations/0003_phase1_table_level_checkpoints.sql @@ -0,0 +1,23 @@ +BEGIN; + +ALTER TABLE meta.release_set_stage_checkpoint + DROP CONSTRAINT IF EXISTS release_set_stage_checkpoint_stage_name_check; + +ALTER TABLE meta.release_set_stage_checkpoint + ADD CONSTRAINT release_set_stage_checkpoint_stage_name_check + CHECK ( + stage_name IN ( + 'release_tables_created', + 'core_uprn_postcode_built', + 'core_uprn_point_built', + 'core_road_segment_built', + 'derived_uprn_street_spatial_built', + 'metrics_stored', + 'canonical_hashes_stored', + 'release_marked_built', + 'core_built', + 'derived_built' + ) + ); + +COMMIT; diff --git a/pipeline/sql/migrations/0004_phase2_open_names_foundation.sql b/pipeline/sql/migrations/0004_phase2_open_names_foundation.sql new file mode 100644 index 0000000..b5dffc5 --- /dev/null +++ b/pipeline/sql/migrations/0004_phase2_open_names_foundation.sql @@ -0,0 +1,136 @@ +BEGIN; + +-- Phase boundary cutover: purge prior release-set lifecycle state and rs_* schemas. 
+DO $$ +DECLARE + schema_row record; +BEGIN + FOR schema_row IN + SELECT nspname + FROM pg_namespace + WHERE nspname LIKE 'rs_%' + LOOP + EXECUTE format('DROP SCHEMA IF EXISTS %I CASCADE', schema_row.nspname); + END LOOP; +END +$$; + +TRUNCATE TABLE meta.release_activation_log RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.release_set_stage_checkpoint RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.canonical_hash RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.dataset_metrics RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.pipeline_run RESTART IDENTITY CASCADE; +TRUNCATE TABLE meta.release_set RESTART IDENTITY CASCADE; + +ALTER TABLE meta.dataset_release + DROP CONSTRAINT IF EXISTS dataset_release_dataset_key_check; +ALTER TABLE meta.dataset_release + DROP CONSTRAINT IF EXISTS ck_dataset_release_dataset_key; +ALTER TABLE meta.dataset_release + ADD CONSTRAINT ck_dataset_release_dataset_key + CHECK (dataset_key IN ('onsud', 'open_uprn', 'open_roads', 'open_names')); + +ALTER TABLE meta.release_set + ADD COLUMN IF NOT EXISTS open_names_release_id text; +ALTER TABLE meta.release_set + ALTER COLUMN open_names_release_id SET NOT NULL; + +ALTER TABLE meta.release_set + DROP CONSTRAINT IF EXISTS uq_release_set_inputs; +ALTER TABLE meta.release_set + DROP CONSTRAINT IF EXISTS uq_release_set_inputs_phase2; +ALTER TABLE meta.release_set + ADD CONSTRAINT uq_release_set_inputs_phase2 + UNIQUE ( + onsud_release_id, + open_uprn_release_id, + open_roads_release_id, + open_names_release_id + ); + +ALTER TABLE meta.release_set_stage_checkpoint + DROP CONSTRAINT IF EXISTS release_set_stage_checkpoint_stage_name_check; +ALTER TABLE meta.release_set_stage_checkpoint + ADD CONSTRAINT release_set_stage_checkpoint_stage_name_check + CHECK ( + stage_name IN ( + 'release_tables_created', + 'core_uprn_postcode_built', + 'core_uprn_point_built', + 'core_road_segment_built', + 'core_open_names_entry_built', + 'core_postcode_unit_seed_built', + 'derived_uprn_street_spatial_built', + 
'derived_postcode_street_built', + 'metrics_stored', + 'warnings_stored', + 'canonical_hashes_stored', + 'release_marked_built', + 'core_built', + 'derived_built' + ) + ); + +CREATE TABLE IF NOT EXISTS meta.pipeline_run_warning ( + warning_id bigserial PRIMARY KEY, + run_id uuid NOT NULL, + release_set_id uuid NOT NULL, + warning_code text NOT NULL, + metric_key text NOT NULL, + metric_value numeric NOT NULL, + threshold_value numeric NOT NULL, + requires_ack boolean NOT NULL, + acknowledged_by text, + acknowledged_at timestamptz, + created_at timestamptz NOT NULL DEFAULT now(), + UNIQUE (run_id, warning_code), + FOREIGN KEY (run_id) + REFERENCES meta.pipeline_run (run_id) + ON DELETE CASCADE, + FOREIGN KEY (release_set_id) + REFERENCES meta.release_set (release_set_id) + ON DELETE CASCADE +); + +CREATE INDEX IF NOT EXISTS idx_pipeline_run_warning_release_set + ON meta.pipeline_run_warning (release_set_id, requires_ack, acknowledged_at); + +ALTER TABLE raw.onsud_row + ADD COLUMN IF NOT EXISTS postcode_unit_easting double precision; +ALTER TABLE raw.onsud_row + ADD COLUMN IF NOT EXISTS postcode_unit_northing double precision; + +CREATE TABLE IF NOT EXISTS raw.open_names_row ( + id bigserial PRIMARY KEY, + dataset_key text NOT NULL, + release_id text NOT NULL, + source_row_num bigint NOT NULL, + entry_id text NOT NULL, + name1_display text, + name1_lang text, + name1_norm text, + name2_display text, + name2_norm text, + local_type text NOT NULL, + postcode_district_norm text, + easting double precision, + northing double precision, + geom_bng geometry(Point,27700) NOT NULL, + extras_jsonb jsonb, + CHECK (dataset_key = 'open_names'), + CHECK (local_type IN ('Road', 'Named Road', 'Street')), + UNIQUE (release_id, entry_id), + FOREIGN KEY (dataset_key, release_id) + REFERENCES meta.dataset_release (dataset_key, release_id) +); + +CREATE INDEX IF NOT EXISTS idx_raw_open_names_row_release_id + ON raw.open_names_row (release_id); + +CREATE INDEX IF NOT EXISTS 
idx_raw_open_names_row_release_district + ON raw.open_names_row (release_id, postcode_district_norm); + +CREATE INDEX IF NOT EXISTS idx_raw_open_names_row_geom_bng + ON raw.open_names_row USING gist (geom_bng); + +COMMIT; diff --git a/pipeline/sql/migrations/0005_v3_cutover_foundation.sql b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql new file mode 100644 index 0000000..c5bcc35 --- /dev/null +++ b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql @@ -0,0 +1,545 @@ +BEGIN; + +CREATE SCHEMA IF NOT EXISTS meta; +CREATE SCHEMA IF NOT EXISTS raw; +CREATE SCHEMA IF NOT EXISTS stage; +CREATE SCHEMA IF NOT EXISTS core; +CREATE SCHEMA IF NOT EXISTS derived; +CREATE SCHEMA IF NOT EXISTS internal; +CREATE SCHEMA IF NOT EXISTS api; + +-- Hard cutover: remove legacy release-run lifecycle objects. +DROP TABLE IF EXISTS meta.release_activation_log CASCADE; +DROP TABLE IF EXISTS meta.release_set_stage_checkpoint CASCADE; +DROP TABLE IF EXISTS meta.pipeline_run_warning CASCADE; +DROP TABLE IF EXISTS meta.dataset_metrics CASCADE; +DROP TABLE IF EXISTS meta.canonical_hash CASCADE; +DROP TABLE IF EXISTS meta.pipeline_run CASCADE; +DROP TABLE IF EXISTS meta.release_set CASCADE; +DROP TABLE IF EXISTS meta.dataset_release CASCADE; + +DROP VIEW IF EXISTS core.uprn_postcode CASCADE; +DROP VIEW IF EXISTS core.uprn_point CASCADE; +DROP VIEW IF EXISTS core.road_segment CASCADE; +DROP VIEW IF EXISTS core.open_names_entry CASCADE; +DROP VIEW IF EXISTS core.postcode_unit_seed CASCADE; +DROP VIEW IF EXISTS derived.uprn_street_spatial CASCADE; +DROP VIEW IF EXISTS derived.postcode_street CASCADE; + +CREATE TABLE IF NOT EXISTS meta.ingest_run ( + run_id uuid PRIMARY KEY, + source_name text NOT NULL, + source_version text NOT NULL, + retrieved_at_utc timestamptz NOT NULL, + source_url text, + processing_git_sha char(40) NOT NULL, + record_count bigint, + notes text, + file_set_sha256 char(64) NOT NULL, + created_at_utc timestamptz NOT NULL DEFAULT now(), + CHECK (source_name IN ( + 
'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_linked_identifiers', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )), + CHECK (processing_git_sha ~ '^[0-9a-f]{40}$'), + CHECK (file_set_sha256 ~ '^[0-9a-fA-F]{64}$'), + UNIQUE (source_name, source_version, file_set_sha256) +); + +CREATE TABLE IF NOT EXISTS meta.ingest_run_file ( + file_id bigserial PRIMARY KEY, + ingest_run_id uuid NOT NULL, + file_role text NOT NULL, + filename text NOT NULL, + layer_name text NOT NULL DEFAULT '', + sha256 char(64) NOT NULL, + size_bytes bigint NOT NULL, + row_count bigint, + format text NOT NULL, + FOREIGN KEY (ingest_run_id) + REFERENCES meta.ingest_run (run_id) + ON DELETE CASCADE, + CHECK (sha256 ~ '^[0-9a-fA-F]{64}$'), + CHECK (size_bytes >= 0), + UNIQUE (ingest_run_id, file_role, filename, layer_name) +); + +CREATE TABLE IF NOT EXISTS meta.build_bundle ( + bundle_id uuid PRIMARY KEY, + build_profile text NOT NULL, + bundle_hash char(64) NOT NULL, + status text NOT NULL, + created_at_utc timestamptz NOT NULL DEFAULT now(), + CHECK (build_profile IN ('gb_core', 'gb_core_ppd', 'core_ni')), + CHECK (bundle_hash ~ '^[0-9a-fA-F]{64}$'), + CHECK (status IN ('created', 'built', 'failed', 'published')) +); + +CREATE TABLE IF NOT EXISTS meta.build_bundle_source ( + bundle_id uuid NOT NULL, + source_name text NOT NULL, + ingest_run_id uuid NOT NULL, + PRIMARY KEY (bundle_id, source_name, ingest_run_id), + FOREIGN KEY (bundle_id) + REFERENCES meta.build_bundle (bundle_id) + ON DELETE CASCADE, + FOREIGN KEY (ingest_run_id) + REFERENCES meta.ingest_run (run_id), + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_linked_identifiers', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )) +); + +CREATE INDEX IF NOT EXISTS idx_build_bundle_source_bundle_source + ON meta.build_bundle_source (bundle_id, source_name); + +CREATE TABLE IF NOT EXISTS meta.build_run ( 
+ build_run_id uuid PRIMARY KEY, + bundle_id uuid NOT NULL, + dataset_version text NOT NULL, + status text NOT NULL, + current_pass text NOT NULL, + error_text text, + started_at_utc timestamptz NOT NULL DEFAULT now(), + finished_at_utc timestamptz, + FOREIGN KEY (bundle_id) + REFERENCES meta.build_bundle (bundle_id) + ON DELETE CASCADE, + CHECK (status IN ('started', 'built', 'failed', 'published')) +); + +CREATE TABLE IF NOT EXISTS meta.build_pass_checkpoint ( + build_run_id uuid NOT NULL, + pass_name text NOT NULL, + completed_at_utc timestamptz NOT NULL DEFAULT now(), + row_count_summary_json jsonb NOT NULL DEFAULT '{}'::jsonb, + PRIMARY KEY (build_run_id, pass_name), + FOREIGN KEY (build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE, + CHECK (pass_name IN ( + '0a_raw_ingest', + '0b_stage_normalisation', + '1_onspd_backbone', + '2_gb_canonical_streets', + '3_open_names_candidates', + '4_uprn_reinforcement', + '5_gb_spatial_fallback', + '6_ni_candidates', + '7_ppd_gap_fill', + '8_finalisation' + )) +); + +CREATE TABLE IF NOT EXISTS meta.canonical_hash ( + build_run_id uuid NOT NULL, + object_name text NOT NULL, + projection jsonb NOT NULL, + row_count bigint NOT NULL, + sha256 char(64) NOT NULL, + computed_at_utc timestamptz NOT NULL DEFAULT now(), + PRIMARY KEY (build_run_id, object_name), + FOREIGN KEY (build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE, + CHECK (sha256 ~ '^[0-9a-fA-F]{64}$') +); + +CREATE TABLE IF NOT EXISTS meta.dataset_publication ( + dataset_version text PRIMARY KEY, + build_run_id uuid NOT NULL UNIQUE, + published_at_utc timestamptz NOT NULL DEFAULT now(), + published_by text NOT NULL, + lookup_table_name text NOT NULL, + street_lookup_table_name text NOT NULL, + publish_txid bigint NOT NULL, + FOREIGN KEY (build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS raw.onspd_row ( + id bigserial PRIMARY KEY, + ingest_run_id uuid NOT NULL 
REFERENCES meta.ingest_run (run_id) ON DELETE CASCADE, + source_row_num bigint NOT NULL, + payload_jsonb jsonb NOT NULL +); +CREATE TABLE IF NOT EXISTS raw.os_open_usrn_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.os_open_names_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.os_open_roads_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.os_open_uprn_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.os_open_linked_identifiers_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.nsul_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.osni_gazetteer_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.dfi_highway_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.ppd_row (LIKE raw.onspd_row INCLUDING ALL); -- NOTE(review): LIKE ... INCLUDING ALL copies the template's id DEFAULT, so every raw.*_row table draws ids from raw.onspd_row's sequence (ids stay unique but all tables are coupled to the template's sequence) -- confirm intended, or give each table its own identity column. + +CREATE INDEX IF NOT EXISTS idx_raw_onspd_run_id ON raw.onspd_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_usrn_run_id ON raw.os_open_usrn_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_names_run_id ON raw.os_open_names_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_roads_run_id ON raw.os_open_roads_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_uprn_run_id ON raw.os_open_uprn_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_oli_run_id ON raw.os_open_linked_identifiers_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_nsul_run_id ON raw.nsul_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_osni_run_id ON raw.osni_gazetteer_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_dfi_run_id ON raw.dfi_highway_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_ppd_run_id ON raw.ppd_row (ingest_run_id); + +CREATE TABLE IF NOT EXISTS stage.onspd_postcode ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + postcode_norm text NOT NULL, + postcode_display 
text NOT NULL, + status text NOT NULL, + lat numeric(9,6), + lon numeric(9,6), + easting integer, + northing integer, + country_iso2 char(2) NOT NULL, + country_iso3 char(3) NOT NULL, + subdivision_code text, + post_town text, + locality text, + street_enrichment_available boolean NOT NULL, + onspd_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, postcode_norm) +); + +CREATE TABLE IF NOT EXISTS stage.streets_usrn_input ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + usrn bigint NOT NULL, + street_name text NOT NULL, + street_name_casefolded text NOT NULL, + street_class text, + street_status text, + usrn_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, usrn) +); + +CREATE TABLE IF NOT EXISTS stage.open_names_road_feature ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + feature_id text NOT NULL, + toid text, + postcode_norm text, + street_name_raw text NOT NULL, + street_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, feature_id) +); + +CREATE TABLE IF NOT EXISTS stage.open_roads_segment ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + segment_id text NOT NULL, + road_id text, + postcode_norm text, + usrn bigint, + road_name text NOT NULL, + road_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, segment_id) +); + +CREATE TABLE IF NOT EXISTS stage.uprn_point ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + postcode_norm text, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn) +); + +CREATE TABLE IF NOT EXISTS stage.oli_toid_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run 
(build_run_id) ON DELETE CASCADE, + toid text NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, toid, usrn) +); + +CREATE TABLE IF NOT EXISTS stage.oli_uprn_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn, usrn) +); + +CREATE TABLE IF NOT EXISTS stage.nsul_uprn_postcode ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + postcode_norm text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn, postcode_norm) +); + +CREATE TABLE IF NOT EXISTS stage.osni_street_point ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + feature_id text NOT NULL, + postcode_norm text, + street_name_raw text NOT NULL, + street_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, feature_id) +); + +CREATE TABLE IF NOT EXISTS stage.dfi_road_segment ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + segment_id text NOT NULL, + postcode_norm text, + street_name_raw text NOT NULL, + street_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, segment_id) +); + +CREATE TABLE IF NOT EXISTS stage.ppd_parsed_address ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + row_hash text NOT NULL, + postcode_norm text NOT NULL, + house_number text, + street_token_raw text NOT NULL, + street_token_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, row_hash) +); + +CREATE TABLE 
IF NOT EXISTS core.postcodes ( + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + postcode text NOT NULL, + status text NOT NULL, + lat numeric(9,6), + lon numeric(9,6), + easting integer, + northing integer, + country_iso2 char(2) NOT NULL, + country_iso3 char(3) NOT NULL, + subdivision_code text, + post_town text, + locality text, + street_enrichment_available boolean NOT NULL, + multi_street boolean NOT NULL DEFAULT false, + onspd_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (produced_build_run_id, postcode) +); + +CREATE TABLE IF NOT EXISTS core.postcodes_meta ( + produced_build_run_id uuid NOT NULL, + postcode text NOT NULL, + meta_jsonb jsonb NOT NULL DEFAULT '{}'::jsonb, + onspd_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (produced_build_run_id, postcode), + FOREIGN KEY (produced_build_run_id, postcode) + REFERENCES core.postcodes (produced_build_run_id, postcode) + ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS core.streets_usrn ( + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + usrn bigint NOT NULL, + street_name text NOT NULL, + street_name_casefolded text NOT NULL, + street_class text, + street_status text, + usrn_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (produced_build_run_id, usrn) +); + +CREATE TABLE IF NOT EXISTS derived.postcode_street_candidates ( + candidate_id bigserial PRIMARY KEY, + produced_build_run_id uuid NOT NULL, + postcode text NOT NULL, + street_name_raw text NOT NULL, + street_name_canonical text NOT NULL, + usrn bigint, + candidate_type text NOT NULL, + confidence text NOT NULL, + evidence_ref text NOT NULL, + source_name text NOT NULL, + ingest_run_id uuid NOT NULL, + evidence_json jsonb, + created_at_utc timestamptz NOT NULL DEFAULT now(), + FOREIGN KEY (produced_build_run_id) + REFERENCES meta.build_run (build_run_id) + ON DELETE CASCADE, + 
FOREIGN KEY (produced_build_run_id, postcode) + REFERENCES core.postcodes (produced_build_run_id, postcode) + ON DELETE CASCADE, + FOREIGN KEY (produced_build_run_id, usrn) + REFERENCES core.streets_usrn (produced_build_run_id, usrn) + ON DELETE SET NULL, -- NOTE(review): on a composite FK, SET NULL nulls BOTH referencing columns; produced_build_run_id is NOT NULL, so a direct DELETE on core.streets_usrn would raise a not-null violation. Use ON DELETE SET NULL (usrn) (PostgreSQL 15+) or drop the action and rely on the build_run cascade. + FOREIGN KEY (ingest_run_id) + REFERENCES meta.ingest_run (run_id), + CHECK (candidate_type IN ( + 'names_postcode_feature', + 'oli_toid_usrn', + 'uprn_usrn', + 'spatial_os_open_roads', + 'osni_gazetteer_direct', + 'spatial_dfi_highway', + 'ppd_parse_matched', + 'ppd_parse_unmatched' + )), + CHECK (confidence IN ('high', 'medium', 'low', 'none')), + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_linked_identifiers', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )) +); + +CREATE INDEX IF NOT EXISTS idx_candidate_run_postcode + ON derived.postcode_street_candidates (produced_build_run_id, postcode); +CREATE INDEX IF NOT EXISTS idx_candidate_run_usrn + ON derived.postcode_street_candidates (produced_build_run_id, usrn); + +CREATE TABLE IF NOT EXISTS derived.postcode_street_candidate_lineage ( + parent_candidate_id bigint NOT NULL REFERENCES derived.postcode_street_candidates (candidate_id) ON DELETE CASCADE, + child_candidate_id bigint NOT NULL REFERENCES derived.postcode_street_candidates (candidate_id) ON DELETE CASCADE, + relation_type text NOT NULL, + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + PRIMARY KEY (parent_candidate_id, child_candidate_id, relation_type) +); + +CREATE TABLE IF NOT EXISTS derived.postcode_streets_final ( + final_id bigserial PRIMARY KEY, + produced_build_run_id uuid NOT NULL, + postcode text NOT NULL, + street_name text NOT NULL, + usrn bigint, + confidence text NOT NULL, + frequency_score numeric(10,4) NOT NULL, + probability numeric(6,4) NOT NULL, + created_at_utc timestamptz NOT NULL DEFAULT now(), + FOREIGN KEY (produced_build_run_id) + REFERENCES 
meta.build_run (build_run_id) + ON DELETE CASCADE, + FOREIGN KEY (produced_build_run_id, postcode) + REFERENCES core.postcodes (produced_build_run_id, postcode) + ON DELETE CASCADE, + FOREIGN KEY (produced_build_run_id, usrn) + REFERENCES core.streets_usrn (produced_build_run_id, usrn) + ON DELETE SET NULL, + CHECK (confidence IN ('high', 'medium', 'low', 'none')), + CHECK (probability >= 0 AND probability <= 1), + UNIQUE (produced_build_run_id, postcode, street_name) +); + +CREATE TABLE IF NOT EXISTS derived.postcode_streets_final_candidate ( + final_id bigint NOT NULL REFERENCES derived.postcode_streets_final (final_id) ON DELETE CASCADE, + candidate_id bigint NOT NULL REFERENCES derived.postcode_street_candidates (candidate_id) ON DELETE CASCADE, + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + link_rank integer NOT NULL, + PRIMARY KEY (final_id, candidate_id) +); + +CREATE TABLE IF NOT EXISTS derived.postcode_streets_final_source ( + final_id bigint NOT NULL REFERENCES derived.postcode_streets_final (final_id) ON DELETE CASCADE, + source_name text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + candidate_type text NOT NULL, + contribution_weight numeric(10,4) NOT NULL, + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + PRIMARY KEY (final_id, source_name, ingest_run_id, candidate_type), + CHECK (candidate_type IN ( + 'names_postcode_feature', + 'oli_toid_usrn', + 'uprn_usrn', + 'spatial_os_open_roads', + 'osni_gazetteer_direct', + 'spatial_dfi_highway', + 'ppd_parse_matched', + 'ppd_parse_unmatched' + )) +); + +CREATE TABLE IF NOT EXISTS internal.unit_index ( + index_id bigserial PRIMARY KEY, + produced_build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + postcode text NOT NULL, + house_number text NOT NULL, + street_name text NOT NULL, + usrn bigint, + confidence text NOT NULL, + source_type text 
NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + created_at_utc timestamptz NOT NULL DEFAULT now(), + CHECK (confidence IN ('high', 'medium', 'low', 'none')) +); + +CREATE INDEX IF NOT EXISTS idx_unit_index_lookup + ON internal.unit_index (produced_build_run_id, postcode, house_number); + +CREATE OR REPLACE FUNCTION derived.reject_candidate_mutation() +RETURNS trigger +LANGUAGE plpgsql +AS $$ +BEGIN + RAISE EXCEPTION 'derived.postcode_street_candidates is append-only; % is not allowed', TG_OP; +END; +$$; + +DROP TRIGGER IF EXISTS trg_candidate_no_update ON derived.postcode_street_candidates; +CREATE TRIGGER trg_candidate_no_update +BEFORE UPDATE ON derived.postcode_street_candidates +FOR EACH ROW EXECUTE FUNCTION derived.reject_candidate_mutation(); + +DROP TRIGGER IF EXISTS trg_candidate_no_delete ON derived.postcode_street_candidates; +CREATE TRIGGER trg_candidate_no_delete +BEFORE DELETE ON derived.postcode_street_candidates +FOR EACH ROW EXECUTE FUNCTION derived.reject_candidate_mutation(); + +DO $$ +BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'pipeline_writer') THEN + CREATE ROLE pipeline_writer; + END IF; + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'api_reader') THEN + CREATE ROLE api_reader; + END IF; + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'audit_reader') THEN + CREATE ROLE audit_reader; + END IF; +END +$$; + +REVOKE ALL ON SCHEMA internal FROM PUBLIC; +REVOKE ALL ON ALL TABLES IN SCHEMA internal FROM PUBLIC; + +GRANT USAGE ON SCHEMA api TO api_reader; +GRANT SELECT ON ALL TABLES IN SCHEMA api TO api_reader; + +GRANT USAGE ON SCHEMA meta TO audit_reader; +GRANT SELECT ON meta.ingest_run, meta.ingest_run_file, meta.build_bundle, meta.build_bundle_source, + meta.build_run, meta.build_pass_checkpoint, meta.canonical_hash, meta.dataset_publication + TO audit_reader; +GRANT USAGE ON SCHEMA derived TO audit_reader; +GRANT SELECT ON derived.postcode_street_candidates, + 
derived.postcode_street_candidate_lineage, + derived.postcode_streets_final, + derived.postcode_streets_final_candidate, + derived.postcode_streets_final_source + TO audit_reader; + +COMMIT; diff --git a/pipeline/sql/migrations/0006_v3_open_roads_stage_compat.sql b/pipeline/sql/migrations/0006_v3_open_roads_stage_compat.sql new file mode 100644 index 0000000..6cea5e7 --- /dev/null +++ b/pipeline/sql/migrations/0006_v3_open_roads_stage_compat.sql @@ -0,0 +1,29 @@ +BEGIN; + +DO $$ +BEGIN + IF EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'stage' + AND table_name = 'open_roads_segment' + AND column_name = 'release_id' + ) THEN + DROP TABLE stage.open_roads_segment CASCADE; + END IF; +END +$$; + +CREATE TABLE IF NOT EXISTS stage.open_roads_segment ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + segment_id text NOT NULL, + road_id text, + postcode_norm text, + usrn bigint, + road_name text NOT NULL, + road_name_casefolded text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, segment_id) +); + +COMMIT; diff --git a/pipeline/src/pipeline/__init__.py b/pipeline/src/pipeline/__init__.py new file mode 100644 index 0000000..06154e5 --- /dev/null +++ b/pipeline/src/pipeline/__init__.py @@ -0,0 +1 @@ +"""Phase 1 pipeline package.""" diff --git a/pipeline/src/pipeline/build/__init__.py b/pipeline/src/pipeline/build/__init__.py new file mode 100644 index 0000000..90fb536 --- /dev/null +++ b/pipeline/src/pipeline/build/__init__.py @@ -0,0 +1 @@ +"""Build lifecycle workflows for release sets.""" diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py new file mode 100644 index 0000000..9385f9f --- /dev/null +++ b/pipeline/src/pipeline/build/workflows.py @@ -0,0 +1,2697 @@ +"""Build bundle, pass execution, verification, and publish workflows for Pipeline V3.""" + +from __future__ import annotations + +import 
hashlib +import json +import re +import uuid +from dataclasses import dataclass +from decimal import Decimal, ROUND_HALF_UP +from pathlib import Path +from typing import Any + +import psycopg +from psycopg import sql +from psycopg.types.json import Jsonb + +from pipeline.config import ( + frequency_weights_config_path, + source_schema_config_path, +) +from pipeline.manifest import BUILD_PROFILES, BuildBundleManifest +from pipeline.util.normalise import postcode_display, postcode_norm, street_casefold + + +class BuildError(RuntimeError): + """Raised for build lifecycle errors.""" + + +@dataclass(frozen=True) +class BuildBundleResult: + bundle_id: str + status: str + bundle_hash: str + + +@dataclass(frozen=True) +class BuildRunResult: + build_run_id: str + status: str + dataset_version: str + message: str + + +@dataclass(frozen=True) +class VerifyResult: + build_run_id: str + status: str + object_hashes: dict[str, str] + + +@dataclass(frozen=True) +class PublishResult: + build_run_id: str + dataset_version: str + status: str + + +PASS_ORDER = ( + "0a_raw_ingest", + "0b_stage_normalisation", + "1_onspd_backbone", + "2_gb_canonical_streets", + "3_open_names_candidates", + "4_uprn_reinforcement", + "5_gb_spatial_fallback", + "6_ni_candidates", + "7_ppd_gap_fill", + "8_finalisation", +) + +RAW_TABLE_BY_SOURCE = { + "onspd": "raw.onspd_row", + "os_open_usrn": "raw.os_open_usrn_row", + "os_open_names": "raw.os_open_names_row", + "os_open_roads": "raw.os_open_roads_row", + "os_open_uprn": "raw.os_open_uprn_row", + "os_open_linked_identifiers": "raw.os_open_linked_identifiers_row", + "nsul": "raw.nsul_row", + "osni_gazetteer": "raw.osni_gazetteer_row", + "dfi_highway": "raw.dfi_highway_row", + "ppd": "raw.ppd_row", +} + +CANDIDATE_TYPES = ( + "names_postcode_feature", + "oli_toid_usrn", + "uprn_usrn", + "spatial_os_open_roads", + "osni_gazetteer_direct", + "spatial_dfi_highway", + "ppd_parse_matched", + "ppd_parse_unmatched", +) + + +def _load_json_config(path: Path) -> 
dict[str, Any]: + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise BuildError(f"Invalid JSON config: {path}") from exc + if not isinstance(payload, dict): + raise BuildError(f"Config root must be object: {path}") + return payload + + +def _schema_config() -> dict[str, Any]: + return _load_json_config(source_schema_config_path()) + + +def _weight_config() -> dict[str, Decimal]: + payload = _load_json_config(frequency_weights_config_path()) + raw_weights = payload.get("weights") + if not isinstance(raw_weights, dict): + raise BuildError("frequency_weights config must contain object key 'weights'") + + parsed: dict[str, Decimal] = {} + for key, value in raw_weights.items(): + if not isinstance(key, str): + raise BuildError("frequency weight keys must be strings") + try: + weight = Decimal(str(value)) + except Exception as exc: # pragma: no cover + raise BuildError(f"Invalid frequency weight for {key}: {value}") from exc + parsed[key] = weight + + missing = sorted(set(CANDIDATE_TYPES) - set(parsed.keys())) + if missing: + raise BuildError( + "frequency_weights missing candidate types: " + ", ".join(missing) + ) + + for candidate_type, weight in parsed.items(): + if weight <= Decimal("0"): + raise BuildError( + f"frequency weight must be > 0 for candidate_type={candidate_type}; got {weight}" + ) + + unknown = sorted(set(parsed.keys()) - set(CANDIDATE_TYPES)) + if unknown: + raise BuildError( + "frequency_weights has unknown candidate types: " + ", ".join(unknown) + ) + + return {candidate_type: parsed[candidate_type] for candidate_type in CANDIDATE_TYPES} + + +def _bundle_hash(build_profile: str, source_runs: dict[str, tuple[str, ...]]) -> str: + normalized_source_runs = { + source_name: sorted(run_ids) + for source_name, run_ids in source_runs.items() + } + payload = { + "build_profile": build_profile, + "source_runs": { + key: normalized_source_runs[key] for key in sorted(normalized_source_runs.keys()) + }, + 
} + encoded = json.dumps(payload, separators=(",", ":"), ensure_ascii=True).encode("utf-8") + return hashlib.sha256(encoded).hexdigest() + + +def _dataset_version_from_bundle_hash(bundle_hash: str) -> str: + return f"v3_{bundle_hash[:12]}" + + +def _safe_version_suffix(dataset_version: str) -> str: + suffix = re.sub(r"[^A-Za-z0-9_]", "_", dataset_version) + return suffix or "v3" + + +def create_build_bundle(conn: psycopg.Connection, manifest: BuildBundleManifest) -> BuildBundleResult: + bundle_hash = _bundle_hash(manifest.build_profile, manifest.source_runs) + + with conn.cursor() as cur: + cur.execute( + """ + SELECT bundle_id + FROM meta.build_bundle + WHERE build_profile = %s + AND bundle_hash = %s + """, + (manifest.build_profile, bundle_hash), + ) + existing = cur.fetchone() + if existing is not None: + return BuildBundleResult( + bundle_id=str(existing[0]), + status="existing", + bundle_hash=bundle_hash, + ) + + required_sources = BUILD_PROFILES[manifest.build_profile] + missing = sorted(required_sources - set(manifest.source_runs.keys())) + if missing: + raise BuildError( + "Bundle manifest missing required sources: " + ", ".join(missing) + ) + + with conn.cursor() as cur: + for source_name in sorted(required_sources): + run_ids = manifest.source_runs[source_name] + if source_name == "ppd": + if len(run_ids) == 0: + raise BuildError("Bundle must include at least one ppd ingest run") + else: + if len(run_ids) != 1: + raise BuildError( + f"Source {source_name} must map to exactly one ingest run in a bundle" + ) + + for run_id in run_ids: + cur.execute( + """ + SELECT source_name + FROM meta.ingest_run + WHERE run_id = %s + """, + (run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Unknown ingest_run_id for source {source_name}: {run_id}") + if row[0] != source_name: + raise BuildError( + f"Ingest run/source mismatch: source={source_name} run_id={run_id} row_source={row[0]}" + ) + + bundle_id = str(uuid.uuid4()) + with conn.cursor() as 
cur: + cur.execute( + """ + INSERT INTO meta.build_bundle ( + bundle_id, + build_profile, + bundle_hash, + status, + created_at_utc + ) VALUES (%s, %s, %s, 'created', now()) + """, + (bundle_id, manifest.build_profile, bundle_hash), + ) + + for source_name, run_ids in manifest.source_runs.items(): + for ingest_run_id in run_ids: + cur.execute( + """ + INSERT INTO meta.build_bundle_source ( + bundle_id, + source_name, + ingest_run_id + ) VALUES (%s, %s, %s) + """, + (bundle_id, source_name, ingest_run_id), + ) + + return BuildBundleResult(bundle_id=bundle_id, status="created", bundle_hash=bundle_hash) + + +def _load_bundle( + conn: psycopg.Connection, + bundle_id: str, +) -> tuple[str, str, str, dict[str, tuple[str, ...]]]: + with conn.cursor() as cur: + cur.execute( + """ + SELECT build_profile, bundle_hash, status + FROM meta.build_bundle + WHERE bundle_id = %s + FOR UPDATE + """, + (bundle_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Bundle not found: {bundle_id}") + build_profile, bundle_hash, status = row + + cur.execute( + """ + SELECT source_name, ingest_run_id::text + FROM meta.build_bundle_source + WHERE bundle_id = %s + """, + (bundle_id,), + ) + source_rows = cur.fetchall() + + source_runs_map: dict[str, list[str]] = {} + for source_name, ingest_run_id in source_rows: + source_runs_map.setdefault(source_name, []).append(ingest_run_id) + + source_runs: dict[str, tuple[str, ...]] = { + source_name: tuple(sorted(run_ids)) + for source_name, run_ids in source_runs_map.items() + } + + required = BUILD_PROFILES[build_profile] + missing = sorted(required - set(source_runs.keys())) + if missing: + raise BuildError( + f"Bundle {bundle_id} missing required sources for profile {build_profile}: {', '.join(missing)}" + ) + + return build_profile, bundle_hash, status, source_runs + + +def _latest_resumable_run(conn: psycopg.Connection, bundle_id: str) -> tuple[str, str] | None: + with conn.cursor() as cur: + cur.execute( + """ + SELECT 
build_run_id::text, dataset_version + FROM meta.build_run + WHERE bundle_id = %s + AND status IN ('started', 'failed') + ORDER BY started_at_utc DESC + LIMIT 1 + """, + (bundle_id,), + ) + row = cur.fetchone() + if row is None: + return None + return row[0], row[1] + + +def _load_completed_passes(conn: psycopg.Connection, build_run_id: str) -> set[str]: + with conn.cursor() as cur: + cur.execute( + """ + SELECT pass_name + FROM meta.build_pass_checkpoint + WHERE build_run_id = %s + """, + (build_run_id,), + ) + return {row[0] for row in cur.fetchall()} + + +def _single_source_run(source_runs: dict[str, tuple[str, ...]], source_name: str) -> str: + run_ids = source_runs.get(source_name, ()) + if len(run_ids) != 1: + raise BuildError( + f"Source {source_name} requires exactly one ingest run in bundle; found {len(run_ids)}" + ) + return run_ids[0] + + +def _ordered_run_ids(conn: psycopg.Connection, run_ids: tuple[str, ...]) -> tuple[str, ...]: + if not run_ids: + return () + with conn.cursor() as cur: + cur.execute( + """ + SELECT run_id::text + FROM meta.ingest_run + WHERE run_id = ANY(%s::uuid[]) + ORDER BY retrieved_at_utc ASC, run_id ASC + """, + (list(run_ids),), + ) + ordered = tuple(row[0] for row in cur.fetchall()) + if len(ordered) != len(run_ids): + raise BuildError("One or more ingest run IDs could not be resolved for ordered execution") + return ordered + + +def _create_build_run(conn: psycopg.Connection, bundle_id: str, dataset_version: str) -> str: + build_run_id = str(uuid.uuid4()) + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO meta.build_run ( + build_run_id, + bundle_id, + dataset_version, + status, + current_pass, + started_at_utc + ) VALUES (%s, %s, %s, 'started', 'initialising', now()) + """, + (build_run_id, bundle_id, dataset_version), + ) + return build_run_id + + +def _set_build_run_pass(conn: psycopg.Connection, build_run_id: str, pass_name: str) -> None: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE 
meta.build_run + SET current_pass = %s + WHERE build_run_id = %s + """, + (pass_name, build_run_id), + ) + + +def _mark_pass_checkpoint( + conn: psycopg.Connection, + build_run_id: str, + pass_name: str, + row_count_summary: dict[str, int], +) -> None: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO meta.build_pass_checkpoint ( + build_run_id, + pass_name, + completed_at_utc, + row_count_summary_json + ) VALUES (%s, %s, now(), %s) + ON CONFLICT (build_run_id, pass_name) + DO UPDATE SET + completed_at_utc = EXCLUDED.completed_at_utc, + row_count_summary_json = EXCLUDED.row_count_summary_json + """, + (build_run_id, pass_name, Jsonb(row_count_summary)), + ) + + +def _mark_build_failed(conn: psycopg.Connection, build_run_id: str, current_pass: str, error_text: str) -> None: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE meta.build_run + SET status = 'failed', + current_pass = %s, + error_text = %s, + finished_at_utc = now() + WHERE build_run_id = %s + """, + (current_pass, error_text, build_run_id), + ) + + +def _mark_build_built(conn: psycopg.Connection, bundle_id: str, build_run_id: str) -> None: + with conn.cursor() as cur: + cur.execute( + """ + UPDATE meta.build_run + SET status = 'built', + current_pass = 'complete', + finished_at_utc = now(), + error_text = NULL + WHERE build_run_id = %s + """, + (build_run_id,), + ) + cur.execute( + """ + UPDATE meta.build_bundle + SET status = 'built' + WHERE bundle_id = %s + """, + (bundle_id,), + ) + + +def _load_raw_rows(conn: psycopg.Connection, raw_table: str, ingest_run_id: str) -> list[dict[str, Any]]: + schema_name, table_name = raw_table.split(".", 1) + with conn.cursor() as cur: + cur.execute( + sql.SQL( + """ + SELECT payload_jsonb + FROM {}.{} + WHERE ingest_run_id = %s + ORDER BY source_row_num ASC + """ + ).format(sql.Identifier(schema_name), sql.Identifier(table_name)), + (ingest_run_id,), + ) + return [row[0] for row in cur.fetchall()] + + +def 
_mapped_fields_for_source(schema_config: dict[str, Any], source_name: str) -> tuple[dict[str, str], tuple[str, ...]]: + sources = schema_config.get("sources") + if not isinstance(sources, dict): + raise BuildError("source_schema.yaml missing object key 'sources'") + + source_cfg = sources.get(source_name) + if not isinstance(source_cfg, dict): + raise BuildError(f"source_schema.yaml missing source block: {source_name}") + + field_map_raw = source_cfg.get("field_map") + required_raw = source_cfg.get("required_fields") + if not isinstance(field_map_raw, dict): + raise BuildError(f"source_schema.yaml source {source_name} missing field_map object") + if not isinstance(required_raw, list): + raise BuildError(f"source_schema.yaml source {source_name} missing required_fields list") + + field_map: dict[str, str] = {} + for key, value in field_map_raw.items(): + if not isinstance(key, str) or not isinstance(value, str): + raise BuildError(f"source_schema field_map for {source_name} must be string:string") + field_map[key] = value + + required_fields = [] + for item in required_raw: + if not isinstance(item, str): + raise BuildError(f"source_schema required_fields for {source_name} must be strings") + if item not in field_map: + raise BuildError( + f"source_schema required field '{item}' missing from field_map for {source_name}" + ) + required_fields.append(item) + + return field_map, tuple(required_fields) + + +def _assert_required_mapped_fields_present( + source_name: str, + rows: list[dict[str, Any]], + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> None: + if not rows: + raise BuildError(f"Raw source is empty for {source_name}; cannot stage-normalise") + + sample_keys = set(rows[0].keys()) + missing = [] + for key in required_fields: + mapped = field_map[key] + if mapped not in sample_keys: + missing.append(mapped) + if missing: + raise BuildError( + f"Schema mapping unresolved for {source_name}; missing mapped fields in raw rows: " + + ", 
".join(sorted(missing)) + ) + + +def _schema_insert_rows( + conn: psycopg.Connection, + query: sql.SQL, + rows: list[tuple[Any, ...]], +) -> int: + if not rows: + return 0 + with conn.cursor() as cur: + cur.executemany(query, rows) + return len(rows) + + +def _stage_cleanup(conn: psycopg.Connection, build_run_id: str) -> None: + tables = ( + "stage.ppd_parsed_address", + "stage.dfi_road_segment", + "stage.osni_street_point", + "stage.nsul_uprn_postcode", + "stage.oli_uprn_usrn", + "stage.oli_toid_usrn", + "stage.uprn_point", + "stage.open_roads_segment", + "stage.open_names_road_feature", + "stage.streets_usrn_input", + "stage.onspd_postcode", + ) + with conn.cursor() as cur: + for table in tables: + schema_name, table_name = table.split(".", 1) + cur.execute( + sql.SQL("DELETE FROM {}.{} WHERE build_run_id = %s").format( + sql.Identifier(schema_name), + sql.Identifier(table_name), + ), + (build_run_id,), + ) + + +def _pass_0a_raw_ingest( + conn: psycopg.Connection, + build_run_id: str, + source_runs: dict[str, tuple[str, ...]], +) -> dict[str, int]: + counts: dict[str, int] = {} + with conn.cursor() as cur: + for source_name, run_ids in sorted(source_runs.items()): + raw_table = RAW_TABLE_BY_SOURCE[source_name] + schema_name, table_name = raw_table.split(".", 1) + total_row_count = 0 + for ingest_run_id in run_ids: + cur.execute( + sql.SQL("SELECT COUNT(*) FROM {}.{} WHERE ingest_run_id = %s").format( + sql.Identifier(schema_name), + sql.Identifier(table_name), + ), + (ingest_run_id,), + ) + row_count = int(cur.fetchone()[0]) + if row_count <= 0: + raise BuildError( + "Pass 0a failed: source has no raw rows for " + f"source={source_name} run={ingest_run_id}" + ) + total_row_count += row_count + counts[source_name] = total_row_count + return counts + + +def _country_enrichment_available(country_iso2: str, subdivision_code: str | None) -> bool: + if subdivision_code in {"GB-ENG", "GB-SCT", "GB-WLS", "GB-NIR"}: + return True + if country_iso2 == "GB": + return True + 
return False + + +def _populate_stage_onspd( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.onspd_row", ingest_run_id) + _assert_required_mapped_fields_present("onspd", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + postcode_raw = row.get(field_map["postcode"]) + postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) + postcode_d = postcode_display(str(postcode_raw) if postcode_raw is not None else None) + if postcode_n is None or postcode_d is None: + continue + + status_raw = row.get(field_map["status"]) + status = (str(status_raw).strip().lower() if status_raw is not None else "active") or "active" + + country_iso2 = str(row.get(field_map["country_iso2"], "")).strip().upper() + country_iso3 = str(row.get(field_map["country_iso3"], "")).strip().upper() + subdivision_code_raw = row.get(field_map["subdivision_code"]) + subdivision_code = ( + str(subdivision_code_raw).strip().upper() if subdivision_code_raw is not None else None + ) + subdivision_code = subdivision_code or None + + lat_raw = row.get(field_map["lat"]) + lon_raw = row.get(field_map["lon"]) + easting_raw = row.get(field_map["easting"]) + northing_raw = row.get(field_map["northing"]) + + lat: Decimal | None + lon: Decimal | None + try: + lat = Decimal(str(lat_raw)).quantize(Decimal("0.000001")) if lat_raw not in (None, "") else None + lon = Decimal(str(lon_raw)).quantize(Decimal("0.000001")) if lon_raw not in (None, "") else None + except Exception: + lat = None + lon = None + + try: + easting = int(float(easting_raw)) if easting_raw not in (None, "") else None + northing = int(float(northing_raw)) if northing_raw not in (None, "") else None + except Exception: + easting = None + northing = None + + post_town_raw = row.get(field_map["post_town"]) + locality_raw = row.get(field_map["locality"]) + 
+ payload.append( + ( + build_run_id, + postcode_n, + postcode_d, + status, + lat, + lon, + easting, + northing, + country_iso2, + country_iso3, + subdivision_code, + str(post_town_raw).strip().upper() if post_town_raw not in (None, "") else None, + str(locality_raw).strip().upper() if locality_raw not in (None, "") else None, + _country_enrichment_available(country_iso2, subdivision_code), + ingest_run_id, + ) + ) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.onspd_postcode ( + build_run_id, + postcode_norm, + postcode_display, + status, + lat, + lon, + easting, + northing, + country_iso2, + country_iso3, + subdivision_code, + post_town, + locality, + street_enrichment_available, + onspd_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """ + ), + payload, + ) + + +def _populate_stage_usrn( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.os_open_usrn_row", ingest_run_id) + _assert_required_mapped_fields_present("os_open_usrn", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + usrn_raw = row.get(field_map["usrn"]) + name_raw = row.get(field_map["street_name"]) + if usrn_raw in (None, "") or name_raw in (None, ""): + continue + try: + usrn = int(usrn_raw) + except Exception: + continue + street_name = str(name_raw).strip() + folded = street_casefold(street_name) + if not street_name or folded is None: + continue + + payload.append( + ( + build_run_id, + usrn, + street_name, + folded, + str(row.get(field_map.get("street_class", ""), "")).strip() or None, + str(row.get(field_map.get("street_status", ""), "")).strip() or None, + ingest_run_id, + ) + ) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.streets_usrn_input ( + build_run_id, + usrn, + street_name, + street_name_casefolded, + street_class, + 
street_status, + usrn_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, usrn) + DO UPDATE SET + street_name = EXCLUDED.street_name, + street_name_casefolded = EXCLUDED.street_name_casefolded, + street_class = EXCLUDED.street_class, + street_status = EXCLUDED.street_status, + usrn_run_id = EXCLUDED.usrn_run_id + """ + ), + payload, + ) + + +def _populate_stage_open_names( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.os_open_names_row", ingest_run_id) + _assert_required_mapped_fields_present("os_open_names", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + feature_id_raw = row.get(field_map["feature_id"]) + street_raw = row.get(field_map["street_name"]) + postcode_raw = row.get(field_map["postcode"]) + toid_raw = row.get(field_map.get("toid", "")) + if feature_id_raw in (None, "") or street_raw in (None, ""): + continue + + folded = street_casefold(str(street_raw)) + postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) + if folded is None: + continue + + payload.append( + ( + build_run_id, + str(feature_id_raw).strip(), + str(toid_raw).strip() if toid_raw not in (None, "") else None, + postcode_n, + str(street_raw).strip(), + folded, + ingest_run_id, + ) + ) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.open_names_road_feature ( + build_run_id, + feature_id, + toid, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, feature_id) + DO UPDATE SET + toid = EXCLUDED.toid, + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ), + payload, + ) + + +def 
_populate_stage_open_roads( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.os_open_roads_row", ingest_run_id) + _assert_required_mapped_fields_present("os_open_roads", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + segment_id_raw = row.get(field_map["segment_id"]) + road_name_raw = row.get(field_map["road_name"]) + if segment_id_raw in (None, "") or road_name_raw in (None, ""): + continue + + folded = street_casefold(str(road_name_raw)) + if folded is None: + continue + + postcode_n = postcode_norm(str(row.get(field_map.get("postcode", ""), "")) or None) + + usrn_raw = row.get(field_map.get("usrn", "")) + try: + usrn = int(usrn_raw) if usrn_raw not in (None, "") else None + except Exception: + usrn = None + + payload.append( + ( + build_run_id, + str(segment_id_raw).strip(), + str(row.get(field_map.get("road_id", ""), "")).strip() or None, + postcode_n, + usrn, + str(road_name_raw).strip(), + folded, + ingest_run_id, + ) + ) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.open_roads_segment ( + build_run_id, + segment_id, + road_id, + postcode_norm, + usrn, + road_name, + road_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, segment_id) + DO UPDATE SET + road_id = EXCLUDED.road_id, + postcode_norm = EXCLUDED.postcode_norm, + usrn = EXCLUDED.usrn, + road_name = EXCLUDED.road_name, + road_name_casefolded = EXCLUDED.road_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ), + payload, + ) + + +def _populate_stage_open_uprn( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.os_open_uprn_row", ingest_run_id) + 
_assert_required_mapped_fields_present("os_open_uprn", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + uprn_raw = row.get(field_map["uprn"]) + if uprn_raw in (None, ""): + continue + try: + uprn = int(uprn_raw) + except Exception: + continue + + postcode_n = postcode_norm(str(row.get(field_map.get("postcode", ""), "")) or None) + + payload.append((build_run_id, uprn, postcode_n, ingest_run_id)) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.uprn_point ( + build_run_id, + uprn, + postcode_norm, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, uprn) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ), + payload, + ) + + +def _populate_stage_oli( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> tuple[int, int]: + rows = _load_raw_rows(conn, "raw.os_open_linked_identifiers_row", ingest_run_id) + _assert_required_mapped_fields_present( + "os_open_linked_identifiers", rows, field_map, required_fields + ) + + toid_payload: list[tuple[Any, ...]] = [] + uprn_payload: list[tuple[Any, ...]] = [] + + for row in rows: + relation_raw = row.get(field_map["relation_type"]) + left_raw = row.get(field_map["left_id"]) + right_raw = row.get(field_map["right_id"]) + + relation = str(relation_raw).strip().lower() if relation_raw not in (None, "") else "" + if left_raw in (None, "") or right_raw in (None, ""): + continue + + left_id = str(left_raw).strip() + right_id = str(right_raw).strip() + + if relation in {"toid_usrn", "toid->usrn", "toid_usrn_link"}: + try: + usrn = int(right_id) + except Exception: + continue + toid_payload.append((build_run_id, left_id, usrn, ingest_run_id)) + elif relation in {"uprn_usrn", "uprn->usrn", "uprn_usrn_link"}: + try: + uprn = int(left_id) + usrn = int(right_id) + except Exception: + continue + 
uprn_payload.append((build_run_id, uprn, usrn, ingest_run_id)) + + toid_count = _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.oli_toid_usrn ( + build_run_id, + toid, + usrn, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, toid, usrn) + DO NOTHING + """ + ), + toid_payload, + ) + + uprn_count = _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.oli_uprn_usrn ( + build_run_id, + uprn, + usrn, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, uprn, usrn) + DO NOTHING + """ + ), + uprn_payload, + ) + + return toid_count, uprn_count + + +def _populate_stage_nsul( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.nsul_row", ingest_run_id) + _assert_required_mapped_fields_present("nsul", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + uprn_raw = row.get(field_map["uprn"]) + postcode_raw = row.get(field_map["postcode"]) + if uprn_raw in (None, ""): + continue + try: + uprn = int(uprn_raw) + except Exception: + continue + postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) + if postcode_n is None: + continue + payload.append((build_run_id, uprn, postcode_n, ingest_run_id)) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.nsul_uprn_postcode ( + build_run_id, + uprn, + postcode_norm, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, uprn, postcode_norm) + DO NOTHING + """ + ), + payload, + ) + + +def _populate_stage_osni( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.osni_gazetteer_row", ingest_run_id) + _assert_required_mapped_fields_present("osni_gazetteer", rows, field_map, required_fields) + + 
payload: list[tuple[Any, ...]] = [] + for row in rows: + feature_id_raw = row.get(field_map["feature_id"]) + street_raw = row.get(field_map["street_name"]) + if feature_id_raw in (None, "") or street_raw in (None, ""): + continue + + folded = street_casefold(str(street_raw)) + if folded is None: + continue + + postcode_n = postcode_norm(str(row.get(field_map.get("postcode", ""), "")) or None) + payload.append( + ( + build_run_id, + str(feature_id_raw).strip(), + postcode_n, + str(street_raw).strip(), + folded, + ingest_run_id, + ) + ) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.osni_street_point ( + build_run_id, + feature_id, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, feature_id) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ), + payload, + ) + + +def _populate_stage_dfi( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.dfi_highway_row", ingest_run_id) + _assert_required_mapped_fields_present("dfi_highway", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + segment_id_raw = row.get(field_map["segment_id"]) + street_raw = row.get(field_map["street_name"]) + if segment_id_raw in (None, "") or street_raw in (None, ""): + continue + + folded = street_casefold(str(street_raw)) + if folded is None: + continue + postcode_n = postcode_norm(str(row.get(field_map.get("postcode", ""), "")) or None) + + payload.append( + ( + build_run_id, + str(segment_id_raw).strip(), + postcode_n, + str(street_raw).strip(), + folded, + ingest_run_id, + ) + ) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT 
INTO stage.dfi_road_segment ( + build_run_id, + segment_id, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, segment_id) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ), + payload, + ) + + +def _populate_stage_ppd( + conn: psycopg.Connection, + build_run_id: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> int: + rows = _load_raw_rows(conn, "raw.ppd_row", ingest_run_id) + _assert_required_mapped_fields_present("ppd", rows, field_map, required_fields) + + payload: list[tuple[Any, ...]] = [] + for row in rows: + row_hash_raw = row.get(field_map["row_hash"]) + postcode_raw = row.get(field_map["postcode"]) + street_raw = row.get(field_map["street"]) + house_number_raw = row.get(field_map["house_number"]) + + if row_hash_raw in (None, "") or postcode_raw in (None, "") or street_raw in (None, ""): + continue + + postcode_n = postcode_norm(str(postcode_raw)) + folded = street_casefold(str(street_raw)) + if postcode_n is None or folded is None: + continue + + payload.append( + ( + build_run_id, + str(row_hash_raw).strip(), + postcode_n, + str(house_number_raw).strip() if house_number_raw not in (None, "") else None, + str(street_raw).strip(), + folded, + ingest_run_id, + ) + ) + + return _schema_insert_rows( + conn, + sql.SQL( + """ + INSERT INTO stage.ppd_parsed_address ( + build_run_id, + row_hash, + postcode_norm, + house_number, + street_token_raw, + street_token_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, row_hash) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + house_number = EXCLUDED.house_number, + street_token_raw = EXCLUDED.street_token_raw, + street_token_casefolded = 
EXCLUDED.street_token_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ), + payload, + ) + + +def _pass_0b_stage_normalisation( + conn: psycopg.Connection, + build_run_id: str, + source_runs: dict[str, tuple[str, ...]], +) -> dict[str, int]: + _stage_cleanup(conn, build_run_id) + schema_config = _schema_config() + + counts: dict[str, int] = {} + + if "onspd" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "onspd") + ingest_run_id = _single_source_run(source_runs, "onspd") + counts["stage.onspd_postcode"] = _populate_stage_onspd( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_usrn" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_usrn") + ingest_run_id = _single_source_run(source_runs, "os_open_usrn") + counts["stage.streets_usrn_input"] = _populate_stage_usrn( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_names" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_names") + ingest_run_id = _single_source_run(source_runs, "os_open_names") + counts["stage.open_names_road_feature"] = _populate_stage_open_names( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_roads" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_roads") + ingest_run_id = _single_source_run(source_runs, "os_open_roads") + counts["stage.open_roads_segment"] = _populate_stage_open_roads( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_uprn" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_uprn") + ingest_run_id = _single_source_run(source_runs, "os_open_uprn") + counts["stage.uprn_point"] = _populate_stage_open_uprn( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "os_open_linked_identifiers" in 
source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_linked_identifiers") + ingest_run_id = _single_source_run(source_runs, "os_open_linked_identifiers") + toid_count, uprn_count = _populate_stage_oli( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + counts["stage.oli_toid_usrn"] = toid_count + counts["stage.oli_uprn_usrn"] = uprn_count + + if "nsul" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "nsul") + ingest_run_id = _single_source_run(source_runs, "nsul") + counts["stage.nsul_uprn_postcode"] = _populate_stage_nsul( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "osni_gazetteer" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "osni_gazetteer") + ingest_run_id = _single_source_run(source_runs, "osni_gazetteer") + counts["stage.osni_street_point"] = _populate_stage_osni( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "dfi_highway" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "dfi_highway") + ingest_run_id = _single_source_run(source_runs, "dfi_highway") + counts["stage.dfi_road_segment"] = _populate_stage_dfi( + conn, build_run_id, ingest_run_id, field_map, required_fields + ) + + if "ppd" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "ppd") + ppd_run_ids = source_runs["ppd"] + if len(ppd_run_ids) == 0: + raise BuildError("Bundle requires at least one ppd ingest run") + ppd_rows = 0 + for ingest_run_id in _ordered_run_ids(conn, ppd_run_ids): + ppd_rows += _populate_stage_ppd( + conn, + build_run_id, + ingest_run_id, + field_map, + required_fields, + ) + counts["stage.ppd_parsed_address"] = ppd_rows + + return counts + + +def _clear_run_outputs(conn: psycopg.Connection, build_run_id: str) -> None: + with conn.cursor() as cur: + for table in ( + "internal.unit_index", + 
"derived.postcode_streets_final_source", + "derived.postcode_streets_final_candidate", + "derived.postcode_street_candidate_lineage", + "derived.postcode_streets_final", + "derived.postcode_street_candidates", + "core.postcodes_meta", + "core.streets_usrn", + "core.postcodes", + ): + schema_name, table_name = table.split(".", 1) + column_name = "produced_build_run_id" + if table == "core.postcodes_meta": + column_name = "produced_build_run_id" + cur.execute( + sql.SQL("DELETE FROM {}.{} WHERE {} = %s").format( + sql.Identifier(schema_name), + sql.Identifier(table_name), + sql.Identifier(column_name), + ), + (build_run_id,), + ) + + cur.execute("DELETE FROM meta.canonical_hash WHERE build_run_id = %s", (build_run_id,)) + cur.execute("DELETE FROM meta.build_pass_checkpoint WHERE build_run_id = %s", (build_run_id,)) + + +def _pass_1_onspd_backbone(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO core.postcodes ( + produced_build_run_id, + postcode, + status, + lat, + lon, + easting, + northing, + country_iso2, + country_iso3, + subdivision_code, + post_town, + locality, + street_enrichment_available, + onspd_run_id + ) + SELECT + build_run_id, + postcode_display, + status, + lat, + lon, + easting, + northing, + country_iso2, + country_iso3, + subdivision_code, + post_town, + locality, + street_enrichment_available, + onspd_run_id + FROM stage.onspd_postcode + WHERE build_run_id = %s + ORDER BY postcode_norm COLLATE "C" ASC + """, + (build_run_id,), + ) + inserted_postcodes = cur.rowcount + + cur.execute( + """ + INSERT INTO core.postcodes_meta ( + produced_build_run_id, + postcode, + meta_jsonb, + onspd_run_id + ) + SELECT + build_run_id, + postcode_display, + jsonb_build_object( + 'postcode_norm', postcode_norm, + 'country_iso2', country_iso2, + 'country_iso3', country_iso3, + 'subdivision_code', subdivision_code, + 'post_town', post_town, + 'locality', locality, + 'status', status + ), + 
onspd_run_id + FROM stage.onspd_postcode + WHERE build_run_id = %s + ORDER BY postcode_norm COLLATE "C" ASC + """, + (build_run_id,), + ) + inserted_meta = cur.rowcount + + return { + "core.postcodes": int(inserted_postcodes), + "core.postcodes_meta": int(inserted_meta), + } + + +def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO core.streets_usrn ( + produced_build_run_id, + usrn, + street_name, + street_name_casefolded, + street_class, + street_status, + usrn_run_id + ) + SELECT + build_run_id, + usrn, + street_name, + street_name_casefolded, + street_class, + street_status, + usrn_run_id + FROM stage.streets_usrn_input + WHERE build_run_id = %s + ORDER BY usrn ASC + """, + (build_run_id,), + ) + inserted = cur.rowcount + + return {"core.streets_usrn": int(inserted)} + + +def _pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + schema_config = _schema_config() + _mapped_fields_for_source(schema_config, "os_open_names") + _mapped_fields_for_source(schema_config, "os_open_linked_identifiers") + + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + p.postcode, + n.street_name_raw, + n.street_name_casefolded, + NULL, + 'names_postcode_feature', + 'medium', + 'open_names:feature:' || n.feature_id, + 'os_open_names', + n.ingest_run_id, + jsonb_build_object('feature_id', n.feature_id, 'toid', n.toid) + FROM stage.open_names_road_feature AS n + JOIN core.postcodes AS p + ON p.produced_build_run_id = %s + AND replace(p.postcode, ' ', '') = n.postcode_norm + WHERE n.build_run_id = %s + ORDER BY n.feature_id COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id), 
+ ) + base_inserted = cur.rowcount + + promotions_inserted = 0 + lineage_inserted = 0 + + with conn.cursor() as cur: + cur.execute( + """ + SELECT + parent.candidate_id, + parent.postcode, + parent.street_name_raw, + parent.street_name_canonical, + parent.evidence_json ->> 'toid' AS toid, + oli.usrn, + oli.ingest_run_id + FROM derived.postcode_street_candidates AS parent + JOIN stage.oli_toid_usrn AS oli + ON oli.build_run_id = parent.produced_build_run_id + AND oli.toid = parent.evidence_json ->> 'toid' + WHERE parent.produced_build_run_id = %s + AND parent.candidate_type = 'names_postcode_feature' + AND parent.evidence_json ->> 'toid' IS NOT NULL + ORDER BY parent.candidate_id ASC, oli.usrn ASC + """, + (build_run_id,), + ) + promotion_rows = cur.fetchall() + + with conn.cursor() as cur: + for parent_candidate_id, postcode, street_name_raw, street_name_canonical, toid, usrn, oli_run_id in promotion_rows: + cur.execute( + """ + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) VALUES (%s, %s, %s, %s, %s, 'oli_toid_usrn', 'high', %s, 'os_open_linked_identifiers', %s, %s) + RETURNING candidate_id + """, + ( + build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + f"oli:toid_usrn:{toid}", + oli_run_id, + Jsonb({"toid": toid, "usrn": usrn}), + ), + ) + child_candidate_id = int(cur.fetchone()[0]) + promotions_inserted += 1 + + cur.execute( + """ + INSERT INTO derived.postcode_street_candidate_lineage ( + parent_candidate_id, + child_candidate_id, + relation_type, + produced_build_run_id + ) VALUES (%s, %s, 'promotion_toid_usrn', %s) + ON CONFLICT DO NOTHING + """, + (parent_candidate_id, child_candidate_id, build_run_id), + ) + lineage_inserted += cur.rowcount + + return { + "derived.postcode_street_candidates_base": int(base_inserted), + 
"derived.postcode_street_candidates_promoted": int(promotions_inserted), + "derived.postcode_street_candidate_lineage": int(lineage_inserted), + } + + +def _pass_4_uprn_reinforcement(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + with conn.cursor() as cur: + cur.execute( + """ + WITH aggregate_pairs AS ( + SELECT + nsul.postcode_norm, + oli.usrn, + COUNT(*)::bigint AS uprn_count, + (ARRAY_AGG(oli.ingest_run_id ORDER BY oli.ingest_run_id::text ASC))[1] AS oli_ingest_run_id + FROM stage.nsul_uprn_postcode AS nsul + JOIN stage.oli_uprn_usrn AS oli + ON oli.build_run_id = nsul.build_run_id + AND oli.uprn = nsul.uprn + WHERE nsul.build_run_id = %s + GROUP BY nsul.postcode_norm, oli.usrn + ) + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + p.postcode, + s.street_name, + s.street_name_casefolded, + a.usrn, + 'uprn_usrn', + 'high', + 'oli:uprn_usrn:' || a.uprn_count::text || '_uprns', + 'os_open_linked_identifiers', + a.oli_ingest_run_id, + jsonb_build_object('uprn_count', a.uprn_count) + FROM aggregate_pairs AS a + JOIN core.postcodes AS p + ON p.produced_build_run_id = %s + AND replace(p.postcode, ' ', '') = a.postcode_norm + JOIN core.streets_usrn AS s + ON s.produced_build_run_id = %s + AND s.usrn = a.usrn + ORDER BY p.postcode COLLATE "C" ASC, a.usrn ASC + """, + (build_run_id, build_run_id, build_run_id, build_run_id), + ) + inserted = cur.rowcount + + return {"derived.postcode_street_candidates_uprn_usrn": int(inserted)} + + +def _pass_5_gb_spatial_fallback(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + schema_config = _schema_config() + _mapped_fields_for_source(schema_config, "os_open_roads") + + with conn.cursor() as cur: + cur.execute( + """ + WITH gb_postcodes_without_high AS ( + SELECT p.postcode, replace(p.postcode, ' 
', '') AS postcode_norm + FROM core.postcodes AS p + WHERE p.produced_build_run_id = %s + AND p.country_iso2 = 'GB' + AND NOT EXISTS ( + SELECT 1 + FROM derived.postcode_street_candidates AS c + WHERE c.produced_build_run_id = p.produced_build_run_id + AND c.postcode = p.postcode + AND c.confidence = 'high' + ) + ), + ranked_segments AS ( + SELECT + g.postcode, + r.segment_id, + r.usrn, + r.road_name, + r.road_name_casefolded, + r.ingest_run_id, + ROW_NUMBER() OVER ( + PARTITION BY g.postcode + ORDER BY r.segment_id COLLATE "C" ASC + ) AS rn + FROM gb_postcodes_without_high AS g + JOIN stage.open_roads_segment AS r + ON r.build_run_id = %s + AND r.postcode_norm = g.postcode_norm + ) + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + rs.postcode, + rs.road_name, + rs.road_name_casefolded, + rs.usrn, + 'spatial_os_open_roads', + 'low', + 'spatial:os_open_roads:' || rs.segment_id || ':fallback', + 'os_open_roads', + rs.ingest_run_id, + jsonb_build_object('segment_id', rs.segment_id) + FROM ranked_segments AS rs + WHERE rs.rn = 1 + ORDER BY rs.postcode COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id), + ) + inserted = cur.rowcount + + return {"derived.postcode_street_candidates_spatial_os_open_roads": int(inserted)} + + +def _pass_6_ni_candidates(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + with conn.cursor() as cur: + cur.execute( + """ + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + p.postcode, + n.street_name_raw, + n.street_name_casefolded, + NULL, + 'osni_gazetteer_direct', + 'medium', + 'osni_gazetteer:feature:' || 
n.feature_id, + 'osni_gazetteer', + n.ingest_run_id, + jsonb_build_object('feature_id', n.feature_id) + FROM stage.osni_street_point AS n + JOIN core.postcodes AS p + ON p.produced_build_run_id = %s + AND replace(p.postcode, ' ', '') = n.postcode_norm + WHERE n.build_run_id = %s + AND p.subdivision_code = 'GB-NIR' + ORDER BY n.feature_id COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id), + ) + direct_inserted = cur.rowcount + + cur.execute( + """ + WITH ni_without_candidates AS ( + SELECT p.postcode, replace(p.postcode, ' ', '') AS postcode_norm + FROM core.postcodes AS p + WHERE p.produced_build_run_id = %s + AND p.subdivision_code = 'GB-NIR' + AND NOT EXISTS ( + SELECT 1 + FROM derived.postcode_street_candidates AS c + WHERE c.produced_build_run_id = p.produced_build_run_id + AND c.postcode = p.postcode + ) + ), + ranked_segments AS ( + SELECT + n.postcode, + d.segment_id, + d.street_name_raw, + d.street_name_casefolded, + d.ingest_run_id, + ROW_NUMBER() OVER ( + PARTITION BY n.postcode + ORDER BY d.segment_id COLLATE "C" ASC + ) AS rn + FROM ni_without_candidates AS n + JOIN stage.dfi_road_segment AS d + ON d.build_run_id = %s + AND d.postcode_norm = n.postcode_norm + ) + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + r.postcode, + r.street_name_raw, + r.street_name_casefolded, + NULL, + 'spatial_dfi_highway', + 'low', + 'spatial:dfi_highway:' || r.segment_id || ':fallback', + 'dfi_highway', + r.ingest_run_id, + jsonb_build_object('segment_id', r.segment_id) + FROM ranked_segments AS r + WHERE r.rn = 1 + ORDER BY r.postcode COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id), + ) + fallback_inserted = cur.rowcount + + return { + "derived.postcode_street_candidates_osni_gazetteer_direct": int(direct_inserted), + 
"derived.postcode_street_candidates_spatial_dfi_highway": int(fallback_inserted), + } + + +def _pass_7_ppd_gap_fill(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: + with conn.cursor() as cur: + cur.execute( + """ + WITH matched AS ( + SELECT + c.postcode, + p.house_number, + p.street_token_raw, + p.ingest_run_id, + s.usrn, + s.street_name, + s.street_name_casefolded + FROM stage.ppd_parsed_address AS p + JOIN core.postcodes AS c + ON c.produced_build_run_id = %s + AND replace(c.postcode, ' ', '') = p.postcode_norm + LEFT JOIN core.streets_usrn AS s + ON s.produced_build_run_id = %s + AND s.street_name_casefolded = p.street_token_casefolded + WHERE p.build_run_id = %s + ) + INSERT INTO derived.postcode_street_candidates ( + produced_build_run_id, + postcode, + street_name_raw, + street_name_canonical, + usrn, + candidate_type, + confidence, + evidence_ref, + source_name, + ingest_run_id, + evidence_json + ) + SELECT + %s, + m.postcode, + m.street_token_raw, + COALESCE(m.street_name_casefolded, upper(m.street_token_raw)), + m.usrn, + CASE WHEN m.usrn IS NULL THEN 'ppd_parse_unmatched' ELSE 'ppd_parse_matched' END, + CASE WHEN m.usrn IS NULL THEN 'low' ELSE 'medium' END, + 'ppd:row:' || md5(m.postcode || '|' || COALESCE(m.house_number, '') || '|' || m.street_token_raw), + 'ppd', + m.ingest_run_id, + jsonb_build_object('house_number', m.house_number) + FROM matched AS m + ORDER BY m.postcode COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id, build_run_id), + ) + candidate_inserted = cur.rowcount + + cur.execute( + """ + WITH matched AS ( + SELECT + c.postcode, + p.house_number, + p.ingest_run_id, + s.usrn, + COALESCE(s.street_name, p.street_token_raw) AS street_name, + CASE WHEN s.usrn IS NULL THEN 'low' ELSE 'medium' END AS confidence, + CASE WHEN s.usrn IS NULL THEN 'ppd_parse_unmatched' ELSE 'ppd_parse_matched' END AS source_type + FROM stage.ppd_parsed_address AS p + JOIN core.postcodes AS c + ON c.produced_build_run_id = %s + AND 
replace(c.postcode, ' ', '') = p.postcode_norm + LEFT JOIN core.streets_usrn AS s + ON s.produced_build_run_id = %s + AND s.street_name_casefolded = p.street_token_casefolded + WHERE p.build_run_id = %s + ) + INSERT INTO internal.unit_index ( + produced_build_run_id, + postcode, + house_number, + street_name, + usrn, + confidence, + source_type, + ingest_run_id + ) + SELECT + %s, + postcode, + COALESCE(house_number, ''), + street_name, + usrn, + confidence, + source_type, + ingest_run_id + FROM matched + ORDER BY postcode COLLATE "C" ASC + """, + (build_run_id, build_run_id, build_run_id, build_run_id), + ) + unit_index_inserted = cur.rowcount + + return { + "derived.postcode_street_candidates_ppd": int(candidate_inserted), + "internal.unit_index": int(unit_index_inserted), + } + + +def _confidence_from_rank(conf_rank: int) -> str: + if conf_rank >= 3: + return "high" + if conf_rank == 2: + return "medium" + if conf_rank == 1: + return "low" + return "none" + + +def _pass_8_finalisation(conn: psycopg.Connection, build_run_id: str, dataset_version: str) -> dict[str, int]: + weight_map = _weight_config() + + with conn.cursor() as cur: + cur.execute("DROP TABLE IF EXISTS pg_temp.tmp_candidate_weights") + cur.execute( + """ + CREATE TEMP TABLE tmp_candidate_weights ( + candidate_type text PRIMARY KEY, + weight numeric(10,4) NOT NULL + ) ON COMMIT DROP + """ + ) + cur.executemany( + "INSERT INTO tmp_candidate_weights (candidate_type, weight) VALUES (%s, %s)", + [(candidate_type, weight) for candidate_type, weight in weight_map.items()], + ) + + cur.execute("DROP TABLE IF EXISTS pg_temp.tmp_weighted_candidates") + cur.execute( + """ + CREATE TEMP TABLE tmp_weighted_candidates AS + SELECT + c.candidate_id, + c.postcode, + COALESCE(s.street_name, c.street_name_canonical) AS canonical_street_name, + c.usrn, + c.source_name, + c.ingest_run_id, + c.candidate_type, + w.weight::numeric(10,4) AS weight, + CASE c.confidence + WHEN 'high' THEN 3 + WHEN 'medium' THEN 2 + WHEN 'low' 
THEN 1 + ELSE 0 + END AS conf_rank + FROM derived.postcode_street_candidates AS c + JOIN tmp_candidate_weights AS w + ON w.candidate_type = c.candidate_type + LEFT JOIN core.streets_usrn AS s + ON s.produced_build_run_id = c.produced_build_run_id + AND s.usrn = c.usrn + WHERE c.produced_build_run_id = %s + """, + (build_run_id,), + ) + + cur.execute( + """ + SELECT postcode + FROM ( + SELECT postcode, SUM(weight) AS total_weight + FROM tmp_weighted_candidates + GROUP BY postcode + ) AS totals + WHERE total_weight <= 0 + LIMIT 1 + """ + ) + bad = cur.fetchone() + if bad is not None: + raise BuildError( + f"Finalisation failed: total_weight <= 0 for postcode={bad[0]}" + ) + + cur.execute( + """ + WITH grouped AS ( + SELECT + postcode, + canonical_street_name, + MIN(usrn) AS usrn, + SUM(weight) AS weighted_score, + MAX(conf_rank) AS conf_rank + FROM tmp_weighted_candidates + GROUP BY postcode, canonical_street_name + ), + totals AS ( + SELECT postcode, SUM(weighted_score) AS total_weight + FROM grouped + GROUP BY postcode + ), + scored AS ( + SELECT + g.postcode, + g.canonical_street_name, + g.usrn, + g.weighted_score, + g.conf_rank, + (g.weighted_score / t.total_weight) AS raw_probability + FROM grouped AS g + JOIN totals AS t + ON t.postcode = g.postcode + ), + rounded AS ( + SELECT + s.*, + ROUND(s.raw_probability::numeric, 4) AS rounded_probability, + ROW_NUMBER() OVER ( + PARTITION BY s.postcode + ORDER BY + s.raw_probability DESC, + s.conf_rank DESC, + s.canonical_street_name COLLATE "C" ASC, + s.usrn ASC NULLS LAST + ) AS rn, + SUM(ROUND(s.raw_probability::numeric, 4)) OVER ( + PARTITION BY s.postcode + ) AS rounded_sum + FROM scored AS s + ) + SELECT + postcode, + canonical_street_name, + usrn, + weighted_score, + conf_rank, + CASE + WHEN rn = 1 + THEN ROUND((rounded_probability + (1.0000 - rounded_sum))::numeric, 4) + ELSE rounded_probability + END AS final_probability, + rn + FROM rounded + ORDER BY postcode COLLATE "C" ASC, rn ASC + """ + ) + final_rows = 
cur.fetchall() + + inserted_final = 0 + inserted_final_candidate = 0 + inserted_final_source = 0 + + with conn.cursor() as cur: + for postcode, street_name, usrn, weighted_score, conf_rank, probability, _rn in final_rows: + frequency_score = Decimal(str(weighted_score)).quantize(Decimal("0.0001"), rounding=ROUND_HALF_UP) + probability_decimal = Decimal(str(probability)).quantize( + Decimal("0.0001"), + rounding=ROUND_HALF_UP, + ) + confidence = _confidence_from_rank(int(conf_rank)) + + cur.execute( + """ + INSERT INTO derived.postcode_streets_final ( + produced_build_run_id, + postcode, + street_name, + usrn, + confidence, + frequency_score, + probability + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + RETURNING final_id + """, + ( + build_run_id, + postcode, + street_name, + usrn, + confidence, + frequency_score, + probability_decimal, + ), + ) + final_id = int(cur.fetchone()[0]) + inserted_final += 1 + + cur.execute( + """ + SELECT candidate_id + FROM tmp_weighted_candidates + WHERE postcode = %s + AND canonical_street_name = %s + ORDER BY candidate_id ASC + """, + (postcode, street_name), + ) + candidate_ids = [int(row[0]) for row in cur.fetchall()] + for rank, candidate_id in enumerate(candidate_ids, start=1): + cur.execute( + """ + INSERT INTO derived.postcode_streets_final_candidate ( + final_id, + candidate_id, + produced_build_run_id, + link_rank + ) VALUES (%s, %s, %s, %s) + """, + (final_id, candidate_id, build_run_id, rank), + ) + inserted_final_candidate += 1 + + cur.execute( + """ + SELECT source_name, ingest_run_id, candidate_type, SUM(weight) AS contribution_weight + FROM tmp_weighted_candidates + WHERE postcode = %s + AND canonical_street_name = %s + GROUP BY source_name, ingest_run_id, candidate_type + ORDER BY source_name COLLATE "C" ASC, ingest_run_id::text ASC, candidate_type COLLATE "C" ASC + """, + (postcode, street_name), + ) + for source_name, ingest_run_id, candidate_type, contribution_weight in cur.fetchall(): + cur.execute( + """ + INSERT INTO 
derived.postcode_streets_final_source ( + final_id, + source_name, + ingest_run_id, + candidate_type, + contribution_weight, + produced_build_run_id + ) VALUES (%s, %s, %s, %s, %s, %s) + """, + ( + final_id, + source_name, + ingest_run_id, + candidate_type, + Decimal(str(contribution_weight)).quantize( + Decimal("0.0001"), rounding=ROUND_HALF_UP + ), + build_run_id, + ), + ) + inserted_final_source += 1 + + cur.execute( + """ + UPDATE core.postcodes + SET multi_street = false + WHERE produced_build_run_id = %s + """, + (build_run_id,), + ) + cur.execute( + """ + WITH counts AS ( + SELECT postcode, COUNT(*) AS street_count + FROM derived.postcode_streets_final + WHERE produced_build_run_id = %s + GROUP BY postcode + ) + UPDATE core.postcodes AS p + SET multi_street = (c.street_count > 1) + FROM counts AS c + WHERE p.produced_build_run_id = %s + AND p.postcode = c.postcode + """, + (build_run_id, build_run_id), + ) + + projection_counts = _create_api_projection_tables(conn, build_run_id, dataset_version) + + return { + "derived.postcode_streets_final": inserted_final, + "derived.postcode_streets_final_candidate": inserted_final_candidate, + "derived.postcode_streets_final_source": inserted_final_source, + **projection_counts, + } + + +def _create_api_projection_tables( + conn: psycopg.Connection, + build_run_id: str, + dataset_version: str, +) -> dict[str, int]: + suffix = _safe_version_suffix(dataset_version) + street_table_name = f"postcode_street_lookup__{suffix}" + lookup_table_name = f"postcode_lookup__{suffix}" + + street_ident = sql.Identifier(street_table_name) + lookup_ident = sql.Identifier(lookup_table_name) + + with conn.cursor() as cur: + cur.execute(sql.SQL("DROP TABLE IF EXISTS api.{} CASCADE").format(street_ident)) + cur.execute( + sql.SQL( + """ + CREATE TABLE api.{} AS + SELECT + f.postcode, + f.street_name, + f.usrn, + f.confidence, + f.frequency_score, + f.probability, + %s::text AS dataset_version, + f.produced_build_run_id + FROM 
derived.postcode_streets_final AS f + WHERE f.produced_build_run_id = %s + ORDER BY + f.postcode COLLATE "C" ASC, + f.probability DESC, + f.street_name COLLATE "C" ASC, + f.usrn ASC NULLS LAST + """ + ).format(street_ident), + (dataset_version, build_run_id), + ) + + cur.execute(sql.SQL("DROP TABLE IF EXISTS api.{} CASCADE").format(lookup_ident)) + cur.execute( + sql.SQL( + """ + CREATE TABLE api.{} AS + WITH street_rows AS ( + SELECT + s.postcode, + jsonb_agg( + jsonb_build_object( + 'name', s.street_name, + 'confidence', s.confidence, + 'probability', s.probability, + 'usrn', s.usrn + ) + ORDER BY + s.probability DESC, + CASE s.confidence + WHEN 'high' THEN 3 + WHEN 'medium' THEN 2 + WHEN 'low' THEN 1 + ELSE 0 + END DESC, + s.street_name COLLATE "C" ASC, + s.usrn ASC NULLS LAST + ) AS streets_json + FROM api.{} AS s + GROUP BY s.postcode + ), + source_rows AS ( + SELECT + dedup.postcode, + array_agg(dedup.source_name ORDER BY dedup.source_name COLLATE "C") AS sources + FROM ( + SELECT DISTINCT + f.postcode, + fs.source_name + FROM derived.postcode_streets_final AS f + JOIN derived.postcode_streets_final_source AS fs + ON fs.final_id = f.final_id + WHERE f.produced_build_run_id = %s + ) AS dedup + GROUP BY dedup.postcode + ) + SELECT + p.postcode, + p.status, + p.country_iso2, + p.country_iso3, + p.subdivision_code, + p.post_town, + p.locality, + p.lat, + p.lon, + p.easting, + p.northing, + p.street_enrichment_available, + p.multi_street, + COALESCE(sr.streets_json, '[]'::jsonb) AS streets_json, + COALESCE(src.sources, ARRAY['onspd']::text[]) AS sources, + %s::text AS dataset_version, + p.produced_build_run_id + FROM core.postcodes AS p + LEFT JOIN street_rows AS sr + ON sr.postcode = p.postcode + LEFT JOIN source_rows AS src + ON src.postcode = p.postcode + WHERE p.produced_build_run_id = %s + ORDER BY p.postcode COLLATE "C" ASC + """ + ).format(lookup_ident, street_ident), + (build_run_id, dataset_version, build_run_id), + ) + + cur.execute(sql.SQL("SELECT 
COUNT(*) FROM api.{}").format(street_ident)) + street_count = int(cur.fetchone()[0]) + cur.execute(sql.SQL("SELECT COUNT(*) FROM api.{}").format(lookup_ident)) + lookup_count = int(cur.fetchone()[0]) + + return { + f"api.{street_table_name}": street_count, + f"api.{lookup_table_name}": lookup_count, + } + + +def _pass_handler( + pass_name: str, +): + handlers = { + "0a_raw_ingest": _pass_0a_raw_ingest, + "0b_stage_normalisation": _pass_0b_stage_normalisation, + "1_onspd_backbone": _pass_1_onspd_backbone, + "2_gb_canonical_streets": _pass_2_gb_canonical_streets, + "3_open_names_candidates": _pass_3_open_names_candidates, + "4_uprn_reinforcement": _pass_4_uprn_reinforcement, + "5_gb_spatial_fallback": _pass_5_gb_spatial_fallback, + "6_ni_candidates": _pass_6_ni_candidates, + "7_ppd_gap_fill": _pass_7_ppd_gap_fill, + "8_finalisation": _pass_8_finalisation, + } + return handlers[pass_name] + + +def run_build( + conn: psycopg.Connection, + bundle_id: str, + rebuild: bool, + resume: bool, +) -> BuildRunResult: + if rebuild and resume: + raise BuildError("--rebuild and --resume cannot be used together") + + build_profile, bundle_hash, _bundle_status, source_runs = _load_bundle(conn, bundle_id) + required = BUILD_PROFILES[build_profile] + missing = sorted(required - set(source_runs.keys())) + if missing: + raise BuildError( + f"Bundle {bundle_id} missing required sources: {', '.join(missing)}" + ) + for source_name in required: + run_ids = source_runs.get(source_name, ()) + if source_name == "ppd": + if len(run_ids) == 0: + raise BuildError("Bundle must include at least one ppd ingest run") + else: + if len(run_ids) != 1: + raise BuildError( + f"Bundle source {source_name} must include exactly one ingest run" + ) + + if resume: + resumable = _latest_resumable_run(conn, bundle_id) + if resumable is None: + raise BuildError(f"No resumable run found for bundle {bundle_id}") + build_run_id, dataset_version = resumable + completed_passes = _load_completed_passes(conn, 
build_run_id) + else: + dataset_version = _dataset_version_from_bundle_hash(bundle_hash) + build_run_id = _create_build_run(conn, bundle_id, dataset_version) + completed_passes = set() + if rebuild: + _clear_run_outputs(conn, build_run_id) + conn.commit() + + try: + for pass_name in PASS_ORDER: + if pass_name in completed_passes: + continue + + _set_build_run_pass(conn, build_run_id, pass_name) + + handler = _pass_handler(pass_name) + if pass_name in {"0a_raw_ingest", "0b_stage_normalisation"}: + row_count_summary = handler(conn, build_run_id, source_runs) + elif pass_name == "8_finalisation": + row_count_summary = handler(conn, build_run_id, dataset_version) + else: + row_count_summary = handler(conn, build_run_id) + + _mark_pass_checkpoint(conn, build_run_id, pass_name, row_count_summary) + conn.commit() + + _mark_build_built(conn, bundle_id, build_run_id) + conn.commit() + return BuildRunResult( + build_run_id=build_run_id, + status="built", + dataset_version=dataset_version, + message="Build completed successfully", + ) + except Exception as exc: + conn.rollback() + try: + _mark_build_failed(conn, build_run_id, pass_name, str(exc)) + conn.commit() + except Exception: + conn.rollback() + raise + + +def _canonical_hash_query( + conn: psycopg.Connection, + query_sql: sql.SQL, + params: tuple[Any, ...] 
= (), +) -> tuple[int, str]: + digest = hashlib.sha256() + row_count = 0 + + cursor_name = f"canon_{uuid.uuid4().hex[:12]}" + with conn.cursor(name=cursor_name) as cur: + cur.execute(query_sql, params) + for row in cur: + row_count += 1 + normalized = [] + for value in row: + if isinstance(value, Decimal): + normalized.append(str(value)) + else: + normalized.append(value) + digest.update( + json.dumps(normalized, separators=(",", ":"), ensure_ascii=True, default=str).encode("utf-8") + ) + digest.update(b"\n") + + return row_count, digest.hexdigest() + + +def verify_build(conn: psycopg.Connection, build_run_id: str) -> VerifyResult: + with conn.cursor() as cur: + cur.execute( + """ + SELECT dataset_version, status + FROM meta.build_run + WHERE build_run_id = %s + """, + (build_run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Build run not found: {build_run_id}") + dataset_version, status = row + if status not in {"built", "published"}: + raise BuildError(f"Build run {build_run_id} must be built before verify (status={status})") + + with conn.cursor() as cur: + cur.execute( + """ + SELECT postcode, SUM(probability)::numeric(10,4) AS prob_sum + FROM derived.postcode_streets_final + WHERE produced_build_run_id = %s + GROUP BY postcode + HAVING SUM(probability)::numeric(10,4) <> 1.0000 + LIMIT 1 + """, + (build_run_id,), + ) + bad = cur.fetchone() + if bad is not None: + raise BuildError( + f"Probability sum check failed for postcode={bad[0]} sum={bad[1]}" + ) + + suffix = _safe_version_suffix(dataset_version) + street_table = f"api.postcode_street_lookup__{suffix}" + lookup_table = f"api.postcode_lookup__{suffix}" + + specs = [ + ( + "derived_postcode_streets_final", + sql.SQL( + """ + SELECT postcode, street_name, usrn, confidence, frequency_score, probability + FROM derived.postcode_streets_final + WHERE produced_build_run_id = %s + ORDER BY postcode COLLATE "C" ASC, street_name COLLATE "C" ASC, usrn ASC NULLS LAST + """ + ), + 
                (build_run_id,),
            ),
            (
                # Canonical-hash input: street-level API projection, in a fully
                # deterministic order (COLLATE "C" avoids locale-dependent
                # sorting; NULLS LAST pins placement of NULL usrn values).
                "api_postcode_street_lookup",
                sql.SQL(
                    """
                    SELECT postcode, street_name, usrn, confidence, frequency_score, probability, dataset_version
                    FROM api.{}
                    ORDER BY postcode COLLATE "C" ASC, street_name COLLATE "C" ASC, usrn ASC NULLS LAST
                    """
                ).format(sql.Identifier(f"postcode_street_lookup__{suffix}")),
                (),
            ),
            (
                # Canonical-hash input: postcode-level API projection.
                "api_postcode_lookup",
                sql.SQL(
                    """
                    SELECT postcode, status, country_iso2, country_iso3, subdivision_code,
                        post_town, locality, lat, lon, easting, northing,
                        street_enrichment_available, multi_street, streets_json::text,
                        sources::text, dataset_version
                    FROM api.{}
                    ORDER BY postcode COLLATE "C" ASC
                    """
                ).format(sql.Identifier(f"postcode_lookup__{suffix}")),
                (),
            ),
        ]

        object_hashes: dict[str, str] = {}
        with conn.cursor() as cur:
            # Fail fast if either versioned projection table is missing
            # before computing any hashes; to_regclass returns NULL rather
            # than raising for unknown relations.
            cur.execute(
                """
                SELECT to_regclass(%s), to_regclass(%s)
                """,
                (street_table, lookup_table),
            )
            street_regclass, lookup_regclass = cur.fetchone()
            if street_regclass is None or lookup_regclass is None:
                raise BuildError(
                    f"API projection tables not found for dataset_version={dataset_version}; expected {street_table} and {lookup_table}"
                )

        with conn.cursor() as cur:
            # Re-verification replaces any hashes from a previous verify of
            # the same build run.
            cur.execute("DELETE FROM meta.canonical_hash WHERE build_run_id = %s", (build_run_id,))

        for object_name, query_sql, params in specs:
            row_count, sha256_digest = _canonical_hash_query(conn, query_sql, params)
            object_hashes[object_name] = sha256_digest
            with conn.cursor() as cur:
                cur.execute(
                    """
                    INSERT INTO meta.canonical_hash (
                        build_run_id,
                        object_name,
                        projection,
                        row_count,
                        sha256,
                        computed_at_utc
                    ) VALUES (%s, %s, %s, %s, %s, now())
                    """,
                    (
                        build_run_id,
                        object_name,
                        Jsonb({"ordering": "deterministic"}),
                        row_count,
                        sha256_digest,
                    ),
                )

        return VerifyResult(build_run_id=build_run_id, status="verified", object_hashes=object_hashes)


    def publish_build(conn: psycopg.Connection, build_run_id: str, actor: str) -> PublishResult:
        """Publish a built dataset version by repointing the stable api views.

        Locks the build-run row (FOR UPDATE), validates its status, swaps the
        `api.postcode_lookup` / `api.postcode_street_lookup` views onto the
        versioned tables, records the publication (idempotent upsert on
        dataset_version), and marks run and bundle as published.

        Raises:
            BuildError: if the run is unknown, not yet built, or the
                versioned api tables are missing.
        """
        with conn.cursor() as cur:
            # FOR UPDATE serialises concurrent publish attempts on the same run.
            cur.execute(
                """
                SELECT bundle_id, dataset_version, status
                FROM meta.build_run
                WHERE build_run_id = %s
                FOR UPDATE
                """,
                (build_run_id,),
            )
            row = cur.fetchone()
            if row is None:
                raise BuildError(f"Build run not found: {build_run_id}")
            bundle_id, dataset_version, status = row
            # 'published' is accepted so a publish can be re-run idempotently.
            if status not in {"built", "published"}:
                raise BuildError(f"Build run {build_run_id} must be built before publish (status={status})")

        suffix = _safe_version_suffix(dataset_version)
        lookup_table_name = f"postcode_lookup__{suffix}"
        street_lookup_table_name = f"postcode_street_lookup__{suffix}"

        with conn.cursor() as cur:
            cur.execute("SELECT to_regclass(%s), to_regclass(%s)", (
                f"api.{lookup_table_name}",
                f"api.{street_lookup_table_name}",
            ))
            lookup_regclass, street_regclass = cur.fetchone()
            if lookup_regclass is None or street_regclass is None:
                raise BuildError(
                    "Cannot publish: versioned api tables are missing for dataset_version="
                    f"{dataset_version}"
                )

        with conn.cursor() as cur:
            # Stable view names are the public API surface; CREATE OR REPLACE
            # atomically repoints each view at the versioned table.
            cur.execute(
                sql.SQL("CREATE OR REPLACE VIEW api.postcode_lookup AS SELECT * FROM api.{}").format(
                    sql.Identifier(lookup_table_name)
                )
            )
            cur.execute(
                sql.SQL(
                    "CREATE OR REPLACE VIEW api.postcode_street_lookup AS SELECT * FROM api.{}"
                ).format(sql.Identifier(street_lookup_table_name))
            )

            # NOTE(review): txid_current() is deprecated in PostgreSQL 13+ in
            # favour of pg_current_xact_id(); confirm target server version.
            cur.execute("SELECT txid_current()")
            publish_txid = int(cur.fetchone()[0])

            # Upsert keyed on dataset_version: republishing the same version
            # overwrites the previous publication record.
            cur.execute(
                """
                INSERT INTO meta.dataset_publication (
                    dataset_version,
                    build_run_id,
                    published_at_utc,
                    published_by,
                    lookup_table_name,
                    street_lookup_table_name,
                    publish_txid
                ) VALUES (%s, %s, now(), %s, %s, %s, %s)
                ON CONFLICT (dataset_version)
                DO UPDATE SET
                    build_run_id = EXCLUDED.build_run_id,
                    published_at_utc = EXCLUDED.published_at_utc,
                    published_by = EXCLUDED.published_by,
                    lookup_table_name = EXCLUDED.lookup_table_name,
                    street_lookup_table_name = EXCLUDED.street_lookup_table_name,
                    publish_txid = EXCLUDED.publish_txid
                """,
                (
                    dataset_version,
                    build_run_id,
                    actor,
                    f"api.{lookup_table_name}",
                    f"api.{street_lookup_table_name}",
                    publish_txid,
                ),
            )

            cur.execute(
                """
                UPDATE meta.build_run
                SET status = 'published',
                    current_pass = 'published',
                    finished_at_utc = COALESCE(finished_at_utc, now())
                WHERE build_run_id = %s
                """,
                (build_run_id,),
            )

            cur.execute(
                """
                UPDATE meta.build_bundle
                SET status = 'published'
                WHERE bundle_id = %s
                """,
                (bundle_id,),
            )

        return PublishResult(build_run_id=build_run_id, dataset_version=dataset_version, status="published")
diff --git a/pipeline/src/pipeline/cli.py b/pipeline/src/pipeline/cli.py
new file mode 100644
index 0000000..631c34c
--- /dev/null
+++ b/pipeline/src/pipeline/cli.py
@@ -0,0 +1,169 @@
"""CLI entrypoint for Pipeline V3 lifecycle commands."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

from pipeline.build.workflows import (
    BuildError,
    create_build_bundle,
    publish_build,
    run_build,
    verify_build,
)
from pipeline.config import default_dsn, migrations_dir
from pipeline.db.connection import connect
from pipeline.db.migrations import apply_migrations
from pipeline.ingest.workflows import IngestError, ingest_source
from pipeline.manifest import ManifestError, load_bundle_manifest, load_source_manifest


def _parser() -> argparse.ArgumentParser:
    """Build the argparse tree for the `pipeline` CLI.

    NOTE(review): the --dsn default calls default_dsn() at parser
    construction time, so PIPELINE_DSN is read when _parser() runs, not at
    argument-parse time — confirm that is intended.
    """
    parser = argparse.ArgumentParser(prog="pipeline")
    parser.add_argument("--dsn", default=default_dsn(), help="PostgreSQL DSN")

    subparsers = parser.add_subparsers(dest="command", required=True)

    db_parser = subparsers.add_parser("db", help="Database operations")
    db_subparsers = db_parser.add_subparsers(dest="db_command", required=True)
    db_subparsers.add_parser("migrate", help="Apply SQL migrations")

    ingest_parser = subparsers.add_parser("ingest", help="Source ingest operations")
    ingest_subparsers = ingest_parser.add_subparsers(dest="ingest_command", required=True)
    source_parser = ingest_subparsers.add_parser("source", help="Ingest a source manifest")
    source_parser.add_argument("--manifest", required=True, type=Path)

    bundle_parser = subparsers.add_parser("bundle", help="Bundle lifecycle")
    bundle_subparsers = bundle_parser.add_subparsers(dest="bundle_command", required=True)
    bundle_create_parser = bundle_subparsers.add_parser("create", help="Create build bundle")
    bundle_create_parser.add_argument("--manifest", required=True, type=Path)

    build_parser = subparsers.add_parser("build", help="Build lifecycle")
    build_subparsers = build_parser.add_subparsers(dest="build_command", required=True)

    build_run_parser = build_subparsers.add_parser("run", help="Run build passes")
    build_run_parser.add_argument("--bundle-id", required=True)
    build_run_parser.add_argument("--rebuild", action="store_true")
    build_run_parser.add_argument("--resume", action="store_true")

    build_verify_parser = build_subparsers.add_parser("verify", help="Verify build outputs")
    build_verify_parser.add_argument("--build-run-id", required=True)

    build_publish_parser = build_subparsers.add_parser("publish", help="Publish verified build")
    build_publish_parser.add_argument("--build-run-id", required=True)
    build_publish_parser.add_argument("--actor", required=True)

    return parser


def main(argv: list[str] | None = None) -> int:
    """Dispatch one CLI command.

    Prints a single JSON object to stdout on success (or stderr on error)
    so callers can machine-parse results. Returns a process exit code:
    0 success, 1 workflow error, 2 unknown command.
    """
    parser = _parser()
    args = parser.parse_args(argv)

    try:
        if args.command == "db" and args.db_command == "migrate":
            applied = apply_migrations(args.dsn, migrations_dir())
            print(json.dumps({"status": "ok", "migrations_applied": applied}))
            return 0

        if args.command == "ingest" and args.ingest_command == "source":
            manifest = load_source_manifest(args.manifest)
            with connect(args.dsn) as conn:
                result = ingest_source(conn, manifest)
                conn.commit()
            print(
                json.dumps(
                    {
                        "status": result.status,
                        "source_name": result.source_name,
                        "ingest_run_id": result.run_id,
                        "files_loaded": result.files_loaded,
                        "rows_loaded": result.rows_loaded,
                    }
                )
            )
            return 0

        if args.command == "bundle" and args.bundle_command == "create":
            manifest = load_bundle_manifest(args.manifest)
            with connect(args.dsn) as conn:
                result = create_build_bundle(conn, manifest)
                conn.commit()
            print(
                json.dumps(
                    {
                        "status": result.status,
                        "bundle_id": result.bundle_id,
                        "bundle_hash": result.bundle_hash,
                    }
                )
            )
            return 0

        if args.command == "build" and args.build_command == "run":
            with connect(args.dsn) as conn:
                result = run_build(
                    conn,
                    bundle_id=args.bundle_id,
                    rebuild=args.rebuild,
                    resume=args.resume,
                )
                conn.commit()
            print(
                json.dumps(
                    {
                        "status": result.status,
                        "build_run_id": result.build_run_id,
                        "dataset_version": result.dataset_version,
                        "message": result.message,
                    }
                )
            )
            return 0

        if args.command == "build" and args.build_command == "verify":
            with connect(args.dsn) as conn:
                result = verify_build(conn, build_run_id=args.build_run_id)
                conn.commit()
            print(
                json.dumps(
                    {
                        "status": result.status,
                        "build_run_id": result.build_run_id,
                        "object_hashes": result.object_hashes,
                    }
                )
            )
            return 0

        if args.command == "build" and args.build_command == "publish":
            with connect(args.dsn) as conn:
                result = publish_build(
                    conn,
                    build_run_id=args.build_run_id,
                    actor=args.actor,
                )
                conn.commit()
            print(
                json.dumps(
                    {
                        "status": result.status,
                        "build_run_id": result.build_run_id,
                        "dataset_version": result.dataset_version,
                    }
                )
            )
            return 0

        parser.print_help(sys.stderr)
        return 2
    except (ManifestError, IngestError, BuildError, RuntimeError) as exc:
        # All workflow failures surface as a single JSON error envelope.
        print(json.dumps({"status": "error", "error": str(exc)}), file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/pipeline/src/pipeline/config.py b/pipeline/src/pipeline/config.py
new file mode 100644
index 0000000..e051aa3 --- /dev/null +++ b/pipeline/src/pipeline/config.py @@ -0,0 +1,33 @@ +"""Runtime configuration for Pipeline V3.""" + +from __future__ import annotations + +import os +from pathlib import Path + +PROBABILITY_SCALE = 4 +PROBABILITY_NUMERIC_TYPE = "numeric(6,4)" + + +def default_dsn() -> str: + return os.getenv("PIPELINE_DSN", "dbname=postcodes_v3") + + +def repo_root() -> Path: + return Path(__file__).resolve().parents[3] + + +def migrations_dir() -> Path: + return repo_root() / "pipeline" / "sql" / "migrations" + + +def source_schema_config_path() -> Path: + return repo_root() / "pipeline" / "config" / "source_schema.yaml" + + +def frequency_weights_config_path() -> Path: + return repo_root() / "pipeline" / "config" / "frequency_weights.yaml" + + +def normalisation_config_path() -> Path: + return repo_root() / "pipeline" / "config" / "normalisation.yaml" diff --git a/pipeline/src/pipeline/contracts/__init__.py b/pipeline/src/pipeline/contracts/__init__.py new file mode 100644 index 0000000..6373885 --- /dev/null +++ b/pipeline/src/pipeline/contracts/__init__.py @@ -0,0 +1 @@ +"""SQL contracts locked by the Phase 1 specification.""" diff --git a/pipeline/src/pipeline/contracts/open_roads.py b/pipeline/src/pipeline/contracts/open_roads.py new file mode 100644 index 0000000..3abc84b --- /dev/null +++ b/pipeline/src/pipeline/contracts/open_roads.py @@ -0,0 +1,62 @@ +"""Locked SQL contracts for Open Roads staging and build linkage.""" + +OPEN_ROADS_STAGE_TABLE = "stage.open_roads_segment" + +ALLOWED_CANONICAL_HASH_OBJECT_NAMES = ( + "core_uprn_postcode", + "core_uprn_point", + "core_road_segment", + "derived_uprn_street_spatial", +) + +ALLOWED_CANONICAL_HASH_OBJECT_NAMES_PHASE2 = ( + "core_uprn_postcode", + "core_uprn_point", + "core_road_segment", + "core_open_names_entry", + "core_postcode_unit_seed", + "derived_uprn_street_spatial", + "derived_postcode_street", +) + +OPEN_ROADS_LOADED_FEATURE_COUNT_SQL = ( + "SELECT COUNT(*) AS 
loaded_feature_count " + "FROM stage.open_roads_segment " + "WHERE release_id = %(release_id)s;" +) + +OPEN_ROADS_PERSIST_LOADED_FEATURE_COUNT_SQL = ( + "UPDATE meta.dataset_release " + "SET loaded_feature_count = (" + "SELECT COUNT(*) " + "FROM stage.open_roads_segment " + "WHERE release_id = %(release_id)s" + ") " + "WHERE dataset_key = 'open_roads' " + "AND release_id = %(release_id)s;" +) + +OPEN_ROADS_BUILD_LINKAGE_SQL = ( + "SELECT s.segment_id, s.name_display, s.name_norm, s.geom_bng " + "FROM stage.open_roads_segment AS s " + "WHERE s.release_id = %(open_roads_release_id)s " + "ORDER BY s.segment_id ASC;" +) + + +def loaded_feature_count_query() -> str: + """Return the locked gate query for loaded Open Roads features.""" + + return OPEN_ROADS_LOADED_FEATURE_COUNT_SQL + + +def persist_loaded_feature_count_query() -> str: + """Return the locked query that writes loaded feature counts into metadata.""" + + return OPEN_ROADS_PERSIST_LOADED_FEATURE_COUNT_SQL + + +def build_linkage_query() -> str: + """Return the locked stage-to-build linkage query filtered by release_id.""" + + return OPEN_ROADS_BUILD_LINKAGE_SQL diff --git a/pipeline/src/pipeline/contracts/voronoi.py b/pipeline/src/pipeline/contracts/voronoi.py new file mode 100644 index 0000000..36d811e --- /dev/null +++ b/pipeline/src/pipeline/contracts/voronoi.py @@ -0,0 +1,95 @@ +"""Locked SQL contracts for Phase 2 Voronoi clipping and enumeration.""" + +from __future__ import annotations + +from pipeline.config import VORONOI_HULL_BUFFER_M + +# Phase 2 lock: no inline literals for the buffer. Runtime must bind +# `hull_buffer_m` explicitly so the governing constant is always traceable. 
VORONOI_CLIP_EXPR_SQL = (
    "ST_Buffer(ST_ConvexHull(ST_Collect(seed_geom_bng)), %(hull_buffer_m)s)"
)

VORONOI_CLIP_GEOMETRY_SQL_TEMPLATE = """
WITH seed_points AS (
    {seed_points_sql}
),
clip_geom AS (
    SELECT
        ST_SetSRID({clip_expr}, 27700) AS gb_clip_geom
    FROM seed_points
)
SELECT gb_clip_geom
FROM clip_geom;
""".strip()

VORONOI_CELL_CTE_SQL_TEMPLATE = """
WITH seed_points AS (
    {seed_points_sql}
),
clip_geom AS (
    SELECT
        ST_SetSRID({clip_expr}, 27700) AS gb_clip_geom
    FROM seed_points
),
cell_geoms AS (
    SELECT
        (ST_Dump(
            ST_VoronoiPolygons(
                ST_Collect(seed_geom_bng),
                0.0,
                (SELECT gb_clip_geom FROM clip_geom)
            )
        )).geom AS cell_geom
    FROM seed_points
)
""".strip()

VORONOI_CELL_SQL_TEMPLATE = """
{cell_cte_sql}
SELECT cell_geom
FROM cell_geoms;
""".strip()


def voronoi_sql_params(hull_buffer_m: float | None = None) -> dict[str, float]:
    """Return the bound parameter dict for Voronoi clipping SQL.

    Defaults to the governing VORONOI_HULL_BUFFER_M constant; an explicit
    override must be strictly positive.
    """

    value = VORONOI_HULL_BUFFER_M if hull_buffer_m is None else float(hull_buffer_m)
    if value <= 0:
        raise ValueError("hull_buffer_m must be greater than zero")
    return {"hull_buffer_m": value}


def render_voronoi_clip_geometry_sql(seed_points_sql: str) -> str:
    """Render SQL that computes the clipped Voronoi boundary geometry."""

    if not seed_points_sql.strip():
        raise ValueError("seed_points_sql must be non-empty")
    return VORONOI_CLIP_GEOMETRY_SQL_TEMPLATE.format(
        seed_points_sql=seed_points_sql.strip(),
        clip_expr=VORONOI_CLIP_EXPR_SQL,
    )


def render_voronoi_cell_sql(seed_points_sql: str) -> str:
    """Render SQL that computes clipped Voronoi cell polygons."""

    if not seed_points_sql.strip():
        raise ValueError("seed_points_sql must be non-empty")
    cell_cte_sql = VORONOI_CELL_CTE_SQL_TEMPLATE.format(
        seed_points_sql=seed_points_sql.strip(),
        clip_expr=VORONOI_CLIP_EXPR_SQL,
    )
    return VORONOI_CELL_SQL_TEMPLATE.format(cell_cte_sql=cell_cte_sql)


def render_voronoi_cell_cte_sql(seed_points_sql: str) -> str:
    """Render SQL CTE block for building clipped Voronoi cells."""

    if not seed_points_sql.strip():
        raise ValueError("seed_points_sql must be non-empty")
    return VORONOI_CELL_CTE_SQL_TEMPLATE.format(
        seed_points_sql=seed_points_sql.strip(),
        clip_expr=VORONOI_CLIP_EXPR_SQL,
    )
diff --git a/pipeline/src/pipeline/db/__init__.py b/pipeline/src/pipeline/db/__init__.py
new file mode 100644
index 0000000..3c4b3c0
--- /dev/null
+++ b/pipeline/src/pipeline/db/__init__.py
@@ -0,0 +1 @@
"""Database helpers for the pipeline."""
diff --git a/pipeline/src/pipeline/db/connection.py b/pipeline/src/pipeline/db/connection.py
new file mode 100644
index 0000000..1d46064
--- /dev/null
+++ b/pipeline/src/pipeline/db/connection.py
@@ -0,0 +1,17 @@
"""Database connection helpers."""

from __future__ import annotations

from contextlib import contextmanager
from typing import Iterator

import psycopg


@contextmanager
def connect(dsn: str) -> Iterator[psycopg.Connection]:
    """Yield a psycopg connection, always closed on exit.

    Commit/rollback is left to the caller; only close() is guaranteed.
    """
    conn = psycopg.connect(dsn)
    try:
        yield conn
    finally:
        conn.close()
diff --git a/pipeline/src/pipeline/db/migrations.py b/pipeline/src/pipeline/db/migrations.py
new file mode 100644
index 0000000..51abe2f
--- /dev/null
+++ b/pipeline/src/pipeline/db/migrations.py
@@ -0,0 +1,69 @@
"""Simple SQL migration runner for pipeline schemas."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List


@dataclass(frozen=True)
class Migration:
    # version: numeric/lexical prefix of the filename before the first "_"
    version: str
    # path: absolute or relative path to the .sql file
    path: Path


def discover_migrations(migrations_dir: Path) -> List[Migration]:
    """Return sorted migration files based on numeric filename prefix."""

    migrations: List[Migration] = []
    # sorted() over glob gives lexicographic filename order; zero-padded
    # prefixes are therefore applied in numeric order.
    for path in sorted(migrations_dir.glob("*.sql")):
        version = path.stem.split("_", 1)[0]
        migrations.append(Migration(version=version, path=path))
    return migrations


def _read_sql(path: Path) -> str:
    return path.read_text(encoding="utf-8")


def apply_migrations(dsn: str, migrations_dir: Path) -> int:
    """Apply unapplied migrations in filename order.

    Requires psycopg at runtime, but keeps import optional for environments
    where only static checks are needed.

    Returns the number of migrations applied in this invocation. All
    migrations run in a single transaction (committed by the connection
    context manager on clean exit).
    """

    try:
        import psycopg  # type: ignore
    except ImportError as exc:  # pragma: no cover - import-path safety
        raise RuntimeError("psycopg is required to run migrations") from exc

    migrations = discover_migrations(migrations_dir)
    applied_count = 0

    with psycopg.connect(dsn) as conn:
        with conn.cursor() as cur:
            # Bootstrap the ledger table on first run.
            cur.execute("CREATE SCHEMA IF NOT EXISTS meta")
            cur.execute(
                """
                CREATE TABLE IF NOT EXISTS meta.schema_migration (
                    version text PRIMARY KEY,
                    applied_at timestamptz NOT NULL DEFAULT now()
                )
                """
            )
            cur.execute("SELECT version FROM meta.schema_migration")
            applied_versions = {row[0] for row in cur.fetchall()}

            for migration in migrations:
                if migration.version in applied_versions:
                    continue
                cur.execute(_read_sql(migration.path))
                cur.execute(
                    "INSERT INTO meta.schema_migration (version) VALUES (%s)",
                    (migration.version,),
                )
                applied_count += 1

    return applied_count
diff --git a/pipeline/src/pipeline/ingest/__init__.py b/pipeline/src/pipeline/ingest/__init__.py
new file mode 100644
index 0000000..3f26ba5
--- /dev/null
+++ b/pipeline/src/pipeline/ingest/__init__.py
@@ -0,0 +1 @@
"""Ingest workflows for Phase 1 datasets."""
diff --git a/pipeline/src/pipeline/ingest/workflows.py b/pipeline/src/pipeline/ingest/workflows.py
new file mode 100644
index 0000000..2872290
--- /dev/null
+++ b/pipeline/src/pipeline/ingest/workflows.py
@@ -0,0 +1,339 @@
"""Source ingest workflows for Pipeline V3."""

from __future__ import annotations

import csv
import json
import sqlite3
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable, Iterator

import psycopg
from psycopg import sql
from
psycopg.types.json import Jsonb

from pipeline.manifest import SourceFileManifest, SourceIngestManifest
from pipeline.util.hashing import sha256_file


class IngestError(RuntimeError):
    """Raised when source ingest fails."""


@dataclass(frozen=True)
class IngestResult:
    # Summary of one ingest_source() call; status is "ingested" or "noop".
    source_name: str
    run_id: str
    status: str
    files_loaded: int
    rows_loaded: int


# Raw landing table per logical source; keys must stay in sync with
# pipeline.manifest.SOURCE_NAMES.
RAW_TABLE_BY_SOURCE = {
    "onspd": "raw.onspd_row",
    "os_open_usrn": "raw.os_open_usrn_row",
    "os_open_names": "raw.os_open_names_row",
    "os_open_roads": "raw.os_open_roads_row",
    "os_open_uprn": "raw.os_open_uprn_row",
    "os_open_linked_identifiers": "raw.os_open_linked_identifiers_row",
    "nsul": "raw.nsul_row",
    "osni_gazetteer": "raw.osni_gazetteer_row",
    "dfi_highway": "raw.dfi_highway_row",
    "ppd": "raw.ppd_row",
}


CSV_INSERT_BATCH_SIZE = 5_000


def _file_set_hash(files: tuple[SourceFileManifest, ...]) -> str:
    """Return a SHA-256 over the sorted, canonical-JSON file descriptors.

    Used as the idempotency key: identical manifests hash identically
    regardless of file ordering.
    """
    payload = [
        {
            "file_role": file.file_role,
            "path": str(file.file_path),
            "sha256": file.sha256,
            "size_bytes": file.size_bytes,
            "format": file.format,
            "layer_name": file.layer_name,
        }
        for file in sorted(files, key=lambda item: (item.file_role, str(item.file_path), item.layer_name))
    ]
    encoded = json.dumps(payload, ensure_ascii=True, separators=(",", ":")).encode("utf-8")
    import hashlib

    return hashlib.sha256(encoded).hexdigest()


def _iter_rows_from_csv(path: Path) -> Iterator[dict[str, Any]]:
    """Yield one dict per CSV data row; utf-8-sig tolerates a BOM."""
    with path.open("r", encoding="utf-8-sig", newline="") as handle:
        reader = csv.DictReader(handle)
        if reader.fieldnames is None:
            raise IngestError(f"CSV file is missing header row: {path}")
        for row in reader:
            yield {str(key): value for key, value in row.items()}


def _iter_rows_from_geojson(path: Path) -> Iterator[dict[str, Any]]:
    """Yield feature properties plus the raw geometry under "__geometry"."""
    payload = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise IngestError(f"GeoJSON root must be object: {path}")

    features = payload.get("features")
    if not isinstance(features, list):
        raise IngestError(f"GeoJSON features missing or invalid: {path}")

    for feature in features:
        if not isinstance(feature, dict):
            continue
        props = feature.get("properties")
        row: dict[str, Any] = {}
        if isinstance(props, dict):
            row.update({str(key): value for key, value in props.items()})
        geometry = feature.get("geometry")
        row["__geometry"] = geometry
        yield row


def _iter_rows_from_json(path: Path) -> Iterator[dict[str, Any]]:
    """Yield dict rows from a JSON array of objects, or a single object."""
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        for item in payload:
            if isinstance(item, dict):
                yield {str(key): value for key, value in item.items()}
        return
    if isinstance(payload, dict):
        yield {str(key): value for key, value in payload.items()}
        return
    raise IngestError(f"Unsupported JSON payload shape: {path}")


def _iter_rows_from_gpkg(path: Path, layer_name: str) -> Iterator[dict[str, Any]]:
    """Yield rows from one GeoPackage layer via read-only sqlite3."""
    if not layer_name:
        raise IngestError(f"GeoPackage manifest must set layer_name: {path}")

    # Identifier-quote the layer name for the SELECT; the existence check
    # below uses a bound parameter, so injection via layer_name is avoided.
    quoted_layer = '"' + layer_name.replace('"', '""') + '"'
    conn = sqlite3.connect(f"file:{path}?mode=ro", uri=True)
    try:
        cur = conn.execute(
            """
            SELECT 1
            FROM sqlite_master
            WHERE type IN ('table', 'view')
              AND name = ?
            LIMIT 1
            """,
            (layer_name,),
        )
        if cur.fetchone() is None:
            raise IngestError(
                f"GeoPackage layer '{layer_name}' not found in {path}"
            )

        row_cur = conn.execute(f"SELECT * FROM {quoted_layer}")
        col_names = [desc[0] for desc in row_cur.description]
        for values in row_cur:
            row: dict[str, Any] = {}
            for index, column_name in enumerate(col_names):
                value = values[index]
                if isinstance(value, (bytes, bytearray, memoryview)):
                    # Keep raw binary columns JSON-safe while preserving source bytes.
                    value = bytes(value).hex()
                row[str(column_name)] = value
            yield row
    finally:
        conn.close()


def _iter_rows(file_manifest: SourceFileManifest) -> Iterator[dict[str, Any]]:
    """Dispatch to the row iterator matching the manifest's format string."""
    file_format = file_manifest.format.lower()
    if file_format == "csv":
        return _iter_rows_from_csv(file_manifest.file_path)
    # NOTE(review): plain "json" is routed to the GeoJSON parser, which
    # requires a "features" array; generic JSON must use "json_array".
    # Confirm this mapping is intentional.
    if file_format in {"geojson", "json"}:
        return _iter_rows_from_geojson(file_manifest.file_path)
    if file_format == "json_array":
        return _iter_rows_from_json(file_manifest.file_path)
    if file_format in {"gpkg", "geopackage"}:
        return _iter_rows_from_gpkg(file_manifest.file_path, file_manifest.layer_name)
    raise IngestError(f"Unsupported file format '{file_manifest.format}' for {file_manifest.file_path}")


def _table_ident(qualified_table: str) -> tuple[sql.Identifier, sql.Identifier]:
    """Split "schema.table" into two safely-quotable identifiers."""
    schema_name, table_name = qualified_table.split(".", 1)
    return sql.Identifier(schema_name), sql.Identifier(table_name)


def _insert_raw_rows(
    conn: psycopg.Connection,
    qualified_table: str,
    ingest_run_id: str,
    rows: Iterable[dict[str, Any]],
) -> int:
    """Batch-insert payload rows into the raw landing table.

    Rows carry a 1-based source_row_num and the full payload as JSONB.
    Returns the number of rows inserted.
    """
    schema_ident, table_ident = _table_ident(qualified_table)
    insert_sql = sql.SQL(
        """
        INSERT INTO {}.{} (
            ingest_run_id,
            source_row_num,
            payload_jsonb
        ) VALUES (%s, %s, %s)
        """
    ).format(schema_ident, table_ident)

    total_loaded = 0
    batch: list[tuple[str, int, Jsonb]] = []
    with conn.cursor() as cur:
        for row_num, row in enumerate(rows, start=1):
            batch.append((ingest_run_id, row_num, Jsonb(row)))
            if len(batch) >= CSV_INSERT_BATCH_SIZE:
                cur.executemany(insert_sql, batch)
                total_loaded += len(batch)
                batch.clear()
        if batch:
            # Flush the final partial batch.
            cur.executemany(insert_sql, batch)
            total_loaded += len(batch)
            batch.clear()

    return total_loaded


def _existing_ingest_run(
    conn: psycopg.Connection,
    source_name: str,
    source_version: str,
    file_set_sha256: str,
) -> str | None:
    """Return the run_id of an identical prior ingest, if one exists."""
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT run_id
            FROM meta.ingest_run
            WHERE source_name = %s
              AND source_version = %s
              AND file_set_sha256 = %s
            """,
            (source_name, source_version, file_set_sha256),
        )
        row = cur.fetchone()
        return str(row[0]) if row is not None else None


def ingest_source(conn: psycopg.Connection, manifest: SourceIngestManifest) -> IngestResult:
    """Ingest all files of one source manifest into its raw landing table.

    Idempotent on (source_name, source_version, file-set hash): a repeat
    call returns status "noop" without touching the database. Each file's
    sha256, size and (optional) expected row count are verified; any
    mismatch raises IngestError, leaving the transaction for the caller
    to roll back.
    """
    file_set_sha256 = _file_set_hash(manifest.files)
    existing = _existing_ingest_run(
        conn,
        source_name=manifest.source_name,
        source_version=manifest.source_version,
        file_set_sha256=file_set_sha256,
    )
    if existing is not None:
        return IngestResult(
            source_name=manifest.source_name,
            run_id=existing,
            status="noop",
            files_loaded=0,
            rows_loaded=0,
        )

    raw_table = RAW_TABLE_BY_SOURCE[manifest.source_name]

    run_id = str(uuid.uuid4())
    with conn.cursor() as cur:
        # record_count starts at 0 and is patched after all files load.
        cur.execute(
            """
            INSERT INTO meta.ingest_run (
                run_id,
                source_name,
                source_version,
                retrieved_at_utc,
                source_url,
                processing_git_sha,
                record_count,
                notes,
                file_set_sha256
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            """,
            (
                run_id,
                manifest.source_name,
                manifest.source_version,
                manifest.retrieved_at_utc,
                manifest.source_url,
                manifest.processing_git_sha,
                0,
                manifest.notes,
                file_set_sha256,
            ),
        )

    total_rows = 0
    for file_manifest in manifest.files:
        actual_sha = sha256_file(file_manifest.file_path)
        if actual_sha != file_manifest.sha256:
            raise IngestError(
                "SHA256 mismatch for source file: "
                f"path={file_manifest.file_path} expected={file_manifest.sha256} actual={actual_sha}"
            )

        actual_size = file_manifest.file_path.stat().st_size
        if actual_size != file_manifest.size_bytes:
            raise IngestError(
                f"size_bytes mismatch for {file_manifest.file_path}: "
                f"expected={file_manifest.size_bytes} actual={actual_size}"
            )

        rows = _iter_rows(file_manifest)
        loaded_rows = _insert_raw_rows(conn, raw_table, run_id, rows)

        if file_manifest.row_count_expected is not None and loaded_rows != file_manifest.row_count_expected:
            raise IngestError(
                f"row_count_expected mismatch for {file_manifest.file_path}: "
                f"expected={file_manifest.row_count_expected} loaded={loaded_rows}"
            )

        total_rows += loaded_rows

        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO meta.ingest_run_file (
                    ingest_run_id,
                    file_role,
                    filename,
                    layer_name,
                    sha256,
                    size_bytes,
                    row_count,
                    format
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """,
                (
                    run_id,
                    file_manifest.file_role,
                    str(file_manifest.file_path),
                    file_manifest.layer_name,
                    actual_sha,
                    actual_size,
                    loaded_rows,
                    file_manifest.format,
                ),
            )

    with conn.cursor() as cur:
        cur.execute(
            """
            UPDATE meta.ingest_run
            SET record_count = %s
            WHERE run_id = %s
            """,
            (total_rows, run_id),
        )

    return IngestResult(
        source_name=manifest.source_name,
        run_id=run_id,
        status="ingested",
        files_loaded=len(manifest.files),
        rows_loaded=total_rows,
    )
diff --git a/pipeline/src/pipeline/manifest.py b/pipeline/src/pipeline/manifest.py
new file mode 100644
index 0000000..6d17da8
--- /dev/null
+++ b/pipeline/src/pipeline/manifest.py
@@ -0,0 +1,272 @@
"""Manifest parsing and validation for Pipeline V3."""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from uuid import UUID


class ManifestError(ValueError):
    """Raised when a manifest file is invalid."""


# Closed set of logical sources; must stay in sync with
# pipeline.ingest.workflows.RAW_TABLE_BY_SOURCE.
SOURCE_NAMES = {
    "onspd",
    "os_open_usrn",
    "os_open_names",
    "os_open_roads",
    "os_open_uprn",
    "os_open_linked_identifiers",
    "nsul",
    "osni_gazetteer",
    "dfi_highway",
    "ppd",
}

# Build profile -> required sources for a bundle of that profile.
BUILD_PROFILES = {
    "gb_core": {
        "onspd",
        "os_open_usrn",
        "os_open_names",
        "os_open_roads",
        "os_open_uprn",
        "os_open_linked_identifiers",
        "nsul",
    },
    "gb_core_ppd": {
        "onspd",
        "os_open_usrn",
        "os_open_names",
        "os_open_roads",
        "os_open_uprn",
        "os_open_linked_identifiers",
        "nsul",
        "ppd",
    },
    "core_ni": {
        "onspd",
        "os_open_usrn",
        "os_open_names",
        "os_open_roads",
        "os_open_uprn",
        "os_open_linked_identifiers",
        "nsul",
        "osni_gazetteer",
        "dfi_highway",
    },
}

SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$")
GIT_SHA_RE = re.compile(r"^[0-9a-f]{40}$")


@dataclass(frozen=True)
class SourceFileManifest:
    # One physical file in a source manifest; sha256 is stored lowercase.
    file_role: str
    file_path: Path
    sha256: str
    size_bytes: int
    format: str
    layer_name: str
    row_count_expected: int | None


@dataclass(frozen=True)
class SourceIngestManifest:
    # Parsed, validated source manifest; `raw` retains the original payload.
    source_name: str
    source_version: str
    retrieved_at_utc: datetime
    source_url: str | None
    processing_git_sha: str
    notes: str | None
    files: tuple[SourceFileManifest, ...]
    raw: dict[str, Any]


@dataclass(frozen=True)
class BuildBundleManifest:
    # Parsed bundle manifest: profile plus validated ingest-run UUIDs.
    build_profile: str
    source_runs: dict[str, tuple[str, ...]]
    raw: dict[str, Any]


def _load_json(path: Path) -> dict[str, Any]:
    """Load a JSON file, requiring an object at the root."""
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        raise ManifestError(f"Invalid JSON manifest: {path}") from exc
    if not isinstance(payload, dict):
        raise ManifestError(f"Manifest root must be an object: {path}")
    return payload


def _require_string(payload: dict[str, Any], key: str) -> str:
    """Return the stripped string at `key`, or raise if missing/empty."""
    value = payload.get(key)
    if not isinstance(value, str) or not value.strip():
        raise ManifestError(f"Manifest field '{key}' must be a non-empty string")
    return value.strip()


def _parse_optional_string(payload: dict[str, Any], key: str) -> str | None:
    """Return the stripped string at `key`, None when absent or blank."""
    value = payload.get(key)
    if value is None:
        return None
    if not isinstance(value, str):
        raise ManifestError(f"Manifest field '{key}' must be a string when present")
    text = value.strip()
    return text or None


def _parse_utc_datetime(value: str, field_name: str) -> datetime:
    """Parse a timezone-aware ISO-8601 string and normalise to UTC.

    NOTE(review): replace("Z", "+00:00") substitutes every "Z" in the
    string, not only a trailing designator — harmless for valid ISO input,
    but worth confirming.
    """
    normalized = value.replace("Z", "+00:00")
    try:
        parsed = datetime.fromisoformat(normalized)
    except ValueError as exc:
        raise ManifestError(f"Manifest field '{field_name}' must be ISO8601 datetime") from exc
    if parsed.tzinfo is None:
        raise ManifestError(f"Manifest field '{field_name}' must include timezone")
    return parsed.astimezone(timezone.utc)


def _parse_file_entry(entry: dict[str, Any]) -> SourceFileManifest:
    """Validate one files[] entry; the file must exist on disk."""
    if not isinstance(entry, dict):
        raise ManifestError("Each files[] entry must be an object")

    file_role = _require_string(entry, "file_role")
    file_path_value = _require_string(entry, "file_path")
    file_path = Path(file_path_value).expanduser().resolve()
    if not file_path.exists() or not file_path.is_file():
        raise ManifestError(f"Manifest file_path does not exist: {file_path}")

    sha256 = _require_string(entry, "sha256")
    if SHA256_RE.match(sha256) is None:
        raise ManifestError("files[].sha256 must be 64 hex chars")

    size_bytes = entry.get("size_bytes")
    if not isinstance(size_bytes, int) or size_bytes < 0:
        raise ManifestError("files[].size_bytes must be an integer >= 0")

    format_value = _require_string(entry, "format")
    layer_name = _parse_optional_string(entry, "layer_name") or ""

    row_count_expected_raw = entry.get("row_count_expected")
    row_count_expected: int | None
    if row_count_expected_raw is None:
        row_count_expected = None
    else:
        if not isinstance(row_count_expected_raw, int) or row_count_expected_raw < 0:
            raise ManifestError("files[].row_count_expected must be integer >= 0 when present")
        row_count_expected = row_count_expected_raw

    return SourceFileManifest(
        file_role=file_role,
        file_path=file_path,
        sha256=sha256.lower(),
        size_bytes=size_bytes,
        format=format_value,
        layer_name=layer_name,
        row_count_expected=row_count_expected,
    )


def load_source_manifest(path: Path) -> SourceIngestManifest:
    """Load and validate a source ingest manifest from JSON."""
    payload = _load_json(path)

    source_name = _require_string(payload, "source_name")
    if source_name not in SOURCE_NAMES:
        raise ManifestError(f"Invalid source_name '{source_name}'")

    source_version = _require_string(payload, "source_version")
    retrieved_raw = _require_string(payload, "retrieved_at_utc")
    retrieved_at_utc = _parse_utc_datetime(retrieved_raw, "retrieved_at_utc")

    source_url = _parse_optional_string(payload, "source_url")
    processing_git_sha = _require_string(payload, "processing_git_sha")
    if GIT_SHA_RE.match(processing_git_sha) is None:
        raise ManifestError("processing_git_sha must be 40 lowercase hex chars")

    notes = _parse_optional_string(payload, "notes")

    files_raw = payload.get("files")
    if not isinstance(files_raw, list) or not files_raw:
        raise ManifestError("Manifest files must be a non-empty array")

    files = tuple(_parse_file_entry(entry) for entry in files_raw)

    return SourceIngestManifest(
        source_name=source_name,
        source_version=source_version,
        retrieved_at_utc=retrieved_at_utc,
        source_url=source_url,
        processing_git_sha=processing_git_sha,
        notes=notes,
        files=files,
        raw=payload,
    )


def load_bundle_manifest(path: Path) -> BuildBundleManifest:
    """Load and validate a build-bundle manifest from JSON.

    source_runs values may be a single UUID string or a non-empty list of
    UUID strings; the required source set for the chosen profile must be
    fully covered.
    """
    payload = _load_json(path)

    build_profile = _require_string(payload, "build_profile")
    if build_profile not in BUILD_PROFILES:
        raise ManifestError(f"Invalid build_profile '{build_profile}'")

    source_runs_raw = payload.get("source_runs")
    if not isinstance(source_runs_raw, dict):
        raise ManifestError("Bundle manifest source_runs must be an object")

    source_runs: dict[str, tuple[str, ...]] = {}
    for source_name, run_ids_raw in source_runs_raw.items():
        if source_name not in SOURCE_NAMES:
            raise ManifestError(f"Unknown source in source_runs: {source_name}")
        run_ids: tuple[str, ...]
        if isinstance(run_ids_raw, str):
            run_ids = (run_ids_raw,)
        elif isinstance(run_ids_raw, list):
            if not run_ids_raw:
                raise ManifestError(f"source_runs[{source_name}] list must not be empty")
            normalized: list[str] = []
            for item in run_ids_raw:
                if not isinstance(item, str):
                    raise ManifestError(
                        f"source_runs[{source_name}] values must be UUID strings"
                    )
                normalized.append(item)
            run_ids = tuple(normalized)
        else:
            raise ManifestError(
                f"source_runs[{source_name}] must be a UUID string or non-empty UUID array"
            )

        parsed_ids: list[str] = []
        for run_id in run_ids:
            try:
                UUID(run_id)
            except ValueError as exc:
                raise ManifestError(
                    f"Invalid ingest run UUID for {source_name}: {run_id}"
                ) from exc
            parsed_ids.append(run_id)
        source_runs[source_name] = tuple(parsed_ids)

    required = BUILD_PROFILES[build_profile]
    missing = sorted(required - set(source_runs.keys()))
    if missing:
        raise ManifestError(
            "Bundle manifest missing required sources for profile "
            f"{build_profile}: {', '.join(missing)}"
        )

    for source_name in required:
        if len(source_runs.get(source_name, ())) == 0:
            raise ManifestError(
                f"Bundle manifest source_runs[{source_name}] must include at least one ingest run id"
            )

    return BuildBundleManifest(build_profile=build_profile, source_runs=source_runs, raw=payload)
diff --git a/pipeline/src/pipeline/util/hashing.py b/pipeline/src/pipeline/util/hashing.py
new file mode 100644
index 0000000..8f65db1
--- /dev/null
+++ b/pipeline/src/pipeline/util/hashing.py
@@ -0,0 +1,14 @@
"""Hashing helpers for deterministic ingest and canonical output checks."""

from __future__ import annotations

import hashlib
from pathlib import Path


def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 of a file, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
diff --git a/pipeline/src/pipeline/util/normalise.py
b/pipeline/src/pipeline/util/normalise.py new file mode 100644 index 0000000..85943ab --- /dev/null +++ b/pipeline/src/pipeline/util/normalise.py @@ -0,0 +1,75 @@ +"""Canonicalisation helpers for Pipeline V3.""" + +from __future__ import annotations + +import json +import re +import unicodedata +from functools import lru_cache +from pathlib import Path + +from pipeline.config import normalisation_config_path + + +def _load_json_config(path: Path) -> dict[str, object]: + return json.loads(path.read_text(encoding="utf-8")) + + +@lru_cache(maxsize=1) +def _alias_map() -> dict[str, str]: + config = _load_json_config(normalisation_config_path()) + raw_alias = config.get("alias_map", {}) + if not isinstance(raw_alias, dict): + return {} + output: dict[str, str] = {} + for key, value in raw_alias.items(): + if isinstance(key, str) and isinstance(value, str): + output[key.upper()] = value.upper() + return output + + +@lru_cache(maxsize=1) +def _strip_punctuation() -> str: + config = _load_json_config(normalisation_config_path()) + value = config.get("strip_punctuation", ".,'-") + if not isinstance(value, str): + return ".,'-" + return value + + +def postcode_norm(value: str | None) -> str | None: + if value is None: + return None + cleaned = re.sub(r"[^A-Za-z0-9]", "", value).upper() + if not cleaned: + return None + return cleaned + + +def postcode_display(value: str | None) -> str | None: + normalized = postcode_norm(value) + if normalized is None: + return None + if len(normalized) <= 3: + return normalized + return f"{normalized[:-3]} {normalized[-3:]}" + + +def street_casefold(value: str | None) -> str | None: + if value is None: + return None + + text = unicodedata.normalize("NFKC", value).strip().upper() + text = re.sub(r"\s+", " ", text) + strip_chars = _strip_punctuation() + if strip_chars: + text = text.translate(str.maketrans("", "", strip_chars)) + text = re.sub(r"\s+", " ", text).strip() + if not text: + return None + + alias_map = _alias_map() + tokens = 
[alias_map.get(token, token) for token in text.split(" ")] + canonical = " ".join(tokens).strip() + return canonical or None + diff --git a/scripts/obtain_phase1_e2e_sources.sh b/scripts/obtain_phase1_e2e_sources.sh new file mode 100755 index 0000000..e68f889 --- /dev/null +++ b/scripts/obtain_phase1_e2e_sources.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +"$SCRIPT_DIR/obtain_phase2_e2e_sources.sh" diff --git a/scripts/obtain_phase2_e2e_sources.sh b/scripts/obtain_phase2_e2e_sources.sh new file mode 100755 index 0000000..766999f --- /dev/null +++ b/scripts/obtain_phase2_e2e_sources.sh @@ -0,0 +1,169 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)" +SOURCE_DIR="$ROOT_DIR/data/source_files/e2e" +MANIFEST_DIR="$ROOT_DIR/data/manifests/e2e" +RELEASE_ID="2026-Q1-E2E-P2" + +mkdir -p "$SOURCE_DIR" "$MANIFEST_DIR" + +cat > "$SOURCE_DIR/onsud_sample.csv" <<'CSV' +ONS_UPRN,ONS_POSTCODE,PC_UNIT_E,PC_UNIT_N,OA_CODE +1001,SW1A 2AA,530268.167,179640.532,E001 +1002,SW1A 2AB,530343.656,179675.849,E002 +1003,SW1A 2AC,530236.700,179784.382,E003 +1004,SW1A 2AD,530165.000,179894.000,E004 +CSV + +cat > "$SOURCE_DIR/open_uprn_sample.csv" <<'CSV' +UPRN_REF,LAT,LON,EASTING,NORTHING,UPRN_STATUS +1001,51.5007,-0.1246,530268.167,179640.532,ACTIVE +1002,51.5010,-0.1235,530343.656,179675.849,ACTIVE +1003,51.5020,-0.1250,530236.700,179784.382,ACTIVE +1005,51.5030,-0.1260,530165.000,179894.000,ACTIVE +CSV + +cat > "$SOURCE_DIR/open_roads_sample.geojson" <<'GEOJSON' +{ + "type": "FeatureCollection", + "name": "open_roads_sample", + "features": [ + { + "type": "Feature", + "properties": { + "SRC_ID": 10, + "ROAD_NAME": "Parliament Street" + }, + "geometry": { + "type": "LineString", + "coordinates": [ + [-0.1253, 51.5001], + [-0.1232, 51.5014] + ] + } + }, + { + "type": "Feature", + "properties": { + "SRC_ID": 20, + "ROAD_NAME": "Bridge Street" + }, + "geometry": { + "type": "LineString", + 
"coordinates": [ + [-0.1244, 51.5016], + [-0.1231, 51.5024] + ] + } + }, + { + "type": "Feature", + "properties": { + "SRC_ID": 30, + "ROAD_NAME": "A40" + }, + "geometry": { + "type": "LineString", + "coordinates": [ + [-0.1262, 51.5018], + [-0.1248, 51.5025] + ] + } + } + ] +} +GEOJSON + +cat > "$SOURCE_DIR/open_names_sample.csv" <<'CSV' +ON_ID,NAME1,NAME1_LANG,NAME2,LOCAL_TYPE,GEOM_X,GEOM_Y,PC_DISTRICT +E1001,Parliament Street,,,Road,530280.000,179650.000,SW1A +E1002,Bridge Street,,,Street,530330.000,179690.000,SW1A +E1003,Western Avenue,,,Road,530225.000,179790.000,SW1A +E1004,Charing Cross,,,PopulatedPlace,530200.000,179700.000,SW1A +CSV + +onsud_sha="$(shasum -a 256 "$SOURCE_DIR/onsud_sample.csv" | awk '{print $1}')" +open_uprn_sha="$(shasum -a 256 "$SOURCE_DIR/open_uprn_sample.csv" | awk '{print $1}')" +open_roads_sha="$(shasum -a 256 "$SOURCE_DIR/open_roads_sample.geojson" | awk '{print $1}')" +open_names_sha="$(shasum -a 256 "$SOURCE_DIR/open_names_sample.csv" | awk '{print $1}')" + +cat > "$MANIFEST_DIR/onsud_manifest.json" < "$MANIFEST_DIR/open_uprn_manifest.json" < "$MANIFEST_DIR/open_roads_manifest.json" < "$MANIFEST_DIR/open_names_manifest.json" </dev/null + +python -m pipeline.cli --dsn "$DSN" db migrate + +./scripts/obtain_phase2_e2e_sources.sh + +python -m pipeline.cli --dsn "$DSN" ingest onsud --manifest data/manifests/e2e/onsud_manifest.json +python -m pipeline.cli --dsn "$DSN" ingest open-uprn --manifest data/manifests/e2e/open_uprn_manifest.json +python -m pipeline.cli --dsn "$DSN" ingest open-roads --manifest data/manifests/e2e/open_roads_manifest.json +python -m pipeline.cli --dsn "$DSN" ingest open-names --manifest data/manifests/e2e/open_names_manifest.json + +release_json="$(python -m pipeline.cli --dsn "$DSN" release create \ + --onsud-release "$RELEASE_ID" \ + --open-uprn-release "$RELEASE_ID" \ + --open-roads-release "$RELEASE_ID" \ + --open-names-release "$RELEASE_ID")" +release_set_id="$(python - <<'PY' "$release_json" +import json 
+import sys +print(json.loads(sys.argv[1])["release_set_id"]) +PY +)" + +run_one="$(python -m pipeline.cli --dsn "$DSN" run phase2-open-names --release-set-id "$release_set_id" --rebuild)" +run_one_id="$(python - <<'PY' "$run_one" +import json +import sys +print(json.loads(sys.argv[1])["run_id"]) +PY +)" + +run_two="$(python -m pipeline.cli --dsn "$DSN" run phase2-open-names --release-set-id "$release_set_id" --rebuild)" +run_two_id="$(python - <<'PY' "$run_two" +import json +import sys +print(json.loads(sys.argv[1])["run_id"]) +PY +)" + +python -m pipeline.cli --dsn "$DSN" release activate --release-set-id "$release_set_id" --actor "e2e-script" --ack-warnings + +psql "$DSN" -v ON_ERROR_STOP=1 < Date: Fri, 20 Feb 2026 23:27:34 +0000 Subject: [PATCH 03/17] test(pipeline): add v3 contract coverage and determinism checks --- docs/spec/pipeline_v3/canonicalisation.md | 48 ++++++++ docs/spec/pipeline_v3/data_model.md | 75 +++++++++++++ docs/spec/pipeline_v3/spec.md | 104 ++++++++++++++++++ tests/test_bundle_manifest_ppd_updates.py | 91 +++++++++++++++ ...test_candidate_immutability_db_contract.py | 21 ++++ tests/test_cli_v3_contract.py | 21 ++++ tests/test_docs_v3_contract.py | 20 ++++ tests/test_pass3_append_only_promotion.py | 24 ++++ tests/test_probability_exact_normalization.py | 25 +++++ tests/test_weight_config_contract.py | 38 +++++++ 10 files changed, 467 insertions(+) create mode 100644 docs/spec/pipeline_v3/canonicalisation.md create mode 100644 docs/spec/pipeline_v3/data_model.md create mode 100644 docs/spec/pipeline_v3/spec.md create mode 100644 tests/test_bundle_manifest_ppd_updates.py create mode 100644 tests/test_candidate_immutability_db_contract.py create mode 100644 tests/test_cli_v3_contract.py create mode 100644 tests/test_docs_v3_contract.py create mode 100644 tests/test_pass3_append_only_promotion.py create mode 100644 tests/test_probability_exact_normalization.py create mode 100644 tests/test_weight_config_contract.py diff --git 
a/docs/spec/pipeline_v3/canonicalisation.md b/docs/spec/pipeline_v3/canonicalisation.md new file mode 100644 index 0000000..abeae69 --- /dev/null +++ b/docs/spec/pipeline_v3/canonicalisation.md @@ -0,0 +1,48 @@ +# Pipeline V3 Canonicalisation and Determinism Rules + +## Postcode Normalisation + +1. Uppercase. +2. Remove non-alphanumeric characters. +3. Require minimum structure for UK postcode canonical form. +4. Store display form with single space before final three characters. + +## Street Name Normalisation + +1. Trim whitespace. +2. Unicode NFKC normalisation. +3. Uppercase canonical form. +4. Strip configured punctuation. +5. Collapse internal whitespace. +6. Apply configured token aliases deterministically. + +## Null and Empty Handling + +- Empty strings map to null. +- Null and empty-string duplicates are forbidden semantically. + +## Probability and Rounding + +- Base probability uses exact formula from the main spec. +- Store to fixed precision `numeric(6,4)`. +- Apply residual correction to deterministic rank 1 row per postcode. + +## Deterministic Ranking Keys + +Probability ranking (descending) uses: +1. unrounded probability desc +2. confidence rank desc (`high` > `medium` > `low` > `none`) +3. canonical street name `COLLATE "C"` asc +4. USRN asc (nulls last) + +## JSON and Array Ordering + +- `streets_json` is materialised with deterministic ordered aggregation. +- API projection source arrays are ordered lexicographically by: + - `source_name` + - `ingest_run_id` + - `candidate_type` + +## Timezone + +All metadata timestamps are UTC. diff --git a/docs/spec/pipeline_v3/data_model.md b/docs/spec/pipeline_v3/data_model.md new file mode 100644 index 0000000..1eaa059 --- /dev/null +++ b/docs/spec/pipeline_v3/data_model.md @@ -0,0 +1,75 @@ +# Pipeline V3 Data Model + +## Meta Layer + +### `meta.ingest_run` +Tracks ingest runs by source and release. + +### `meta.ingest_run_file` +Child rows for multi-file ingest provenance. 
+ +### `meta.build_bundle` +Deterministic source selection envelope by profile. + +### `meta.build_bundle_source` +Source-to-ingest-run links for each bundle. + +Bundle rule: +- most sources have exactly one ingest run per bundle +- `ppd` may have multiple ingest runs (baseline + yearly/monthly updates) + +### `meta.build_run` +Execution record for a bundle build. + +### `meta.build_pass_checkpoint` +Per-pass completion checkpoints. + +### `meta.canonical_hash` +Deterministic object hashes per build run. + +### `meta.dataset_publication` +Published dataset pointer log. + +## Raw and Stage Layers + +- `raw.*` holds immutable source snapshots. +- `stage.*` holds typed, normalised rows that build passes consume. + +## Core Layer + +- `core.postcodes` +- `core.postcodes_meta` +- `core.streets_usrn` + +## Derived Layer + +### `derived.postcode_street_candidates` +Append-only evidence table. + +Required contract: +- insert-only table +- no update/delete of evidence rows +- candidate rows include `source_name`, `ingest_run_id`, `produced_build_run_id` + +### `derived.postcode_street_candidate_lineage` +Promotion lineage mapping parent evidence rows to child evidence rows. + +### `derived.postcode_streets_final` +One row per final postcode-street record. + +### `derived.postcode_streets_final_candidate` +Relational link from final record to all contributing candidate rows. + +### `derived.postcode_streets_final_source` +Relational source summary by final record. + +## Internal Layer + +### `internal.unit_index` +Disambiguation-only table. Never exposed to API reader role. + +## API Projection Layer + +- `api.postcode_street_lookup__` +- `api.postcode_lookup__` +- stable views: `api.postcode_street_lookup`, `api.postcode_lookup` diff --git a/docs/spec/pipeline_v3/spec.md b/docs/spec/pipeline_v3/spec.md new file mode 100644 index 0000000..7c929c9 --- /dev/null +++ b/docs/spec/pipeline_v3/spec.md @@ -0,0 +1,104 @@ +# Pipeline V3 Specification + +## 1. 
Scope + +Pipeline V3 is a raw-first, deterministic, replayable build pipeline for postcode and street intelligence. + +Key properties: +- all source ingests are archived with file-level hashes +- all build outputs are reproducible from a bundle of ingest runs +- all derived records have relational provenance links +- API projections are versioned and published by atomic view switch + +## 2. Pass Sequence + +Build pass order is fixed: +1. `0a_raw_ingest` +2. `0b_stage_normalisation` +3. `1_onspd_backbone` +4. `2_gb_canonical_streets` +5. `3_open_names_candidates` +6. `4_uprn_reinforcement` +7. `5_gb_spatial_fallback` +8. `6_ni_candidates` +9. `7_ppd_gap_fill` +10. `8_finalisation` + +### 2.1 PPD Baseline + Updates Rule + +- The 4.2GB PPD full baseline is ingested once and retained. +- Subsequent yearly and monthly PPD update files are ingested as additional PPD runs. +- A build bundle may include multiple PPD ingest runs: + - one baseline run + - zero or more yearly/monthly update runs +- Stage normalisation applies PPD runs in deterministic ingest timestamp order. +- Non-PPD sources remain single-run-per-source within a bundle. +- Build profile naming keeps PPD independent from NI: + - `gb_core`: GB core only + - `gb_core_ppd`: GB core + PPD + - `core_ni`: GB core + NI (without PPD) + +## 3. Candidate Evidence Contract + +`derived.postcode_street_candidates` is an immutable evidence log. + +### 3.1 Pass 3 Promotion Semantics (Append-Only) + +- `names_postcode_feature` candidates are immutable evidence rows. +- TOID confirmation creates a new `oli_toid_usrn` candidate row. +- Promotion lineage is recorded in `derived.postcode_street_candidate_lineage`. +- Existing candidate rows are never updated for `candidate_type`, `confidence`, `usrn`, or `evidence_ref`. + +## 4. 
Confidence and Candidate Types + +Candidate type enum: +- `names_postcode_feature` +- `oli_toid_usrn` +- `uprn_usrn` +- `spatial_os_open_roads` +- `osni_gazetteer_direct` +- `spatial_dfi_highway` +- `ppd_parse_matched` +- `ppd_parse_unmatched` + +Confidence enum: +- `high` +- `medium` +- `low` +- `none` + +NI confidence cap: +- NI candidate types cannot exceed `medium` in this release. + +## 5. Frequency and Probability + +### 5.1 Probability Formula (Exact) + +- `weighted_score(postcode, street) = sum(candidate_weight for contributing candidates)`. +- `total_weight(postcode) = sum(weighted_score(postcode, *))`. +- `probability(postcode, street) = weighted_score(postcode, street) / total_weight(postcode)`. + +### 5.2 Storage Rule + +- Probabilities are rounded to fixed scale (`numeric(6,4)`). +- Deterministic residual correction is applied to rank 1 street so stored probabilities sum to exactly `1.0000` per postcode. +- Builds fail if `total_weight(postcode) <= 0` for any postcode with final rows. + +## 6. Publish Contract + +- Build writes versioned physical API tables only: + - `api.postcode_lookup__` + - `api.postcode_street_lookup__` +- Publish updates stable views in one transaction: + - `api.postcode_lookup` + - `api.postcode_street_lookup` +- Publication metadata is persisted transactionally. +- Publish rollback leaves previous published version untouched. + +## 7. Provenance + +Final outputs use relational provenance: +- `derived.postcode_streets_final_candidate` +- `derived.postcode_streets_final_source` + +Arrays and JSON payloads are projection-only conveniences in `api.*` tables/views. 
diff --git a/tests/test_bundle_manifest_ppd_updates.py b/tests/test_bundle_manifest_ppd_updates.py new file mode 100644 index 0000000..19d62b4 --- /dev/null +++ b/tests/test_bundle_manifest_ppd_updates.py @@ -0,0 +1,91 @@ +import json +import tempfile +import unittest +from pathlib import Path + +import sys + + +ROOT = Path(__file__).resolve().parents[1] +SRC = ROOT / "pipeline" / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +from pipeline.manifest import ManifestError, load_bundle_manifest # noqa: E402 + + +class BundleManifestPpdUpdatesTests(unittest.TestCase): + def _write_manifest(self, payload: dict) -> Path: + handle = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".json", delete=False) + try: + json.dump(payload, handle) + handle.flush() + return Path(handle.name) + finally: + handle.close() + + def test_bundle_allows_multiple_ppd_runs(self) -> None: + payload = { + "build_profile": "gb_core_ppd", + "source_runs": { + "onspd": "11111111-1111-1111-1111-111111111111", + "os_open_usrn": "22222222-2222-2222-2222-222222222222", + "os_open_names": "33333333-3333-3333-3333-333333333333", + "os_open_roads": "44444444-4444-4444-4444-444444444444", + "os_open_uprn": "55555555-5555-5555-5555-555555555555", + "os_open_linked_identifiers": "66666666-6666-6666-6666-666666666666", + "nsul": "77777777-7777-7777-7777-777777777777", + "ppd": [ + "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", + ], + }, + } + path = self._write_manifest(payload) + manifest = load_bundle_manifest(path) + self.assertEqual( + ( + "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", + ), + manifest.source_runs["ppd"], + ) + + def test_bundle_rejects_empty_source_run_list(self) -> None: + payload = { + "build_profile": "gb_core", + "source_runs": { + "onspd": [], + "os_open_usrn": "22222222-2222-2222-2222-222222222222", + "os_open_names": "33333333-3333-3333-3333-333333333333", + "os_open_roads": 
"44444444-4444-4444-4444-444444444444", + "os_open_uprn": "55555555-5555-5555-5555-555555555555", + "os_open_linked_identifiers": "66666666-6666-6666-6666-666666666666", + "nsul": "77777777-7777-7777-7777-777777777777", + }, + } + path = self._write_manifest(payload) + with self.assertRaises(ManifestError): + load_bundle_manifest(path) + + def test_gb_core_ppd_does_not_require_ni_sources(self) -> None: + payload = { + "build_profile": "gb_core_ppd", + "source_runs": { + "onspd": "11111111-1111-1111-1111-111111111111", + "os_open_usrn": "22222222-2222-2222-2222-222222222222", + "os_open_names": "33333333-3333-3333-3333-333333333333", + "os_open_roads": "44444444-4444-4444-4444-444444444444", + "os_open_uprn": "55555555-5555-5555-5555-555555555555", + "os_open_linked_identifiers": "66666666-6666-6666-6666-666666666666", + "nsul": "77777777-7777-7777-7777-777777777777", + "ppd": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + }, + } + path = self._write_manifest(payload) + manifest = load_bundle_manifest(path) + self.assertEqual(("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa",), manifest.source_runs["ppd"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_candidate_immutability_db_contract.py b/tests/test_candidate_immutability_db_contract.py new file mode 100644 index 0000000..68ddc36 --- /dev/null +++ b/tests/test_candidate_immutability_db_contract.py @@ -0,0 +1,21 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +MIGRATION = ROOT / "pipeline" / "sql" / "migrations" / "0005_v3_cutover_foundation.sql" + + +class CandidateImmutabilityDbContractTests(unittest.TestCase): + def test_candidate_trigger_rejects_update_and_delete(self) -> None: + text = MIGRATION.read_text(encoding="utf-8") + self.assertIn("CREATE OR REPLACE FUNCTION derived.reject_candidate_mutation", text) + self.assertIn("append-only", text) + self.assertIn("CREATE TRIGGER trg_candidate_no_update", text) + self.assertIn("BEFORE UPDATE ON 
derived.postcode_street_candidates", text) + self.assertIn("CREATE TRIGGER trg_candidate_no_delete", text) + self.assertIn("BEFORE DELETE ON derived.postcode_street_candidates", text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_cli_v3_contract.py b/tests/test_cli_v3_contract.py new file mode 100644 index 0000000..fc88283 --- /dev/null +++ b/tests/test_cli_v3_contract.py @@ -0,0 +1,21 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +CLI = ROOT / "pipeline" / "src" / "pipeline" / "cli.py" + + +class CliV3ContractTests(unittest.TestCase): + def test_cli_has_v3_commands(self) -> None: + text = CLI.read_text(encoding="utf-8") + self.assertIn('add_parser("bundle"', text) + self.assertIn('add_parser("build"', text) + self.assertIn('add_parser("source"', text) + self.assertIn('add_parser("run"', text) + self.assertIn('add_parser("verify"', text) + self.assertIn('add_parser("publish"', text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_docs_v3_contract.py b/tests/test_docs_v3_contract.py new file mode 100644 index 0000000..d713273 --- /dev/null +++ b/tests/test_docs_v3_contract.py @@ -0,0 +1,20 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +SPEC = ROOT / "docs" / "spec" / "pipeline_v3" / "spec.md" + + +class DocsV3ContractTests(unittest.TestCase): + def test_spec_locks_append_only_promotion_and_exact_probability(self) -> None: + text = SPEC.read_text(encoding="utf-8") + self.assertIn("Pass 3 Promotion Semantics (Append-Only)", text) + self.assertIn("immutable evidence rows", text) + self.assertIn("Probability Formula (Exact)", text) + self.assertIn("probability(postcode, street) = weighted_score(postcode, street) / total_weight(postcode)", text) + self.assertNotIn("~1.0", text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_pass3_append_only_promotion.py b/tests/test_pass3_append_only_promotion.py 
new file mode 100644 index 0000000..e127dfa --- /dev/null +++ b/tests/test_pass3_append_only_promotion.py @@ -0,0 +1,24 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +WORKFLOWS = ROOT / "pipeline" / "src" / "pipeline" / "build" / "workflows.py" + + +class Pass3AppendOnlyPromotionTests(unittest.TestCase): + def test_pass3_inserts_promoted_rows_and_lineage(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn("def _pass_3_open_names_candidates", text) + self.assertIn("INSERT INTO derived.postcode_street_candidates", text) + self.assertIn("'oli_toid_usrn'", text) + self.assertIn("INSERT INTO derived.postcode_street_candidate_lineage", text) + self.assertIn("promotion_toid_usrn", text) + + def test_pass3_does_not_update_candidate_type(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertNotIn("UPDATE derived.postcode_street_candidates", text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_probability_exact_normalization.py b/tests/test_probability_exact_normalization.py new file mode 100644 index 0000000..97c14a2 --- /dev/null +++ b/tests/test_probability_exact_normalization.py @@ -0,0 +1,25 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +WORKFLOWS = ROOT / "pipeline" / "src" / "pipeline" / "build" / "workflows.py" + + +class ProbabilityExactNormalizationTests(unittest.TestCase): + def test_probability_uses_explicit_denominator(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn("(g.weighted_score / t.total_weight) AS raw_probability", text) + + def test_probability_residual_correction_applied_to_rank_one(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn("WHEN rn = 1", text) + self.assertIn("(1.0000 - rounded_sum)", text) + + def test_verify_requires_exact_sum_one(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn("HAVING 
SUM(probability)::numeric(10,4) <> 1.0000", text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_weight_config_contract.py b/tests/test_weight_config_contract.py new file mode 100644 index 0000000..8d3e1a4 --- /dev/null +++ b/tests/test_weight_config_contract.py @@ -0,0 +1,38 @@ +import ast +import json +import unittest +from decimal import Decimal +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +WORKFLOWS = ROOT / "pipeline" / "src" / "pipeline" / "build" / "workflows.py" + + +def _candidate_types_from_workflows() -> tuple[str, ...]: + text = WORKFLOWS.read_text(encoding="utf-8") + tree = ast.parse(text) + for node in tree.body: + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name) and target.id == "CANDIDATE_TYPES": + value = ast.literal_eval(node.value) + if isinstance(value, tuple): + return tuple(str(item) for item in value) + raise AssertionError("CANDIDATE_TYPES constant not found in workflows.py") + + +class WeightConfigContractTests(unittest.TestCase): + def test_weight_config_has_all_candidate_types_with_positive_values(self) -> None: + candidate_types = _candidate_types_from_workflows() + config_path = ROOT / "pipeline" / "config" / "frequency_weights.yaml" + payload = json.loads(config_path.read_text(encoding="utf-8")) + weights = payload["weights"] + + self.assertEqual(set(candidate_types), set(weights.keys())) + for candidate_type in candidate_types: + self.assertGreater(Decimal(str(weights[candidate_type])), Decimal("0")) + + +if __name__ == "__main__": + unittest.main() From ddb615356fb78a3ac1ad1bc50026b301ea2fe144 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Fri, 20 Feb 2026 23:27:49 +0000 Subject: [PATCH 04/17] chore(repo): ignore local datasets and python build artifacts --- .gitignore | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 
0000000..e45accc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+/.idea/
+/.DS
+.DS_Store
+**/.DS_Store
+
+# Python caches and local build artifacts
+__pycache__/
+*.py[cod]
+*.egg-info/
+.pytest_cache/
+
+# Local datasets and generated source extracts
+/data/source_files/real/
+/data/source_files/e2e/
+/data/source_files/v3_smoke/

From c3f40e6491d63736901c10202aaa4957aa188f18 Mon Sep 17 00:00:00 2001
From: Jamie Thompson
Date: Sat, 21 Feb 2026 00:13:36 +0000
Subject: [PATCH 05/17] feat(pipeline): rename LIDS source and harden stage
 normalization for real-scale builds

- rename source contract from os_open_linked_identifiers to os_open_lids
  across manifests, build profiles, ingest/build mappings, and tests
- complete raw table naming migration to raw.os_open_lids_row with
  compatibility migrations for existing databases
- add id_1/id_2 + relation_type staging for LIDS links via
  stage.oli_identifier_pair
- refactor pass 0a to use ingest metadata row counts instead of expensive
  raw COUNT scans
- replace fetchall-based stage loaders with streamed/batched ingestion to
  avoid multi-GB memory spikes on real datasets
- add resilient field resolution aliases so smoke fixtures and real source
  schemas both validate deterministically
- extend canonical street derivation to infer USRN street names from Open
  Names + LIDS TOID mappings when direct USRN names are absent

Smoke build validation: bundle 183340b9-b9d8-4667-bcbf-d5f7f953381f rebuilt
successfully after these changes.
--- .../real_v3/gb_core_bundle_manifest.json | 12 + ...nifest.json => os_open_lids_manifest.json} | 2 +- .../v3_smoke/gb_core_bundle_manifest.json | 2 +- ...nifest.json => os_open_lids_manifest.json} | 6 +- pipeline/config/source_schema.yaml | 65 +- .../migrations/0005_v3_cutover_foundation.sql | 10 +- .../0007_v3_rename_os_open_lids.sql | 71 ++ .../0008_v3_lids_relation_stage.sql | 16 + .../0009_v3_rename_raw_os_open_lids_table.sql | 24 + pipeline/src/pipeline/build/workflows.py | 1024 +++++++++++------ pipeline/src/pipeline/ingest/workflows.py | 2 +- pipeline/src/pipeline/manifest.py | 8 +- tests/test_bundle_manifest_ppd_updates.py | 6 +- 13 files changed, 839 insertions(+), 409 deletions(-) create mode 100644 data/manifests/real_v3/gb_core_bundle_manifest.json rename data/manifests/real_v3/{os_open_linked_identifiers_manifest.json => os_open_lids_manifest.json} (95%) rename data/manifests/v3_smoke/{os_open_linked_identifiers_manifest.json => os_open_lids_manifest.json} (76%) create mode 100644 pipeline/sql/migrations/0007_v3_rename_os_open_lids.sql create mode 100644 pipeline/sql/migrations/0008_v3_lids_relation_stage.sql create mode 100644 pipeline/sql/migrations/0009_v3_rename_raw_os_open_lids_table.sql diff --git a/data/manifests/real_v3/gb_core_bundle_manifest.json b/data/manifests/real_v3/gb_core_bundle_manifest.json new file mode 100644 index 0000000..a47a415 --- /dev/null +++ b/data/manifests/real_v3/gb_core_bundle_manifest.json @@ -0,0 +1,12 @@ +{ + "build_profile": "gb_core", + "source_runs": { + "onspd": "2b9a865a-9579-4bad-8e91-2a84a8796d47", + "os_open_usrn": "a72298d8-3681-4cdb-8c5c-93d220f270a2", + "os_open_names": "371385d1-149b-4e70-a8e0-b2e0cccfc4b2", + "os_open_roads": "7e013cc2-57b8-4b4c-8679-c45ba52a40bd", + "os_open_uprn": "c6d801dc-591c-421d-a88e-a9bbc19353eb", + "os_open_lids": "0a2cbe07-11af-419d-9f24-5703d9f1faa7", + "nsul": "1b333010-45aa-47bc-a0ba-72646181a153" + } +} diff --git 
a/data/manifests/real_v3/os_open_linked_identifiers_manifest.json b/data/manifests/real_v3/os_open_lids_manifest.json similarity index 95% rename from data/manifests/real_v3/os_open_linked_identifiers_manifest.json rename to data/manifests/real_v3/os_open_lids_manifest.json index dc25907..3959a6a 100644 --- a/data/manifests/real_v3/os_open_linked_identifiers_manifest.json +++ b/data/manifests/real_v3/os_open_lids_manifest.json @@ -1,5 +1,5 @@ { - "source_name": "os_open_linked_identifiers", + "source_name": "os_open_lids", "source_version": "lids_2026_02", "retrieved_at_utc": "2026-02-20T22:41:35Z", "source_url": "https://api.os.uk/downloads/v1/products/LIDS/downloads?area=GB&format=CSV", diff --git a/data/manifests/v3_smoke/gb_core_bundle_manifest.json b/data/manifests/v3_smoke/gb_core_bundle_manifest.json index e975cc5..d9d9c10 100644 --- a/data/manifests/v3_smoke/gb_core_bundle_manifest.json +++ b/data/manifests/v3_smoke/gb_core_bundle_manifest.json @@ -6,7 +6,7 @@ "os_open_names": "420f6591-24ba-42b8-8a13-3658c9ef0c02", "os_open_roads": "95119ff1-33cc-4341-b21a-97df73853ac5", "os_open_uprn": "4a2bef4c-9adc-4427-a11e-e65104b7e86a", - "os_open_linked_identifiers": "6ccc48d1-80e4-4336-a985-7c720781c9fb", + "os_open_lids": "6ccc48d1-80e4-4336-a985-7c720781c9fb", "nsul": "0f24c6e9-ba4c-4d63-baf0-388bbda197b6" } } diff --git a/data/manifests/v3_smoke/os_open_linked_identifiers_manifest.json b/data/manifests/v3_smoke/os_open_lids_manifest.json similarity index 76% rename from data/manifests/v3_smoke/os_open_linked_identifiers_manifest.json rename to data/manifests/v3_smoke/os_open_lids_manifest.json index f1c0ae1..fc56cda 100644 --- a/data/manifests/v3_smoke/os_open_linked_identifiers_manifest.json +++ b/data/manifests/v3_smoke/os_open_lids_manifest.json @@ -1,14 +1,14 @@ { - "source_name": "os_open_linked_identifiers", + "source_name": "os_open_lids", "source_version": "v3_smoke_2026_02_20", "retrieved_at_utc": "2026-02-20T22:19:05Z", - "source_url": 
"local://v3_smoke/os_open_linked_identifiers", + "source_url": "local://v3_smoke/os_open_lids", "processing_git_sha": "ce2b72877a34457e9e02ffcafcd26d02bac92f03", "notes": "Synthetic smoke dataset for V3 ingest/build validation", "files": [ { "file_role": "primary", - "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_linked_identifiers.csv", + "file_path": "/Users/jamie/code/postcod.es/data/source_files/v3_smoke/os_open_lids.csv", "sha256": "bc31799d901c014741b671578b26344cc4b7008e5264926eac0a667de1eaa78f", "size_bytes": 84, "format": "csv", diff --git a/pipeline/config/source_schema.yaml b/pipeline/config/source_schema.yaml index b0477e0..2ba72b4 100644 --- a/pipeline/config/source_schema.yaml +++ b/pipeline/config/source_schema.yaml @@ -3,79 +3,66 @@ "onspd": { "required_fields": [ "postcode", - "status", "lat", "lon", "easting", "northing", - "country_iso2", - "country_iso3", - "subdivision_code", - "post_town", - "locality" + "subdivision_code" ], "field_map": { - "postcode": "postcode", - "status": "status", + "postcode": "pcds", + "status": "doterm", "lat": "lat", - "lon": "lon", - "easting": "easting", - "northing": "northing", - "country_iso2": "country_iso2", - "country_iso3": "country_iso3", - "subdivision_code": "subdivision_code", - "post_town": "post_town", - "locality": "locality" + "lon": "long", + "easting": "east1m", + "northing": "north1m", + "country_iso2": "ctry25cd", + "country_iso3": "ctry25cd", + "subdivision_code": "ctry25cd" } }, "os_open_usrn": { - "required_fields": ["usrn", "street_name"], + "required_fields": ["usrn"], "field_map": { "usrn": "usrn", - "street_name": "street_name", - "street_class": "street_class", - "street_status": "street_status" + "street_class": "street_type" } }, "os_open_names": { - "required_fields": ["feature_id", "street_name", "postcode"], + "required_fields": ["feature_id", "street_name"], "field_map": { - "feature_id": "feature_id", - "toid": "toid", - "street_name": "street_name", 
- "postcode": "postcode" + "feature_id": "ID", + "toid": "RELATED_SPATIAL_OBJECT", + "street_name": "NAME1", + "local_type": "LOCAL_TYPE" } }, "os_open_roads": { "required_fields": ["segment_id", "road_name"], "field_map": { - "segment_id": "segment_id", - "road_id": "road_id", - "usrn": "usrn", - "postcode": "postcode", - "road_name": "road_name" + "segment_id": "id", + "road_id": "road_name_toid", + "road_name": "name_1" } }, "os_open_uprn": { "required_fields": ["uprn"], "field_map": { - "uprn": "uprn", - "postcode": "postcode" + "uprn": "UPRN" } }, - "os_open_linked_identifiers": { - "required_fields": ["relation_type", "left_id", "right_id"], + "os_open_lids": { + "required_fields": ["id_1", "id_2"], "field_map": { - "relation_type": "relation_type", - "left_id": "left_id", - "right_id": "right_id" + "id_1": "IDENTIFIER_1", + "id_2": "IDENTIFIER_2" } }, "nsul": { "required_fields": ["uprn", "postcode"], "field_map": { - "uprn": "uprn", - "postcode": "postcode" + "uprn": "UPRN", + "postcode": "PCDS" } }, "osni_gazetteer": { diff --git a/pipeline/sql/migrations/0005_v3_cutover_foundation.sql b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql index c5bcc35..4bc7808 100644 --- a/pipeline/sql/migrations/0005_v3_cutover_foundation.sql +++ b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql @@ -43,7 +43,7 @@ CREATE TABLE IF NOT EXISTS meta.ingest_run ( 'os_open_names', 'os_open_roads', 'os_open_uprn', - 'os_open_linked_identifiers', + 'os_open_lids', 'nsul', 'osni_gazetteer', 'dfi_highway', @@ -99,7 +99,7 @@ CREATE TABLE IF NOT EXISTS meta.build_bundle_source ( 'os_open_names', 'os_open_roads', 'os_open_uprn', - 'os_open_linked_identifiers', + 'os_open_lids', 'nsul', 'osni_gazetteer', 'dfi_highway', @@ -185,7 +185,7 @@ CREATE TABLE IF NOT EXISTS raw.os_open_usrn_row (LIKE raw.onspd_row INCLUDING AL CREATE TABLE IF NOT EXISTS raw.os_open_names_row (LIKE raw.onspd_row INCLUDING ALL); CREATE TABLE IF NOT EXISTS raw.os_open_roads_row (LIKE raw.onspd_row 
INCLUDING ALL); CREATE TABLE IF NOT EXISTS raw.os_open_uprn_row (LIKE raw.onspd_row INCLUDING ALL); -CREATE TABLE IF NOT EXISTS raw.os_open_linked_identifiers_row (LIKE raw.onspd_row INCLUDING ALL); +CREATE TABLE IF NOT EXISTS raw.os_open_lids_row (LIKE raw.onspd_row INCLUDING ALL); CREATE TABLE IF NOT EXISTS raw.nsul_row (LIKE raw.onspd_row INCLUDING ALL); CREATE TABLE IF NOT EXISTS raw.osni_gazetteer_row (LIKE raw.onspd_row INCLUDING ALL); CREATE TABLE IF NOT EXISTS raw.dfi_highway_row (LIKE raw.onspd_row INCLUDING ALL); @@ -196,7 +196,7 @@ CREATE INDEX IF NOT EXISTS idx_raw_os_open_usrn_run_id ON raw.os_open_usrn_row ( CREATE INDEX IF NOT EXISTS idx_raw_os_open_names_run_id ON raw.os_open_names_row (ingest_run_id); CREATE INDEX IF NOT EXISTS idx_raw_os_open_roads_run_id ON raw.os_open_roads_row (ingest_run_id); CREATE INDEX IF NOT EXISTS idx_raw_os_open_uprn_run_id ON raw.os_open_uprn_row (ingest_run_id); -CREATE INDEX IF NOT EXISTS idx_raw_oli_run_id ON raw.os_open_linked_identifiers_row (ingest_run_id); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_lids_run_id ON raw.os_open_lids_row (ingest_run_id); CREATE INDEX IF NOT EXISTS idx_raw_nsul_run_id ON raw.nsul_row (ingest_run_id); CREATE INDEX IF NOT EXISTS idx_raw_osni_run_id ON raw.osni_gazetteer_row (ingest_run_id); CREATE INDEX IF NOT EXISTS idx_raw_dfi_run_id ON raw.dfi_highway_row (ingest_run_id); @@ -401,7 +401,7 @@ CREATE TABLE IF NOT EXISTS derived.postcode_street_candidates ( 'os_open_names', 'os_open_roads', 'os_open_uprn', - 'os_open_linked_identifiers', + 'os_open_lids', 'nsul', 'osni_gazetteer', 'dfi_highway', diff --git a/pipeline/sql/migrations/0007_v3_rename_os_open_lids.sql b/pipeline/sql/migrations/0007_v3_rename_os_open_lids.sql new file mode 100644 index 0000000..20df5c2 --- /dev/null +++ b/pipeline/sql/migrations/0007_v3_rename_os_open_lids.sql @@ -0,0 +1,71 @@ +BEGIN; + +ALTER TABLE meta.ingest_run + DROP CONSTRAINT IF EXISTS ingest_run_source_name_check; + +ALTER TABLE 
meta.build_bundle_source + DROP CONSTRAINT IF EXISTS build_bundle_source_source_name_check; + +ALTER TABLE derived.postcode_street_candidates + DROP CONSTRAINT IF EXISTS postcode_street_candidates_source_name_check; + +UPDATE meta.ingest_run +SET source_name = 'os_open_lids' +WHERE source_name = 'os_open_linked_identifiers'; + +UPDATE meta.build_bundle_source +SET source_name = 'os_open_lids' +WHERE source_name = 'os_open_linked_identifiers'; + +ALTER TABLE derived.postcode_street_candidates DISABLE TRIGGER ALL; +UPDATE derived.postcode_street_candidates +SET source_name = 'os_open_lids' +WHERE source_name = 'os_open_linked_identifiers'; +ALTER TABLE derived.postcode_street_candidates ENABLE TRIGGER ALL; + +ALTER TABLE meta.ingest_run + ADD CONSTRAINT ingest_run_source_name_check + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )); + +ALTER TABLE meta.build_bundle_source + ADD CONSTRAINT build_bundle_source_source_name_check + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )); + +ALTER TABLE derived.postcode_street_candidates + ADD CONSTRAINT postcode_street_candidates_source_name_check + CHECK (source_name IN ( + 'onspd', + 'os_open_usrn', + 'os_open_names', + 'os_open_roads', + 'os_open_uprn', + 'os_open_lids', + 'nsul', + 'osni_gazetteer', + 'dfi_highway', + 'ppd' + )); + +COMMIT; diff --git a/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql b/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql new file mode 100644 index 0000000..cee935f --- /dev/null +++ b/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql @@ -0,0 +1,16 @@ +BEGIN; + +CREATE TABLE IF NOT EXISTS stage.oli_identifier_pair ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + 
id_1 text NOT NULL, + id_2 text NOT NULL, + relation_type text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, id_1, id_2, relation_type), + CHECK (relation_type IN ('toid_usrn', 'uprn_usrn')) +); + +CREATE INDEX IF NOT EXISTS idx_stage_oli_identifier_pair_run_relation + ON stage.oli_identifier_pair (build_run_id, relation_type); + +COMMIT; diff --git a/pipeline/sql/migrations/0009_v3_rename_raw_os_open_lids_table.sql b/pipeline/sql/migrations/0009_v3_rename_raw_os_open_lids_table.sql new file mode 100644 index 0000000..be5cd6b --- /dev/null +++ b/pipeline/sql/migrations/0009_v3_rename_raw_os_open_lids_table.sql @@ -0,0 +1,24 @@ +BEGIN; + +DO $$ +BEGIN + IF to_regclass('raw.os_open_linked_identifiers_row') IS NOT NULL + AND to_regclass('raw.os_open_lids_row') IS NULL THEN + ALTER TABLE raw.os_open_linked_identifiers_row + RENAME TO os_open_lids_row; + END IF; +END $$; + +DO $$ +BEGIN + IF to_regclass('raw.idx_raw_oli_run_id') IS NOT NULL + AND to_regclass('raw.idx_raw_os_open_lids_run_id') IS NULL THEN + ALTER INDEX raw.idx_raw_oli_run_id + RENAME TO idx_raw_os_open_lids_run_id; + END IF; +END $$; + +CREATE INDEX IF NOT EXISTS idx_raw_os_open_lids_run_id + ON raw.os_open_lids_row (ingest_run_id); + +COMMIT; diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py index 9385f9f..39e1d94 100644 --- a/pipeline/src/pipeline/build/workflows.py +++ b/pipeline/src/pipeline/build/workflows.py @@ -75,7 +75,7 @@ class PublishResult: "os_open_names": "raw.os_open_names_row", "os_open_roads": "raw.os_open_roads_row", "os_open_uprn": "raw.os_open_uprn_row", - "os_open_linked_identifiers": "raw.os_open_linked_identifiers_row", + "os_open_lids": "raw.os_open_lids_row", "nsul": "raw.nsul_row", "osni_gazetteer": "raw.osni_gazetteer_row", "dfi_highway": "raw.dfi_highway_row", @@ -458,9 +458,23 @@ def _mark_build_built(conn: psycopg.Connection, bundle_id: str, build_run_id: st ) -def 
_load_raw_rows(conn: psycopg.Connection, raw_table: str, ingest_run_id: str) -> list[dict[str, Any]]: +RAW_FETCH_BATCH_SIZE = 5000 +STAGE_INSERT_BATCH_SIZE = 5000 + + +def _iter_validated_raw_rows( + conn: psycopg.Connection, + *, + source_name: str, + raw_table: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +): schema_name, table_name = raw_table.split(".", 1) - with conn.cursor() as cur: + cursor_name = f"stage_raw_{table_name}_{uuid.uuid4().hex[:8]}" + with conn.cursor(name=cursor_name) as cur: + cur.itersize = RAW_FETCH_BATCH_SIZE cur.execute( sql.SQL( """ @@ -472,7 +486,25 @@ def _load_raw_rows(conn: psycopg.Connection, raw_table: str, ingest_run_id: str) ).format(sql.Identifier(schema_name), sql.Identifier(table_name)), (ingest_run_id,), ) - return [row[0] for row in cur.fetchall()] + first = cur.fetchone() + if first is None: + raise BuildError(f"Raw source is empty for {source_name}; cannot stage-normalise") + + first_row = first[0] + _assert_required_mapped_fields_present( + source_name=source_name, + sample_row=first_row, + field_map=field_map, + required_fields=required_fields, + ) + yield first_row + + while True: + chunk = cur.fetchmany(RAW_FETCH_BATCH_SIZE) + if not chunk: + break + for row in chunk: + yield row[0] def _mapped_fields_for_source(schema_config: dict[str, Any], source_name: str) -> tuple[dict[str, str], tuple[str, ...]]: @@ -512,19 +544,16 @@ def _mapped_fields_for_source(schema_config: dict[str, Any], source_name: str) - def _assert_required_mapped_fields_present( source_name: str, - rows: list[dict[str, Any]], + sample_row: dict[str, Any], field_map: dict[str, str], required_fields: tuple[str, ...], ) -> None: - if not rows: - raise BuildError(f"Raw source is empty for {source_name}; cannot stage-normalise") - - sample_keys = set(rows[0].keys()) + sample_keys = set(sample_row.keys()) missing = [] for key in required_fields: - mapped = field_map[key] - if mapped not in sample_keys: - 
missing.append(mapped) + candidates = _field_name_candidates(field_map, key) + if not any(candidate in sample_keys for candidate in candidates): + missing.append("/".join(candidates)) if missing: raise BuildError( f"Schema mapping unresolved for {source_name}; missing mapped fields in raw rows: " @@ -532,6 +561,45 @@ def _assert_required_mapped_fields_present( ) +def _field_name_candidates(field_map: dict[str, str], logical_key: str) -> tuple[str, ...]: + names: list[str] = [] + mapped = field_map.get(logical_key) + if mapped: + names.append(mapped) + names.append(logical_key) + legacy_aliases = { + "id_1": ("identifier_1", "left_id"), + "id_2": ("identifier_2", "right_id"), + "identifier_1": ("id_1", "left_id"), + "identifier_2": ("id_2", "right_id"), + "left_id": ("id_1", "identifier_1"), + "right_id": ("id_2", "identifier_2"), + } + aliases = legacy_aliases.get(logical_key, ()) + names.extend(aliases) + + expanded: list[str] = [] + for name in names: + expanded.append(name) + expanded.append(name.lower()) + expanded.append(name.upper()) + + deduped: list[str] = [] + seen: set[str] = set() + for name in expanded: + if name not in seen: + deduped.append(name) + seen.add(name) + return tuple(deduped) + + +def _field_value(row: dict[str, Any], field_map: dict[str, str], logical_key: str) -> Any: + for candidate in _field_name_candidates(field_map, logical_key): + if candidate in row: + return row.get(candidate) + return None + + def _schema_insert_rows( conn: psycopg.Connection, query: sql.SQL, @@ -544,6 +612,18 @@ def _schema_insert_rows( return len(rows) +def _flush_stage_batch( + conn: psycopg.Connection, + query: sql.SQL, + payload: list[tuple[Any, ...]], +) -> int: + if not payload: + return 0 + inserted = _schema_insert_rows(conn, query, payload) + payload.clear() + return inserted + + def _stage_cleanup(conn: psycopg.Connection, build_run_id: str) -> None: tables = ( "stage.ppd_parsed_address", @@ -552,6 +632,7 @@ def _stage_cleanup(conn: psycopg.Connection, 
build_run_id: str) -> None: "stage.nsul_uprn_postcode", "stage.oli_uprn_usrn", "stage.oli_toid_usrn", + "stage.oli_identifier_pair", "stage.uprn_point", "stage.open_roads_segment", "stage.open_names_road_feature", @@ -575,24 +656,35 @@ def _pass_0a_raw_ingest( build_run_id: str, source_runs: dict[str, tuple[str, ...]], ) -> dict[str, int]: + del build_run_id # Pass 0a validates bundle/run metadata only. counts: dict[str, int] = {} with conn.cursor() as cur: for source_name, run_ids in sorted(source_runs.items()): - raw_table = RAW_TABLE_BY_SOURCE[source_name] - schema_name, table_name = raw_table.split(".", 1) total_row_count = 0 for ingest_run_id in run_ids: cur.execute( - sql.SQL("SELECT COUNT(*) FROM {}.{} WHERE ingest_run_id = %s").format( - sql.Identifier(schema_name), - sql.Identifier(table_name), - ), + """ + SELECT source_name, record_count + FROM meta.ingest_run + WHERE run_id = %s + """, (ingest_run_id,), ) - row_count = int(cur.fetchone()[0]) + row = cur.fetchone() + if row is None: + raise BuildError( + f"Pass 0a failed: ingest run missing in metadata source={source_name} run={ingest_run_id}" + ) + row_source_name, record_count = row + if row_source_name != source_name: + raise BuildError( + "Pass 0a failed: ingest run/source mismatch " + f"bundle_source={source_name} run_source={row_source_name} run={ingest_run_id}" + ) + row_count = int(record_count or 0) if row_count <= 0: raise BuildError( - "Pass 0a failed: source has no raw rows for " + "Pass 0a failed: source has no recorded rows for " f"source={source_name} run={ingest_run_id}" ) total_row_count += row_count @@ -608,6 +700,31 @@ def _country_enrichment_available(country_iso2: str, subdivision_code: str | Non return False +def _onspd_country_mapping(value: str | None) -> tuple[str, str, str | None]: + code = (value or "").strip().upper() + mapping = { + "E92000001": ("GB", "GBR", "GB-ENG"), + "S92000003": ("GB", "GBR", "GB-SCT"), + "W92000004": ("GB", "GBR", "GB-WLS"), + "N92000002": ("GB", 
"GBR", "GB-NIR"), + } + if code in mapping: + return mapping[code] + if code in {"GB", "GBR"}: + return "GB", "GBR", None + return "GB", "GBR", None + + +def _normalise_onspd_status(value: str | None) -> str: + raw = (value or "").strip() + if raw == "": + return "active" + lowered = raw.lower() + if lowered in {"active", "terminated"}: + return lowered + return "terminated" + + def _populate_stage_onspd( conn: psycopg.Connection, build_run_id: str, @@ -615,32 +732,59 @@ def _populate_stage_onspd( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.onspd_row", ingest_run_id) - _assert_required_mapped_fields_present("onspd", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.onspd_postcode ( + build_run_id, + postcode_norm, + postcode_display, + status, + lat, + lon, + easting, + northing, + country_iso2, + country_iso3, + subdivision_code, + post_town, + locality, + street_enrichment_available, + onspd_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - postcode_raw = row.get(field_map["postcode"]) + inserted = 0 + for row in _iter_validated_raw_rows( + conn, + source_name="onspd", + raw_table="raw.onspd_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + postcode_raw = _field_value(row, field_map, "postcode") postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) postcode_d = postcode_display(str(postcode_raw) if postcode_raw is not None else None) if postcode_n is None or postcode_d is None: continue - status_raw = row.get(field_map["status"]) - status = (str(status_raw).strip().lower() if status_raw is not None else "active") or "active" + status_key = field_map.get("status") + status = _normalise_onspd_status( + str(row.get(status_key)) if status_key and row.get(status_key) is not None else None + ) - 
country_iso2 = str(row.get(field_map["country_iso2"], "")).strip().upper() - country_iso3 = str(row.get(field_map["country_iso3"], "")).strip().upper() - subdivision_code_raw = row.get(field_map["subdivision_code"]) - subdivision_code = ( - str(subdivision_code_raw).strip().upper() if subdivision_code_raw is not None else None + country_key = field_map.get("subdivision_code") or field_map.get("country_iso2") + mapped_country_value = ( + str(row.get(country_key)) if country_key and row.get(country_key) is not None else None ) - subdivision_code = subdivision_code or None + country_iso2, country_iso3, subdivision_code = _onspd_country_mapping(mapped_country_value) - lat_raw = row.get(field_map["lat"]) - lon_raw = row.get(field_map["lon"]) - easting_raw = row.get(field_map["easting"]) - northing_raw = row.get(field_map["northing"]) + lat_raw = _field_value(row, field_map, "lat") + lon_raw = _field_value(row, field_map, "lon") + easting_raw = _field_value(row, field_map, "easting") + northing_raw = _field_value(row, field_map, "northing") lat: Decimal | None lon: Decimal | None @@ -658,8 +802,10 @@ def _populate_stage_onspd( easting = None northing = None - post_town_raw = row.get(field_map["post_town"]) - locality_raw = row.get(field_map["locality"]) + post_town_key = field_map.get("post_town") + locality_key = field_map.get("locality") + post_town_raw = row.get(post_town_key) if post_town_key else None + locality_raw = row.get(locality_key) if locality_key else None payload.append( ( @@ -680,32 +826,11 @@ def _populate_stage_onspd( ingest_run_id, ) ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.onspd_postcode ( - build_run_id, - postcode_norm, - postcode_display, - status, - lat, - lon, - easting, - northing, - country_iso2, - country_iso3, - subdivision_code, - post_town, - locality, - street_enrichment_available, - onspd_run_id - ) 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def _populate_stage_usrn( @@ -715,13 +840,42 @@ def _populate_stage_usrn( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.os_open_usrn_row", ingest_run_id) - _assert_required_mapped_fields_present("os_open_usrn", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.streets_usrn_input ( + build_run_id, + usrn, + street_name, + street_name_casefolded, + street_class, + street_status, + usrn_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, usrn) + DO UPDATE SET + street_name = EXCLUDED.street_name, + street_name_casefolded = EXCLUDED.street_name_casefolded, + street_class = EXCLUDED.street_class, + street_status = EXCLUDED.street_status, + usrn_run_id = EXCLUDED.usrn_run_id + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - usrn_raw = row.get(field_map["usrn"]) - name_raw = row.get(field_map["street_name"]) + inserted = 0 + street_name_key = field_map.get("street_name") + street_class_key = field_map.get("street_class") + street_status_key = field_map.get("street_status") + for row in _iter_validated_raw_rows( + conn, + source_name="os_open_usrn", + raw_table="raw.os_open_usrn_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + usrn_raw = _field_value(row, field_map, "usrn") + name_raw = row.get(street_name_key) if street_name_key else None if usrn_raw in (None, "") or name_raw in (None, ""): continue try: @@ -739,36 +893,16 @@ def _populate_stage_usrn( usrn, street_name, folded, - str(row.get(field_map.get("street_class", ""), "")).strip() or None, - str(row.get(field_map.get("street_status", ""), "")).strip() or None, + str(row.get(street_class_key)).strip() if street_class_key and row.get(street_class_key) not in 
(None, "") else None, + str(row.get(street_status_key)).strip() if street_status_key and row.get(street_status_key) not in (None, "") else None, ingest_run_id, ) ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.streets_usrn_input ( - build_run_id, - usrn, - street_name, - street_name_casefolded, - street_class, - street_status, - usrn_run_id - ) VALUES (%s, %s, %s, %s, %s, %s, %s) - ON CONFLICT (build_run_id, usrn) - DO UPDATE SET - street_name = EXCLUDED.street_name, - street_name_casefolded = EXCLUDED.street_name_casefolded, - street_class = EXCLUDED.street_class, - street_status = EXCLUDED.street_status, - usrn_run_id = EXCLUDED.usrn_run_id - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def _populate_stage_open_names( @@ -778,18 +912,51 @@ def _populate_stage_open_names( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.os_open_names_row", ingest_run_id) - _assert_required_mapped_fields_present("os_open_names", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.open_names_road_feature ( + build_run_id, + feature_id, + toid, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, feature_id) + DO UPDATE SET + toid = EXCLUDED.toid, + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - feature_id_raw = row.get(field_map["feature_id"]) - street_raw = row.get(field_map["street_name"]) - postcode_raw = row.get(field_map["postcode"]) - toid_raw = row.get(field_map.get("toid", "")) + inserted = 0 + toid_key 
= field_map.get("toid") + postcode_key = field_map.get("postcode") + local_type_key = field_map.get("local_type") + for row in _iter_validated_raw_rows( + conn, + source_name="os_open_names", + raw_table="raw.os_open_names_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + feature_id_raw = _field_value(row, field_map, "feature_id") + street_raw = _field_value(row, field_map, "street_name") + postcode_raw = row.get(postcode_key) if postcode_key else None + toid_raw = row.get(toid_key) if toid_key else None if feature_id_raw in (None, "") or street_raw in (None, ""): continue + local_type = str(row.get(local_type_key)).strip().lower() if local_type_key and row.get(local_type_key) not in (None, "") else "" + if local_type and "road" not in local_type and "transport" not in local_type: + continue + folded = street_casefold(str(street_raw)) postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) if folded is None: @@ -806,31 +973,11 @@ def _populate_stage_open_names( ingest_run_id, ) ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.open_names_road_feature ( - build_run_id, - feature_id, - toid, - postcode_norm, - street_name_raw, - street_name_casefolded, - ingest_run_id - ) VALUES (%s, %s, %s, %s, %s, %s, %s) - ON CONFLICT (build_run_id, feature_id) - DO UPDATE SET - toid = EXCLUDED.toid, - postcode_norm = EXCLUDED.postcode_norm, - street_name_raw = EXCLUDED.street_name_raw, - street_name_casefolded = EXCLUDED.street_name_casefolded, - ingest_run_id = EXCLUDED.ingest_run_id - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def _populate_stage_open_roads( @@ -840,13 +987,44 @@ def _populate_stage_open_roads( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, 
"raw.os_open_roads_row", ingest_run_id) - _assert_required_mapped_fields_present("os_open_roads", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.open_roads_segment ( + build_run_id, + segment_id, + road_id, + postcode_norm, + usrn, + road_name, + road_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, segment_id) + DO UPDATE SET + road_id = EXCLUDED.road_id, + postcode_norm = EXCLUDED.postcode_norm, + usrn = EXCLUDED.usrn, + road_name = EXCLUDED.road_name, + road_name_casefolded = EXCLUDED.road_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - segment_id_raw = row.get(field_map["segment_id"]) - road_name_raw = row.get(field_map["road_name"]) + inserted = 0 + postcode_key = field_map.get("postcode") + usrn_key = field_map.get("usrn") + road_id_key = field_map.get("road_id") + for row in _iter_validated_raw_rows( + conn, + source_name="os_open_roads", + raw_table="raw.os_open_roads_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + segment_id_raw = _field_value(row, field_map, "segment_id") + road_name_raw = _field_value(row, field_map, "road_name") if segment_id_raw in (None, "") or road_name_raw in (None, ""): continue @@ -854,19 +1032,21 @@ def _populate_stage_open_roads( if folded is None: continue - postcode_n = postcode_norm(str(row.get(field_map.get("postcode", ""), "")) or None) + postcode_n = postcode_norm(str(row.get(postcode_key)) if postcode_key and row.get(postcode_key) not in (None, "") else None) - usrn_raw = row.get(field_map.get("usrn", "")) + usrn_raw = row.get(usrn_key) if usrn_key else None try: usrn = int(usrn_raw) if usrn_raw not in (None, "") else None except Exception: usrn = None + road_id_raw = row.get(road_id_key) if road_id_key else None + payload.append( ( build_run_id, str(segment_id_raw).strip(), - 
str(row.get(field_map.get("road_id", ""), "")).strip() or None, + str(road_id_raw).strip() if road_id_raw not in (None, "") else None, postcode_n, usrn, str(road_name_raw).strip(), @@ -874,33 +1054,11 @@ def _populate_stage_open_roads( ingest_run_id, ) ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.open_roads_segment ( - build_run_id, - segment_id, - road_id, - postcode_norm, - usrn, - road_name, - road_name_casefolded, - ingest_run_id - ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) - ON CONFLICT (build_run_id, segment_id) - DO UPDATE SET - road_id = EXCLUDED.road_id, - postcode_norm = EXCLUDED.postcode_norm, - usrn = EXCLUDED.usrn, - road_name = EXCLUDED.road_name, - road_name_casefolded = EXCLUDED.road_name_casefolded, - ingest_run_id = EXCLUDED.ingest_run_id - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def _populate_stage_open_uprn( @@ -910,12 +1068,33 @@ def _populate_stage_open_uprn( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.os_open_uprn_row", ingest_run_id) - _assert_required_mapped_fields_present("os_open_uprn", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.uprn_point ( + build_run_id, + uprn, + postcode_norm, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, uprn) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - uprn_raw = row.get(field_map["uprn"]) + inserted = 0 + postcode_key = field_map.get("postcode") + for row in _iter_validated_raw_rows( + conn, + source_name="os_open_uprn", + raw_table="raw.os_open_uprn_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + uprn_raw = 
_field_value(row, field_map, "uprn") if uprn_raw in (None, ""): continue try: @@ -923,28 +1102,46 @@ def _populate_stage_open_uprn( except Exception: continue - postcode_n = postcode_norm(str(row.get(field_map.get("postcode", ""), "")) or None) + postcode_n = postcode_norm(str(row.get(postcode_key)) if postcode_key and row.get(postcode_key) not in (None, "") else None) payload.append((build_run_id, uprn, postcode_n, ingest_run_id)) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.uprn_point ( - build_run_id, - uprn, - postcode_norm, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, uprn) - DO UPDATE SET - postcode_norm = EXCLUDED.postcode_norm, - ingest_run_id = EXCLUDED.ingest_run_id - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted + + +def _infer_lids_relation( + relation_raw: Any, + left_id: str, + right_id: str, +) -> tuple[str | None, str, str]: + relation = str(relation_raw).strip().lower() if relation_raw not in (None, "") else "" + left_is_toid = left_id.lower().startswith("osgb") + right_is_toid = right_id.lower().startswith("osgb") + left_is_digits = left_id.isdigit() + right_is_digits = right_id.isdigit() + + if relation in {"toid_usrn", "toid->usrn", "toid_usrn_link"}: + return "toid_usrn", left_id, right_id + if relation in {"uprn_usrn", "uprn->usrn", "uprn_usrn_link"}: + return "uprn_usrn", left_id, right_id + + if left_is_toid and right_is_digits: + return "toid_usrn", left_id, right_id + if right_is_toid and left_is_digits: + return "toid_usrn", right_id, left_id + + if left_is_digits and right_is_digits: + # UPRN values are usually longer than USRN values. If ambiguous, keep input order. 
+ if len(left_id) > 8 and len(right_id) <= 8: + return "uprn_usrn", left_id, right_id + if len(right_id) > 8 and len(left_id) <= 8: + return "uprn_usrn", right_id, left_id + return "uprn_usrn", left_id, right_id + + return None, left_id, right_id def _populate_stage_oli( @@ -953,76 +1150,99 @@ def _populate_stage_oli( ingest_run_id: str, field_map: dict[str, str], required_fields: tuple[str, ...], -) -> tuple[int, int]: - rows = _load_raw_rows(conn, "raw.os_open_linked_identifiers_row", ingest_run_id) - _assert_required_mapped_fields_present( - "os_open_linked_identifiers", rows, field_map, required_fields +) -> tuple[int, int, int]: + toid_insert_sql = sql.SQL( + """ + INSERT INTO stage.oli_toid_usrn ( + build_run_id, + toid, + usrn, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, toid, usrn) + DO NOTHING + """ + ) + uprn_insert_sql = sql.SQL( + """ + INSERT INTO stage.oli_uprn_usrn ( + build_run_id, + uprn, + usrn, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, uprn, usrn) + DO NOTHING + """ + ) + relation_insert_sql = sql.SQL( + """ + INSERT INTO stage.oli_identifier_pair ( + build_run_id, + id_1, + id_2, + relation_type, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, id_1, id_2, relation_type) + DO NOTHING + """ ) toid_payload: list[tuple[Any, ...]] = [] uprn_payload: list[tuple[Any, ...]] = [] - - for row in rows: - relation_raw = row.get(field_map["relation_type"]) - left_raw = row.get(field_map["left_id"]) - right_raw = row.get(field_map["right_id"]) - - relation = str(relation_raw).strip().lower() if relation_raw not in (None, "") else "" + relation_payload: list[tuple[Any, ...]] = [] + relation_key = field_map.get("relation_type") + toid_count = 0 + uprn_count = 0 + relation_count = 0 + for row in _iter_validated_raw_rows( + conn, + source_name="os_open_lids", + raw_table="raw.os_open_lids_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + 
required_fields=required_fields, + ): + relation_raw = row.get(relation_key) if relation_key else None + left_raw = _field_value(row, field_map, "id_1") + right_raw = _field_value(row, field_map, "id_2") if left_raw in (None, "") or right_raw in (None, ""): continue left_id = str(left_raw).strip() right_id = str(right_raw).strip() + relation, rel_left_id, rel_right_id = _infer_lids_relation(relation_raw, left_id, right_id) + if relation is None: + continue + + relation_payload.append((build_run_id, rel_left_id, rel_right_id, relation, ingest_run_id)) - if relation in {"toid_usrn", "toid->usrn", "toid_usrn_link"}: + if relation == "toid_usrn": try: - usrn = int(right_id) + usrn = int(rel_right_id) except Exception: continue - toid_payload.append((build_run_id, left_id, usrn, ingest_run_id)) - elif relation in {"uprn_usrn", "uprn->usrn", "uprn_usrn_link"}: + toid_payload.append((build_run_id, rel_left_id, usrn, ingest_run_id)) + elif relation == "uprn_usrn": try: - uprn = int(left_id) - usrn = int(right_id) + uprn = int(rel_left_id) + usrn = int(rel_right_id) except Exception: continue uprn_payload.append((build_run_id, uprn, usrn, ingest_run_id)) - toid_count = _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.oli_toid_usrn ( - build_run_id, - toid, - usrn, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, toid, usrn) - DO NOTHING - """ - ), - toid_payload, - ) - - uprn_count = _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.oli_uprn_usrn ( - build_run_id, - uprn, - usrn, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, uprn, usrn) - DO NOTHING - """ - ), - uprn_payload, - ) + if len(toid_payload) >= STAGE_INSERT_BATCH_SIZE: + toid_count += _flush_stage_batch(conn, toid_insert_sql, toid_payload) + if len(uprn_payload) >= STAGE_INSERT_BATCH_SIZE: + uprn_count += _flush_stage_batch(conn, uprn_insert_sql, uprn_payload) + if len(relation_payload) >= STAGE_INSERT_BATCH_SIZE: + relation_count 
+= _flush_stage_batch(conn, relation_insert_sql, relation_payload) - return toid_count, uprn_count + toid_count += _flush_stage_batch(conn, toid_insert_sql, toid_payload) + uprn_count += _flush_stage_batch(conn, uprn_insert_sql, uprn_payload) + relation_count += _flush_stage_batch(conn, relation_insert_sql, relation_payload) + return toid_count, uprn_count, relation_count def _populate_stage_nsul( @@ -1032,13 +1252,31 @@ def _populate_stage_nsul( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.nsul_row", ingest_run_id) - _assert_required_mapped_fields_present("nsul", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.nsul_uprn_postcode ( + build_run_id, + uprn, + postcode_norm, + ingest_run_id + ) VALUES (%s, %s, %s, %s) + ON CONFLICT (build_run_id, uprn, postcode_norm) + DO NOTHING + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - uprn_raw = row.get(field_map["uprn"]) - postcode_raw = row.get(field_map["postcode"]) + inserted = 0 + for row in _iter_validated_raw_rows( + conn, + source_name="nsul", + raw_table="raw.nsul_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + uprn_raw = _field_value(row, field_map, "uprn") + postcode_raw = _field_value(row, field_map, "postcode") if uprn_raw in (None, ""): continue try: @@ -1049,23 +1287,11 @@ def _populate_stage_nsul( if postcode_n is None: continue payload.append((build_run_id, uprn, postcode_n, ingest_run_id)) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.nsul_uprn_postcode ( - build_run_id, - uprn, - postcode_norm, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, uprn, postcode_norm) - DO NOTHING - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def 
_populate_stage_osni( @@ -1075,13 +1301,38 @@ def _populate_stage_osni( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.osni_gazetteer_row", ingest_run_id) - _assert_required_mapped_fields_present("osni_gazetteer", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.osni_street_point ( + build_run_id, + feature_id, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, feature_id) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - feature_id_raw = row.get(field_map["feature_id"]) - street_raw = row.get(field_map["street_name"]) + inserted = 0 + postcode_key = field_map.get("postcode") + for row in _iter_validated_raw_rows( + conn, + source_name="osni_gazetteer", + raw_table="raw.osni_gazetteer_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + feature_id_raw = _field_value(row, field_map, "feature_id") + street_raw = _field_value(row, field_map, "street_name") if feature_id_raw in (None, "") or street_raw in (None, ""): continue @@ -1089,7 +1340,7 @@ def _populate_stage_osni( if folded is None: continue - postcode_n = postcode_norm(str(row.get(field_map.get("postcode", ""), "")) or None) + postcode_n = postcode_norm(str(row.get(postcode_key)) if postcode_key and row.get(postcode_key) not in (None, "") else None) payload.append( ( build_run_id, @@ -1100,29 +1351,11 @@ def _populate_stage_osni( ingest_run_id, ) ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.osni_street_point ( - build_run_id, 
- feature_id, - postcode_norm, - street_name_raw, - street_name_casefolded, - ingest_run_id - ) VALUES (%s, %s, %s, %s, %s, %s) - ON CONFLICT (build_run_id, feature_id) - DO UPDATE SET - postcode_norm = EXCLUDED.postcode_norm, - street_name_raw = EXCLUDED.street_name_raw, - street_name_casefolded = EXCLUDED.street_name_casefolded, - ingest_run_id = EXCLUDED.ingest_run_id - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def _populate_stage_dfi( @@ -1132,20 +1365,45 @@ def _populate_stage_dfi( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.dfi_highway_row", ingest_run_id) - _assert_required_mapped_fields_present("dfi_highway", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.dfi_road_segment ( + build_run_id, + segment_id, + postcode_norm, + street_name_raw, + street_name_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, segment_id) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + street_name_raw = EXCLUDED.street_name_raw, + street_name_casefolded = EXCLUDED.street_name_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - segment_id_raw = row.get(field_map["segment_id"]) - street_raw = row.get(field_map["street_name"]) + inserted = 0 + postcode_key = field_map.get("postcode") + for row in _iter_validated_raw_rows( + conn, + source_name="dfi_highway", + raw_table="raw.dfi_highway_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + segment_id_raw = _field_value(row, field_map, "segment_id") + street_raw = _field_value(row, field_map, "street_name") if segment_id_raw in (None, "") or street_raw in (None, ""): continue folded = street_casefold(str(street_raw)) if folded is None: continue - postcode_n = postcode_norm(str(row.get(field_map.get("postcode", 
""), "")) or None) + postcode_n = postcode_norm(str(row.get(postcode_key)) if postcode_key and row.get(postcode_key) not in (None, "") else None) payload.append( ( @@ -1157,29 +1415,11 @@ def _populate_stage_dfi( ingest_run_id, ) ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.dfi_road_segment ( - build_run_id, - segment_id, - postcode_norm, - street_name_raw, - street_name_casefolded, - ingest_run_id - ) VALUES (%s, %s, %s, %s, %s, %s) - ON CONFLICT (build_run_id, segment_id) - DO UPDATE SET - postcode_norm = EXCLUDED.postcode_norm, - street_name_raw = EXCLUDED.street_name_raw, - street_name_casefolded = EXCLUDED.street_name_casefolded, - ingest_run_id = EXCLUDED.ingest_run_id - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def _populate_stage_ppd( @@ -1189,15 +1429,41 @@ def _populate_stage_ppd( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - rows = _load_raw_rows(conn, "raw.ppd_row", ingest_run_id) - _assert_required_mapped_fields_present("ppd", rows, field_map, required_fields) + insert_sql = sql.SQL( + """ + INSERT INTO stage.ppd_parsed_address ( + build_run_id, + row_hash, + postcode_norm, + house_number, + street_token_raw, + street_token_casefolded, + ingest_run_id + ) VALUES (%s, %s, %s, %s, %s, %s, %s) + ON CONFLICT (build_run_id, row_hash) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + house_number = EXCLUDED.house_number, + street_token_raw = EXCLUDED.street_token_raw, + street_token_casefolded = EXCLUDED.street_token_casefolded, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ) payload: list[tuple[Any, ...]] = [] - for row in rows: - row_hash_raw = row.get(field_map["row_hash"]) - postcode_raw = row.get(field_map["postcode"]) - street_raw = row.get(field_map["street"]) - house_number_raw = row.get(field_map["house_number"]) + 
inserted = 0 + for row in _iter_validated_raw_rows( + conn, + source_name="ppd", + raw_table="raw.ppd_row", + ingest_run_id=ingest_run_id, + field_map=field_map, + required_fields=required_fields, + ): + row_hash_raw = _field_value(row, field_map, "row_hash") + postcode_raw = _field_value(row, field_map, "postcode") + street_raw = _field_value(row, field_map, "street") + house_number_raw = _field_value(row, field_map, "house_number") if row_hash_raw in (None, "") or postcode_raw in (None, "") or street_raw in (None, ""): continue @@ -1218,31 +1484,11 @@ def _populate_stage_ppd( ingest_run_id, ) ) + if len(payload) >= STAGE_INSERT_BATCH_SIZE: + inserted += _flush_stage_batch(conn, insert_sql, payload) - return _schema_insert_rows( - conn, - sql.SQL( - """ - INSERT INTO stage.ppd_parsed_address ( - build_run_id, - row_hash, - postcode_norm, - house_number, - street_token_raw, - street_token_casefolded, - ingest_run_id - ) VALUES (%s, %s, %s, %s, %s, %s, %s) - ON CONFLICT (build_run_id, row_hash) - DO UPDATE SET - postcode_norm = EXCLUDED.postcode_norm, - house_number = EXCLUDED.house_number, - street_token_raw = EXCLUDED.street_token_raw, - street_token_casefolded = EXCLUDED.street_token_casefolded, - ingest_run_id = EXCLUDED.ingest_run_id - """ - ), - payload, - ) + inserted += _flush_stage_batch(conn, insert_sql, payload) + return inserted def _pass_0b_stage_normalisation( @@ -1290,14 +1536,15 @@ def _pass_0b_stage_normalisation( conn, build_run_id, ingest_run_id, field_map, required_fields ) - if "os_open_linked_identifiers" in source_runs: - field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_linked_identifiers") - ingest_run_id = _single_source_run(source_runs, "os_open_linked_identifiers") - toid_count, uprn_count = _populate_stage_oli( + if "os_open_lids" in source_runs: + field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_lids") + ingest_run_id = _single_source_run(source_runs, "os_open_lids") + 
toid_count, uprn_count, relation_count = _populate_stage_oli( conn, build_run_id, ingest_run_id, field_map, required_fields ) counts["stage.oli_toid_usrn"] = toid_count counts["stage.oli_uprn_usrn"] = uprn_count + counts["stage.oli_identifier_pair"] = relation_count if "nsul" in source_runs: field_map, required_fields = _mapped_fields_for_source(schema_config, "nsul") @@ -1451,6 +1698,80 @@ def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> with conn.cursor() as cur: cur.execute( """ + WITH direct_usrn AS ( + SELECT + usrn, + street_name, + street_name_casefolded, + street_class, + street_status, + usrn_run_id + FROM stage.streets_usrn_input + WHERE build_run_id = %(build_run_id)s + ), + inferred_name_counts AS ( + SELECT + oli.usrn, + n.street_name_raw AS street_name, + n.street_name_casefolded, + COUNT(*)::bigint AS evidence_count, + (ARRAY_AGG(oli.ingest_run_id ORDER BY oli.ingest_run_id::text ASC))[1] AS usrn_run_id + FROM stage.open_names_road_feature AS n + JOIN stage.oli_toid_usrn AS oli + ON oli.build_run_id = n.build_run_id + AND oli.toid = n.toid + WHERE n.build_run_id = %(build_run_id)s + AND n.toid IS NOT NULL + GROUP BY oli.usrn, n.street_name_raw, n.street_name_casefolded + ), + inferred_usrn AS ( + SELECT + usrn, + street_name, + street_name_casefolded, + NULL::text AS street_class, + NULL::text AS street_status, + usrn_run_id + FROM ( + SELECT + usrn, + street_name, + street_name_casefolded, + usrn_run_id, + ROW_NUMBER() OVER ( + PARTITION BY usrn + ORDER BY evidence_count DESC, + street_name_casefolded COLLATE "C" ASC, + street_name COLLATE "C" ASC + ) AS rn + FROM inferred_name_counts + ) AS ranked + WHERE rn = 1 + ), + combined AS ( + SELECT + usrn, + street_name, + street_name_casefolded, + street_class, + street_status, + usrn_run_id + FROM direct_usrn + UNION ALL + SELECT + inferred.usrn, + inferred.street_name, + inferred.street_name_casefolded, + inferred.street_class, + inferred.street_status, + 
inferred.usrn_run_id + FROM inferred_usrn AS inferred + WHERE NOT EXISTS ( + SELECT 1 + FROM direct_usrn AS direct + WHERE direct.usrn = inferred.usrn + ) + ) INSERT INTO core.streets_usrn ( produced_build_run_id, usrn, @@ -1461,18 +1782,17 @@ def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> usrn_run_id ) SELECT - build_run_id, + %(build_run_id)s, usrn, street_name, street_name_casefolded, street_class, street_status, usrn_run_id - FROM stage.streets_usrn_input - WHERE build_run_id = %s + FROM combined ORDER BY usrn ASC """, - (build_run_id,), + {"build_run_id": build_run_id}, ) inserted = cur.rowcount @@ -1482,7 +1802,7 @@ def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> def _pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: schema_config = _schema_config() _mapped_fields_for_source(schema_config, "os_open_names") - _mapped_fields_for_source(schema_config, "os_open_linked_identifiers") + _mapped_fields_for_source(schema_config, "os_open_lids") with conn.cursor() as cur: cur.execute( @@ -1566,7 +1886,7 @@ def _pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) - source_name, ingest_run_id, evidence_json - ) VALUES (%s, %s, %s, %s, %s, 'oli_toid_usrn', 'high', %s, 'os_open_linked_identifiers', %s, %s) + ) VALUES (%s, %s, %s, %s, %s, 'oli_toid_usrn', 'high', %s, 'os_open_lids', %s, %s) RETURNING candidate_id """, ( @@ -1643,7 +1963,7 @@ def _pass_4_uprn_reinforcement(conn: psycopg.Connection, build_run_id: str) -> d 'uprn_usrn', 'high', 'oli:uprn_usrn:' || a.uprn_count::text || '_uprns', - 'os_open_linked_identifiers', + 'os_open_lids', a.oli_ingest_run_id, jsonb_build_object('uprn_count', a.uprn_count) FROM aggregate_pairs AS a diff --git a/pipeline/src/pipeline/ingest/workflows.py b/pipeline/src/pipeline/ingest/workflows.py index 2872290..bd0dee6 100644 --- a/pipeline/src/pipeline/ingest/workflows.py +++ 
b/pipeline/src/pipeline/ingest/workflows.py @@ -37,7 +37,7 @@ class IngestResult: "os_open_names": "raw.os_open_names_row", "os_open_roads": "raw.os_open_roads_row", "os_open_uprn": "raw.os_open_uprn_row", - "os_open_linked_identifiers": "raw.os_open_linked_identifiers_row", + "os_open_lids": "raw.os_open_lids_row", "nsul": "raw.nsul_row", "osni_gazetteer": "raw.osni_gazetteer_row", "dfi_highway": "raw.dfi_highway_row", diff --git a/pipeline/src/pipeline/manifest.py b/pipeline/src/pipeline/manifest.py index 6d17da8..78d9eaf 100644 --- a/pipeline/src/pipeline/manifest.py +++ b/pipeline/src/pipeline/manifest.py @@ -21,7 +21,7 @@ class ManifestError(ValueError): "os_open_names", "os_open_roads", "os_open_uprn", - "os_open_linked_identifiers", + "os_open_lids", "nsul", "osni_gazetteer", "dfi_highway", @@ -35,7 +35,7 @@ class ManifestError(ValueError): "os_open_names", "os_open_roads", "os_open_uprn", - "os_open_linked_identifiers", + "os_open_lids", "nsul", }, "gb_core_ppd": { @@ -44,7 +44,7 @@ class ManifestError(ValueError): "os_open_names", "os_open_roads", "os_open_uprn", - "os_open_linked_identifiers", + "os_open_lids", "nsul", "ppd", }, @@ -54,7 +54,7 @@ class ManifestError(ValueError): "os_open_names", "os_open_roads", "os_open_uprn", - "os_open_linked_identifiers", + "os_open_lids", "nsul", "osni_gazetteer", "dfi_highway", diff --git a/tests/test_bundle_manifest_ppd_updates.py b/tests/test_bundle_manifest_ppd_updates.py index 19d62b4..e2edece 100644 --- a/tests/test_bundle_manifest_ppd_updates.py +++ b/tests/test_bundle_manifest_ppd_updates.py @@ -33,7 +33,7 @@ def test_bundle_allows_multiple_ppd_runs(self) -> None: "os_open_names": "33333333-3333-3333-3333-333333333333", "os_open_roads": "44444444-4444-4444-4444-444444444444", "os_open_uprn": "55555555-5555-5555-5555-555555555555", - "os_open_linked_identifiers": "66666666-6666-6666-6666-666666666666", + "os_open_lids": "66666666-6666-6666-6666-666666666666", "nsul": "77777777-7777-7777-7777-777777777777", 
"ppd": [ "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", @@ -60,7 +60,7 @@ def test_bundle_rejects_empty_source_run_list(self) -> None: "os_open_names": "33333333-3333-3333-3333-333333333333", "os_open_roads": "44444444-4444-4444-4444-444444444444", "os_open_uprn": "55555555-5555-5555-5555-555555555555", - "os_open_linked_identifiers": "66666666-6666-6666-6666-666666666666", + "os_open_lids": "66666666-6666-6666-6666-666666666666", "nsul": "77777777-7777-7777-7777-777777777777", }, } @@ -77,7 +77,7 @@ def test_gb_core_ppd_does_not_require_ni_sources(self) -> None: "os_open_names": "33333333-3333-3333-3333-333333333333", "os_open_roads": "44444444-4444-4444-4444-444444444444", "os_open_uprn": "55555555-5555-5555-5555-555555555555", - "os_open_linked_identifiers": "66666666-6666-6666-6666-666666666666", + "os_open_lids": "66666666-6666-6666-6666-666666666666", "nsul": "77777777-7777-7777-7777-777777777777", "ppd": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", }, From 9f2c9f5fc808731c40cddacc32ee804864e66bce Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 00:16:59 +0000 Subject: [PATCH 06/17] docs(agents): add explicit commit workflow and conventional commit rules Codifies the requested workflow:\n- commit at logical checkpoints\n- prefer atomic commits by concern\n- require Conventional Commits format for every commit --- AGENTS.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 9732cc7..6460952 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -247,6 +247,11 @@ Avoid vague language such as: Be precise. +### 7.1 Commit Standards +- Commit at logical checkpoints whenever it makes sense. +- Prefer atomic commits grouped by concern (schema, ingest, transforms, tests, docs). +- Use **Conventional Commits** format for every commit message (`type(scope): summary`). + --- ## 8. Decision Rule @@ -265,4 +270,4 @@ Correctness over convenience. 
--- -End of AGENTS.md \ No newline at end of file +End of AGENTS.md From f42f3974d283c4ce10478c4ab8e1e6eceda44579 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 00:21:19 +0000 Subject: [PATCH 07/17] docs(agents): compact roadmap and add agent onboarding docs - compact AGENTS.md into a concise roadmap-first guide\n- add docs index and agent-focused onboarding/runbook/codebase-map docs under docs/\n- remove absolute local paths from these docs and enforce relative-path rule in AGENTS --- AGENTS.md | 312 ++++++------------------------------- docs/README.md | 22 +++ docs/agent/codebase-map.md | 24 +++ docs/agent/runbook.md | 43 +++++ docs/agent/start-here.md | 31 ++++ 5 files changed, 165 insertions(+), 267 deletions(-) create mode 100644 docs/README.md create mode 100644 docs/agent/codebase-map.md create mode 100644 docs/agent/runbook.md create mode 100644 docs/agent/start-here.md diff --git a/AGENTS.md b/AGENTS.md index 6460952..2ea0828 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,273 +1,51 @@ # AGENTS.md -This repository contains a **data import and transformation pipeline** for UK open datasets. -Its purpose is to produce a reproducible, versioned derived dataset: - -UPRN → postcode → inferred street name → confidence score - -This file defines behavioural rules, quality standards, and documentation requirements -for any agent contributing to this project. - -The priority is **accuracy, provenance, and reproducibility**. - ---- - -## 1. Core Principles - -### 1.1 No Guessing -If a dataset field, schema, release identifier, or licence detail is unknown: -- Mark it as **Unknown** -- Add validation logic -- Document the assumption explicitly - -Never silently assume structure based on “typical” formats. - ---- - -### 1.2 Reproducibility First -The pipeline must be: -- Deterministic -- Rebuildable from raw inputs -- Fully traceable to dataset release identifiers - -If the same inputs are used, outputs must be identical. - -No hidden state. 
-No environment-dependent logic. -No implicit defaults. - ---- - -### 1.3 Raw Data is Sacred -- Raw imports are immutable. -- Transformations must not mutate raw tables. -- Derived outputs must be rebuildable from raw + release metadata. - -If you need to correct something, rebuild it — do not patch it. - ---- - -### 1.4 Provenance is Mandatory -Every derived dataset must clearly record: -- Source dataset release identifiers -- Method used -- Computation timestamp - -If provenance is not recorded, the output is invalid. - ---- - -### 1.5 Explicit Limitations -Street inference is: -- Heuristic -- Distance-based -- Non-authoritative - -Documentation must clearly state this. -Do not imply authoritative delivery-level correctness. - ---- - -## 2. Documentation Requirements - -Every meaningful change must include documentation updates. - -At minimum: - -### 2.1 Dataset Documentation -Maintain a living document describing: -- Each dataset -- Where it is obtained -- Licence type -- Required fields -- Known limitations -- Known schema quirks - -If a dataset changes, update the documentation immediately. - ---- - -### 2.2 Data Model Documentation -Maintain clear documentation for: -- Raw tables -- Core tables -- Derived tables -- Metrics tables - -Include: -- Field definitions -- Data types -- Constraints -- Semantic meaning - -No column should exist without documented purpose. - ---- - -### 2.3 Transform Documentation -For each transformation layer, document: -- Inputs -- Outputs -- Assumptions -- Failure modes -- Determinism guarantees - -If logic changes (e.g., confidence thresholds), update documentation and record the change rationale. - ---- - -### 2.4 Metrics Documentation -Define: -- What each metric measures -- How it is calculated -- Why it exists -- Expected ranges - -Metrics are part of product quality, not optional extras. - ---- - -## 3. 
Quality Standards - -### 3.1 Deterministic Behaviour -- Stable ordering in queries -- Explicit tie-breaking rules -- No reliance on implicit database ordering - -### 3.2 Observability -Each pipeline run must: -- Log row counts per stage -- Log join coverage percentages -- Log resolution percentages -- Log distance percentiles - -Silent processing is not acceptable. - ---- - -### 3.3 Fail Fast -If: -- Required columns are missing -- Geometry is invalid -- Coordinate reference systems are inconsistent - -The pipeline must fail clearly. - -Partial silent success is worse than failure. - ---- - -### 3.4 Schema Validation -Before processing: -- Validate required fields exist -- Validate types where possible -- Record dataset release metadata - -Do not infer schema dynamically without documentation. - ---- - -### 3.5 No Scope Drift -This repository is a **pipeline**, not: -- An API -- A serving layer -- An analytics platform -- A proprietary dataset reconstruction engine - -Keep scope disciplined. - ---- - -## 4. Testing Expectations - -Agents must ensure: - -- Normalisation logic is tested. -- Derived outputs are deterministic. -- Schema validation works. -- Metrics calculations are stable. -- Small fixture datasets validate spatial inference logic. - -Tests must: -- Use synthetic or reduced fixture data. -- Not depend on downloading live datasets. - ---- - -## 5. Change Management - -Any change to: -- Confidence scoring -- Search radius -- Join logic -- Normalisation rules -- Spatial reference systems - -Must include: - -1. Rationale -2. Before/after metrics comparison -3. Determinism confirmation -4. Documentation update - ---- - -## 6. 
What Must Never Be Implemented Here - -- Address enumeration features -- Proprietary dataset integration -- Undocumented inference layers -- Hidden optimisation logic -- Behaviour designed for ambiguous or non-transparent use cases - -This pipeline exists to: -- Normalise open data -- Join open data -- Derive transparent street-level inference -- Record quality metrics - -Nothing more. - ---- - -## 7. Communication Standards - -Pull requests must: - -- State the problem being solved -- Describe the solution -- Document assumptions -- Include metric impact -- Confirm reproducibility - -Avoid vague language such as: -- “Seems to work” -- “Probably correct” -- “Should be fine” - -Be precise. - -### 7.1 Commit Standards +Purpose: this file is the agent entrypoint for this repository. +Use it as a roadmap to the docs, then execute work with strict reproducibility and provenance. + +## 1. Start Here (Required Reading Order) +1. `docs/README.md` +2. `docs/agent/start-here.md` +3. `docs/spec/pipeline_v3/spec.md` +4. `docs/spec/pipeline_v3/data_model.md` +5. `docs/spec/pipeline_v3/canonicalisation.md` + +If behavior in code differs from spec, treat it as a defect and document the delta. + +## 2. Documentation Roadmap +- V3 product/behavior spec: `docs/spec/pipeline_v3/spec.md` +- V3 schema and table contracts: `docs/spec/pipeline_v3/data_model.md` +- Determinism and canonical rules: `docs/spec/pipeline_v3/canonicalisation.md` +- Source acquisition + licensing context: `docs/spec/data_sources.md` +- Agent onboarding: `docs/agent/start-here.md` +- Codebase map: `docs/agent/codebase-map.md` +- Operational runbook (ingest/build/publish): `docs/agent/runbook.md` +- Legacy phase docs (historical only): `docs/spec/phase_1/`, `docs/spec/phase_2-open-names/` + +## 3. Non-Negotiable Engineering Rules +- No guessing: unknown fields/semantics must be marked unknown and validated explicitly. +- Reproducibility first: same inputs must produce same outputs. 
+- Raw data is immutable: never mutate raw source snapshots. +- Provenance is mandatory: derived records must trace to source run(s) and method. +- Deterministic execution: stable ordering + explicit tie-breaks only. +- Fail fast on schema/geometry/CRS issues. +- This repo is a pipeline only; do not add API-serving scope here. + +## 4. Change Requirements +For meaningful behavior changes (join logic, scoring, normalization, radius/thresholds, CRS, pass semantics): +1. Update spec/docs in `docs/` in the same change. +2. Never place absolute local filesystem paths in docs; use repository-relative paths. +3. State rationale. +4. Provide before/after metrics or counts where applicable. +5. Confirm determinism impact. +6. Add/adjust tests (fixture-based; no live-download dependency). + +## 5. Commit Standards - Commit at logical checkpoints whenever it makes sense. - Prefer atomic commits grouped by concern (schema, ingest, transforms, tests, docs). -- Use **Conventional Commits** format for every commit message (`type(scope): summary`). - ---- - -## 8. Decision Rule - -If a proposed change: -- Reduces transparency, -- Obscures provenance, -- Makes outputs less reproducible, -- Or introduces implicit assumptions, - -It should not be merged. - -Clarity over cleverness. -Traceability over speed. -Correctness over convenience. +- Use Conventional Commits for every commit message (`type(scope): summary`). ---- +## 6. Decision Rule +If a change reduces transparency, obscures provenance, weakens reproducibility, or introduces hidden assumptions, do not merge it. -End of AGENTS.md +Clarity over cleverness. Traceability over speed. Correctness over convenience. diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..183e301 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,22 @@ +# Documentation Index + +This docs tree is organized for fast agent onboarding and precise implementation. + +## Read Order +1. `docs/agent/start-here.md` +2. 
`docs/agent/codebase-map.md` +3. `docs/agent/runbook.md` +4. `docs/spec/pipeline_v3/spec.md` +5. `docs/spec/pipeline_v3/data_model.md` +6. `docs/spec/pipeline_v3/canonicalisation.md` + +## Sections +- Agent docs: `docs/agent/` +- V3 authoritative spec: `docs/spec/pipeline_v3/` +- Source acquisition and licensing context: `docs/spec/data_sources.md` +- Legacy phase docs (historical reference): + - `docs/spec/phase_1/` + - `docs/spec/phase_2-open-names/` + +## Rule +When behavior changes, update both code and the relevant doc in this tree in the same PR. diff --git a/docs/agent/codebase-map.md b/docs/agent/codebase-map.md new file mode 100644 index 0000000..6bb5cc2 --- /dev/null +++ b/docs/agent/codebase-map.md @@ -0,0 +1,24 @@ +# Codebase Map + +## Main Runtime Modules +- CLI: `pipeline/src/pipeline/cli.py` +- Manifest parsing/validation: `pipeline/src/pipeline/manifest.py` +- Ingest workflows (raw ingestion): `pipeline/src/pipeline/ingest/workflows.py` +- Build workflows (pass execution/finalization/publish): `pipeline/src/pipeline/build/workflows.py` +- DB migrations runner: `pipeline/src/pipeline/db/migrations.py` +- Normalization utilities: `pipeline/src/pipeline/util/normalise.py` + +## SQL and Config +- SQL migrations: `pipeline/sql/migrations/` +- Source schema mapping config: `pipeline/config/source_schema.yaml` +- Frequency weights config: `pipeline/config/frequency_weights.yaml` +- Normalization config: `pipeline/config/normalisation.yaml` + +## Manifests and Data Inputs +- Real manifests: `data/manifests/real_v3/` +- Smoke manifests: `data/manifests/v3_smoke/` +- Local source files: `data/source_files/` + +## Tests +- Test suite root: `tests/` +- Focus on deterministic behavior, schema validation, provenance contracts, and pass semantics. 
diff --git a/docs/agent/runbook.md b/docs/agent/runbook.md new file mode 100644 index 0000000..5a39cc4 --- /dev/null +++ b/docs/agent/runbook.md @@ -0,0 +1,43 @@ +# Operational Runbook + +## 1) Migrate +```bash +pipeline --dsn "dbname=postcodes_v3" db migrate +``` + +## 2) Ingest Sources +```bash +pipeline --dsn "dbname=postcodes_v3" ingest source --manifest /path/to/source_manifest.json +``` +Repeat for each source in the target profile. + +## 3) Create Bundle +```bash +pipeline --dsn "dbname=postcodes_v3" bundle create --manifest /path/to/bundle_manifest.json +``` + +## 4) Build +```bash +pipeline --dsn "dbname=postcodes_v3" build run --bundle-id [--rebuild|--resume] +``` +Use `--resume` only for the same bundle/run lineage. + +## 5) Verify +```bash +pipeline --dsn "dbname=postcodes_v3" build verify --build-run-id +``` + +## 6) Publish +```bash +pipeline --dsn "dbname=postcodes_v3" build publish --build-run-id --actor +``` + +## Observability Queries +- Build status: `meta.build_run` +- Pass checkpoints: `meta.build_pass_checkpoint` +- Ingest provenance: `meta.ingest_run`, `meta.ingest_run_file` + +## Failure Policy +- Fail fast on schema/field mismatches. +- Do not patch raw data; fix logic/mapping and rebuild. +- Record behavior changes in `docs/spec/pipeline_v3/` docs in the same PR. diff --git a/docs/agent/start-here.md b/docs/agent/start-here.md new file mode 100644 index 0000000..e53af8a --- /dev/null +++ b/docs/agent/start-here.md @@ -0,0 +1,31 @@ +# Agent Start Here + +## Objective +Produce deterministic, replayable postcode/street outputs from open-source ingests with strict provenance. + +## Golden Path +1. Read V3 spec docs: + - `docs/spec/pipeline_v3/spec.md` + - `docs/spec/pipeline_v3/data_model.md` + - `docs/spec/pipeline_v3/canonicalisation.md` +2. Validate local runtime assumptions: + - DB migrations applied + - Manifest source names and schema mappings align with actual raw payload fields +3. 
Run in this sequence: + - `pipeline db migrate` + - `pipeline ingest source --manifest ` (repeat by source) + - `pipeline bundle create --manifest ` + - `pipeline build run --bundle-id [--rebuild|--resume]` + - `pipeline build verify --build-run-id ` + - `pipeline build publish --build-run-id --actor ` + +## Critical Contracts +- Raw layer is immutable. +- `derived.postcode_street_candidates` is append-only evidence. +- Pass 3 promotion is insert-only with lineage links. +- Probability normalization is exact by formula and stored with deterministic residual correction. + +## When Unsure +- Prefer explicit failure over implicit behavior. +- Capture unknowns in docs + validation. +- Keep all time and ordering deterministic. From d5101b4dadd2c89b749bf90ce3af2ced582bf737 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 00:29:03 +0000 Subject: [PATCH 08/17] docs(architecture): add exhaustive dataset/stage lineage docs and scoped agent guides - add docs/architecture index with deep links for relationships, stages, and value-added narrative\n- add separate dataset pages documenting ingest->stage->transform lineage and value per source\n- add separate pass pages (0a..8) documenting inputs, outputs, contracts, and value added\n- add scoped AGENTS guides at docs/, pipeline/, pipeline/src/pipeline/, tests/, and data/\n- strengthen root AGENTS roadmap with architecture links and strict docs-in-lockstep rule\n- normalize historical docs to remove absolute local filesystem paths\n- verify local markdown cross-links resolve and enforce relative-path documentation convention --- AGENTS.md | 18 ++++- data/AGENTS.md | 17 ++++ docs/AGENTS.md | 22 +++++ docs/README.md | 13 +-- docs/agent/codebase-map.md | 6 ++ docs/agent/runbook.md | 1 + docs/agent/start-here.md | 5 ++ docs/architecture/README.md | 24 ++++++ docs/architecture/datasets/README.md | 23 ++++++ docs/architecture/datasets/dfi_highway.md | 19 +++++ docs/architecture/datasets/nsul.md | 27 +++++++ 
docs/architecture/datasets/onspd.md | 34 ++++++++ docs/architecture/datasets/os_open_lids.md | 32 ++++++++ docs/architecture/datasets/os_open_names.md | 32 ++++++++ docs/architecture/datasets/os_open_roads.md | 28 +++++++ docs/architecture/datasets/os_open_uprn.md | 29 +++++++ docs/architecture/datasets/os_open_usrn.md | 31 +++++++ docs/architecture/datasets/osni_gazetteer.md | 19 +++++ docs/architecture/datasets/ppd.md | 30 +++++++ docs/architecture/relationships-overview.md | 44 ++++++++++ docs/architecture/stages/0a_raw_ingest.md | 18 +++++ .../stages/0b_stage_normalisation.md | 33 ++++++++ docs/architecture/stages/1_onspd_backbone.md | 18 +++++ .../stages/2_gb_canonical_streets.md | 18 +++++ .../stages/3_open_names_candidates.md | 26 ++++++ .../stages/4_uprn_reinforcement.md | 20 +++++ .../stages/5_gb_spatial_fallback.md | 18 +++++ docs/architecture/stages/6_ni_candidates.md | 19 +++++ docs/architecture/stages/7_ppd_gap_fill.md | 19 +++++ docs/architecture/stages/8_finalisation.md | 28 +++++++ docs/architecture/stages/README.md | 19 +++++ docs/architecture/value-added-by-stage.md | 80 +++++++++++++++++++ docs/spec/phase_1/changes.md | 6 +- docs/spec/phase_2-open-names/changes.md | 8 +- docs/spec/pipeline_v3/canonicalisation.md | 4 + docs/spec/pipeline_v3/data_model.md | 5 ++ docs/spec/pipeline_v3/spec.md | 5 ++ pipeline/AGENTS.md | 21 +++++ pipeline/src/pipeline/AGENTS.md | 25 ++++++ tests/AGENTS.md | 19 +++++ 40 files changed, 848 insertions(+), 15 deletions(-) create mode 100644 data/AGENTS.md create mode 100644 docs/AGENTS.md create mode 100644 docs/architecture/README.md create mode 100644 docs/architecture/datasets/README.md create mode 100644 docs/architecture/datasets/dfi_highway.md create mode 100644 docs/architecture/datasets/nsul.md create mode 100644 docs/architecture/datasets/onspd.md create mode 100644 docs/architecture/datasets/os_open_lids.md create mode 100644 docs/architecture/datasets/os_open_names.md create mode 100644 
docs/architecture/datasets/os_open_roads.md create mode 100644 docs/architecture/datasets/os_open_uprn.md create mode 100644 docs/architecture/datasets/os_open_usrn.md create mode 100644 docs/architecture/datasets/osni_gazetteer.md create mode 100644 docs/architecture/datasets/ppd.md create mode 100644 docs/architecture/relationships-overview.md create mode 100644 docs/architecture/stages/0a_raw_ingest.md create mode 100644 docs/architecture/stages/0b_stage_normalisation.md create mode 100644 docs/architecture/stages/1_onspd_backbone.md create mode 100644 docs/architecture/stages/2_gb_canonical_streets.md create mode 100644 docs/architecture/stages/3_open_names_candidates.md create mode 100644 docs/architecture/stages/4_uprn_reinforcement.md create mode 100644 docs/architecture/stages/5_gb_spatial_fallback.md create mode 100644 docs/architecture/stages/6_ni_candidates.md create mode 100644 docs/architecture/stages/7_ppd_gap_fill.md create mode 100644 docs/architecture/stages/8_finalisation.md create mode 100644 docs/architecture/stages/README.md create mode 100644 docs/architecture/value-added-by-stage.md create mode 100644 pipeline/AGENTS.md create mode 100644 pipeline/src/pipeline/AGENTS.md create mode 100644 tests/AGENTS.md diff --git a/AGENTS.md b/AGENTS.md index 2ea0828..58c35a2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,9 +6,10 @@ Use it as a roadmap to the docs, then execute work with strict reproducibility a ## 1. Start Here (Required Reading Order) 1. `docs/README.md` 2. `docs/agent/start-here.md` -3. `docs/spec/pipeline_v3/spec.md` -4. `docs/spec/pipeline_v3/data_model.md` -5. `docs/spec/pipeline_v3/canonicalisation.md` +3. `docs/architecture/README.md` +4. `docs/spec/pipeline_v3/spec.md` +5. `docs/spec/pipeline_v3/data_model.md` +6. `docs/spec/pipeline_v3/canonicalisation.md` If behavior in code differs from spec, treat it as a defect and document the delta. 
@@ -20,6 +21,8 @@ If behavior in code differs from spec, treat it as a defect and document the del - Agent onboarding: `docs/agent/start-here.md` - Codebase map: `docs/agent/codebase-map.md` - Operational runbook (ingest/build/publish): `docs/agent/runbook.md` +- Dataset lineage pages: `docs/architecture/datasets/README.md` +- Stage/pass pages: `docs/architecture/stages/README.md` - Legacy phase docs (historical only): `docs/spec/phase_1/`, `docs/spec/phase_2-open-names/` ## 3. Non-Negotiable Engineering Rules @@ -40,6 +43,8 @@ For meaningful behavior changes (join logic, scoring, normalization, radius/thre 5. Confirm determinism impact. 6. Add/adjust tests (fixture-based; no live-download dependency). +This rule is strict: agents must always keep documentation in step with code changes. + ## 5. Commit Standards - Commit at logical checkpoints whenever it makes sense. - Prefer atomic commits grouped by concern (schema, ingest, transforms, tests, docs). @@ -49,3 +54,10 @@ For meaningful behavior changes (join logic, scoring, normalization, radius/thre If a change reduces transparency, obscures provenance, weakens reproducibility, or introduces hidden assumptions, do not merge it. Clarity over cleverness. Traceability over speed. Correctness over convenience. + +## 7. Scoped Agent Guides +- Docs scope: `docs/AGENTS.md` +- Pipeline scope: `pipeline/AGENTS.md` +- Runtime code scope: `pipeline/src/pipeline/AGENTS.md` +- Test scope: `tests/AGENTS.md` +- Data/manifest scope: `data/AGENTS.md` diff --git a/data/AGENTS.md b/data/AGENTS.md new file mode 100644 index 0000000..e5e74f5 --- /dev/null +++ b/data/AGENTS.md @@ -0,0 +1,17 @@ +# data/AGENTS.md + +## Scope +Manifests and local source-file conventions under `data/`. + +## Critical Rule +Manifest/source contract changes must be reflected in docs (`docs/spec/...` and `docs/architecture/...`) and code (`pipeline/src/pipeline/manifest.py`, `pipeline/config/source_schema.yaml`) together. 
+ +## Conventions +- source manifests live under `data/manifests/` +- keep source naming aligned with `pipeline/src/pipeline/manifest.py` +- avoid absolute local paths in documentation; manifests may contain absolute file paths for runtime only +- update bundle manifests when source keys change + +## Useful References +- source acquisition: `docs/spec/data_sources.md` +- architecture dataset pages: `docs/architecture/datasets/` diff --git a/docs/AGENTS.md b/docs/AGENTS.md new file mode 100644 index 0000000..ac070b6 --- /dev/null +++ b/docs/AGENTS.md @@ -0,0 +1,22 @@ +# docs/AGENTS.md + +## Scope +Documentation standards and navigation for everything under `docs/`. + +## Critical Rule +Any code behavior change must update relevant docs in the same change set. Documentation is not optional follow-up work. + +## Path Rule +Never use absolute local filesystem paths in docs. Use repository-relative paths only. + +## Navigation +- Docs index: `docs/README.md` +- Architecture map: `docs/architecture/README.md` +- V3 spec authority: `docs/spec/pipeline_v3/` +- Source acquisition details: `docs/spec/data_sources.md` + +## Update Expectations +When editing docs: +- keep links valid and cross-linked +- update related index pages if new docs are added +- keep wording deterministic and implementation-aligned diff --git a/docs/README.md b/docs/README.md index 183e301..aff4f2c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,14 +4,16 @@ This docs tree is organized for fast agent onboarding and precise implementation ## Read Order 1. `docs/agent/start-here.md` -2. `docs/agent/codebase-map.md` -3. `docs/agent/runbook.md` -4. `docs/spec/pipeline_v3/spec.md` -5. `docs/spec/pipeline_v3/data_model.md` -6. `docs/spec/pipeline_v3/canonicalisation.md` +2. `docs/architecture/README.md` +3. `docs/agent/codebase-map.md` +4. `docs/agent/runbook.md` +5. `docs/spec/pipeline_v3/spec.md` +6. `docs/spec/pipeline_v3/data_model.md` +7. 
`docs/spec/pipeline_v3/canonicalisation.md` ## Sections - Agent docs: `docs/agent/` +- Architecture deep-dive: `docs/architecture/` - V3 authoritative spec: `docs/spec/pipeline_v3/` - Source acquisition and licensing context: `docs/spec/data_sources.md` - Legacy phase docs (historical reference): @@ -20,3 +22,4 @@ This docs tree is organized for fast agent onboarding and precise implementation ## Rule When behavior changes, update both code and the relevant doc in this tree in the same PR. +Never use absolute local filesystem paths in docs. diff --git a/docs/agent/codebase-map.md b/docs/agent/codebase-map.md index 6bb5cc2..fa16dcd 100644 --- a/docs/agent/codebase-map.md +++ b/docs/agent/codebase-map.md @@ -22,3 +22,9 @@ ## Tests - Test suite root: `tests/` - Focus on deterministic behavior, schema validation, provenance contracts, and pass semantics. + +## Documentation Cross-links +- Architecture index: `docs/architecture/README.md` +- Dataset lineage pages: `docs/architecture/datasets/README.md` +- Stage/pass pages: `docs/architecture/stages/README.md` +- Spec authority: `docs/spec/pipeline_v3/` diff --git a/docs/agent/runbook.md b/docs/agent/runbook.md index 5a39cc4..c87bb40 100644 --- a/docs/agent/runbook.md +++ b/docs/agent/runbook.md @@ -41,3 +41,4 @@ pipeline --dsn "dbname=postcodes_v3" build publish --build-run-id - Fail fast on schema/field mismatches. - Do not patch raw data; fix logic/mapping and rebuild. - Record behavior changes in `docs/spec/pipeline_v3/` docs in the same PR. +- Keep architecture docs in sync: `docs/architecture/`. diff --git a/docs/agent/start-here.md b/docs/agent/start-here.md index e53af8a..fee6277 100644 --- a/docs/agent/start-here.md +++ b/docs/agent/start-here.md @@ -5,6 +5,10 @@ Produce deterministic, replayable postcode/street outputs from open-source inges ## Golden Path 1. 
Read V3 spec docs: + - `docs/architecture/README.md` + - `docs/architecture/relationships-overview.md` + - `docs/architecture/datasets/README.md` + - `docs/architecture/stages/README.md` - `docs/spec/pipeline_v3/spec.md` - `docs/spec/pipeline_v3/data_model.md` - `docs/spec/pipeline_v3/canonicalisation.md` @@ -29,3 +33,4 @@ Produce deterministic, replayable postcode/street outputs from open-source inges - Prefer explicit failure over implicit behavior. - Capture unknowns in docs + validation. - Keep all time and ordering deterministic. +- Keep documentation in step with any behavior change in the same workstream. diff --git a/docs/architecture/README.md b/docs/architecture/README.md new file mode 100644 index 0000000..da8c7b1 --- /dev/null +++ b/docs/architecture/README.md @@ -0,0 +1,24 @@ +# Pipeline Architecture Docs + +This section explains how datasets relate to each other, how data moves through ingest/stage/build passes, and what value is added at each step. + +## Quick Links +- Relationship map: [`relationships-overview.md`](relationships-overview.md) +- End-to-end value by pass: [`value-added-by-stage.md`](value-added-by-stage.md) +- Dataset index: [`datasets/README.md`](datasets/README.md) +- Stage/pass index: [`stages/README.md`](stages/README.md) + +## Authoritative Contracts +- Behavioral spec: [`../spec/pipeline_v3/spec.md`](../spec/pipeline_v3/spec.md) +- Data model: [`../spec/pipeline_v3/data_model.md`](../spec/pipeline_v3/data_model.md) +- Canonicalisation/determinism: [`../spec/pipeline_v3/canonicalisation.md`](../spec/pipeline_v3/canonicalisation.md) + +## Reading Order (Fastest Onboarding) +1. [`relationships-overview.md`](relationships-overview.md) +2. [`datasets/README.md`](datasets/README.md) +3. [`stages/README.md`](stages/README.md) +4. [`value-added-by-stage.md`](value-added-by-stage.md) + +## Scope Note +- Legacy docs under `docs/spec/phase_1/` and `docs/spec/phase_2-open-names/` are historical. 
+- For new implementation work, default to V3 docs and this architecture section. diff --git a/docs/architecture/datasets/README.md b/docs/architecture/datasets/README.md new file mode 100644 index 0000000..fa6e80d --- /dev/null +++ b/docs/architecture/datasets/README.md @@ -0,0 +1,23 @@ +# Dataset Lineage Index + +Each page documents one dataset from raw ingestion through stage normalisation and downstream transformations. + +## Core GB Datasets +- ONSPD: [`onspd.md`](onspd.md) +- OS Open USRN: [`os_open_usrn.md`](os_open_usrn.md) +- OS Open Names: [`os_open_names.md`](os_open_names.md) +- OS Open Roads: [`os_open_roads.md`](os_open_roads.md) +- OS Open UPRN: [`os_open_uprn.md`](os_open_uprn.md) +- OS Open LIDS: [`os_open_lids.md`](os_open_lids.md) +- NSUL: [`nsul.md`](nsul.md) + +## Optional/Extended Sources +- PPD: [`ppd.md`](ppd.md) +- OSNI Gazetteer: [`osni_gazetteer.md`](osni_gazetteer.md) +- DfI Highway: [`dfi_highway.md`](dfi_highway.md) + +## Cross-links +- Relationship map: [`../relationships-overview.md`](../relationships-overview.md) +- Pass index: [`../stages/README.md`](../stages/README.md) +- Value added by stage: [`../value-added-by-stage.md`](../value-added-by-stage.md) +- Data model contract: [`../../spec/pipeline_v3/data_model.md`](../../spec/pipeline_v3/data_model.md) diff --git a/docs/architecture/datasets/dfi_highway.md b/docs/architecture/datasets/dfi_highway.md new file mode 100644 index 0000000..ba83be1 --- /dev/null +++ b/docs/architecture/datasets/dfi_highway.md @@ -0,0 +1,19 @@ +# DfI Highway Dataset Lineage (Optional NI) + +## Role In The Graph +DfI Highway contributes NI spatial road-segment fallback evidence. + +## Ingest Contract +- Source key: `dfi_highway` +- Raw table: `raw.dfi_highway_row` +- Stage table: `stage.dfi_road_segment` + +## Downstream Transformations +- Pass 6 emits `spatial_dfi_highway` candidates. + +## Value Added +- Adds NI fallback coverage where direct NI evidence is absent. 
+ +## Related Docs +- Pass 6 details: [`../stages/6_ni_candidates.md`](../stages/6_ni_candidates.md) +- NI confidence constraints: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/nsul.md b/docs/architecture/datasets/nsul.md new file mode 100644 index 0000000..5edf627 --- /dev/null +++ b/docs/architecture/datasets/nsul.md @@ -0,0 +1,27 @@ +# NSUL Dataset Lineage + +## Role In The Graph +NSUL provides UPRN->postcode relationships used with LIDS UPRN->USRN links to generate high-confidence street evidence. + +## Ingest Contract +- Source key: `nsul` +- Raw table: `raw.nsul_row` +- Stage table: `stage.nsul_uprn_postcode` + +## Stage Normalisation +- Normalised fields: + - `uprn` + - `postcode_norm` + +## Downstream Transformations +- Pass 4 joins NSUL and LIDS on UPRN, then aggregates postcode/USRN pairs. +- Output candidate type: `uprn_usrn` (high confidence). + +## Value Added +- Adds postcode side of the UPRN linkage chain. +- Enables frequency-like reinforcement based on property counts. + +## Related Docs +- Pass 4 details: [`../stages/4_uprn_reinforcement.md`](../stages/4_uprn_reinforcement.md) +- Open UPRN context: [`os_open_uprn.md`](os_open_uprn.md) +- LIDS context: [`os_open_lids.md`](os_open_lids.md) diff --git a/docs/architecture/datasets/onspd.md b/docs/architecture/datasets/onspd.md new file mode 100644 index 0000000..bdde023 --- /dev/null +++ b/docs/architecture/datasets/onspd.md @@ -0,0 +1,34 @@ +# ONSPD Dataset Lineage + +## Role In The Graph +ONSPD is the definitive postcode backbone. It validates postcode existence and contributes canonical postcode metadata used by all later joins. 
+ +## Ingest Contract +- Source key: `onspd` +- Raw table: `raw.onspd_row` +- Manifest mapping source: `pipeline/config/source_schema.yaml` +- Primary pass usage: Pass `1_onspd_backbone` + +## Stage Normalisation +- Stage table: `stage.onspd_postcode` +- Main fields: + - `postcode_norm`, `postcode_display` + - `status`, `lat`, `lon`, `easting`, `northing` + - `country_iso2`, `country_iso3`, `subdivision_code` + - `street_enrichment_available` + +## Downstream Transformations +- Pass 1 writes: + - `core.postcodes` + - `core.postcodes_meta` +- Used by passes 3/4/5/6/7 for postcode validation and join gating. + +## Value Added +- Converts raw postcode records into canonical and display-safe forms. +- Centralizes country/subdivision context for profile-specific behavior. +- Prevents downstream candidate generation for invalid/unresolvable postcodes. + +## Related Docs +- Pass 1 details: [`../stages/1_onspd_backbone.md`](../stages/1_onspd_backbone.md) +- Canonical postcode rules: [`../../spec/pipeline_v3/canonicalisation.md`](../../spec/pipeline_v3/canonicalisation.md) +- Relationship map: [`../relationships-overview.md`](../relationships-overview.md) diff --git a/docs/architecture/datasets/os_open_lids.md b/docs/architecture/datasets/os_open_lids.md new file mode 100644 index 0000000..690be10 --- /dev/null +++ b/docs/architecture/datasets/os_open_lids.md @@ -0,0 +1,32 @@ +# OS Open LIDS Dataset Lineage + +## Role In The Graph +LIDS is the identifier bridge dataset. It resolves relationships between TOID/UPRN and USRN. + +## Ingest Contract +- Source key: `os_open_lids` +- Raw table: `raw.os_open_lids_row` +- Stage tables: + - `stage.oli_identifier_pair` (`id_1`, `id_2`, `relation_type`) + - `stage.oli_toid_usrn` + - `stage.oli_uprn_usrn` + +## Stage Normalisation +- Generic identifier pairs are normalised first: + - `id_1`, `id_2`, `relation_type` +- Relation typing is explicit (`toid_usrn` or `uprn_usrn`) after deterministic inference. 
+- Typed rows are materialised into dedicated stage tables for downstream joins. + +## Downstream Transformations +- Pass 2: helps infer missing canonical USRN names from Open Names TOIDs. +- Pass 3: confirms TOID-based Open Names evidence, generating `oli_toid_usrn` candidates. +- Pass 4: contributes UPRN->USRN links for high-confidence `uprn_usrn` candidates. + +## Value Added +- Supplies the key bridge between feature identifiers and canonical street identifiers. +- Converts generic identifier pairs into explicit typed relationships for deterministic joins. + +## Related Docs +- Pass 0b staging details: [`../stages/0b_stage_normalisation.md`](../stages/0b_stage_normalisation.md) +- Pass 3 details: [`../stages/3_open_names_candidates.md`](../stages/3_open_names_candidates.md) +- Pass 4 details: [`../stages/4_uprn_reinforcement.md`](../stages/4_uprn_reinforcement.md) diff --git a/docs/architecture/datasets/os_open_names.md b/docs/architecture/datasets/os_open_names.md new file mode 100644 index 0000000..a44262a --- /dev/null +++ b/docs/architecture/datasets/os_open_names.md @@ -0,0 +1,32 @@ +# OS Open Names Dataset Lineage + +## Role In The Graph +Open Names contributes named road features and optional TOID references, creating medium-confidence street evidence by postcode and enabling TOID-confirmed promotion. + +## Ingest Contract +- Source key: `os_open_names` +- Raw table: `raw.os_open_names_row` +- Stage table: `stage.open_names_road_feature` +- Primary pass usage: Pass `3_open_names_candidates` + +## Stage Normalisation +- Normalised fields include: + - `feature_id` + - `toid` (when present) + - `street_name_raw`, `street_name_casefolded` + - `postcode_norm` (when available) +- Road/transport filtering is applied during staging. + +## Downstream Transformations +- Pass 3 inserts `names_postcode_feature` candidates. +- Pass 3 appends `oli_toid_usrn` candidates when TOID resolves via LIDS. 
+- Pass 3 records append-only lineage in `derived.postcode_street_candidate_lineage`. + +## Value Added +- Adds broad coverage of named road features. +- Supplies structured evidence that can be upgraded to high confidence with TOID confirmation. + +## Related Docs +- Pass 3 details: [`../stages/3_open_names_candidates.md`](../stages/3_open_names_candidates.md) +- LIDS bridge: [`os_open_lids.md`](os_open_lids.md) +- Candidate immutability contract: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/os_open_roads.md b/docs/architecture/datasets/os_open_roads.md new file mode 100644 index 0000000..dd99312 --- /dev/null +++ b/docs/architecture/datasets/os_open_roads.md @@ -0,0 +1,28 @@ +# OS Open Roads Dataset Lineage + +## Role In The Graph +Open Roads provides fallback street evidence where stronger candidate types do not exist. + +## Ingest Contract +- Source key: `os_open_roads` +- Raw table: `raw.os_open_roads_row` +- Stage table: `stage.open_roads_segment` +- Primary pass usage: Pass `5_gb_spatial_fallback` + +## Stage Normalisation +- Normalised fields: + - `segment_id`, `road_id` + - `road_name`, `road_name_casefolded` + - optional `usrn` + - optional `postcode_norm` + +## Downstream Transformations +- Pass 5 emits `spatial_os_open_roads` low-confidence candidates only for postcodes without high-confidence evidence. + +## Value Added +- Improves coverage without overriding stronger evidence. +- Preserves confidence transparency by explicitly tagging fallback provenance. 
+ +## Related Docs +- Pass 5 details: [`../stages/5_gb_spatial_fallback.md`](../stages/5_gb_spatial_fallback.md) +- Confidence model: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/os_open_uprn.md b/docs/architecture/datasets/os_open_uprn.md new file mode 100644 index 0000000..a4049f9 --- /dev/null +++ b/docs/architecture/datasets/os_open_uprn.md @@ -0,0 +1,29 @@ +# OS Open UPRN Dataset Lineage + +## Role In The Graph +Open UPRN contributes property-level identity used with NSUL and LIDS to create high-confidence postcode/USRN evidence. + +## Ingest Contract +- Source key: `os_open_uprn` +- Raw table: `raw.os_open_uprn_row` +- Stage table: `stage.uprn_point` +- Primary pass usage: indirect, via pass `4_uprn_reinforcement` + +## Stage Normalisation +- Normalised fields: + - `uprn` + - optional `postcode_norm` + +## Downstream Transformations +- Combined with: + - `stage.nsul_uprn_postcode` (UPRN->postcode) + - `stage.oli_uprn_usrn` (UPRN->USRN) +- Pass 4 aggregates evidence into `uprn_usrn` high-confidence candidates. + +## Value Added +- Supports strongest GB candidate type by linking property-level and street-level identifiers. + +## Related Docs +- Pass 4 details: [`../stages/4_uprn_reinforcement.md`](../stages/4_uprn_reinforcement.md) +- NSUL linkage: [`nsul.md`](nsul.md) +- LIDS linkage: [`os_open_lids.md`](os_open_lids.md) diff --git a/docs/architecture/datasets/os_open_usrn.md b/docs/architecture/datasets/os_open_usrn.md new file mode 100644 index 0000000..b2f29c0 --- /dev/null +++ b/docs/architecture/datasets/os_open_usrn.md @@ -0,0 +1,31 @@ +# OS Open USRN Dataset Lineage + +## Role In The Graph +OS Open USRN defines canonical street identity (`USRN`) and street naming used as the final street key in outputs. 
+ +## Ingest Contract +- Source key: `os_open_usrn` +- Raw table: `raw.os_open_usrn_row` +- Stage output: `stage.streets_usrn_input` +- Primary pass usage: Pass `2_gb_canonical_streets` + +## Stage Normalisation +- Core normalised fields: + - `usrn` + - `street_name` + - `street_name_casefolded` + - class/status metadata (when available) + +## Downstream Transformations +- Pass 2 writes `core.streets_usrn`. +- If direct USRN names are sparse, pass 2 infers missing USRN names from Open Names + LIDS TOID bridges. +- Passes 3/4/7 use `core.streets_usrn` for canonical name matching. + +## Value Added +- Provides a stable street key for provenance and de-duplication. +- Anchors candidate evidence to canonical street names. + +## Related Docs +- Pass 2 details: [`../stages/2_gb_canonical_streets.md`](../stages/2_gb_canonical_streets.md) +- Open Names linkage: [`os_open_names.md`](os_open_names.md) +- LIDS bridge: [`os_open_lids.md`](os_open_lids.md) diff --git a/docs/architecture/datasets/osni_gazetteer.md b/docs/architecture/datasets/osni_gazetteer.md new file mode 100644 index 0000000..ace39ae --- /dev/null +++ b/docs/architecture/datasets/osni_gazetteer.md @@ -0,0 +1,19 @@ +# OSNI Gazetteer Dataset Lineage (Optional NI) + +## Role In The Graph +OSNI Gazetteer is NI-specific street evidence input for NI-enabled profiles. + +## Ingest Contract +- Source key: `osni_gazetteer` +- Raw table: `raw.osni_gazetteer_row` +- Stage table: `stage.osni_street_point` + +## Downstream Transformations +- Pass 6 emits `osni_gazetteer_direct` candidates. + +## Value Added +- Extends NI street evidence coverage under explicit NI confidence constraints. 
+ +## Related Docs +- Pass 6 details: [`../stages/6_ni_candidates.md`](../stages/6_ni_candidates.md) +- Candidate type rules: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/datasets/ppd.md b/docs/architecture/datasets/ppd.md new file mode 100644 index 0000000..ec56152 --- /dev/null +++ b/docs/architecture/datasets/ppd.md @@ -0,0 +1,30 @@ +# PPD Dataset Lineage (Optional) + +## Role In The Graph +PPD is a gap-fill source for lower-confidence address-derived street evidence. + +## Ingest Contract +- Source key: `ppd` +- Raw table: `raw.ppd_row` +- Stage table: `stage.ppd_parsed_address` +- Bundle rule: may include multiple ingest runs (baseline + updates), applied in deterministic ingest-time order. + +## Stage Normalisation +- Normalised fields: + - `row_hash` + - `postcode_norm` + - `house_number` + - `street_token_raw`, `street_token_casefolded` + +## Downstream Transformations +- Pass 7 performs token matching against canonical streets. +- Generates `ppd_parse_matched` or `ppd_parse_unmatched` candidate types. +- Used as additive gap-fill only; does not override stronger evidence. + +## Value Added +- Improves coverage where core spatial joins have sparse evidence. +- Preserves confidence transparency through explicit low/none-like candidate typing. 
+ +## Related Docs +- Pass 7 details: [`../stages/7_ppd_gap_fill.md`](../stages/7_ppd_gap_fill.md) +- PPD baseline/update rule: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/relationships-overview.md b/docs/architecture/relationships-overview.md new file mode 100644 index 0000000..238147b --- /dev/null +++ b/docs/architecture/relationships-overview.md @@ -0,0 +1,44 @@ +# Dataset Relationship Overview + +## Core Graph + +```text +ONSPD -> core.postcodes +OS Open USRN -> core.streets_usrn +OS Open Names + ONSPD -> candidates (names_postcode_feature) +OS Open Names + LIDS (TOID->USRN) -> candidates (oli_toid_usrn) +OS Open UPRN + NSUL + LIDS (UPRN->USRN) -> candidates (uprn_usrn) +OS Open Roads + core.postcodes -> fallback candidates (spatial_os_open_roads) +Optional: PPD -> gap-fill candidates (ppd_parse_*) +All candidates + weights -> derived.postcode_streets_final +Final + provenance joins -> api projections +``` + +## Relationship Types +- Validation relationship: + - ONSPD validates and normalises postcode existence and country/subdivision context. +- Canonical street relationship: + - USRN is the canonical street key (`core.streets_usrn`). +- Direct semantic relationship: + - Open Names road features link to postcodes and sometimes TOIDs. +- Identifier bridge relationship: + - LIDS resolves `TOID -> USRN` and `UPRN -> USRN`. +- Property density relationship: + - NSUL ties UPRN to postcode, enabling postcode/USRN aggregation with LIDS. +- Spatial fallback relationship: + - Open Roads provides low-confidence fallback where high-confidence evidence is absent. 
+ +## Where Each Relationship Is Materialised +- Raw snapshots: `raw.*` +- Typed normalisation: `stage.*` +- Canonical entities: `core.postcodes`, `core.streets_usrn` +- Evidence graph: `derived.postcode_street_candidates`, `derived.postcode_street_candidate_lineage` +- Final resolved output: `derived.postcode_streets_final` +- Provenance joins: `derived.postcode_streets_final_candidate`, `derived.postcode_streets_final_source` +- API shapes: `api.postcode_street_lookup__`, `api.postcode_lookup__` + +## Related Docs +- Pass-by-pass detail: [`stages/README.md`](stages/README.md) +- Dataset-specific lineage: [`datasets/README.md`](datasets/README.md) +- Value added by stage: [`value-added-by-stage.md`](value-added-by-stage.md) +- Spec authority: [`../spec/pipeline_v3/spec.md`](../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/stages/0a_raw_ingest.md b/docs/architecture/stages/0a_raw_ingest.md new file mode 100644 index 0000000..93124a3 --- /dev/null +++ b/docs/architecture/stages/0a_raw_ingest.md @@ -0,0 +1,18 @@ +# Pass 0a: Raw Ingest Validation + +## Purpose +Validate bundle sources exist and have non-zero ingest metadata row counts before transformations. + +## Inputs +- `meta.build_bundle_source` +- `meta.ingest_run` + +## Outputs +- pass checkpoint `0a_raw_ingest` with per-source row count summary + +## Value Added +- fast fail for missing/empty source runs +- deterministic baseline counts for observability + +## Related +- Bundle contract: [`../../spec/pipeline_v3/data_model.md`](../../spec/pipeline_v3/data_model.md) diff --git a/docs/architecture/stages/0b_stage_normalisation.md b/docs/architecture/stages/0b_stage_normalisation.md new file mode 100644 index 0000000..13d764a --- /dev/null +++ b/docs/architecture/stages/0b_stage_normalisation.md @@ -0,0 +1,33 @@ +# Pass 0b: Stage Normalisation + +## Purpose +Transform raw payloads into typed/stable stage contracts consumed by later passes. 
+ +## Inputs +- `raw.*` tables selected by bundle ingest runs +- `pipeline/config/source_schema.yaml` + +## Outputs +- `stage.onspd_postcode` +- `stage.streets_usrn_input` +- `stage.open_names_road_feature` +- `stage.open_roads_segment` +- `stage.uprn_point` +- `stage.oli_identifier_pair` +- `stage.oli_toid_usrn` +- `stage.oli_uprn_usrn` +- `stage.nsul_uprn_postcode` +- optional NI/PPD stage tables + +## Determinism/Validation +- required mapped fields validated per source +- stream/batch loading to avoid memory variability +- explicit relation typing for LIDS (`toid_usrn`, `uprn_usrn`) + +## Value Added +- converts heterogeneous schemas into deterministic internal contracts +- surfaces schema drift early + +## Related +- Dataset pages: [`../datasets/README.md`](../datasets/README.md) +- Determinism rules: [`../../spec/pipeline_v3/canonicalisation.md`](../../spec/pipeline_v3/canonicalisation.md) diff --git a/docs/architecture/stages/1_onspd_backbone.md b/docs/architecture/stages/1_onspd_backbone.md new file mode 100644 index 0000000..5472bb6 --- /dev/null +++ b/docs/architecture/stages/1_onspd_backbone.md @@ -0,0 +1,18 @@ +# Pass 1: ONSPD Backbone + +## Purpose +Build canonical postcode entities from staged ONSPD rows. + +## Inputs +- `stage.onspd_postcode` + +## Outputs +- `core.postcodes` +- `core.postcodes_meta` + +## Value Added +- authoritative postcode backbone +- unified geographic/admin context for subsequent joins + +## Related +- Dataset: [`../datasets/onspd.md`](../datasets/onspd.md) diff --git a/docs/architecture/stages/2_gb_canonical_streets.md b/docs/architecture/stages/2_gb_canonical_streets.md new file mode 100644 index 0000000..47ad575 --- /dev/null +++ b/docs/architecture/stages/2_gb_canonical_streets.md @@ -0,0 +1,18 @@ +# Pass 2: GB Canonical Streets + +## Purpose +Build `core.streets_usrn` as canonical street dictionary keyed by USRN. 
+ +## Inputs +- `stage.streets_usrn_input` +- `stage.open_names_road_feature` + `stage.oli_toid_usrn` (for inferred fallback names) + +## Outputs +- `core.streets_usrn` + +## Value Added +- canonical USRN street name layer +- inferred USRN naming where direct USRN names are missing + +## Related +- Datasets: [`../datasets/os_open_usrn.md`](../datasets/os_open_usrn.md), [`../datasets/os_open_names.md`](../datasets/os_open_names.md), [`../datasets/os_open_lids.md`](../datasets/os_open_lids.md) diff --git a/docs/architecture/stages/3_open_names_candidates.md b/docs/architecture/stages/3_open_names_candidates.md new file mode 100644 index 0000000..3e38409 --- /dev/null +++ b/docs/architecture/stages/3_open_names_candidates.md @@ -0,0 +1,26 @@ +# Pass 3: Open Names Candidates + +## Purpose +Create medium-confidence street candidates and append-only TOID-confirmed promotions. + +## Inputs +- `stage.open_names_road_feature` +- `stage.oli_toid_usrn` +- `core.postcodes` + +## Outputs +- base candidates: `candidate_type=names_postcode_feature` +- promoted candidates: `candidate_type=oli_toid_usrn` +- lineage: `derived.postcode_street_candidate_lineage` + +## Contract +- candidate table is immutable evidence +- promotions are insert-only; parent rows are never mutated + +## Value Added +- broad named-road evidence +- high-confidence confirmation via TOID->USRN bridge + +## Related +- Spec contract: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) +- Dataset pages: [`../datasets/os_open_names.md`](../datasets/os_open_names.md), [`../datasets/os_open_lids.md`](../datasets/os_open_lids.md) diff --git a/docs/architecture/stages/4_uprn_reinforcement.md b/docs/architecture/stages/4_uprn_reinforcement.md new file mode 100644 index 0000000..6a8103b --- /dev/null +++ b/docs/architecture/stages/4_uprn_reinforcement.md @@ -0,0 +1,20 @@ +# Pass 4: UPRN Reinforcement + +## Purpose +Generate high-confidence `uprn_usrn` candidates by aggregating property-level evidence. 
+ +## Inputs +- `stage.nsul_uprn_postcode` +- `stage.oli_uprn_usrn` +- `core.postcodes` +- `core.streets_usrn` + +## Outputs +- `derived.postcode_street_candidates` rows (`candidate_type=uprn_usrn`, `confidence=high`) + +## Value Added +- strongest GB evidence class from UPRN-linked observations +- frequency signal (`uprn_count`) for ranking/probability + +## Related +- Datasets: [`../datasets/os_open_uprn.md`](../datasets/os_open_uprn.md), [`../datasets/nsul.md`](../datasets/nsul.md), [`../datasets/os_open_lids.md`](../datasets/os_open_lids.md) diff --git a/docs/architecture/stages/5_gb_spatial_fallback.md b/docs/architecture/stages/5_gb_spatial_fallback.md new file mode 100644 index 0000000..d9e182e --- /dev/null +++ b/docs/architecture/stages/5_gb_spatial_fallback.md @@ -0,0 +1,18 @@ +# Pass 5: GB Spatial Fallback + +## Purpose +Add low-confidence fallback candidates for postcodes lacking high-confidence evidence. + +## Inputs +- `stage.open_roads_segment` +- `core.postcodes` +- existing candidates + +## Outputs +- `derived.postcode_street_candidates` rows (`candidate_type=spatial_os_open_roads`, `confidence=low`) + +## Value Added +- coverage recovery with explicit low-confidence tagging + +## Related +- Dataset: [`../datasets/os_open_roads.md`](../datasets/os_open_roads.md) diff --git a/docs/architecture/stages/6_ni_candidates.md b/docs/architecture/stages/6_ni_candidates.md new file mode 100644 index 0000000..3eee9f8 --- /dev/null +++ b/docs/architecture/stages/6_ni_candidates.md @@ -0,0 +1,19 @@ +# Pass 6: NI Candidates (Profile-Dependent) + +## Purpose +Generate NI-specific candidate types when NI sources are present. 
+ +## Inputs +- `stage.osni_street_point` +- `stage.dfi_road_segment` +- `core.postcodes` + +## Outputs +- `osni_gazetteer_direct` candidates +- `spatial_dfi_highway` candidates + +## Value Added +- extends NI coverage under explicit confidence constraints + +## Related +- Datasets: [`../datasets/osni_gazetteer.md`](../datasets/osni_gazetteer.md), [`../datasets/dfi_highway.md`](../datasets/dfi_highway.md) diff --git a/docs/architecture/stages/7_ppd_gap_fill.md b/docs/architecture/stages/7_ppd_gap_fill.md new file mode 100644 index 0000000..5812bf6 --- /dev/null +++ b/docs/architecture/stages/7_ppd_gap_fill.md @@ -0,0 +1,19 @@ +# Pass 7: PPD Gap Fill (Profile-Dependent) + +## Purpose +Add lower-confidence candidates from parsed transactional/self-reported addresses. + +## Inputs +- `stage.ppd_parsed_address` +- `core.streets_usrn` + +## Outputs +- `derived.postcode_street_candidates` (`ppd_parse_matched` / `ppd_parse_unmatched`) +- `internal.unit_index` + +## Value Added +- gap filling without overriding stronger spatial evidence +- expanded coverage with transparent provenance tags + +## Related +- Dataset: [`../datasets/ppd.md`](../datasets/ppd.md) diff --git a/docs/architecture/stages/8_finalisation.md b/docs/architecture/stages/8_finalisation.md new file mode 100644 index 0000000..3476a7b --- /dev/null +++ b/docs/architecture/stages/8_finalisation.md @@ -0,0 +1,28 @@ +# Pass 8: Finalisation + +## Purpose +Resolve candidate evidence into final postcode/street outputs and materialized API projections. 
+ +## Inputs +- `derived.postcode_street_candidates` +- frequency weights config + +## Outputs +- `derived.postcode_streets_final` +- `derived.postcode_streets_final_candidate` +- `derived.postcode_streets_final_source` +- versioned API tables: + - `api.postcode_street_lookup__` + - `api.postcode_lookup__` + +## Deterministic Probability +- exact formula normalization by postcode total weight +- fixed-scale rounding + deterministic residual correction to rank 1 street + +## Value Added +- converts evidence graph into stable product outputs +- provides reproducible hashes and publishable API projections + +## Related +- Probability contract: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) +- Canonicalisation: [`../../spec/pipeline_v3/canonicalisation.md`](../../spec/pipeline_v3/canonicalisation.md) diff --git a/docs/architecture/stages/README.md b/docs/architecture/stages/README.md new file mode 100644 index 0000000..d60ea25 --- /dev/null +++ b/docs/architecture/stages/README.md @@ -0,0 +1,19 @@ +# Stage/Pass Documentation Index + +## Build Passes +1. [`0a_raw_ingest.md`](0a_raw_ingest.md) +2. [`0b_stage_normalisation.md`](0b_stage_normalisation.md) +3. [`1_onspd_backbone.md`](1_onspd_backbone.md) +4. [`2_gb_canonical_streets.md`](2_gb_canonical_streets.md) +5. [`3_open_names_candidates.md`](3_open_names_candidates.md) +6. [`4_uprn_reinforcement.md`](4_uprn_reinforcement.md) +7. [`5_gb_spatial_fallback.md`](5_gb_spatial_fallback.md) +8. [`6_ni_candidates.md`](6_ni_candidates.md) +9. [`7_ppd_gap_fill.md`](7_ppd_gap_fill.md) +10. 
[`8_finalisation.md`](8_finalisation.md) + +## Cross-links +- Dataset lineage index: [`../datasets/README.md`](../datasets/README.md) +- Relationship overview: [`../relationships-overview.md`](../relationships-overview.md) +- Value-added summary: [`../value-added-by-stage.md`](../value-added-by-stage.md) +- Spec authority: [`../../spec/pipeline_v3/spec.md`](../../spec/pipeline_v3/spec.md) diff --git a/docs/architecture/value-added-by-stage.md b/docs/architecture/value-added-by-stage.md new file mode 100644 index 0000000..a1af9c0 --- /dev/null +++ b/docs/architecture/value-added-by-stage.md @@ -0,0 +1,80 @@ +# Value Added By Stage + +This page explains what new product value is created at each pass, not just what tables are written. + +## Pass 0a: Raw Ingest Validation +- Inputs: bundle-selected ingest runs (`meta.ingest_run`). +- Output: validated source presence and row-count baseline. +- Value added: + - confirms build bundle completeness before transformations + - establishes reproducible volume expectations per source + +## Pass 0b: Stage Normalisation +- Inputs: immutable raw payloads (`raw.*`) + schema mapping config. +- Output: typed/normalized rows in `stage.*`. +- Value added: + - converts heterogeneous source schemas to deterministic internal contracts + - enforces required-field gates before downstream joins + - materialises `LIDS` relation typing (`id_1`, `id_2`, `relation_type`) + +## Pass 1: ONSPD Backbone +- Inputs: `stage.onspd_postcode`. +- Output: `core.postcodes`, `core.postcodes_meta`. +- Value added: + - creates authoritative postcode validation layer + - provides postcode centroid/admin metadata context for all later joins + +## Pass 2: Canonical Streets (USRN) +- Inputs: `stage.streets_usrn_input`, `stage.open_names_road_feature`, `stage.oli_toid_usrn`. +- Output: `core.streets_usrn`. 
+- Value added: + - produces canonical USRN-keyed street dictionary + - fills gaps by inferring USRN names from Open Names + LIDS TOID mapping when direct names are absent + +## Pass 3: Open Names Candidates +- Inputs: `stage.open_names_road_feature`, `stage.oli_toid_usrn`, `core.*`. +- Output: `derived.postcode_street_candidates` + lineage rows. +- Value added: + - creates medium-confidence postcode/street evidence from named features + - upgrades TOID-confirmed evidence via append-only promotion (`oli_toid_usrn`) + - preserves full evidence chain (immutable parent + promoted child + lineage) + +## Pass 4: UPRN Reinforcement +- Inputs: `stage.nsul_uprn_postcode`, `stage.oli_uprn_usrn`, `core.*`. +- Output: high-confidence `uprn_usrn` candidates. +- Value added: + - adds strong evidence using property-level frequency aggregation + - ties street inference to observed property density per postcode + +## Pass 5: GB Spatial Fallback +- Inputs: `stage.open_roads_segment`, `core.postcodes`, current candidates. +- Output: low-confidence `spatial_os_open_roads` candidates. +- Value added: + - closes obvious holes where no high-confidence candidate exists + - improves postcode coverage while preserving confidence transparency + +## Pass 6: NI Candidates (Optional Profile) +- Inputs: `stage.osni_street_point`, `stage.dfi_road_segment`, `core.postcodes`. +- Output: NI-specific candidate types. +- Value added: + - extends coverage for NI builds with explicitly capped confidence + +## Pass 7: PPD Gap Fill (Optional Profile) +- Inputs: `stage.ppd_parsed_address`, `core.streets_usrn`. +- Output: `ppd_parse_*` candidates, `internal.unit_index`. +- Value added: + - uses transactional/self-reported evidence to fill gaps only + - never overrides stronger core spatial evidence + +## Pass 8: Finalisation +- Inputs: all candidates + weights config. +- Output: final tables + API versioned projections + deterministic hashes. 
+- Value added: + - resolves competing evidence into ranked final street outputs + - computes exact probabilities with deterministic rounding correction + - produces API-ready materialisations with relational provenance backing + +## Cross-links +- Stage details: [`stages/README.md`](stages/README.md) +- Dataset lineage pages: [`datasets/README.md`](datasets/README.md) +- Probability contract: [`../spec/pipeline_v3/spec.md`](../spec/pipeline_v3/spec.md) diff --git a/docs/spec/phase_1/changes.md b/docs/spec/phase_1/changes.md index 1c3149b..bb3d5fe 100644 --- a/docs/spec/phase_1/changes.md +++ b/docs/spec/phase_1/changes.md @@ -22,7 +22,7 @@ Determinism confirmation: - Tie-break remains stable and explicit (`distance`, then `segment_id`). Spec update confirmation: -- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_1/spec.md` to reflect runtime query contract and dated change note. +- Updated `docs/spec/phase_1/spec.md` to reflect runtime query contract and dated change note. ## 2026-02-20 — CHG-0002 @@ -53,7 +53,7 @@ Determinism confirmation: - Rebuild path remains explicit via `--rebuild`. Spec update confirmation: -- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_1/spec.md` with resume CLI and checkpoint table contract. +- Updated `docs/spec/phase_1/spec.md` with resume CLI and checkpoint table contract. ## 2026-02-20 — CHG-0003 @@ -84,4 +84,4 @@ Determinism confirmation: - Resume continues by skipping only completed table-level checkpoints. Spec update confirmation: -- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_1/spec.md` to lock table-level checkpoint behavior. +- Updated `docs/spec/phase_1/spec.md` to lock table-level checkpoint behavior. 
diff --git a/docs/spec/phase_2-open-names/changes.md b/docs/spec/phase_2-open-names/changes.md index c2955d2..588945d 100644 --- a/docs/spec/phase_2-open-names/changes.md +++ b/docs/spec/phase_2-open-names/changes.md @@ -25,7 +25,7 @@ Determinism confirmation: - Determinism validated through contract tests. Spec update confirmation: -- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/spec.md` and added `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/voronoi_method.md`. +- Updated `docs/spec/phase_2-open-names/spec.md` and added `docs/spec/phase_2-open-names/voronoi_method.md`. ## 2026-02-20 — CHG-0002 @@ -62,8 +62,8 @@ Determinism confirmation: Spec update confirmation: - Updated: - - `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/spec.md` - - `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/prd.md` + - `docs/spec/phase_2-open-names/spec.md` + - `docs/spec/phase_2-open-names/prd.md` ## 2026-02-20 — CHG-0003 @@ -94,4 +94,4 @@ Determinism confirmation: - No row-order-dependent seed selection is used. Spec update confirmation: -- Updated `/Users/jamie/code/postcod.es/docs/spec/phase_2-open-names/spec.md`. +- Updated `docs/spec/phase_2-open-names/spec.md`. diff --git a/docs/spec/pipeline_v3/canonicalisation.md b/docs/spec/pipeline_v3/canonicalisation.md index abeae69..913bfdf 100644 --- a/docs/spec/pipeline_v3/canonicalisation.md +++ b/docs/spec/pipeline_v3/canonicalisation.md @@ -46,3 +46,7 @@ Probability ranking (descending) uses: ## Timezone All metadata timestamps are UTC. 
+ +## Cross-links +- Pass finalisation behavior: [`../../architecture/stages/8_finalisation.md`](../../architecture/stages/8_finalisation.md) +- Value-added summary: [`../../architecture/value-added-by-stage.md`](../../architecture/value-added-by-stage.md) diff --git a/docs/spec/pipeline_v3/data_model.md b/docs/spec/pipeline_v3/data_model.md index 1eaa059..e67877f 100644 --- a/docs/spec/pipeline_v3/data_model.md +++ b/docs/spec/pipeline_v3/data_model.md @@ -73,3 +73,8 @@ Disambiguation-only table. Never exposed to API reader role. - `api.postcode_street_lookup__` - `api.postcode_lookup__` - stable views: `api.postcode_street_lookup`, `api.postcode_lookup` + +## Cross-links +- Architecture relationships: [`../../architecture/relationships-overview.md`](../../architecture/relationships-overview.md) +- Dataset lineage pages: [`../../architecture/datasets/README.md`](../../architecture/datasets/README.md) +- Stage/pass pages: [`../../architecture/stages/README.md`](../../architecture/stages/README.md) diff --git a/docs/spec/pipeline_v3/spec.md b/docs/spec/pipeline_v3/spec.md index 7c929c9..9d60839 100644 --- a/docs/spec/pipeline_v3/spec.md +++ b/docs/spec/pipeline_v3/spec.md @@ -102,3 +102,8 @@ Final outputs use relational provenance: - `derived.postcode_streets_final_source` Arrays and JSON payloads are projection-only conveniences in `api.*` tables/views. + +## 8. Architecture Cross-links +- Architecture index: [`../../architecture/README.md`](../../architecture/README.md) +- Dataset lineage pages: [`../../architecture/datasets/README.md`](../../architecture/datasets/README.md) +- Stage/pass pages: [`../../architecture/stages/README.md`](../../architecture/stages/README.md) diff --git a/pipeline/AGENTS.md b/pipeline/AGENTS.md new file mode 100644 index 0000000..af2e43d --- /dev/null +++ b/pipeline/AGENTS.md @@ -0,0 +1,21 @@ +# pipeline/AGENTS.md + +## Scope +Implementation guidance for `pipeline/` (configs, SQL migrations, runtime modules). 
+ +## Critical Rule +If code in `pipeline/` changes behavior, update matching docs in `docs/spec/pipeline_v3/` and `docs/architecture/` in the same commit series. + +## Fast Navigation +- CLI: `pipeline/src/pipeline/cli.py` +- Build logic: `pipeline/src/pipeline/build/workflows.py` +- Ingest logic: `pipeline/src/pipeline/ingest/workflows.py` +- Manifest contracts: `pipeline/src/pipeline/manifest.py` +- Migrations: `pipeline/sql/migrations/` +- Runtime configs: `pipeline/config/` + +## Change Checklist +- migration required? +- manifest/source schema mappings updated? +- determinism/canonicalisation still valid? +- tests/docs updated together? diff --git a/pipeline/src/pipeline/AGENTS.md b/pipeline/src/pipeline/AGENTS.md new file mode 100644 index 0000000..5ea0d2b --- /dev/null +++ b/pipeline/src/pipeline/AGENTS.md @@ -0,0 +1,25 @@ +# pipeline/src/pipeline/AGENTS.md + +## Scope +Python runtime internals for ingest/build/verify/publish. + +## Critical Rule +Keep docs in lockstep with behavior changes. Update: +- `docs/spec/pipeline_v3/spec.md` +- `docs/spec/pipeline_v3/data_model.md` +- `docs/spec/pipeline_v3/canonicalisation.md` +- relevant pages under `docs/architecture/` + +## Module Map +- `cli.py`: command surface and run flow +- `manifest.py`: source/bundle manifest validation +- `ingest/workflows.py`: raw ingest into `raw.*` +- `build/workflows.py`: pass execution 0a..8, provenance, finalisation, publish +- `db/migrations.py`: migration execution +- `util/normalise.py`: canonical text/postcode normalization + +## Common Pitfalls +- nondeterministic ordering in SQL +- mutating append-only evidence tables +- schema mapping drift between manifests/raw payload fields +- config changes without docs/test updates diff --git a/tests/AGENTS.md b/tests/AGENTS.md new file mode 100644 index 0000000..1d1a543 --- /dev/null +++ b/tests/AGENTS.md @@ -0,0 +1,19 @@ +# tests/AGENTS.md + +## Scope +Test strategy and expectations for this repository. 
+ +## Critical Rule +Any behavior change in pipeline logic must include matching test updates and documentation updates in the same workstream. + +## Expectations +- deterministic outputs for identical inputs +- schema validation tests for mapped required fields +- provenance/immutability contract tests +- probability normalization correctness tests +- fixture-based tests only (no live network dependencies) + +## Navigation +- test root: `tests/` +- core docs for expected behavior: `docs/spec/pipeline_v3/` +- architecture references: `docs/architecture/` From fe40449da06b99d757b06cbe1ead5f49bb75ff54 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 00:31:03 +0000 Subject: [PATCH 09/17] docs(architecture): add mermaid dataflow diagram for v3 pipeline - add a detailed Mermaid flowchart covering source->meta->raw->stage->core->derived->api relationships\n- align node names with current v3 table/source naming (including os_open_lids)\n- mark optional NI/PPD branches in the diagram\n- surface the diagram from architecture index quick links --- docs/architecture/README.md | 2 +- docs/architecture/relationships-overview.md | 164 ++++++++++++++++++++ 2 files changed, 165 insertions(+), 1 deletion(-) diff --git a/docs/architecture/README.md b/docs/architecture/README.md index da8c7b1..6fa7ed3 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -3,7 +3,7 @@ This section explains how datasets relate to each other, how data moves through ingest/stage/build passes, and what value is added at each step. 
## Quick Links -- Relationship map: [`relationships-overview.md`](relationships-overview.md) +- Relationship map + Mermaid system diagram: [`relationships-overview.md`](relationships-overview.md) - End-to-end value by pass: [`value-added-by-stage.md`](value-added-by-stage.md) - Dataset index: [`datasets/README.md`](datasets/README.md) - Stage/pass index: [`stages/README.md`](stages/README.md) diff --git a/docs/architecture/relationships-overview.md b/docs/architecture/relationships-overview.md index 238147b..beda6d3 100644 --- a/docs/architecture/relationships-overview.md +++ b/docs/architecture/relationships-overview.md @@ -14,6 +14,170 @@ All candidates + weights -> derived.postcode_streets_final Final + provenance joins -> api projections ``` +## Mermaid Diagram + +```mermaid +flowchart TB + subgraph S["Source Datasets"] + ONSPD["ONSPD"] + USRN["OS Open USRN"] + NAMES["OS Open Names"] + LIDS["OS Open LIDS"] + UPRN["OS Open UPRN"] + NSUL["NSUL"] + ROADS["OS Open Roads"] + OSNI["OSNI Gazetteer"] + DFI["DfI Highway"] + PPD["HM Land Registry PPD"] + end + + subgraph META["Meta"] + IR["meta.ingest_run"] + IRF["meta.ingest_run_file"] + BB["meta.build_bundle"] + BBS["meta.build_bundle_source"] + BR["meta.build_run"] + BPC["meta.build_pass_checkpoint"] + CH["meta.canonical_hash"] + PUB["meta.dataset_publication"] + end + + subgraph RAW["Raw"] + R_ONSPD["raw.onspd_row"] + R_USRN["raw.os_open_usrn_row"] + R_NAMES["raw.os_open_names_row"] + R_LIDS["raw.os_open_lids_row"] + R_UPRN["raw.os_open_uprn_row"] + R_NSUL["raw.nsul_row"] + R_ROADS["raw.os_open_roads_row"] + R_OSNI["raw.osni_gazetteer_row"] + R_DFI["raw.dfi_highway_row"] + R_PPD["raw.ppd_row"] + end + + subgraph STAGE["Stage"] + S_ONSPD["stage.onspd_postcode"] + S_USRN["stage.streets_usrn_input"] + S_NAMES["stage.open_names_road_feature"] + S_LIDS_PAIR["stage.oli_identifier_pair"] + S_LIDS_TOID["stage.oli_toid_usrn"] + S_LIDS_UPRN["stage.oli_uprn_usrn"] + S_UPRN["stage.uprn_point"] + 
S_NSUL["stage.nsul_uprn_postcode"] + S_ROADS["stage.open_roads_segment"] + S_OSNI["stage.osni_street_point"] + S_DFI["stage.dfi_road_segment"] + S_PPD["stage.ppd_parsed_address"] + end + + subgraph CORE["Core"] + C_POST["core.postcodes"] + C_META["core.postcodes_meta"] + C_STREETS["core.streets_usrn"] + end + + subgraph DERIVED["Derived"] + CAND["derived.postcode_street_candidates"] + LIN["derived.postcode_street_candidate_lineage"] + FINAL["derived.postcode_streets_final"] + FINAL_CAND["derived.postcode_streets_final_candidate"] + FINAL_SRC["derived.postcode_streets_final_source"] + end + + subgraph INTERNAL["Internal"] + UNIT["internal.unit_index"] + end + + subgraph API["API Projections"] + API_STREET_V["api.postcode_street_lookup__"] + API_POST_V["api.postcode_lookup__"] + API_STREET["api.postcode_street_lookup (view)"] + API_POST["api.postcode_lookup (view)"] + end + + ONSPD --> IR + USRN --> IR + NAMES --> IR + LIDS --> IR + UPRN --> IR + NSUL --> IR + ROADS --> IR + OSNI --> IR + DFI --> IR + PPD --> IR + + IR --> IRF + IR --> BBS + BB --> BBS + BB --> BR + BR --> BPC + BR --> CH + + ONSPD --> R_ONSPD + USRN --> R_USRN + NAMES --> R_NAMES + LIDS --> R_LIDS + UPRN --> R_UPRN + NSUL --> R_NSUL + ROADS --> R_ROADS + OSNI -. optional .-> R_OSNI + DFI -. optional .-> R_DFI + PPD -. optional .-> R_PPD + + R_ONSPD --> S_ONSPD + R_USRN --> S_USRN + R_NAMES --> S_NAMES + R_LIDS --> S_LIDS_PAIR + R_UPRN --> S_UPRN + R_NSUL --> S_NSUL + R_ROADS --> S_ROADS + R_OSNI -. optional .-> S_OSNI + R_DFI -. optional .-> S_DFI + R_PPD -. optional .-> S_PPD + + S_LIDS_PAIR --> S_LIDS_TOID + S_LIDS_PAIR --> S_LIDS_UPRN + + S_ONSPD --> C_POST + S_ONSPD --> C_META + S_USRN --> C_STREETS + S_NAMES --> C_STREETS + S_LIDS_TOID --> C_STREETS + + C_POST --> CAND + C_STREETS --> CAND + S_NAMES --> CAND + S_LIDS_TOID --> CAND + S_NSUL --> CAND + S_LIDS_UPRN --> CAND + S_ROADS --> CAND + S_OSNI -. optional .-> CAND + S_DFI -. optional .-> CAND + S_PPD -. optional .-> CAND + + S_PPD -. 
optional .-> UNIT + + CAND --> LIN + CAND --> FINAL + C_POST --> FINAL + C_STREETS --> FINAL + + FINAL --> FINAL_CAND + CAND --> FINAL_CAND + FINAL --> FINAL_SRC + IR --> FINAL_SRC + + FINAL --> API_STREET_V + FINAL --> API_POST_V + FINAL_SRC --> API_STREET_V + FINAL_SRC --> API_POST_V + BR --> API_STREET_V + BR --> API_POST_V + API_STREET_V --> API_STREET + API_POST_V --> API_POST + BR --> PUB +``` + ## Relationship Types - Validation relationship: - ONSPD validates and normalises postcode existence and country/subdivision context. From f61657b5644233db038d8087f3d1e53a8f72d812 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 11:47:14 +0000 Subject: [PATCH 10/17] refactor(pipeline): standardize open_lids naming and speed up stage 0b heavy-source normalization --- pipeline/config/frequency_weights.yaml | 2 +- .../migrations/0005_v3_cutover_foundation.sql | 8 +- .../0008_v3_lids_relation_stage.sql | 6 +- .../migrations/0010_v3_open_lids_naming.sql | 115 ++++ .../0011_v3_raw_run_rownum_indexes.sql | 35 ++ pipeline/src/pipeline/build/workflows.py | 556 +++++++++++------- pipeline/src/pipeline/ingest/workflows.py | 10 + tests/test_pass3_append_only_promotion.py | 2 +- 8 files changed, 512 insertions(+), 222 deletions(-) create mode 100644 pipeline/sql/migrations/0010_v3_open_lids_naming.sql create mode 100644 pipeline/sql/migrations/0011_v3_raw_run_rownum_indexes.sql diff --git a/pipeline/config/frequency_weights.yaml b/pipeline/config/frequency_weights.yaml index 7fc5c63..b24dd0a 100644 --- a/pipeline/config/frequency_weights.yaml +++ b/pipeline/config/frequency_weights.yaml @@ -1,7 +1,7 @@ { "weights": { "names_postcode_feature": 0.6, - "oli_toid_usrn": 0.9, + "open_lids_toid_usrn": 0.9, "uprn_usrn": 1.0, "spatial_os_open_roads": 0.3, "osni_gazetteer_direct": 0.6, diff --git a/pipeline/sql/migrations/0005_v3_cutover_foundation.sql b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql index 4bc7808..13cf430 100644 --- 
a/pipeline/sql/migrations/0005_v3_cutover_foundation.sql +++ b/pipeline/sql/migrations/0005_v3_cutover_foundation.sql @@ -263,7 +263,7 @@ CREATE TABLE IF NOT EXISTS stage.uprn_point ( PRIMARY KEY (build_run_id, uprn) ); -CREATE TABLE IF NOT EXISTS stage.oli_toid_usrn ( +CREATE TABLE IF NOT EXISTS stage.open_lids_toid_usrn ( build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, toid text NOT NULL, usrn bigint NOT NULL, @@ -271,7 +271,7 @@ CREATE TABLE IF NOT EXISTS stage.oli_toid_usrn ( PRIMARY KEY (build_run_id, toid, usrn) ); -CREATE TABLE IF NOT EXISTS stage.oli_uprn_usrn ( +CREATE TABLE IF NOT EXISTS stage.open_lids_uprn_usrn ( build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, uprn bigint NOT NULL, usrn bigint NOT NULL, @@ -386,7 +386,7 @@ CREATE TABLE IF NOT EXISTS derived.postcode_street_candidates ( REFERENCES meta.ingest_run (run_id), CHECK (candidate_type IN ( 'names_postcode_feature', - 'oli_toid_usrn', + 'open_lids_toid_usrn', 'uprn_usrn', 'spatial_os_open_roads', 'osni_gazetteer_direct', @@ -464,7 +464,7 @@ CREATE TABLE IF NOT EXISTS derived.postcode_streets_final_source ( PRIMARY KEY (final_id, source_name, ingest_run_id, candidate_type), CHECK (candidate_type IN ( 'names_postcode_feature', - 'oli_toid_usrn', + 'open_lids_toid_usrn', 'uprn_usrn', 'spatial_os_open_roads', 'osni_gazetteer_direct', diff --git a/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql b/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql index cee935f..c0e2b66 100644 --- a/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql +++ b/pipeline/sql/migrations/0008_v3_lids_relation_stage.sql @@ -1,6 +1,6 @@ BEGIN; -CREATE TABLE IF NOT EXISTS stage.oli_identifier_pair ( +CREATE TABLE IF NOT EXISTS stage.open_lids_pair ( build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, id_1 text NOT NULL, id_2 text NOT NULL, @@ -10,7 +10,7 @@ CREATE TABLE IF NOT EXISTS 
stage.oli_identifier_pair ( CHECK (relation_type IN ('toid_usrn', 'uprn_usrn')) ); -CREATE INDEX IF NOT EXISTS idx_stage_oli_identifier_pair_run_relation - ON stage.oli_identifier_pair (build_run_id, relation_type); +CREATE INDEX IF NOT EXISTS idx_stage_open_lids_pair_run_relation + ON stage.open_lids_pair (build_run_id, relation_type); COMMIT; diff --git a/pipeline/sql/migrations/0010_v3_open_lids_naming.sql b/pipeline/sql/migrations/0010_v3_open_lids_naming.sql new file mode 100644 index 0000000..067c86e --- /dev/null +++ b/pipeline/sql/migrations/0010_v3_open_lids_naming.sql @@ -0,0 +1,115 @@ +BEGIN; + +DO $$ +BEGIN + IF to_regclass('stage.oli_identifier_pair') IS NOT NULL + AND to_regclass('stage.open_lids_pair') IS NULL THEN + ALTER TABLE stage.oli_identifier_pair + RENAME TO open_lids_pair; + END IF; + + IF to_regclass('stage.oli_toid_usrn') IS NOT NULL + AND to_regclass('stage.open_lids_toid_usrn') IS NULL THEN + ALTER TABLE stage.oli_toid_usrn + RENAME TO open_lids_toid_usrn; + END IF; + + IF to_regclass('stage.oli_uprn_usrn') IS NOT NULL + AND to_regclass('stage.open_lids_uprn_usrn') IS NULL THEN + ALTER TABLE stage.oli_uprn_usrn + RENAME TO open_lids_uprn_usrn; + END IF; +END $$; + +DO $$ +BEGIN + IF to_regclass('stage.oli_identifier_pair_pkey') IS NOT NULL + AND to_regclass('stage.open_lids_pair_pkey') IS NULL THEN + ALTER INDEX stage.oli_identifier_pair_pkey + RENAME TO open_lids_pair_pkey; + END IF; + + IF to_regclass('stage.oli_toid_usrn_pkey') IS NOT NULL + AND to_regclass('stage.open_lids_toid_usrn_pkey') IS NULL THEN + ALTER INDEX stage.oli_toid_usrn_pkey + RENAME TO open_lids_toid_usrn_pkey; + END IF; + + IF to_regclass('stage.oli_uprn_usrn_pkey') IS NOT NULL + AND to_regclass('stage.open_lids_uprn_usrn_pkey') IS NULL THEN + ALTER INDEX stage.oli_uprn_usrn_pkey + RENAME TO open_lids_uprn_usrn_pkey; + END IF; + + IF to_regclass('stage.idx_stage_oli_identifier_pair_run_relation') IS NOT NULL + AND 
to_regclass('stage.idx_stage_open_lids_pair_run_relation') IS NULL THEN + ALTER INDEX stage.idx_stage_oli_identifier_pair_run_relation + RENAME TO idx_stage_open_lids_pair_run_relation; + END IF; +END $$; + +CREATE TABLE IF NOT EXISTS stage.open_lids_pair ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + id_1 text NOT NULL, + id_2 text NOT NULL, + relation_type text NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, id_1, id_2, relation_type), + CHECK (relation_type IN ('toid_usrn', 'uprn_usrn')) +); + +CREATE INDEX IF NOT EXISTS idx_stage_open_lids_pair_run_relation + ON stage.open_lids_pair (build_run_id, relation_type); + +CREATE TABLE IF NOT EXISTS stage.open_lids_toid_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + toid text NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, toid, usrn) +); + +CREATE TABLE IF NOT EXISTS stage.open_lids_uprn_usrn ( + build_run_id uuid NOT NULL REFERENCES meta.build_run (build_run_id) ON DELETE CASCADE, + uprn bigint NOT NULL, + usrn bigint NOT NULL, + ingest_run_id uuid NOT NULL REFERENCES meta.ingest_run (run_id), + PRIMARY KEY (build_run_id, uprn, usrn) +); + +ALTER TABLE derived.postcode_street_candidates + DROP CONSTRAINT IF EXISTS postcode_street_candidates_candidate_type_check; + +ALTER TABLE derived.postcode_streets_final_source + DROP CONSTRAINT IF EXISTS postcode_streets_final_source_candidate_type_check; + +ALTER TABLE derived.postcode_street_candidates DISABLE TRIGGER USER; + +UPDATE derived.postcode_street_candidates +SET candidate_type = 'open_lids_toid_usrn' +WHERE candidate_type = 'oli_toid_usrn'; + +UPDATE derived.postcode_street_candidates +SET evidence_ref = regexp_replace(evidence_ref, '^oli:', 'open_lids:') +WHERE evidence_ref LIKE 'oli:%'; + +ALTER TABLE 
derived.postcode_street_candidates ENABLE TRIGGER USER; + +UPDATE derived.postcode_streets_final_source +SET candidate_type = 'open_lids_toid_usrn' +WHERE candidate_type = 'oli_toid_usrn'; + +ALTER TABLE derived.postcode_streets_final_source + ADD CONSTRAINT postcode_streets_final_source_candidate_type_check + CHECK (candidate_type IN ( + 'names_postcode_feature', + 'open_lids_toid_usrn', + 'uprn_usrn', + 'spatial_os_open_roads', + 'osni_gazetteer_direct', + 'spatial_dfi_highway', + 'ppd_parse_matched', + 'ppd_parse_unmatched' + )); + +COMMIT; diff --git a/pipeline/sql/migrations/0011_v3_raw_run_rownum_indexes.sql b/pipeline/sql/migrations/0011_v3_raw_run_rownum_indexes.sql new file mode 100644 index 0000000..0c55d40 --- /dev/null +++ b/pipeline/sql/migrations/0011_v3_raw_run_rownum_indexes.sql @@ -0,0 +1,35 @@ +BEGIN; + +DROP INDEX IF EXISTS raw.idx_raw_onspd_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_usrn_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_names_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_roads_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_uprn_run_id; +DROP INDEX IF EXISTS raw.idx_raw_os_open_lids_run_id; +DROP INDEX IF EXISTS raw.idx_raw_nsul_run_id; +DROP INDEX IF EXISTS raw.idx_raw_osni_run_id; +DROP INDEX IF EXISTS raw.idx_raw_dfi_run_id; +DROP INDEX IF EXISTS raw.idx_raw_ppd_run_id; + +CREATE INDEX IF NOT EXISTS idx_raw_onspd_run_rownum + ON raw.onspd_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_usrn_run_rownum + ON raw.os_open_usrn_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_names_run_rownum + ON raw.os_open_names_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_roads_run_rownum + ON raw.os_open_roads_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_uprn_run_rownum + ON raw.os_open_uprn_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_os_open_lids_run_rownum + ON 
raw.os_open_lids_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_nsul_run_rownum + ON raw.nsul_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_osni_run_rownum + ON raw.osni_gazetteer_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_dfi_run_rownum + ON raw.dfi_highway_row (ingest_run_id, source_row_num); +CREATE INDEX IF NOT EXISTS idx_raw_ppd_run_rownum + ON raw.ppd_row (ingest_run_id, source_row_num); + +COMMIT; diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py index 39e1d94..dc7594c 100644 --- a/pipeline/src/pipeline/build/workflows.py +++ b/pipeline/src/pipeline/build/workflows.py @@ -84,7 +84,7 @@ class PublishResult: CANDIDATE_TYPES = ( "names_postcode_feature", - "oli_toid_usrn", + "open_lids_toid_usrn", "uprn_usrn", "spatial_os_open_roads", "osni_gazetteer_direct", @@ -600,6 +600,57 @@ def _field_value(row: dict[str, Any], field_map: dict[str, str], logical_key: st return None +def _validated_raw_sample_row( + conn: psycopg.Connection, + *, + source_name: str, + raw_table: str, + ingest_run_id: str, + field_map: dict[str, str], + required_fields: tuple[str, ...], +) -> dict[str, Any]: + schema_name, table_name = raw_table.split(".", 1) + with conn.cursor() as cur: + cur.execute( + sql.SQL( + """ + SELECT payload_jsonb + FROM {}.{} + WHERE ingest_run_id = %s + ORDER BY source_row_num ASC + LIMIT 1 + """ + ).format(sql.Identifier(schema_name), sql.Identifier(table_name)), + (ingest_run_id,), + ) + row = cur.fetchone() + if row is None: + raise BuildError(f"Raw source is empty for {source_name}; cannot stage-normalise") + + sample_row = row[0] + _assert_required_mapped_fields_present( + source_name=source_name, + sample_row=sample_row, + field_map=field_map, + required_fields=required_fields, + ) + return sample_row + + +def _json_text_from_candidates(payload_expr: sql.SQL, candidates: tuple[str, ...]) -> sql.SQL: + if len(candidates) == 0: + 
return sql.SQL("NULL") + lookups = [ + sql.SQL("{} ->> {}").format(payload_expr, sql.Literal(candidate)) + for candidate in candidates + ] + return sql.SQL("COALESCE({})").format(sql.SQL(", ").join(lookups)) + + +def _json_text_for_field(payload_expr: sql.SQL, field_map: dict[str, str], logical_key: str) -> sql.SQL: + return _json_text_from_candidates(payload_expr, _field_name_candidates(field_map, logical_key)) + + def _schema_insert_rows( conn: psycopg.Connection, query: sql.SQL, @@ -630,9 +681,9 @@ def _stage_cleanup(conn: psycopg.Connection, build_run_id: str) -> None: "stage.dfi_road_segment", "stage.osni_street_point", "stage.nsul_uprn_postcode", - "stage.oli_uprn_usrn", - "stage.oli_toid_usrn", - "stage.oli_identifier_pair", + "stage.open_lids_uprn_usrn", + "stage.open_lids_toid_usrn", + "stage.open_lids_pair", "stage.uprn_point", "stage.open_roads_segment", "stage.open_names_road_feature", @@ -1068,181 +1119,235 @@ def _populate_stage_open_uprn( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - insert_sql = sql.SQL( - """ - INSERT INTO stage.uprn_point ( - build_run_id, - uprn, - postcode_norm, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, uprn) - DO UPDATE SET - postcode_norm = EXCLUDED.postcode_norm, - ingest_run_id = EXCLUDED.ingest_run_id - """ - ) - - payload: list[tuple[Any, ...]] = [] - inserted = 0 - postcode_key = field_map.get("postcode") - for row in _iter_validated_raw_rows( + _validated_raw_sample_row( conn, source_name="os_open_uprn", raw_table="raw.os_open_uprn_row", ingest_run_id=ingest_run_id, field_map=field_map, required_fields=required_fields, - ): - uprn_raw = _field_value(row, field_map, "uprn") - if uprn_raw in (None, ""): - continue - try: - uprn = int(uprn_raw) - except Exception: - continue - - postcode_n = postcode_norm(str(row.get(postcode_key)) if postcode_key and row.get(postcode_key) not in (None, "") else None) - - payload.append((build_run_id, uprn, postcode_n, 
ingest_run_id)) - if len(payload) >= STAGE_INSERT_BATCH_SIZE: - inserted += _flush_stage_batch(conn, insert_sql, payload) - - inserted += _flush_stage_batch(conn, insert_sql, payload) - return inserted - - -def _infer_lids_relation( - relation_raw: Any, - left_id: str, - right_id: str, -) -> tuple[str | None, str, str]: - relation = str(relation_raw).strip().lower() if relation_raw not in (None, "") else "" - left_is_toid = left_id.lower().startswith("osgb") - right_is_toid = right_id.lower().startswith("osgb") - left_is_digits = left_id.isdigit() - right_is_digits = right_id.isdigit() - - if relation in {"toid_usrn", "toid->usrn", "toid_usrn_link"}: - return "toid_usrn", left_id, right_id - if relation in {"uprn_usrn", "uprn->usrn", "uprn_usrn_link"}: - return "uprn_usrn", left_id, right_id - - if left_is_toid and right_is_digits: - return "toid_usrn", left_id, right_id - if right_is_toid and left_is_digits: - return "toid_usrn", right_id, left_id + ) - if left_is_digits and right_is_digits: - # UPRN values are usually longer than USRN values. If ambiguous, keep input order. 
- if len(left_id) > 8 and len(right_id) <= 8: - return "uprn_usrn", left_id, right_id - if len(right_id) > 8 and len(left_id) <= 8: - return "uprn_usrn", right_id, left_id - return "uprn_usrn", left_id, right_id + payload_expr = sql.SQL("r.payload_jsonb") + uprn_expr = _json_text_for_field(payload_expr, field_map, "uprn") + postcode_expr = _json_text_for_field(payload_expr, field_map, "postcode") - return None, left_id, right_id + with conn.cursor() as cur: + cur.execute( + sql.SQL( + """ + WITH extracted AS ( + SELECT + r.source_row_num, + btrim({uprn_expr}) AS uprn_text, + btrim({postcode_expr}) AS postcode_text + FROM raw.os_open_uprn_row AS r + WHERE r.ingest_run_id = %s + ), + filtered AS ( + SELECT + source_row_num, + uprn_text::bigint AS uprn, + NULLIF( + upper(regexp_replace(COALESCE(postcode_text, ''), '[^A-Za-z0-9]', '', 'g')), + '' + ) AS postcode_norm + FROM extracted + WHERE uprn_text IS NOT NULL + AND uprn_text <> '' + AND uprn_text ~ '^[0-9]+$' + ), + deduped AS ( + SELECT DISTINCT ON (uprn) + uprn, + postcode_norm + FROM filtered + ORDER BY uprn ASC, source_row_num DESC + ) + INSERT INTO stage.uprn_point ( + build_run_id, + uprn, + postcode_norm, + ingest_run_id + ) + SELECT + %s, + d.uprn, + d.postcode_norm, + %s + FROM deduped AS d + ORDER BY d.uprn ASC + ON CONFLICT (build_run_id, uprn) + DO UPDATE SET + postcode_norm = EXCLUDED.postcode_norm, + ingest_run_id = EXCLUDED.ingest_run_id + """ + ).format(uprn_expr=uprn_expr, postcode_expr=postcode_expr), + (ingest_run_id, build_run_id, ingest_run_id), + ) + return int(cur.rowcount) -def _populate_stage_oli( +def _populate_stage_open_lids( conn: psycopg.Connection, build_run_id: str, ingest_run_id: str, field_map: dict[str, str], required_fields: tuple[str, ...], ) -> tuple[int, int, int]: - toid_insert_sql = sql.SQL( - """ - INSERT INTO stage.oli_toid_usrn ( - build_run_id, - toid, - usrn, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, toid, usrn) - DO NOTHING - """ - ) - 
uprn_insert_sql = sql.SQL( - """ - INSERT INTO stage.oli_uprn_usrn ( - build_run_id, - uprn, - usrn, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, uprn, usrn) - DO NOTHING - """ - ) - relation_insert_sql = sql.SQL( - """ - INSERT INTO stage.oli_identifier_pair ( - build_run_id, - id_1, - id_2, - relation_type, - ingest_run_id - ) VALUES (%s, %s, %s, %s, %s) - ON CONFLICT (build_run_id, id_1, id_2, relation_type) - DO NOTHING - """ - ) - - toid_payload: list[tuple[Any, ...]] = [] - uprn_payload: list[tuple[Any, ...]] = [] - relation_payload: list[tuple[Any, ...]] = [] - relation_key = field_map.get("relation_type") - toid_count = 0 - uprn_count = 0 - relation_count = 0 - for row in _iter_validated_raw_rows( + _validated_raw_sample_row( conn, source_name="os_open_lids", raw_table="raw.os_open_lids_row", ingest_run_id=ingest_run_id, field_map=field_map, required_fields=required_fields, - ): - relation_raw = row.get(relation_key) if relation_key else None - left_raw = _field_value(row, field_map, "id_1") - right_raw = _field_value(row, field_map, "id_2") - if left_raw in (None, "") or right_raw in (None, ""): - continue + ) - left_id = str(left_raw).strip() - right_id = str(right_raw).strip() - relation, rel_left_id, rel_right_id = _infer_lids_relation(relation_raw, left_id, right_id) - if relation is None: - continue + payload_expr = sql.SQL("r.payload_jsonb") + id_1_expr = _json_text_for_field(payload_expr, field_map, "id_1") + id_2_expr = _json_text_for_field(payload_expr, field_map, "id_2") + relation_expr = _json_text_for_field(payload_expr, field_map, "relation_type") - relation_payload.append((build_run_id, rel_left_id, rel_right_id, relation, ingest_run_id)) + with conn.cursor() as cur: + cur.execute( + sql.SQL( + """ + WITH extracted AS ( + SELECT + r.source_row_num, + btrim({id_1_expr}) AS left_id, + btrim({id_2_expr}) AS right_id, + lower(btrim(COALESCE({relation_expr}, ''))) AS relation_hint + FROM raw.os_open_lids_row AS r + 
WHERE r.ingest_run_id = %s + ), + prepared AS ( + SELECT + source_row_num, + left_id, + right_id, + relation_hint, + (left_id IS NOT NULL AND left_id <> '') AS left_present, + (right_id IS NOT NULL AND right_id <> '') AS right_present, + (lower(COALESCE(left_id, '')) LIKE 'osgb%') AS left_is_toid, + (lower(COALESCE(right_id, '')) LIKE 'osgb%') AS right_is_toid, + (COALESCE(left_id, '') ~ '^[0-9]+$') AS left_is_digits, + (COALESCE(right_id, '') ~ '^[0-9]+$') AS right_is_digits + FROM extracted + ), + resolved AS ( + SELECT + source_row_num, + CASE + WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN 'toid_usrn' + WHEN relation_hint IN ('uprn_usrn', 'uprn->usrn', 'uprn_usrn_link') THEN 'uprn_usrn' + WHEN left_is_toid AND right_is_digits THEN 'toid_usrn' + WHEN right_is_toid AND left_is_digits THEN 'toid_usrn' + WHEN left_is_digits AND right_is_digits THEN 'uprn_usrn' + ELSE NULL + END AS relation_type, + CASE + WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN left_id + WHEN relation_hint IN ('uprn_usrn', 'uprn->usrn', 'uprn_usrn_link') THEN left_id + WHEN left_is_toid AND right_is_digits THEN left_id + WHEN right_is_toid AND left_is_digits THEN right_id + WHEN left_is_digits AND right_is_digits AND length(right_id) > 8 AND length(left_id) <= 8 THEN right_id + ELSE left_id + END AS id_1, + CASE + WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN right_id + WHEN relation_hint IN ('uprn_usrn', 'uprn->usrn', 'uprn_usrn_link') THEN right_id + WHEN left_is_toid AND right_is_digits THEN right_id + WHEN right_is_toid AND left_is_digits THEN left_id + WHEN left_is_digits AND right_is_digits AND length(right_id) > 8 AND length(left_id) <= 8 THEN left_id + ELSE right_id + END AS id_2 + FROM prepared + WHERE left_present AND right_present + ) + INSERT INTO stage.open_lids_pair ( + build_run_id, + id_1, + id_2, + relation_type, + ingest_run_id + ) + SELECT + %s, + resolved.id_1, + resolved.id_2, + 
resolved.relation_type, + %s + FROM resolved + WHERE resolved.relation_type IS NOT NULL + ORDER BY resolved.source_row_num ASC + ON CONFLICT (build_run_id, id_1, id_2, relation_type) + DO NOTHING + """ + ).format( + id_1_expr=id_1_expr, + id_2_expr=id_2_expr, + relation_expr=relation_expr, + ), + (ingest_run_id, build_run_id, ingest_run_id), + ) + pair_count = int(cur.rowcount) - if relation == "toid_usrn": - try: - usrn = int(rel_right_id) - except Exception: - continue - toid_payload.append((build_run_id, rel_left_id, usrn, ingest_run_id)) - elif relation == "uprn_usrn": - try: - uprn = int(rel_left_id) - usrn = int(rel_right_id) - except Exception: - continue - uprn_payload.append((build_run_id, uprn, usrn, ingest_run_id)) + cur.execute( + """ + INSERT INTO stage.open_lids_toid_usrn ( + build_run_id, + toid, + usrn, + ingest_run_id + ) + SELECT + %s, + p.id_1, + p.id_2::bigint, + %s + FROM stage.open_lids_pair AS p + WHERE p.build_run_id = %s + AND p.ingest_run_id = %s + AND p.relation_type = 'toid_usrn' + AND p.id_2 ~ '^[0-9]+$' + ORDER BY p.id_1 COLLATE "C" ASC, p.id_2 COLLATE "C" ASC + ON CONFLICT (build_run_id, toid, usrn) + DO NOTHING + """, + (build_run_id, ingest_run_id, build_run_id, ingest_run_id), + ) + toid_count = int(cur.rowcount) - if len(toid_payload) >= STAGE_INSERT_BATCH_SIZE: - toid_count += _flush_stage_batch(conn, toid_insert_sql, toid_payload) - if len(uprn_payload) >= STAGE_INSERT_BATCH_SIZE: - uprn_count += _flush_stage_batch(conn, uprn_insert_sql, uprn_payload) - if len(relation_payload) >= STAGE_INSERT_BATCH_SIZE: - relation_count += _flush_stage_batch(conn, relation_insert_sql, relation_payload) + cur.execute( + """ + INSERT INTO stage.open_lids_uprn_usrn ( + build_run_id, + uprn, + usrn, + ingest_run_id + ) + SELECT + %s, + p.id_1::bigint, + p.id_2::bigint, + %s + FROM stage.open_lids_pair AS p + WHERE p.build_run_id = %s + AND p.ingest_run_id = %s + AND p.relation_type = 'uprn_usrn' + AND p.id_1 ~ '^[0-9]+$' + AND p.id_2 ~ '^[0-9]+$' 
+ ORDER BY p.id_1 COLLATE "C" ASC, p.id_2 COLLATE "C" ASC + ON CONFLICT (build_run_id, uprn, usrn) + DO NOTHING + """, + (build_run_id, ingest_run_id, build_run_id, ingest_run_id), + ) + uprn_count = int(cur.rowcount) - toid_count += _flush_stage_batch(conn, toid_insert_sql, toid_payload) - uprn_count += _flush_stage_batch(conn, uprn_insert_sql, uprn_payload) - relation_count += _flush_stage_batch(conn, relation_insert_sql, relation_payload) - return toid_count, uprn_count, relation_count + return toid_count, uprn_count, pair_count def _populate_stage_nsul( @@ -1252,46 +1357,63 @@ def _populate_stage_nsul( field_map: dict[str, str], required_fields: tuple[str, ...], ) -> int: - insert_sql = sql.SQL( - """ - INSERT INTO stage.nsul_uprn_postcode ( - build_run_id, - uprn, - postcode_norm, - ingest_run_id - ) VALUES (%s, %s, %s, %s) - ON CONFLICT (build_run_id, uprn, postcode_norm) - DO NOTHING - """ - ) - - payload: list[tuple[Any, ...]] = [] - inserted = 0 - for row in _iter_validated_raw_rows( + _validated_raw_sample_row( conn, source_name="nsul", raw_table="raw.nsul_row", ingest_run_id=ingest_run_id, field_map=field_map, required_fields=required_fields, - ): - uprn_raw = _field_value(row, field_map, "uprn") - postcode_raw = _field_value(row, field_map, "postcode") - if uprn_raw in (None, ""): - continue - try: - uprn = int(uprn_raw) - except Exception: - continue - postcode_n = postcode_norm(str(postcode_raw) if postcode_raw is not None else None) - if postcode_n is None: - continue - payload.append((build_run_id, uprn, postcode_n, ingest_run_id)) - if len(payload) >= STAGE_INSERT_BATCH_SIZE: - inserted += _flush_stage_batch(conn, insert_sql, payload) + ) - inserted += _flush_stage_batch(conn, insert_sql, payload) - return inserted + payload_expr = sql.SQL("r.payload_jsonb") + uprn_expr = _json_text_for_field(payload_expr, field_map, "uprn") + postcode_expr = _json_text_for_field(payload_expr, field_map, "postcode") + + with conn.cursor() as cur: + cur.execute( + 
sql.SQL( + """ + WITH extracted AS ( + SELECT + btrim({uprn_expr}) AS uprn_text, + btrim({postcode_expr}) AS postcode_text + FROM raw.nsul_row AS r + WHERE r.ingest_run_id = %s + ), + normalized AS ( + SELECT DISTINCT + uprn_text::bigint AS uprn, + NULLIF( + upper(regexp_replace(COALESCE(postcode_text, ''), '[^A-Za-z0-9]', '', 'g')), + '' + ) AS postcode_norm + FROM extracted + WHERE uprn_text IS NOT NULL + AND uprn_text <> '' + AND uprn_text ~ '^[0-9]+$' + ) + INSERT INTO stage.nsul_uprn_postcode ( + build_run_id, + uprn, + postcode_norm, + ingest_run_id + ) + SELECT + %s, + n.uprn, + n.postcode_norm, + %s + FROM normalized AS n + WHERE n.postcode_norm IS NOT NULL + ORDER BY n.uprn ASC, n.postcode_norm COLLATE "C" ASC + ON CONFLICT (build_run_id, uprn, postcode_norm) + DO NOTHING + """ + ).format(uprn_expr=uprn_expr, postcode_expr=postcode_expr), + (ingest_run_id, build_run_id, ingest_run_id), + ) + return int(cur.rowcount) def _populate_stage_osni( @@ -1539,12 +1661,12 @@ def _pass_0b_stage_normalisation( if "os_open_lids" in source_runs: field_map, required_fields = _mapped_fields_for_source(schema_config, "os_open_lids") ingest_run_id = _single_source_run(source_runs, "os_open_lids") - toid_count, uprn_count, relation_count = _populate_stage_oli( + toid_count, uprn_count, relation_count = _populate_stage_open_lids( conn, build_run_id, ingest_run_id, field_map, required_fields ) - counts["stage.oli_toid_usrn"] = toid_count - counts["stage.oli_uprn_usrn"] = uprn_count - counts["stage.oli_identifier_pair"] = relation_count + counts["stage.open_lids_toid_usrn"] = toid_count + counts["stage.open_lids_uprn_usrn"] = uprn_count + counts["stage.open_lids_pair"] = relation_count if "nsul" in source_runs: field_map, required_fields = _mapped_fields_for_source(schema_config, "nsul") @@ -1711,18 +1833,18 @@ def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> ), inferred_name_counts AS ( SELECT - oli.usrn, + lids.usrn, n.street_name_raw AS 
street_name, n.street_name_casefolded, COUNT(*)::bigint AS evidence_count, - (ARRAY_AGG(oli.ingest_run_id ORDER BY oli.ingest_run_id::text ASC))[1] AS usrn_run_id + (ARRAY_AGG(lids.ingest_run_id ORDER BY lids.ingest_run_id::text ASC))[1] AS usrn_run_id FROM stage.open_names_road_feature AS n - JOIN stage.oli_toid_usrn AS oli - ON oli.build_run_id = n.build_run_id - AND oli.toid = n.toid + JOIN stage.open_lids_toid_usrn AS lids + ON lids.build_run_id = n.build_run_id + AND lids.toid = n.toid WHERE n.build_run_id = %(build_run_id)s AND n.toid IS NOT NULL - GROUP BY oli.usrn, n.street_name_raw, n.street_name_casefolded + GROUP BY lids.usrn, n.street_name_raw, n.street_name_casefolded ), inferred_usrn AS ( SELECT @@ -1855,23 +1977,31 @@ def _pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) - parent.street_name_raw, parent.street_name_canonical, parent.evidence_json ->> 'toid' AS toid, - oli.usrn, - oli.ingest_run_id + lids.usrn, + lids.ingest_run_id FROM derived.postcode_street_candidates AS parent - JOIN stage.oli_toid_usrn AS oli - ON oli.build_run_id = parent.produced_build_run_id - AND oli.toid = parent.evidence_json ->> 'toid' + JOIN stage.open_lids_toid_usrn AS lids + ON lids.build_run_id = parent.produced_build_run_id + AND lids.toid = parent.evidence_json ->> 'toid' WHERE parent.produced_build_run_id = %s AND parent.candidate_type = 'names_postcode_feature' AND parent.evidence_json ->> 'toid' IS NOT NULL - ORDER BY parent.candidate_id ASC, oli.usrn ASC + ORDER BY parent.candidate_id ASC, lids.usrn ASC """, (build_run_id,), ) promotion_rows = cur.fetchall() with conn.cursor() as cur: - for parent_candidate_id, postcode, street_name_raw, street_name_canonical, toid, usrn, oli_run_id in promotion_rows: + for ( + parent_candidate_id, + postcode, + street_name_raw, + street_name_canonical, + toid, + usrn, + open_lids_run_id, + ) in promotion_rows: cur.execute( """ INSERT INTO derived.postcode_street_candidates ( @@ -1886,7 +2016,7 @@ def 
_pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) - source_name, ingest_run_id, evidence_json - ) VALUES (%s, %s, %s, %s, %s, 'oli_toid_usrn', 'high', %s, 'os_open_lids', %s, %s) + ) VALUES (%s, %s, %s, %s, %s, 'open_lids_toid_usrn', 'high', %s, 'os_open_lids', %s, %s) RETURNING candidate_id """, ( @@ -1895,8 +2025,8 @@ def _pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) - street_name_raw, street_name_canonical, usrn, - f"oli:toid_usrn:{toid}", - oli_run_id, + f"open_lids:toid_usrn:{toid}", + open_lids_run_id, Jsonb({"toid": toid, "usrn": usrn}), ), ) @@ -1931,15 +2061,15 @@ def _pass_4_uprn_reinforcement(conn: psycopg.Connection, build_run_id: str) -> d WITH aggregate_pairs AS ( SELECT nsul.postcode_norm, - oli.usrn, + lids.usrn, COUNT(*)::bigint AS uprn_count, - (ARRAY_AGG(oli.ingest_run_id ORDER BY oli.ingest_run_id::text ASC))[1] AS oli_ingest_run_id + (ARRAY_AGG(lids.ingest_run_id ORDER BY lids.ingest_run_id::text ASC))[1] AS open_lids_ingest_run_id FROM stage.nsul_uprn_postcode AS nsul - JOIN stage.oli_uprn_usrn AS oli - ON oli.build_run_id = nsul.build_run_id - AND oli.uprn = nsul.uprn + JOIN stage.open_lids_uprn_usrn AS lids + ON lids.build_run_id = nsul.build_run_id + AND lids.uprn = nsul.uprn WHERE nsul.build_run_id = %s - GROUP BY nsul.postcode_norm, oli.usrn + GROUP BY nsul.postcode_norm, lids.usrn ) INSERT INTO derived.postcode_street_candidates ( produced_build_run_id, @@ -1962,9 +2092,9 @@ def _pass_4_uprn_reinforcement(conn: psycopg.Connection, build_run_id: str) -> d a.usrn, 'uprn_usrn', 'high', - 'oli:uprn_usrn:' || a.uprn_count::text || '_uprns', + 'open_lids:uprn_usrn:' || a.uprn_count::text || '_uprns', 'os_open_lids', - a.oli_ingest_run_id, + a.open_lids_ingest_run_id, jsonb_build_object('uprn_count', a.uprn_count) FROM aggregate_pairs AS a JOIN core.postcodes AS p diff --git a/pipeline/src/pipeline/ingest/workflows.py b/pipeline/src/pipeline/ingest/workflows.py index bd0dee6..d8cfaa2 100644 
--- a/pipeline/src/pipeline/ingest/workflows.py +++ b/pipeline/src/pipeline/ingest/workflows.py @@ -198,6 +198,14 @@ def _insert_raw_rows( return total_loaded +def _analyze_raw_table(conn: psycopg.Connection, qualified_table: str) -> None: + schema_ident, table_ident = _table_ident(qualified_table) + with conn.cursor() as cur: + cur.execute( + sql.SQL("ANALYZE {}.{}").format(schema_ident, table_ident), + ) + + def _existing_ingest_run( conn: psycopg.Connection, source_name: str, @@ -330,6 +338,8 @@ def ingest_source(conn: psycopg.Connection, manifest: SourceIngestManifest) -> I (total_rows, run_id), ) + _analyze_raw_table(conn, raw_table) + return IngestResult( source_name=manifest.source_name, run_id=run_id, diff --git a/tests/test_pass3_append_only_promotion.py b/tests/test_pass3_append_only_promotion.py index e127dfa..90236f6 100644 --- a/tests/test_pass3_append_only_promotion.py +++ b/tests/test_pass3_append_only_promotion.py @@ -11,7 +11,7 @@ def test_pass3_inserts_promoted_rows_and_lineage(self) -> None: text = WORKFLOWS.read_text(encoding="utf-8") self.assertIn("def _pass_3_open_names_candidates", text) self.assertIn("INSERT INTO derived.postcode_street_candidates", text) - self.assertIn("'oli_toid_usrn'", text) + self.assertIn("'open_lids_toid_usrn'", text) self.assertIn("INSERT INTO derived.postcode_street_candidate_lineage", text) self.assertIn("promotion_toid_usrn", text) From e8e86b72bf4a8c4e9f582a382d8cd2e62efb41ac Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 11:47:25 +0000 Subject: [PATCH 11/17] docs(architecture): align LIDS stage/candidate naming and pass 0b normalization notes --- docs/architecture/datasets/os_open_lids.md | 8 ++++---- docs/architecture/datasets/os_open_names.md | 2 +- docs/architecture/datasets/os_open_uprn.md | 2 +- docs/architecture/relationships-overview.md | 8 ++++---- docs/architecture/stages/0b_stage_normalisation.md | 10 ++++++---- docs/architecture/stages/2_gb_canonical_streets.md | 2 +- 
docs/architecture/stages/3_open_names_candidates.md | 4 ++-- docs/architecture/stages/4_uprn_reinforcement.md | 2 +- docs/architecture/value-added-by-stage.md | 8 ++++---- docs/spec/pipeline_v3/spec.md | 4 ++-- 10 files changed, 26 insertions(+), 24 deletions(-) diff --git a/docs/architecture/datasets/os_open_lids.md b/docs/architecture/datasets/os_open_lids.md index 690be10..0adeab4 100644 --- a/docs/architecture/datasets/os_open_lids.md +++ b/docs/architecture/datasets/os_open_lids.md @@ -7,9 +7,9 @@ LIDS is the identifier bridge dataset. It resolves relationships between TOID/UP - Source key: `os_open_lids` - Raw table: `raw.os_open_lids_row` - Stage tables: - - `stage.oli_identifier_pair` (`id_1`, `id_2`, `relation_type`) - - `stage.oli_toid_usrn` - - `stage.oli_uprn_usrn` + - `stage.open_lids_pair` (`id_1`, `id_2`, `relation_type`) + - `stage.open_lids_toid_usrn` + - `stage.open_lids_uprn_usrn` ## Stage Normalisation - Generic identifier pairs are normalised first: @@ -19,7 +19,7 @@ LIDS is the identifier bridge dataset. It resolves relationships between TOID/UP ## Downstream Transformations - Pass 2: helps infer missing canonical USRN names from Open Names TOIDs. -- Pass 3: confirms TOID-based Open Names evidence, generating `oli_toid_usrn` candidates. +- Pass 3: confirms TOID-based Open Names evidence, generating `open_lids_toid_usrn` candidates. - Pass 4: contributes UPRN->USRN links for high-confidence `uprn_usrn` candidates. ## Value Added diff --git a/docs/architecture/datasets/os_open_names.md b/docs/architecture/datasets/os_open_names.md index a44262a..8a6313e 100644 --- a/docs/architecture/datasets/os_open_names.md +++ b/docs/architecture/datasets/os_open_names.md @@ -19,7 +19,7 @@ Open Names contributes named road features and optional TOID references, creatin ## Downstream Transformations - Pass 3 inserts `names_postcode_feature` candidates. -- Pass 3 appends `oli_toid_usrn` candidates when TOID resolves via LIDS. 
+- Pass 3 appends `open_lids_toid_usrn` candidates when TOID resolves via LIDS. - Pass 3 records append-only lineage in `derived.postcode_street_candidate_lineage`. ## Value Added diff --git a/docs/architecture/datasets/os_open_uprn.md b/docs/architecture/datasets/os_open_uprn.md index a4049f9..a6cd063 100644 --- a/docs/architecture/datasets/os_open_uprn.md +++ b/docs/architecture/datasets/os_open_uprn.md @@ -17,7 +17,7 @@ Open UPRN contributes property-level identity used with NSUL and LIDS to create ## Downstream Transformations - Combined with: - `stage.nsul_uprn_postcode` (UPRN->postcode) - - `stage.oli_uprn_usrn` (UPRN->USRN) + - `stage.open_lids_uprn_usrn` (UPRN->USRN) - Pass 4 aggregates evidence into `uprn_usrn` high-confidence candidates. ## Value Added diff --git a/docs/architecture/relationships-overview.md b/docs/architecture/relationships-overview.md index beda6d3..7304280 100644 --- a/docs/architecture/relationships-overview.md +++ b/docs/architecture/relationships-overview.md @@ -6,7 +6,7 @@ ONSPD -> core.postcodes OS Open USRN -> core.streets_usrn OS Open Names + ONSPD -> candidates (names_postcode_feature) -OS Open Names + LIDS (TOID->USRN) -> candidates (oli_toid_usrn) +OS Open Names + LIDS (TOID->USRN) -> candidates (open_lids_toid_usrn) OS Open UPRN + NSUL + LIDS (UPRN->USRN) -> candidates (uprn_usrn) OS Open Roads + core.postcodes -> fallback candidates (spatial_os_open_roads) Optional: PPD -> gap-fill candidates (ppd_parse_*) @@ -59,9 +59,9 @@ flowchart TB S_ONSPD["stage.onspd_postcode"] S_USRN["stage.streets_usrn_input"] S_NAMES["stage.open_names_road_feature"] - S_LIDS_PAIR["stage.oli_identifier_pair"] - S_LIDS_TOID["stage.oli_toid_usrn"] - S_LIDS_UPRN["stage.oli_uprn_usrn"] + S_LIDS_PAIR["stage.open_lids_pair"] + S_LIDS_TOID["stage.open_lids_toid_usrn"] + S_LIDS_UPRN["stage.open_lids_uprn_usrn"] S_UPRN["stage.uprn_point"] S_NSUL["stage.nsul_uprn_postcode"] S_ROADS["stage.open_roads_segment"] diff --git 
a/docs/architecture/stages/0b_stage_normalisation.md b/docs/architecture/stages/0b_stage_normalisation.md index 13d764a..5f681f5 100644 --- a/docs/architecture/stages/0b_stage_normalisation.md +++ b/docs/architecture/stages/0b_stage_normalisation.md @@ -13,19 +13,21 @@ Transform raw payloads into typed/stable stage contracts consumed by later passe - `stage.open_names_road_feature` - `stage.open_roads_segment` - `stage.uprn_point` -- `stage.oli_identifier_pair` -- `stage.oli_toid_usrn` -- `stage.oli_uprn_usrn` +- `stage.open_lids_pair` +- `stage.open_lids_toid_usrn` +- `stage.open_lids_uprn_usrn` - `stage.nsul_uprn_postcode` - optional NI/PPD stage tables ## Determinism/Validation - required mapped fields validated per source -- stream/batch loading to avoid memory variability +- heavy-volume sources (`os_open_uprn`, `os_open_lids`, `nsul`) use set-based SQL transforms - explicit relation typing for LIDS (`toid_usrn`, `uprn_usrn`) +- raw reads are ordered by `source_row_num` with `(ingest_run_id, source_row_num)` indexes ## Value Added - converts heterogeneous schemas into deterministic internal contracts +- removes Python row-loop bottlenecks for the largest source feeds - surfaces schema drift early ## Related diff --git a/docs/architecture/stages/2_gb_canonical_streets.md b/docs/architecture/stages/2_gb_canonical_streets.md index 47ad575..6fe0fe3 100644 --- a/docs/architecture/stages/2_gb_canonical_streets.md +++ b/docs/architecture/stages/2_gb_canonical_streets.md @@ -5,7 +5,7 @@ Build `core.streets_usrn` as canonical street dictionary keyed by USRN. 
## Inputs - `stage.streets_usrn_input` -- `stage.open_names_road_feature` + `stage.oli_toid_usrn` (for inferred fallback names) +- `stage.open_names_road_feature` + `stage.open_lids_toid_usrn` (for inferred fallback names) ## Outputs - `core.streets_usrn` diff --git a/docs/architecture/stages/3_open_names_candidates.md b/docs/architecture/stages/3_open_names_candidates.md index 3e38409..5440c83 100644 --- a/docs/architecture/stages/3_open_names_candidates.md +++ b/docs/architecture/stages/3_open_names_candidates.md @@ -5,12 +5,12 @@ Create medium-confidence street candidates and append-only TOID-confirmed promot ## Inputs - `stage.open_names_road_feature` -- `stage.oli_toid_usrn` +- `stage.open_lids_toid_usrn` - `core.postcodes` ## Outputs - base candidates: `candidate_type=names_postcode_feature` -- promoted candidates: `candidate_type=oli_toid_usrn` +- promoted candidates: `candidate_type=open_lids_toid_usrn` - lineage: `derived.postcode_street_candidate_lineage` ## Contract diff --git a/docs/architecture/stages/4_uprn_reinforcement.md b/docs/architecture/stages/4_uprn_reinforcement.md index 6a8103b..62d05ad 100644 --- a/docs/architecture/stages/4_uprn_reinforcement.md +++ b/docs/architecture/stages/4_uprn_reinforcement.md @@ -5,7 +5,7 @@ Generate high-confidence `uprn_usrn` candidates by aggregating property-level ev ## Inputs - `stage.nsul_uprn_postcode` -- `stage.oli_uprn_usrn` +- `stage.open_lids_uprn_usrn` - `core.postcodes` - `core.streets_usrn` diff --git a/docs/architecture/value-added-by-stage.md b/docs/architecture/value-added-by-stage.md index a1af9c0..5793e5d 100644 --- a/docs/architecture/value-added-by-stage.md +++ b/docs/architecture/value-added-by-stage.md @@ -25,22 +25,22 @@ This page explains what new product value is created at each pass, not just what - provides postcode centroid/admin metadata context for all later joins ## Pass 2: Canonical Streets (USRN) -- Inputs: `stage.streets_usrn_input`, `stage.open_names_road_feature`, 
`stage.oli_toid_usrn`. +- Inputs: `stage.streets_usrn_input`, `stage.open_names_road_feature`, `stage.open_lids_toid_usrn`. - Output: `core.streets_usrn`. - Value added: - produces canonical USRN-keyed street dictionary - fills gaps by inferring USRN names from Open Names + LIDS TOID mapping when direct names are absent ## Pass 3: Open Names Candidates -- Inputs: `stage.open_names_road_feature`, `stage.oli_toid_usrn`, `core.*`. +- Inputs: `stage.open_names_road_feature`, `stage.open_lids_toid_usrn`, `core.*`. - Output: `derived.postcode_street_candidates` + lineage rows. - Value added: - creates medium-confidence postcode/street evidence from named features - - upgrades TOID-confirmed evidence via append-only promotion (`oli_toid_usrn`) + - upgrades TOID-confirmed evidence via append-only promotion (`open_lids_toid_usrn`) - preserves full evidence chain (immutable parent + promoted child + lineage) ## Pass 4: UPRN Reinforcement -- Inputs: `stage.nsul_uprn_postcode`, `stage.oli_uprn_usrn`, `core.*`. +- Inputs: `stage.nsul_uprn_postcode`, `stage.open_lids_uprn_usrn`, `core.*`. - Output: high-confidence `uprn_usrn` candidates. - Value added: - adds strong evidence using property-level frequency aggregation diff --git a/docs/spec/pipeline_v3/spec.md b/docs/spec/pipeline_v3/spec.md index 9d60839..3af758a 100644 --- a/docs/spec/pipeline_v3/spec.md +++ b/docs/spec/pipeline_v3/spec.md @@ -45,7 +45,7 @@ Build pass order is fixed: ### 3.1 Pass 3 Promotion Semantics (Append-Only) - `names_postcode_feature` candidates are immutable evidence rows. -- TOID confirmation creates a new `oli_toid_usrn` candidate row. +- TOID confirmation creates a new `open_lids_toid_usrn` candidate row. - Promotion lineage is recorded in `derived.postcode_street_candidate_lineage`. - Existing candidate rows are never updated for `candidate_type`, `confidence`, `usrn`, or `evidence_ref`. 
@@ -53,7 +53,7 @@ Build pass order is fixed: Candidate type enum: - `names_postcode_feature` -- `oli_toid_usrn` +- `open_lids_toid_usrn` - `uprn_usrn` - `spatial_os_open_roads` - `osni_gazetteer_direct` From 64337fc5ba5e29cdb41c5a7e7d3fdccdbc006463 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 11:47:56 +0000 Subject: [PATCH 12/17] fix(build): escape open_lids LIKE patterns for psycopg placeholder parsing --- pipeline/src/pipeline/build/workflows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py index dc7594c..0e57ab4 100644 --- a/pipeline/src/pipeline/build/workflows.py +++ b/pipeline/src/pipeline/build/workflows.py @@ -1230,8 +1230,8 @@ def _populate_stage_open_lids( relation_hint, (left_id IS NOT NULL AND left_id <> '') AS left_present, (right_id IS NOT NULL AND right_id <> '') AS right_present, - (lower(COALESCE(left_id, '')) LIKE 'osgb%') AS left_is_toid, - (lower(COALESCE(right_id, '')) LIKE 'osgb%') AS right_is_toid, + (lower(COALESCE(left_id, '')) LIKE 'osgb%%') AS left_is_toid, + (lower(COALESCE(right_id, '')) LIKE 'osgb%%') AS right_is_toid, (COALESCE(left_id, '') ~ '^[0-9]+$') AS left_is_digits, (COALESCE(right_id, '') ~ '^[0-9]+$') AS right_is_digits FROM extracted From 8802490a484cf3998c08ddc11314d92b2bc333c0 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 13:59:22 +0000 Subject: [PATCH 13/17] perf(build): reduce stage 0b runtime and write overhead This keeps the LIDS stage transform fully deterministic while removing avoidable sort work and write amplification. 
Changes: - remove ORDER BY clauses from open_lids stage inserts where ordering does not affect set semantics - set synchronous_commit=off during build runs to reduce fsync waits on rebuildable stage writes - mark stage tables UNLOGGED via migration 0012 - update stage 0b docs to reflect deterministic replay/indexing plus UNLOGGED stage behavior --- .../stages/0b_stage_normalisation.md | 3 ++- .../migrations/0012_v3_stage_tables_unlogged.sql | 16 ++++++++++++++++ pipeline/src/pipeline/build/workflows.py | 11 +++++------ 3 files changed, 23 insertions(+), 7 deletions(-) create mode 100644 pipeline/sql/migrations/0012_v3_stage_tables_unlogged.sql diff --git a/docs/architecture/stages/0b_stage_normalisation.md b/docs/architecture/stages/0b_stage_normalisation.md index 5f681f5..dd1e304 100644 --- a/docs/architecture/stages/0b_stage_normalisation.md +++ b/docs/architecture/stages/0b_stage_normalisation.md @@ -23,7 +23,8 @@ Transform raw payloads into typed/stable stage contracts consumed by later passe - required mapped fields validated per source - heavy-volume sources (`os_open_uprn`, `os_open_lids`, `nsul`) use set-based SQL transforms - explicit relation typing for LIDS (`toid_usrn`, `uprn_usrn`) -- raw reads are ordered by `source_row_num` with `(ingest_run_id, source_row_num)` indexes +- `(ingest_run_id, source_row_num)` indexes support deterministic replay/debug and source-row traceability +- `stage.*` tables are `UNLOGGED` to reduce write amplification; they are rebuildable from `raw.*` ## Value Added - converts heterogeneous schemas into deterministic internal contracts diff --git a/pipeline/sql/migrations/0012_v3_stage_tables_unlogged.sql b/pipeline/sql/migrations/0012_v3_stage_tables_unlogged.sql new file mode 100644 index 0000000..e9faeca --- /dev/null +++ b/pipeline/sql/migrations/0012_v3_stage_tables_unlogged.sql @@ -0,0 +1,16 @@ +BEGIN; + +ALTER TABLE stage.onspd_postcode SET UNLOGGED; +ALTER TABLE stage.streets_usrn_input SET UNLOGGED; +ALTER TABLE 
stage.open_names_road_feature SET UNLOGGED; +ALTER TABLE stage.open_roads_segment SET UNLOGGED; +ALTER TABLE stage.uprn_point SET UNLOGGED; +ALTER TABLE stage.open_lids_toid_usrn SET UNLOGGED; +ALTER TABLE stage.open_lids_uprn_usrn SET UNLOGGED; +ALTER TABLE stage.open_lids_pair SET UNLOGGED; +ALTER TABLE stage.nsul_uprn_postcode SET UNLOGGED; +ALTER TABLE stage.osni_street_point SET UNLOGGED; +ALTER TABLE stage.dfi_road_segment SET UNLOGGED; +ALTER TABLE stage.ppd_parsed_address SET UNLOGGED; + +COMMIT; diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py index 0e57ab4..2623f2a 100644 --- a/pipeline/src/pipeline/build/workflows.py +++ b/pipeline/src/pipeline/build/workflows.py @@ -1215,7 +1215,6 @@ def _populate_stage_open_lids( """ WITH extracted AS ( SELECT - r.source_row_num, btrim({id_1_expr}) AS left_id, btrim({id_2_expr}) AS right_id, lower(btrim(COALESCE({relation_expr}, ''))) AS relation_hint @@ -1224,7 +1223,6 @@ def _populate_stage_open_lids( ), prepared AS ( SELECT - source_row_num, left_id, right_id, relation_hint, @@ -1238,7 +1236,6 @@ def _populate_stage_open_lids( ), resolved AS ( SELECT - source_row_num, CASE WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN 'toid_usrn' WHEN relation_hint IN ('uprn_usrn', 'uprn->usrn', 'uprn_usrn_link') THEN 'uprn_usrn' @@ -1281,7 +1278,6 @@ def _populate_stage_open_lids( %s FROM resolved WHERE resolved.relation_type IS NOT NULL - ORDER BY resolved.source_row_num ASC ON CONFLICT (build_run_id, id_1, id_2, relation_type) DO NOTHING """ @@ -1312,7 +1308,6 @@ def _populate_stage_open_lids( AND p.ingest_run_id = %s AND p.relation_type = 'toid_usrn' AND p.id_2 ~ '^[0-9]+$' - ORDER BY p.id_1 COLLATE "C" ASC, p.id_2 COLLATE "C" ASC ON CONFLICT (build_run_id, toid, usrn) DO NOTHING """, @@ -1339,7 +1334,6 @@ def _populate_stage_open_lids( AND p.relation_type = 'uprn_usrn' AND p.id_1 ~ '^[0-9]+$' AND p.id_2 ~ '^[0-9]+$' - ORDER BY p.id_1 COLLATE "C" ASC, 
p.id_2 COLLATE "C" ASC ON CONFLICT (build_run_id, uprn, usrn) DO NOTHING """, @@ -2845,6 +2839,11 @@ def run_build( f"Bundle source {source_name} must include exactly one ingest run" ) + with conn.cursor() as cur: + # Stage tables are rebuildable; disabling per-transaction fsync waits reduces + # pass runtime without changing deterministic outputs. + cur.execute("SET synchronous_commit TO off") + if resume: resumable = _latest_resumable_run(conn, bundle_id) if resumable is None: From a0b12537dccfab27db515ed6c46cf0c1c9766e03 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 13:59:36 +0000 Subject: [PATCH 14/17] perf(ingest): mark v3 raw tables unlogged for faster development loops For local development we prioritize ingest+build iteration speed and rely on archived source files for replayability. Changes: - add migration 0013 to set active V3 raw tables to UNLOGGED - document the replay-first durability model in pass 0a, pass 0b, and V3 data model docs - keep provenance guarantees anchored on ingest manifests and meta.ingest_run_file file hashes --- docs/architecture/stages/0a_raw_ingest.md | 1 + docs/architecture/stages/0b_stage_normalisation.md | 1 + docs/spec/pipeline_v3/data_model.md | 3 ++- .../sql/migrations/0013_v3_raw_tables_unlogged.sql | 14 ++++++++++++++ 4 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 pipeline/sql/migrations/0013_v3_raw_tables_unlogged.sql diff --git a/docs/architecture/stages/0a_raw_ingest.md b/docs/architecture/stages/0a_raw_ingest.md index 93124a3..f73d9fa 100644 --- a/docs/architecture/stages/0a_raw_ingest.md +++ b/docs/architecture/stages/0a_raw_ingest.md @@ -13,6 +13,7 @@ Validate bundle sources exist and have non-zero ingest metadata row counts befor ## Value Added - fast fail for missing/empty source runs - deterministic baseline counts for observability +- explicit replay contract: raw snapshots are rebuild caches backed by archived source files + file hashes ## Related - Bundle contract: 
[`../../spec/pipeline_v3/data_model.md`](../../spec/pipeline_v3/data_model.md) diff --git a/docs/architecture/stages/0b_stage_normalisation.md b/docs/architecture/stages/0b_stage_normalisation.md index dd1e304..5110eb4 100644 --- a/docs/architecture/stages/0b_stage_normalisation.md +++ b/docs/architecture/stages/0b_stage_normalisation.md @@ -25,6 +25,7 @@ Transform raw payloads into typed/stable stage contracts consumed by later passe - explicit relation typing for LIDS (`toid_usrn`, `uprn_usrn`) - `(ingest_run_id, source_row_num)` indexes support deterministic replay/debug and source-row traceability - `stage.*` tables are `UNLOGGED` to reduce write amplification; they are rebuildable from `raw.*` +- `raw.*` tables are `UNLOGGED` in this development profile; authoritative replay comes from archived source files + `meta.ingest_run_file` ## Value Added - converts heterogeneous schemas into deterministic internal contracts diff --git a/docs/spec/pipeline_v3/data_model.md b/docs/spec/pipeline_v3/data_model.md index e67877f..7a00e52 100644 --- a/docs/spec/pipeline_v3/data_model.md +++ b/docs/spec/pipeline_v3/data_model.md @@ -32,7 +32,8 @@ Published dataset pointer log. ## Raw and Stage Layers -- `raw.*` holds immutable source snapshots. +- `raw.*` holds immutable source snapshots for the active build cache. +- `raw.*` tables are `UNLOGGED` for ingest speed and are rebuildable from archived source files + `meta.ingest_run_file`. - `stage.*` holds typed, normalised rows that build passes consume. 
## Core Layer diff --git a/pipeline/sql/migrations/0013_v3_raw_tables_unlogged.sql b/pipeline/sql/migrations/0013_v3_raw_tables_unlogged.sql new file mode 100644 index 0000000..4ea3d13 --- /dev/null +++ b/pipeline/sql/migrations/0013_v3_raw_tables_unlogged.sql @@ -0,0 +1,14 @@ +BEGIN; + +ALTER TABLE raw.onspd_row SET UNLOGGED; +ALTER TABLE raw.os_open_usrn_row SET UNLOGGED; +ALTER TABLE raw.os_open_names_row SET UNLOGGED; +ALTER TABLE raw.os_open_roads_row SET UNLOGGED; +ALTER TABLE raw.os_open_uprn_row SET UNLOGGED; +ALTER TABLE raw.os_open_lids_row SET UNLOGGED; +ALTER TABLE raw.nsul_row SET UNLOGGED; +ALTER TABLE raw.osni_gazetteer_row SET UNLOGGED; +ALTER TABLE raw.dfi_highway_row SET UNLOGGED; +ALTER TABLE raw.ppd_row SET UNLOGGED; + +COMMIT; From 052c0f102409192e0b6e3663e35d762b12ba1878 Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sat, 21 Feb 2026 16:02:18 +0000 Subject: [PATCH 15/17] perf(build): reset stage workspace and streamline open_lids stage path This change targets the stage 0b hotspot that was growing with each rebuild. 
What changed: - stage cleanup now truncates all stage tables at pass 0b start instead of per-build-row deletes - added a safety guard that blocks truncate when another build is actively in status=started - open_lids stage transform now writes directly to typed tables (toid/uprn) without persisting a large pair intermediate - pass checkpoint now records open_lids relation volume as a metric key (`stage.open_lids_relation_count`) - updated architecture/spec docs and relationship diagram to match the new flow - added a contract test to keep truncate + started-run guard behavior locked in Why: - prevents historical stage index/data accumulation from degrading rebuild performance - removes one high-volume write path while preserving deterministic outputs and provenance contracts --- docs/architecture/datasets/os_open_lids.md | 8 +- docs/architecture/relationships-overview.md | 7 +- .../stages/0b_stage_normalisation.md | 3 +- docs/spec/pipeline_v3/data_model.md | 1 + pipeline/src/pipeline/build/workflows.py | 215 ++++++++++-------- tests/test_stage_cleanup_contract.py | 22 ++ 6 files changed, 154 insertions(+), 102 deletions(-) create mode 100644 tests/test_stage_cleanup_contract.py diff --git a/docs/architecture/datasets/os_open_lids.md b/docs/architecture/datasets/os_open_lids.md index 0adeab4..1fdd9ed 100644 --- a/docs/architecture/datasets/os_open_lids.md +++ b/docs/architecture/datasets/os_open_lids.md @@ -7,15 +7,15 @@ LIDS is the identifier bridge dataset. 
It resolves relationships between TOID/UP - Source key: `os_open_lids` - Raw table: `raw.os_open_lids_row` - Stage tables: - - `stage.open_lids_pair` (`id_1`, `id_2`, `relation_type`) - `stage.open_lids_toid_usrn` - `stage.open_lids_uprn_usrn` +- Stage checkpoint metric: + - `stage.open_lids_relation_count` ## Stage Normalisation -- Generic identifier pairs are normalised first: - - `id_1`, `id_2`, `relation_type` +- Generic identifier pairs are normalised in-query (`id_1`, `id_2`, `relation_type`) and not persisted as a large intermediate table. - Relation typing is explicit (`toid_usrn` or `uprn_usrn`) after deterministic inference. -- Typed rows are materialised into dedicated stage tables for downstream joins. +- Typed rows are materialised directly into dedicated stage tables for downstream joins. ## Downstream Transformations - Pass 2: helps infer missing canonical USRN names from Open Names TOIDs. diff --git a/docs/architecture/relationships-overview.md b/docs/architecture/relationships-overview.md index 7304280..b2184af 100644 --- a/docs/architecture/relationships-overview.md +++ b/docs/architecture/relationships-overview.md @@ -59,7 +59,6 @@ flowchart TB S_ONSPD["stage.onspd_postcode"] S_USRN["stage.streets_usrn_input"] S_NAMES["stage.open_names_road_feature"] - S_LIDS_PAIR["stage.open_lids_pair"] S_LIDS_TOID["stage.open_lids_toid_usrn"] S_LIDS_UPRN["stage.open_lids_uprn_usrn"] S_UPRN["stage.uprn_point"] @@ -127,7 +126,8 @@ flowchart TB R_ONSPD --> S_ONSPD R_USRN --> S_USRN R_NAMES --> S_NAMES - R_LIDS --> S_LIDS_PAIR + R_LIDS --> S_LIDS_TOID + R_LIDS --> S_LIDS_UPRN R_UPRN --> S_UPRN R_NSUL --> S_NSUL R_ROADS --> S_ROADS @@ -135,9 +135,6 @@ flowchart TB R_DFI -. optional .-> S_DFI R_PPD -. 
optional .-> S_PPD - S_LIDS_PAIR --> S_LIDS_TOID - S_LIDS_PAIR --> S_LIDS_UPRN - S_ONSPD --> C_POST S_ONSPD --> C_META S_USRN --> C_STREETS diff --git a/docs/architecture/stages/0b_stage_normalisation.md b/docs/architecture/stages/0b_stage_normalisation.md index 5110eb4..2ac9b6d 100644 --- a/docs/architecture/stages/0b_stage_normalisation.md +++ b/docs/architecture/stages/0b_stage_normalisation.md @@ -13,11 +13,11 @@ Transform raw payloads into typed/stable stage contracts consumed by later passe - `stage.open_names_road_feature` - `stage.open_roads_segment` - `stage.uprn_point` -- `stage.open_lids_pair` - `stage.open_lids_toid_usrn` - `stage.open_lids_uprn_usrn` - `stage.nsul_uprn_postcode` - optional NI/PPD stage tables +- checkpoint metric: `stage.open_lids_relation_count` ## Determinism/Validation - required mapped fields validated per source @@ -25,6 +25,7 @@ Transform raw payloads into typed/stable stage contracts consumed by later passe - explicit relation typing for LIDS (`toid_usrn`, `uprn_usrn`) - `(ingest_run_id, source_row_num)` indexes support deterministic replay/debug and source-row traceability - `stage.*` tables are `UNLOGGED` to reduce write amplification; they are rebuildable from `raw.*` +- pass start truncates all `stage.*` tables to prevent historical-row/index accumulation across build runs - `raw.*` tables are `UNLOGGED` in this development profile; authoritative replay comes from archived source files + `meta.ingest_run_file` ## Value Added diff --git a/docs/spec/pipeline_v3/data_model.md b/docs/spec/pipeline_v3/data_model.md index 7a00e52..3537be1 100644 --- a/docs/spec/pipeline_v3/data_model.md +++ b/docs/spec/pipeline_v3/data_model.md @@ -35,6 +35,7 @@ Published dataset pointer log. - `raw.*` holds immutable source snapshots for the active build cache. - `raw.*` tables are `UNLOGGED` for ingest speed and are rebuildable from archived source files + `meta.ingest_run_file`. 
- `stage.*` holds typed, normalised rows that build passes consume. +- `stage.*` is treated as transient workspace and is truncated at Pass `0b` start. ## Core Layer diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py index 2623f2a..e44d79e 100644 --- a/pipeline/src/pipeline/build/workflows.py +++ b/pipeline/src/pipeline/build/workflows.py @@ -675,31 +675,63 @@ def _flush_stage_batch( return inserted -def _stage_cleanup(conn: psycopg.Connection, build_run_id: str) -> None: - tables = ( - "stage.ppd_parsed_address", - "stage.dfi_road_segment", - "stage.osni_street_point", - "stage.nsul_uprn_postcode", - "stage.open_lids_uprn_usrn", - "stage.open_lids_toid_usrn", - "stage.open_lids_pair", - "stage.uprn_point", - "stage.open_roads_segment", - "stage.open_names_road_feature", - "stage.streets_usrn_input", - "stage.onspd_postcode", - ) +STAGE_TABLES = ( + "stage.ppd_parsed_address", + "stage.dfi_road_segment", + "stage.osni_street_point", + "stage.nsul_uprn_postcode", + "stage.open_lids_uprn_usrn", + "stage.open_lids_toid_usrn", + "stage.open_lids_pair", + "stage.uprn_point", + "stage.open_roads_segment", + "stage.open_names_road_feature", + "stage.streets_usrn_input", + "stage.onspd_postcode", +) + + +def _assert_no_other_started_build(conn: psycopg.Connection, build_run_id: str) -> None: with conn.cursor() as cur: - for table in tables: - schema_name, table_name = table.split(".", 1) - cur.execute( - sql.SQL("DELETE FROM {}.{} WHERE build_run_id = %s").format( - sql.Identifier(schema_name), - sql.Identifier(table_name), - ), - (build_run_id,), + cur.execute( + """ + SELECT build_run_id::text, current_pass + FROM meta.build_run + WHERE status = 'started' + AND build_run_id <> %s + ORDER BY started_at_utc ASC + LIMIT 1 + """, + (build_run_id,), + ) + row = cur.fetchone() + + if row is not None: + other_run_id, current_pass = row + raise BuildError( + "Stage truncate is unsafe while another build is in status=started; " + 
f"other_build_run_id={other_run_id} other_current_pass={current_pass}" + ) + + +def _stage_cleanup(conn: psycopg.Connection, build_run_id: str) -> None: + _assert_no_other_started_build(conn, build_run_id) + table_identifiers = [] + for table in STAGE_TABLES: + schema_name, table_name = table.split(".", 1) + table_identifiers.append( + sql.SQL("{}.{}").format( + sql.Identifier(schema_name), + sql.Identifier(table_name), ) + ) + + with conn.cursor() as cur: + # Stage tables are transient build artifacts; truncation keeps runtime stable + # across rebuilds by preventing historical-row/index accumulation. + cur.execute( + sql.SQL("TRUNCATE TABLE {}").format(sql.SQL(", ").join(table_identifiers)) + ) def _pass_0a_raw_ingest( @@ -1234,7 +1266,7 @@ def _populate_stage_open_lids( (COALESCE(right_id, '') ~ '^[0-9]+$') AS right_is_digits FROM extracted ), - resolved AS ( + resolved AS MATERIALIZED ( SELECT CASE WHEN relation_hint IN ('toid_usrn', 'toid->usrn', 'toid_usrn_link') THEN 'toid_usrn' @@ -1262,84 +1294,83 @@ def _populate_stage_open_lids( END AS id_2 FROM prepared WHERE left_present AND right_present - ) - INSERT INTO stage.open_lids_pair ( - build_run_id, - id_1, - id_2, - relation_type, - ingest_run_id + ), + ins_toid AS ( + INSERT INTO stage.open_lids_toid_usrn ( + build_run_id, + toid, + usrn, + ingest_run_id + ) + SELECT + %s, + resolved.id_1, + resolved.id_2::bigint, + %s + FROM resolved + WHERE resolved.relation_type = 'toid_usrn' + AND resolved.id_2 ~ '^[0-9]+$' + ON CONFLICT (build_run_id, toid, usrn) + DO NOTHING + RETURNING 1 + ), + ins_uprn AS ( + INSERT INTO stage.open_lids_uprn_usrn ( + build_run_id, + uprn, + usrn, + ingest_run_id + ) + SELECT + %s, + resolved.id_1::bigint, + resolved.id_2::bigint, + %s + FROM resolved + WHERE resolved.relation_type = 'uprn_usrn' + AND resolved.id_1 ~ '^[0-9]+$' + AND resolved.id_2 ~ '^[0-9]+$' + ON CONFLICT (build_run_id, uprn, usrn) + DO NOTHING + RETURNING 1 ) SELECT - %s, - resolved.id_1, - resolved.id_2, - 
resolved.relation_type, - %s - FROM resolved - WHERE resolved.relation_type IS NOT NULL - ON CONFLICT (build_run_id, id_1, id_2, relation_type) - DO NOTHING + (SELECT COUNT(*)::bigint FROM ins_toid) AS toid_count, + (SELECT COUNT(*)::bigint FROM ins_uprn) AS uprn_count, + ( + SELECT COUNT(*)::bigint + FROM resolved + WHERE + ( + resolved.relation_type = 'toid_usrn' + AND resolved.id_2 ~ '^[0-9]+$' + ) + OR ( + resolved.relation_type = 'uprn_usrn' + AND resolved.id_1 ~ '^[0-9]+$' + AND resolved.id_2 ~ '^[0-9]+$' + ) + ) AS relation_count """ ).format( id_1_expr=id_1_expr, id_2_expr=id_2_expr, relation_expr=relation_expr, ), - (ingest_run_id, build_run_id, ingest_run_id), - ) - pair_count = int(cur.rowcount) - - cur.execute( - """ - INSERT INTO stage.open_lids_toid_usrn ( + ( + ingest_run_id, build_run_id, - toid, - usrn, - ingest_run_id - ) - SELECT - %s, - p.id_1, - p.id_2::bigint, - %s - FROM stage.open_lids_pair AS p - WHERE p.build_run_id = %s - AND p.ingest_run_id = %s - AND p.relation_type = 'toid_usrn' - AND p.id_2 ~ '^[0-9]+$' - ON CONFLICT (build_run_id, toid, usrn) - DO NOTHING - """, - (build_run_id, ingest_run_id, build_run_id, ingest_run_id), - ) - toid_count = int(cur.rowcount) - - cur.execute( - """ - INSERT INTO stage.open_lids_uprn_usrn ( + ingest_run_id, build_run_id, - uprn, - usrn, - ingest_run_id - ) - SELECT - %s, - p.id_1::bigint, - p.id_2::bigint, - %s - FROM stage.open_lids_pair AS p - WHERE p.build_run_id = %s - AND p.ingest_run_id = %s - AND p.relation_type = 'uprn_usrn' - AND p.id_1 ~ '^[0-9]+$' - AND p.id_2 ~ '^[0-9]+$' - ON CONFLICT (build_run_id, uprn, usrn) - DO NOTHING - """, - (build_run_id, ingest_run_id, build_run_id, ingest_run_id), + ingest_run_id, + ), ) - uprn_count = int(cur.rowcount) + row = cur.fetchone() + if row is None: + return 0, 0, 0 + toid_count = int(row[0]) + uprn_count = int(row[1]) + pair_count = int(row[2]) return toid_count, uprn_count, pair_count @@ -1660,7 +1691,7 @@ def _pass_0b_stage_normalisation( ) 
counts["stage.open_lids_toid_usrn"] = toid_count counts["stage.open_lids_uprn_usrn"] = uprn_count - counts["stage.open_lids_pair"] = relation_count + counts["stage.open_lids_relation_count"] = relation_count if "nsul" in source_runs: field_map, required_fields = _mapped_fields_for_source(schema_config, "nsul") diff --git a/tests/test_stage_cleanup_contract.py b/tests/test_stage_cleanup_contract.py new file mode 100644 index 0000000..98d7251 --- /dev/null +++ b/tests/test_stage_cleanup_contract.py @@ -0,0 +1,22 @@ +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +WORKFLOWS = ROOT / "pipeline" / "src" / "pipeline" / "build" / "workflows.py" + + +class StageCleanupContractTests(unittest.TestCase): + def test_stage_cleanup_uses_truncate_not_per_run_delete(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn("TRUNCATE TABLE", text) + self.assertNotIn("DELETE FROM {}.{} WHERE build_run_id = %s", text) + + def test_stage_cleanup_blocks_when_other_build_is_started(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn("WHERE status = 'started'", text) + self.assertIn("Stage truncate is unsafe while another build is in status=started", text) + + +if __name__ == "__main__": + unittest.main() From e7659e12beaa7e99ea8a26ff64dd6813a621cabd Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sun, 22 Feb 2026 22:59:16 +0000 Subject: [PATCH 16/17] perf(pipeline): stabilize heavy passes and cut temp-spill hot paths This commit tackles the long-running build bottlenecks we observed on real-volume data, with a focus on pass 0b and pass 4 performance while preserving deterministic outputs and provenance rules. 
What changed: - add stage join indexes migration for Open Names joins (build_run_id+toid and build_run_id+postcode_norm) - add reusable ANALYZE helper and refresh stats after stage/core population to improve planner decisions - raise pass-local work_mem for heavy pass 0b and pass 4 operations to reduce temp-file spill - remove unnecessary ORDER BY clauses in large stage upserts (open_uprn/nsul) - refactor pass 4 provenance lookup to use bundle-level os_open_lids ingest_run_id directly - update pass 4 postcode resolution path via stage.onspd_postcode normalization contract - document execution-shape/performance behavior updates across stage docs Validation performed: - full test suite: .venv/bin/python -m unittest discover tests (14/14 passing) - resumed real build completed and verify succeeded for build_run_id=7a2aa7aa-d76e-4be4-8d1c-947ca41de1d7 --- .../stages/0b_stage_normalisation.md | 2 + docs/architecture/stages/1_onspd_backbone.md | 4 + .../stages/2_gb_canonical_streets.md | 8 + .../stages/4_uprn_reinforcement.md | 7 + docs/architecture/stages/8_finalisation.md | 1 + .../migrations/0014_v3_stage_join_indexes.sql | 11 + pipeline/src/pipeline/build/workflows.py | 443 +++++++++++------- 7 files changed, 309 insertions(+), 167 deletions(-) create mode 100644 pipeline/sql/migrations/0014_v3_stage_join_indexes.sql diff --git a/docs/architecture/stages/0b_stage_normalisation.md b/docs/architecture/stages/0b_stage_normalisation.md index 2ac9b6d..81da7ce 100644 --- a/docs/architecture/stages/0b_stage_normalisation.md +++ b/docs/architecture/stages/0b_stage_normalisation.md @@ -23,9 +23,11 @@ Transform raw payloads into typed/stable stage contracts consumed by later passe - required mapped fields validated per source - heavy-volume sources (`os_open_uprn`, `os_open_lids`, `nsul`) use set-based SQL transforms - explicit relation typing for LIDS (`toid_usrn`, `uprn_usrn`) +- pass-local `work_mem` is raised for large sort/dedupe transforms to reduce temp-file spill - 
`(ingest_run_id, source_row_num)` indexes support deterministic replay/debug and source-row traceability - `stage.*` tables are `UNLOGGED` to reduce write amplification; they are rebuildable from `raw.*` - pass start truncates all `stage.*` tables to prevent historical-row/index accumulation across build runs +- final `ANALYZE` refreshes planner stats for all `stage.*` relations before Pass 1+ - `raw.*` tables are `UNLOGGED` in this development profile; authoritative replay comes from archived source files + `meta.ingest_run_file` ## Value Added diff --git a/docs/architecture/stages/1_onspd_backbone.md b/docs/architecture/stages/1_onspd_backbone.md index 5472bb6..9dd6f50 100644 --- a/docs/architecture/stages/1_onspd_backbone.md +++ b/docs/architecture/stages/1_onspd_backbone.md @@ -10,6 +10,10 @@ Build canonical postcode entities from staged ONSPD rows. - `core.postcodes` - `core.postcodes_meta` +## Execution Notes +- set-based insert ordered by canonical postcode normalization key +- post-insert `ANALYZE` keeps downstream join planning stable (Pass 3/4/5) + ## Value Added - authoritative postcode backbone - unified geographic/admin context for subsequent joins diff --git a/docs/architecture/stages/2_gb_canonical_streets.md b/docs/architecture/stages/2_gb_canonical_streets.md index 6fe0fe3..25bea79 100644 --- a/docs/architecture/stages/2_gb_canonical_streets.md +++ b/docs/architecture/stages/2_gb_canonical_streets.md @@ -10,6 +10,14 @@ Build `core.streets_usrn` as canonical street dictionary keyed by USRN. 
## Outputs - `core.streets_usrn` +## Execution Shape +- set-based direct insert from `stage.streets_usrn_input` +- inferred path pre-aggregates Open Names by `toid` before joining to LIDS to reduce join volume +- set-based inferred insert (Open Names + LIDS) for USRNs not already present +- inferred name ranking uses deterministic tie-breaks by evidence count then casefolded/name lexical order +- stage join indexes on `stage.open_names_road_feature(build_run_id, toid)` and `(build_run_id, postcode_norm)` support Pass 2/3 joins +- post-pass `ANALYZE` keeps `core.streets_usrn` statistics current for Pass 4 joins + ## Value Added - canonical USRN street name layer - inferred USRN naming where direct USRN names are missing diff --git a/docs/architecture/stages/4_uprn_reinforcement.md b/docs/architecture/stages/4_uprn_reinforcement.md index 62d05ad..ec39d7e 100644 --- a/docs/architecture/stages/4_uprn_reinforcement.md +++ b/docs/architecture/stages/4_uprn_reinforcement.md @@ -12,6 +12,13 @@ Generate high-confidence `uprn_usrn` candidates by aggregating property-level ev ## Outputs - `derived.postcode_street_candidates` rows (`candidate_type=uprn_usrn`, `confidence=high`) +## Execution Shape +- pass-local `work_mem` is raised to avoid temp-file-heavy plans on large NSUL/LIDS joins +- UPRN evidence is aggregated by `(postcode_norm, usrn)` from `stage.nsul_uprn_postcode` + `stage.open_lids_uprn_usrn` +- `ingest_run_id` provenance is sourced directly from `meta.build_bundle_source` (`os_open_lids`) instead of per-row aggregation +- postcode resolution joins via `stage.onspd_postcode.postcode_norm -> postcode_display` before `core.postcodes` join +- deterministic candidate insertion order remains `postcode`, then `usrn` + ## Value Added - strongest GB evidence class from UPRN-linked observations - frequency signal (`uprn_count`) for ranking/probability diff --git a/docs/architecture/stages/8_finalisation.md b/docs/architecture/stages/8_finalisation.md index 3476a7b..bca72f7 
100644 --- a/docs/architecture/stages/8_finalisation.md +++ b/docs/architecture/stages/8_finalisation.md @@ -18,6 +18,7 @@ Resolve candidate evidence into final postcode/street outputs and materialized A ## Deterministic Probability - exact formula normalization by postcode total weight - fixed-scale rounding + deterministic residual correction to rank 1 street +- set-based SQL materialisation for `final`, `final_candidate`, and `final_source` joins (no per-row query loops) ## Value Added - converts evidence graph into stable product outputs diff --git a/pipeline/sql/migrations/0014_v3_stage_join_indexes.sql b/pipeline/sql/migrations/0014_v3_stage_join_indexes.sql new file mode 100644 index 0000000..946ebdb --- /dev/null +++ b/pipeline/sql/migrations/0014_v3_stage_join_indexes.sql @@ -0,0 +1,11 @@ +BEGIN; + +CREATE INDEX IF NOT EXISTS idx_stage_open_names_run_toid + ON stage.open_names_road_feature (build_run_id, toid) + WHERE toid IS NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_stage_open_names_run_postcode + ON stage.open_names_road_feature (build_run_id, postcode_norm) + WHERE postcode_norm IS NOT NULL; + +COMMIT; diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py index e44d79e..38dcd21 100644 --- a/pipeline/src/pipeline/build/workflows.py +++ b/pipeline/src/pipeline/build/workflows.py @@ -7,7 +7,7 @@ import re import uuid from dataclasses import dataclass -from decimal import Decimal, ROUND_HALF_UP +from decimal import Decimal from pathlib import Path from typing import Any @@ -691,6 +691,18 @@ def _flush_stage_batch( ) +def _analyze_relations(conn: psycopg.Connection, relations: tuple[str, ...]) -> None: + with conn.cursor() as cur: + for relation in relations: + schema_name, table_name = relation.split(".", 1) + cur.execute( + sql.SQL("ANALYZE {}.{}").format( + sql.Identifier(schema_name), + sql.Identifier(table_name), + ) + ) + + def _assert_no_other_started_build(conn: psycopg.Connection, build_run_id: str) -> 
None: with conn.cursor() as cur: cur.execute( @@ -1208,7 +1220,6 @@ def _populate_stage_open_uprn( d.postcode_norm, %s FROM deduped AS d - ORDER BY d.uprn ASC ON CONFLICT (build_run_id, uprn) DO UPDATE SET postcode_norm = EXCLUDED.postcode_norm, @@ -1431,7 +1442,6 @@ def _populate_stage_nsul( %s FROM normalized AS n WHERE n.postcode_norm IS NOT NULL - ORDER BY n.uprn ASC, n.postcode_norm COLLATE "C" ASC ON CONFLICT (build_run_id, uprn, postcode_norm) DO NOTHING """ @@ -1643,6 +1653,11 @@ def _pass_0b_stage_normalisation( build_run_id: str, source_runs: dict[str, tuple[str, ...]], ) -> dict[str, int]: + with conn.cursor() as cur: + # Pass 0b executes large sort/dedupe operations on raw snapshots. + # Raising work_mem here avoids repeated temp-file spill on default settings. + cur.execute("SET LOCAL work_mem = '256MB'") + _stage_cleanup(conn, build_run_id) schema_config = _schema_config() @@ -1730,6 +1745,7 @@ def _pass_0b_stage_normalisation( ) counts["stage.ppd_parsed_address"] = ppd_rows + _analyze_relations(conn, STAGE_TABLES) return counts @@ -1835,6 +1851,8 @@ def _pass_1_onspd_backbone(conn: psycopg.Connection, build_run_id: str) -> dict[ ) inserted_meta = cur.rowcount + _analyze_relations(conn, ("core.postcodes", "core.postcodes_meta")) + return { "core.postcodes": int(inserted_postcodes), "core.postcodes_meta": int(inserted_meta), @@ -1845,33 +1863,85 @@ def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> with conn.cursor() as cur: cur.execute( """ - WITH direct_usrn AS ( - SELECT + INSERT INTO core.streets_usrn ( + produced_build_run_id, + usrn, + street_name, + street_name_casefolded, + street_class, + street_status, + usrn_run_id + ) + SELECT + %(build_run_id)s, + s.usrn, + s.street_name, + s.street_name_casefolded, + s.street_class, + s.street_status, + s.usrn_run_id + FROM stage.streets_usrn_input AS s + WHERE s.build_run_id = %(build_run_id)s + ORDER BY s.usrn ASC + """, + {"build_run_id": build_run_id}, + ) + 
inserted_direct = int(cur.rowcount) + + cur.execute( + """ + CREATE TEMP TABLE tmp_open_names_toid_counts + ON COMMIT DROP AS + SELECT + n.toid, + n.street_name_raw AS street_name, + n.street_name_casefolded, + COUNT(*)::bigint AS feature_count + FROM stage.open_names_road_feature AS n + WHERE n.build_run_id = %(build_run_id)s + AND n.toid IS NOT NULL + GROUP BY n.toid, n.street_name_raw, n.street_name_casefolded + """, + {"build_run_id": build_run_id}, + ) + cur.execute( + """ + CREATE INDEX idx_tmp_open_names_toid_counts_toid + ON tmp_open_names_toid_counts (toid) + """ + ) + cur.execute( + """ + CREATE TEMP TABLE tmp_inferred_name_counts + ON COMMIT DROP AS + SELECT + lids.usrn, + n.street_name, + n.street_name_casefolded, + SUM(n.feature_count)::bigint AS evidence_count, + (ARRAY_AGG(lids.ingest_run_id ORDER BY lids.ingest_run_id::text ASC))[1] AS usrn_run_id + FROM tmp_open_names_toid_counts AS n + JOIN stage.open_lids_toid_usrn AS lids + ON lids.build_run_id = %(build_run_id)s + AND lids.toid = n.toid + GROUP BY lids.usrn, n.street_name, n.street_name_casefolded + """, + {"build_run_id": build_run_id}, + ) + cur.execute( + """ + CREATE INDEX idx_tmp_inferred_name_counts_usrn + ON tmp_inferred_name_counts ( usrn, - street_name, + evidence_count DESC, street_name_casefolded, - street_class, - street_status, - usrn_run_id - FROM stage.streets_usrn_input - WHERE build_run_id = %(build_run_id)s - ), - inferred_name_counts AS ( - SELECT - lids.usrn, - n.street_name_raw AS street_name, - n.street_name_casefolded, - COUNT(*)::bigint AS evidence_count, - (ARRAY_AGG(lids.ingest_run_id ORDER BY lids.ingest_run_id::text ASC))[1] AS usrn_run_id - FROM stage.open_names_road_feature AS n - JOIN stage.open_lids_toid_usrn AS lids - ON lids.build_run_id = n.build_run_id - AND lids.toid = n.toid - WHERE n.build_run_id = %(build_run_id)s - AND n.toid IS NOT NULL - GROUP BY lids.usrn, n.street_name_raw, n.street_name_casefolded - ), - inferred_usrn AS ( + street_name + ) + """ + 
) + cur.execute( + """ + WITH inferred_usrn AS ( SELECT usrn, street_name, @@ -1891,33 +1961,9 @@ def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> street_name_casefolded COLLATE "C" ASC, street_name COLLATE "C" ASC ) AS rn - FROM inferred_name_counts + FROM tmp_inferred_name_counts ) AS ranked WHERE rn = 1 - ), - combined AS ( - SELECT - usrn, - street_name, - street_name_casefolded, - street_class, - street_status, - usrn_run_id - FROM direct_usrn - UNION ALL - SELECT - inferred.usrn, - inferred.street_name, - inferred.street_name_casefolded, - inferred.street_class, - inferred.street_status, - inferred.usrn_run_id - FROM inferred_usrn AS inferred - WHERE NOT EXISTS ( - SELECT 1 - FROM direct_usrn AS direct - WHERE direct.usrn = inferred.usrn - ) ) INSERT INTO core.streets_usrn ( produced_build_run_id, @@ -1930,20 +1976,27 @@ def _pass_2_gb_canonical_streets(conn: psycopg.Connection, build_run_id: str) -> ) SELECT %(build_run_id)s, - usrn, - street_name, - street_name_casefolded, - street_class, - street_status, - usrn_run_id - FROM combined - ORDER BY usrn ASC + inferred.usrn, + inferred.street_name, + inferred.street_name_casefolded, + inferred.street_class, + inferred.street_status, + inferred.usrn_run_id + FROM inferred_usrn AS inferred + WHERE NOT EXISTS ( + SELECT 1 + FROM core.streets_usrn AS direct + WHERE direct.produced_build_run_id = %(build_run_id)s + AND direct.usrn = inferred.usrn + ) + ORDER BY inferred.usrn ASC """, {"build_run_id": build_run_id}, ) - inserted = cur.rowcount + inserted_inferred = int(cur.rowcount) - return {"core.streets_usrn": int(inserted)} + _analyze_relations(conn, ("core.streets_usrn",)) + return {"core.streets_usrn": inserted_direct + inserted_inferred} def _pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) -> dict[str, int]: @@ -2081,14 +2134,35 @@ def _pass_3_open_names_candidates(conn: psycopg.Connection, build_run_id: str) - def _pass_4_uprn_reinforcement(conn: 
psycopg.Connection, build_run_id: str) -> dict[str, int]: with conn.cursor() as cur: + cur.execute("SET LOCAL work_mem = '256MB'") + cur.execute( + """ + SELECT bbs.ingest_run_id + FROM meta.build_run AS br + JOIN meta.build_bundle_source AS bbs + ON bbs.bundle_id = br.bundle_id + WHERE br.build_run_id = %s + AND bbs.source_name = 'os_open_lids' + ORDER BY bbs.ingest_run_id::text ASC + LIMIT 1 + """, + (build_run_id,), + ) + run_row = cur.fetchone() + if run_row is None or run_row[0] is None: + raise BuildError( + "Pass 4 failed: missing os_open_lids ingest run for build bundle " + f"build_run_id={build_run_id}" + ) + open_lids_ingest_run_id = run_row[0] + cur.execute( """ WITH aggregate_pairs AS ( SELECT nsul.postcode_norm, lids.usrn, - COUNT(*)::bigint AS uprn_count, - (ARRAY_AGG(lids.ingest_run_id ORDER BY lids.ingest_run_id::text ASC))[1] AS open_lids_ingest_run_id + COUNT(*)::bigint AS uprn_count FROM stage.nsul_uprn_postcode AS nsul JOIN stage.open_lids_uprn_usrn AS lids ON lids.build_run_id = nsul.build_run_id @@ -2119,18 +2193,28 @@ def _pass_4_uprn_reinforcement(conn: psycopg.Connection, build_run_id: str) -> d 'high', 'open_lids:uprn_usrn:' || a.uprn_count::text || '_uprns', 'os_open_lids', - a.open_lids_ingest_run_id, + %s, jsonb_build_object('uprn_count', a.uprn_count) FROM aggregate_pairs AS a + JOIN stage.onspd_postcode AS sp + ON sp.build_run_id = %s + AND sp.postcode_norm = a.postcode_norm JOIN core.postcodes AS p ON p.produced_build_run_id = %s - AND replace(p.postcode, ' ', '') = a.postcode_norm + AND p.postcode = sp.postcode_display JOIN core.streets_usrn AS s ON s.produced_build_run_id = %s AND s.usrn = a.usrn ORDER BY p.postcode COLLATE "C" ASC, a.usrn ASC """, - (build_run_id, build_run_id, build_run_id, build_run_id), + ( + build_run_id, + build_run_id, + open_lids_ingest_run_id, + build_run_id, + build_run_id, + build_run_id, + ), ) inserted = cur.rowcount @@ -2426,16 +2510,6 @@ def _pass_7_ppd_gap_fill(conn: psycopg.Connection, 
build_run_id: str) -> dict[st } -def _confidence_from_rank(conf_rank: int) -> str: - if conf_rank >= 3: - return "high" - if conf_rank == 2: - return "medium" - if conf_rank == 1: - return "low" - return "none" - - def _pass_8_finalisation(conn: psycopg.Connection, build_run_id: str, dataset_version: str) -> dict[str, int]: weight_map = _weight_config() @@ -2504,6 +2578,27 @@ def _pass_8_finalisation(conn: psycopg.Connection, build_run_id: str, dataset_ve cur.execute( """ + CREATE INDEX IF NOT EXISTS idx_tmp_weighted_candidates_street + ON tmp_weighted_candidates (postcode, canonical_street_name, candidate_id) + """ + ) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_tmp_weighted_candidates_source + ON tmp_weighted_candidates ( + postcode, + canonical_street_name, + source_name, + ingest_run_id, + candidate_type + ) + """ + ) + + cur.execute("DROP TABLE IF EXISTS pg_temp.tmp_final_scored") + cur.execute( + """ + CREATE TEMP TABLE tmp_final_scored AS WITH grouped AS ( SELECT postcode, @@ -2552,8 +2647,13 @@ def _pass_8_finalisation(conn: psycopg.Connection, build_run_id: str, dataset_ve postcode, canonical_street_name, usrn, - weighted_score, - conf_rank, + ROUND(weighted_score::numeric, 4) AS frequency_score, + CASE conf_rank + WHEN 3 THEN 'high' + WHEN 2 THEN 'medium' + WHEN 1 THEN 'low' + ELSE 'none' + END AS confidence, CASE WHEN rn = 1 THEN ROUND((rounded_probability + (1.0000 - rounded_sum))::numeric, 4) @@ -2564,23 +2664,26 @@ def _pass_8_finalisation(conn: psycopg.Connection, build_run_id: str, dataset_ve ORDER BY postcode COLLATE "C" ASC, rn ASC """ ) - final_rows = cur.fetchall() - - inserted_final = 0 - inserted_final_candidate = 0 - inserted_final_source = 0 - - with conn.cursor() as cur: - for postcode, street_name, usrn, weighted_score, conf_rank, probability, _rn in final_rows: - frequency_score = Decimal(str(weighted_score)).quantize(Decimal("0.0001"), rounding=ROUND_HALF_UP) - probability_decimal = Decimal(str(probability)).quantize( - 
Decimal("0.0001"), - rounding=ROUND_HALF_UP, - ) - confidence = _confidence_from_rank(int(conf_rank)) + cur.execute( + """ + CREATE INDEX IF NOT EXISTS idx_tmp_final_scored_street + ON tmp_final_scored (postcode, canonical_street_name) + """ + ) - cur.execute( - """ + cur.execute("DROP TABLE IF EXISTS pg_temp.tmp_final_inserted") + cur.execute( + """ + CREATE TEMP TABLE tmp_final_inserted ( + final_id bigint PRIMARY KEY, + postcode text NOT NULL, + canonical_street_name text NOT NULL + ) ON COMMIT DROP + """ + ) + cur.execute( + """ + WITH inserted AS ( INSERT INTO derived.postcode_streets_final ( produced_build_run_id, postcode, @@ -2589,82 +2692,88 @@ def _pass_8_finalisation(conn: psycopg.Connection, build_run_id: str, dataset_ve confidence, frequency_score, probability - ) VALUES (%s, %s, %s, %s, %s, %s, %s) - RETURNING final_id - """, - ( - build_run_id, - postcode, - street_name, - usrn, - confidence, - frequency_score, - probability_decimal, - ), + ) + SELECT + %s, + fs.postcode, + fs.canonical_street_name, + fs.usrn, + fs.confidence, + fs.frequency_score, + fs.final_probability + FROM tmp_final_scored AS fs + ORDER BY fs.postcode COLLATE "C" ASC, fs.rn ASC + RETURNING final_id, postcode, street_name ) - final_id = int(cur.fetchone()[0]) - inserted_final += 1 + INSERT INTO tmp_final_inserted (final_id, postcode, canonical_street_name) + SELECT final_id, postcode, street_name + FROM inserted + """, + (build_run_id,), + ) + inserted_final = int(cur.rowcount) - cur.execute( - """ - SELECT candidate_id - FROM tmp_weighted_candidates - WHERE postcode = %s - AND canonical_street_name = %s - ORDER BY candidate_id ASC - """, - (postcode, street_name), + cur.execute( + """ + INSERT INTO derived.postcode_streets_final_candidate ( + final_id, + candidate_id, + produced_build_run_id, + link_rank ) - candidate_ids = [int(row[0]) for row in cur.fetchall()] - for rank, candidate_id in enumerate(candidate_ids, start=1): - cur.execute( - """ - INSERT INTO 
derived.postcode_streets_final_candidate ( - final_id, - candidate_id, - produced_build_run_id, - link_rank - ) VALUES (%s, %s, %s, %s) - """, - (final_id, candidate_id, build_run_id, rank), - ) - inserted_final_candidate += 1 + SELECT + fi.final_id, + wc.candidate_id, + %s, + ROW_NUMBER() OVER ( + PARTITION BY fi.final_id + ORDER BY wc.candidate_id ASC + ) AS link_rank + FROM tmp_final_inserted AS fi + JOIN tmp_weighted_candidates AS wc + ON wc.postcode = fi.postcode + AND wc.canonical_street_name = fi.canonical_street_name + ORDER BY fi.final_id ASC, wc.candidate_id ASC + """, + (build_run_id,), + ) + inserted_final_candidate = int(cur.rowcount) - cur.execute( - """ - SELECT source_name, ingest_run_id, candidate_type, SUM(weight) AS contribution_weight - FROM tmp_weighted_candidates - WHERE postcode = %s - AND canonical_street_name = %s - GROUP BY source_name, ingest_run_id, candidate_type - ORDER BY source_name COLLATE "C" ASC, ingest_run_id::text ASC, candidate_type COLLATE "C" ASC - """, - (postcode, street_name), + cur.execute( + """ + INSERT INTO derived.postcode_streets_final_source ( + final_id, + source_name, + ingest_run_id, + candidate_type, + contribution_weight, + produced_build_run_id ) - for source_name, ingest_run_id, candidate_type, contribution_weight in cur.fetchall(): - cur.execute( - """ - INSERT INTO derived.postcode_streets_final_source ( - final_id, - source_name, - ingest_run_id, - candidate_type, - contribution_weight, - produced_build_run_id - ) VALUES (%s, %s, %s, %s, %s, %s) - """, - ( - final_id, - source_name, - ingest_run_id, - candidate_type, - Decimal(str(contribution_weight)).quantize( - Decimal("0.0001"), rounding=ROUND_HALF_UP - ), - build_run_id, - ), - ) - inserted_final_source += 1 + SELECT + fi.final_id, + wc.source_name, + wc.ingest_run_id, + wc.candidate_type, + ROUND(SUM(wc.weight)::numeric, 4) AS contribution_weight, + %s + FROM tmp_final_inserted AS fi + JOIN tmp_weighted_candidates AS wc + ON wc.postcode = fi.postcode 
+ AND wc.canonical_street_name = fi.canonical_street_name + GROUP BY + fi.final_id, + wc.source_name, + wc.ingest_run_id, + wc.candidate_type + ORDER BY + fi.final_id ASC, + wc.source_name COLLATE "C" ASC, + wc.ingest_run_id::text ASC, + wc.candidate_type COLLATE "C" ASC + """, + (build_run_id,), + ) + inserted_final_source = int(cur.rowcount) cur.execute( """ From 2aef8a2ab63ecc44a900ea6f86a56035c4f043cd Mon Sep 17 00:00:00 2001 From: Jamie Thompson Date: Sun, 22 Feb 2026 23:41:12 +0000 Subject: [PATCH 17/17] fix(onspd): propagate post_town/locality and lock contract tests (#3) Ensure ONSPD passthrough admin fields are not silently dropped during stage normalisation. Changes: - add onspd field_map entries for post_town (potown) and locality - resolve post_town/locality via _field_value(...) so mapped names + aliases are honored - add contract tests guarding schema mapping + stage-loader extraction paths - update ONSPD/stage/data-model docs to document passthrough semantics and NULL behavior when absent upstream Validation: - python3 -m unittest discover -s tests -p 'test_*.py' (16 tests, all passing) --- docs/architecture/datasets/onspd.md | 4 +++ docs/architecture/stages/1_onspd_backbone.md | 1 + docs/spec/pipeline_v3/data_model.md | 7 +++++ pipeline/config/source_schema.yaml | 4 ++- pipeline/src/pipeline/build/workflows.py | 6 ++--- ...st_onspd_optional_admin_fields_contract.py | 27 +++++++++++++++++++ 6 files changed, 44 insertions(+), 5 deletions(-) create mode 100644 tests/test_onspd_optional_admin_fields_contract.py diff --git a/docs/architecture/datasets/onspd.md b/docs/architecture/datasets/onspd.md index bdde023..75a9bd8 100644 --- a/docs/architecture/datasets/onspd.md +++ b/docs/architecture/datasets/onspd.md @@ -15,7 +15,11 @@ ONSPD is the definitive postcode backbone. 
It validates postcode existence and c - `postcode_norm`, `postcode_display` - `status`, `lat`, `lon`, `easting`, `northing` - `country_iso2`, `country_iso3`, `subdivision_code` + - `post_town`, `locality` (when present in source payload) - `street_enrichment_available` +- Limitations: + - `post_town` and `locality` are passthrough attributes only. + - If a source release omits these fields, `stage.onspd_postcode` and downstream outputs retain `NULL`. ## Downstream Transformations - Pass 1 writes: diff --git a/docs/architecture/stages/1_onspd_backbone.md b/docs/architecture/stages/1_onspd_backbone.md index 9dd6f50..c83721c 100644 --- a/docs/architecture/stages/1_onspd_backbone.md +++ b/docs/architecture/stages/1_onspd_backbone.md @@ -13,6 +13,7 @@ Build canonical postcode entities from staged ONSPD rows. ## Execution Notes - set-based insert ordered by canonical postcode normalization key - post-insert `ANALYZE` keeps downstream join planning stable (Pass 3/4/5) +- `post_town` and `locality` are copied from `stage.onspd_postcode` into `core.postcodes` and `core.postcodes_meta` without inference. ## Value Added - authoritative postcode backbone diff --git a/docs/spec/pipeline_v3/data_model.md b/docs/spec/pipeline_v3/data_model.md index 3537be1..46bb129 100644 --- a/docs/spec/pipeline_v3/data_model.md +++ b/docs/spec/pipeline_v3/data_model.md @@ -43,6 +43,13 @@ Published dataset pointer log. 
- `core.postcodes_meta` - `core.streets_usrn` +### `core.postcodes` key columns + +- canonical postcode identity and geo/admin context: + - `postcode`, `status`, `lat`, `lon`, `easting`, `northing` + - `country_iso2`, `country_iso3`, `subdivision_code` + - `post_town`, `locality` (passthrough when present in staged ONSPD rows) + ## Derived Layer ### `derived.postcode_street_candidates` diff --git a/pipeline/config/source_schema.yaml b/pipeline/config/source_schema.yaml index 2ba72b4..a2e80ce 100644 --- a/pipeline/config/source_schema.yaml +++ b/pipeline/config/source_schema.yaml @@ -18,7 +18,9 @@ "northing": "north1m", "country_iso2": "ctry25cd", "country_iso3": "ctry25cd", - "subdivision_code": "ctry25cd" + "subdivision_code": "ctry25cd", + "post_town": "potown", + "locality": "locality" } }, "os_open_usrn": { diff --git a/pipeline/src/pipeline/build/workflows.py b/pipeline/src/pipeline/build/workflows.py index 38dcd21..b4d3e73 100644 --- a/pipeline/src/pipeline/build/workflows.py +++ b/pipeline/src/pipeline/build/workflows.py @@ -897,10 +897,8 @@ def _populate_stage_onspd( easting = None northing = None - post_town_key = field_map.get("post_town") - locality_key = field_map.get("locality") - post_town_raw = row.get(post_town_key) if post_town_key else None - locality_raw = row.get(locality_key) if locality_key else None + post_town_raw = _field_value(row, field_map, "post_town") + locality_raw = _field_value(row, field_map, "locality") payload.append( ( diff --git a/tests/test_onspd_optional_admin_fields_contract.py b/tests/test_onspd_optional_admin_fields_contract.py new file mode 100644 index 0000000..ced71db --- /dev/null +++ b/tests/test_onspd_optional_admin_fields_contract.py @@ -0,0 +1,27 @@ +import json +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +SOURCE_SCHEMA = ROOT / "pipeline" / "config" / "source_schema.yaml" +WORKFLOWS = ROOT / "pipeline" / "src" / "pipeline" / "build" / "workflows.py" + + +class 
OnspdOptionalAdminFieldsContractTests(unittest.TestCase): + def test_onspd_field_map_includes_post_town_and_locality(self) -> None: + payload = json.loads(SOURCE_SCHEMA.read_text(encoding="utf-8")) + field_map = payload["sources"]["onspd"]["field_map"] + self.assertIn("post_town", field_map) + self.assertIn("locality", field_map) + self.assertTrue(str(field_map["post_town"]).strip()) + self.assertTrue(str(field_map["locality"]).strip()) + + def test_stage_loader_resolves_post_town_and_locality_via_field_candidates(self) -> None: + text = WORKFLOWS.read_text(encoding="utf-8") + self.assertIn('post_town_raw = _field_value(row, field_map, "post_town")', text) + self.assertIn('locality_raw = _field_value(row, field_map, "locality")', text) + + +if __name__ == "__main__": + unittest.main()