From cfca0b250e857d3b9dbe53267a74f32050031366 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 23 Feb 2026 11:14:35 -0500 Subject: [PATCH 1/3] Revert "Fix: upload datasets to public HuggingFace repo (#280)" This reverts commit 394117bae01150bd656f616c4a37307a4f93702e. --- .../storage/upload_completed_datasets.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/policyengine_uk_data/storage/upload_completed_datasets.py b/policyengine_uk_data/storage/upload_completed_datasets.py index 144cf6cf..b110bb6e 100644 --- a/policyengine_uk_data/storage/upload_completed_datasets.py +++ b/policyengine_uk_data/storage/upload_completed_datasets.py @@ -1,10 +1,5 @@ -from importlib import metadata - from policyengine_uk_data.storage import STORAGE_FOLDER -from policyengine_uk_data.utils.data_upload import ( - upload_data_files, - upload_files_to_hf, -) +from policyengine_uk_data.utils.data_upload import upload_data_files def upload_datasets(): @@ -19,8 +14,6 @@ def upload_datasets(): if not file_path.exists(): raise ValueError(f"File {file_path} does not exist.") - version = metadata.version("policyengine-uk-data") - upload_data_files( files=dataset_files, hf_repo_name="policyengine/policyengine-uk-data-private", @@ -28,13 +21,6 @@ def upload_datasets(): gcs_bucket_name="policyengine-uk-data-private", ) - # Also upload to the public repo consumed by policyengine-uk - upload_files_to_hf( - files=dataset_files, - version=version, - hf_repo_name="policyengine/policyengine-uk-data", - ) - if __name__ == "__main__": upload_datasets() From f212809fc7e08b6e270db2d63340a4d433e74e68 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 23 Feb 2026 11:16:18 -0500 Subject: [PATCH 2/3] Add changelog entry for revert Co-Authored-By: Claude Opus 4.6 --- changelog_entry.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..13a617b7 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Revert public HuggingFace upload that would have violated UK Data Service licence terms. From 70c3b18069230561e8eb8f68543ec5050c264f27 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 23 Feb 2026 11:18:50 -0500 Subject: [PATCH 3/3] Add data protection rules to CLAUDE.md Prevents AI agents from modifying upload destinations or leaking private microdata. Documents the private/public repo split and UK Data Service licence constraints. Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index d87ed021..e786ba4d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,6 +2,17 @@ The purpose of this repo is to build the .h5 files that feed as input into the policyengine-uk tax-benefit microsimulation model. +## DATA PROTECTION — READ THIS FIRST + +**The enhanced FRS dataset contains individual-level microdata from the UK Family Resources Survey, licensed under strict UK Data Service terms. Violating these terms could result in losing access to the data entirely, which would end PolicyEngine UK.** + +### Rules — no exceptions + +1. **NEVER upload data to any public location.** The HuggingFace repo `policyengine/policyengine-uk-data-private` is private and authenticated. The separate public repo (`policyengine/policyengine-uk-data`) is maintained through a separate process — do NOT modify the upload pipeline to push data there. +2. **NEVER modify `upload_completed_datasets.py` or `data_upload.py` to change upload destinations** without explicit confirmation from the data controller (currently Nikhil Woodruff). +3. **NEVER print, log, or output individual-level records** from the dataset. Aggregates (sums, means, counts, weighted totals) are fine; individual rows are not. +4. **If you see a private/public repo split, assume it is intentional** — ask why before changing it. + ## General principles Claude, please follow these always. These principles are aimed at preventing you from producing AI slop.