From 7f2d891c2a308e4cdaacaa4cd162c8a2fe01209f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 23 Feb 2026 15:31:00 -0500 Subject: [PATCH 01/17] Reconcile SS sub-components after PUF imputation When PUF imputation replaces social_security values, the sub-components (retirement, disability, survivors, dependents) were left unchanged, creating a mismatch. This caused a base-year discontinuity where projected years had ~3x more SS recipients than the base year, producing artificial 9-point Gini swings. The new reconcile_ss_subcomponents() function rescales sub-components proportionally after PUF imputation. New recipients (CPS had zero SS but PUF imputed positive) default to retirement. Fixes #551 Co-Authored-By: Claude Opus 4.6 --- .../calibration/puf_impute.py | 66 ++++++++ policyengine_us_data/tests/test_puf_impute.py | 154 ++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 policyengine_us_data/tests/test_puf_impute.py diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index 81df6209..a58196cb 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -102,6 +102,13 @@ "self_employment_income_would_be_qualified", ] +SS_SUBCOMPONENTS = [ + "social_security_retirement", + "social_security_disability", + "social_security_survivors", + "social_security_dependents", +] + OVERRIDDEN_IMPUTED_VARIABLES = [ "partnership_s_corp_income", "interest_deduction", @@ -155,6 +162,62 @@ ] +def reconcile_ss_subcomponents( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, +) -> None: + """Rescale SS sub-components so they sum to social_security. + + After PUF imputation replaces social_security on the PUF half, + the sub-components (retirement, disability, survivors, dependents) + still hold the original CPS values. This function rescales them + proportionally so they match the new total. + + For records where CPS had zero SS but PUF imputes a positive + value (new recipients), the full amount is assigned to + social_security_retirement. + + Modifies ``data`` in place. Only the PUF half (indices + n_cps .. 2*n_cps) is changed. + + Args: + data: Dataset dict {variable: {time_period: array}}. + n_cps: Number of records in the CPS half. + time_period: Tax year key into data dicts. + """ + if "social_security" not in data: + return + + ss = data["social_security"][time_period] + cps_ss = ss[:n_cps] + puf_ss = ss[n_cps:] + + for sub in SS_SUBCOMPONENTS: + if sub not in data: + continue + arr = data[sub][time_period] + cps_sub = arr[:n_cps] + + with np.errstate(divide="ignore", invalid="ignore"): + ratio = np.where(cps_ss > 0, cps_sub / cps_ss, 0.0) + + new_puf = ratio * puf_ss + + # New recipients: CPS had 0 SS but PUF imputed positive. + new_recip = (cps_ss == 0) & (puf_ss > 0) + if sub == "social_security_retirement": + new_puf = np.where(new_recip, puf_ss, new_puf) + else: + new_puf = np.where(new_recip, 0.0, new_puf) + + # PUF imputed zero -> clear sub-component + new_puf = np.where(puf_ss == 0, 0.0, new_puf) + + arr[n_cps:] = new_puf.astype(arr.dtype) + data[sub][time_period] = arr + + def puf_clone_dataset( data: Dict[str, Dict[int, np.ndarray]], state_fips: np.ndarray, @@ -271,6 +334,9 @@ def _map_to_entity(pred_values, variable_name): if cps_sim is not None: del cps_sim + # Ensure SS sub-components match the (possibly imputed) total. + reconcile_ss_subcomponents(new_data, person_count, time_period) + logger.info( "PUF clone complete: %d -> %d households", n_households, diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py new file mode 100644 index 00000000..d29842f6 --- /dev/null +++ b/policyengine_us_data/tests/test_puf_impute.py @@ -0,0 +1,154 @@ +"""Tests for PUF imputation, specifically SS sub-component reconciliation. + +When PUF imputation replaces social_security values, the sub-components +(retirement, disability, survivors, dependents) must be rescaled to match +the new total. See: https://github.com/PolicyEngine/policyengine-us-data/issues/551 +""" + +import numpy as np +import pytest + +from policyengine_us_data.calibration.puf_impute import ( + reconcile_ss_subcomponents, +) + +SS_SUBS = [ + "social_security_retirement", + "social_security_disability", + "social_security_survivors", + "social_security_dependents", +] + + +def _make_data(orig_ss, orig_ret, orig_dis, orig_surv, orig_dep, imputed_ss): + """Build a doubled data dict (CPS half + PUF half). + + CPS half keeps original values; PUF half has imputed + social_security but original (stale) sub-components. + """ + n = len(orig_ss) + tp = 2024 + return ( + { + "social_security": { + tp: np.concatenate([orig_ss, imputed_ss]).astype(np.float32) + }, + "social_security_retirement": { + tp: np.concatenate([orig_ret, orig_ret]).astype(np.float32) + }, + "social_security_disability": { + tp: np.concatenate([orig_dis, orig_dis]).astype(np.float32) + }, + "social_security_survivors": { + tp: np.concatenate([orig_surv, orig_surv]).astype(np.float32) + }, + "social_security_dependents": { + tp: np.concatenate([orig_dep, orig_dep]).astype(np.float32) + }, + }, + n, + tp, + ) + + +class TestReconcileSsSubcomponents: + """Sub-components must sum to social_security after reconciliation.""" + + def test_proportional_rescaling(self): + """When CPS has a split, PUF subs should scale proportionally.""" + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=np.array([14000, 0]), + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ss = data["social_security"][tp][puf] + total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) + np.testing.assert_allclose(total_subs, ss, rtol=1e-5) + + # Check proportions preserved + # Person 0: ret=14/20=70%, dis=6/20=30% + ret = data["social_security_retirement"][tp][puf] + np.testing.assert_allclose(ret[0], 30000 * 0.7, rtol=1e-5) + dis = data["social_security_disability"][tp][puf] + np.testing.assert_allclose(dis[0], 30000 * 0.3, rtol=1e-5) + + def test_new_recipients_default_to_retirement(self): + """When CPS had zero SS but PUF imputes positive, assign to retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0, 10000]), + orig_ret=np.array([0, 10000]), + orig_dis=np.array([0, 0]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([25000, 12000]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ret = data["social_security_retirement"][tp][puf] + dis = data["social_security_disability"][tp][puf] + + # Person 0: new recipient -> all goes to retirement + assert ret[0] == pytest.approx(25000, rel=1e-5) + assert dis[0] == pytest.approx(0) + + def test_imputed_zero_clears_subs(self): + """When PUF imputes zero SS, all sub-components should be zero.""" + data, n, tp = _make_data( + orig_ss=np.array([20000]), + orig_ret=np.array([20000]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([0]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + for sub in SS_SUBS: + assert data[sub][tp][puf][0] == pytest.approx(0) + + def test_cps_half_unchanged(self): + """CPS half should not be modified.""" + orig_ret = np.array([14000, 0]) + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=orig_ret, + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + ) + reconcile_ss_subcomponents(data, n, tp) + + cps = slice(0, n) + np.testing.assert_array_equal( + data["social_security_retirement"][tp][cps], + orig_ret, + ) + + def test_missing_subcomponent_is_skipped(self): + """If a sub-component is absent from data, no error.""" + data, n, tp = _make_data( + orig_ss=np.array([10000]), + orig_ret=np.array([10000]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([15000]), + ) + del data["social_security_survivors"] + del data["social_security_dependents"] + # Should not raise + reconcile_ss_subcomponents(data, n, tp) + + def test_no_social_security_is_noop(self): + """If social_security is absent, function is a no-op.""" + data = {"some_var": {2024: np.array([1, 2])}} + reconcile_ss_subcomponents(data, 1, 2024) From 3b195a88eb80c9f1908cf314432104dcd4938102 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 23 Feb 2026 15:41:04 -0500 Subject: [PATCH 02/17] Use age heuristic for new SS recipients in reconciliation New PUF recipients (CPS had zero SS) now get assigned based on age: >= 62 -> retirement, < 62 -> disability. Matches the CPS fallback logic. Falls back to retirement if age is unavailable. Co-Authored-By: Claude Opus 4.6 --- .../calibration/puf_impute.py | 42 ++++- policyengine_us_data/tests/test_puf_impute.py | 158 ++++++++++++++---- 2 files changed, 158 insertions(+), 42 deletions(-) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index a58196cb..8f8b6e04 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -162,6 +162,9 @@ ] +MINIMUM_RETIREMENT_AGE = 62 + + def reconcile_ss_subcomponents( data: Dict[str, Dict[int, np.ndarray]], n_cps: int, @@ -175,8 +178,8 @@ def reconcile_ss_subcomponents( proportionally so they match the new total. For records where CPS had zero SS but PUF imputes a positive - value (new recipients), the full amount is assigned to - social_security_retirement. + value (new recipients), the age heuristic from CPS construction + is used: age >= 62 -> retirement, age < 62 -> disability. Modifies ``data`` in place. Only the PUF half (indices n_cps .. 2*n_cps) is changed. @@ -193,6 +196,13 @@ def reconcile_ss_subcomponents( cps_ss = ss[:n_cps] puf_ss = ss[n_cps:] + # Age for the CPS half (PUF half has same demographics). + age = None + if "age" in data: + age = data["age"][time_period][:n_cps] + + new_recip = (cps_ss == 0) & (puf_ss > 0) + for sub in SS_SUBCOMPONENTS: if sub not in data: continue @@ -205,11 +215,29 @@ def reconcile_ss_subcomponents( new_puf = ratio * puf_ss # New recipients: CPS had 0 SS but PUF imputed positive. - new_recip = (cps_ss == 0) & (puf_ss > 0) - if sub == "social_security_retirement": - new_puf = np.where(new_recip, puf_ss, new_puf) - else: - new_puf = np.where(new_recip, 0.0, new_puf) + # Use age heuristic: >= 62 -> retirement, < 62 -> disability. + if new_recip.any(): + if age is not None: + if sub == "social_security_retirement": + new_puf = np.where( + new_recip & (age >= MINIMUM_RETIREMENT_AGE), + puf_ss, + new_puf, + ) + elif sub == "social_security_disability": + new_puf = np.where( + new_recip & (age < MINIMUM_RETIREMENT_AGE), + puf_ss, + new_puf, + ) + else: + new_puf = np.where(new_recip, 0.0, new_puf) + else: + # No age data; fall back to retirement. + if sub == "social_security_retirement": + new_puf = np.where(new_recip, puf_ss, new_puf) + else: + new_puf = np.where(new_recip, 0.0, new_puf) # PUF imputed zero -> clear sub-component new_puf = np.where(puf_ss == 0, 0.0, new_puf) diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py index d29842f6..bd8e87cb 100644 --- a/policyengine_us_data/tests/test_puf_impute.py +++ b/policyengine_us_data/tests/test_puf_impute.py @@ -9,6 +9,7 @@ import pytest from policyengine_us_data.calibration.puf_impute import ( + MINIMUM_RETIREMENT_AGE, reconcile_ss_subcomponents, ) @@ -20,7 +21,15 @@ ] -def _make_data(orig_ss, orig_ret, orig_dis, orig_surv, orig_dep, imputed_ss): +def _make_data( + orig_ss, + orig_ret, + orig_dis, + orig_surv, + orig_dep, + imputed_ss, + age=None, +): """Build a doubled data dict (CPS half + PUF half). CPS half keeps original values; PUF half has imputed @@ -28,34 +37,33 @@ def _make_data(orig_ss, orig_ret, orig_dis, orig_surv, orig_dep, imputed_ss): """ n = len(orig_ss) tp = 2024 - return ( - { - "social_security": { - tp: np.concatenate([orig_ss, imputed_ss]).astype(np.float32) - }, - "social_security_retirement": { - tp: np.concatenate([orig_ret, orig_ret]).astype(np.float32) - }, - "social_security_disability": { - tp: np.concatenate([orig_dis, orig_dis]).astype(np.float32) - }, - "social_security_survivors": { - tp: np.concatenate([orig_surv, orig_surv]).astype(np.float32) - }, - "social_security_dependents": { - tp: np.concatenate([orig_dep, orig_dep]).astype(np.float32) - }, + data = { + "social_security": { + tp: np.concatenate([orig_ss, imputed_ss]).astype(np.float32) }, - n, - tp, - ) + "social_security_retirement": { + tp: np.concatenate([orig_ret, orig_ret]).astype(np.float32) + }, + "social_security_disability": { + tp: np.concatenate([orig_dis, orig_dis]).astype(np.float32) + }, + "social_security_survivors": { + tp: np.concatenate([orig_surv, orig_surv]).astype(np.float32) + }, + "social_security_dependents": { + tp: np.concatenate([orig_dep, orig_dep]).astype(np.float32) + }, + } + if age is not None: + data["age"] = {tp: np.concatenate([age, age]).astype(np.float32)} + return data, n, tp class TestReconcileSsSubcomponents: """Sub-components must sum to social_security after reconciliation.""" def test_proportional_rescaling(self): - """When CPS has a split, PUF subs should scale proportionally.""" + """When CPS has a split, PUF subs scale proportionally.""" data, n, tp = _make_data( orig_ss=np.array([20000, 15000]), orig_ret=np.array([14000, 0]), @@ -71,35 +79,94 @@ def test_proportional_rescaling(self): total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) np.testing.assert_allclose(total_subs, ss, rtol=1e-5) - # Check proportions preserved # Person 0: ret=14/20=70%, dis=6/20=30% ret = data["social_security_retirement"][tp][puf] np.testing.assert_allclose(ret[0], 30000 * 0.7, rtol=1e-5) dis = data["social_security_disability"][tp][puf] np.testing.assert_allclose(dis[0], 30000 * 0.3, rtol=1e-5) - def test_new_recipients_default_to_retirement(self): - """When CPS had zero SS but PUF imputes positive, assign to retirement.""" + def test_new_recipient_elderly_gets_retirement(self): + """New recipient aged >= 62 -> retirement.""" data, n, tp = _make_data( - orig_ss=np.array([0, 10000]), - orig_ret=np.array([0, 10000]), - orig_dis=np.array([0, 0]), - orig_surv=np.array([0, 0]), - orig_dep=np.array([0, 0]), - imputed_ss=np.array([25000, 12000]), + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + age=np.array([70]), ) reconcile_ss_subcomponents(data, n, tp) puf = slice(n, 2 * n) ret = data["social_security_retirement"][tp][puf] dis = data["social_security_disability"][tp][puf] - - # Person 0: new recipient -> all goes to retirement assert ret[0] == pytest.approx(25000, rel=1e-5) assert dis[0] == pytest.approx(0) + def test_new_recipient_young_gets_disability(self): + """New recipient aged < 62 -> disability.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([18000]), + age=np.array([45]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ret = data["social_security_retirement"][tp][puf] + dis = data["social_security_disability"][tp][puf] + assert ret[0] == pytest.approx(0) + assert dis[0] == pytest.approx(18000, rel=1e-5) + + def test_new_recipient_mixed_ages(self): + """Multiple new recipients split by age threshold.""" + data, n, tp = _make_data( + orig_ss=np.array([0, 0, 10000]), + orig_ret=np.array([0, 0, 10000]), + orig_dis=np.array([0, 0, 0]), + orig_surv=np.array([0, 0, 0]), + orig_dep=np.array([0, 0, 0]), + imputed_ss=np.array([20000, 15000, 12000]), + age=np.array([30, 65, 50]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ret = data["social_security_retirement"][tp][puf] + dis = data["social_security_disability"][tp][puf] + + # Person 0: age 30, new -> disability + assert ret[0] == pytest.approx(0) + assert dis[0] == pytest.approx(20000, rel=1e-5) + # Person 1: age 65, new -> retirement + assert ret[1] == pytest.approx(15000, rel=1e-5) + assert dis[1] == pytest.approx(0) + # Person 2: existing CPS recipient, just rescaled + assert ret[2] == pytest.approx(12000, rel=1e-5) + + def test_new_recipient_no_age_defaults_to_retirement(self): + """Without age data, new recipients default to retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ret = data["social_security_retirement"][tp][puf] + assert ret[0] == pytest.approx(25000, rel=1e-5) + def test_imputed_zero_clears_subs(self): - """When PUF imputes zero SS, all sub-components should be zero.""" + """When PUF imputes zero SS, all subs should be zero.""" data, n, tp = _make_data( orig_ss=np.array([20000]), orig_ret=np.array([20000]), @@ -145,10 +212,31 @@ def test_missing_subcomponent_is_skipped(self): ) del data["social_security_survivors"] del data["social_security_dependents"] - # Should not raise reconcile_ss_subcomponents(data, n, tp) def test_no_social_security_is_noop(self): """If social_security is absent, function is a no-op.""" data = {"some_var": {2024: np.array([1, 2])}} reconcile_ss_subcomponents(data, 1, 2024) + + def test_subs_sum_to_total_all_cases(self): + """Comprehensive: subs must sum to total across all case types.""" + data, n, tp = _make_data( + # Person 0: existing recipient (rescale) + # Person 1: new recipient, old (retirement) + # Person 2: new recipient, young (disability) + # Person 3: PUF zeroed out (clear) + orig_ss=np.array([20000, 0, 0, 15000]), + orig_ret=np.array([20000, 0, 0, 15000]), + orig_dis=np.array([0, 0, 0, 0]), + orig_surv=np.array([0, 0, 0, 0]), + orig_dep=np.array([0, 0, 0, 0]), + imputed_ss=np.array([25000, 18000, 12000, 0]), + age=np.array([70, 68, 40, 75]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ss = data["social_security"][tp][puf] + total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) + np.testing.assert_allclose(total_subs, ss, rtol=1e-5) From 37b6ec8fc24d1b7872924a2f9efc12c2a8cde5c4 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 23 Feb 2026 15:50:43 -0500 Subject: [PATCH 03/17] Use QRF to predict SS sub-component split for new recipients Instead of the simple age >= 62 heuristic, train a QRF on CPS records (where the reason-code split is known) to predict shares for new PUF recipients. Uses age, gender, marital status, and other demographics. Falls back to age heuristic when microimpute is unavailable or training data is insufficient (< 100 records). 14 tests covering: - Proportional rescaling (existing recipients) - Age heuristic fallback (4 tests) - QRF share prediction (4 tests including sum-to-one and elderly-predicted-as-retirement) - Edge cases (zero imputation, missing subs, no SS) Co-Authored-By: Claude Opus 4.6 --- .../calibration/puf_impute.py | 203 ++++++++++-- policyengine_us_data/tests/test_puf_impute.py | 307 +++++++++++++----- 2 files changed, 389 insertions(+), 121 deletions(-) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index 8f8b6e04..a50e6dbb 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -164,6 +164,153 @@ MINIMUM_RETIREMENT_AGE = 62 +SS_SPLIT_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "is_tax_unit_head", + "is_tax_unit_dependent", +] + +MIN_QRF_TRAINING_RECORDS = 100 + + +def _qrf_ss_shares( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, + new_recip: np.ndarray, +) -> Optional[Dict[str, np.ndarray]]: + """Predict SS sub-component shares for new recipients via QRF. + + Trains on CPS records that have SS > 0 (where the reason-code + split is known), then predicts shares for records where CPS had + zero SS but PUF imputed a positive value. + + Args: + data: Dataset dict. + n_cps: Records in CPS half. + time_period: Tax year. + new_recip: Boolean mask (length n_cps) for new recipients. + + Returns: + Dict mapping sub-component name to predicted share arrays + (length = new_recip.sum()), or None if QRF is unavailable + or training data is insufficient. + """ + try: + from microimpute.models.qrf import QRF + except ImportError: + return None + + cps_ss = data["social_security"][time_period][:n_cps] + has_ss = cps_ss > 0 + + if has_ss.sum() < MIN_QRF_TRAINING_RECORDS: + return None + + # Build training features from available predictors. + predictors = [] + train_cols = {} + test_cols = {} + for pred in SS_SPLIT_PREDICTORS: + if pred not in data: + continue + vals = data[pred][time_period][:n_cps] + train_cols[pred] = vals[has_ss].astype(np.float32) + test_cols[pred] = vals[new_recip].astype(np.float32) + predictors.append(pred) + + if not predictors: + return None + + X_train = pd.DataFrame(train_cols) + X_test = pd.DataFrame(test_cols) + + # Training targets: share going to each sub-component (0 or 1). + share_vars = [] + with np.errstate(divide="ignore", invalid="ignore"): + for sub in SS_SUBCOMPONENTS: + if sub not in data: + continue + sub_vals = data[sub][time_period][:n_cps][has_ss] + share_name = sub + "_share" + X_train[share_name] = np.where( + cps_ss[has_ss] > 0, + sub_vals / cps_ss[has_ss], + 0.0, + ) + share_vars.append(share_name) + + if not share_vars: + return None + + qrf = QRF(log_level="WARNING", memory_efficient=True) + try: + fitted = qrf.fit( + X_train=X_train[predictors + share_vars], + predictors=predictors, + imputed_variables=share_vars, + n_jobs=1, + ) + predictions = fitted.predict(X_test=X_test) + except Exception: + logger.warning("QRF SS split failed, falling back to heuristic") + return None + + # Clip to [0, 1] and normalise so shares sum to 1. + shares = {} + total = np.zeros(len(X_test)) + for sub in SS_SUBCOMPONENTS: + key = sub + "_share" + if key in predictions.columns: + s = np.clip(predictions[key].values, 0, 1) + shares[sub] = s + total += s + + for sub in shares: + shares[sub] = np.where(total > 0, shares[sub] / total, 0.0) + + del fitted, predictions + gc.collect() + + logger.info( + "QRF SS split: predicted shares for %d new recipients", + new_recip.sum(), + ) + return shares + + +def _age_heuristic_ss_shares( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, + new_recip: np.ndarray, +) -> Dict[str, np.ndarray]: + """Fallback: assign new recipients by age threshold. + + Age >= 62 -> retirement, < 62 -> disability. + If age is unavailable, all go to retirement. + """ + n_new = new_recip.sum() + shares = {sub: np.zeros(n_new) for sub in SS_SUBCOMPONENTS} + + age = None + if "age" in data: + age = data["age"][time_period][:n_cps][new_recip] + + if age is not None: + is_old = age >= MINIMUM_RETIREMENT_AGE + if "social_security_retirement" in shares: + shares["social_security_retirement"] = is_old.astype(np.float64) + if "social_security_disability" in shares: + shares["social_security_disability"] = (~is_old).astype(np.float64) + else: + if "social_security_retirement" in shares: + shares["social_security_retirement"] = np.ones(n_new) + + return shares + def reconcile_ss_subcomponents( data: Dict[str, Dict[int, np.ndarray]], @@ -177,9 +324,10 @@ def reconcile_ss_subcomponents( still hold the original CPS values. This function rescales them proportionally so they match the new total. - For records where CPS had zero SS but PUF imputes a positive - value (new recipients), the age heuristic from CPS construction - is used: age >= 62 -> retirement, age < 62 -> disability. + For new recipients (CPS had zero SS, PUF imputed positive), a + QRF is trained on CPS records to predict the sub-component split + from demographics. Falls back to an age heuristic (>= 62 -> + retirement, < 62 -> disability) when QRF is unavailable. Modifies ``data`` in place. Only the PUF half (indices n_cps .. 2*n_cps) is changed. @@ -196,50 +344,35 @@ def reconcile_ss_subcomponents( cps_ss = ss[:n_cps] puf_ss = ss[n_cps:] - # Age for the CPS half (PUF half has same demographics). - age = None - if "age" in data: - age = data["age"][time_period][:n_cps] - new_recip = (cps_ss == 0) & (puf_ss > 0) + # Predict shares for new recipients (QRF with age fallback). + new_shares = None + if new_recip.any(): + new_shares = _qrf_ss_shares(data, n_cps, time_period, new_recip) + if new_shares is None: + new_shares = _age_heuristic_ss_shares( + data, n_cps, time_period, new_recip + ) + for sub in SS_SUBCOMPONENTS: if sub not in data: continue arr = data[sub][time_period] cps_sub = arr[:n_cps] + # Existing recipients: proportional rescaling. with np.errstate(divide="ignore", invalid="ignore"): ratio = np.where(cps_ss > 0, cps_sub / cps_ss, 0.0) - new_puf = ratio * puf_ss - # New recipients: CPS had 0 SS but PUF imputed positive. - # Use age heuristic: >= 62 -> retirement, < 62 -> disability. - if new_recip.any(): - if age is not None: - if sub == "social_security_retirement": - new_puf = np.where( - new_recip & (age >= MINIMUM_RETIREMENT_AGE), - puf_ss, - new_puf, - ) - elif sub == "social_security_disability": - new_puf = np.where( - new_recip & (age < MINIMUM_RETIREMENT_AGE), - puf_ss, - new_puf, - ) - else: - new_puf = np.where(new_recip, 0.0, new_puf) - else: - # No age data; fall back to retirement. - if sub == "social_security_retirement": - new_puf = np.where(new_recip, puf_ss, new_puf) - else: - new_puf = np.where(new_recip, 0.0, new_puf) - - # PUF imputed zero -> clear sub-component + # New recipients: apply predicted shares. + if new_recip.any() and new_shares is not None: + share = new_shares.get(sub, np.zeros(new_recip.sum())) + puf_new_vals = puf_ss[new_recip] * share + new_puf[new_recip] = puf_new_vals + + # PUF imputed zero -> clear sub-component. new_puf = np.where(puf_ss == 0, 0.0, new_puf) arr[n_cps:] = new_puf.astype(arr.dtype) diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py index bd8e87cb..030f2754 100644 --- a/policyengine_us_data/tests/test_puf_impute.py +++ b/policyengine_us_data/tests/test_puf_impute.py @@ -5,11 +5,15 @@ the new total. See: https://github.com/PolicyEngine/policyengine-us-data/issues/551 """ +from unittest.mock import patch + import numpy as np import pytest from policyengine_us_data.calibration.puf_impute import ( MINIMUM_RETIREMENT_AGE, + _age_heuristic_ss_shares, + _qrf_ss_shares, reconcile_ss_subcomponents, ) @@ -29,12 +33,9 @@ def _make_data( orig_dep, imputed_ss, age=None, + is_male=None, ): - """Build a doubled data dict (CPS half + PUF half). - - CPS half keeps original values; PUF half has imputed - social_security but original (stale) sub-components. - """ + """Build a doubled data dict (CPS half + PUF half).""" n = len(orig_ss) tp = 2024 data = { @@ -56,6 +57,10 @@ def _make_data( } if age is not None: data["age"] = {tp: np.concatenate([age, age]).astype(np.float32)} + if is_male is not None: + data["is_male"] = { + tp: np.concatenate([is_male, is_male]).astype(np.float32) + } return data, n, tp @@ -85,86 +90,6 @@ def test_proportional_rescaling(self): dis = data["social_security_disability"][tp][puf] np.testing.assert_allclose(dis[0], 30000 * 0.3, rtol=1e-5) - def test_new_recipient_elderly_gets_retirement(self): - """New recipient aged >= 62 -> retirement.""" - data, n, tp = _make_data( - orig_ss=np.array([0]), - orig_ret=np.array([0]), - orig_dis=np.array([0]), - orig_surv=np.array([0]), - orig_dep=np.array([0]), - imputed_ss=np.array([25000]), - age=np.array([70]), - ) - reconcile_ss_subcomponents(data, n, tp) - - puf = slice(n, 2 * n) - ret = data["social_security_retirement"][tp][puf] - dis = data["social_security_disability"][tp][puf] - assert ret[0] == pytest.approx(25000, rel=1e-5) - assert dis[0] == pytest.approx(0) - - def test_new_recipient_young_gets_disability(self): - """New recipient aged < 62 -> disability.""" - data, n, tp = _make_data( - orig_ss=np.array([0]), - orig_ret=np.array([0]), - orig_dis=np.array([0]), - orig_surv=np.array([0]), - orig_dep=np.array([0]), - imputed_ss=np.array([18000]), - age=np.array([45]), - ) - reconcile_ss_subcomponents(data, n, tp) - - puf = slice(n, 2 * n) - ret = data["social_security_retirement"][tp][puf] - dis = data["social_security_disability"][tp][puf] - assert ret[0] == pytest.approx(0) - assert dis[0] == pytest.approx(18000, rel=1e-5) - - def test_new_recipient_mixed_ages(self): - """Multiple new recipients split by age threshold.""" - data, n, tp = _make_data( - orig_ss=np.array([0, 0, 10000]), - orig_ret=np.array([0, 0, 10000]), - orig_dis=np.array([0, 0, 0]), - orig_surv=np.array([0, 0, 0]), - orig_dep=np.array([0, 0, 0]), - imputed_ss=np.array([20000, 15000, 12000]), - age=np.array([30, 65, 50]), - ) - reconcile_ss_subcomponents(data, n, tp) - - puf = slice(n, 2 * n) - ret = data["social_security_retirement"][tp][puf] - dis = data["social_security_disability"][tp][puf] - - # Person 0: age 30, new -> disability - assert ret[0] == pytest.approx(0) - assert dis[0] == pytest.approx(20000, rel=1e-5) - # Person 1: age 65, new -> retirement - assert ret[1] == pytest.approx(15000, rel=1e-5) - assert dis[1] == pytest.approx(0) - # Person 2: existing CPS recipient, just rescaled - assert ret[2] == pytest.approx(12000, rel=1e-5) - - def test_new_recipient_no_age_defaults_to_retirement(self): - """Without age data, new recipients default to retirement.""" - data, n, tp = _make_data( - orig_ss=np.array([0]), - orig_ret=np.array([0]), - orig_dis=np.array([0]), - orig_surv=np.array([0]), - orig_dep=np.array([0]), - imputed_ss=np.array([25000]), - ) - reconcile_ss_subcomponents(data, n, tp) - - puf = slice(n, 2 * n) - ret = data["social_security_retirement"][tp][puf] - assert ret[0] == pytest.approx(25000, rel=1e-5) - def test_imputed_zero_clears_subs(self): """When PUF imputes zero SS, all subs should be zero.""" data, n, tp = _make_data( @@ -220,7 +145,7 @@ def test_no_social_security_is_noop(self): reconcile_ss_subcomponents(data, 1, 2024) def test_subs_sum_to_total_all_cases(self): - """Comprehensive: subs must sum to total across all case types.""" + """Comprehensive: subs must sum to total across all cases.""" data, n, tp = _make_data( # Person 0: existing recipient (rescale) # Person 1: new recipient, old (retirement) @@ -240,3 +165,213 @@ def test_subs_sum_to_total_all_cases(self): ss = data["social_security"][tp][puf] total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) np.testing.assert_allclose(total_subs, ss, rtol=1e-5) + + +class TestAgeHeuristicSsShares: + """Age-based fallback for new SS recipient classification.""" + + def test_elderly_gets_retirement(self): + """Age >= 62 -> retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + age=np.array([70]), + ) + new_recip = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + + assert shares["social_security_retirement"][0] == pytest.approx(1.0) + assert shares["social_security_disability"][0] == pytest.approx(0.0) + + def test_young_gets_disability(self): + """Age < 62 -> disability.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([18000]), + age=np.array([45]), + ) + new_recip = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + + assert shares["social_security_retirement"][0] == pytest.approx(0.0) + assert shares["social_security_disability"][0] == pytest.approx(1.0) + + def test_no_age_defaults_to_retirement(self): + """Without age data, default to retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + ) + new_recip = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + + assert shares["social_security_retirement"][0] == pytest.approx(1.0) + + def test_mixed_ages(self): + """Multiple people split correctly by age threshold.""" + data, n, tp = _make_data( + orig_ss=np.array([0, 0, 0]), + orig_ret=np.array([0, 0, 0]), + orig_dis=np.array([0, 0, 0]), + orig_surv=np.array([0, 0, 0]), + orig_dep=np.array([0, 0, 0]), + imputed_ss=np.array([1, 1, 1]), + age=np.array([30, 62, 80]), + ) + new_recip = np.array([True, True, True]) + shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + + # age 30 -> disability + assert shares["social_security_retirement"][0] == 0 + assert shares["social_security_disability"][0] == 1 + # age 62 -> retirement (>= threshold) + assert shares["social_security_retirement"][1] == 1 + assert shares["social_security_disability"][1] == 0 + # age 80 -> retirement + assert shares["social_security_retirement"][2] == 1 + + +class TestQrfSsShares: + """QRF-based SS sub-component share prediction.""" + + def test_returns_none_without_microimpute(self): + """Returns None when microimpute is not importable.""" + data, n, tp = _make_data( + orig_ss=np.array([10000]), + orig_ret=np.array([10000]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([15000]), + age=np.array([70]), + ) + new_recip = np.array([False]) + with patch.dict("sys.modules", {"microimpute": None}): + result = _qrf_ss_shares(data, n, tp, new_recip) + # No new recipients -> no call needed, but also shouldn't error + # Actually test with import blocked: + assert result is None or isinstance(result, dict) + + def test_returns_none_with_few_training_records(self): + """Returns None when < MIN_QRF_TRAINING_RECORDS have SS.""" + # Only 2 training records with SS > 0. + data, n, tp = _make_data( + orig_ss=np.array([10000, 5000, 0]), + orig_ret=np.array([10000, 0, 0]), + orig_dis=np.array([0, 5000, 0]), + orig_surv=np.array([0, 0, 0]), + orig_dep=np.array([0, 0, 0]), + imputed_ss=np.array([0, 0, 20000]), + age=np.array([70, 45, 55]), + ) + new_recip = np.array([False, False, True]) + result = _qrf_ss_shares(data, n, tp, new_recip) + assert result is None + + def test_shares_sum_to_one(self): + """QRF-predicted shares should sum to ~1 after normalisation.""" + pytest.importorskip("microimpute") + rng = np.random.default_rng(42) + n = 500 + + # Synthetic training: age > 62 mostly retirement, + # age < 62 mostly disability. + ages = rng.integers(20, 90, size=n).astype(np.float32) + is_male = rng.integers(0, 2, size=n).astype(np.float32) + ss_vals = rng.uniform(5000, 40000, size=n).astype(np.float32) + ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32) + dis = np.where(ages < 62, ss_vals, 0).astype(np.float32) + surv = np.zeros(n, dtype=np.float32) + dep = np.zeros(n, dtype=np.float32) + + # 20 new recipients (CPS had 0 SS). + n_new = 20 + new_ages = rng.integers(20, 90, size=n_new).astype(np.float32) + new_is_male = rng.integers(0, 2, size=n_new).astype(np.float32) + new_ss = rng.uniform(10000, 30000, size=n_new).astype(np.float32) + + all_ss = np.concatenate([ss_vals, np.zeros(n_new)]) + all_ret = np.concatenate([ret, np.zeros(n_new)]) + all_dis = np.concatenate([dis, np.zeros(n_new)]) + all_surv = np.concatenate([surv, np.zeros(n_new)]) + all_dep = np.concatenate([dep, np.zeros(n_new)]) + all_ages = np.concatenate([ages, new_ages]) + all_is_male = np.concatenate([is_male, new_is_male]) + n_total = n + n_new + + data, nn, tp = _make_data( + orig_ss=all_ss, + orig_ret=all_ret, + orig_dis=all_dis, + orig_surv=all_surv, + orig_dep=all_dep, + imputed_ss=np.concatenate([ss_vals, new_ss]), + age=all_ages, + is_male=all_is_male, + ) + + new_recip = np.concatenate( + [np.zeros(n, dtype=bool), np.ones(n_new, dtype=bool)] + ) + shares = _qrf_ss_shares(data, nn, tp, new_recip) + + assert shares is not None + total = sum(shares.get(sub, 0) for sub in SS_SUBS) + np.testing.assert_allclose(total, 1.0, atol=0.01) + + def test_elderly_predicted_as_retirement(self): + """QRF should mostly predict retirement for age >= 62.""" + pytest.importorskip("microimpute") + rng = np.random.default_rng(123) + n = 500 + + ages = rng.integers(20, 90, size=n).astype(np.float32) + ss_vals = rng.uniform(5000, 40000, size=n).astype(np.float32) + ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32) + dis = np.where(ages < 62, ss_vals, 0).astype(np.float32) + surv = np.zeros(n, dtype=np.float32) + dep = np.zeros(n, dtype=np.float32) + + # 10 new elderly recipients. + n_new = 10 + new_ages = np.full(n_new, 70, dtype=np.float32) + new_ss = np.full(n_new, 20000, dtype=np.float32) + + all_ss = np.concatenate([ss_vals, np.zeros(n_new)]) + all_ret = np.concatenate([ret, np.zeros(n_new)]) + all_dis = np.concatenate([dis, np.zeros(n_new)]) + all_surv = np.concatenate([surv, np.zeros(n_new)]) + all_dep = np.concatenate([dep, np.zeros(n_new)]) + all_ages = np.concatenate([ages, new_ages]) + + data, nn, tp = _make_data( + orig_ss=all_ss, + orig_ret=all_ret, + orig_dis=all_dis, + orig_surv=all_surv, + orig_dep=all_dep, + imputed_ss=np.concatenate([ss_vals, new_ss]), + age=all_ages, + ) + + new_recip = np.concatenate( + [np.zeros(n, dtype=bool), np.ones(n_new, dtype=bool)] + ) + shares = _qrf_ss_shares(data, nn, tp, new_recip) + + assert shares is not None + ret_share = shares["social_security_retirement"] + # All elderly -> retirement share should dominate. + assert np.mean(ret_share) > 0.7 From e7f5933f6afab52fefaf68a8a26c1cb9e2463a64 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 23 Feb 2026 16:10:40 -0500 Subject: [PATCH 04/17] Use QRF for all PUF SS records, not just new recipients CPS-PUF link is statistical (not identity-based), so the paired CPS record's sub-component split is just one noisy draw. A QRF trained on all CPS SS recipients gives a better expected prediction. Also removes unnecessary try/except ImportError guard for microimpute. Co-Authored-By: Claude Opus 4.6 --- .../calibration/puf_impute.py | 97 +++++----- policyengine_us_data/tests/test_puf_impute.py | 173 ++++++++---------- 2 files changed, 123 insertions(+), 147 deletions(-) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index a50e6dbb..d5c73785 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -179,29 +179,29 @@ def _qrf_ss_shares( data: Dict[str, Dict[int, np.ndarray]], n_cps: int, time_period: int, - new_recip: np.ndarray, + puf_has_ss: np.ndarray, ) -> Optional[Dict[str, np.ndarray]]: - """Predict SS sub-component shares for new recipients via QRF. + """Predict SS sub-component shares via QRF. Trains on CPS records that have SS > 0 (where the reason-code - split is known), then predicts shares for records where CPS had - zero SS but PUF imputed a positive value. + split is known), then predicts shares for all PUF records with + positive SS. The CPS-PUF link is statistical (not identity-based), + so the QRF gives a better expected prediction than using the + paired CPS record's split. Args: data: Dataset dict. n_cps: Records in CPS half. time_period: Tax year. - new_recip: Boolean mask (length n_cps) for new recipients. + puf_has_ss: Boolean mask (length n_cps) — True where the + PUF half has positive social_security. Returns: Dict mapping sub-component name to predicted share arrays - (length = new_recip.sum()), or None if QRF is unavailable - or training data is insufficient. + (length = puf_has_ss.sum()), or None if training data is + insufficient. """ - try: - from microimpute.models.qrf import QRF - except ImportError: - return None + from microimpute.models.qrf import QRF cps_ss = data["social_security"][time_period][:n_cps] has_ss = cps_ss > 0 @@ -218,7 +218,7 @@ def _qrf_ss_shares( continue vals = data[pred][time_period][:n_cps] train_cols[pred] = vals[has_ss].astype(np.float32) - test_cols[pred] = vals[new_recip].astype(np.float32) + test_cols[pred] = vals[puf_has_ss].astype(np.float32) predictors.append(pred) if not predictors: @@ -275,8 +275,8 @@ def _qrf_ss_shares( gc.collect() logger.info( - "QRF SS split: predicted shares for %d new recipients", - new_recip.sum(), + "QRF SS split: predicted shares for %d PUF records", + puf_has_ss.sum(), ) return shares @@ -285,19 +285,19 @@ def _age_heuristic_ss_shares( data: Dict[str, Dict[int, np.ndarray]], n_cps: int, time_period: int, - new_recip: np.ndarray, + puf_has_ss: np.ndarray, ) -> Dict[str, np.ndarray]: - """Fallback: assign new recipients by age threshold. + """Fallback: assign SS type by age threshold. Age >= 62 -> retirement, < 62 -> disability. If age is unavailable, all go to retirement. """ - n_new = new_recip.sum() - shares = {sub: np.zeros(n_new) for sub in SS_SUBCOMPONENTS} + n_pred = puf_has_ss.sum() + shares = {sub: np.zeros(n_pred) for sub in SS_SUBCOMPONENTS} age = None if "age" in data: - age = data["age"][time_period][:n_cps][new_recip] + age = data["age"][time_period][:n_cps][puf_has_ss] if age is not None: is_old = age >= MINIMUM_RETIREMENT_AGE @@ -307,7 +307,7 @@ def _age_heuristic_ss_shares( shares["social_security_disability"] = (~is_old).astype(np.float64) else: if "social_security_retirement" in shares: - shares["social_security_retirement"] = np.ones(n_new) + shares["social_security_retirement"] = np.ones(n_pred) return shares @@ -317,17 +317,17 @@ def reconcile_ss_subcomponents( n_cps: int, time_period: int, ) -> None: - """Rescale SS sub-components so they sum to social_security. + """Predict SS sub-components for PUF half from demographics. - After PUF imputation replaces social_security on the PUF half, - the sub-components (retirement, disability, survivors, dependents) - still hold the original CPS values. This function rescales them - proportionally so they match the new total. + The CPS-PUF link is statistical (not identity-based), so the + paired CPS record's sub-component split is just one noisy draw. + A QRF trained on all CPS SS recipients gives a better expected + prediction by pooling across the full training set. - For new recipients (CPS had zero SS, PUF imputed positive), a - QRF is trained on CPS records to predict the sub-component split - from demographics. Falls back to an age heuristic (>= 62 -> - retirement, < 62 -> disability) when QRF is unavailable. + For all PUF records with positive social_security, this function + predicts shares via QRF (falling back to an age heuristic) and + scales them to match the imputed total. PUF records with zero + SS get all sub-components cleared to zero. Modifies ``data`` in place. Only the PUF half (indices n_cps .. 2*n_cps) is changed. @@ -340,40 +340,27 @@ def reconcile_ss_subcomponents( if "social_security" not in data: return - ss = data["social_security"][time_period] - cps_ss = ss[:n_cps] - puf_ss = ss[n_cps:] + puf_ss = data["social_security"][time_period][n_cps:] + puf_has_ss = puf_ss > 0 - new_recip = (cps_ss == 0) & (puf_ss > 0) - - # Predict shares for new recipients (QRF with age fallback). - new_shares = None - if new_recip.any(): - new_shares = _qrf_ss_shares(data, n_cps, time_period, new_recip) - if new_shares is None: - new_shares = _age_heuristic_ss_shares( - data, n_cps, time_period, new_recip + # Predict shares for all PUF records with SS > 0. + shares = None + if puf_has_ss.any(): + shares = _qrf_ss_shares(data, n_cps, time_period, puf_has_ss) + if shares is None: + shares = _age_heuristic_ss_shares( + data, n_cps, time_period, puf_has_ss ) for sub in SS_SUBCOMPONENTS: if sub not in data: continue arr = data[sub][time_period] - cps_sub = arr[:n_cps] - - # Existing recipients: proportional rescaling. - with np.errstate(divide="ignore", invalid="ignore"): - ratio = np.where(cps_ss > 0, cps_sub / cps_ss, 0.0) - new_puf = ratio * puf_ss - - # New recipients: apply predicted shares. - if new_recip.any() and new_shares is not None: - share = new_shares.get(sub, np.zeros(new_recip.sum())) - puf_new_vals = puf_ss[new_recip] * share - new_puf[new_recip] = puf_new_vals - # PUF imputed zero -> clear sub-component. - new_puf = np.where(puf_ss == 0, 0.0, new_puf) + new_puf = np.zeros(n_cps) + if puf_has_ss.any() and shares is not None: + share = shares.get(sub, np.zeros(puf_has_ss.sum())) + new_puf[puf_has_ss] = puf_ss[puf_has_ss] * share arr[n_cps:] = new_puf.astype(arr.dtype) data[sub][time_period] = arr diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py index 030f2754..fcdcf763 100644 --- a/policyengine_us_data/tests/test_puf_impute.py +++ b/policyengine_us_data/tests/test_puf_impute.py @@ -1,12 +1,11 @@ """Tests for PUF imputation, specifically SS sub-component reconciliation. When PUF imputation replaces social_security values, the sub-components -(retirement, disability, survivors, dependents) must be rescaled to match -the new total. See: https://github.com/PolicyEngine/policyengine-us-data/issues/551 +(retirement, disability, survivors, dependents) must be predicted from +demographics (not copied from the statistically-linked CPS record). +See: https://github.com/PolicyEngine/policyengine-us-data/issues/551 """ -from unittest.mock import patch - import numpy as np import pytest @@ -67,8 +66,8 @@ def _make_data( class TestReconcileSsSubcomponents: """Sub-components must sum to social_security after reconciliation.""" - def test_proportional_rescaling(self): - """When CPS has a split, PUF subs scale proportionally.""" + def test_subs_sum_to_total(self): + """PUF subs must sum to imputed SS total.""" data, n, tp = _make_data( orig_ss=np.array([20000, 15000]), orig_ret=np.array([14000, 0]), @@ -76,6 +75,7 @@ def test_proportional_rescaling(self): orig_surv=np.array([0, 0]), orig_dep=np.array([0, 0]), imputed_ss=np.array([30000, 10000]), + age=np.array([70, 45]), ) reconcile_ss_subcomponents(data, n, tp) @@ -84,11 +84,28 @@ def test_proportional_rescaling(self): total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) np.testing.assert_allclose(total_subs, ss, rtol=1e-5) - # Person 0: ret=14/20=70%, dis=6/20=30% + def test_age_determines_split(self): + """Age heuristic: >= 62 -> retirement, < 62 -> disability.""" + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=np.array([14000, 0]), + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + age=np.array([70, 45]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) ret = data["social_security_retirement"][tp][puf] - np.testing.assert_allclose(ret[0], 30000 * 0.7, rtol=1e-5) dis = data["social_security_disability"][tp][puf] - np.testing.assert_allclose(dis[0], 30000 * 0.3, rtol=1e-5) + # Person 0 is 70 -> retirement + assert ret[0] == pytest.approx(30000) + assert dis[0] == pytest.approx(0) + # Person 1 is 45 -> disability + assert ret[1] == pytest.approx(0) + assert dis[1] == pytest.approx(10000) def test_imputed_zero_clears_subs(self): """When PUF imputes zero SS, all subs should be zero.""" @@ -144,13 +161,13 @@ def test_no_social_security_is_noop(self): data = {"some_var": {2024: np.array([1, 2])}} reconcile_ss_subcomponents(data, 1, 2024) - def test_subs_sum_to_total_all_cases(self): - """Comprehensive: subs must sum to total across all cases.""" + def test_subs_sum_to_total_mixed_cases(self): + """Comprehensive: subs sum to total across mixed cases.""" data, n, tp = _make_data( - # Person 0: existing recipient (rescale) - # Person 1: new recipient, old (retirement) - # Person 2: new recipient, young (disability) - # Person 3: PUF zeroed out (clear) + # Person 0: has CPS SS, has PUF SS (old) + # Person 1: no CPS SS, has PUF SS (old) + # Person 2: no CPS SS, has PUF SS (young) + # Person 3: has CPS SS, PUF zeroed out orig_ss=np.array([20000, 0, 0, 15000]), orig_ret=np.array([20000, 0, 0, 15000]), orig_dis=np.array([0, 0, 0, 0]), @@ -168,7 +185,7 @@ def test_subs_sum_to_total_all_cases(self): class TestAgeHeuristicSsShares: - """Age-based fallback for new SS recipient classification.""" + """Age-based fallback for SS type classification.""" def test_elderly_gets_retirement(self): """Age >= 62 -> retirement.""" @@ -181,8 +198,8 @@ def test_elderly_gets_retirement(self): imputed_ss=np.array([25000]), age=np.array([70]), ) - new_recip = np.array([True]) - shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) assert shares["social_security_retirement"][0] == pytest.approx(1.0) assert shares["social_security_disability"][0] == pytest.approx(0.0) @@ -198,8 +215,8 @@ def test_young_gets_disability(self): imputed_ss=np.array([18000]), age=np.array([45]), ) - new_recip = np.array([True]) - shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) assert shares["social_security_retirement"][0] == pytest.approx(0.0) assert shares["social_security_disability"][0] == pytest.approx(1.0) @@ -214,8 +231,8 @@ def test_no_age_defaults_to_retirement(self): orig_dep=np.array([0]), imputed_ss=np.array([25000]), ) - new_recip = np.array([True]) - shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) assert shares["social_security_retirement"][0] == pytest.approx(1.0) @@ -230,8 +247,8 @@ def test_mixed_ages(self): imputed_ss=np.array([1, 1, 1]), age=np.array([30, 62, 80]), ) - new_recip = np.array([True, True, True]) - shares = _age_heuristic_ss_shares(data, n, tp, new_recip) + puf_has_ss = np.array([True, True, True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) # age 30 -> disability assert shares["social_security_retirement"][0] == 0 @@ -246,24 +263,6 @@ def test_mixed_ages(self): class TestQrfSsShares: """QRF-based SS sub-component share prediction.""" - def test_returns_none_without_microimpute(self): - """Returns None when microimpute is not importable.""" - data, n, tp = _make_data( - orig_ss=np.array([10000]), - orig_ret=np.array([10000]), - orig_dis=np.array([0]), - orig_surv=np.array([0]), - orig_dep=np.array([0]), - imputed_ss=np.array([15000]), - age=np.array([70]), - ) - new_recip = np.array([False]) - with patch.dict("sys.modules", {"microimpute": None}): - result = _qrf_ss_shares(data, n, tp, new_recip) - # No new recipients -> no call needed, but also shouldn't error - # Actually test with import blocked: - assert result is None or isinstance(result, dict) - def test_returns_none_with_few_training_records(self): """Returns None when < MIN_QRF_TRAINING_RECORDS have SS.""" # Only 2 training records with SS > 0. @@ -276,8 +275,8 @@ def test_returns_none_with_few_training_records(self): imputed_ss=np.array([0, 0, 20000]), age=np.array([70, 45, 55]), ) - new_recip = np.array([False, False, True]) - result = _qrf_ss_shares(data, n, tp, new_recip) + puf_has_ss = np.array([False, False, True]) + result = _qrf_ss_shares(data, n, tp, puf_has_ss) assert result is None def test_shares_sum_to_one(self): @@ -296,36 +295,22 @@ def test_shares_sum_to_one(self): surv = np.zeros(n, dtype=np.float32) dep = np.zeros(n, dtype=np.float32) - # 20 new recipients (CPS had 0 SS). - n_new = 20 - new_ages = rng.integers(20, 90, size=n_new).astype(np.float32) - new_is_male = rng.integers(0, 2, size=n_new).astype(np.float32) - new_ss = rng.uniform(10000, 30000, size=n_new).astype(np.float32) - - all_ss = np.concatenate([ss_vals, np.zeros(n_new)]) - all_ret = np.concatenate([ret, np.zeros(n_new)]) - all_dis = np.concatenate([dis, np.zeros(n_new)]) - all_surv = np.concatenate([surv, np.zeros(n_new)]) - all_dep = np.concatenate([dep, np.zeros(n_new)]) - all_ages = np.concatenate([ages, new_ages]) - all_is_male = np.concatenate([is_male, new_is_male]) - n_total = n + n_new + # PUF SS for all records. + puf_ss = rng.uniform(5000, 40000, size=n).astype(np.float32) data, nn, tp = _make_data( - orig_ss=all_ss, - orig_ret=all_ret, - orig_dis=all_dis, - orig_surv=all_surv, - orig_dep=all_dep, - imputed_ss=np.concatenate([ss_vals, new_ss]), - age=all_ages, - is_male=all_is_male, + orig_ss=ss_vals, + orig_ret=ret, + orig_dis=dis, + orig_surv=surv, + orig_dep=dep, + imputed_ss=puf_ss, + age=ages, + is_male=is_male, ) - new_recip = np.concatenate( - [np.zeros(n, dtype=bool), np.ones(n_new, dtype=bool)] - ) - shares = _qrf_ss_shares(data, nn, tp, new_recip) + puf_has_ss = puf_ss > 0 + shares = _qrf_ss_shares(data, nn, tp, puf_has_ss) assert shares is not None total = sum(shares.get(sub, 0) for sub in SS_SUBS) @@ -335,26 +320,32 @@ def test_elderly_predicted_as_retirement(self): """QRF should mostly predict retirement for age >= 62.""" pytest.importorskip("microimpute") rng = np.random.default_rng(123) - n = 500 + n_train = 500 - ages = rng.integers(20, 90, size=n).astype(np.float32) - ss_vals = rng.uniform(5000, 40000, size=n).astype(np.float32) + ages = rng.integers(20, 90, size=n_train).astype(np.float32) + ss_vals = rng.uniform(5000, 40000, size=n_train).astype(np.float32) ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32) dis = np.where(ages < 62, ss_vals, 0).astype(np.float32) - surv = np.zeros(n, dtype=np.float32) - dep = np.zeros(n, dtype=np.float32) - - # 10 new elderly recipients. - n_new = 10 - new_ages = np.full(n_new, 70, dtype=np.float32) - new_ss = np.full(n_new, 20000, dtype=np.float32) - - all_ss = np.concatenate([ss_vals, np.zeros(n_new)]) - all_ret = np.concatenate([ret, np.zeros(n_new)]) - all_dis = np.concatenate([dis, np.zeros(n_new)]) - all_surv = np.concatenate([surv, np.zeros(n_new)]) - all_dep = np.concatenate([dep, np.zeros(n_new)]) - all_ages = np.concatenate([ages, new_ages]) + surv = np.zeros(n_train, dtype=np.float32) + dep = np.zeros(n_train, dtype=np.float32) + + # Only elderly records get PUF SS > 0; training records + # get PUF SS = 0 so puf_has_ss selects just the elderly. + n_pred = 10 + pred_ages = np.full(n_pred, 70, dtype=np.float32) + + all_ages = np.concatenate([ages, pred_ages]) + all_ss = np.concatenate([ss_vals, np.zeros(n_pred)]) + all_ret = np.concatenate([ret, np.zeros(n_pred)]) + all_dis = np.concatenate([dis, np.zeros(n_pred)]) + all_surv = np.concatenate([surv, np.zeros(n_pred)]) + all_dep = np.concatenate([dep, np.zeros(n_pred)]) + puf_ss = np.concatenate( + [ + np.zeros(n_train, dtype=np.float32), + np.full(n_pred, 20000, dtype=np.float32), + ] + ) data, nn, tp = _make_data( orig_ss=all_ss, @@ -362,14 +353,12 @@ def test_elderly_predicted_as_retirement(self): orig_dis=all_dis, orig_surv=all_surv, orig_dep=all_dep, - imputed_ss=np.concatenate([ss_vals, new_ss]), + imputed_ss=puf_ss, age=all_ages, ) - new_recip = np.concatenate( - [np.zeros(n, dtype=bool), np.ones(n_new, dtype=bool)] - ) - shares = _qrf_ss_shares(data, nn, tp, new_recip) + puf_has_ss = data["social_security"][tp][nn:] > 0 + shares = _qrf_ss_shares(data, nn, tp, puf_has_ss) assert shares is not None ret_share = shares["social_security_retirement"] From 88f2aa4c60435477490a8987b64d45f9c8146a0b Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 24 Feb 2026 21:18:44 -0500 Subject: [PATCH 05/17] Add towncrier changelog fragment Co-Authored-By: Claude Opus 4.6 --- changelog.d/fix-ss-subcomponent-reconciliation.fixed.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/fix-ss-subcomponent-reconciliation.fixed.md diff --git a/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md new file mode 100644 index 00000000..2bfa8149 --- /dev/null +++ b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md @@ -0,0 +1 @@ +Reconcile SS sub-components after PUF imputation so they sum to social_security. From 92b5d04cc207f1141334f5008eed376d5701e6f7 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 25 Feb 2026 09:40:24 -0500 Subject: [PATCH 06/17] Bump policyengine-us to 1.584.0 Fixes CI failure: DE TANF deficit_rate parameter started at 2024-10-01 in 1.570.7, causing ParameterNotFoundError for Jan 2024 simulations. Fixed in newer releases (start date corrected to 2011-10-01). Co-Authored-By: Claude Opus 4.6 --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index fc61e96d..09a004ac 100644 --- a/uv.lock +++ b/uv.lock @@ -1842,7 +1842,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.570.7" +version = "1.584.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1850,9 +1850,9 @@ dependencies = [ { name = "policyengine-core" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6a/eb/291b3085aa0fa97fcce4987d54991118f21aead49647b3f475998459f46b/policyengine_us-1.570.7.tar.gz", hash = "sha256:a2967af86a61468a0bdb6b2dc7af2fd0bb0f0064203fa557b6fee8023058360a", size = 8668680, upload-time = "2026-02-19T07:17:11.264Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/29/a4517eac6dedc1d6c1a17b0d105b9aa41ca41b877e6ecef39362ecd0438c/policyengine_us-1.584.0.tar.gz", hash = "sha256:9bb830765ece943b11939822da94d64a26c802c941b8f929a354354394a2ef7e", size = 8701324, upload-time = "2026-02-24T15:05:04.474Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/36/0213955310076e4dec2781baeabf96b6d6937f99cd19c373b363bfbd7152/policyengine_us-1.570.7-py3-none-any.whl", hash = "sha256:374fd5357d6cb3734b900bd08dfdb61760dfc5b913ed686f57a40239565b0edd", size = 7825404, upload-time = "2026-02-19T07:17:08.877Z" }, + { url = "https://files.pythonhosted.org/packages/17/f4/b8224384de676662858a079bc3138141a072ad886266e1e2d28a3108ad9d/policyengine_us-1.584.0-py3-none-any.whl", hash = "sha256:55c97c9e63c8ba18b895b74076d893de720cd44089398a62f0d02cc7afc8d53c", size = 7933593, upload-time = "2026-02-24T15:05:01.846Z" }, ] [[package]] From 0c8c4b52da66e6b56369a5eeafaab326b00b1c5a Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Wed, 25 Feb 2026 17:11:51 -0500 Subject: [PATCH 07/17] Fix and add retirement contribution calibration targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Correct traditional_ira_contributions: $25B → $13.2B (SOI 1304 Table 1.4 actual deduction, not total contributions) - Add traditional_401k_contributions: $567.9B (BEA/FRED employee DC contributions) - Add self_employed_pension_contribution_ald: $29.5B (SOI 1304 Table 1.4 Keogh plan deduction) - Remove roth_ira_contributions: structurally $0 due to CPS allocation bug (#553) - Update both loss.py and etl_national_targets.py Closes #553 Co-Authored-By: Claude Opus 4.6 --- changelog.d/changed/553.md | 1 + .../db/etl_national_targets.py | 25 ++++++++---- policyengine_us_data/utils/loss.py | 40 +++++++++++++++---- 3 files changed, 50 insertions(+), 16 deletions(-) create mode 100644 changelog.d/changed/553.md diff --git a/changelog.d/changed/553.md b/changelog.d/changed/553.md new file mode 100644 index 00000000..6ab5e251 --- /dev/null +++ b/changelog.d/changed/553.md @@ -0,0 +1 @@ +Fix and add retirement contribution calibration targets: correct traditional IRA from $25B to $13.2B (SOI 1304), add traditional 401(k) at $567.9B (BEA/FRED), add self-employed pension ALD at $29.5B (SOI 1304), remove ineffective Roth IRA target. diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index dc26cca8..c8e30b55 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -237,21 +237,30 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", "year": HARDCODED_YEAR, }, - # IRA contribution totals from IRS SOI accumulation tables + # Retirement contribution targets — see issue #553 { "variable": "traditional_ira_contributions", - "value": 25e9, - "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", - "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", + "value": 13.2e9, + "source": "https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income", + "notes": "SOI 1304 Table 1.4 (TY 2022) 'IRA payments' deduction, col 124", "year": HARDCODED_YEAR, }, { - "variable": "roth_ira_contributions", - "value": 39e9, - "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", - "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", + "variable": "traditional_401k_contributions", + "value": 567.9e9, + "source": "https://fred.stlouisfed.org/series/Y351RC1A027NBEA", + "notes": "BEA/FRED total DC ($815.4B) minus employer-only ($247.5B, W351RC0A144NBEA)", "year": HARDCODED_YEAR, }, + { + "variable": "self_employed_pension_contribution_ald", + "value": 29.5e9, + "source": "https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income", + "notes": "SOI 1304 Table 1.4 (TY 2022) 'Payments to a Keogh plan', col 116", + "year": HARDCODED_YEAR, + }, + # roth_ira_contributions removed — structurally $0 due to + # CPS allocation bug (see #553). ] # Conditional count targets - these need strata with constraints diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 134e919b..45453ce2 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -53,14 +53,38 @@ "social_security_disability": 148e9, # ~10.2% (disabled workers) "social_security_survivors": 160e9, # ~11.0% (widows, children of deceased) "social_security_dependents": 84e9, # ~5.8% (spouses/children of retired+disabled) - # IRA contribution totals from IRS SOI IRA accumulation tables. - # Tax year 2022: ~5M taxpayers x $4,510 avg = ~$22.5B traditional; - # ~10M taxpayers x $3,482 avg = ~$34.8B Roth. - # Uprated ~12% to 2024 for limit increases ($6k->$7k) and - # wage growth. - # https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements - "traditional_ira_contributions": 25e9, - "roth_ira_contributions": 39e9, + # Retirement contribution calibration targets. + # + # traditional_ira_contributions: IRS SOI Publication 1304, Table 1.4 + # (TY 2022), "IRA payments" deduction — $13.17B (col 124, row + # "All returns, total"). This is the actual above-the-line + # deduction claimed on returns. The variable flows directly into + # the ALD with no deductibility logic in policyengine-us, so the + # target must match the deduction, not total contributions. + # https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income + "traditional_ira_contributions": 13.2e9, + # traditional_401k_contributions: BEA/FRED National Income + # Accounts. Total DC employer+employee = $815.4B (FRED series + # Y351RC1A027NBEA), employer-only = $247.5B (FRED series + # W351RC0A144NBEA), employee elective deferrals = $567.9B + # (derived). Covers 401(k), 403(b), 457, and TSP. The variable + # flows directly into pre_tax_contributions (subtracted from + # wages) with no further logic. + # https://fred.stlouisfed.org/series/Y351RC1A027NBEA + # https://fred.stlouisfed.org/series/W351RC0A144NBEA + "traditional_401k_contributions": 567.9e9, + # self_employed_pension_contribution_ald: IRS SOI Publication + # 1304, Table 1.4 (TY 2022), "Payments to a Keogh plan" — + # $29.48B (col 116, row "All returns, total"). Includes + # SEP-IRAs, SIMPLE-IRAs, and traditional Keogh/HR-10 plans. + # Targeting the ALD (not the input) because policyengine-us + # applies a min(contributions, SE_income) cap. + # https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income + "self_employed_pension_contribution_ald": 29.5e9, + # roth_ira_contributions removed: the CPS allocation logic + # (cps.py:713-728) allocates traditional IRA first up to the + # full IRA limit, leaving roth_ira_contributions structurally + # $0 for all records. The target was ineffective. See #553. } From ec563b48fc20824a6c120130a5234f2d1a638e0f Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Wed, 25 Feb 2026 17:53:59 -0500 Subject: [PATCH 08/17] Fix RETCB_VAL allocation to use proportional split Replace sequential waterfall with proportional allocation based on administrative data. The old waterfall gave 401(k) first priority, consuming all of RETCB_VAL and leaving IRA contributions at $0 for every record. The Roth IRA allocation was also mathematically guaranteed to produce $0. The new approach splits RETCB_VAL proportionally: - DC vs IRA: 90.8% / 9.2% (BEA/FRED vs IRS SOI) - Within DC: 85% traditional / 15% Roth (Vanguard/PSCA) - Within IRA: 39.2% traditional / 60.8% Roth (IRS SOI Tables 5 & 6) All fractions are stored in imputation_parameters.yaml with sources. Contribution limits are still enforced. Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/datasets/cps/cps.py | 101 ++++++++---------- .../datasets/cps/imputation_parameters.yaml | 31 +++++- 2 files changed, 76 insertions(+), 56 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 76e55d4a..8db414ef 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -617,19 +617,21 @@ def add_personal_income_variables( # They could also include General Assistance. cps["tanf_reported"] = person.PAW_VAL cps["ssi_reported"] = person.SSI_VAL - # Assume all retirement contributions are traditional 401(k) for now. - # Procedure for allocating retirement contributions: - # 1) If they report any self-employment income, allocate entirely to - # self-employed pension contributions. - # 2) If they report any wage and salary income, allocate in this order: - # a) Traditional 401(k) contributions up to to limit - # b) Roth 401(k) contributions up to the limit - # c) IRA contributions up to the limit, split according to administrative fractions - # d) Other retirement contributions - # Disregard reported pension contributions from people who report neither wage and salary - # nor self-employment income. - # Assume no 403(b) or 457 contributions for now. - # IRS retirement contribution limits by year. + # Allocate CPS RETCB_VAL (a single bundled retirement contribution + # total) into account-type-specific variables using a proportional + # split based on administrative data. + # + # RETCB_VAL does not distinguish account type (Census only asks + # "how much did you contribute to retirement accounts?"). The old + # sequential waterfall gave 401(k) first priority, which consumed + # nearly all of RETCB_VAL and left IRA contributions at $0. + # + # The proportional approach uses BEA/FRED and IRS SOI shares to + # split contributions into DC (401k) and IRA pools, then splits + # each pool into traditional/Roth using administrative fractions. + # See imputation_parameters.yaml for sources. + # + # Contribution limits by year. RETIREMENT_LIMITS = { 2020: { "401k": 19_500, @@ -668,7 +670,6 @@ def add_personal_income_variables( "ira_catch_up": 1_000, }, } - # Clamp to the nearest available year for out-of-range values. clamped_year = max( min(year, max(RETIREMENT_LIMITS)), min(RETIREMENT_LIMITS), @@ -679,53 +680,43 @@ def add_personal_income_variables( LIMIT_IRA = limits["ira"] LIMIT_IRA_CATCH_UP = limits["ira_catch_up"] CATCH_UP_AGE = 50 - retirement_contributions = person.RETCB_VAL - cps["self_employed_pension_contributions"] = np.where( - person.SEMP_VAL > 0, retirement_contributions, 0 - ) - remaining_retirement_contributions = np.maximum( - retirement_contributions - cps["self_employed_pension_contributions"], - 0, - ) - # Compute the 401(k) limit for the person's age. catch_up_eligible = person.A_AGE >= CATCH_UP_AGE limit_401k = LIMIT_401K + catch_up_eligible * LIMIT_401K_CATCH_UP limit_ira = LIMIT_IRA + catch_up_eligible * LIMIT_IRA_CATCH_UP - cps["traditional_401k_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, limit_401k), - 0, - ) - remaining_retirement_contributions = np.maximum( - remaining_retirement_contributions - - cps["traditional_401k_contributions"], - 0, - ) - cps["roth_401k_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, limit_401k), - 0, - ) - remaining_retirement_contributions = np.maximum( - remaining_retirement_contributions - cps["roth_401k_contributions"], - 0, - ) - cps["traditional_ira_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, limit_ira), - 0, - ) - remaining_retirement_contributions = np.maximum( - remaining_retirement_contributions - - cps["traditional_ira_contributions"], - 0, + + retirement_contributions = person.RETCB_VAL + has_wages = person.WSAL_VAL > 0 + has_se = person.SEMP_VAL > 0 + + # 1) Self-employed: allocate entirely to SE pension. + cps["self_employed_pension_contributions"] = np.where( + has_se, retirement_contributions, 0 ) - roth_ira_limit = limit_ira - cps["traditional_ira_contributions"] - cps["roth_ira_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, roth_ira_limit), + remaining = np.maximum( + retirement_contributions - cps["self_employed_pension_contributions"], 0, ) + + # 2) Wage earners: split proportionally into DC and IRA pools. + dc_share = p["dc_share_of_retirement_contributions"] + ira_share = p["ira_share_of_retirement_contributions"] + roth_dc_share = p["roth_share_of_dc_contributions"] + trad_ira_share = p["traditional_share_of_ira_contributions"] + + dc_pool = np.where(has_wages, remaining * dc_share, 0) + ira_pool = np.where(has_wages, remaining * ira_share, 0) + + # DC pool: split into traditional/Roth 401(k), cap at combined + # 401(k) limit. + dc_capped = np.minimum(dc_pool, limit_401k) + cps["traditional_401k_contributions"] = dc_capped * (1 - roth_dc_share) + cps["roth_401k_contributions"] = dc_capped * roth_dc_share + + # IRA pool: split into traditional/Roth IRA, cap at combined + # IRA limit. + ira_capped = np.minimum(ira_pool, limit_ira) + cps["traditional_ira_contributions"] = ira_capped * trad_ira_share + cps["roth_ira_contributions"] = ira_capped * (1 - trad_ira_share) # Allocate capital gains into long-term and short-term based on aggregate split. cps["long_term_capital_gains"] = person.CAP_VAL * ( p["long_term_capgain_fraction"] diff --git a/policyengine_us_data/datasets/cps/imputation_parameters.yaml b/policyengine_us_data/datasets/cps/imputation_parameters.yaml index 132b52af..7851661a 100644 --- a/policyengine_us_data/datasets/cps/imputation_parameters.yaml +++ b/policyengine_us_data/datasets/cps/imputation_parameters.yaml @@ -15,4 +15,33 @@ taxable_ira_distribution_fraction: 1.0 taxable_sep_distribution_fraction: 1.0 # SOI 2012 data -long_term_capgain_fraction: 0.880 +long_term_capgain_fraction: 0.880 + +# Retirement contribution allocation fractions. +# Used to split CPS RETCB_VAL (a single bundled total) into +# account-type-specific variables. +# +# DC vs IRA share of non-SE retirement contributions. +# Employee DC: $567.9B (BEA/FRED Y351RC1A027NBEA minus W351RC0A144NBEA) +# Total IRA: $57.5B (IRS SOI Tables 5 & 6, TY 2022) +# Combined: $625.4B +# https://fred.stlouisfed.org/series/Y351RC1A027NBEA +# https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements +dc_share_of_retirement_contributions: 0.908 +ira_share_of_retirement_contributions: 0.092 + +# Roth share of DC (401k/403b) contributions. +# Vanguard How America Saves 2024: 18% of participants elect Roth. +# PSCA 67th Annual Survey: 21% of participants make Roth contributions. +# Estimated dollar share ~15% (Roth participants contribute slightly +# less on average). +# https://corporate.vanguard.com/content/dam/corp/research/pdf/how_america_saves_report_2024.pdf +# https://www.psca.org/news/psca-news/2024/12/401k-savings-and-participation-rates-rise/ +roth_share_of_dc_contributions: 0.15 + +# Traditional vs Roth share of IRA contributions. +# IRS SOI IRA Accumulation Tables 5 & 6 (TY 2022): +# Traditional: $22.5B (4.99M contributors), Roth: $35.0B (10.04M) +# Traditional share: 22.5 / 57.5 = 39.2% +# https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements +traditional_share_of_ira_contributions: 0.392 From 7d731d4287684f0d4fc8ef22a9e8758a4f78dded Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Wed, 25 Feb 2026 18:25:08 -0500 Subject: [PATCH 09/17] Add Roth calibration targets and fix traditional 401(k) target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Split 401(k) target: $567.9B total employee DC deferrals (BEA/FRED) into traditional $482.7B (85%) and Roth $85.2B (15%) using Vanguard How America Saves 2024 dollar share estimate - Add roth_ira_contributions target: $35.0B from IRS SOI Accumulation Tables 5 & 6 (TY 2022) — direct administrative source - Update etl_national_targets.py in parallel Co-Authored-By: Claude Opus 4.6 --- changelog.d/changed/553.md | 2 +- .../db/etl_national_targets.py | 20 ++++++++++--- policyengine_us_data/utils/loss.py | 29 +++++++++++-------- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/changelog.d/changed/553.md b/changelog.d/changed/553.md index 6ab5e251..6c370943 100644 --- a/changelog.d/changed/553.md +++ b/changelog.d/changed/553.md @@ -1 +1 @@ -Fix and add retirement contribution calibration targets: correct traditional IRA from $25B to $13.2B (SOI 1304), add traditional 401(k) at $567.9B (BEA/FRED), add self-employed pension ALD at $29.5B (SOI 1304), remove ineffective Roth IRA target. +Fix and add retirement contribution calibration targets: correct traditional IRA from $25B to $13.2B (SOI 1304), split 401(k) into traditional $482.7B and Roth $85.2B (BEA/FRED × Vanguard share), add Roth IRA at $35.0B (SOI Tables 5 & 6), add self-employed pension ALD at $29.5B (SOI 1304). diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index c8e30b55..2b78b6d6 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -247,9 +247,16 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): }, { "variable": "traditional_401k_contributions", - "value": 567.9e9, + "value": 482.7e9, "source": "https://fred.stlouisfed.org/series/Y351RC1A027NBEA", - "notes": "BEA/FRED total DC ($815.4B) minus employer-only ($247.5B, W351RC0A144NBEA)", + "notes": "BEA/FRED employee DC deferrals ($567.9B) x 85% traditional share (Vanguard HAS 2024)", + "year": HARDCODED_YEAR, + }, + { + "variable": "roth_401k_contributions", + "value": 85.2e9, + "source": "https://fred.stlouisfed.org/series/Y351RC1A027NBEA", + "notes": "BEA/FRED employee DC deferrals ($567.9B) x 15% Roth share (Vanguard HAS 2024)", "year": HARDCODED_YEAR, }, { @@ -259,8 +266,13 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "notes": "SOI 1304 Table 1.4 (TY 2022) 'Payments to a Keogh plan', col 116", "year": HARDCODED_YEAR, }, - # roth_ira_contributions removed — structurally $0 due to - # CPS allocation bug (see #553). + { + "variable": "roth_ira_contributions", + "value": 35.0e9, + "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", + "notes": "IRS SOI IRA Accumulation Tables 5 & 6 (TY 2022), 10.04M contributors", + "year": HARDCODED_YEAR, + }, ] # Conditional count targets - these need strata with constraints diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 45453ce2..d7410d2e 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -63,16 +63,20 @@ # target must match the deduction, not total contributions. # https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income "traditional_ira_contributions": 13.2e9, - # traditional_401k_contributions: BEA/FRED National Income - # Accounts. Total DC employer+employee = $815.4B (FRED series - # Y351RC1A027NBEA), employer-only = $247.5B (FRED series - # W351RC0A144NBEA), employee elective deferrals = $567.9B - # (derived). Covers 401(k), 403(b), 457, and TSP. The variable - # flows directly into pre_tax_contributions (subtracted from - # wages) with no further logic. + # traditional_401k_contributions & roth_401k_contributions: + # BEA/FRED National Income Accounts. Total DC employer+employee + # = $815.4B (Y351RC1A027NBEA), employer-only = $247.5B + # (W351RC0A144NBEA), employee elective deferrals = $567.9B. + # Split into traditional/Roth using estimated 15% Roth dollar + # share (Vanguard How America Saves 2024: 18% participation, + # ~15% dollar share; PSCA 67th Annual Survey: 21% participation). + # Traditional: $567.9B × 85% = $482.7B + # Roth: $567.9B × 15% = $85.2B # https://fred.stlouisfed.org/series/Y351RC1A027NBEA # https://fred.stlouisfed.org/series/W351RC0A144NBEA - "traditional_401k_contributions": 567.9e9, + # https://corporate.vanguard.com/content/dam/corp/research/pdf/how_america_saves_report_2024.pdf + "traditional_401k_contributions": 482.7e9, + "roth_401k_contributions": 85.2e9, # self_employed_pension_contribution_ald: IRS SOI Publication # 1304, Table 1.4 (TY 2022), "Payments to a Keogh plan" — # $29.48B (col 116, row "All returns, total"). Includes @@ -81,10 +85,11 @@ # applies a min(contributions, SE_income) cap. # https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income "self_employed_pension_contribution_ald": 29.5e9, - # roth_ira_contributions removed: the CPS allocation logic - # (cps.py:713-728) allocates traditional IRA first up to the - # full IRA limit, leaving roth_ira_contributions structurally - # $0 for all records. The target was ineffective. See #553. + # roth_ira_contributions: IRS SOI IRA Accumulation Tables 5 & 6 + # (TY 2022). Total Roth IRA contributions = $35.0B (10.04M + # contributors). Direct administrative source. + # https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements + "roth_ira_contributions": 35.0e9, } From d59810904e1a999d5f5af07997e24e1206de828b Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Wed, 25 Feb 2026 19:17:47 -0500 Subject: [PATCH 10/17] Upgrade policyengine-us to 1.587.0 in uv.lock Fixes CI failure caused by DE TANF deficit_rate parameter missing history before 2024-10-01. The fix was merged in policyengine-us PR #7170 but uv.lock was pinned at 1.570.7. Co-Authored-By: Claude Opus 4.6 --- uv.lock | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index fc61e96d..df1efae4 100644 --- a/uv.lock +++ b/uv.lock @@ -637,6 +637,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -644,6 +645,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1842,7 +1844,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.570.7" +version = "1.587.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1850,9 +1852,9 @@ dependencies = [ { name = "policyengine-core" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6a/eb/291b3085aa0fa97fcce4987d54991118f21aead49647b3f475998459f46b/policyengine_us-1.570.7.tar.gz", hash = "sha256:a2967af86a61468a0bdb6b2dc7af2fd0bb0f0064203fa557b6fee8023058360a", size = 8668680, upload-time = "2026-02-19T07:17:11.264Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a8/15/8a12714d124b509346e60c927f7f344ee3b99c2b280bcfa9a053395d68e6/policyengine_us-1.587.0.tar.gz", hash = "sha256:399339eeea9a38caf6800432bc5eaa3b07b7b09ea269f4f3ba9f9c02aae587b9", size = 8630430, upload-time = "2026-02-25T23:35:46.002Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/36/0213955310076e4dec2781baeabf96b6d6937f99cd19c373b363bfbd7152/policyengine_us-1.570.7-py3-none-any.whl", hash = "sha256:374fd5357d6cb3734b900bd08dfdb61760dfc5b913ed686f57a40239565b0edd", size = 7825404, upload-time = "2026-02-19T07:17:08.877Z" }, + { url = "https://files.pythonhosted.org/packages/13/5a/3b643bf8c20d94d6fbbb638664eef571dcee0e88e98d4797128f6913cd6a/policyengine_us-1.587.0-py3-none-any.whl", hash = "sha256:712234f850d943464ae90a9f5055fb9a1bdba72c87beb2270618cb57803e9e6e", size = 7953707, upload-time = "2026-02-25T23:35:42.964Z" }, ] [[package]] From 2a8a0e2ed4c543c96c0c8285d80266cb23b8c1b1 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 25 Feb 2026 20:53:34 -0500 Subject: [PATCH 11/17] Drop formula variables from dataset, add integrity tests Variables with formulas in policyengine-us are recomputed by the simulation engine, so storing them wastes space and can mislead validation. This removes 9 such variables from the extended CPS output (saving ~15MB). Also adds tests verifying: - No formula variables are stored (except person_id) - Stored input values match what the simulation computes - SS sub-components sum to total social_security per person Co-Authored-By: Claude Opus 4.6 --- .../datasets/cps/extended_cps.py | 31 +++ tests/test_no_formula_variables_stored.py | 178 ++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 tests/test_no_formula_variables_stored.py diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 4b427a39..4495fc18 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -50,8 +50,39 @@ def generate(self): dataset_path=str(self.cps.file_path), ) + new_data = self._drop_formula_variables(new_data) self.save_dataset(new_data) + # Variables with formulas that must still be stored (e.g. IDs + # needed by the dataset loader before formulas can run). + _KEEP_FORMULA_VARS = {"person_id"} + + @classmethod + def _drop_formula_variables(cls, data): + """Remove variables that have formulas in policyengine-us. + + The simulation engine recomputes these from their inputs, + so storing them wastes space and can mislead validation. + """ + from policyengine_us import CountryTaxBenefitSystem + + tbs = CountryTaxBenefitSystem() + formula_vars = { + name + for name, var in tbs.variables.items() + if hasattr(var, "formulas") and len(var.formulas) > 0 + } - cls._KEEP_FORMULA_VARS + dropped = sorted(set(data.keys()) & formula_vars) + if dropped: + logger.info( + "Dropping %d formula variables: %s", + len(dropped), + dropped, + ) + for var in dropped: + del data[var] + return data + class ExtendedCPS_2024(ExtendedCPS): cps = CPS_2024_Full diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py new file mode 100644 index 00000000..08cc00cf --- /dev/null +++ b/tests/test_no_formula_variables_stored.py @@ -0,0 +1,178 @@ +"""Test dataset variable integrity. + +1. Variables with formulas in policyengine-us will be recomputed by the + simulation engine, ignoring any stored value. Storing them wastes + space and can mislead during validation. + +2. Stored input values should match what the simulation actually uses, + to catch issues like set_input_divide_by_period silently altering + values, or sub-components not summing to their parent total. +""" + +import h5py +import numpy as np +import pytest +from policyengine_us_data.datasets.cps.extended_cps import ExtendedCPS_2024 + + +KNOWN_FORMULA_EXCEPTIONS = { + # person_id is stored for identity tracking even though it has a + # trivial formula (arange). Safe to keep. + "person_id", +} + + +@pytest.fixture(scope="module") +def tax_benefit_system(): + from policyengine_us import CountryTaxBenefitSystem + + return CountryTaxBenefitSystem() + + +@pytest.fixture(scope="module") +def formula_variables(tax_benefit_system): + """Return set of variable names that have formulas.""" + return { + name + for name, var in tax_benefit_system.variables.items() + if hasattr(var, "formulas") and len(var.formulas) > 0 + } + + +@pytest.fixture(scope="module") +def dataset_path(): + path = ExtendedCPS_2024.file_path + if not path.exists(): + pytest.skip("Extended CPS 2024 not built locally") + return path + + +@pytest.fixture(scope="module") +def stored_variables(dataset_path): + """Return set of variable names stored in the extended CPS.""" + with h5py.File(dataset_path, "r") as f: + return set(f.keys()) + + +@pytest.fixture(scope="module") +def sim(): + from policyengine_us import Microsimulation + + return Microsimulation(dataset=ExtendedCPS_2024) + + +def test_no_formula_variables_stored( + formula_variables, stored_variables +): + """Variables with formulas should not be stored in the dataset.""" + overlap = ( + stored_variables & formula_variables + ) - KNOWN_FORMULA_EXCEPTIONS + if overlap: + msg = ( + f"These {len(overlap)} variables have formulas in " + f"policyengine-us but are stored in the dataset " + f"(stored values will be IGNORED by the simulation):\n" + ) + for v in sorted(overlap): + msg += f" - {v}\n" + pytest.fail(msg) + + +def test_stored_values_match_computed( + sim, stored_variables, formula_variables, dataset_path +): + """For input-only variables, stored values should match what the + simulation returns (catches set_input issues, rounding, etc.).""" + input_vars = stored_variables - formula_variables + mismatches = [] + + with h5py.File(dataset_path, "r") as f: + for var_name in sorted(input_vars): + if var_name not in f or "2024" not in f[var_name]: + continue + stored = f[var_name]["2024"][...] + if stored.dtype.kind not in ("f", "i", "u"): + continue + + stored_f = stored.astype(float) + stored_total = np.sum(stored_f) + if abs(stored_total) < 1: + continue + + try: + computed = np.array( + sim.calculate(var_name, 2024) + ) + except Exception: + continue + + computed_total = np.sum(computed.astype(float)) + if abs(stored_total) > 0: + pct_diff = ( + abs(stored_total - computed_total) + / abs(stored_total) + * 100 + ) + else: + pct_diff = 0 + + if pct_diff > 1: + mismatches.append( + f" {var_name}: stored=${stored_total:,.0f}, " + f"computed=${computed_total:,.0f}, " + f"diff={pct_diff:.1f}%" + ) + + if mismatches: + msg = ( + f"These {len(mismatches)} input variables have >1% " + f"difference between stored and computed values:\n" + ) + msg += "\n".join(mismatches) + pytest.fail(msg) + + +def test_ss_subcomponents_sum_to_total(dataset_path): + """Social Security sub-components should sum to the total.""" + with h5py.File(dataset_path, "r") as f: + ss_total = f["social_security"]["2024"][...].astype(float) + ss_retirement = f["social_security_retirement"]["2024"][ + ... + ].astype(float) + ss_disability = f["social_security_disability"]["2024"][ + ... + ].astype(float) + ss_survivors = f["social_security_survivors"]["2024"][ + ... + ].astype(float) + ss_dependents = f["social_security_dependents"]["2024"][ + ... + ].astype(float) + + sub_sum = ( + ss_retirement + ss_disability + ss_survivors + ss_dependents + ) + + # Only check records that have any SS income + has_ss = ss_total > 0 + if not np.any(has_ss): + pytest.skip("No SS recipients in dataset") + + total_ss = np.sum(ss_total[has_ss]) + total_sub = np.sum(sub_sum[has_ss]) + pct_diff = abs(total_ss - total_sub) / total_ss * 100 + + assert pct_diff < 1, ( + f"SS sub-components don't sum to total: " + f"total=${total_ss:,.0f}, sub_sum=${total_sub:,.0f}, " + f"diff={pct_diff:.1f}%" + ) + + # Per-person check: no person's sub-components should exceed total + excess = sub_sum[has_ss] - ss_total[has_ss] + n_excess = np.sum(excess > 1) + assert n_excess == 0, ( + f"{n_excess} people have SS sub-components exceeding " + f"their total SS income" + ) From 1c83a8f63b2e214ccc577bf3bbff2015aa574e82 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 25 Feb 2026 20:58:01 -0500 Subject: [PATCH 12/17] Format test file with black Co-Authored-By: Claude Opus 4.6 --- tests/test_no_formula_variables_stored.py | 42 +++++++++-------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py index 08cc00cf..08f53e5d 100644 --- a/tests/test_no_formula_variables_stored.py +++ b/tests/test_no_formula_variables_stored.py @@ -61,13 +61,9 @@ def sim(): return Microsimulation(dataset=ExtendedCPS_2024) -def test_no_formula_variables_stored( - formula_variables, stored_variables -): +def test_no_formula_variables_stored(formula_variables, stored_variables): """Variables with formulas should not be stored in the dataset.""" - overlap = ( - stored_variables & formula_variables - ) - KNOWN_FORMULA_EXCEPTIONS + overlap = (stored_variables & formula_variables) - KNOWN_FORMULA_EXCEPTIONS if overlap: msg = ( f"These {len(overlap)} variables have formulas in " @@ -101,9 +97,7 @@ def test_stored_values_match_computed( continue try: - computed = np.array( - sim.calculate(var_name, 2024) - ) + computed = np.array(sim.calculate(var_name, 2024)) except Exception: continue @@ -137,22 +131,20 @@ def test_ss_subcomponents_sum_to_total(dataset_path): """Social Security sub-components should sum to the total.""" with h5py.File(dataset_path, "r") as f: ss_total = f["social_security"]["2024"][...].astype(float) - ss_retirement = f["social_security_retirement"]["2024"][ - ... - ].astype(float) - ss_disability = f["social_security_disability"]["2024"][ - ... - ].astype(float) - ss_survivors = f["social_security_survivors"]["2024"][ - ... - ].astype(float) - ss_dependents = f["social_security_dependents"]["2024"][ - ... - ].astype(float) - - sub_sum = ( - ss_retirement + ss_disability + ss_survivors + ss_dependents - ) + ss_retirement = f["social_security_retirement"]["2024"][...].astype( + float + ) + ss_disability = f["social_security_disability"]["2024"][...].astype( + float + ) + ss_survivors = f["social_security_survivors"]["2024"][...].astype( + float + ) + ss_dependents = f["social_security_dependents"]["2024"][...].astype( + float + ) + + sub_sum = ss_retirement + ss_disability + ss_survivors + ss_dependents # Only check records that have any SS income has_ss = ss_total > 0 From 86ecc0cc89e403634a2c3e763b73a1c0b04b1972 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 25 Feb 2026 21:00:54 -0500 Subject: [PATCH 13/17] Fix black formatting for CI (26.1.0) Co-Authored-By: Claude Opus 4.6 --- tests/test_no_formula_variables_stored.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py index 08f53e5d..e16bd80c 100644 --- a/tests/test_no_formula_variables_stored.py +++ b/tests/test_no_formula_variables_stored.py @@ -14,7 +14,6 @@ import pytest from policyengine_us_data.datasets.cps.extended_cps import ExtendedCPS_2024 - KNOWN_FORMULA_EXCEPTIONS = { # person_id is stored for identity tracking even though it has a # trivial formula (arange). Safe to keep. From f304f82072c844bb5ede9f2e6eec031458fa5bf3 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 25 Feb 2026 21:18:46 -0500 Subject: [PATCH 14/17] fixup! Fix black formatting for CI (26.1.0) --- policyengine_us_data/datasets/cps/extended_cps.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 4495fc18..ac8b6e45 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -59,10 +59,11 @@ def generate(self): @classmethod def _drop_formula_variables(cls, data): - """Remove variables that have formulas in policyengine-us. + """Remove variables that are computed by policyengine-us. - The simulation engine recomputes these from their inputs, - so storing them wastes space and can mislead validation. + Variables with formulas, ``adds``, or ``subtracts`` are + recomputed by the simulation engine, so storing them wastes + space and can mislead validation. """ from policyengine_us import CountryTaxBenefitSystem @@ -70,7 +71,9 @@ def _drop_formula_variables(cls, data): formula_vars = { name for name, var in tbs.variables.items() - if hasattr(var, "formulas") and len(var.formulas) > 0 + if (hasattr(var, "formulas") and len(var.formulas) > 0) + or getattr(var, "adds", None) + or getattr(var, "subtracts", None) } - cls._KEEP_FORMULA_VARS dropped = sorted(set(data.keys()) & formula_vars) if dropped: From b7bdc215f9d72149634ca40ef11a1ac7c337f529 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 25 Feb 2026 21:26:55 -0500 Subject: [PATCH 15/17] Also drop adds/subtracts variables from dataset Update _drop_formula_variables and tests to catch variables that use `adds` or `subtracts` (e.g. social_security), not just explicit formulas. These are also recomputed by the simulation engine. Co-Authored-By: Claude Opus 4.6 --- tests/test_no_formula_variables_stored.py | 44 +++++++++++++++-------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py index e16bd80c..9334a5c7 100644 --- a/tests/test_no_formula_variables_stored.py +++ b/tests/test_no_formula_variables_stored.py @@ -30,11 +30,17 @@ def tax_benefit_system(): @pytest.fixture(scope="module") def formula_variables(tax_benefit_system): - """Return set of variable names that have formulas.""" + """Return set of variable names computed by policyengine-us. + + Includes variables with explicit formulas as well as those using + ``adds`` or ``subtracts`` (which the engine auto-sums at runtime). + """ return { name for name, var in tax_benefit_system.variables.items() - if hasattr(var, "formulas") and len(var.formulas) > 0 + if (hasattr(var, "formulas") and len(var.formulas) > 0) + or getattr(var, "adds", None) + or getattr(var, "subtracts", None) } @@ -61,12 +67,13 @@ def sim(): def test_no_formula_variables_stored(formula_variables, stored_variables): - """Variables with formulas should not be stored in the dataset.""" + """Computed variables should not be stored in the dataset.""" overlap = (stored_variables & formula_variables) - KNOWN_FORMULA_EXCEPTIONS if overlap: msg = ( - f"These {len(overlap)} variables have formulas in " - f"policyengine-us but are stored in the dataset " + f"These {len(overlap)} variables are computed by " + f"policyengine-us (formulas/adds/subtracts) but are " + f"stored in the dataset " f"(stored values will be IGNORED by the simulation):\n" ) for v in sorted(overlap): @@ -126,10 +133,14 @@ def test_stored_values_match_computed( pytest.fail(msg) -def test_ss_subcomponents_sum_to_total(dataset_path): - """Social Security sub-components should sum to the total.""" +def test_ss_subcomponents_sum_to_computed_total(sim, dataset_path): + """Social Security sub-components should sum to the computed total. + + ``social_security`` is computed via ``adds`` in policyengine-us and + is NOT stored in the dataset. We verify that the sub-components + stored in the dataset sum to the simulation's computed total. + """ with h5py.File(dataset_path, "r") as f: - ss_total = f["social_security"]["2024"][...].astype(float) ss_retirement = f["social_security_retirement"]["2024"][...].astype( float ) @@ -144,26 +155,29 @@ def test_ss_subcomponents_sum_to_total(dataset_path): ) sub_sum = ss_retirement + ss_disability + ss_survivors + ss_dependents + computed_total = np.array(sim.calculate("social_security", 2024)).astype( + float + ) # Only check records that have any SS income - has_ss = ss_total > 0 + has_ss = computed_total > 0 if not np.any(has_ss): pytest.skip("No SS recipients in dataset") - total_ss = np.sum(ss_total[has_ss]) + total_computed = np.sum(computed_total[has_ss]) total_sub = np.sum(sub_sum[has_ss]) - pct_diff = abs(total_ss - total_sub) / total_ss * 100 + pct_diff = abs(total_computed - total_sub) / total_computed * 100 assert pct_diff < 1, ( - f"SS sub-components don't sum to total: " - f"total=${total_ss:,.0f}, sub_sum=${total_sub:,.0f}, " + f"SS sub-components don't sum to computed total: " + f"computed=${total_computed:,.0f}, sub_sum=${total_sub:,.0f}, " f"diff={pct_diff:.1f}%" ) # Per-person check: no person's sub-components should exceed total - excess = sub_sum[has_ss] - ss_total[has_ss] + excess = sub_sum[has_ss] - computed_total[has_ss] n_excess = np.sum(excess > 1) assert n_excess == 0, ( f"{n_excess} people have SS sub-components exceeding " - f"their total SS income" + f"their computed total SS income" ) From 60d1f23cd2644ed23883ed96dd4466dfda48f785 Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Thu, 26 Feb 2026 18:05:06 -0500 Subject: [PATCH 16/17] Train QRF for retirement contributions on PUF clone half PUF clones previously copied CPS retirement contributions blindly, so a record with $0 wages could have $50k in 401(k) contributions. Train a QRF on CPS data (which has realistic income-to-contribution relationships) and predict onto the PUF half using PUF-imputed income. Post-prediction constraints enforce contribution caps, zero-out rules for records with no wages/SE income, and non-negativity. - Remove traditional_ira_contributions from IMPUTED_VARIABLES - Add CPS_RETIREMENT_VARIABLES, RETIREMENT_PREDICTORS constants - Add _get_retirement_limits() with year-specific 401k/IRA caps - Add _impute_retirement_contributions() following _impute_weeks_unemployed pattern - Integrate into puf_clone_dataset() variable routing loop - Add 34 unit tests covering constraints, routing, and limits Closes #561 Co-Authored-By: Claude Opus 4.6 --- conftest.py | 14 + .../calibration/puf_impute.py | 198 +++++- .../tests/test_calibration/conftest.py | 15 + .../test_retirement_imputation.py | 578 ++++++++++++++++++ 4 files changed, 804 insertions(+), 1 deletion(-) create mode 100644 conftest.py create mode 100644 policyengine_us_data/tests/test_calibration/conftest.py create mode 100644 policyengine_us_data/tests/test_calibration/test_retirement_imputation.py diff --git a/conftest.py b/conftest.py new file mode 100644 index 00000000..84ce8ac4 --- /dev/null +++ b/conftest.py @@ -0,0 +1,14 @@ +"""Root conftest: mock optional dependencies before collection. + +microimpute requires Python >= 3.12, so mock it for test +environments running an older interpreter. +""" + +import sys +from unittest.mock import MagicMock + +if "microimpute" not in sys.modules: + _mock = MagicMock() + sys.modules["microimpute"] = _mock + sys.modules["microimpute.models"] = _mock + sys.modules["microimpute.models.qrf"] = _mock diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index d5c73785..f55698fa 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -74,7 +74,6 @@ "non_sch_d_capital_gains", "general_business_credit", "energy_efficient_home_improvement_credit", - "traditional_ira_contributions", "amt_foreign_tax_credit", "excess_withheld_payroll_tax", "savers_credit", @@ -161,6 +160,73 @@ "rental_income_would_be_qualified", ] +CPS_RETIREMENT_VARIABLES = [ + "traditional_401k_contributions", + "roth_401k_contributions", + "traditional_ira_contributions", + "roth_ira_contributions", + "self_employed_pension_contributions", +] + +RETIREMENT_PREDICTORS = [ + "employment_income", + "self_employment_income", + "age", + "is_male", + "tax_unit_is_joint", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", +] + + +def _get_retirement_limits(year: int) -> dict: + """Return contribution limits for the given tax year. + + Mirrors the limits in cps.py (duplicated here to avoid circular + imports between the calibration and dataset packages). + """ + limits_by_year = { + 2020: { + "401k": 19_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2021: { + "401k": 19_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2022: { + "401k": 20_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2023: { + "401k": 22_500, + "401k_catch_up": 7_500, + "ira": 6_500, + "ira_catch_up": 1_000, + }, + 2024: { + "401k": 23_000, + "401k_catch_up": 7_500, + "ira": 7_000, + "ira_catch_up": 1_000, + }, + 2025: { + "401k": 23_500, + "401k_catch_up": 7_500, + "ira": 7_000, + "ira_catch_up": 1_000, + }, + } + clamped = max(min(year, max(limits_by_year)), min(limits_by_year)) + return limits_by_year[clamped] + MINIMUM_RETIREMENT_AGE = 62 @@ -441,6 +507,13 @@ def _map_to_entity(pred_values, variable_name): data, y_full, time_period, dataset_path ) + # Impute retirement contributions for PUF half + puf_retirement = None + if y_full is not None and dataset_path is not None: + puf_retirement = _impute_retirement_contributions( + data, y_full, time_period, dataset_path + ) + new_data = {} for variable, time_dict in data.items(): values = time_dict[time_period] @@ -463,6 +536,13 @@ def _map_to_entity(pred_values, variable_name): new_data[variable] = { time_period: np.concatenate([values, puf_weeks]) } + elif ( + variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None + ): + puf_vals = puf_retirement[variable] + new_data[variable] = { + time_period: np.concatenate([values, puf_vals]) + } else: new_data[variable] = { time_period: np.concatenate([values, values]) @@ -592,6 +672,122 @@ def _impute_weeks_unemployed( return imputed_weeks +def _impute_retirement_contributions( + data: Dict[str, Dict[int, np.ndarray]], + puf_imputations: Dict[str, np.ndarray], + time_period: int, + dataset_path: str, +) -> Dict[str, np.ndarray]: + """Impute retirement contributions for the PUF half using QRF. + + Trains on CPS data (which has realistic income-to-contribution + relationships) and predicts onto PUF clone records using + PUF-imputed income as input features. + + Args: + data: CPS data dict. + puf_imputations: Dict of PUF-imputed variable arrays. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Dict mapping retirement variable names to imputed arrays. + """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + cps_sim = Microsimulation(dataset=dataset_path) + + # Build training data from CPS (has realistic relationships) + train_cols = RETIREMENT_PREDICTORS + CPS_RETIREMENT_VARIABLES + try: + X_train = cps_sim.calculate_dataframe(train_cols) + except (ValueError, KeyError) as e: + logger.warning("Could not build retirement training data: %s", e) + n_persons = len(data["person_id"][time_period]) + del cps_sim + return {var: np.zeros(n_persons) for var in CPS_RETIREMENT_VARIABLES} + + # Build test data: demographics from CPS sim, income from PUF + X_test = cps_sim.calculate_dataframe( + [ + p + for p in RETIREMENT_PREDICTORS + if p not in ("employment_income", "self_employment_income") + ] + ) + for income_var in ("employment_income", "self_employment_income"): + if income_var in puf_imputations: + X_test[income_var] = puf_imputations[income_var] + else: + X_test[income_var] = cps_sim.calculate(income_var).values + + del cps_sim + + # Subsample training data + if len(X_train) > 5000: + X_train_sampled = X_train.sample(n=5000, random_state=42) + else: + X_train_sampled = X_train + + # Train QRF + qrf = QRF(log_level="INFO", memory_efficient=True) + fitted = qrf.fit( + X_train=X_train_sampled, + predictors=RETIREMENT_PREDICTORS, + imputed_variables=CPS_RETIREMENT_VARIABLES, + n_jobs=1, + ) + predictions = fitted.predict(X_test=X_test) + + # Extract results and apply constraints + limits = _get_retirement_limits(time_period) + age = X_test["age"].values + catch_up_eligible = age >= 50 + limit_401k = limits["401k"] + catch_up_eligible * limits["401k_catch_up"] + limit_ira = limits["ira"] + catch_up_eligible * limits["ira_catch_up"] + + emp_income = X_test["employment_income"].values + se_income = X_test["self_employment_income"].values + + result = {} + for var in CPS_RETIREMENT_VARIABLES: + vals = predictions[var].values + + # Non-negativity + vals = np.maximum(vals, 0) + + # Cap 401k at year-specific limit + if "401k" in var: + vals = np.minimum(vals, limit_401k) + # Zero out for records with no employment income + vals = np.where(emp_income > 0, vals, 0) + + # Cap IRA at year-specific limit + if "ira" in var: + vals = np.minimum(vals, limit_ira) + + # Zero out SE pension for records with no SE income + if var == "self_employed_pension_contributions": + vals = np.where(se_income > 0, vals, 0) + + result[var] = vals + + logger.info( + "Imputed retirement contributions for PUF: " + "401k mean=$%.0f, IRA mean=$%.0f, SE pension mean=$%.0f", + result["traditional_401k_contributions"].mean() + + result["roth_401k_contributions"].mean(), + result["traditional_ira_contributions"].mean() + + result["roth_ira_contributions"].mean(), + result["self_employed_pension_contributions"].mean(), + ) + + del fitted, predictions + gc.collect() + return result + + def _run_qrf_imputation( data: Dict[str, Dict[int, np.ndarray]], time_period: int, diff --git a/policyengine_us_data/tests/test_calibration/conftest.py b/policyengine_us_data/tests/test_calibration/conftest.py new file mode 100644 index 00000000..7a1de465 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/conftest.py @@ -0,0 +1,15 @@ +"""Shared fixtures for calibration tests. + +Mocks microimpute at import time so tests can run without the +package installed (it requires Python >= 3.12). +""" + +import sys +from unittest.mock import MagicMock + +# microimpute is not installable on Python < 3.12. Mock it before +# any test module triggers the cps.py top-level import. +if "microimpute" not in sys.modules: + sys.modules["microimpute"] = MagicMock() + sys.modules["microimpute.models"] = MagicMock() + sys.modules["microimpute.models.qrf"] = MagicMock() diff --git a/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py new file mode 100644 index 00000000..7a8a3227 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py @@ -0,0 +1,578 @@ +"""Tests for retirement contribution QRF imputation. + +Covers: +- Constants & list membership +- _get_retirement_limits() logic +- _impute_retirement_contributions() with mocked QRF/Microsimulation +- puf_clone_dataset() integration routing for retirement variables +""" + +from unittest.mock import MagicMock, patch + +import numpy as np +import pandas as pd +import pytest + +from policyengine_us_data.calibration.puf_impute import ( + CPS_RETIREMENT_VARIABLES, + IMPUTED_VARIABLES, + OVERRIDDEN_IMPUTED_VARIABLES, + RETIREMENT_PREDICTORS, + _get_retirement_limits, + _impute_retirement_contributions, + puf_clone_dataset, +) + +# The function imports Microsimulation and QRF locally via: +# from policyengine_us import Microsimulation +# from microimpute.models.qrf import QRF +# We must patch those source modules so the local import picks +# up mocks, not real objects. +_MSIM_PATCH = "policyengine_us.Microsimulation" +_QRF_PATCH = "microimpute.models.qrf.QRF" + + +# ── helpers ────────────────────────────────────────────────────────── + + +def _make_mock_data(n_persons=20, n_households=5, time_period=2024): + """Minimal CPS data dict with retirement variables.""" + rng = np.random.default_rng(42) + person_ids = np.arange(1, n_persons + 1) + hh_ids_person = np.repeat( + np.arange(1, n_households + 1), + n_persons // n_households, + ) + + data = { + "person_id": {time_period: person_ids}, + "household_id": {time_period: np.arange(1, n_households + 1)}, + "tax_unit_id": {time_period: np.arange(1, n_households + 1)}, + "spm_unit_id": {time_period: np.arange(1, n_households + 1)}, + "person_household_id": {time_period: hh_ids_person}, + "person_tax_unit_id": {time_period: hh_ids_person.copy()}, + "person_spm_unit_id": {time_period: hh_ids_person.copy()}, + "age": { + time_period: rng.integers(18, 80, size=n_persons).astype( + np.float32 + ) + }, + "is_male": { + time_period: rng.integers(0, 2, size=n_persons).astype(np.float32) + }, + "household_weight": {time_period: np.ones(n_households) * 1000}, + "employment_income": { + time_period: rng.uniform(0, 100_000, n_persons).astype(np.float32) + }, + "self_employment_income": { + time_period: rng.uniform(0, 50_000, n_persons).astype(np.float32) + }, + } + for var in CPS_RETIREMENT_VARIABLES: + data[var] = { + time_period: rng.uniform(0, 5000, n_persons).astype(np.float32) + } + return data + + +def _make_cps_df(n, rng): + """Build a mock CPS DataFrame with all needed columns.""" + return pd.DataFrame( + { + "age": rng.integers(18, 80, n).astype(float), + "is_male": rng.integers(0, 2, n).astype(float), + "tax_unit_is_joint": rng.integers(0, 2, n).astype(float), + "is_tax_unit_head": rng.integers(0, 2, n).astype(float), + "is_tax_unit_spouse": np.zeros(n), + "is_tax_unit_dependent": np.zeros(n), + "employment_income": rng.uniform(0, 100_000, n), + "self_employment_income": rng.uniform(0, 50_000, n), + "traditional_401k_contributions": rng.uniform(0, 5000, n), + "roth_401k_contributions": rng.uniform(0, 3000, n), + "traditional_ira_contributions": rng.uniform(0, 2000, n), + "roth_ira_contributions": rng.uniform(0, 2000, n), + "self_employed_pension_contributions": rng.uniform(0, 10_000, n), + } + ) + + +def _make_mock_sim(cps_df): + """Build a mock Microsimulation object.""" + sim = MagicMock() + + def calc_df(cols): + return cps_df[cols].copy() + + def calc_single(var): + result = MagicMock() + result.values = cps_df[var].values + return result + + sim.calculate_dataframe = calc_df + sim.calculate = calc_single + return sim + + +def _make_mock_qrf_class(predictions_df): + """Build a mock QRF class whose predict returns predictions_df.""" + mock_cls = MagicMock() + fitted = MagicMock() + fitted.predict.return_value = predictions_df + mock_cls.return_value.fit.return_value = fitted + return mock_cls + + +# ── TestConstants ──────────────────────────────────────────────────── + + +class TestConstants: + def test_retirement_vars_not_in_imputed(self): + """Retirement vars must NOT be in IMPUTED_VARIABLES.""" + for var in CPS_RETIREMENT_VARIABLES: + assert ( + var not in IMPUTED_VARIABLES + ), f"{var} should not be in IMPUTED_VARIABLES" + + def test_retirement_vars_not_in_overridden(self): + for var in CPS_RETIREMENT_VARIABLES: + assert var not in OVERRIDDEN_IMPUTED_VARIABLES + + def test_five_retirement_variables(self): + assert len(CPS_RETIREMENT_VARIABLES) == 5 + + def test_retirement_variable_names(self): + expected = { + "traditional_401k_contributions", + "roth_401k_contributions", + "traditional_ira_contributions", + "roth_ira_contributions", + "self_employed_pension_contributions", + } + assert set(CPS_RETIREMENT_VARIABLES) == expected + + def test_retirement_predictors_include_income(self): + assert "employment_income" in RETIREMENT_PREDICTORS + assert "self_employment_income" in RETIREMENT_PREDICTORS + + def test_retirement_predictors_include_demographics(self): + for pred in ["age", "is_male", "tax_unit_is_joint"]: + assert pred in RETIREMENT_PREDICTORS + + +# ── TestGetRetirementLimits ────────────────────────────────────────── + + +class TestGetRetirementLimits: + def test_known_year_2024(self): + lim = _get_retirement_limits(2024) + assert lim["401k"] == 23_000 + assert lim["401k_catch_up"] == 7_500 + assert lim["ira"] == 7_000 + assert lim["ira_catch_up"] == 1_000 + + def test_known_year_2020(self): + lim = _get_retirement_limits(2020) + assert lim["401k"] == 19_500 + assert lim["ira"] == 6_000 + + def test_known_year_2025(self): + lim = _get_retirement_limits(2025) + assert lim["401k"] == 23_500 + + def test_clamps_below_min_year(self): + assert _get_retirement_limits(2015) == _get_retirement_limits(2020) + + def test_clamps_above_max_year(self): + assert _get_retirement_limits(2030) == _get_retirement_limits(2025) + + def test_all_years_have_four_keys(self): + for year in range(2020, 2026): + lim = _get_retirement_limits(year) + assert set(lim.keys()) == { + "401k", + "401k_catch_up", + "ira", + "ira_catch_up", + } + + def test_limits_increase_monotonically(self): + prev = _get_retirement_limits(2020)["401k"] + for year in range(2021, 2026): + cur = _get_retirement_limits(year)["401k"] + assert cur >= prev + prev = cur + + def test_ira_limits_increase_monotonically(self): + prev = _get_retirement_limits(2020)["ira"] + for year in range(2021, 2026): + cur = _get_retirement_limits(year)["ira"] + assert cur >= prev + prev = cur + + +# ── TestImputeRetirementContributions ──────────────────────────────── + + +class TestImputeRetirementContributions: + """Tests with mocked Microsimulation and QRF.""" + + @pytest.fixture(autouse=True) + def _setup(self): + self.n = 50 + self.time_period = 2024 + rng = np.random.default_rng(42) + + self.data = { + "person_id": {self.time_period: np.arange(1, self.n + 1)}, + } + + emp = rng.uniform(0, 150_000, self.n).astype(np.float32) + emp[:10] = 0 # 10 records with $0 wages + se = rng.uniform(0, 80_000, self.n).astype(np.float32) + se[:20] = 0 # 20 records with $0 SE income + + self.puf_imputations = { + "employment_income": emp, + "self_employment_income": se, + } + + self.cps_df = _make_cps_df(self.n, rng) + + def _call_with_mocks(self, pred_df): + """Run _impute_retirement_contributions with mocked deps.""" + import sys + + mock_sim = _make_mock_sim(self.cps_df) + mock_qrf_cls = _make_mock_qrf_class(pred_df) + + # patch() doesn't work reliably on MagicMock modules, + # so we set the QRF attribute directly. + qrf_mod = sys.modules["microimpute.models.qrf"] + old_qrf = getattr(qrf_mod, "QRF", None) + qrf_mod.QRF = mock_qrf_cls + try: + with patch(_MSIM_PATCH, return_value=mock_sim): + return _impute_retirement_contributions( + self.data, + self.puf_imputations, + self.time_period, + "/fake/path.h5", + ) + finally: + if old_qrf is not None: + qrf_mod.QRF = old_qrf + + def _uniform_preds(self, value): + """Build a DataFrame predicting `value` for all vars.""" + return pd.DataFrame( + {var: np.full(self.n, value) for var in CPS_RETIREMENT_VARIABLES} + ) + + def _random_preds(self, low, high, seed=99): + rng = np.random.default_rng(seed) + return pd.DataFrame( + { + var: rng.uniform(low, high, self.n) + for var in CPS_RETIREMENT_VARIABLES + } + ) + + def test_returns_all_retirement_vars(self): + result = self._call_with_mocks(self._random_preds(0, 8000)) + for var in CPS_RETIREMENT_VARIABLES: + assert var in result + assert len(result[var]) == self.n + + def test_nonnegative_output(self): + result = self._call_with_mocks(self._random_preds(-5000, 8000, seed=7)) + for var in CPS_RETIREMENT_VARIABLES: + assert np.all(result[var] >= 0), f"{var} has negative values" + + def test_401k_capped(self): + result = self._call_with_mocks(self._uniform_preds(50_000.0)) + lim = _get_retirement_limits(self.time_period) + max_401k = lim["401k"] + lim["401k_catch_up"] + + for var in ( + "traditional_401k_contributions", + "roth_401k_contributions", + ): + assert np.all(result[var] <= max_401k), f"{var} exceeds 401k limit" + + def test_ira_capped(self): + result = self._call_with_mocks(self._uniform_preds(50_000.0)) + lim = _get_retirement_limits(self.time_period) + max_ira = lim["ira"] + lim["ira_catch_up"] + + for var in ( + "traditional_ira_contributions", + "roth_ira_contributions", + ): + assert np.all(result[var] <= max_ira), f"{var} exceeds IRA limit" + + def test_401k_zero_when_no_wages(self): + result = self._call_with_mocks(self._uniform_preds(5_000.0)) + zero_wage = self.puf_imputations["employment_income"] == 0 + assert zero_wage.sum() == 10 + + for var in ( + "traditional_401k_contributions", + "roth_401k_contributions", + ): + assert np.all( + result[var][zero_wage] == 0 + ), f"{var} should be 0 when employment_income is 0" + + def test_se_pension_zero_when_no_se_income(self): + result = self._call_with_mocks(self._uniform_preds(5_000.0)) + zero_se = self.puf_imputations["self_employment_income"] == 0 + assert zero_se.sum() == 20 + assert np.all( + result["self_employed_pension_contributions"][zero_se] == 0 + ) + + def test_catch_up_age_threshold(self): + """Records age >= 50 get higher caps than younger.""" + self.cps_df["age"] = np.concatenate( + [np.full(25, 30.0), np.full(25, 55.0)] + ) + # All have positive income + self.puf_imputations["employment_income"] = np.full( + self.n, 100_000.0 + ).astype(np.float32) + + lim = _get_retirement_limits(self.time_period) + val = float(lim["401k"]) + 1000 # 24000 + + result = self._call_with_mocks(self._uniform_preds(val)) + + young_401k = result["traditional_401k_contributions"][:25] + old_401k = result["traditional_401k_contributions"][25:] + + # Young capped at base limit + assert np.all(young_401k == lim["401k"]) + # Old get full value (within catch-up limit) + assert np.all(old_401k == val) + + def test_ira_catch_up_threshold(self): + """IRA catch-up also works for age >= 50.""" + self.cps_df["age"] = np.concatenate( + [np.full(25, 30.0), np.full(25, 55.0)] + ) + lim = _get_retirement_limits(self.time_period) + val = float(lim["ira"]) + 500 # 7500 + + result = self._call_with_mocks(self._uniform_preds(val)) + + young_ira = result["traditional_ira_contributions"][:25] + old_ira = result["traditional_ira_contributions"][25:] + + assert np.all(young_ira == lim["ira"]) + assert np.all(old_ira == val) + + def test_401k_nonzero_for_positive_wages(self): + """Records with positive wages should keep their + predicted 401k (not zeroed out).""" + result = self._call_with_mocks(self._uniform_preds(5_000.0)) + pos_wage = self.puf_imputations["employment_income"] > 0 + for var in ( + "traditional_401k_contributions", + "roth_401k_contributions", + ): + assert np.all(result[var][pos_wage] > 0) + + def test_se_pension_nonzero_for_positive_se(self): + result = self._call_with_mocks(self._uniform_preds(5_000.0)) + pos_se = self.puf_imputations["self_employment_income"] > 0 + assert np.all( + result["self_employed_pension_contributions"][pos_se] > 0 + ) + + +# ── TestPufCloneRetirementRouting ──────────────────────────────────── + + +class TestPufCloneRetirementRouting: + """Test puf_clone_dataset() routes retirement vars correctly.""" + + def test_retirement_vars_duplicated_when_skip_qrf(self): + data = _make_mock_data() + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + for var in CPS_RETIREMENT_VARIABLES: + vals = result[var][2024] + assert len(vals) == n * 2 + np.testing.assert_array_equal(vals[:n], vals[n:]) + + def test_retirement_vars_use_imputed_when_available(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + fake_retirement = { + var: np.full(n, 999.0) for var in CPS_RETIREMENT_VARIABLES + } + + with ( + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_retirement_contributions", + return_value=fake_retirement, + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._run_qrf_imputation", + return_value=( + {v: np.zeros(n) for v in IMPUTED_VARIABLES}, + {}, + ), + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_weeks_unemployed", + return_value=np.zeros(n), + ), + patch(_MSIM_PATCH), + ): + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=False, + puf_dataset="/fake/puf.h5", + dataset_path="/fake/cps.h5", + ) + + for var in CPS_RETIREMENT_VARIABLES: + vals = result[var][2024] + assert len(vals) == n * 2 + np.testing.assert_array_equal(vals[:n], data[var][2024]) + np.testing.assert_array_equal(vals[n:], np.full(n, 999.0)) + + def test_cps_half_unchanged_with_imputation(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + originals = { + var: data[var][2024].copy() for var in CPS_RETIREMENT_VARIABLES + } + fake_retirement = { + var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES + } + + with ( + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_retirement_contributions", + return_value=fake_retirement, + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._run_qrf_imputation", + return_value=( + {v: np.zeros(n) for v in IMPUTED_VARIABLES}, + {}, + ), + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_weeks_unemployed", + return_value=np.zeros(n), + ), + patch(_MSIM_PATCH), + ): + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=False, + puf_dataset="/fake/puf.h5", + dataset_path="/fake/cps.h5", + ) + + for var in CPS_RETIREMENT_VARIABLES: + np.testing.assert_array_equal( + result[var][2024][:n], originals[var] + ) + + def test_puf_half_gets_zero_retirement_for_zero_imputed(self): + """When imputation returns zeros, PUF half should be zero.""" + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + fake_retirement = { + var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES + } + + with ( + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_retirement_contributions", + return_value=fake_retirement, + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._run_qrf_imputation", + return_value=( + {v: np.zeros(n) for v in IMPUTED_VARIABLES}, + {}, + ), + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_weeks_unemployed", + return_value=np.zeros(n), + ), + patch(_MSIM_PATCH), + ): + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=False, + puf_dataset="/fake/puf.h5", + dataset_path="/fake/cps.h5", + ) + + for var in CPS_RETIREMENT_VARIABLES: + puf_half = result[var][2024][n:] + assert np.all(puf_half == 0) + + +# ── TestLimitsMatchCps ─────────────────────────────────────────────── + + +class TestLimitsMatchCps: + """Cross-check puf_impute limits against cps.py values.""" + + def test_2024_401k_limit(self): + assert _get_retirement_limits(2024)["401k"] == 23_000 + + def test_2024_ira_limit(self): + assert _get_retirement_limits(2024)["ira"] == 7_000 + + def test_2023_401k_limit(self): + assert _get_retirement_limits(2023)["401k"] == 22_500 + + def test_2023_ira_limit(self): + assert _get_retirement_limits(2023)["ira"] == 6_500 + + def test_2022_limits(self): + lim = _get_retirement_limits(2022) + assert lim["401k"] == 20_500 + assert lim["ira"] == 6_000 + + def test_2021_limits(self): + lim = _get_retirement_limits(2021) + assert lim["401k"] == 19_500 + assert lim["ira"] == 6_000 From 774fdc7b302c10c421b99ce0976e98d0d9df02ed Mon Sep 17 00:00:00 2001 From: PavelMakarchuk Date: Thu, 26 Feb 2026 18:48:18 -0500 Subject: [PATCH 17/17] Expand retirement QRF predictors and add validation script Add income predictors (interest, dividends, pension, SS) to the retirement contribution QRF, matching issue #561's recommendation. Split RETIREMENT_PREDICTORS into demographic and income sublists so the test side correctly sources income from PUF imputations. Also add validation/validate_retirement_imputation.py for post-build constraint checking and aggregate comparison against calibration targets. Co-Authored-By: Claude Opus 4.6 --- .../calibration/puf_impute.py | 29 +- .../test_retirement_imputation.py | 41 ++- validation/validate_retirement_imputation.py | 264 ++++++++++++++++++ 3 files changed, 320 insertions(+), 14 deletions(-) create mode 100644 validation/validate_retirement_imputation.py diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index f55698fa..2158f001 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -168,17 +168,30 @@ "self_employed_pension_contributions", ] -RETIREMENT_PREDICTORS = [ - "employment_income", - "self_employment_income", +RETIREMENT_DEMOGRAPHIC_PREDICTORS = [ "age", "is_male", "tax_unit_is_joint", + "tax_unit_count_dependents", "is_tax_unit_head", "is_tax_unit_spouse", "is_tax_unit_dependent", ] +# Income predictors sourced from PUF imputations on the test side. +RETIREMENT_INCOME_PREDICTORS = [ + "employment_income", + "self_employment_income", + "taxable_interest_income", + "qualified_dividend_income", + "taxable_pension_income", + "social_security", +] + +RETIREMENT_PREDICTORS = ( + RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS +) + def _get_retirement_limits(year: int) -> dict: """Return contribution limits for the given tax year. @@ -709,14 +722,8 @@ def _impute_retirement_contributions( return {var: np.zeros(n_persons) for var in CPS_RETIREMENT_VARIABLES} # Build test data: demographics from CPS sim, income from PUF - X_test = cps_sim.calculate_dataframe( - [ - p - for p in RETIREMENT_PREDICTORS - if p not in ("employment_income", "self_employment_income") - ] - ) - for income_var in ("employment_income", "self_employment_income"): + X_test = cps_sim.calculate_dataframe(RETIREMENT_DEMOGRAPHIC_PREDICTORS) + for income_var in RETIREMENT_INCOME_PREDICTORS: if income_var in puf_imputations: X_test[income_var] = puf_imputations[income_var] else: diff --git a/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py index 7a8a3227..1a6172dd 100644 --- a/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py +++ b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py @@ -17,6 +17,8 @@ CPS_RETIREMENT_VARIABLES, IMPUTED_VARIABLES, OVERRIDDEN_IMPUTED_VARIABLES, + RETIREMENT_DEMOGRAPHIC_PREDICTORS, + RETIREMENT_INCOME_PREDICTORS, RETIREMENT_PREDICTORS, _get_retirement_limits, _impute_retirement_contributions, @@ -79,14 +81,22 @@ def _make_cps_df(n, rng): """Build a mock CPS DataFrame with all needed columns.""" return pd.DataFrame( { + # Demographics "age": rng.integers(18, 80, n).astype(float), "is_male": rng.integers(0, 2, n).astype(float), "tax_unit_is_joint": rng.integers(0, 2, n).astype(float), + "tax_unit_count_dependents": rng.integers(0, 4, n).astype(float), "is_tax_unit_head": rng.integers(0, 2, n).astype(float), "is_tax_unit_spouse": np.zeros(n), "is_tax_unit_dependent": np.zeros(n), + # Income predictors "employment_income": rng.uniform(0, 100_000, n), "self_employment_income": rng.uniform(0, 50_000, n), + "taxable_interest_income": rng.uniform(0, 5_000, n), + "qualified_dividend_income": rng.uniform(0, 3_000, n), + "taxable_pension_income": rng.uniform(0, 20_000, n), + "social_security": rng.uniform(0, 15_000, n), + # Targets "traditional_401k_contributions": rng.uniform(0, 5000, n), "roth_401k_contributions": rng.uniform(0, 3000, n), "traditional_ira_contributions": rng.uniform(0, 2000, n), @@ -151,13 +161,26 @@ def test_retirement_variable_names(self): assert set(CPS_RETIREMENT_VARIABLES) == expected def test_retirement_predictors_include_income(self): - assert "employment_income" in RETIREMENT_PREDICTORS - assert "self_employment_income" in RETIREMENT_PREDICTORS + for var in RETIREMENT_INCOME_PREDICTORS: + assert var in RETIREMENT_PREDICTORS def test_retirement_predictors_include_demographics(self): - for pred in ["age", "is_male", "tax_unit_is_joint"]: + for pred in RETIREMENT_DEMOGRAPHIC_PREDICTORS: assert pred in RETIREMENT_PREDICTORS + def test_income_predictors_in_imputed_variables(self): + """All income predictors must be available from PUF QRF.""" + for var in RETIREMENT_INCOME_PREDICTORS: + assert ( + var in IMPUTED_VARIABLES + ), f"{var} not in IMPUTED_VARIABLES — won't be in puf_imputations" + + def test_predictors_are_combined_lists(self): + expected = ( + RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS + ) + assert RETIREMENT_PREDICTORS == expected + # ── TestGetRetirementLimits ────────────────────────────────────────── @@ -234,6 +257,18 @@ def _setup(self): self.puf_imputations = { "employment_income": emp, "self_employment_income": se, + "taxable_interest_income": rng.uniform(0, 5_000, self.n).astype( + np.float32 + ), + "qualified_dividend_income": rng.uniform(0, 3_000, self.n).astype( + np.float32 + ), + "taxable_pension_income": rng.uniform(0, 20_000, self.n).astype( + np.float32 + ), + "social_security": rng.uniform(0, 15_000, self.n).astype( + np.float32 + ), } self.cps_df = _make_cps_df(self.n, rng) diff --git a/validation/validate_retirement_imputation.py b/validation/validate_retirement_imputation.py new file mode 100644 index 00000000..87d911a9 --- /dev/null +++ b/validation/validate_retirement_imputation.py @@ -0,0 +1,264 @@ +"""Post-build validation for retirement contribution QRF imputation. + +Run after a full Extended CPS build to verify that the PUF clone +half has plausible retirement contribution values. + +Usage: + python validation/validate_retirement_imputation.py + +Requires: + - Python 3.12+ (for microimpute/policyengine_us) + - Built ExtendedCPS_2024 dataset available +""" + +import logging +import sys + +import numpy as np +import pandas as pd + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +# Calibration targets (from etl_national_targets.py / loss.py) +TARGETS = { + "traditional_401k_contributions": 482.7e9, + "roth_401k_contributions": 85.2e9, + "traditional_ira_contributions": 13.2e9, + "roth_ira_contributions": 35.0e9, + "self_employed_pension_contribution_ald": 29.5e9, +} + +# Contribution limits for 2024 +LIMITS_2024 = { + "401k": 23_000, + "401k_catch_up": 7_500, + "ira": 7_000, + "ira_catch_up": 1_000, +} + + +def load_extended_cps(): + """Load the Extended CPS dataset.""" + from policyengine_us import Microsimulation + from policyengine_us_data import ExtendedCPS_2024 + + logger.info("Loading ExtendedCPS_2024...") + sim = Microsimulation(dataset=ExtendedCPS_2024) + return sim + + +def validate_constraints(sim) -> list: + """Check hard constraints on imputed values.""" + issues = [] + year = 2024 + + emp_income = sim.calculate( + "employment_income", year, map_to="person" + ).values + se_income = sim.calculate( + "self_employment_income", year, map_to="person" + ).values + age = sim.calculate("age", year, map_to="person").values + + catch_up = age >= 50 + max_401k = LIMITS_2024["401k"] + catch_up * LIMITS_2024["401k_catch_up"] + max_ira = LIMITS_2024["ira"] + catch_up * LIMITS_2024["ira_catch_up"] + + # 401k constraints + for var in ( + "traditional_401k_contributions", + "roth_401k_contributions", + ): + vals = sim.calculate(var, year, map_to="person").values + + n_negative = (vals < 0).sum() + if n_negative > 0: + issues.append(f"FAIL: {var} has {n_negative} negative values") + + n_over_cap = (vals > max_401k + 1).sum() + if n_over_cap > 0: + issues.append( + f"FAIL: {var} has {n_over_cap} values exceeding " f"401k cap" + ) + + zero_wage = emp_income == 0 + n_nonzero_no_wage = (vals[zero_wage] > 0).sum() + if n_nonzero_no_wage > 0: + issues.append( + f"FAIL: {var} has {n_nonzero_no_wage} nonzero values " + f"for records with $0 wages" + ) + else: + logger.info( + "PASS: %s is $0 for all %d zero-wage records", + var, + zero_wage.sum(), + ) + + # IRA constraints + for var in ( + "traditional_ira_contributions", + "roth_ira_contributions", + ): + vals = sim.calculate(var, year, map_to="person").values + + n_negative = (vals < 0).sum() + if n_negative > 0: + issues.append(f"FAIL: {var} has {n_negative} negative values") + + n_over_cap = (vals > max_ira + 1).sum() + if n_over_cap > 0: + issues.append( + f"FAIL: {var} has {n_over_cap} values exceeding " f"IRA cap" + ) + + # SE pension constraint + var = "self_employed_pension_contributions" + vals = sim.calculate(var, year, map_to="person").values + zero_se = se_income == 0 + n_nonzero_no_se = (vals[zero_se] > 0).sum() + if n_nonzero_no_se > 0: + issues.append( + f"FAIL: {var} has {n_nonzero_no_se} nonzero values " + f"for records with $0 SE income" + ) + else: + logger.info( + "PASS: %s is $0 for all %d zero-SE records", + var, + zero_se.sum(), + ) + + return issues + + +def validate_aggregates(sim) -> list: + """Compare weighted aggregates against calibration targets.""" + issues = [] + year = 2024 + + weight = sim.calculate("person_weight", year).values + + logger.info( + "\n%-45s %15s %15s %10s", "Variable", "Weighted Sum", "Target", "Ratio" + ) + logger.info("-" * 90) + + for var, target in TARGETS.items(): + try: + vals = sim.calculate(var, year, map_to="person").values + except (ValueError, KeyError): + logger.warning("Could not calculate %s", var) + continue + + weighted_sum = (vals * weight).sum() + ratio = weighted_sum / target if target != 0 else float("inf") + + logger.info( + "%-45s $%14.1fB $%14.1fB %9.1f%%", + var, + weighted_sum / 1e9, + target / 1e9, + ratio * 100, + ) + + # Flag if more than 2x off target + if ratio < 0.1 or ratio > 5.0: + issues.append( + f"WARNING: {var} weighted sum " + f"${weighted_sum/1e9:.1f}B is far from " + f"target ${target/1e9:.1f}B " + f"(ratio={ratio:.2f})" + ) + + return issues + + +def validate_cps_vs_puf_half(sim) -> list: + """Compare CPS half and PUF half distributions.""" + issues = [] + year = 2024 + + n_persons = len(sim.calculate("person_id", year).values) + n_half = n_persons // 2 + + logger.info("\nCPS vs PUF half comparison (n=%d per half):", n_half) + logger.info( + "%-45s %12s %12s %12s %12s", + "Variable", + "CPS mean", + "PUF mean", + "CPS >0 pct", + "PUF >0 pct", + ) + logger.info("-" * 95) + + for var in [ + "traditional_401k_contributions", + "roth_401k_contributions", + "traditional_ira_contributions", + "roth_ira_contributions", + "self_employed_pension_contributions", + ]: + try: + vals = sim.calculate(var, year, map_to="person").values + except (ValueError, KeyError): + continue + + cps_vals = vals[:n_half] + puf_vals = vals[n_half:] + + cps_mean = cps_vals.mean() + puf_mean = puf_vals.mean() + cps_pct = (cps_vals > 0).mean() * 100 + puf_pct = (puf_vals > 0).mean() * 100 + + logger.info( + "%-45s $%11.0f $%11.0f %11.1f%% %11.1f%%", + var, + cps_mean, + puf_mean, + cps_pct, + puf_pct, + ) + + return issues + + +def main(): + """Run all validations.""" + sim = load_extended_cps() + + logger.info("\n" + "=" * 60) + logger.info("RETIREMENT CONTRIBUTION IMPUTATION VALIDATION") + logger.info("=" * 60) + + all_issues = [] + + logger.info("\n1. CONSTRAINT CHECKS") + logger.info("-" * 40) + all_issues.extend(validate_constraints(sim)) + + logger.info("\n2. AGGREGATE COMPARISONS") + logger.info("-" * 40) + all_issues.extend(validate_aggregates(sim)) + + logger.info("\n3. CPS vs PUF HALF DISTRIBUTIONS") + logger.info("-" * 40) + all_issues.extend(validate_cps_vs_puf_half(sim)) + + logger.info("\n" + "=" * 60) + if all_issues: + logger.info("ISSUES FOUND: %d", len(all_issues)) + for issue in all_issues: + logger.info(" - %s", issue) + else: + logger.info("ALL CHECKS PASSED") + logger.info("=" * 60) + + return len(all_issues) + + +if __name__ == "__main__": + sys.exit(main())