diff --git a/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md new file mode 100644 index 00000000..2bfa8149 --- /dev/null +++ b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md @@ -0,0 +1 @@ +Reconcile SS sub-components after PUF imputation so they sum to social_security. diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index 81df6209..d5c73785 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -102,6 +102,13 @@ "self_employment_income_would_be_qualified", ] +SS_SUBCOMPONENTS = [ + "social_security_retirement", + "social_security_disability", + "social_security_survivors", + "social_security_dependents", +] + OVERRIDDEN_IMPUTED_VARIABLES = [ "partnership_s_corp_income", "interest_deduction", @@ -155,6 +162,210 @@ ] +MINIMUM_RETIREMENT_AGE = 62 + +SS_SPLIT_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "is_tax_unit_head", + "is_tax_unit_dependent", +] + +MIN_QRF_TRAINING_RECORDS = 100 + + +def _qrf_ss_shares( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, + puf_has_ss: np.ndarray, +) -> Optional[Dict[str, np.ndarray]]: + """Predict SS sub-component shares via QRF. + + Trains on CPS records that have SS > 0 (where the reason-code + split is known), then predicts shares for all PUF records with + positive SS. The CPS-PUF link is statistical (not identity-based), + so the QRF gives a better expected prediction than using the + paired CPS record's split. + + Args: + data: Dataset dict. + n_cps: Records in CPS half. + time_period: Tax year. + puf_has_ss: Boolean mask (length n_cps) — True where the + PUF half has positive social_security. + + Returns: + Dict mapping sub-component name to predicted share arrays + (length = puf_has_ss.sum()), or None if training data is + insufficient. + """ + from microimpute.models.qrf import QRF + + cps_ss = data["social_security"][time_period][:n_cps] + has_ss = cps_ss > 0 + + if has_ss.sum() < MIN_QRF_TRAINING_RECORDS: + return None + + # Build training features from available predictors. + predictors = [] + train_cols = {} + test_cols = {} + for pred in SS_SPLIT_PREDICTORS: + if pred not in data: + continue + vals = data[pred][time_period][:n_cps] + train_cols[pred] = vals[has_ss].astype(np.float32) + test_cols[pred] = vals[puf_has_ss].astype(np.float32) + predictors.append(pred) + + if not predictors: + return None + + X_train = pd.DataFrame(train_cols) + X_test = pd.DataFrame(test_cols) + + # Training targets: share going to each sub-component (0 or 1). + share_vars = [] + with np.errstate(divide="ignore", invalid="ignore"): + for sub in SS_SUBCOMPONENTS: + if sub not in data: + continue + sub_vals = data[sub][time_period][:n_cps][has_ss] + share_name = sub + "_share" + X_train[share_name] = np.where( + cps_ss[has_ss] > 0, + sub_vals / cps_ss[has_ss], + 0.0, + ) + share_vars.append(share_name) + + if not share_vars: + return None + + qrf = QRF(log_level="WARNING", memory_efficient=True) + try: + fitted = qrf.fit( + X_train=X_train[predictors + share_vars], + predictors=predictors, + imputed_variables=share_vars, + n_jobs=1, + ) + predictions = fitted.predict(X_test=X_test) + except Exception: + logger.warning("QRF SS split failed, falling back to heuristic") + return None + + # Clip to [0, 1] and normalise so shares sum to 1. + shares = {} + total = np.zeros(len(X_test)) + for sub in SS_SUBCOMPONENTS: + key = sub + "_share" + if key in predictions.columns: + s = np.clip(predictions[key].values, 0, 1) + shares[sub] = s + total += s + + for sub in shares: + shares[sub] = np.where(total > 0, shares[sub] / total, 0.0) + + del fitted, predictions + gc.collect() + + logger.info( + "QRF SS split: predicted shares for %d PUF records", + puf_has_ss.sum(), + ) + return shares + + +def _age_heuristic_ss_shares( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, + puf_has_ss: np.ndarray, +) -> Dict[str, np.ndarray]: + """Fallback: assign SS type by age threshold. + + Age >= 62 -> retirement, < 62 -> disability. + If age is unavailable, all go to retirement. + """ + n_pred = puf_has_ss.sum() + shares = {sub: np.zeros(n_pred) for sub in SS_SUBCOMPONENTS} + + age = None + if "age" in data: + age = data["age"][time_period][:n_cps][puf_has_ss] + + if age is not None: + is_old = age >= MINIMUM_RETIREMENT_AGE + if "social_security_retirement" in shares: + shares["social_security_retirement"] = is_old.astype(np.float64) + if "social_security_disability" in shares: + shares["social_security_disability"] = (~is_old).astype(np.float64) + else: + if "social_security_retirement" in shares: + shares["social_security_retirement"] = np.ones(n_pred) + + return shares + + +def reconcile_ss_subcomponents( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, +) -> None: + """Predict SS sub-components for PUF half from demographics. + + The CPS-PUF link is statistical (not identity-based), so the + paired CPS record's sub-component split is just one noisy draw. + A QRF trained on all CPS SS recipients gives a better expected + prediction by pooling across the full training set. + + For all PUF records with positive social_security, this function + predicts shares via QRF (falling back to an age heuristic) and + scales them to match the imputed total. PUF records with zero + SS get all sub-components cleared to zero. + + Modifies ``data`` in place. Only the PUF half (indices + n_cps .. 2*n_cps) is changed. + + Args: + data: Dataset dict {variable: {time_period: array}}. + n_cps: Number of records in the CPS half. + time_period: Tax year key into data dicts. + """ + if "social_security" not in data: + return + + puf_ss = data["social_security"][time_period][n_cps:] + puf_has_ss = puf_ss > 0 + + # Predict shares for all PUF records with SS > 0. + shares = None + if puf_has_ss.any(): + shares = _qrf_ss_shares(data, n_cps, time_period, puf_has_ss) + if shares is None: + shares = _age_heuristic_ss_shares( + data, n_cps, time_period, puf_has_ss + ) + + for sub in SS_SUBCOMPONENTS: + if sub not in data: + continue + arr = data[sub][time_period] + + new_puf = np.zeros(n_cps) + if puf_has_ss.any() and shares is not None: + share = shares.get(sub, np.zeros(puf_has_ss.sum())) + new_puf[puf_has_ss] = puf_ss[puf_has_ss] * share + + arr[n_cps:] = new_puf.astype(arr.dtype) + data[sub][time_period] = arr + + def puf_clone_dataset( data: Dict[str, Dict[int, np.ndarray]], state_fips: np.ndarray, @@ -271,6 +482,9 @@ def _map_to_entity(pred_values, variable_name): if cps_sim is not None: del cps_sim + # Ensure SS sub-components match the (possibly imputed) total. + reconcile_ss_subcomponents(new_data, person_count, time_period) + logger.info( "PUF clone complete: %d -> %d households", n_households, diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 4b427a39..ac8b6e45 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -50,8 +50,42 @@ def generate(self): dataset_path=str(self.cps.file_path), ) + new_data = self._drop_formula_variables(new_data) self.save_dataset(new_data) + # Variables with formulas that must still be stored (e.g. IDs + # needed by the dataset loader before formulas can run). + _KEEP_FORMULA_VARS = {"person_id"} + + @classmethod + def _drop_formula_variables(cls, data): + """Remove variables that are computed by policyengine-us. + + Variables with formulas, ``adds``, or ``subtracts`` are + recomputed by the simulation engine, so storing them wastes + space and can mislead validation. + """ + from policyengine_us import CountryTaxBenefitSystem + + tbs = CountryTaxBenefitSystem() + formula_vars = { + name + for name, var in tbs.variables.items() + if (hasattr(var, "formulas") and len(var.formulas) > 0) + or getattr(var, "adds", None) + or getattr(var, "subtracts", None) + } - cls._KEEP_FORMULA_VARS + dropped = sorted(set(data.keys()) & formula_vars) + if dropped: + logger.info( + "Dropping %d formula variables: %s", + len(dropped), + dropped, + ) + for var in dropped: + del data[var] + return data + class ExtendedCPS_2024(ExtendedCPS): cps = CPS_2024_Full diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py new file mode 100644 index 00000000..fcdcf763 --- /dev/null +++ b/policyengine_us_data/tests/test_puf_impute.py @@ -0,0 +1,366 @@ +"""Tests for PUF imputation, specifically SS sub-component reconciliation. + +When PUF imputation replaces social_security values, the sub-components +(retirement, disability, survivors, dependents) must be predicted from +demographics (not copied from the statistically-linked CPS record). +See: https://github.com/PolicyEngine/policyengine-us-data/issues/551 +""" + +import numpy as np +import pytest + +from policyengine_us_data.calibration.puf_impute import ( + MINIMUM_RETIREMENT_AGE, + _age_heuristic_ss_shares, + _qrf_ss_shares, + reconcile_ss_subcomponents, +) + +SS_SUBS = [ + "social_security_retirement", + "social_security_disability", + "social_security_survivors", + "social_security_dependents", +] + + +def _make_data( + orig_ss, + orig_ret, + orig_dis, + orig_surv, + orig_dep, + imputed_ss, + age=None, + is_male=None, +): + """Build a doubled data dict (CPS half + PUF half).""" + n = len(orig_ss) + tp = 2024 + data = { + "social_security": { + tp: np.concatenate([orig_ss, imputed_ss]).astype(np.float32) + }, + "social_security_retirement": { + tp: np.concatenate([orig_ret, orig_ret]).astype(np.float32) + }, + "social_security_disability": { + tp: np.concatenate([orig_dis, orig_dis]).astype(np.float32) + }, + "social_security_survivors": { + tp: np.concatenate([orig_surv, orig_surv]).astype(np.float32) + }, + "social_security_dependents": { + tp: np.concatenate([orig_dep, orig_dep]).astype(np.float32) + }, + } + if age is not None: + data["age"] = {tp: np.concatenate([age, age]).astype(np.float32)} + if is_male is not None: + data["is_male"] = { + tp: np.concatenate([is_male, is_male]).astype(np.float32) + } + return data, n, tp + + +class TestReconcileSsSubcomponents: + """Sub-components must sum to social_security after reconciliation.""" + + def test_subs_sum_to_total(self): + """PUF subs must sum to imputed SS total.""" + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=np.array([14000, 0]), + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + age=np.array([70, 45]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ss = data["social_security"][tp][puf] + total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) + np.testing.assert_allclose(total_subs, ss, rtol=1e-5) + + def test_age_determines_split(self): + """Age heuristic: >= 62 -> retirement, < 62 -> disability.""" + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=np.array([14000, 0]), + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + age=np.array([70, 45]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ret = data["social_security_retirement"][tp][puf] + dis = data["social_security_disability"][tp][puf] + # Person 0 is 70 -> retirement + assert ret[0] == pytest.approx(30000) + assert dis[0] == pytest.approx(0) + # Person 1 is 45 -> disability + assert ret[1] == pytest.approx(0) + assert dis[1] == pytest.approx(10000) + + def test_imputed_zero_clears_subs(self): + """When PUF imputes zero SS, all subs should be zero.""" + data, n, tp = _make_data( + orig_ss=np.array([20000]), + orig_ret=np.array([20000]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([0]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + for sub in SS_SUBS: + assert data[sub][tp][puf][0] == pytest.approx(0) + + def test_cps_half_unchanged(self): + """CPS half should not be modified.""" + orig_ret = np.array([14000, 0]) + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=orig_ret, + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + ) + reconcile_ss_subcomponents(data, n, tp) + + cps = slice(0, n) + np.testing.assert_array_equal( + data["social_security_retirement"][tp][cps], + orig_ret, + ) + + def test_missing_subcomponent_is_skipped(self): + """If a sub-component is absent from data, no error.""" + data, n, tp = _make_data( + orig_ss=np.array([10000]), + orig_ret=np.array([10000]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([15000]), + ) + del data["social_security_survivors"] + del data["social_security_dependents"] + reconcile_ss_subcomponents(data, n, tp) + + def test_no_social_security_is_noop(self): + """If social_security is absent, function is a no-op.""" + data = {"some_var": {2024: np.array([1, 2])}} + reconcile_ss_subcomponents(data, 1, 2024) + + def test_subs_sum_to_total_mixed_cases(self): + """Comprehensive: subs sum to total across mixed cases.""" + data, n, tp = _make_data( + # Person 0: has CPS SS, has PUF SS (old) + # Person 1: no CPS SS, has PUF SS (old) + # Person 2: no CPS SS, has PUF SS (young) + # Person 3: has CPS SS, PUF zeroed out + orig_ss=np.array([20000, 0, 0, 15000]), + orig_ret=np.array([20000, 0, 0, 15000]), + orig_dis=np.array([0, 0, 0, 0]), + orig_surv=np.array([0, 0, 0, 0]), + orig_dep=np.array([0, 0, 0, 0]), + imputed_ss=np.array([25000, 18000, 12000, 0]), + age=np.array([70, 68, 40, 75]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ss = data["social_security"][tp][puf] + total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) + np.testing.assert_allclose(total_subs, ss, rtol=1e-5) + + +class TestAgeHeuristicSsShares: + """Age-based fallback for SS type classification.""" + + def test_elderly_gets_retirement(self): + """Age >= 62 -> retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + age=np.array([70]), + ) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + assert shares["social_security_retirement"][0] == pytest.approx(1.0) + assert shares["social_security_disability"][0] == pytest.approx(0.0) + + def test_young_gets_disability(self): + """Age < 62 -> disability.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([18000]), + age=np.array([45]), + ) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + assert shares["social_security_retirement"][0] == pytest.approx(0.0) + assert shares["social_security_disability"][0] == pytest.approx(1.0) + + def test_no_age_defaults_to_retirement(self): + """Without age data, default to retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + ) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + assert shares["social_security_retirement"][0] == pytest.approx(1.0) + + def test_mixed_ages(self): + """Multiple people split correctly by age threshold.""" + data, n, tp = _make_data( + orig_ss=np.array([0, 0, 0]), + orig_ret=np.array([0, 0, 0]), + orig_dis=np.array([0, 0, 0]), + orig_surv=np.array([0, 0, 0]), + orig_dep=np.array([0, 0, 0]), + imputed_ss=np.array([1, 1, 1]), + age=np.array([30, 62, 80]), + ) + puf_has_ss = np.array([True, True, True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + # age 30 -> disability + assert shares["social_security_retirement"][0] == 0 + assert shares["social_security_disability"][0] == 1 + # age 62 -> retirement (>= threshold) + assert shares["social_security_retirement"][1] == 1 + assert shares["social_security_disability"][1] == 0 + # age 80 -> retirement + assert shares["social_security_retirement"][2] == 1 + + +class TestQrfSsShares: + """QRF-based SS sub-component share prediction.""" + + def test_returns_none_with_few_training_records(self): + """Returns None when < MIN_QRF_TRAINING_RECORDS have SS.""" + # Only 2 training records with SS > 0. + data, n, tp = _make_data( + orig_ss=np.array([10000, 5000, 0]), + orig_ret=np.array([10000, 0, 0]), + orig_dis=np.array([0, 5000, 0]), + orig_surv=np.array([0, 0, 0]), + orig_dep=np.array([0, 0, 0]), + imputed_ss=np.array([0, 0, 20000]), + age=np.array([70, 45, 55]), + ) + puf_has_ss = np.array([False, False, True]) + result = _qrf_ss_shares(data, n, tp, puf_has_ss) + assert result is None + + def test_shares_sum_to_one(self): + """QRF-predicted shares should sum to ~1 after normalisation.""" + pytest.importorskip("microimpute") + rng = np.random.default_rng(42) + n = 500 + + # Synthetic training: age > 62 mostly retirement, + # age < 62 mostly disability. + ages = rng.integers(20, 90, size=n).astype(np.float32) + is_male = rng.integers(0, 2, size=n).astype(np.float32) + ss_vals = rng.uniform(5000, 40000, size=n).astype(np.float32) + ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32) + dis = np.where(ages < 62, ss_vals, 0).astype(np.float32) + surv = np.zeros(n, dtype=np.float32) + dep = np.zeros(n, dtype=np.float32) + + # PUF SS for all records. + puf_ss = rng.uniform(5000, 40000, size=n).astype(np.float32) + + data, nn, tp = _make_data( + orig_ss=ss_vals, + orig_ret=ret, + orig_dis=dis, + orig_surv=surv, + orig_dep=dep, + imputed_ss=puf_ss, + age=ages, + is_male=is_male, + ) + + puf_has_ss = puf_ss > 0 + shares = _qrf_ss_shares(data, nn, tp, puf_has_ss) + + assert shares is not None + total = sum(shares.get(sub, 0) for sub in SS_SUBS) + np.testing.assert_allclose(total, 1.0, atol=0.01) + + def test_elderly_predicted_as_retirement(self): + """QRF should mostly predict retirement for age >= 62.""" + pytest.importorskip("microimpute") + rng = np.random.default_rng(123) + n_train = 500 + + ages = rng.integers(20, 90, size=n_train).astype(np.float32) + ss_vals = rng.uniform(5000, 40000, size=n_train).astype(np.float32) + ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32) + dis = np.where(ages < 62, ss_vals, 0).astype(np.float32) + surv = np.zeros(n_train, dtype=np.float32) + dep = np.zeros(n_train, dtype=np.float32) + + # Only elderly records get PUF SS > 0; training records + # get PUF SS = 0 so puf_has_ss selects just the elderly. + n_pred = 10 + pred_ages = np.full(n_pred, 70, dtype=np.float32) + + all_ages = np.concatenate([ages, pred_ages]) + all_ss = np.concatenate([ss_vals, np.zeros(n_pred)]) + all_ret = np.concatenate([ret, np.zeros(n_pred)]) + all_dis = np.concatenate([dis, np.zeros(n_pred)]) + all_surv = np.concatenate([surv, np.zeros(n_pred)]) + all_dep = np.concatenate([dep, np.zeros(n_pred)]) + puf_ss = np.concatenate( + [ + np.zeros(n_train, dtype=np.float32), + np.full(n_pred, 20000, dtype=np.float32), + ] + ) + + data, nn, tp = _make_data( + orig_ss=all_ss, + orig_ret=all_ret, + orig_dis=all_dis, + orig_surv=all_surv, + orig_dep=all_dep, + imputed_ss=puf_ss, + age=all_ages, + ) + + puf_has_ss = data["social_security"][tp][nn:] > 0 + shares = _qrf_ss_shares(data, nn, tp, puf_has_ss) + + assert shares is not None + ret_share = shares["social_security_retirement"] + # All elderly -> retirement share should dominate. + assert np.mean(ret_share) > 0.7 diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py new file mode 100644 index 00000000..9334a5c7 --- /dev/null +++ b/tests/test_no_formula_variables_stored.py @@ -0,0 +1,183 @@ +"""Test dataset variable integrity. + +1. Variables with formulas in policyengine-us will be recomputed by the + simulation engine, ignoring any stored value. Storing them wastes + space and can mislead during validation. + +2. Stored input values should match what the simulation actually uses, + to catch issues like set_input_divide_by_period silently altering + values, or sub-components not summing to their parent total. +""" + +import h5py +import numpy as np +import pytest +from policyengine_us_data.datasets.cps.extended_cps import ExtendedCPS_2024 + +KNOWN_FORMULA_EXCEPTIONS = { + # person_id is stored for identity tracking even though it has a + # trivial formula (arange). Safe to keep. + "person_id", +} + + +@pytest.fixture(scope="module") +def tax_benefit_system(): + from policyengine_us import CountryTaxBenefitSystem + + return CountryTaxBenefitSystem() + + +@pytest.fixture(scope="module") +def formula_variables(tax_benefit_system): + """Return set of variable names computed by policyengine-us. + + Includes variables with explicit formulas as well as those using + ``adds`` or ``subtracts`` (which the engine auto-sums at runtime). + """ + return { + name + for name, var in tax_benefit_system.variables.items() + if (hasattr(var, "formulas") and len(var.formulas) > 0) + or getattr(var, "adds", None) + or getattr(var, "subtracts", None) + } + + +@pytest.fixture(scope="module") +def dataset_path(): + path = ExtendedCPS_2024.file_path + if not path.exists(): + pytest.skip("Extended CPS 2024 not built locally") + return path + + +@pytest.fixture(scope="module") +def stored_variables(dataset_path): + """Return set of variable names stored in the extended CPS.""" + with h5py.File(dataset_path, "r") as f: + return set(f.keys()) + + +@pytest.fixture(scope="module") +def sim(): + from policyengine_us import Microsimulation + + return Microsimulation(dataset=ExtendedCPS_2024) + + +def test_no_formula_variables_stored(formula_variables, stored_variables): + """Computed variables should not be stored in the dataset.""" + overlap = (stored_variables & formula_variables) - KNOWN_FORMULA_EXCEPTIONS + if overlap: + msg = ( + f"These {len(overlap)} variables are computed by " + f"policyengine-us (formulas/adds/subtracts) but are " + f"stored in the dataset " + f"(stored values will be IGNORED by the simulation):\n" + ) + for v in sorted(overlap): + msg += f" - {v}\n" + pytest.fail(msg) + + +def test_stored_values_match_computed( + sim, stored_variables, formula_variables, dataset_path +): + """For input-only variables, stored values should match what the + simulation returns (catches set_input issues, rounding, etc.).""" + input_vars = stored_variables - formula_variables + mismatches = [] + + with h5py.File(dataset_path, "r") as f: + for var_name in sorted(input_vars): + if var_name not in f or "2024" not in f[var_name]: + continue + stored = f[var_name]["2024"][...] + if stored.dtype.kind not in ("f", "i", "u"): + continue + + stored_f = stored.astype(float) + stored_total = np.sum(stored_f) + if abs(stored_total) < 1: + continue + + try: + computed = np.array(sim.calculate(var_name, 2024)) + except Exception: + continue + + computed_total = np.sum(computed.astype(float)) + if abs(stored_total) > 0: + pct_diff = ( + abs(stored_total - computed_total) + / abs(stored_total) + * 100 + ) + else: + pct_diff = 0 + + if pct_diff > 1: + mismatches.append( + f" {var_name}: stored=${stored_total:,.0f}, " + f"computed=${computed_total:,.0f}, " + f"diff={pct_diff:.1f}%" + ) + + if mismatches: + msg = ( + f"These {len(mismatches)} input variables have >1% " + f"difference between stored and computed values:\n" + ) + msg += "\n".join(mismatches) + pytest.fail(msg) + + +def test_ss_subcomponents_sum_to_computed_total(sim, dataset_path): + """Social Security sub-components should sum to the computed total. + + ``social_security`` is computed via ``adds`` in policyengine-us and + is NOT stored in the dataset. We verify that the sub-components + stored in the dataset sum to the simulation's computed total. + """ + with h5py.File(dataset_path, "r") as f: + ss_retirement = f["social_security_retirement"]["2024"][...].astype( + float + ) + ss_disability = f["social_security_disability"]["2024"][...].astype( + float + ) + ss_survivors = f["social_security_survivors"]["2024"][...].astype( + float + ) + ss_dependents = f["social_security_dependents"]["2024"][...].astype( + float + ) + + sub_sum = ss_retirement + ss_disability + ss_survivors + ss_dependents + computed_total = np.array(sim.calculate("social_security", 2024)).astype( + float + ) + + # Only check records that have any SS income + has_ss = computed_total > 0 + if not np.any(has_ss): + pytest.skip("No SS recipients in dataset") + + total_computed = np.sum(computed_total[has_ss]) + total_sub = np.sum(sub_sum[has_ss]) + pct_diff = abs(total_computed - total_sub) / total_computed * 100 + + assert pct_diff < 1, ( + f"SS sub-components don't sum to computed total: " + f"computed=${total_computed:,.0f}, sub_sum=${total_sub:,.0f}, " + f"diff={pct_diff:.1f}%" + ) + + # Per-person check: no person's sub-components should exceed total + excess = sub_sum[has_ss] - computed_total[has_ss] + n_excess = np.sum(excess > 1) + assert n_excess == 0, ( + f"{n_excess} people have SS sub-components exceeding " + f"their computed total SS income" + ) diff --git a/uv.lock b/uv.lock index fc61e96d..09a004ac 100644 --- a/uv.lock +++ b/uv.lock @@ -1842,7 +1842,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.570.7" +version = "1.584.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1850,9 +1850,9 @@ dependencies = [ { name = "policyengine-core" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6a/eb/291b3085aa0fa97fcce4987d54991118f21aead49647b3f475998459f46b/policyengine_us-1.570.7.tar.gz", hash = "sha256:a2967af86a61468a0bdb6b2dc7af2fd0bb0f0064203fa557b6fee8023058360a", size = 8668680, upload-time = "2026-02-19T07:17:11.264Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/29/a4517eac6dedc1d6c1a17b0d105b9aa41ca41b877e6ecef39362ecd0438c/policyengine_us-1.584.0.tar.gz", hash = "sha256:9bb830765ece943b11939822da94d64a26c802c941b8f929a354354394a2ef7e", size = 8701324, upload-time = "2026-02-24T15:05:04.474Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/36/0213955310076e4dec2781baeabf96b6d6937f99cd19c373b363bfbd7152/policyengine_us-1.570.7-py3-none-any.whl", hash = "sha256:374fd5357d6cb3734b900bd08dfdb61760dfc5b913ed686f57a40239565b0edd", size = 7825404, upload-time = "2026-02-19T07:17:08.877Z" }, + { url = "https://files.pythonhosted.org/packages/17/f4/b8224384de676662858a079bc3138141a072ad886266e1e2d28a3108ad9d/policyengine_us-1.584.0-py3-none-any.whl", hash = "sha256:55c97c9e63c8ba18b895b74076d893de720cd44089398a62f0d02cc7afc8d53c", size = 7933593, upload-time = "2026-02-24T15:05:01.846Z" }, ] [[package]]