diff --git a/changelog.d/changed/553.md b/changelog.d/changed/553.md new file mode 100644 index 000000000..6c370943f --- /dev/null +++ b/changelog.d/changed/553.md @@ -0,0 +1 @@ +Fix and add retirement contribution calibration targets: correct traditional IRA from $25B to $13.2B (SOI 1304), split 401(k) into traditional $482.7B and Roth $85.2B (BEA/FRED × Vanguard share), add Roth IRA at $35.0B (SOI Tables 5 & 6), add self-employed pension ALD at $29.5B (SOI 1304). diff --git a/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md new file mode 100644 index 000000000..2bfa81492 --- /dev/null +++ b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md @@ -0,0 +1 @@ +Reconcile SS sub-components after PUF imputation so they sum to social_security. diff --git a/conftest.py b/conftest.py new file mode 100644 index 000000000..84ce8ac4b --- /dev/null +++ b/conftest.py @@ -0,0 +1,14 @@ +"""Root conftest: mock optional dependencies before collection. + +microimpute requires Python >= 3.12, so mock it for test +environments running an older interpreter. 
+""" + +import sys +from unittest.mock import MagicMock + +if "microimpute" not in sys.modules: + _mock = MagicMock() + sys.modules["microimpute"] = _mock + sys.modules["microimpute.models"] = _mock + sys.modules["microimpute.models.qrf"] = _mock diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index 81df62095..2158f001b 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -74,7 +74,6 @@ "non_sch_d_capital_gains", "general_business_credit", "energy_efficient_home_improvement_credit", - "traditional_ira_contributions", "amt_foreign_tax_credit", "excess_withheld_payroll_tax", "savers_credit", @@ -102,6 +101,13 @@ "self_employment_income_would_be_qualified", ] +SS_SUBCOMPONENTS = [ + "social_security_retirement", + "social_security_disability", + "social_security_survivors", + "social_security_dependents", +] + OVERRIDDEN_IMPUTED_VARIABLES = [ "partnership_s_corp_income", "interest_deduction", @@ -154,6 +160,290 @@ "rental_income_would_be_qualified", ] +CPS_RETIREMENT_VARIABLES = [ + "traditional_401k_contributions", + "roth_401k_contributions", + "traditional_ira_contributions", + "roth_ira_contributions", + "self_employed_pension_contributions", +] + +RETIREMENT_DEMOGRAPHIC_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "tax_unit_count_dependents", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", +] + +# Income predictors sourced from PUF imputations on the test side. +RETIREMENT_INCOME_PREDICTORS = [ + "employment_income", + "self_employment_income", + "taxable_interest_income", + "qualified_dividend_income", + "taxable_pension_income", + "social_security", +] + +RETIREMENT_PREDICTORS = ( + RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS +) + + +def _get_retirement_limits(year: int) -> dict: + """Return contribution limits for the given tax year. 
+ + Mirrors the limits in cps.py (duplicated here to avoid circular + imports between the calibration and dataset packages). + """ + limits_by_year = { + 2020: { + "401k": 19_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2021: { + "401k": 19_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2022: { + "401k": 20_500, + "401k_catch_up": 6_500, + "ira": 6_000, + "ira_catch_up": 1_000, + }, + 2023: { + "401k": 22_500, + "401k_catch_up": 7_500, + "ira": 6_500, + "ira_catch_up": 1_000, + }, + 2024: { + "401k": 23_000, + "401k_catch_up": 7_500, + "ira": 7_000, + "ira_catch_up": 1_000, + }, + 2025: { + "401k": 23_500, + "401k_catch_up": 7_500, + "ira": 7_000, + "ira_catch_up": 1_000, + }, + } + clamped = max(min(year, max(limits_by_year)), min(limits_by_year)) + return limits_by_year[clamped] + + +MINIMUM_RETIREMENT_AGE = 62 + +SS_SPLIT_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "is_tax_unit_head", + "is_tax_unit_dependent", +] + +MIN_QRF_TRAINING_RECORDS = 100 + + +def _qrf_ss_shares( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, + puf_has_ss: np.ndarray, +) -> Optional[Dict[str, np.ndarray]]: + """Predict SS sub-component shares via QRF. + + Trains on CPS records that have SS > 0 (where the reason-code + split is known), then predicts shares for all PUF records with + positive SS. The CPS-PUF link is statistical (not identity-based), + so the QRF gives a better expected prediction than using the + paired CPS record's split. + + Args: + data: Dataset dict. + n_cps: Records in CPS half. + time_period: Tax year. + puf_has_ss: Boolean mask (length n_cps) — True where the + PUF half has positive social_security. + + Returns: + Dict mapping sub-component name to predicted share arrays + (length = puf_has_ss.sum()), or None if training data is + insufficient. 
+ """ + from microimpute.models.qrf import QRF + + cps_ss = data["social_security"][time_period][:n_cps] + has_ss = cps_ss > 0 + + if has_ss.sum() < MIN_QRF_TRAINING_RECORDS: + return None + + # Build training features from available predictors. + predictors = [] + train_cols = {} + test_cols = {} + for pred in SS_SPLIT_PREDICTORS: + if pred not in data: + continue + vals = data[pred][time_period][:n_cps] + train_cols[pred] = vals[has_ss].astype(np.float32) + test_cols[pred] = vals[puf_has_ss].astype(np.float32) + predictors.append(pred) + + if not predictors: + return None + + X_train = pd.DataFrame(train_cols) + X_test = pd.DataFrame(test_cols) + + # Training targets: share going to each sub-component (0 or 1). + share_vars = [] + with np.errstate(divide="ignore", invalid="ignore"): + for sub in SS_SUBCOMPONENTS: + if sub not in data: + continue + sub_vals = data[sub][time_period][:n_cps][has_ss] + share_name = sub + "_share" + X_train[share_name] = np.where( + cps_ss[has_ss] > 0, + sub_vals / cps_ss[has_ss], + 0.0, + ) + share_vars.append(share_name) + + if not share_vars: + return None + + qrf = QRF(log_level="WARNING", memory_efficient=True) + try: + fitted = qrf.fit( + X_train=X_train[predictors + share_vars], + predictors=predictors, + imputed_variables=share_vars, + n_jobs=1, + ) + predictions = fitted.predict(X_test=X_test) + except Exception: + logger.warning("QRF SS split failed, falling back to heuristic") + return None + + # Clip to [0, 1] and normalise so shares sum to 1. 
+ shares = {} + total = np.zeros(len(X_test)) + for sub in SS_SUBCOMPONENTS: + key = sub + "_share" + if key in predictions.columns: + s = np.clip(predictions[key].values, 0, 1) + shares[sub] = s + total += s + + for sub in shares: + shares[sub] = np.where(total > 0, shares[sub] / total, 0.0) + + del fitted, predictions + gc.collect() + + logger.info( + "QRF SS split: predicted shares for %d PUF records", + puf_has_ss.sum(), + ) + return shares + + +def _age_heuristic_ss_shares( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, + puf_has_ss: np.ndarray, +) -> Dict[str, np.ndarray]: + """Fallback: assign SS type by age threshold. + + Age >= 62 -> retirement, < 62 -> disability. + If age is unavailable, all go to retirement. + """ + n_pred = puf_has_ss.sum() + shares = {sub: np.zeros(n_pred) for sub in SS_SUBCOMPONENTS} + + age = None + if "age" in data: + age = data["age"][time_period][:n_cps][puf_has_ss] + + if age is not None: + is_old = age >= MINIMUM_RETIREMENT_AGE + if "social_security_retirement" in shares: + shares["social_security_retirement"] = is_old.astype(np.float64) + if "social_security_disability" in shares: + shares["social_security_disability"] = (~is_old).astype(np.float64) + else: + if "social_security_retirement" in shares: + shares["social_security_retirement"] = np.ones(n_pred) + + return shares + + +def reconcile_ss_subcomponents( + data: Dict[str, Dict[int, np.ndarray]], + n_cps: int, + time_period: int, +) -> None: + """Predict SS sub-components for PUF half from demographics. + + The CPS-PUF link is statistical (not identity-based), so the + paired CPS record's sub-component split is just one noisy draw. + A QRF trained on all CPS SS recipients gives a better expected + prediction by pooling across the full training set. + + For all PUF records with positive social_security, this function + predicts shares via QRF (falling back to an age heuristic) and + scales them to match the imputed total. 
PUF records with zero + SS get all sub-components cleared to zero. + + Modifies ``data`` in place. Only the PUF half (indices + n_cps .. 2*n_cps) is changed. + + Args: + data: Dataset dict {variable: {time_period: array}}. + n_cps: Number of records in the CPS half. + time_period: Tax year key into data dicts. + """ + if "social_security" not in data: + return + + puf_ss = data["social_security"][time_period][n_cps:] + puf_has_ss = puf_ss > 0 + + # Predict shares for all PUF records with SS > 0. + shares = None + if puf_has_ss.any(): + shares = _qrf_ss_shares(data, n_cps, time_period, puf_has_ss) + if shares is None: + shares = _age_heuristic_ss_shares( + data, n_cps, time_period, puf_has_ss + ) + + for sub in SS_SUBCOMPONENTS: + if sub not in data: + continue + arr = data[sub][time_period] + + new_puf = np.zeros(n_cps) + if puf_has_ss.any() and shares is not None: + share = shares.get(sub, np.zeros(puf_has_ss.sum())) + new_puf[puf_has_ss] = puf_ss[puf_has_ss] * share + + arr[n_cps:] = new_puf.astype(arr.dtype) + data[sub][time_period] = arr + def puf_clone_dataset( data: Dict[str, Dict[int, np.ndarray]], @@ -230,6 +520,13 @@ def _map_to_entity(pred_values, variable_name): data, y_full, time_period, dataset_path ) + # Impute retirement contributions for PUF half + puf_retirement = None + if y_full is not None and dataset_path is not None: + puf_retirement = _impute_retirement_contributions( + data, y_full, time_period, dataset_path + ) + new_data = {} for variable, time_dict in data.items(): values = time_dict[time_period] @@ -252,6 +549,13 @@ def _map_to_entity(pred_values, variable_name): new_data[variable] = { time_period: np.concatenate([values, puf_weeks]) } + elif ( + variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None + ): + puf_vals = puf_retirement[variable] + new_data[variable] = { + time_period: np.concatenate([values, puf_vals]) + } else: new_data[variable] = { time_period: np.concatenate([values, values]) @@ -271,6 +575,9 @@ def 
_map_to_entity(pred_values, variable_name): if cps_sim is not None: del cps_sim + # Ensure SS sub-components match the (possibly imputed) total. + reconcile_ss_subcomponents(new_data, person_count, time_period) + logger.info( "PUF clone complete: %d -> %d households", n_households, @@ -378,6 +685,116 @@ def _impute_weeks_unemployed( return imputed_weeks +def _impute_retirement_contributions( + data: Dict[str, Dict[int, np.ndarray]], + puf_imputations: Dict[str, np.ndarray], + time_period: int, + dataset_path: str, +) -> Dict[str, np.ndarray]: + """Impute retirement contributions for the PUF half using QRF. + + Trains on CPS data (which has realistic income-to-contribution + relationships) and predicts onto PUF clone records using + PUF-imputed income as input features. + + Args: + data: CPS data dict. + puf_imputations: Dict of PUF-imputed variable arrays. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Dict mapping retirement variable names to imputed arrays. 
+ """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + cps_sim = Microsimulation(dataset=dataset_path) + + # Build training data from CPS (has realistic relationships) + train_cols = RETIREMENT_PREDICTORS + CPS_RETIREMENT_VARIABLES + try: + X_train = cps_sim.calculate_dataframe(train_cols) + except (ValueError, KeyError) as e: + logger.warning("Could not build retirement training data: %s", e) + n_persons = len(data["person_id"][time_period]) + del cps_sim + return {var: np.zeros(n_persons) for var in CPS_RETIREMENT_VARIABLES} + + # Build test data: demographics from CPS sim, income from PUF + X_test = cps_sim.calculate_dataframe(RETIREMENT_DEMOGRAPHIC_PREDICTORS) + for income_var in RETIREMENT_INCOME_PREDICTORS: + if income_var in puf_imputations: + X_test[income_var] = puf_imputations[income_var] + else: + X_test[income_var] = cps_sim.calculate(income_var).values + + del cps_sim + + # Subsample training data + if len(X_train) > 5000: + X_train_sampled = X_train.sample(n=5000, random_state=42) + else: + X_train_sampled = X_train + + # Train QRF + qrf = QRF(log_level="INFO", memory_efficient=True) + fitted = qrf.fit( + X_train=X_train_sampled, + predictors=RETIREMENT_PREDICTORS, + imputed_variables=CPS_RETIREMENT_VARIABLES, + n_jobs=1, + ) + predictions = fitted.predict(X_test=X_test) + + # Extract results and apply constraints + limits = _get_retirement_limits(time_period) + age = X_test["age"].values + catch_up_eligible = age >= 50 + limit_401k = limits["401k"] + catch_up_eligible * limits["401k_catch_up"] + limit_ira = limits["ira"] + catch_up_eligible * limits["ira_catch_up"] + + emp_income = X_test["employment_income"].values + se_income = X_test["self_employment_income"].values + + result = {} + for var in CPS_RETIREMENT_VARIABLES: + vals = predictions[var].values + + # Non-negativity + vals = np.maximum(vals, 0) + + # Cap 401k at year-specific limit + if "401k" in var: + vals = np.minimum(vals, limit_401k) + # Zero 
out for records with no employment income + vals = np.where(emp_income > 0, vals, 0) + + # Cap IRA at year-specific limit + if "ira" in var: + vals = np.minimum(vals, limit_ira) + + # Zero out SE pension for records with no SE income + if var == "self_employed_pension_contributions": + vals = np.where(se_income > 0, vals, 0) + + result[var] = vals + + logger.info( + "Imputed retirement contributions for PUF: " + "401k mean=$%.0f, IRA mean=$%.0f, SE pension mean=$%.0f", + result["traditional_401k_contributions"].mean() + + result["roth_401k_contributions"].mean(), + result["traditional_ira_contributions"].mean() + + result["roth_ira_contributions"].mean(), + result["self_employed_pension_contributions"].mean(), + ) + + del fitted, predictions + gc.collect() + return result + + def _run_qrf_imputation( data: Dict[str, Dict[int, np.ndarray]], time_period: int, diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 76e55d4ac..8db414ef8 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -617,19 +617,21 @@ def add_personal_income_variables( # They could also include General Assistance. cps["tanf_reported"] = person.PAW_VAL cps["ssi_reported"] = person.SSI_VAL - # Assume all retirement contributions are traditional 401(k) for now. - # Procedure for allocating retirement contributions: - # 1) If they report any self-employment income, allocate entirely to - # self-employed pension contributions. - # 2) If they report any wage and salary income, allocate in this order: - # a) Traditional 401(k) contributions up to to limit - # b) Roth 401(k) contributions up to the limit - # c) IRA contributions up to the limit, split according to administrative fractions - # d) Other retirement contributions - # Disregard reported pension contributions from people who report neither wage and salary - # nor self-employment income. - # Assume no 403(b) or 457 contributions for now. 
- # IRS retirement contribution limits by year. + # Allocate CPS RETCB_VAL (a single bundled retirement contribution + # total) into account-type-specific variables using a proportional + # split based on administrative data. + # + # RETCB_VAL does not distinguish account type (Census only asks + # "how much did you contribute to retirement accounts?"). The old + # sequential waterfall gave 401(k) first priority, which consumed + # nearly all of RETCB_VAL and left IRA contributions at $0. + # + # The proportional approach uses BEA/FRED and IRS SOI shares to + # split contributions into DC (401k) and IRA pools, then splits + # each pool into traditional/Roth using administrative fractions. + # See imputation_parameters.yaml for sources. + # + # Contribution limits by year. RETIREMENT_LIMITS = { 2020: { "401k": 19_500, @@ -668,7 +670,6 @@ def add_personal_income_variables( "ira_catch_up": 1_000, }, } - # Clamp to the nearest available year for out-of-range values. clamped_year = max( min(year, max(RETIREMENT_LIMITS)), min(RETIREMENT_LIMITS), @@ -679,53 +680,43 @@ def add_personal_income_variables( LIMIT_IRA = limits["ira"] LIMIT_IRA_CATCH_UP = limits["ira_catch_up"] CATCH_UP_AGE = 50 - retirement_contributions = person.RETCB_VAL - cps["self_employed_pension_contributions"] = np.where( - person.SEMP_VAL > 0, retirement_contributions, 0 - ) - remaining_retirement_contributions = np.maximum( - retirement_contributions - cps["self_employed_pension_contributions"], - 0, - ) - # Compute the 401(k) limit for the person's age. 
catch_up_eligible = person.A_AGE >= CATCH_UP_AGE limit_401k = LIMIT_401K + catch_up_eligible * LIMIT_401K_CATCH_UP limit_ira = LIMIT_IRA + catch_up_eligible * LIMIT_IRA_CATCH_UP - cps["traditional_401k_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, limit_401k), - 0, - ) - remaining_retirement_contributions = np.maximum( - remaining_retirement_contributions - - cps["traditional_401k_contributions"], - 0, - ) - cps["roth_401k_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, limit_401k), - 0, - ) - remaining_retirement_contributions = np.maximum( - remaining_retirement_contributions - cps["roth_401k_contributions"], - 0, - ) - cps["traditional_ira_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, limit_ira), - 0, - ) - remaining_retirement_contributions = np.maximum( - remaining_retirement_contributions - - cps["traditional_ira_contributions"], - 0, + + retirement_contributions = person.RETCB_VAL + has_wages = person.WSAL_VAL > 0 + has_se = person.SEMP_VAL > 0 + + # 1) Self-employed: allocate entirely to SE pension. + cps["self_employed_pension_contributions"] = np.where( + has_se, retirement_contributions, 0 ) - roth_ira_limit = limit_ira - cps["traditional_ira_contributions"] - cps["roth_ira_contributions"] = np.where( - person.WSAL_VAL > 0, - np.minimum(remaining_retirement_contributions, roth_ira_limit), + remaining = np.maximum( + retirement_contributions - cps["self_employed_pension_contributions"], 0, ) + + # 2) Wage earners: split proportionally into DC and IRA pools. 
+ dc_share = p["dc_share_of_retirement_contributions"] + ira_share = p["ira_share_of_retirement_contributions"] + roth_dc_share = p["roth_share_of_dc_contributions"] + trad_ira_share = p["traditional_share_of_ira_contributions"] + + dc_pool = np.where(has_wages, remaining * dc_share, 0) + ira_pool = np.where(has_wages, remaining * ira_share, 0) + + # DC pool: split into traditional/Roth 401(k), cap at combined + # 401(k) limit. + dc_capped = np.minimum(dc_pool, limit_401k) + cps["traditional_401k_contributions"] = dc_capped * (1 - roth_dc_share) + cps["roth_401k_contributions"] = dc_capped * roth_dc_share + + # IRA pool: split into traditional/Roth IRA, cap at combined + # IRA limit. + ira_capped = np.minimum(ira_pool, limit_ira) + cps["traditional_ira_contributions"] = ira_capped * trad_ira_share + cps["roth_ira_contributions"] = ira_capped * (1 - trad_ira_share) # Allocate capital gains into long-term and short-term based on aggregate split. cps["long_term_capital_gains"] = person.CAP_VAL * ( p["long_term_capgain_fraction"] diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 4b427a399..ac8b6e45c 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -50,8 +50,42 @@ def generate(self): dataset_path=str(self.cps.file_path), ) + new_data = self._drop_formula_variables(new_data) self.save_dataset(new_data) + # Variables with formulas that must still be stored (e.g. IDs + # needed by the dataset loader before formulas can run). + _KEEP_FORMULA_VARS = {"person_id"} + + @classmethod + def _drop_formula_variables(cls, data): + """Remove variables that are computed by policyengine-us. + + Variables with formulas, ``adds``, or ``subtracts`` are + recomputed by the simulation engine, so storing them wastes + space and can mislead validation. 
+ """ + from policyengine_us import CountryTaxBenefitSystem + + tbs = CountryTaxBenefitSystem() + formula_vars = { + name + for name, var in tbs.variables.items() + if (hasattr(var, "formulas") and len(var.formulas) > 0) + or getattr(var, "adds", None) + or getattr(var, "subtracts", None) + } - cls._KEEP_FORMULA_VARS + dropped = sorted(set(data.keys()) & formula_vars) + if dropped: + logger.info( + "Dropping %d formula variables: %s", + len(dropped), + dropped, + ) + for var in dropped: + del data[var] + return data + class ExtendedCPS_2024(ExtendedCPS): cps = CPS_2024_Full diff --git a/policyengine_us_data/datasets/cps/imputation_parameters.yaml b/policyengine_us_data/datasets/cps/imputation_parameters.yaml index 132b52af4..7851661ac 100644 --- a/policyengine_us_data/datasets/cps/imputation_parameters.yaml +++ b/policyengine_us_data/datasets/cps/imputation_parameters.yaml @@ -15,4 +15,33 @@ taxable_ira_distribution_fraction: 1.0 taxable_sep_distribution_fraction: 1.0 # SOI 2012 data -long_term_capgain_fraction: 0.880 +long_term_capgain_fraction: 0.880 + +# Retirement contribution allocation fractions. +# Used to split CPS RETCB_VAL (a single bundled total) into +# account-type-specific variables. +# +# DC vs IRA share of non-SE retirement contributions. +# Employee DC: $567.9B (BEA/FRED Y351RC1A027NBEA minus W351RC0A144NBEA) +# Total IRA: $57.5B (IRS SOI Tables 5 & 6, TY 2022) +# Combined: $625.4B +# https://fred.stlouisfed.org/series/Y351RC1A027NBEA +# https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements +dc_share_of_retirement_contributions: 0.908 +ira_share_of_retirement_contributions: 0.092 + +# Roth share of DC (401k/403b) contributions. +# Vanguard How America Saves 2024: 18% of participants elect Roth. +# PSCA 67th Annual Survey: 21% of participants make Roth contributions. +# Estimated dollar share ~15% (Roth participants contribute slightly +# less on average). 
+# https://corporate.vanguard.com/content/dam/corp/research/pdf/how_america_saves_report_2024.pdf +# https://www.psca.org/news/psca-news/2024/12/401k-savings-and-participation-rates-rise/ +roth_share_of_dc_contributions: 0.15 + +# Traditional vs Roth share of IRA contributions. +# IRS SOI IRA Accumulation Tables 5 & 6 (TY 2022): +# Traditional: $22.5B (4.99M contributors), Roth: $35.0B (10.04M) +# Traditional share: 22.5 / 57.5 = 39.2% +# https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements +traditional_share_of_ira_contributions: 0.392 diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index dc26cca8d..2b78b6d6e 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -237,19 +237,40 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "notes": "~5.8% of total OASDI (spouses/children of retired+disabled)", "year": HARDCODED_YEAR, }, - # IRA contribution totals from IRS SOI accumulation tables + # Retirement contribution targets — see issue #553 { "variable": "traditional_ira_contributions", - "value": 25e9, - "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", - "notes": "Tax year 2022 (~5M x $4,510 avg) uprated ~12% to 2024", + "value": 13.2e9, + "source": "https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income", + "notes": "SOI 1304 Table 1.4 (TY 2022) 'IRA payments' deduction, col 124", + "year": HARDCODED_YEAR, + }, + { + "variable": "traditional_401k_contributions", + "value": 482.7e9, + "source": "https://fred.stlouisfed.org/series/Y351RC1A027NBEA", + "notes": "BEA/FRED employee DC deferrals ($567.9B) x 85% traditional share (Vanguard HAS 2024)", + "year": HARDCODED_YEAR, + }, + { + "variable": "roth_401k_contributions", + "value": 85.2e9, + 
"source": "https://fred.stlouisfed.org/series/Y351RC1A027NBEA", + "notes": "BEA/FRED employee DC deferrals ($567.9B) x 15% Roth share (Vanguard HAS 2024)", + "year": HARDCODED_YEAR, + }, + { + "variable": "self_employed_pension_contribution_ald", + "value": 29.5e9, + "source": "https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income", + "notes": "SOI 1304 Table 1.4 (TY 2022) 'Payments to a Keogh plan', col 116", "year": HARDCODED_YEAR, }, { "variable": "roth_ira_contributions", - "value": 39e9, + "value": 35.0e9, "source": "https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements", - "notes": "Tax year 2022 (~10M x $3,482 avg) uprated ~12% to 2024", + "notes": "IRS SOI IRA Accumulation Tables 5 & 6 (TY 2022), 10.04M contributors", "year": HARDCODED_YEAR, }, ] diff --git a/policyengine_us_data/tests/test_calibration/conftest.py b/policyengine_us_data/tests/test_calibration/conftest.py new file mode 100644 index 000000000..7a1de465f --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/conftest.py @@ -0,0 +1,15 @@ +"""Shared fixtures for calibration tests. + +Mocks microimpute at import time so tests can run without the +package installed (it requires Python >= 3.12). +""" + +import sys +from unittest.mock import MagicMock + +# microimpute is not installable on Python < 3.12. Mock it before +# any test module triggers the cps.py top-level import. 
+if "microimpute" not in sys.modules: + sys.modules["microimpute"] = MagicMock() + sys.modules["microimpute.models"] = MagicMock() + sys.modules["microimpute.models.qrf"] = MagicMock() diff --git a/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py new file mode 100644 index 000000000..1a6172dd6 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_retirement_imputation.py @@ -0,0 +1,613 @@ +"""Tests for retirement contribution QRF imputation. + +Covers: +- Constants & list membership +- _get_retirement_limits() logic +- _impute_retirement_contributions() with mocked QRF/Microsimulation +- puf_clone_dataset() integration routing for retirement variables +""" + +from unittest.mock import MagicMock, patch + +import numpy as np +import pandas as pd +import pytest + +from policyengine_us_data.calibration.puf_impute import ( + CPS_RETIREMENT_VARIABLES, + IMPUTED_VARIABLES, + OVERRIDDEN_IMPUTED_VARIABLES, + RETIREMENT_DEMOGRAPHIC_PREDICTORS, + RETIREMENT_INCOME_PREDICTORS, + RETIREMENT_PREDICTORS, + _get_retirement_limits, + _impute_retirement_contributions, + puf_clone_dataset, +) + +# The function imports Microsimulation and QRF locally via: +# from policyengine_us import Microsimulation +# from microimpute.models.qrf import QRF +# We must patch those source modules so the local import picks +# up mocks, not real objects. 
+_MSIM_PATCH = "policyengine_us.Microsimulation" +_QRF_PATCH = "microimpute.models.qrf.QRF" + + +# ── helpers ────────────────────────────────────────────────────────── + + +def _make_mock_data(n_persons=20, n_households=5, time_period=2024): + """Minimal CPS data dict with retirement variables.""" + rng = np.random.default_rng(42) + person_ids = np.arange(1, n_persons + 1) + hh_ids_person = np.repeat( + np.arange(1, n_households + 1), + n_persons // n_households, + ) + + data = { + "person_id": {time_period: person_ids}, + "household_id": {time_period: np.arange(1, n_households + 1)}, + "tax_unit_id": {time_period: np.arange(1, n_households + 1)}, + "spm_unit_id": {time_period: np.arange(1, n_households + 1)}, + "person_household_id": {time_period: hh_ids_person}, + "person_tax_unit_id": {time_period: hh_ids_person.copy()}, + "person_spm_unit_id": {time_period: hh_ids_person.copy()}, + "age": { + time_period: rng.integers(18, 80, size=n_persons).astype( + np.float32 + ) + }, + "is_male": { + time_period: rng.integers(0, 2, size=n_persons).astype(np.float32) + }, + "household_weight": {time_period: np.ones(n_households) * 1000}, + "employment_income": { + time_period: rng.uniform(0, 100_000, n_persons).astype(np.float32) + }, + "self_employment_income": { + time_period: rng.uniform(0, 50_000, n_persons).astype(np.float32) + }, + } + for var in CPS_RETIREMENT_VARIABLES: + data[var] = { + time_period: rng.uniform(0, 5000, n_persons).astype(np.float32) + } + return data + + +def _make_cps_df(n, rng): + """Build a mock CPS DataFrame with all needed columns.""" + return pd.DataFrame( + { + # Demographics + "age": rng.integers(18, 80, n).astype(float), + "is_male": rng.integers(0, 2, n).astype(float), + "tax_unit_is_joint": rng.integers(0, 2, n).astype(float), + "tax_unit_count_dependents": rng.integers(0, 4, n).astype(float), + "is_tax_unit_head": rng.integers(0, 2, n).astype(float), + "is_tax_unit_spouse": np.zeros(n), + "is_tax_unit_dependent": np.zeros(n), + # Income 
predictors + "employment_income": rng.uniform(0, 100_000, n), + "self_employment_income": rng.uniform(0, 50_000, n), + "taxable_interest_income": rng.uniform(0, 5_000, n), + "qualified_dividend_income": rng.uniform(0, 3_000, n), + "taxable_pension_income": rng.uniform(0, 20_000, n), + "social_security": rng.uniform(0, 15_000, n), + # Targets + "traditional_401k_contributions": rng.uniform(0, 5000, n), + "roth_401k_contributions": rng.uniform(0, 3000, n), + "traditional_ira_contributions": rng.uniform(0, 2000, n), + "roth_ira_contributions": rng.uniform(0, 2000, n), + "self_employed_pension_contributions": rng.uniform(0, 10_000, n), + } + ) + + +def _make_mock_sim(cps_df): + """Build a mock Microsimulation object.""" + sim = MagicMock() + + def calc_df(cols): + return cps_df[cols].copy() + + def calc_single(var): + result = MagicMock() + result.values = cps_df[var].values + return result + + sim.calculate_dataframe = calc_df + sim.calculate = calc_single + return sim + + +def _make_mock_qrf_class(predictions_df): + """Build a mock QRF class whose predict returns predictions_df.""" + mock_cls = MagicMock() + fitted = MagicMock() + fitted.predict.return_value = predictions_df + mock_cls.return_value.fit.return_value = fitted + return mock_cls + + +# ── TestConstants ──────────────────────────────────────────────────── + + +class TestConstants: + def test_retirement_vars_not_in_imputed(self): + """Retirement vars must NOT be in IMPUTED_VARIABLES.""" + for var in CPS_RETIREMENT_VARIABLES: + assert ( + var not in IMPUTED_VARIABLES + ), f"{var} should not be in IMPUTED_VARIABLES" + + def test_retirement_vars_not_in_overridden(self): + for var in CPS_RETIREMENT_VARIABLES: + assert var not in OVERRIDDEN_IMPUTED_VARIABLES + + def test_five_retirement_variables(self): + assert len(CPS_RETIREMENT_VARIABLES) == 5 + + def test_retirement_variable_names(self): + expected = { + "traditional_401k_contributions", + "roth_401k_contributions", + "traditional_ira_contributions", + 
"roth_ira_contributions", + "self_employed_pension_contributions", + } + assert set(CPS_RETIREMENT_VARIABLES) == expected + + def test_retirement_predictors_include_income(self): + for var in RETIREMENT_INCOME_PREDICTORS: + assert var in RETIREMENT_PREDICTORS + + def test_retirement_predictors_include_demographics(self): + for pred in RETIREMENT_DEMOGRAPHIC_PREDICTORS: + assert pred in RETIREMENT_PREDICTORS + + def test_income_predictors_in_imputed_variables(self): + """All income predictors must be available from PUF QRF.""" + for var in RETIREMENT_INCOME_PREDICTORS: + assert ( + var in IMPUTED_VARIABLES + ), f"{var} not in IMPUTED_VARIABLES — won't be in puf_imputations" + + def test_predictors_are_combined_lists(self): + expected = ( + RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS + ) + assert RETIREMENT_PREDICTORS == expected + + +# ── TestGetRetirementLimits ────────────────────────────────────────── + + +class TestGetRetirementLimits: + def test_known_year_2024(self): + lim = _get_retirement_limits(2024) + assert lim["401k"] == 23_000 + assert lim["401k_catch_up"] == 7_500 + assert lim["ira"] == 7_000 + assert lim["ira_catch_up"] == 1_000 + + def test_known_year_2020(self): + lim = _get_retirement_limits(2020) + assert lim["401k"] == 19_500 + assert lim["ira"] == 6_000 + + def test_known_year_2025(self): + lim = _get_retirement_limits(2025) + assert lim["401k"] == 23_500 + + def test_clamps_below_min_year(self): + assert _get_retirement_limits(2015) == _get_retirement_limits(2020) + + def test_clamps_above_max_year(self): + assert _get_retirement_limits(2030) == _get_retirement_limits(2025) + + def test_all_years_have_four_keys(self): + for year in range(2020, 2026): + lim = _get_retirement_limits(year) + assert set(lim.keys()) == { + "401k", + "401k_catch_up", + "ira", + "ira_catch_up", + } + + def test_limits_increase_monotonically(self): + prev = _get_retirement_limits(2020)["401k"] + for year in range(2021, 2026): + cur = 
_get_retirement_limits(year)["401k"] + assert cur >= prev + prev = cur + + def test_ira_limits_increase_monotonically(self): + prev = _get_retirement_limits(2020)["ira"] + for year in range(2021, 2026): + cur = _get_retirement_limits(year)["ira"] + assert cur >= prev + prev = cur + + +# ── TestImputeRetirementContributions ──────────────────────────────── + + +class TestImputeRetirementContributions: + """Tests with mocked Microsimulation and QRF.""" + + @pytest.fixture(autouse=True) + def _setup(self): + self.n = 50 + self.time_period = 2024 + rng = np.random.default_rng(42) + + self.data = { + "person_id": {self.time_period: np.arange(1, self.n + 1)}, + } + + emp = rng.uniform(0, 150_000, self.n).astype(np.float32) + emp[:10] = 0 # 10 records with $0 wages + se = rng.uniform(0, 80_000, self.n).astype(np.float32) + se[:20] = 0 # 20 records with $0 SE income + + self.puf_imputations = { + "employment_income": emp, + "self_employment_income": se, + "taxable_interest_income": rng.uniform(0, 5_000, self.n).astype( + np.float32 + ), + "qualified_dividend_income": rng.uniform(0, 3_000, self.n).astype( + np.float32 + ), + "taxable_pension_income": rng.uniform(0, 20_000, self.n).astype( + np.float32 + ), + "social_security": rng.uniform(0, 15_000, self.n).astype( + np.float32 + ), + } + + self.cps_df = _make_cps_df(self.n, rng) + + def _call_with_mocks(self, pred_df): + """Run _impute_retirement_contributions with mocked deps.""" + import sys + + mock_sim = _make_mock_sim(self.cps_df) + mock_qrf_cls = _make_mock_qrf_class(pred_df) + + # patch() doesn't work reliably on MagicMock modules, + # so we set the QRF attribute directly. 
+ qrf_mod = sys.modules["microimpute.models.qrf"] + old_qrf = getattr(qrf_mod, "QRF", None) + qrf_mod.QRF = mock_qrf_cls + try: + with patch(_MSIM_PATCH, return_value=mock_sim): + return _impute_retirement_contributions( + self.data, + self.puf_imputations, + self.time_period, + "/fake/path.h5", + ) + finally: + if old_qrf is not None: + qrf_mod.QRF = old_qrf + + def _uniform_preds(self, value): + """Build a DataFrame predicting `value` for all vars.""" + return pd.DataFrame( + {var: np.full(self.n, value) for var in CPS_RETIREMENT_VARIABLES} + ) + + def _random_preds(self, low, high, seed=99): + rng = np.random.default_rng(seed) + return pd.DataFrame( + { + var: rng.uniform(low, high, self.n) + for var in CPS_RETIREMENT_VARIABLES + } + ) + + def test_returns_all_retirement_vars(self): + result = self._call_with_mocks(self._random_preds(0, 8000)) + for var in CPS_RETIREMENT_VARIABLES: + assert var in result + assert len(result[var]) == self.n + + def test_nonnegative_output(self): + result = self._call_with_mocks(self._random_preds(-5000, 8000, seed=7)) + for var in CPS_RETIREMENT_VARIABLES: + assert np.all(result[var] >= 0), f"{var} has negative values" + + def test_401k_capped(self): + result = self._call_with_mocks(self._uniform_preds(50_000.0)) + lim = _get_retirement_limits(self.time_period) + max_401k = lim["401k"] + lim["401k_catch_up"] + + for var in ( + "traditional_401k_contributions", + "roth_401k_contributions", + ): + assert np.all(result[var] <= max_401k), f"{var} exceeds 401k limit" + + def test_ira_capped(self): + result = self._call_with_mocks(self._uniform_preds(50_000.0)) + lim = _get_retirement_limits(self.time_period) + max_ira = lim["ira"] + lim["ira_catch_up"] + + for var in ( + "traditional_ira_contributions", + "roth_ira_contributions", + ): + assert np.all(result[var] <= max_ira), f"{var} exceeds IRA limit" + + def test_401k_zero_when_no_wages(self): + result = self._call_with_mocks(self._uniform_preds(5_000.0)) + zero_wage = 
self.puf_imputations["employment_income"] == 0 + assert zero_wage.sum() == 10 + + for var in ( + "traditional_401k_contributions", + "roth_401k_contributions", + ): + assert np.all( + result[var][zero_wage] == 0 + ), f"{var} should be 0 when employment_income is 0" + + def test_se_pension_zero_when_no_se_income(self): + result = self._call_with_mocks(self._uniform_preds(5_000.0)) + zero_se = self.puf_imputations["self_employment_income"] == 0 + assert zero_se.sum() == 20 + assert np.all( + result["self_employed_pension_contributions"][zero_se] == 0 + ) + + def test_catch_up_age_threshold(self): + """Records age >= 50 get higher caps than younger.""" + self.cps_df["age"] = np.concatenate( + [np.full(25, 30.0), np.full(25, 55.0)] + ) + # All have positive income + self.puf_imputations["employment_income"] = np.full( + self.n, 100_000.0 + ).astype(np.float32) + + lim = _get_retirement_limits(self.time_period) + val = float(lim["401k"]) + 1000 # 24000 + + result = self._call_with_mocks(self._uniform_preds(val)) + + young_401k = result["traditional_401k_contributions"][:25] + old_401k = result["traditional_401k_contributions"][25:] + + # Young capped at base limit + assert np.all(young_401k == lim["401k"]) + # Old get full value (within catch-up limit) + assert np.all(old_401k == val) + + def test_ira_catch_up_threshold(self): + """IRA catch-up also works for age >= 50.""" + self.cps_df["age"] = np.concatenate( + [np.full(25, 30.0), np.full(25, 55.0)] + ) + lim = _get_retirement_limits(self.time_period) + val = float(lim["ira"]) + 500 # 7500 + + result = self._call_with_mocks(self._uniform_preds(val)) + + young_ira = result["traditional_ira_contributions"][:25] + old_ira = result["traditional_ira_contributions"][25:] + + assert np.all(young_ira == lim["ira"]) + assert np.all(old_ira == val) + + def test_401k_nonzero_for_positive_wages(self): + """Records with positive wages should keep their + predicted 401k (not zeroed out).""" + result = 
self._call_with_mocks(self._uniform_preds(5_000.0)) + pos_wage = self.puf_imputations["employment_income"] > 0 + for var in ( + "traditional_401k_contributions", + "roth_401k_contributions", + ): + assert np.all(result[var][pos_wage] > 0) + + def test_se_pension_nonzero_for_positive_se(self): + result = self._call_with_mocks(self._uniform_preds(5_000.0)) + pos_se = self.puf_imputations["self_employment_income"] > 0 + assert np.all( + result["self_employed_pension_contributions"][pos_se] > 0 + ) + + +# ── TestPufCloneRetirementRouting ──────────────────────────────────── + + +class TestPufCloneRetirementRouting: + """Test puf_clone_dataset() routes retirement vars correctly.""" + + def test_retirement_vars_duplicated_when_skip_qrf(self): + data = _make_mock_data() + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + for var in CPS_RETIREMENT_VARIABLES: + vals = result[var][2024] + assert len(vals) == n * 2 + np.testing.assert_array_equal(vals[:n], vals[n:]) + + def test_retirement_vars_use_imputed_when_available(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + fake_retirement = { + var: np.full(n, 999.0) for var in CPS_RETIREMENT_VARIABLES + } + + with ( + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_retirement_contributions", + return_value=fake_retirement, + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._run_qrf_imputation", + return_value=( + {v: np.zeros(n) for v in IMPUTED_VARIABLES}, + {}, + ), + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_weeks_unemployed", + return_value=np.zeros(n), + ), + patch(_MSIM_PATCH), + ): + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=False, + puf_dataset="/fake/puf.h5", + dataset_path="/fake/cps.h5", + ) + + for var in 
CPS_RETIREMENT_VARIABLES: + vals = result[var][2024] + assert len(vals) == n * 2 + np.testing.assert_array_equal(vals[:n], data[var][2024]) + np.testing.assert_array_equal(vals[n:], np.full(n, 999.0)) + + def test_cps_half_unchanged_with_imputation(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + originals = { + var: data[var][2024].copy() for var in CPS_RETIREMENT_VARIABLES + } + fake_retirement = { + var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES + } + + with ( + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_retirement_contributions", + return_value=fake_retirement, + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._run_qrf_imputation", + return_value=( + {v: np.zeros(n) for v in IMPUTED_VARIABLES}, + {}, + ), + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_weeks_unemployed", + return_value=np.zeros(n), + ), + patch(_MSIM_PATCH), + ): + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=False, + puf_dataset="/fake/puf.h5", + dataset_path="/fake/cps.h5", + ) + + for var in CPS_RETIREMENT_VARIABLES: + np.testing.assert_array_equal( + result[var][2024][:n], originals[var] + ) + + def test_puf_half_gets_zero_retirement_for_zero_imputed(self): + """When imputation returns zeros, PUF half should be zero.""" + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + n = 20 + + fake_retirement = { + var: np.zeros(n) for var in CPS_RETIREMENT_VARIABLES + } + + with ( + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_retirement_contributions", + return_value=fake_retirement, + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._run_qrf_imputation", + return_value=( + {v: np.zeros(n) for v in IMPUTED_VARIABLES}, + {}, + ), + ), + patch( + "policyengine_us_data.calibration.puf_impute" + "._impute_weeks_unemployed", + 
return_value=np.zeros(n), + ), + patch(_MSIM_PATCH), + ): + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=False, + puf_dataset="/fake/puf.h5", + dataset_path="/fake/cps.h5", + ) + + for var in CPS_RETIREMENT_VARIABLES: + puf_half = result[var][2024][n:] + assert np.all(puf_half == 0) + + +# ── TestLimitsMatchCps ─────────────────────────────────────────────── + + +class TestLimitsMatchCps: + """Cross-check puf_impute limits against cps.py values.""" + + def test_2024_401k_limit(self): + assert _get_retirement_limits(2024)["401k"] == 23_000 + + def test_2024_ira_limit(self): + assert _get_retirement_limits(2024)["ira"] == 7_000 + + def test_2023_401k_limit(self): + assert _get_retirement_limits(2023)["401k"] == 22_500 + + def test_2023_ira_limit(self): + assert _get_retirement_limits(2023)["ira"] == 6_500 + + def test_2022_limits(self): + lim = _get_retirement_limits(2022) + assert lim["401k"] == 20_500 + assert lim["ira"] == 6_000 + + def test_2021_limits(self): + lim = _get_retirement_limits(2021) + assert lim["401k"] == 19_500 + assert lim["ira"] == 6_000 diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py new file mode 100644 index 000000000..fcdcf763f --- /dev/null +++ b/policyengine_us_data/tests/test_puf_impute.py @@ -0,0 +1,366 @@ +"""Tests for PUF imputation, specifically SS sub-component reconciliation. + +When PUF imputation replaces social_security values, the sub-components +(retirement, disability, survivors, dependents) must be predicted from +demographics (not copied from the statistically-linked CPS record). 
+See: https://github.com/PolicyEngine/policyengine-us-data/issues/551 +""" + +import numpy as np +import pytest + +from policyengine_us_data.calibration.puf_impute import ( + MINIMUM_RETIREMENT_AGE, + _age_heuristic_ss_shares, + _qrf_ss_shares, + reconcile_ss_subcomponents, +) + +SS_SUBS = [ + "social_security_retirement", + "social_security_disability", + "social_security_survivors", + "social_security_dependents", +] + + +def _make_data( + orig_ss, + orig_ret, + orig_dis, + orig_surv, + orig_dep, + imputed_ss, + age=None, + is_male=None, +): + """Build a doubled data dict (CPS half + PUF half).""" + n = len(orig_ss) + tp = 2024 + data = { + "social_security": { + tp: np.concatenate([orig_ss, imputed_ss]).astype(np.float32) + }, + "social_security_retirement": { + tp: np.concatenate([orig_ret, orig_ret]).astype(np.float32) + }, + "social_security_disability": { + tp: np.concatenate([orig_dis, orig_dis]).astype(np.float32) + }, + "social_security_survivors": { + tp: np.concatenate([orig_surv, orig_surv]).astype(np.float32) + }, + "social_security_dependents": { + tp: np.concatenate([orig_dep, orig_dep]).astype(np.float32) + }, + } + if age is not None: + data["age"] = {tp: np.concatenate([age, age]).astype(np.float32)} + if is_male is not None: + data["is_male"] = { + tp: np.concatenate([is_male, is_male]).astype(np.float32) + } + return data, n, tp + + +class TestReconcileSsSubcomponents: + """Sub-components must sum to social_security after reconciliation.""" + + def test_subs_sum_to_total(self): + """PUF subs must sum to imputed SS total.""" + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=np.array([14000, 0]), + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + age=np.array([70, 45]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ss = data["social_security"][tp][puf] + total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) + 
np.testing.assert_allclose(total_subs, ss, rtol=1e-5) + + def test_age_determines_split(self): + """Age heuristic: >= 62 -> retirement, < 62 -> disability.""" + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=np.array([14000, 0]), + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + age=np.array([70, 45]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ret = data["social_security_retirement"][tp][puf] + dis = data["social_security_disability"][tp][puf] + # Person 0 is 70 -> retirement + assert ret[0] == pytest.approx(30000) + assert dis[0] == pytest.approx(0) + # Person 1 is 45 -> disability + assert ret[1] == pytest.approx(0) + assert dis[1] == pytest.approx(10000) + + def test_imputed_zero_clears_subs(self): + """When PUF imputes zero SS, all subs should be zero.""" + data, n, tp = _make_data( + orig_ss=np.array([20000]), + orig_ret=np.array([20000]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([0]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + for sub in SS_SUBS: + assert data[sub][tp][puf][0] == pytest.approx(0) + + def test_cps_half_unchanged(self): + """CPS half should not be modified.""" + orig_ret = np.array([14000, 0]) + data, n, tp = _make_data( + orig_ss=np.array([20000, 15000]), + orig_ret=orig_ret, + orig_dis=np.array([6000, 15000]), + orig_surv=np.array([0, 0]), + orig_dep=np.array([0, 0]), + imputed_ss=np.array([30000, 10000]), + ) + reconcile_ss_subcomponents(data, n, tp) + + cps = slice(0, n) + np.testing.assert_array_equal( + data["social_security_retirement"][tp][cps], + orig_ret, + ) + + def test_missing_subcomponent_is_skipped(self): + """If a sub-component is absent from data, no error.""" + data, n, tp = _make_data( + orig_ss=np.array([10000]), + orig_ret=np.array([10000]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), 
+ orig_dep=np.array([0]), + imputed_ss=np.array([15000]), + ) + del data["social_security_survivors"] + del data["social_security_dependents"] + reconcile_ss_subcomponents(data, n, tp) + + def test_no_social_security_is_noop(self): + """If social_security is absent, function is a no-op.""" + data = {"some_var": {2024: np.array([1, 2])}} + reconcile_ss_subcomponents(data, 1, 2024) + + def test_subs_sum_to_total_mixed_cases(self): + """Comprehensive: subs sum to total across mixed cases.""" + data, n, tp = _make_data( + # Person 0: has CPS SS, has PUF SS (old) + # Person 1: no CPS SS, has PUF SS (old) + # Person 2: no CPS SS, has PUF SS (young) + # Person 3: has CPS SS, PUF zeroed out + orig_ss=np.array([20000, 0, 0, 15000]), + orig_ret=np.array([20000, 0, 0, 15000]), + orig_dis=np.array([0, 0, 0, 0]), + orig_surv=np.array([0, 0, 0, 0]), + orig_dep=np.array([0, 0, 0, 0]), + imputed_ss=np.array([25000, 18000, 12000, 0]), + age=np.array([70, 68, 40, 75]), + ) + reconcile_ss_subcomponents(data, n, tp) + + puf = slice(n, 2 * n) + ss = data["social_security"][tp][puf] + total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS) + np.testing.assert_allclose(total_subs, ss, rtol=1e-5) + + +class TestAgeHeuristicSsShares: + """Age-based fallback for SS type classification.""" + + def test_elderly_gets_retirement(self): + """Age >= 62 -> retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + age=np.array([70]), + ) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + assert shares["social_security_retirement"][0] == pytest.approx(1.0) + assert shares["social_security_disability"][0] == pytest.approx(0.0) + + def test_young_gets_disability(self): + """Age < 62 -> disability.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + 
orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([18000]), + age=np.array([45]), + ) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + assert shares["social_security_retirement"][0] == pytest.approx(0.0) + assert shares["social_security_disability"][0] == pytest.approx(1.0) + + def test_no_age_defaults_to_retirement(self): + """Without age data, default to retirement.""" + data, n, tp = _make_data( + orig_ss=np.array([0]), + orig_ret=np.array([0]), + orig_dis=np.array([0]), + orig_surv=np.array([0]), + orig_dep=np.array([0]), + imputed_ss=np.array([25000]), + ) + puf_has_ss = np.array([True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + assert shares["social_security_retirement"][0] == pytest.approx(1.0) + + def test_mixed_ages(self): + """Multiple people split correctly by age threshold.""" + data, n, tp = _make_data( + orig_ss=np.array([0, 0, 0]), + orig_ret=np.array([0, 0, 0]), + orig_dis=np.array([0, 0, 0]), + orig_surv=np.array([0, 0, 0]), + orig_dep=np.array([0, 0, 0]), + imputed_ss=np.array([1, 1, 1]), + age=np.array([30, 62, 80]), + ) + puf_has_ss = np.array([True, True, True]) + shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss) + + # age 30 -> disability + assert shares["social_security_retirement"][0] == 0 + assert shares["social_security_disability"][0] == 1 + # age 62 -> retirement (>= threshold) + assert shares["social_security_retirement"][1] == 1 + assert shares["social_security_disability"][1] == 0 + # age 80 -> retirement + assert shares["social_security_retirement"][2] == 1 + + +class TestQrfSsShares: + """QRF-based SS sub-component share prediction.""" + + def test_returns_none_with_few_training_records(self): + """Returns None when < MIN_QRF_TRAINING_RECORDS have SS.""" + # Only 2 training records with SS > 0. 
+ data, n, tp = _make_data( + orig_ss=np.array([10000, 5000, 0]), + orig_ret=np.array([10000, 0, 0]), + orig_dis=np.array([0, 5000, 0]), + orig_surv=np.array([0, 0, 0]), + orig_dep=np.array([0, 0, 0]), + imputed_ss=np.array([0, 0, 20000]), + age=np.array([70, 45, 55]), + ) + puf_has_ss = np.array([False, False, True]) + result = _qrf_ss_shares(data, n, tp, puf_has_ss) + assert result is None + + def test_shares_sum_to_one(self): + """QRF-predicted shares should sum to ~1 after normalisation.""" + pytest.importorskip("microimpute") + rng = np.random.default_rng(42) + n = 500 + + # Synthetic training: age > 62 mostly retirement, + # age < 62 mostly disability. + ages = rng.integers(20, 90, size=n).astype(np.float32) + is_male = rng.integers(0, 2, size=n).astype(np.float32) + ss_vals = rng.uniform(5000, 40000, size=n).astype(np.float32) + ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32) + dis = np.where(ages < 62, ss_vals, 0).astype(np.float32) + surv = np.zeros(n, dtype=np.float32) + dep = np.zeros(n, dtype=np.float32) + + # PUF SS for all records. 
+ puf_ss = rng.uniform(5000, 40000, size=n).astype(np.float32) + + data, nn, tp = _make_data( + orig_ss=ss_vals, + orig_ret=ret, + orig_dis=dis, + orig_surv=surv, + orig_dep=dep, + imputed_ss=puf_ss, + age=ages, + is_male=is_male, + ) + + puf_has_ss = puf_ss > 0 + shares = _qrf_ss_shares(data, nn, tp, puf_has_ss) + + assert shares is not None + total = sum(shares.get(sub, 0) for sub in SS_SUBS) + np.testing.assert_allclose(total, 1.0, atol=0.01) + + def test_elderly_predicted_as_retirement(self): + """QRF should mostly predict retirement for age >= 62.""" + pytest.importorskip("microimpute") + rng = np.random.default_rng(123) + n_train = 500 + + ages = rng.integers(20, 90, size=n_train).astype(np.float32) + ss_vals = rng.uniform(5000, 40000, size=n_train).astype(np.float32) + ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32) + dis = np.where(ages < 62, ss_vals, 0).astype(np.float32) + surv = np.zeros(n_train, dtype=np.float32) + dep = np.zeros(n_train, dtype=np.float32) + + # Only elderly records get PUF SS > 0; training records + # get PUF SS = 0 so puf_has_ss selects just the elderly. 
+ n_pred = 10 + pred_ages = np.full(n_pred, 70, dtype=np.float32) + + all_ages = np.concatenate([ages, pred_ages]) + all_ss = np.concatenate([ss_vals, np.zeros(n_pred)]) + all_ret = np.concatenate([ret, np.zeros(n_pred)]) + all_dis = np.concatenate([dis, np.zeros(n_pred)]) + all_surv = np.concatenate([surv, np.zeros(n_pred)]) + all_dep = np.concatenate([dep, np.zeros(n_pred)]) + puf_ss = np.concatenate( + [ + np.zeros(n_train, dtype=np.float32), + np.full(n_pred, 20000, dtype=np.float32), + ] + ) + + data, nn, tp = _make_data( + orig_ss=all_ss, + orig_ret=all_ret, + orig_dis=all_dis, + orig_surv=all_surv, + orig_dep=all_dep, + imputed_ss=puf_ss, + age=all_ages, + ) + + puf_has_ss = data["social_security"][tp][nn:] > 0 + shares = _qrf_ss_shares(data, nn, tp, puf_has_ss) + + assert shares is not None + ret_share = shares["social_security_retirement"] + # All elderly -> retirement share should dominate. + assert np.mean(ret_share) > 0.7 diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 134e919b3..d7410d2eb 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -53,14 +53,43 @@ "social_security_disability": 148e9, # ~10.2% (disabled workers) "social_security_survivors": 160e9, # ~11.0% (widows, children of deceased) "social_security_dependents": 84e9, # ~5.8% (spouses/children of retired+disabled) - # IRA contribution totals from IRS SOI IRA accumulation tables. - # Tax year 2022: ~5M taxpayers x $4,510 avg = ~$22.5B traditional; - # ~10M taxpayers x $3,482 avg = ~$34.8B Roth. - # Uprated ~12% to 2024 for limit increases ($6k->$7k) and - # wage growth. + # Retirement contribution calibration targets. + # + # traditional_ira_contributions: IRS SOI Publication 1304, Table 1.4 + # (TY 2022), "IRA payments" deduction — $13.17B (col 124, row + # "All returns, total"). This is the actual above-the-line + # deduction claimed on returns. 
The variable flows directly into + # the ALD with no deductibility logic in policyengine-us, so the + # target must match the deduction, not total contributions. + # https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income + "traditional_ira_contributions": 13.2e9, + # traditional_401k_contributions & roth_401k_contributions: + # BEA/FRED National Income Accounts. Total DC employer+employee + # = $815.4B (Y351RC1A027NBEA), employer-only = $247.5B + # (W351RC0A144NBEA), employee elective deferrals = $567.9B. + # Split into traditional/Roth using estimated 15% Roth dollar + # share (Vanguard How America Saves 2024: 18% participation, + # ~15% dollar share; PSCA 67th Annual Survey: 21% participation). + # Traditional: $567.9B × 85% = $482.7B + # Roth: $567.9B × 15% = $85.2B + # https://fred.stlouisfed.org/series/Y351RC1A027NBEA + # https://fred.stlouisfed.org/series/W351RC0A144NBEA + # https://corporate.vanguard.com/content/dam/corp/research/pdf/how_america_saves_report_2024.pdf + "traditional_401k_contributions": 482.7e9, + "roth_401k_contributions": 85.2e9, + # self_employed_pension_contribution_ald: IRS SOI Publication + # 1304, Table 1.4 (TY 2022), "Payments to a Keogh plan" — + # $29.48B (col 116, row "All returns, total"). Includes + # SEP-IRAs, SIMPLE-IRAs, and traditional Keogh/HR-10 plans. + # Targeting the ALD (not the input) because policyengine-us + # applies a min(contributions, SE_income) cap. + # https://www.irs.gov/statistics/soi-tax-stats-individual-statistical-tables-by-size-of-adjusted-gross-income + "self_employed_pension_contribution_ald": 29.5e9, + # roth_ira_contributions: IRS SOI IRA Accumulation Tables 5 & 6 + # (TY 2022). Total Roth IRA contributions = $35.0B (10.04M + # contributors). Direct administrative source. 
# https://www.irs.gov/statistics/soi-tax-stats-accumulation-and-distribution-of-individual-retirement-arrangements - "traditional_ira_contributions": 25e9, - "roth_ira_contributions": 39e9, + "roth_ira_contributions": 35.0e9, } diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py new file mode 100644 index 000000000..9334a5c78 --- /dev/null +++ b/tests/test_no_formula_variables_stored.py @@ -0,0 +1,183 @@ +"""Test dataset variable integrity. + +1. Variables with formulas in policyengine-us will be recomputed by the + simulation engine, ignoring any stored value. Storing them wastes + space and can mislead during validation. + +2. Stored input values should match what the simulation actually uses, + to catch issues like set_input_divide_by_period silently altering + values, or sub-components not summing to their parent total. +""" + +import h5py +import numpy as np +import pytest +from policyengine_us_data.datasets.cps.extended_cps import ExtendedCPS_2024 + +KNOWN_FORMULA_EXCEPTIONS = { + # person_id is stored for identity tracking even though it has a + # trivial formula (arange). Safe to keep. + "person_id", +} + + +@pytest.fixture(scope="module") +def tax_benefit_system(): + from policyengine_us import CountryTaxBenefitSystem + + return CountryTaxBenefitSystem() + + +@pytest.fixture(scope="module") +def formula_variables(tax_benefit_system): + """Return set of variable names computed by policyengine-us. + + Includes variables with explicit formulas as well as those using + ``adds`` or ``subtracts`` (which the engine auto-sums at runtime). 
+ """ + return { + name + for name, var in tax_benefit_system.variables.items() + if (hasattr(var, "formulas") and len(var.formulas) > 0) + or getattr(var, "adds", None) + or getattr(var, "subtracts", None) + } + + +@pytest.fixture(scope="module") +def dataset_path(): + path = ExtendedCPS_2024.file_path + if not path.exists(): + pytest.skip("Extended CPS 2024 not built locally") + return path + + +@pytest.fixture(scope="module") +def stored_variables(dataset_path): + """Return set of variable names stored in the extended CPS.""" + with h5py.File(dataset_path, "r") as f: + return set(f.keys()) + + +@pytest.fixture(scope="module") +def sim(): + from policyengine_us import Microsimulation + + return Microsimulation(dataset=ExtendedCPS_2024) + + +def test_no_formula_variables_stored(formula_variables, stored_variables): + """Computed variables should not be stored in the dataset.""" + overlap = (stored_variables & formula_variables) - KNOWN_FORMULA_EXCEPTIONS + if overlap: + msg = ( + f"These {len(overlap)} variables are computed by " + f"policyengine-us (formulas/adds/subtracts) but are " + f"stored in the dataset " + f"(stored values will be IGNORED by the simulation):\n" + ) + for v in sorted(overlap): + msg += f" - {v}\n" + pytest.fail(msg) + + +def test_stored_values_match_computed( + sim, stored_variables, formula_variables, dataset_path +): + """For input-only variables, stored values should match what the + simulation returns (catches set_input issues, rounding, etc.).""" + input_vars = stored_variables - formula_variables + mismatches = [] + + with h5py.File(dataset_path, "r") as f: + for var_name in sorted(input_vars): + if var_name not in f or "2024" not in f[var_name]: + continue + stored = f[var_name]["2024"][...] 
+ if stored.dtype.kind not in ("f", "i", "u"): + continue + + stored_f = stored.astype(float) + stored_total = np.sum(stored_f) + if abs(stored_total) < 1: + continue + + try: + computed = np.array(sim.calculate(var_name, 2024)) + except Exception: + continue + + computed_total = np.sum(computed.astype(float)) + if abs(stored_total) > 0: + pct_diff = ( + abs(stored_total - computed_total) + / abs(stored_total) + * 100 + ) + else: + pct_diff = 0 + + if pct_diff > 1: + mismatches.append( + f" {var_name}: stored=${stored_total:,.0f}, " + f"computed=${computed_total:,.0f}, " + f"diff={pct_diff:.1f}%" + ) + + if mismatches: + msg = ( + f"These {len(mismatches)} input variables have >1% " + f"difference between stored and computed values:\n" + ) + msg += "\n".join(mismatches) + pytest.fail(msg) + + +def test_ss_subcomponents_sum_to_computed_total(sim, dataset_path): + """Social Security sub-components should sum to the computed total. + + ``social_security`` is computed via ``adds`` in policyengine-us and + is NOT stored in the dataset. We verify that the sub-components + stored in the dataset sum to the simulation's computed total. 
+ """ + with h5py.File(dataset_path, "r") as f: + ss_retirement = f["social_security_retirement"]["2024"][...].astype( + float + ) + ss_disability = f["social_security_disability"]["2024"][...].astype( + float + ) + ss_survivors = f["social_security_survivors"]["2024"][...].astype( + float + ) + ss_dependents = f["social_security_dependents"]["2024"][...].astype( + float + ) + + sub_sum = ss_retirement + ss_disability + ss_survivors + ss_dependents + computed_total = np.array(sim.calculate("social_security", 2024)).astype( + float + ) + + # Only check records that have any SS income + has_ss = computed_total > 0 + if not np.any(has_ss): + pytest.skip("No SS recipients in dataset") + + total_computed = np.sum(computed_total[has_ss]) + total_sub = np.sum(sub_sum[has_ss]) + pct_diff = abs(total_computed - total_sub) / total_computed * 100 + + assert pct_diff < 1, ( + f"SS sub-components don't sum to computed total: " + f"computed=${total_computed:,.0f}, sub_sum=${total_sub:,.0f}, " + f"diff={pct_diff:.1f}%" + ) + + # Per-person check: no person's sub-components should exceed total + excess = sub_sum[has_ss] - computed_total[has_ss] + n_excess = np.sum(excess > 1) + assert n_excess == 0, ( + f"{n_excess} people have SS sub-components exceeding " + f"their computed total SS income" + ) diff --git a/uv.lock b/uv.lock index fc61e96df..df1efae4f 100644 --- a/uv.lock +++ b/uv.lock @@ -637,6 +637,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -644,6 +645,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = 
"sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1842,7 +1844,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.570.7" +version = "1.587.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1850,9 +1852,9 @@ dependencies = [ { name = "policyengine-core" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6a/eb/291b3085aa0fa97fcce4987d54991118f21aead49647b3f475998459f46b/policyengine_us-1.570.7.tar.gz", hash = "sha256:a2967af86a61468a0bdb6b2dc7af2fd0bb0f0064203fa557b6fee8023058360a", size = 8668680, upload-time = "2026-02-19T07:17:11.264Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a8/15/8a12714d124b509346e60c927f7f344ee3b99c2b280bcfa9a053395d68e6/policyengine_us-1.587.0.tar.gz", hash = "sha256:399339eeea9a38caf6800432bc5eaa3b07b7b09ea269f4f3ba9f9c02aae587b9", size = 8630430, upload-time = "2026-02-25T23:35:46.002Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/36/0213955310076e4dec2781baeabf96b6d6937f99cd19c373b363bfbd7152/policyengine_us-1.570.7-py3-none-any.whl", hash = "sha256:374fd5357d6cb3734b900bd08dfdb61760dfc5b913ed686f57a40239565b0edd", size = 7825404, upload-time = "2026-02-19T07:17:08.877Z" }, + { url = "https://files.pythonhosted.org/packages/13/5a/3b643bf8c20d94d6fbbb638664eef571dcee0e88e98d4797128f6913cd6a/policyengine_us-1.587.0-py3-none-any.whl", hash = "sha256:712234f850d943464ae90a9f5055fb9a1bdba72c87beb2270618cb57803e9e6e", size = 7953707, upload-time = "2026-02-25T23:35:42.964Z" }, ] [[package]] diff --git a/validation/validate_retirement_imputation.py b/validation/validate_retirement_imputation.py new file mode 100644 index 000000000..87d911a92 --- /dev/null +++ b/validation/validate_retirement_imputation.py @@ -0,0 +1,264 @@ +"""Post-build validation for retirement contribution QRF imputation. 
"""Post-build validation for retirement contribution QRF imputation.

Run after a full Extended CPS build to verify that the PUF clone
half has plausible retirement contribution values.

Usage:
    python validation/validate_retirement_imputation.py

Requires:
    - Python 3.12+ (for microimpute/policyengine_us)
    - Built ExtendedCPS_2024 dataset available
"""

import logging
import sys

import numpy as np
import pandas as pd

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)

# Calibration targets (from etl_national_targets.py / loss.py)
TARGETS = {
    "traditional_401k_contributions": 482.7e9,
    "roth_401k_contributions": 85.2e9,
    "traditional_ira_contributions": 13.2e9,
    "roth_ira_contributions": 35.0e9,
    "self_employed_pension_contribution_ald": 29.5e9,
}

# IRS contribution limits for tax year 2024 (catch-up applies at age 50+).
LIMITS_2024 = {
    "401k": 23_000,
    "401k_catch_up": 7_500,
    "ira": 7_000,
    "ira_catch_up": 1_000,
}


def load_extended_cps():
    """Load the Extended CPS dataset into a Microsimulation.

    Imports are function-local so this module can be imported (e.g. for
    unit-testing the pure validation helpers) without the full
    policyengine stack installed.

    Returns:
        A ``Microsimulation`` built on ``ExtendedCPS_2024``.
    """
    from policyengine_us import Microsimulation
    from policyengine_us_data import ExtendedCPS_2024

    logger.info("Loading ExtendedCPS_2024...")
    sim = Microsimulation(dataset=ExtendedCPS_2024)
    return sim


def validate_constraints(sim) -> list:
    """Check hard constraints on imputed retirement contribution values.

    Verifies, per person: non-negativity, statutory caps (with age-50+
    catch-up), 401(k) contributions are $0 when wages are $0, and SE
    pension contributions are $0 when self-employment income is $0.

    Args:
        sim: Microsimulation-like object exposing
            ``calculate(var, year, map_to=...)`` returning an object
            with a ``.values`` array.

    Returns:
        List of human-readable issue strings; empty when all checks pass.
    """
    issues = []
    year = 2024

    emp_income = sim.calculate(
        "employment_income", year, map_to="person"
    ).values
    se_income = sim.calculate(
        "self_employment_income", year, map_to="person"
    ).values
    age = sim.calculate("age", year, map_to="person").values

    catch_up = age >= 50
    max_401k = LIMITS_2024["401k"] + catch_up * LIMITS_2024["401k_catch_up"]
    max_ira = LIMITS_2024["ira"] + catch_up * LIMITS_2024["ira_catch_up"]

    # 401(k) constraints: sign, cap, and the zero-wage implication.
    for var in (
        "traditional_401k_contributions",
        "roth_401k_contributions",
    ):
        vals = sim.calculate(var, year, map_to="person").values

        n_negative = (vals < 0).sum()
        if n_negative > 0:
            issues.append(f"FAIL: {var} has {n_negative} negative values")

        # +1 allows $1 of float/rounding slack above the statutory cap.
        n_over_cap = (vals > max_401k + 1).sum()
        if n_over_cap > 0:
            issues.append(
                f"FAIL: {var} has {n_over_cap} values exceeding " f"401k cap"
            )

        zero_wage = emp_income == 0
        n_nonzero_no_wage = (vals[zero_wage] > 0).sum()
        if n_nonzero_no_wage > 0:
            issues.append(
                f"FAIL: {var} has {n_nonzero_no_wage} nonzero values "
                f"for records with $0 wages"
            )
        else:
            logger.info(
                "PASS: %s is $0 for all %d zero-wage records",
                var,
                zero_wage.sum(),
            )

    # IRA constraints: sign and cap (IRAs need earned income too, but the
    # zero-wage implication is not enforced here since spousal IRAs exist).
    for var in (
        "traditional_ira_contributions",
        "roth_ira_contributions",
    ):
        vals = sim.calculate(var, year, map_to="person").values

        n_negative = (vals < 0).sum()
        if n_negative > 0:
            issues.append(f"FAIL: {var} has {n_negative} negative values")

        n_over_cap = (vals > max_ira + 1).sum()
        if n_over_cap > 0:
            issues.append(
                f"FAIL: {var} has {n_over_cap} values exceeding " f"IRA cap"
            )

    # SE pension constraint: requires self-employment income.
    var = "self_employed_pension_contributions"
    vals = sim.calculate(var, year, map_to="person").values
    zero_se = se_income == 0
    n_nonzero_no_se = (vals[zero_se] > 0).sum()
    if n_nonzero_no_se > 0:
        issues.append(
            f"FAIL: {var} has {n_nonzero_no_se} nonzero values "
            f"for records with $0 SE income"
        )
    else:
        logger.info(
            "PASS: %s is $0 for all %d zero-SE records",
            var,
            zero_se.sum(),
        )

    return issues


def validate_aggregates(sim) -> list:
    """Compare weighted aggregates against calibration targets.

    Logs a table of weighted sums versus ``TARGETS`` and flags only
    order-of-magnitude misses; calibration itself is expected to close
    smaller gaps.

    Args:
        sim: Microsimulation-like object exposing ``calculate``.

    Returns:
        List of warning strings for variables far from target.
    """
    issues = []
    year = 2024

    weight = sim.calculate("person_weight", year).values

    logger.info(
        "\n%-45s %15s %15s %10s", "Variable", "Weighted Sum", "Target", "Ratio"
    )
    logger.info("-" * 90)

    for var, target in TARGETS.items():
        try:
            vals = sim.calculate(var, year, map_to="person").values
        except (ValueError, KeyError):
            logger.warning("Could not calculate %s", var)
            continue

        weighted_sum = (vals * weight).sum()
        ratio = weighted_sum / target if target != 0 else float("inf")

        logger.info(
            "%-45s $%14.1fB $%14.1fB %9.1f%%",
            var,
            weighted_sum / 1e9,
            target / 1e9,
            ratio * 100,
        )

        # Flag only when an order of magnitude off target
        # (ratio outside [0.1, 5.0]); calibration handles smaller gaps.
        if ratio < 0.1 or ratio > 5.0:
            issues.append(
                f"WARNING: {var} weighted sum "
                f"${weighted_sum/1e9:.1f}B is far from "
                f"target ${target/1e9:.1f}B "
                f"(ratio={ratio:.2f})"
            )

    return issues


def validate_cps_vs_puf_half(sim) -> list:
    """Compare CPS half and PUF half distributions.

    Logs per-variable means and nonzero shares for each half; purely
    informational, so no issues are ever appended.

    NOTE(review): assumes CPS originals occupy the first half of the
    person axis and PUF clones the second half — confirm against the
    Extended CPS build's record ordering.

    Args:
        sim: Microsimulation-like object exposing ``calculate``.

    Returns:
        Always an empty list (reporting only).
    """
    issues = []
    year = 2024

    n_persons = len(sim.calculate("person_id", year).values)
    n_half = n_persons // 2

    logger.info("\nCPS vs PUF half comparison (n=%d per half):", n_half)
    logger.info(
        "%-45s %12s %12s %12s %12s",
        "Variable",
        "CPS mean",
        "PUF mean",
        "CPS >0 pct",
        "PUF >0 pct",
    )
    logger.info("-" * 95)

    for var in [
        "traditional_401k_contributions",
        "roth_401k_contributions",
        "traditional_ira_contributions",
        "roth_ira_contributions",
        "self_employed_pension_contributions",
    ]:
        try:
            vals = sim.calculate(var, year, map_to="person").values
        except (ValueError, KeyError):
            continue

        cps_vals = vals[:n_half]
        puf_vals = vals[n_half:]

        cps_mean = cps_vals.mean()
        puf_mean = puf_vals.mean()
        cps_pct = (cps_vals > 0).mean() * 100
        puf_pct = (puf_vals > 0).mean() * 100

        logger.info(
            "%-45s $%11.0f $%11.0f %11.1f%% %11.1f%%",
            var,
            cps_mean,
            puf_mean,
            cps_pct,
            puf_pct,
        )

    return issues


def main():
    """Run all validations and return the number of issues found."""
    sim = load_extended_cps()

    logger.info("\n" + "=" * 60)
    logger.info("RETIREMENT CONTRIBUTION IMPUTATION VALIDATION")
    logger.info("=" * 60)

    all_issues = []

    logger.info("\n1. CONSTRAINT CHECKS")
    logger.info("-" * 40)
    all_issues.extend(validate_constraints(sim))

    logger.info("\n2. AGGREGATE COMPARISONS")
    logger.info("-" * 40)
    all_issues.extend(validate_aggregates(sim))

    logger.info("\n3. CPS vs PUF HALF DISTRIBUTIONS")
    logger.info("-" * 40)
    all_issues.extend(validate_cps_vs_puf_half(sim))

    logger.info("\n" + "=" * 60)
    if all_issues:
        logger.info("ISSUES FOUND: %d", len(all_issues))
        for issue in all_issues:
            logger.info("  - %s", issue)
    else:
        logger.info("ALL CHECKS PASSED")
    logger.info("=" * 60)

    return len(all_issues)


if __name__ == "__main__":
    # POSIX exit statuses are truncated mod 256, so exiting with the raw
    # issue count would report success for exactly 256 issues; collapse
    # to the conventional 0 (clean) / 1 (issues found).
    sys.exit(1 if main() else 0)