From 6ecfc19688516f980cbcc7734991118ae7f83b20 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 16 Feb 2026 08:37:40 -0500 Subject: [PATCH 1/6] Add PUF impute + source impute modules, fix $5.8M AGI ceiling (issue #530) Adds puf_impute.py and source_impute.py from PR #516 (by @MaxGhenis), refactors extended_cps.py to delegate to the new modules, and integrates both into the unified calibration pipeline. The core fix removes the subsample(10_000) call that dropped high-income PUF records before QRF training, which caused a hard AGI ceiling at ~$6.26M after uprating. Co-Authored-By: Max Ghenis Co-Authored-By: Claude Opus 4.6 --- changelog_entry.yaml | 24 +- .../calibration/puf_impute.py | 555 +++++++++++++ .../calibration/source_impute.py | 757 ++++++++++++++++++ .../calibration/unified_calibration.py | 200 ++++- .../datasets/cps/extended_cps.py | 438 +--------- .../tests/test_calibration/test_puf_impute.py | 134 ++++ .../test_calibration/test_source_impute.py | 169 ++++ 7 files changed, 1843 insertions(+), 434 deletions(-) create mode 100644 policyengine_us_data/calibration/puf_impute.py create mode 100644 policyengine_us_data/calibration/source_impute.py create mode 100644 policyengine_us_data/tests/test_calibration/test_puf_impute.py create mode 100644 policyengine_us_data/tests/test_calibration/test_source_impute.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e4c231aa..8e9b4fb6 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,22 +1,12 @@ - bump: minor changes: added: - - Census-block-first calibration pipeline (calibration/ package) ported from PR #516 - - Clone-and-assign module for population-weighted census block sampling - - Unified matrix builder with clone-by-clone simulation, COO caching, and target_overview-based querying - - Unified calibration CLI with L0 optimization and seeded takeup re-randomization - - 28 new tests for the calibration pipeline - - Integration test for build_matrix geographic masking 
(national/state/CD) - - Tests for drop_target_groups utility - - voluntary_filing.yaml takeup rate parameter + - PUF clone + QRF imputation module (puf_impute.py) with state_fips predictor, fixing $5.8M AGI ceiling from subsample bug (issue #530) + - ACS/SIPP/SCF geographic re-imputation module (source_impute.py) with state predictor + - PUF and source impute integration into unified calibration pipeline (--puf-dataset, --skip-puf, --skip-source-impute flags) + - 21 new tests for puf_impute and source_impute modules changed: - - Rewrote local_area_calibration_setup.ipynb for clone-based pipeline - - Renamed _get_geo_level to get_geo_level (now cross-module public API) + - Refactored extended_cps.py to delegate to puf_impute.puf_clone_dataset() (443 -> 75 lines) + - unified_calibration.py pipeline now supports optional source imputation and PUF cloning steps fixed: - - Fix Jupyter import error in unified_calibration.py (OutStream.reconfigure moved to main) - - Fix modal_app/remote_calibration_runner.py referencing deleted fit_calibration_weights.py - - Fix _coo_parts stale state bug on build_matrix re-call after failure - - Remove hardcoded voluntary_filing rate in favor of YAML parameter - removed: - - SparseMatrixBuilder, MatrixTracer, and fit_calibration_weights (replaced by unified pipeline) - - 8 old SparseMatrixBuilder-dependent tests (replaced by new test_calibration suite) + - QRF training now uses ALL PUF records instead of subsample(10_000), removing hard AGI ceiling at ~$6.26M diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py new file mode 100644 index 00000000..9c185c9f --- /dev/null +++ b/policyengine_us_data/calibration/puf_impute.py @@ -0,0 +1,555 @@ +"""PUF clone and QRF imputation for calibration pipeline. + +Doubles CPS records: one half keeps original values, the other half +gets PUF tax variables imputed via Quantile Random Forest. 
Geography +(state_fips) is included as a QRF predictor so imputations vary by +state. + +Usage within the calibration pipeline: + 1. Load raw CPS dataset + 2. Clone 10x and assign geography + 3. Call puf_clone_dataset() to double records and impute PUF vars + 4. Save expanded dataset for matrix building +""" + +import gc +import logging +import time +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +DEMOGRAPHIC_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "tax_unit_count_dependents", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + "state_fips", +] + +IMPUTED_VARIABLES = [ + "employment_income", + "partnership_s_corp_income", + "social_security", + "taxable_pension_income", + "interest_deduction", + "tax_exempt_pension_income", + "long_term_capital_gains", + "unreimbursed_business_employee_expenses", + "pre_tax_contributions", + "taxable_ira_distributions", + "self_employment_income", + "w2_wages_from_qualified_business", + "unadjusted_basis_qualified_property", + "business_is_sstb", + "short_term_capital_gains", + "qualified_dividend_income", + "charitable_cash_donations", + "self_employed_pension_contribution_ald", + "unrecaptured_section_1250_gain", + "taxable_unemployment_compensation", + "taxable_interest_income", + "domestic_production_ald", + "self_employed_health_insurance_ald", + "rental_income", + "non_qualified_dividend_income", + "cdcc_relevant_expenses", + "tax_exempt_interest_income", + "salt_refund_income", + "foreign_tax_credit", + "estate_income", + "charitable_non_cash_donations", + "american_opportunity_credit", + "miscellaneous_income", + "alimony_expense", + "farm_income", + "partnership_se_income", + "alimony_income", + "health_savings_account_ald", + "non_sch_d_capital_gains", + "general_business_credit", + "energy_efficient_home_improvement_credit", + "traditional_ira_contributions", + "amt_foreign_tax_credit", + 
"excess_withheld_payroll_tax", + "savers_credit", + "student_loan_interest", + "investment_income_elected_form_4952", + "early_withdrawal_penalty", + "prior_year_minimum_tax_credit", + "farm_rent_income", + "qualified_tuition_expenses", + "educator_expense", + "long_term_capital_gains_on_collectibles", + "other_credits", + "casualty_loss", + "unreported_payroll_tax", + "recapture_of_investment_credit", + "deductible_mortgage_interest", + "qualified_reit_and_ptp_income", + "qualified_bdc_income", + "farm_operations_income", + "estate_income_would_be_qualified", + "farm_operations_income_would_be_qualified", + "farm_rent_income_would_be_qualified", + "partnership_s_corp_income_would_be_qualified", + "rental_income_would_be_qualified", + "self_employment_income_would_be_qualified", +] + +OVERRIDDEN_IMPUTED_VARIABLES = [ + "partnership_s_corp_income", + "interest_deduction", + "unreimbursed_business_employee_expenses", + "pre_tax_contributions", + "w2_wages_from_qualified_business", + "unadjusted_basis_qualified_property", + "business_is_sstb", + "charitable_cash_donations", + "self_employed_pension_contribution_ald", + "unrecaptured_section_1250_gain", + "taxable_unemployment_compensation", + "domestic_production_ald", + "self_employed_health_insurance_ald", + "cdcc_relevant_expenses", + "salt_refund_income", + "foreign_tax_credit", + "estate_income", + "charitable_non_cash_donations", + "american_opportunity_credit", + "miscellaneous_income", + "alimony_expense", + "health_savings_account_ald", + "non_sch_d_capital_gains", + "general_business_credit", + "energy_efficient_home_improvement_credit", + "amt_foreign_tax_credit", + "excess_withheld_payroll_tax", + "savers_credit", + "student_loan_interest", + "investment_income_elected_form_4952", + "early_withdrawal_penalty", + "prior_year_minimum_tax_credit", + "farm_rent_income", + "qualified_tuition_expenses", + "educator_expense", + "long_term_capital_gains_on_collectibles", + "other_credits", + "casualty_loss", + 
"unreported_payroll_tax", + "recapture_of_investment_credit", + "deductible_mortgage_interest", + "qualified_reit_and_ptp_income", + "qualified_bdc_income", + "farm_operations_income", + "estate_income_would_be_qualified", + "farm_operations_income_would_be_qualified", + "farm_rent_income_would_be_qualified", + "partnership_s_corp_income_would_be_qualified", + "rental_income_would_be_qualified", +] + + +def puf_clone_dataset( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int = 2024, + puf_dataset=None, + skip_qrf: bool = False, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Clone CPS data 2x and impute PUF variables on one half. + + The first half keeps CPS values (with OVERRIDDEN vars QRF'd). + The second half gets full PUF QRF imputation. The second half + has household weights set to zero. + + Args: + data: CPS dataset dict {variable: {time_period: array}}. + state_fips: State FIPS per household, shape (n_households,). + time_period: Tax year. + puf_dataset: PUF dataset class or path for QRF training. + If None, skips QRF (same as skip_qrf=True). + skip_qrf: If True, skip QRF imputation (for testing). + dataset_path: Path to CPS h5 file (needed for QRF to + compute demographic predictors via Microsimulation). + + Returns: + New data dict with doubled records. 
+ """ + household_ids = data["household_id"][time_period] + n_households = len(household_ids) + person_count = len(data["person_id"][time_period]) + + logger.info( + "PUF clone: %d households, %d persons", + n_households, + person_count, + ) + + y_full = None + y_override = None + if not skip_qrf and puf_dataset is not None: + y_full, y_override = _run_qrf_imputation( + data, + state_fips, + time_period, + puf_dataset, + dataset_path=dataset_path, + ) + + cps_sim = None + tbs = None + if (y_full or y_override) and dataset_path is not None: + from policyengine_us import Microsimulation + + cps_sim = Microsimulation(dataset=dataset_path) + tbs = cps_sim.tax_benefit_system + + def _map_to_entity(pred_values, variable_name): + if cps_sim is None or tbs is None: + return pred_values + var_meta = tbs.variables.get(variable_name) + if var_meta is None: + return pred_values + entity = var_meta.entity.key + if entity != "person": + return cps_sim.populations[entity].value_from_first_person( + pred_values + ) + return pred_values + + # Impute weeks_unemployed for PUF half + puf_weeks = None + if y_full is not None and dataset_path is not None: + puf_weeks = _impute_weeks_unemployed( + data, y_full, time_period, dataset_path + ) + + new_data = {} + for variable, time_dict in data.items(): + values = time_dict[time_period] + + if variable in OVERRIDDEN_IMPUTED_VARIABLES and y_override: + pred = _map_to_entity(y_override[variable], variable) + new_data[variable] = {time_period: np.concatenate([pred, pred])} + elif variable in IMPUTED_VARIABLES and y_full: + pred = _map_to_entity(y_full[variable], variable) + new_data[variable] = {time_period: np.concatenate([values, pred])} + elif variable == "person_id": + new_data[variable] = { + time_period: np.concatenate([values, values + values.max()]) + } + elif "_id" in variable: + new_data[variable] = { + time_period: np.concatenate([values, values + values.max()]) + } + elif "_weight" in variable: + new_data[variable] = { + 
time_period: np.concatenate([values, values * 0]) + } + elif variable == "weeks_unemployed" and puf_weeks is not None: + new_data[variable] = { + time_period: np.concatenate([values, puf_weeks]) + } + else: + new_data[variable] = { + time_period: np.concatenate([values, values]) + } + + new_data["state_fips"] = { + time_period: np.concatenate([state_fips, state_fips]).astype(np.int32) + } + + if y_full: + for var in IMPUTED_VARIABLES: + if var not in data: + pred = _map_to_entity(y_full[var], var) + orig = np.zeros_like(pred) + new_data[var] = {time_period: np.concatenate([orig, pred])} + + if cps_sim is not None: + del cps_sim + + logger.info( + "PUF clone complete: %d -> %d households", + n_households, + n_households * 2, + ) + return new_data + + +def _impute_weeks_unemployed( + data: Dict[str, Dict[int, np.ndarray]], + puf_imputations: Dict[str, np.ndarray], + time_period: int, + dataset_path: str, +) -> np.ndarray: + """Impute weeks_unemployed for the PUF half using QRF. + + Uses CPS as training data and imputed PUF demographics as + test data, preserving the joint distribution of weeks with + unemployment compensation. + + Args: + data: CPS data dict. + puf_imputations: Dict of PUF-imputed variable arrays. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Array of imputed weeks for PUF half. 
+ """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + cps_sim = Microsimulation(dataset=dataset_path) + try: + cps_weeks = cps_sim.calculate("weeks_unemployed").values + except (ValueError, KeyError): + logger.warning("weeks_unemployed not in CPS, returning zeros") + n_persons = len(data["person_id"][time_period]) + del cps_sim + return np.zeros(n_persons) + + WEEKS_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + ] + + X_train = cps_sim.calculate_dataframe(WEEKS_PREDICTORS) + X_train["weeks_unemployed"] = cps_weeks + + if "taxable_unemployment_compensation" in puf_imputations: + cps_uc = cps_sim.calculate("unemployment_compensation").values + X_train["unemployment_compensation"] = cps_uc + WEEKS_PREDICTORS = WEEKS_PREDICTORS + ["unemployment_compensation"] + + X_test = cps_sim.calculate_dataframe( + [p for p in WEEKS_PREDICTORS if p != "unemployment_compensation"] + ) + + if "taxable_unemployment_compensation" in puf_imputations: + X_test["unemployment_compensation"] = puf_imputations[ + "taxable_unemployment_compensation" + ] + + del cps_sim + + qrf = QRF(log_level="INFO", memory_efficient=True) + sample_size = min(5000, len(X_train)) + if len(X_train) > sample_size: + X_train_sampled = X_train.sample(n=sample_size, random_state=42) + else: + X_train_sampled = X_train + + fitted = qrf.fit( + X_train=X_train_sampled, + predictors=WEEKS_PREDICTORS, + imputed_variables=["weeks_unemployed"], + n_jobs=1, + ) + predictions = fitted.predict(X_test=X_test) + imputed_weeks = predictions["weeks_unemployed"].values + + imputed_weeks = np.clip(imputed_weeks, 0, 52) + if "unemployment_compensation" in X_test.columns: + imputed_weeks = np.where( + X_test["unemployment_compensation"].values > 0, + imputed_weeks, + 0, + ) + + logger.info( + "Imputed weeks_unemployed for PUF: " "%d with weeks > 0, mean = %.1f", + (imputed_weeks > 0).sum(), + ( + 
imputed_weeks[imputed_weeks > 0].mean() + if (imputed_weeks > 0).any() + else 0 + ), + ) + + del fitted, predictions + gc.collect() + return imputed_weeks + + +def _run_qrf_imputation( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + puf_dataset, + dataset_path: Optional[str] = None, +) -> tuple: + """Run QRF imputation for PUF variables. + + Trains QRF on ALL PUF records (no subsample) with random + state assignment, and predicts on CPS data using + demographics + state_fips. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + puf_dataset: PUF dataset class or path. + dataset_path: Path to CPS h5 for computing + demographic predictors via Microsimulation. + + Returns: + Tuple of (y_full_imputations, y_override_imputations) + as dicts of {variable: np.ndarray}. + """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + logger.info("Running QRF imputation with state predictor") + + puf_sim = Microsimulation(dataset=puf_dataset) + + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, + ) + + _, _, puf_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=99) + n_puf = len(puf_sim.calculate("person_id", map_to="person").values) + puf_state_indices = rng.choice(len(puf_states), size=n_puf, p=block_probs) + puf_state_fips = puf_states[puf_state_indices] + + demo_preds = [p for p in DEMOGRAPHIC_PREDICTORS if p != "state_fips"] + + X_train_full = puf_sim.calculate_dataframe(demo_preds + IMPUTED_VARIABLES) + X_train_full["state_fips"] = puf_state_fips.astype(np.float32) + + X_train_override = puf_sim.calculate_dataframe( + demo_preds + OVERRIDDEN_IMPUTED_VARIABLES + ) + X_train_override["state_fips"] = puf_state_fips.astype(np.float32) + + del puf_sim + + n_hh = len(data["household_id"][time_period]) + person_count = len(data["person_id"][time_period]) + + if 
dataset_path is not None: + cps_sim = Microsimulation(dataset=dataset_path) + X_test = cps_sim.calculate_dataframe(demo_preds) + del cps_sim + else: + X_test = pd.DataFrame() + for pred in demo_preds: + if pred in data: + X_test[pred] = data[pred][time_period].astype(np.float32) + + hh_ids_person = data.get("person_household_id", {}).get(time_period) + if hh_ids_person is not None: + hh_ids = data["household_id"][time_period] + hh_to_idx = {int(hh_id): i for i, hh_id in enumerate(hh_ids)} + person_states = np.array( + [state_fips[hh_to_idx[int(hh_id)]] for hh_id in hh_ids_person] + ) + else: + person_states = np.repeat( + state_fips, + person_count // n_hh, + ) + X_test["state_fips"] = person_states.astype(np.float32) + + predictors = DEMOGRAPHIC_PREDICTORS + + logger.info("Imputing %d PUF variables (full)", len(IMPUTED_VARIABLES)) + y_full = _batch_qrf(X_train_full, X_test, predictors, IMPUTED_VARIABLES) + + logger.info( + "Imputing %d PUF variables (override)", + len(OVERRIDDEN_IMPUTED_VARIABLES), + ) + y_override = _batch_qrf( + X_train_override, + X_test, + predictors, + OVERRIDDEN_IMPUTED_VARIABLES, + ) + + return y_full, y_override + + +def _batch_qrf( + X_train: pd.DataFrame, + X_test: pd.DataFrame, + predictors: List[str], + output_vars: List[str], + batch_size: int = 10, +) -> Dict[str, np.ndarray]: + """Run QRF in batches to control memory. + + Args: + X_train: Training data with predictors + output vars. + X_test: Test data with predictors only. + predictors: Predictor column names. + output_vars: Output variable names to impute. + batch_size: Variables per batch. + + Returns: + Dict mapping variable name to imputed values. 
+ """ + from microimpute.models.qrf import QRF + + available = [c for c in output_vars if c in X_train.columns] + missing = [c for c in output_vars if c not in X_train.columns] + + if missing: + logger.warning( + "%d variables missing from training: %s", + len(missing), + missing[:5], + ) + + result = {} + + for batch_start in range(0, len(available), batch_size): + batch_end = min(batch_start + batch_size, len(available)) + batch_vars = available[batch_start:batch_end] + + gc.collect() + + qrf = QRF( + log_level="INFO", + memory_efficient=True, + batch_size=10, + cleanup_interval=5, + ) + + batch_X_train = X_train[predictors + batch_vars].copy() + + fitted = qrf.fit( + X_train=batch_X_train, + predictors=predictors, + imputed_variables=batch_vars, + n_jobs=1, + ) + + predictions = fitted.predict(X_test=X_test) + + for var in batch_vars: + result[var] = predictions[var].values + + del fitted, predictions, batch_X_train + gc.collect() + + n_test = len(X_test) + for var in missing: + result[var] = np.zeros(n_test) + + return result diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py new file mode 100644 index 00000000..1d5021a2 --- /dev/null +++ b/policyengine_us_data/calibration/source_impute.py @@ -0,0 +1,757 @@ +"""Non-PUF QRF imputations with state_fips as predictor. + +Re-imputes variables from ACS, SIPP, and SCF donor surveys +with state_fips included as a QRF predictor. This runs after +geography assignment so imputations reflect assigned state. + +Sources and variables: + ACS -> rent, real_estate_taxes + SIPP -> tip_income, bank_account_assets, stock_assets, + bond_assets + SCF -> net_worth, auto_loan_balance, auto_loan_interest + +Usage in unified calibration pipeline: + 1. Load raw CPS + 2. Clone Nx, assign geography + 3. impute_source_variables() <-- this module + 4. PUF clone + QRF impute (puf_impute.py) + 5. 
PE simulate, build matrix, calibrate +""" + +import gc +import logging +from typing import Dict, Optional + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +ACS_IMPUTED_VARIABLES = [ + "rent", + "real_estate_taxes", +] + +SIPP_IMPUTED_VARIABLES = [ + "tip_income", + "bank_account_assets", + "stock_assets", + "bond_assets", +] + +SCF_IMPUTED_VARIABLES = [ + "net_worth", + "auto_loan_balance", + "auto_loan_interest", +] + +ALL_SOURCE_VARIABLES = ( + ACS_IMPUTED_VARIABLES + SIPP_IMPUTED_VARIABLES + SCF_IMPUTED_VARIABLES +) + +ACS_PREDICTORS = [ + "is_household_head", + "age", + "is_male", + "tenure_type", + "employment_income", + "self_employment_income", + "social_security", + "pension_income", + "household_size", +] + +SIPP_TIPS_PREDICTORS = [ + "employment_income", + "age", + "count_under_18", + "count_under_6", +] + +SIPP_ASSETS_PREDICTORS = [ + "employment_income", + "age", + "is_female", + "is_married", + "count_under_18", +] + +SCF_PREDICTORS = [ + "age", + "is_female", + "cps_race", + "is_married", + "own_children_in_household", + "employment_income", + "interest_dividend_income", + "social_security_pension_income", +] + + +def impute_source_variables( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int = 2024, + dataset_path: Optional[str] = None, + skip_acs: bool = False, + skip_sipp: bool = False, + skip_scf: bool = False, +) -> Dict[str, Dict[int, np.ndarray]]: + """Re-impute ACS/SIPP/SCF variables with state as predictor. + + Overwrites existing imputed values in data with new values + that use assigned state_fips as a QRF predictor. + + Args: + data: CPS dataset dict {variable: {time_period: array}}. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + skip_acs: Skip ACS imputation. + skip_sipp: Skip SIPP imputation. + skip_scf: Skip SCF imputation. + + Returns: + Updated data dict with re-imputed variables. 
+ """ + data["state_fips"] = { + time_period: state_fips.astype(np.int32), + } + + if not skip_acs: + logger.info("Imputing ACS variables with state predictor") + data = _impute_acs(data, state_fips, time_period, dataset_path) + + if not skip_sipp: + logger.info("Imputing SIPP variables with state predictor") + data = _impute_sipp(data, state_fips, time_period, dataset_path) + + if not skip_scf: + logger.info("Imputing SCF variables with state predictor") + data = _impute_scf(data, state_fips, time_period, dataset_path) + + return data + + +def _build_cps_receiver( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + dataset_path: Optional[str], + pe_variables: list, +) -> pd.DataFrame: + """Build CPS receiver DataFrame from Microsimulation. + + Args: + data: CPS data dict. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + pe_variables: List of PE variable names to compute. + + Returns: + DataFrame with requested columns. + """ + if dataset_path is not None: + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=dataset_path) + tbs = sim.tax_benefit_system + valid_vars = [v for v in pe_variables if v in tbs.variables] + if valid_vars: + df = sim.calculate_dataframe(valid_vars) + else: + df = pd.DataFrame(index=range(len(data["person_id"][time_period]))) + del sim + else: + df = pd.DataFrame() + + for var in pe_variables: + if var not in df.columns and var in data: + df[var] = data[var][time_period].astype(np.float32) + + return df + + +def _get_variable_entity(variable_name: str) -> str: + """Return the entity key for a PE variable.""" + from policyengine_us import CountryTaxBenefitSystem + + tbs = CountryTaxBenefitSystem() + var = tbs.variables.get(variable_name) + if var is None: + return "person" + return var.entity.key + + +def _person_state_fips( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, +) -> np.ndarray: + """Map household-level state_fips to person 
level. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + + Returns: + Person-level state FIPS array. + """ + hh_ids_person = data.get("person_household_id", {}).get(time_period) + if hh_ids_person is not None: + hh_ids = data["household_id"][time_period] + hh_to_idx = {int(hh_id): i for i, hh_id in enumerate(hh_ids)} + return np.array( + [state_fips[hh_to_idx[int(hh_id)]] for hh_id in hh_ids_person] + ) + n_hh = len(data["household_id"][time_period]) + n_persons = len(data["person_id"][time_period]) + return np.repeat(state_fips, n_persons // n_hh) + + +def _impute_acs( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Impute rent and real_estate_taxes from ACS with state. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Updated data dict. 
+ """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + from policyengine_us_data.datasets.acs.acs import ACS_2022 + + acs = Microsimulation(dataset=ACS_2022) + predictors = ACS_PREDICTORS + ["state_fips"] + + acs_df = acs.calculate_dataframe(ACS_PREDICTORS + ACS_IMPUTED_VARIABLES) + acs_df["state_fips"] = acs.calculate( + "state_fips", map_to="person" + ).values.astype(np.float32) + + train_df = acs_df[acs_df.is_household_head].sample(10_000, random_state=42) + if "tenure_type" in train_df.columns: + train_df["tenure_type"] = ( + train_df["tenure_type"] + .astype(str) + .map( + { + "OWNED_WITH_MORTGAGE": 1, + "OWNED_OUTRIGHT": 1, + "RENTED": 2, + "NONE": 0, + } + ) + .fillna(0) + .astype(np.float32) + ) + del acs + + if dataset_path is not None: + cps_sim = Microsimulation(dataset=dataset_path) + cps_df = cps_sim.calculate_dataframe(ACS_PREDICTORS) + del cps_sim + else: + cps_df = pd.DataFrame() + for pred in ACS_PREDICTORS: + if pred in data: + cps_df[pred] = data[pred][time_period].astype(np.float32) + + if "tenure_type" in cps_df.columns: + cps_df["tenure_type"] = ( + cps_df["tenure_type"] + .astype(str) + .map( + { + "OWNED_WITH_MORTGAGE": 1, + "OWNED_OUTRIGHT": 1, + "RENTED": 2, + "NONE": 0, + } + ) + .fillna(0) + .astype(np.float32) + ) + + person_states = _person_state_fips(data, state_fips, time_period) + cps_df["state_fips"] = person_states.astype(np.float32) + + mask = ( + cps_df.is_household_head.values + if "is_household_head" in cps_df.columns + else np.ones(len(cps_df), dtype=bool) + ) + cps_heads = cps_df[mask] + + qrf = QRF() + logger.info( + "ACS QRF: %d train, %d test, %d predictors", + len(train_df), + len(cps_heads), + len(predictors), + ) + fitted = qrf.fit( + X_train=train_df, + predictors=predictors, + imputed_variables=ACS_IMPUTED_VARIABLES, + ) + predictions = fitted.predict(X_test=cps_heads) + + n_persons = len(data["person_id"][time_period]) + for var in ACS_IMPUTED_VARIABLES: + values = 
np.zeros(n_persons, dtype=np.float32) + values[mask] = predictions[var].values + data[var] = {time_period: values} + data["pre_subsidy_rent"] = {time_period: data["rent"][time_period].copy()} + + del fitted, predictions + gc.collect() + + logger.info("ACS imputation complete: rent, real_estate_taxes") + return data + + +def _impute_sipp( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Impute tip_income and liquid assets from SIPP with state. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Updated data dict. + """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, + ) + + _, _, donor_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=88) + + from policyengine_us_data.storage import STORAGE_FOLDER + from huggingface_hub import hf_hub_download + + hf_hub_download( + repo_id="PolicyEngine/policyengine-us-data", + filename="pu2023_slim.csv", + repo_type="model", + local_dir=STORAGE_FOLDER, + ) + sipp_df = pd.read_csv(STORAGE_FOLDER / "pu2023_slim.csv") + + sipp_df["tip_income"] = ( + sipp_df[sipp_df.columns[sipp_df.columns.str.contains("TXAMT")]] + .fillna(0) + .sum(axis=1) + * 12 + ) + sipp_df["employment_income"] = sipp_df.TPTOTINC * 12 + sipp_df["age"] = sipp_df.TAGE + sipp_df["household_weight"] = sipp_df.WPFINWGT + sipp_df["household_id"] = sipp_df.SSUID + + sipp_df["is_under_18"] = sipp_df.TAGE < 18 + sipp_df["is_under_6"] = sipp_df.TAGE < 6 + sipp_df["count_under_18"] = ( + sipp_df.groupby("SSUID")["is_under_18"] + .sum() + .loc[sipp_df.SSUID.values] + .values + ) + sipp_df["count_under_6"] = ( + sipp_df.groupby("SSUID")["is_under_6"] + .sum() + 
.loc[sipp_df.SSUID.values] + .values + ) + + tip_cols = [ + "household_id", + "employment_income", + "tip_income", + "count_under_18", + "count_under_6", + "age", + "household_weight", + ] + tip_train = sipp_df[tip_cols].dropna() + tip_train = tip_train.loc[ + rng.choice( + tip_train.index, + size=min(10_000, len(tip_train)), + replace=True, + p=(tip_train.household_weight / tip_train.household_weight.sum()), + ) + ] + + tip_state_idx = rng.choice( + len(donor_states), size=len(tip_train), p=block_probs + ) + tip_train["state_fips"] = donor_states[tip_state_idx].astype(np.float32) + + cps_tip_df = _build_cps_receiver( + data, time_period, dataset_path, ["employment_income", "age"] + ) + person_ages = data["age"][time_period] + hh_ids_person = data.get("person_household_id", {}).get(time_period) + if hh_ids_person is not None: + age_df = pd.DataFrame({"hh": hh_ids_person, "age": person_ages}) + under_18 = age_df.groupby("hh")["age"].apply(lambda x: (x < 18).sum()) + under_6 = age_df.groupby("hh")["age"].apply(lambda x: (x < 6).sum()) + cps_tip_df["count_under_18"] = under_18.loc[ + hh_ids_person + ].values.astype(np.float32) + cps_tip_df["count_under_6"] = under_6.loc[hh_ids_person].values.astype( + np.float32 + ) + else: + cps_tip_df["count_under_18"] = 0.0 + cps_tip_df["count_under_6"] = 0.0 + + person_states = _person_state_fips(data, state_fips, time_period) + cps_tip_df["state_fips"] = person_states.astype(np.float32) + + tip_predictors = SIPP_TIPS_PREDICTORS + ["state_fips"] + qrf = QRF() + logger.info( + "SIPP tips QRF: %d train, %d test", + len(tip_train), + len(cps_tip_df), + ) + fitted = qrf.fit( + X_train=tip_train, + predictors=tip_predictors, + imputed_variables=["tip_income"], + ) + tip_preds = fitted.predict(X_test=cps_tip_df) + data["tip_income"] = { + time_period: tip_preds["tip_income"].values, + } + del fitted, tip_preds + gc.collect() + + logger.info("SIPP tip imputation complete") + + # Asset imputation + try: + hf_hub_download( + 
repo_id="PolicyEngine/policyengine-us-data", + filename="pu2023.csv", + repo_type="model", + local_dir=STORAGE_FOLDER, + ) + asset_cols = [ + "SSUID", + "PNUM", + "MONTHCODE", + "WPFINWGT", + "TAGE", + "ESEX", + "EMS", + "TPTOTINC", + "TVAL_BANK", + "TVAL_STMF", + "TVAL_BOND", + ] + asset_df = pd.read_csv( + STORAGE_FOLDER / "pu2023.csv", + delimiter="|", + usecols=asset_cols, + ) + asset_df = asset_df[asset_df.MONTHCODE == 12] + + asset_df["bank_account_assets"] = asset_df["TVAL_BANK"].fillna(0) + asset_df["stock_assets"] = asset_df["TVAL_STMF"].fillna(0) + asset_df["bond_assets"] = asset_df["TVAL_BOND"].fillna(0) + asset_df["age"] = asset_df.TAGE + asset_df["is_female"] = asset_df.ESEX == 2 + asset_df["is_married"] = asset_df.EMS == 1 + asset_df["employment_income"] = asset_df.TPTOTINC * 12 + asset_df["household_weight"] = asset_df.WPFINWGT + asset_df["is_under_18"] = asset_df.TAGE < 18 + asset_df["count_under_18"] = ( + asset_df.groupby("SSUID")["is_under_18"] + .sum() + .loc[asset_df.SSUID.values] + .values + ) + + asset_train_cols = [ + "employment_income", + "bank_account_assets", + "stock_assets", + "bond_assets", + "age", + "is_female", + "is_married", + "count_under_18", + "household_weight", + ] + asset_train = asset_df[asset_train_cols].dropna() + asset_train = asset_train.loc[ + rng.choice( + asset_train.index, + size=min(20_000, len(asset_train)), + replace=True, + p=( + asset_train.household_weight + / asset_train.household_weight.sum() + ), + ) + ] + + asset_state_idx = rng.choice( + len(donor_states), + size=len(asset_train), + p=block_probs, + ) + asset_train["state_fips"] = donor_states[asset_state_idx].astype( + np.float32 + ) + + cps_asset_df = _build_cps_receiver( + data, + time_period, + dataset_path, + ["employment_income", "age", "is_male"], + ) + if "is_male" in cps_asset_df.columns: + cps_asset_df["is_female"] = ( + ~cps_asset_df["is_male"].astype(bool) + ).astype(np.float32) + else: + cps_asset_df["is_female"] = 0.0 + if "is_married" in 
data: + cps_asset_df["is_married"] = data["is_married"][ + time_period + ].astype(np.float32) + else: + cps_asset_df["is_married"] = 0.0 + cps_asset_df["count_under_18"] = ( + cps_tip_df["count_under_18"] + if "count_under_18" in cps_tip_df.columns + else 0.0 + ) + + cps_asset_df["state_fips"] = person_states.astype(np.float32) + + asset_predictors = SIPP_ASSETS_PREDICTORS + ["state_fips"] + asset_vars = [ + "bank_account_assets", + "stock_assets", + "bond_assets", + ] + qrf = QRF() + logger.info( + "SIPP assets QRF: %d train, %d test", + len(asset_train), + len(cps_asset_df), + ) + fitted = qrf.fit( + X_train=asset_train, + predictors=asset_predictors, + imputed_variables=asset_vars, + ) + asset_preds = fitted.predict(X_test=cps_asset_df) + + for var in asset_vars: + data[var] = { + time_period: asset_preds[var].values, + } + del fitted, asset_preds + gc.collect() + + logger.info("SIPP asset imputation complete") + + except Exception as e: + logger.warning( + "SIPP asset imputation failed: %s. " "Keeping existing values.", + e, + ) + + return data + + +def _impute_scf( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Impute net_worth and auto_loan from SCF with state. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Updated data dict. 
+ """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, + ) + from policyengine_us_data.datasets.scf.scf import SCF_2022 + + _, _, donor_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=77) + + scf_dataset = SCF_2022() + scf_data = scf_dataset.load_dataset() + scf_df = pd.DataFrame({key: scf_data[key] for key in scf_data.keys()}) + + scf_state_idx = rng.choice( + len(donor_states), size=len(scf_df), p=block_probs + ) + scf_df["state_fips"] = donor_states[scf_state_idx].astype(np.float32) + + scf_predictors = SCF_PREDICTORS + ["state_fips"] + + available_preds = [p for p in scf_predictors if p in scf_df.columns] + missing_preds = [p for p in scf_predictors if p not in scf_df.columns] + if missing_preds: + logger.warning("SCF missing predictors: %s", missing_preds) + scf_predictors = available_preds + + scf_vars = SCF_IMPUTED_VARIABLES + if "networth" in scf_df.columns and "net_worth" not in scf_df.columns: + scf_df["net_worth"] = scf_df["networth"] + + available_vars = [v for v in scf_vars if v in scf_df.columns] + if not available_vars: + logger.warning("No SCF imputed variables available. 
Skipping.") + return data + + weights = scf_df.get("wgt") + + donor = scf_df[scf_predictors + available_vars].copy() + if weights is not None: + donor["wgt"] = weights + donor = donor.dropna(subset=scf_predictors) + donor = donor.sample(frac=0.5, random_state=42).reset_index(drop=True) + + pe_vars = [ + "age", + "is_male", + "employment_income", + ] + cps_df = _build_cps_receiver(data, time_period, dataset_path, pe_vars) + + if "is_male" in cps_df.columns: + cps_df["is_female"] = (~cps_df["is_male"].astype(bool)).astype( + np.float32 + ) + else: + cps_df["is_female"] = 0.0 + + for var in [ + "cps_race", + "is_married", + "own_children_in_household", + ]: + if var in data: + cps_df[var] = data[var][time_period].astype(np.float32) + else: + cps_df[var] = 0.0 + + for var in [ + "taxable_interest_income", + "tax_exempt_interest_income", + "qualified_dividend_income", + "non_qualified_dividend_income", + ]: + if var in data: + cps_df[var] = data[var][time_period].astype(np.float32) + cps_df["interest_dividend_income"] = ( + cps_df.get("taxable_interest_income", 0) + + cps_df.get("tax_exempt_interest_income", 0) + + cps_df.get("qualified_dividend_income", 0) + + cps_df.get("non_qualified_dividend_income", 0) + ).astype(np.float32) + + for var in [ + "tax_exempt_private_pension_income", + "taxable_private_pension_income", + "social_security_retirement", + ]: + if var in data: + cps_df[var] = data[var][time_period].astype(np.float32) + cps_df["social_security_pension_income"] = ( + cps_df.get("tax_exempt_private_pension_income", 0) + + cps_df.get("taxable_private_pension_income", 0) + + cps_df.get("social_security_retirement", 0) + ).astype(np.float32) + + person_states = _person_state_fips(data, state_fips, time_period) + cps_df["state_fips"] = person_states.astype(np.float32) + + qrf = QRF() + logger.info( + "SCF QRF: %d train, %d test, vars=%s", + len(donor), + len(cps_df), + available_vars, + ) + fitted = qrf.fit( + X_train=donor, + predictors=scf_predictors, + 
imputed_variables=available_vars, + weight_col="wgt" if weights is not None else None, + tune_hyperparameters=False, + ) + preds = fitted.predict(X_test=cps_df) + + hh_ids = data["household_id"][time_period] + person_hh_ids = data.get("person_household_id", {}).get(time_period) + + for var in available_vars: + person_vals = preds[var].values + entity = _get_variable_entity(var) + if entity == "household" and person_hh_ids is not None: + hh_vals = np.zeros(len(hh_ids), dtype=np.float32) + hh_to_idx = {int(hid): i for i, hid in enumerate(hh_ids)} + seen = set() + for p_idx, p_hh in enumerate(person_hh_ids): + hh_key = int(p_hh) + if hh_key not in seen: + seen.add(hh_key) + hh_vals[hh_to_idx[hh_key]] = person_vals[p_idx] + data[var] = {time_period: hh_vals} + logger.info( + " %s: person(%d) -> household(%d)", + var, + len(person_vals), + len(hh_vals), + ) + else: + data[var] = {time_period: person_vals} + + del fitted, preds + gc.collect() + + logger.info("SCF imputation complete: %s", available_vars) + return data diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index d2759b34..3a95952f 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -4,10 +4,12 @@ Pipeline flow: 1. Load CPS dataset -> get n_records 2. Clone Nx, assign random geography (census block) - 3. Re-randomize simple takeup variables per block - 4. Build sparse calibration matrix (clone-by-clone) - 5. L0-regularized optimization -> calibrated weights - 6. Save weights, diagnostics, run config + 3. (Optional) Source impute ACS/SIPP/SCF vars with state + 4. (Optional) PUF clone (2x) + QRF impute with state + 5. Re-randomize simple takeup variables per block + 6. Build sparse calibration matrix (clone-by-clone) + 7. L0-regularized optimization -> calibrated weights + 8. 
Save weights, diagnostics, run config Two presets control output size via L0 regularization: - local: L0=1e-8, ~3-4M records (for local area dataset) @@ -19,7 +21,8 @@ --db-path path/to/policy_data.db \\ --output path/to/weights.npy \\ --preset local \\ - --epochs 100 + --epochs 100 \\ + --puf-dataset path/to/puf_2024.h5 """ import argparse @@ -253,6 +256,21 @@ def parse_args(argv=None): action="store_true", help="Skip takeup re-randomization", ) + parser.add_argument( + "--puf-dataset", + default=None, + help="Path to PUF h5 file for QRF training", + ) + parser.add_argument( + "--skip-puf", + action="store_true", + help="Skip PUF clone + QRF imputation", + ) + parser.add_argument( + "--skip-source-impute", + action="store_true", + help="Skip ACS/SIPP/SCF re-imputation with state", + ) return parser.parse_args(argv) @@ -389,6 +407,84 @@ def compute_diagnostics( ) +def _build_puf_cloned_dataset( + dataset_path: str, + puf_dataset_path: str, + state_fips: np.ndarray, + time_period: int = 2024, + skip_qrf: bool = False, + skip_source_impute: bool = False, +) -> str: + """Build a PUF-cloned dataset from raw CPS. + + Loads the CPS, optionally runs source imputations + (ACS/SIPP/SCF), then PUF clone + QRF. + + Args: + dataset_path: Path to raw CPS h5 file. + puf_dataset_path: Path to PUF h5 file. + state_fips: State FIPS per household (base records). + time_period: Tax year. + skip_qrf: Skip QRF imputation. + skip_source_impute: Skip ACS/SIPP/SCF imputations. + + Returns: + Path to the PUF-cloned h5 file. 
+ """ + import h5py + + from policyengine_us import Microsimulation + + from policyengine_us_data.calibration.puf_impute import ( + puf_clone_dataset, + ) + + logger.info("Building PUF-cloned dataset from %s", dataset_path) + + sim = Microsimulation(dataset=dataset_path) + data = sim.dataset.load_dataset() + + data_dict = {} + for var in data: + data_dict[var] = {time_period: data[var][...]} + + if not skip_source_impute: + from policyengine_us_data.calibration.source_impute import ( + impute_source_variables, + ) + + data_dict = impute_source_variables( + data=data_dict, + state_fips=state_fips, + time_period=time_period, + dataset_path=dataset_path, + ) + + puf_dataset = puf_dataset_path if not skip_qrf else None + + new_data = puf_clone_dataset( + data=data_dict, + state_fips=state_fips, + time_period=time_period, + puf_dataset=puf_dataset, + skip_qrf=skip_qrf, + dataset_path=dataset_path, + ) + + output_path = str( + Path(dataset_path).parent / f"puf_cloned_{Path(dataset_path).stem}.h5" + ) + + with h5py.File(output_path, "w") as f: + for var, time_dict in new_data.items(): + for tp, values in time_dict.items(): + f.create_dataset(f"{var}/{tp}", data=values) + + del sim + logger.info("PUF-cloned dataset saved to %s", output_path) + return output_path + + def run_calibration( dataset_path: str, db_path: str, @@ -400,6 +496,9 @@ def run_calibration( domain_variables: list = None, hierarchical_domains: list = None, skip_takeup_rerandomize: bool = False, + puf_dataset_path: str = None, + skip_puf: bool = False, + skip_source_impute: bool = False, ): """Run unified calibration pipeline. @@ -415,6 +514,9 @@ def run_calibration( hierarchical_domains: Domains for hierarchical uprating + CD reconciliation. skip_takeup_rerandomize: Skip takeup step. + puf_dataset_path: Path to PUF h5 for QRF training. + skip_puf: Skip PUF clone step. + skip_source_impute: Skip ACS/SIPP/SCF imputations. 
Returns: (weights, targets_df, X_sparse, target_names) @@ -425,6 +527,7 @@ def run_calibration( from policyengine_us_data.calibration.clone_and_assign import ( assign_random_geography, + double_geography_for_puf, ) from policyengine_us_data.calibration.unified_matrix_builder import ( UnifiedMatrixBuilder, @@ -451,7 +554,76 @@ def run_calibration( seed=seed, ) - # Step 3: Build sim_modifier for takeup rerandomization + # Step 3: Source impute + PUF clone (if requested) + dataset_for_matrix = dataset_path + if not skip_puf and puf_dataset_path is not None: + base_states = geography.state_fips[:n_records] + + puf_cloned_path = _build_puf_cloned_dataset( + dataset_path=dataset_path, + puf_dataset_path=puf_dataset_path, + state_fips=base_states, + time_period=2024, + skip_qrf=False, + skip_source_impute=skip_source_impute, + ) + + geography = double_geography_for_puf(geography) + dataset_for_matrix = puf_cloned_path + n_records = n_records * 2 + + # Reload sim from PUF-cloned dataset + del sim + sim = Microsimulation(dataset=puf_cloned_path) + + logger.info( + "After PUF clone: %d records x %d clones = %d", + n_records, + n_clones, + n_records * n_clones, + ) + elif not skip_source_impute: + # Run source imputations without PUF cloning + import h5py + + base_states = geography.state_fips[:n_records] + + source_sim = Microsimulation(dataset=dataset_path) + raw_data = source_sim.dataset.load_dataset() + data_dict = {} + for var in raw_data: + data_dict[var] = {2024: raw_data[var][...]} + del source_sim + + from policyengine_us_data.calibration.source_impute import ( + impute_source_variables, + ) + + data_dict = impute_source_variables( + data=data_dict, + state_fips=base_states, + time_period=2024, + dataset_path=dataset_path, + ) + + source_path = str( + Path(dataset_path).parent + / f"source_imputed_{Path(dataset_path).stem}.h5" + ) + with h5py.File(source_path, "w") as f: + for var, time_dict in data_dict.items(): + for tp, values in time_dict.items(): + 
f.create_dataset(f"{var}/{tp}", data=values) + dataset_for_matrix = source_path + + del sim + sim = Microsimulation(dataset=source_path) + logger.info( + "Source-imputed dataset saved to %s", + source_path, + ) + + # Step 4: Build sim_modifier for takeup rerandomization sim_modifier = None if not skip_takeup_rerandomize: time_period = 2024 @@ -463,18 +635,18 @@ def sim_modifier(s, clone_idx): states = geography.state_fips[col_start:col_end] rerandomize_takeup(s, blocks, states, time_period) - # Step 4: Build target filter + # Step 5: Build target filter target_filter = {} if domain_variables: target_filter["domain_variables"] = domain_variables - # Step 5: Build sparse calibration matrix + # Step 6: Build sparse calibration matrix t_matrix = time.time() db_uri = f"sqlite:///{db_path}" builder = UnifiedMatrixBuilder( db_uri=db_uri, time_period=2024, - dataset_path=dataset_path, + dataset_path=dataset_for_matrix, ) targets_df, X_sparse, target_names = builder.build_matrix( geography=geography, @@ -495,7 +667,7 @@ def sim_modifier(s, clone_idx): X_sparse.nnz, ) - # Step 6: L0 calibration + # Step 7: L0 calibration targets = targets_df["value"].values row_sums = np.array(X_sparse.sum(axis=1)).flatten() @@ -570,6 +742,8 @@ def main(argv=None): t_start = time.time() + puf_dataset_path = getattr(args, "puf_dataset", None) + weights, targets_df, X_sparse, target_names = run_calibration( dataset_path=dataset_path, db_path=db_path, @@ -581,6 +755,9 @@ def main(argv=None): domain_variables=domain_variables, hierarchical_domains=hierarchical_domains, skip_takeup_rerandomize=(args.skip_takeup_rerandomize), + puf_dataset_path=puf_dataset_path, + skip_puf=getattr(args, "skip_puf", False), + skip_source_impute=getattr(args, "skip_source_impute", False), ) # Save weights @@ -612,6 +789,9 @@ def main(argv=None): run_config = { "dataset": dataset_path, "db_path": db_path, + "puf_dataset": puf_dataset_path, + "skip_puf": getattr(args, "skip_puf", False), + "skip_source_impute": 
getattr(args, "skip_source_impute", False), "n_clones": args.n_clones, "lambda_l0": lambda_l0, "epochs": args.epochs, diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index f96384a6..6af86ca0 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -1,140 +1,14 @@ +import logging + +import numpy as np from policyengine_core.data import Dataset -from policyengine_us_data.storage import STORAGE_FOLDER from typing import Type + +from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.datasets.cps.cps import * from policyengine_us_data.datasets.puf import * -import pandas as pd -from microimpute.models.qrf import QRF -import time -import logging -import gc -# These are sorted by magnitude. -# First 15 contain 90%. -# First 7 contain 75%. -# If you're trying to debug this part of the code and don't want to wait ages -# to see if something breaks, try limiting to those. 
-IMPUTED_VARIABLES = [ - "employment_income", - "partnership_s_corp_income", - "social_security", - "taxable_pension_income", - "interest_deduction", - "tax_exempt_pension_income", - "long_term_capital_gains", - "unreimbursed_business_employee_expenses", - "pre_tax_contributions", - "taxable_ira_distributions", - "self_employment_income", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "business_is_sstb", # bool - "short_term_capital_gains", - "qualified_dividend_income", - "charitable_cash_donations", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "taxable_unemployment_compensation", - "taxable_interest_income", - "domestic_production_ald", - "self_employed_health_insurance_ald", - "rental_income", - "non_qualified_dividend_income", - "cdcc_relevant_expenses", - "tax_exempt_interest_income", - "salt_refund_income", - "foreign_tax_credit", - "estate_income", - "charitable_non_cash_donations", - "american_opportunity_credit", - "miscellaneous_income", - "alimony_expense", - "farm_income", - "partnership_se_income", - "alimony_income", - "health_savings_account_ald", - "non_sch_d_capital_gains", - "general_business_credit", - "energy_efficient_home_improvement_credit", - "traditional_ira_contributions", - "amt_foreign_tax_credit", - "excess_withheld_payroll_tax", - "savers_credit", - "student_loan_interest", - "investment_income_elected_form_4952", - "early_withdrawal_penalty", - "prior_year_minimum_tax_credit", - "farm_rent_income", - "qualified_tuition_expenses", - "educator_expense", - "long_term_capital_gains_on_collectibles", - "other_credits", - "casualty_loss", - "unreported_payroll_tax", - "recapture_of_investment_credit", - "deductible_mortgage_interest", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "farm_operations_income", - "estate_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - 
"partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", - "self_employment_income_would_be_qualified", -] - -OVERRIDDEN_IMPUTED_VARIABLES = [ - "partnership_s_corp_income", - "interest_deduction", - "unreimbursed_business_employee_expenses", - "pre_tax_contributions", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "business_is_sstb", - "charitable_cash_donations", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "taxable_unemployment_compensation", - "domestic_production_ald", - "self_employed_health_insurance_ald", - "cdcc_relevant_expenses", - "salt_refund_income", - "foreign_tax_credit", - "estate_income", - "charitable_non_cash_donations", - "american_opportunity_credit", - "miscellaneous_income", - "alimony_expense", - "health_savings_account_ald", - "non_sch_d_capital_gains", - "general_business_credit", - "energy_efficient_home_improvement_credit", - "amt_foreign_tax_credit", - "excess_withheld_payroll_tax", - "savers_credit", - "student_loan_interest", - "investment_income_elected_form_4952", - "early_withdrawal_penalty", - "prior_year_minimum_tax_credit", - "farm_rent_income", - "qualified_tuition_expenses", - "educator_expense", - "long_term_capital_gains_on_collectibles", - "other_credits", - "casualty_loss", - "unreported_payroll_tax", - "recapture_of_investment_credit", - "deductible_mortgage_interest", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "farm_operations_income", - "estate_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - "partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", -] +logger = logging.getLogger(__name__) class ExtendedCPS(Dataset): @@ -145,290 +19,40 @@ class ExtendedCPS(Dataset): def generate(self): from policyengine_us import Microsimulation - cps_sim = Microsimulation(dataset=self.cps) - puf_sim = 
Microsimulation(dataset=self.puf) - - puf_sim.subsample(10_000) - - INPUTS = [ - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ] - - y_full_imputations = impute_income_variables( - cps_sim, - puf_sim, - predictors=INPUTS, - outputs=IMPUTED_VARIABLES, + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, ) - y_cps_imputations = impute_income_variables( - cps_sim, - puf_sim, - predictors=INPUTS, - outputs=OVERRIDDEN_IMPUTED_VARIABLES, + from policyengine_us_data.calibration.puf_impute import ( + puf_clone_dataset, ) + + logger.info("Loading CPS dataset: %s", self.cps) cps_sim = Microsimulation(dataset=self.cps) data = cps_sim.dataset.load_dataset() - new_data = {} - - # Pre-compute weeks_unemployed imputation for PUF copy - # Preserve relationship between UC and weeks from CPS - puf_weeks_unemployed = impute_weeks_unemployed_for_puf( - cps_sim, y_full_imputations + del cps_sim + + data_dict = {} + for var in data: + data_dict[var] = {self.time_period: data[var][...]} + + n_hh = len(data_dict["household_id"][self.time_period]) + _, _, block_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=42) + indices = rng.choice(len(block_states), size=n_hh, p=block_probs) + state_fips = block_states[indices] + + logger.info("PUF clone with dataset: %s", self.puf) + new_data = puf_clone_dataset( + data=data_dict, + state_fips=state_fips, + time_period=self.time_period, + puf_dataset=self.puf, + dataset_path=str(self.cps.file_path), ) - for variable in list(data) + IMPUTED_VARIABLES: - variable_metadata = cps_sim.tax_benefit_system.variables.get( - variable - ) - if variable in data: - values = data[variable][...] 
- else: - values = cps_sim.calculate(variable).values - if variable in OVERRIDDEN_IMPUTED_VARIABLES: - pred_values = y_cps_imputations[variable].values - entity = variable_metadata.entity.key - if entity != "person": - pred_values = cps_sim.populations[ - entity - ].value_from_first_person(pred_values) - values = np.concatenate([pred_values, pred_values]) - elif variable in IMPUTED_VARIABLES: - pred_values = y_full_imputations[variable].values - entity = variable_metadata.entity.key - if entity != "person": - pred_values = cps_sim.populations[ - entity - ].value_from_first_person(pred_values) - values = np.concatenate([values, pred_values]) - elif variable == "person_id": - values = np.concatenate([values, values + values.max()]) - elif "_id" in variable: - values = np.concatenate([values, values + values.max()]) - elif "_weight" in variable: - values = np.concatenate([values, values * 0]) - elif variable == "weeks_unemployed": - # Use imputed weeks for PUF copy to preserve UC relationship - values = np.concatenate([values, puf_weeks_unemployed]) - else: - values = np.concatenate([values, values]) - new_data[variable] = { - self.time_period: values, - } - self.save_dataset(new_data) -def impute_income_variables( - cps_sim, - puf_sim, - predictors: list[str] = None, - outputs: list[str] = None, -): - - # Calculate all variables together to preserve dependencies - X_train = puf_sim.calculate_dataframe(predictors + outputs) - - # Check which outputs are actually in the result - available_outputs = [col for col in outputs if col in X_train.columns] - missing_outputs = [col for col in outputs if col not in X_train.columns] - - if missing_outputs: - logging.warning( - f"The following {len(missing_outputs)} variables were not calculated: {missing_outputs}" - ) - # Log the specific missing variable that's causing issues - if "recapture_of_investment_credit" in missing_outputs: - logging.error( - "recapture_of_investment_credit is missing from PUF calculation!" 
- ) - - logging.info( - f"X_train shape: {X_train.shape}, columns: {len(X_train.columns)}" - ) - - X_test = cps_sim.calculate_dataframe(predictors) - - logging.info( - f"Imputing {len(available_outputs)} variables using batched sequential QRF" - ) - total_start = time.time() - - # Batch variables to avoid memory issues with sequential imputation - batch_size = 10 # Reduce to 10 variables at a time - result = pd.DataFrame(index=X_test.index) - - # Sample training data more aggressively upfront - sample_size = min(5000, len(X_train)) # Reduced from 5000 - if len(X_train) > sample_size: - logging.info( - f"Sampling training data from {len(X_train)} to {sample_size} rows" - ) - X_train_sampled = X_train.sample(n=sample_size, random_state=42) - else: - X_train_sampled = X_train - - for batch_start in range(0, len(available_outputs), batch_size): - batch_end = min(batch_start + batch_size, len(available_outputs)) - batch_vars = available_outputs[batch_start:batch_end] - - logging.info( - f"Processing batch {batch_start//batch_size + 1}: variables {batch_start+1}-{batch_end} ({batch_vars})" - ) - - # Force garbage collection before each batch - gc.collect() - - # Create a fresh QRF for each batch - qrf = QRF( - log_level="INFO", - memory_efficient=True, - batch_size=10, - cleanup_interval=5, - ) - - # Use pre-sampled data for this batch - batch_X_train = X_train_sampled[predictors + batch_vars].copy() - - # Fit model for this batch with sequential imputation within the batch - fitted_model = qrf.fit( - X_train=batch_X_train, - predictors=predictors, - imputed_variables=batch_vars, - n_jobs=1, # Single thread to reduce memory overhead - ) - - # Predict for this batch - batch_predictions = fitted_model.predict(X_test=X_test) - - # Extract median predictions and add to result - for var in batch_vars: - result[var] = batch_predictions[var] - - # Clean up batch objects - del fitted_model - del batch_predictions - del batch_X_train - gc.collect() - - logging.info(f"Completed 
batch {batch_start//batch_size + 1}") - - # Add zeros for missing variables - for var in missing_outputs: - result[var] = 0 - - logging.info( - f"Imputing {len(available_outputs)} variables took {time.time() - total_start:.2f} seconds total" - ) - - return result - - -def impute_weeks_unemployed_for_puf(cps_sim, puf_imputations): - """ - Impute weeks_unemployed for the PUF copy using QRF from CPS data. - - Uses microimpute's Quantile Random Forest to impute weeks_unemployed - for PUF records based on CPS data, preserving the joint distribution - of weeks with UC, age, and other predictors. - - This is the reverse of the income imputation (CPS → PUF instead of - PUF → CPS) because weeks_unemployed exists in CPS but not in PUF. - """ - # Get CPS weeks - try: - cps_weeks = cps_sim.calculate("weeks_unemployed").values - except (ValueError, KeyError): - logging.warning( - "weeks_unemployed not available in CPS, " - "returning zeros for PUF copy" - ) - n_persons = len(puf_imputations.index) - return np.zeros(n_persons) - - # Predictors available in both CPS and imputed PUF data - WEEKS_PREDICTORS = [ - "age", - "is_male", - "tax_unit_is_joint", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ] - - # Build training data from CPS - X_train = cps_sim.calculate_dataframe(WEEKS_PREDICTORS) - X_train["weeks_unemployed"] = cps_weeks - - # Add UC as predictor if available in imputations (strong predictor) - if "taxable_unemployment_compensation" in puf_imputations.columns: - cps_uc = cps_sim.calculate("unemployment_compensation").values - X_train["unemployment_compensation"] = cps_uc - WEEKS_PREDICTORS = WEEKS_PREDICTORS + ["unemployment_compensation"] - - # Build test data for PUF copy - # Use CPS sim to get demographics (same as CPS portion) - X_test = cps_sim.calculate_dataframe( - [p for p in WEEKS_PREDICTORS if p != "unemployment_compensation"] - ) - - # Add imputed UC if available - if "taxable_unemployment_compensation" in 
puf_imputations.columns: - X_test["unemployment_compensation"] = puf_imputations[ - "taxable_unemployment_compensation" - ].values - - logging.info( - f"Imputing weeks_unemployed using QRF with " - f"predictors: {WEEKS_PREDICTORS}" - ) - - # Use QRF to impute weeks - qrf = QRF( - log_level="INFO", - memory_efficient=True, - ) - - # Sample training data for efficiency - sample_size = min(5000, len(X_train)) - if len(X_train) > sample_size: - X_train_sampled = X_train.sample(n=sample_size, random_state=42) - else: - X_train_sampled = X_train - - fitted_model = qrf.fit( - X_train=X_train_sampled, - predictors=WEEKS_PREDICTORS, - imputed_variables=["weeks_unemployed"], - n_jobs=1, - ) - - predictions = fitted_model.predict(X_test=X_test) - imputed_weeks = predictions["weeks_unemployed"].values - - # Enforce constraints: 0-52 weeks, 0 if no UC - imputed_weeks = np.clip(imputed_weeks, 0, 52) - if "unemployment_compensation" in X_test.columns: - imputed_weeks = np.where( - X_test["unemployment_compensation"].values > 0, - imputed_weeks, - 0, - ) - - logging.info( - f"Imputed weeks_unemployed for PUF: " - f"{(imputed_weeks > 0).sum()} with weeks > 0, " - f"mean = {imputed_weeks[imputed_weeks > 0].mean():.1f} weeks" - ) - - return imputed_weeks - - class ExtendedCPS_2024(ExtendedCPS): cps = CPS_2024_Full puf = PUF_2024 diff --git a/policyengine_us_data/tests/test_calibration/test_puf_impute.py b/policyengine_us_data/tests/test_calibration/test_puf_impute.py new file mode 100644 index 00000000..153df144 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_puf_impute.py @@ -0,0 +1,134 @@ +"""Tests for puf_impute module. + +Verifies PUF clone + QRF imputation logic using mock data +so tests don't require real CPS/PUF datasets. 
+""" + +import numpy as np +import pytest + +from policyengine_us_data.calibration.puf_impute import ( + puf_clone_dataset, + DEMOGRAPHIC_PREDICTORS, + IMPUTED_VARIABLES, + OVERRIDDEN_IMPUTED_VARIABLES, +) + + +def _make_mock_data(n_persons=20, n_households=5, time_period=2024): + """Build a minimal mock CPS data dict.""" + person_ids = np.arange(1, n_persons + 1) + household_ids_person = np.repeat( + np.arange(1, n_households + 1), + n_persons // n_households, + ) + tax_unit_ids_person = household_ids_person.copy() + spm_unit_ids_person = household_ids_person.copy() + + rng = np.random.default_rng(42) + ages = rng.integers(18, 80, size=n_persons) + is_male = rng.integers(0, 2, size=n_persons) + + data = { + "person_id": {time_period: person_ids}, + "household_id": {time_period: np.arange(1, n_households + 1)}, + "tax_unit_id": {time_period: np.arange(1, n_households + 1)}, + "spm_unit_id": {time_period: np.arange(1, n_households + 1)}, + "person_household_id": {time_period: household_ids_person}, + "person_tax_unit_id": {time_period: tax_unit_ids_person}, + "person_spm_unit_id": {time_period: spm_unit_ids_person}, + "age": {time_period: ages.astype(np.float32)}, + "is_male": {time_period: is_male.astype(np.float32)}, + "household_weight": {time_period: np.ones(n_households) * 1000}, + "employment_income": { + time_period: rng.uniform(0, 100000, n_persons).astype(np.float32) + }, + } + return data + + +class TestPufCloneDataset: + def test_doubles_records(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + assert len(result["household_id"][2024]) == 10 + assert len(result["person_id"][2024]) == 40 + + def test_ids_are_unique(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + 
time_period=2024, + skip_qrf=True, + ) + + person_ids = result["person_id"][2024] + household_ids = result["household_id"][2024] + assert len(np.unique(person_ids)) == len(person_ids) + assert len(np.unique(household_ids)) == len(household_ids) + + def test_puf_half_weight_zero(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + weights = result["household_weight"][2024] + assert np.all(weights[:5] > 0) + assert np.all(weights[5:] == 0) + + def test_state_fips_preserved(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + result_states = result["state_fips"][2024] + np.testing.assert_array_equal(result_states[:5], state_fips) + np.testing.assert_array_equal(result_states[5:], state_fips) + + def test_demographics_shared(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + ages = result["age"][2024] + n = len(ages) // 2 + np.testing.assert_array_equal(ages[:n], ages[n:]) + + def test_demographic_predictors_includes_state(self): + assert "state_fips" in DEMOGRAPHIC_PREDICTORS + + def test_imputed_variables_not_empty(self): + assert len(IMPUTED_VARIABLES) > 0 + + def test_overridden_subset_of_imputed(self): + for var in OVERRIDDEN_IMPUTED_VARIABLES: + assert var in IMPUTED_VARIABLES diff --git a/policyengine_us_data/tests/test_calibration/test_source_impute.py b/policyengine_us_data/tests/test_calibration/test_source_impute.py new file mode 100644 index 00000000..676e66de --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_source_impute.py @@ -0,0 +1,169 @@ 
+"""Tests for source_impute module. + +Uses skip flags to avoid loading real donor data. +""" + +import numpy as np +import pytest + +from policyengine_us_data.calibration.source_impute import ( + ACS_IMPUTED_VARIABLES, + SIPP_IMPUTED_VARIABLES, + SCF_IMPUTED_VARIABLES, + ALL_SOURCE_VARIABLES, + impute_source_variables, + _impute_acs, + _impute_sipp, + _impute_scf, + _person_state_fips, +) + + +def _make_data_dict(n_persons=20, time_period=2024): + n_hh = n_persons // 2 + rng = np.random.default_rng(42) + return { + "person_id": { + time_period: np.arange(n_persons), + }, + "household_id": { + time_period: np.arange(n_hh), + }, + "person_household_id": { + time_period: np.repeat(np.arange(n_hh), 2), + }, + "age": { + time_period: rng.integers(18, 80, n_persons).astype(np.float32), + }, + "employment_income": { + time_period: rng.uniform(0, 100000, n_persons).astype(np.float32), + }, + "rent": {time_period: np.zeros(n_persons)}, + "real_estate_taxes": {time_period: np.zeros(n_persons)}, + "tip_income": {time_period: np.zeros(n_persons)}, + "bank_account_assets": {time_period: np.zeros(n_persons)}, + "stock_assets": {time_period: np.zeros(n_persons)}, + "bond_assets": {time_period: np.zeros(n_persons)}, + "net_worth": {time_period: np.zeros(n_persons)}, + "auto_loan_balance": {time_period: np.zeros(n_persons)}, + "auto_loan_interest": {time_period: np.zeros(n_persons)}, + } + + +class TestConstants: + def test_acs_variables_defined(self): + assert "rent" in ACS_IMPUTED_VARIABLES + assert "real_estate_taxes" in ACS_IMPUTED_VARIABLES + + def test_sipp_variables_defined(self): + assert "tip_income" in SIPP_IMPUTED_VARIABLES + assert "bank_account_assets" in SIPP_IMPUTED_VARIABLES + assert "stock_assets" in SIPP_IMPUTED_VARIABLES + assert "bond_assets" in SIPP_IMPUTED_VARIABLES + + def test_scf_variables_defined(self): + assert "net_worth" in SCF_IMPUTED_VARIABLES + assert "auto_loan_balance" in SCF_IMPUTED_VARIABLES + assert "auto_loan_interest" in SCF_IMPUTED_VARIABLES 
+ + def test_all_source_variables_defined(self): + expected = ( + ACS_IMPUTED_VARIABLES + + SIPP_IMPUTED_VARIABLES + + SCF_IMPUTED_VARIABLES + ) + assert ALL_SOURCE_VARIABLES == expected + + +class TestImputeSourceVariables: + def test_function_exists(self): + assert callable(impute_source_variables) + + def test_returns_dict(self): + data = _make_data_dict(n_persons=20) + state_fips = np.ones(10, dtype=np.int32) * 6 + + result = impute_source_variables( + data=data, + state_fips=state_fips, + time_period=2024, + skip_acs=True, + skip_sipp=True, + skip_scf=True, + ) + assert isinstance(result, dict) + for key in data: + assert key in result + + def test_skip_flags_preserve_data(self): + data = _make_data_dict(n_persons=20) + state_fips = np.ones(10, dtype=np.int32) * 6 + + result = impute_source_variables( + data=data, + state_fips=state_fips, + time_period=2024, + skip_acs=True, + skip_sipp=True, + skip_scf=True, + ) + + for var in [ + "rent", + "real_estate_taxes", + "tip_income", + "net_worth", + ]: + np.testing.assert_array_equal(result[var][2024], data[var][2024]) + + def test_state_fips_added_to_data(self): + data = _make_data_dict(n_persons=20) + state_fips = np.ones(10, dtype=np.int32) * 6 + + result = impute_source_variables( + data=data, + state_fips=state_fips, + time_period=2024, + skip_acs=True, + skip_sipp=True, + skip_scf=True, + ) + + assert "state_fips" in result + np.testing.assert_array_equal(result["state_fips"][2024], state_fips) + + +class TestPersonStateFips: + def test_maps_correctly(self): + data = { + "household_id": {2024: np.array([10, 20, 30])}, + "person_household_id": {2024: np.array([10, 10, 20, 20, 30])}, + "person_id": {2024: np.arange(5)}, + } + state_fips = np.array([1, 2, 3]) + + result = _person_state_fips(data, state_fips, 2024) + expected = np.array([1, 1, 2, 2, 3]) + np.testing.assert_array_equal(result, expected) + + def test_fallback_repeat(self): + data = { + "household_id": {2024: np.array([10, 20])}, + "person_id": 
{2024: np.arange(4)}, + } + state_fips = np.array([1, 2]) + + result = _person_state_fips(data, state_fips, 2024) + expected = np.array([1, 1, 2, 2]) + np.testing.assert_array_equal(result, expected) + + +class TestSubfunctions: + def test_impute_acs_exists(self): + assert callable(_impute_acs) + + def test_impute_sipp_exists(self): + assert callable(_impute_sipp) + + def test_impute_scf_exists(self): + assert callable(_impute_scf) From 5e65aaaf203bffba5a07a66e3f2292061741e5c7 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 16 Feb 2026 22:46:25 -0500 Subject: [PATCH 2/6] Stratified PUF subsample, remove random states from SIPP/SCF, fix calibration loader Replace weight-proportional PUF subsample with stratified approach that force-includes top 0.5% by AGI and randomly samples rest to 20K, preserving the high-income tail the QRF needs. Remove random state assignment from SIPP and SCF in source_impute.py since these surveys lack state identifiers. Fix unified_calibration.py to handle TIME_PERIOD_ARRAYS dataset format. Add `make calibrate` target. 
Co-Authored-By: Claude Opus 4.6 --- Makefile | 6 +- .../calibration/puf_impute.py | 58 ++++++++++++++++++- .../calibration/source_impute.py | 49 ++-------------- .../calibration/unified_calibration.py | 6 +- .../tests/test_calibration/test_puf_impute.py | 46 +++++++++++++++ 5 files changed, 116 insertions(+), 49 deletions(-) diff --git a/Makefile b/Makefile index 7c2c63ee..20b1d223 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset +.PHONY: all format test install download upload docker documentation data calibrate publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data @@ -97,6 +97,10 @@ data: download python policyengine_us_data/datasets/cps/small_enhanced_cps.py python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +calibrate: data + python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 + publish-local-area: python policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index 9c185c9f..3ad452bb 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -22,6 +22,9 @@ logger = logging.getLogger(__name__) +PUF_SUBSAMPLE_TARGET = 20_000 +PUF_TOP_PERCENTILE = 99.5 + DEMOGRAPHIC_PREDICTORS = [ "age", "is_male", @@ -394,9 +397,9 @@ def _run_qrf_imputation( ) -> tuple: """Run QRF imputation for PUF variables. - Trains QRF on ALL PUF records (no subsample) with random - state assignment, and predicts on CPS data using - demographics + state_fips. 
+ Stratified-subsamples PUF records (top 0.5% by AGI kept, + rest randomly sampled to ~20K total) with random state + assignment, trains QRF, and predicts on CPS data. Args: data: CPS data dict. @@ -427,6 +430,10 @@ def _run_qrf_imputation( puf_state_indices = rng.choice(len(puf_states), size=n_puf, p=block_probs) puf_state_fips = puf_states[puf_state_indices] + puf_agi = puf_sim.calculate( + "adjusted_gross_income", map_to="person" + ).values + demo_preds = [p for p in DEMOGRAPHIC_PREDICTORS if p != "state_fips"] X_train_full = puf_sim.calculate_dataframe(demo_preds + IMPUTED_VARIABLES) @@ -439,6 +446,18 @@ def _run_qrf_imputation( del puf_sim + sub_idx = _stratified_subsample_index(puf_agi) + logger.info( + "Stratified PUF subsample: %d -> %d records " + "(top %.1f%% preserved, AGI threshold $%.0f)", + len(puf_agi), + len(sub_idx), + 100 - PUF_TOP_PERCENTILE, + np.percentile(puf_agi, PUF_TOP_PERCENTILE), + ) + X_train_full = X_train_full.iloc[sub_idx].reset_index(drop=True) + X_train_override = X_train_override.iloc[sub_idx].reset_index(drop=True) + n_hh = len(data["household_id"][time_period]) person_count = len(data["person_id"][time_period]) @@ -485,6 +504,39 @@ def _run_qrf_imputation( return y_full, y_override +def _stratified_subsample_index( + income: np.ndarray, + target_n: int = PUF_SUBSAMPLE_TARGET, + top_pct: float = PUF_TOP_PERCENTILE, + seed: int = 42, +) -> np.ndarray: + """Return indices for stratified subsample preserving top earners. + + Keeps ALL records above the top_pct percentile of income, + then randomly samples the rest to reach target_n total. 
+ """ + n = len(income) + if n <= target_n: + return np.arange(n) + + threshold = np.percentile(income, top_pct) + top_idx = np.where(income >= threshold)[0] + bottom_idx = np.where(income < threshold)[0] + + remaining_quota = max(0, target_n - len(top_idx)) + rng = np.random.default_rng(seed=seed) + if remaining_quota >= len(bottom_idx): + selected_bottom = bottom_idx + else: + selected_bottom = rng.choice( + bottom_idx, size=remaining_quota, replace=False + ) + + selected = np.concatenate([top_idx, selected_bottom]) + selected.sort() + return selected + + def _batch_qrf( X_train: pd.DataFrame, X_test: pd.DataFrame, diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 1d5021a2..a7b6ffc7 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -327,7 +327,7 @@ def _impute_sipp( time_period: int, dataset_path: Optional[str] = None, ) -> Dict[str, Dict[int, np.ndarray]]: - """Impute tip_income and liquid assets from SIPP with state. + """Impute tip_income and liquid assets from SIPP. Args: data: CPS data dict. 
@@ -341,11 +341,6 @@ def _impute_sipp( from microimpute.models.qrf import QRF from policyengine_us import Microsimulation - from policyengine_us_data.calibration.clone_and_assign import ( - load_global_block_distribution, - ) - - _, _, donor_states, block_probs = load_global_block_distribution() rng = np.random.default_rng(seed=88) from policyengine_us_data.storage import STORAGE_FOLDER @@ -404,11 +399,6 @@ def _impute_sipp( ) ] - tip_state_idx = rng.choice( - len(donor_states), size=len(tip_train), p=block_probs - ) - tip_train["state_fips"] = donor_states[tip_state_idx].astype(np.float32) - cps_tip_df = _build_cps_receiver( data, time_period, dataset_path, ["employment_income", "age"] ) @@ -428,10 +418,7 @@ def _impute_sipp( cps_tip_df["count_under_18"] = 0.0 cps_tip_df["count_under_6"] = 0.0 - person_states = _person_state_fips(data, state_fips, time_period) - cps_tip_df["state_fips"] = person_states.astype(np.float32) - - tip_predictors = SIPP_TIPS_PREDICTORS + ["state_fips"] + tip_predictors = SIPP_TIPS_PREDICTORS qrf = QRF() logger.info( "SIPP tips QRF: %d train, %d test", @@ -520,15 +507,6 @@ def _impute_sipp( ) ] - asset_state_idx = rng.choice( - len(donor_states), - size=len(asset_train), - p=block_probs, - ) - asset_train["state_fips"] = donor_states[asset_state_idx].astype( - np.float32 - ) - cps_asset_df = _build_cps_receiver( data, time_period, @@ -553,9 +531,7 @@ def _impute_sipp( else 0.0 ) - cps_asset_df["state_fips"] = person_states.astype(np.float32) - - asset_predictors = SIPP_ASSETS_PREDICTORS + ["state_fips"] + asset_predictors = SIPP_ASSETS_PREDICTORS asset_vars = [ "bank_account_assets", "stock_assets", @@ -598,7 +574,7 @@ def _impute_scf( time_period: int, dataset_path: Optional[str] = None, ) -> Dict[str, Dict[int, np.ndarray]]: - """Impute net_worth and auto_loan from SCF with state. + """Impute net_worth and auto_loan from SCF. Args: data: CPS data dict. 
@@ -611,25 +587,13 @@ def _impute_scf( """ from microimpute.models.qrf import QRF from policyengine_us import Microsimulation - - from policyengine_us_data.calibration.clone_and_assign import ( - load_global_block_distribution, - ) from policyengine_us_data.datasets.scf.scf import SCF_2022 - _, _, donor_states, block_probs = load_global_block_distribution() - rng = np.random.default_rng(seed=77) - scf_dataset = SCF_2022() scf_data = scf_dataset.load_dataset() scf_df = pd.DataFrame({key: scf_data[key] for key in scf_data.keys()}) - scf_state_idx = rng.choice( - len(donor_states), size=len(scf_df), p=block_probs - ) - scf_df["state_fips"] = donor_states[scf_state_idx].astype(np.float32) - - scf_predictors = SCF_PREDICTORS + ["state_fips"] + scf_predictors = SCF_PREDICTORS available_preds = [p for p in scf_predictors if p in scf_df.columns] missing_preds = [p for p in scf_predictors if p not in scf_df.columns] @@ -706,9 +670,6 @@ def _impute_scf( + cps_df.get("social_security_retirement", 0) ).astype(np.float32) - person_states = _person_state_fips(data, state_fips, time_period) - cps_df["state_fips"] = person_states.astype(np.float32) - qrf = QRF() logger.info( "SCF QRF: %d train, %d test, vars=%s", diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 3a95952f..84c31538 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -446,7 +446,11 @@ def _build_puf_cloned_dataset( data_dict = {} for var in data: - data_dict[var] = {time_period: data[var][...]} + if isinstance(data[var], dict): + vals = list(data[var].values()) + data_dict[var] = {time_period: vals[0]} + else: + data_dict[var] = {time_period: np.array(data[var])} if not skip_source_impute: from policyengine_us_data.calibration.source_impute import ( diff --git a/policyengine_us_data/tests/test_calibration/test_puf_impute.py 
b/policyengine_us_data/tests/test_calibration/test_puf_impute.py index 153df144..46db7367 100644 --- a/policyengine_us_data/tests/test_calibration/test_puf_impute.py +++ b/policyengine_us_data/tests/test_calibration/test_puf_impute.py @@ -9,9 +9,12 @@ from policyengine_us_data.calibration.puf_impute import ( puf_clone_dataset, + _stratified_subsample_index, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES, OVERRIDDEN_IMPUTED_VARIABLES, + PUF_SUBSAMPLE_TARGET, + PUF_TOP_PERCENTILE, ) @@ -132,3 +135,46 @@ def test_imputed_variables_not_empty(self): def test_overridden_subset_of_imputed(self): for var in OVERRIDDEN_IMPUTED_VARIABLES: assert var in IMPUTED_VARIABLES + + +class TestStratifiedSubsample: + def test_noop_when_small(self): + income = np.random.default_rng(0).normal(50000, 20000, size=100) + idx = _stratified_subsample_index(income, target_n=200) + assert len(idx) == 100 + + def test_reduces_to_target(self): + rng = np.random.default_rng(0) + income = np.concatenate( + [ + rng.normal(50000, 20000, size=50_000), + rng.uniform(500_000, 5_000_000, size=250), + ] + ) + idx = _stratified_subsample_index( + income, target_n=10_000, top_pct=99.5 + ) + assert len(idx) == 10_000 + + def test_preserves_top_earners(self): + rng = np.random.default_rng(0) + income = np.concatenate( + [ + rng.normal(50000, 20000, size=50_000), + rng.uniform(500_000, 5_000_000, size=250), + ] + ) + threshold = np.percentile(income, 99.5) + n_top = (income >= threshold).sum() + + idx = _stratified_subsample_index( + income, target_n=10_000, top_pct=99.5 + ) + selected_income = income[idx] + n_top_selected = (selected_income >= threshold).sum() + assert n_top_selected == n_top + + def test_indices_sorted(self): + income = np.random.default_rng(0).normal(50000, 20000, size=50_000) + idx = _stratified_subsample_index(income, target_n=10_000) + assert np.all(idx[1:] >= idx[:-1]) From 4057da8598eb5851c55bb15c5d07f6c972505ec1 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 17 Feb 2026 
12:19:25 -0500 Subject: [PATCH 3/6] Add calibration package checkpointing, target config, and hyperparameter CLI - Add build-only mode to save calibration matrix as pickle package - Add target config YAML for declarative target exclusion rules - Add CLI flags for beta, lambda_l2, learning_rate hyperparameters - Add streaming subprocess output in Modal runner - Add calibration pipeline documentation - Add tests for target config filtering and CLI arg parsing Co-Authored-By: Claude Opus 4.6 --- Makefile | 11 +- docs/calibration.md | 276 +++++++++++++++++ modal_app/remote_calibration_runner.py | 184 ++++++++--- .../calibration/target_config.yaml | 51 +++ .../calibration/unified_calibration.py | 291 +++++++++++++++++- .../test_calibration/test_target_config.py | 177 +++++++++++ .../test_unified_calibration.py | 60 ++++ 7 files changed, 995 insertions(+), 55 deletions(-) create mode 100644 docs/calibration.md create mode 100644 policyengine_us_data/calibration/target_config.yaml create mode 100644 policyengine_us_data/tests/test_calibration/test_target_config.py diff --git a/Makefile b/Makefile index 20b1d223..b43edde7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test install download upload docker documentation data calibrate publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset +.PHONY: all format test install download upload docker documentation data calibrate calibrate-build publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data @@ -99,7 +99,14 @@ data: download calibrate: data python -m policyengine_us_data.calibration.unified_calibration \ - --puf-dataset policyengine_us_data/storage/puf_2024.h5 + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml + +calibrate-build: data + 
python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --build-only publish-local-area: python policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py diff --git a/docs/calibration.md b/docs/calibration.md new file mode 100644 index 00000000..8f27baf1 --- /dev/null +++ b/docs/calibration.md @@ -0,0 +1,276 @@ +# Calibration Pipeline User's Manual + +The unified calibration pipeline reweights cloned CPS records to match administrative targets using L0-regularized optimization. This guide covers the three main workflows: full pipeline, build-then-fit, and fitting from a saved package. + +## Quick Start + +```bash +# Full pipeline (build matrix + fit weights): +make calibrate + +# Build matrix only (save package for later fitting): +make calibrate-build +``` + +## Architecture Overview + +The pipeline has two expensive phases: + +1. **Matrix build** (~30 min with PUF): Clone CPS records, assign geography, optionally PUF-impute, compute all target variable values, assemble a sparse calibration matrix. +2. **Weight fitting** (~5-20 min on GPU): L0-regularized optimization to find household weights that reproduce administrative targets. + +The calibration package checkpoint lets you run phase 1 once and iterate on phase 2 with different hyperparameters or target selections---without rebuilding. + +## Workflows + +### 1. 
Single-pass (default) + +Build the matrix and fit weights in one run: + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --epochs 200 \ + --device cuda +``` + +Output: +- `storage/calibration/unified_weights.npy` --- calibrated weight vector +- `storage/calibration/unified_diagnostics.csv` --- per-target error report +- `storage/calibration/unified_run_config.json` --- full run configuration + +### 2. Build-then-fit (recommended for iteration) + +**Step 1: Build the matrix and save a package.** + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --build-only +``` + +This saves `storage/calibration/calibration_package.pkl` (default location). Use `--package-output` to specify a different path. + +**Step 2: Fit weights from the package (fast, repeatable).** + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --package-path storage/calibration/calibration_package.pkl \ + --epochs 500 \ + --lambda-l0 1e-8 \ + --beta 0.65 \ + --lambda-l2 1e-8 \ + --device cuda +``` + +You can re-run Step 2 as many times as you want with different hyperparameters. The expensive matrix build only happens once. + +### 3. Re-filtering a saved package + +A saved package contains **all** targets from the database (before target config filtering). You can apply a different target config at fit time: + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --package-path storage/calibration/calibration_package.pkl \ + --target-config my_custom_config.yaml \ + --epochs 200 +``` + +This lets you experiment with which targets to include without rebuilding the matrix. + +### 4. 
Running on Modal (GPU cloud) + +```bash +modal run modal_app/remote_calibration_runner.py \ + --branch puf-impute-fix-530 \ + --gpu A10 \ + --epochs 500 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --beta 0.65 +``` + +The target config YAML is read from the cloned repo inside the container, so it must be committed to the branch you specify. + +### 5. Portable fitting (Kaggle, Colab, etc.) + +Transfer the package file to any environment with `scipy`, `numpy`, `pandas`, `torch`, and `l0-python` installed: + +```python +from policyengine_us_data.calibration.unified_calibration import ( + load_calibration_package, + apply_target_config, + fit_l0_weights, +) + +package = load_calibration_package("calibration_package.pkl") +targets_df = package["targets_df"] +X_sparse = package["X_sparse"] + +weights = fit_l0_weights( + X_sparse=X_sparse, + targets=targets_df["value"].values, + lambda_l0=1e-8, + epochs=500, + device="cuda", + beta=0.65, + lambda_l2=1e-8, +) +``` + +## Target Config + +The target config controls which targets reach the optimizer. It uses a YAML exclusion list: + +```yaml +exclude: + - variable: rent + geo_level: national + - variable: eitc + geo_level: district + - variable: snap + geo_level: state + domain_variable: snap # optional: further narrow the match +``` + +Each rule drops rows from the calibration matrix where **all** specified fields match. Unrecognized variables silently match nothing. 
+ +### Fields + +| Field | Required | Values | Description | +|---|---|---|---| +| `variable` | Yes | Any variable name in `target_overview` | The calibration target variable | +| `geo_level` | Yes | `national`, `state`, `district` | Geographic aggregation level | +| `domain_variable` | No | Any domain variable in `target_overview` | Narrows match to a specific domain | + +### Default config + +The checked-in config at `policyengine_us_data/calibration/target_config.yaml` reproduces the junkyard notebook's 22 excluded target groups. It drops: + +- **13 national-level variables**: alimony, charitable deduction, child support, interest deduction, medical expense deduction, net worth, person count, real estate taxes, rent, social security dependents/survivors +- **9 district-level variables**: ACA PTC, EITC, income tax before credits, medical expense deduction, net capital gains, rental income, tax unit count, partnership/S-corp income, taxable social security + +Applying this config reduces targets from ~37K to ~21K, matching the junkyard's target selection. 
+ +### Writing a custom config + +To experiment, copy the default and edit: + +```bash +cp policyengine_us_data/calibration/target_config.yaml my_config.yaml +# Edit my_config.yaml to add/remove exclusion rules +python -m policyengine_us_data.calibration.unified_calibration \ + --package-path storage/calibration/calibration_package.pkl \ + --target-config my_config.yaml \ + --epochs 200 +``` + +To see what variables and geo_levels are available in the database: + +```sql +SELECT DISTINCT variable, geo_level +FROM target_overview +ORDER BY variable, geo_level; +``` + +## CLI Reference + +### Core flags + +| Flag | Default | Description | +|---|---|---| +| `--dataset` | `storage/stratified_extended_cps_2024.h5` | Path to CPS h5 file | +| `--db-path` | `storage/calibration/policy_data.db` | Path to target database | +| `--output` | `storage/calibration/unified_weights.npy` | Weight output path | +| `--puf-dataset` | None | Path to PUF h5 (enables PUF cloning) | +| `--preset` | `local` | L0 preset: `local` (1e-8) or `national` (1e-4) | +| `--lambda-l0` | None | Custom L0 penalty (overrides `--preset`) | +| `--epochs` | 100 | Training epochs | +| `--device` | `cpu` | `cpu` or `cuda` | +| `--n-clones` | 10 | Number of dataset clones | +| `--seed` | 42 | Random seed for geography assignment | + +### Target selection + +| Flag | Default | Description | +|---|---|---| +| `--target-config` | None | Path to YAML exclusion config | +| `--domain-variables` | None | Comma-separated domain filter (SQL-level) | +| `--hierarchical-domains` | None | Domains for hierarchical uprating | + +### Checkpoint flags + +| Flag | Default | Description | +|---|---|---| +| `--build-only` | False | Build matrix, save package, skip fitting | +| `--package-path` | None | Load pre-built package (skip matrix build) | +| `--package-output` | Auto (when `--build-only`) | Where to save package | + +### Hyperparameter flags + +| Flag | Default | Junkyard value | Description | +|---|---|---|---| +| 
`--beta` | 0.35 | 0.65 | L0 gate temperature (higher = softer gates) | +| `--lambda-l2` | 1e-12 | 1e-8 | L2 regularization on weights | +| `--learning-rate` | 0.15 | 0.15 | Optimizer learning rate | + +### Skip flags + +| Flag | Description | +|---|---| +| `--skip-puf` | Skip PUF clone + QRF imputation | +| `--skip-source-impute` | Skip ACS/SIPP/SCF re-imputation | +| `--skip-takeup-rerandomize` | Skip takeup re-randomization | + +## Calibration Package Format + +The package is a pickled Python dict: + +```python +{ + "X_sparse": scipy.sparse.csr_matrix, # (n_targets, n_records) + "targets_df": pd.DataFrame, # target metadata + values + "target_names": list[str], # human-readable names + "metadata": { + "dataset_path": str, + "db_path": str, + "n_clones": int, + "n_records": int, + "seed": int, + "created_at": str, # ISO timestamp + "target_config": dict, # config used at build time + }, +} +``` + +The `targets_df` DataFrame has columns: `variable`, `geo_level`, `geographic_id`, `domain_variable`, `value`, and others from the database. + +## Hyperparameter Tuning Guide + +The three key hyperparameters control the tradeoff between target accuracy and sparsity: + +- **`beta`** (L0 gate temperature): Controls how sharply the L0 gates open/close. Higher values (0.5--0.8) give softer decisions and more exploration early in training. Lower values (0.2--0.4) give harder on/off decisions. + +- **`lambda_l0`** (via `--preset` or `--lambda-l0`): Controls how many records survive. `1e-8` (local preset) keeps millions of records for local-area analysis. `1e-4` (national preset) keeps ~50K for the web app. + +- **`lambda_l2`**: Regularizes weight magnitudes. Larger values (1e-8) prevent any single record from having extreme weight. Smaller values (1e-12) allow more weight concentration. 
+ +### Suggested starting points + +For **local-area calibration** (millions of records): +```bash +--lambda-l0 1e-8 --beta 0.65 --lambda-l2 1e-8 --epochs 500 +``` + +For **national web app** (~50K records): +```bash +--lambda-l0 1e-4 --beta 0.35 --lambda-l2 1e-12 --epochs 200 +``` + +## Makefile Targets + +| Target | Description | +|---|---| +| `make calibrate` | Full pipeline with PUF and target config | +| `make calibrate-build` | Build-only mode (saves package, no fitting) | diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py index 689d245d..24583003 100644 --- a/modal_app/remote_calibration_runner.py +++ b/modal_app/remote_calibration_runner.py @@ -15,7 +15,39 @@ REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" -def _fit_weights_impl(branch: str, epochs: int) -> dict: +def _run_streaming(cmd, env=None, label=""): + """Run a subprocess, streaming output line-by-line. + + Returns (returncode, captured_stdout_lines). + """ + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + lines = [] + for line in proc.stdout: + line = line.rstrip("\n") + if label: + print(f"[{label}] {line}", flush=True) + else: + print(line, flush=True) + lines.append(line) + proc.wait() + return proc.returncode, lines + + +def _fit_weights_impl( + branch: str, + epochs: int, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: """Shared implementation for weight fitting.""" os.chdir("/root") subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) @@ -23,8 +55,8 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: subprocess.run(["uv", "sync", "--extra", "l0"], check=True) - print("Downloading calibration inputs from HuggingFace...") - download_result = subprocess.run( + print("Downloading calibration inputs from HuggingFace...", flush=True) + dl_rc, dl_lines = 
_run_streaming( [ "uv", "run", @@ -36,52 +68,54 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: "print(f\"DB: {paths['database']}\"); " "print(f\"DATASET: {paths['dataset']}\")", ], - capture_output=True, - text=True, env=os.environ.copy(), + label="download", ) - print(download_result.stdout) - if download_result.stderr: - print("Download STDERR:", download_result.stderr) - if download_result.returncode != 0: - raise RuntimeError(f"Download failed: {download_result.returncode}") + if dl_rc != 0: + raise RuntimeError(f"Download failed with code {dl_rc}") db_path = dataset_path = None - for line in download_result.stdout.split("\n"): - if line.startswith("DB:"): + for line in dl_lines: + if "DB:" in line: db_path = line.split("DB:")[1].strip() - elif line.startswith("DATASET:"): + elif "DATASET:" in line: dataset_path = line.split("DATASET:")[1].strip() script_path = "policyengine_us_data/calibration/unified_calibration.py" - result = subprocess.run( - [ - "uv", - "run", - "python", - script_path, - "--device", - "cuda", - "--epochs", - str(epochs), - "--db-path", - db_path, - "--dataset", - dataset_path, - ], - capture_output=True, - text=True, + cmd = [ + "uv", + "run", + "python", + script_path, + "--device", + "cuda", + "--epochs", + str(epochs), + "--db-path", + db_path, + "--dataset", + dataset_path, + ] + if target_config: + cmd.extend(["--target-config", target_config]) + if beta is not None: + cmd.extend(["--beta", str(beta)]) + if lambda_l2 is not None: + cmd.extend(["--lambda-l2", str(lambda_l2)]) + if learning_rate is not None: + cmd.extend(["--learning-rate", str(learning_rate)]) + + cal_rc, cal_lines = _run_streaming( + cmd, env=os.environ.copy(), + label="calibrate", ) - print(result.stdout) - if result.stderr: - print("STDERR:", result.stderr) - if result.returncode != 0: - raise RuntimeError(f"Script failed with code {result.returncode}") + if cal_rc != 0: + raise RuntimeError(f"Script failed with code {cal_rc}") output_path = None 
log_path = None - for line in result.stdout.split("\n"): + for line in cal_lines: if "OUTPUT_PATH:" in line: output_path = line.split("OUTPUT_PATH:")[1].strip() elif "LOG_PATH:" in line: @@ -106,8 +140,17 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: gpu="T4", timeout=14400, ) -def fit_weights_t4(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_t4( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -118,8 +161,17 @@ def fit_weights_t4(branch: str = "main", epochs: int = 200) -> dict: gpu="A10", timeout=14400, ) -def fit_weights_a10(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_a10( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -130,8 +182,17 @@ def fit_weights_a10(branch: str = "main", epochs: int = 200) -> dict: gpu="A100-40GB", timeout=14400, ) -def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_a100_40( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -142,8 +203,17 @@ def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> dict: gpu="A100-80GB", timeout=14400, ) -def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> dict: - return 
_fit_weights_impl(branch, epochs) +def fit_weights_a100_80( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -154,8 +224,17 @@ def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> dict: gpu="H100", timeout=14400, ) -def fit_weights_h100(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_h100( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) GPU_FUNCTIONS = { @@ -174,6 +253,10 @@ def main( gpu: str = "T4", output: str = "calibration_weights.npy", log_output: str = "calibration_log.csv", + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, ): if gpu not in GPU_FUNCTIONS: raise ValueError( @@ -182,7 +265,14 @@ def main( print(f"Running with GPU: {gpu}, epochs: {epochs}, branch: {branch}") func = GPU_FUNCTIONS[gpu] - result = func.remote(branch=branch, epochs=epochs) + result = func.remote( + branch=branch, + epochs=epochs, + target_config=target_config, + beta=beta, + lambda_l2=lambda_l2, + learning_rate=learning_rate, + ) with open(output, "wb") as f: f.write(result["weights"]) diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml new file mode 100644 index 00000000..1e1e287d --- /dev/null +++ b/policyengine_us_data/calibration/target_config.yaml @@ -0,0 +1,51 @@ +# Target exclusion config for unified calibration. +# Each entry excludes targets matching (variable, geo_level). +# Derived from junkyard's 22 excluded target groups. 
+ +exclude: + # National exclusions + - variable: alimony_expense + geo_level: national + - variable: alimony_income + geo_level: national + - variable: charitable_deduction + geo_level: national + - variable: child_support_expense + geo_level: national + - variable: child_support_received + geo_level: national + - variable: interest_deduction + geo_level: national + - variable: medical_expense_deduction + geo_level: national + - variable: net_worth + geo_level: national + - variable: person_count + geo_level: national + - variable: real_estate_taxes + geo_level: national + - variable: rent + geo_level: national + - variable: social_security_dependents + geo_level: national + - variable: social_security_survivors + geo_level: national + # District exclusions + - variable: aca_ptc + geo_level: district + - variable: eitc + geo_level: district + - variable: income_tax_before_credits + geo_level: district + - variable: medical_expense_deduction + geo_level: district + - variable: net_capital_gains + geo_level: district + - variable: rental_income + geo_level: district + - variable: tax_unit_count + geo_level: district + - variable: tax_unit_partnership_s_corp_income + geo_level: district + - variable: taxable_social_security + geo_level: district diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 84c31538..5fffa13f 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -271,9 +271,175 @@ def parse_args(argv=None): action="store_true", help="Skip ACS/SIPP/SCF re-imputation with state", ) + parser.add_argument( + "--target-config", + default=None, + help="Path to target exclusion YAML config", + ) + parser.add_argument( + "--build-only", + action="store_true", + help="Build matrix + save package, skip fitting", + ) + parser.add_argument( + "--package-path", + default=None, + help="Load pre-built calibration 
package (skip matrix build)", + ) + parser.add_argument( + "--package-output", + default=None, + help="Where to save calibration package", + ) + parser.add_argument( + "--beta", + type=float, + default=BETA, + help=f"L0 gate temperature (default: {BETA})", + ) + parser.add_argument( + "--lambda-l2", + type=float, + default=LAMBDA_L2, + help=f"L2 regularization (default: {LAMBDA_L2})", + ) + parser.add_argument( + "--learning-rate", + type=float, + default=LEARNING_RATE, + help=f"Learning rate (default: {LEARNING_RATE})", + ) return parser.parse_args(argv) +def load_target_config(path: str) -> dict: + """Load target exclusion config from YAML. + + Args: + path: Path to YAML config file. + + Returns: + Parsed config dict with 'exclude' list. + """ + import yaml + + with open(path) as f: + config = yaml.safe_load(f) + if config is None: + config = {} + if "exclude" not in config: + config["exclude"] = [] + return config + + +def apply_target_config( + targets_df: "pd.DataFrame", + X_sparse, + target_names: list, + config: dict, +) -> tuple: + """Filter targets based on exclusion config. + + Each exclude rule matches rows where variable and geo_level + both match. Optionally matches domain_variable too. + + Args: + targets_df: DataFrame with target rows. + X_sparse: Sparse matrix (targets x records). + target_names: List of target name strings. + config: Config dict with 'exclude' list. 
+ + Returns: + (filtered_targets_df, filtered_X_sparse, filtered_names) + """ + import pandas as pd + + exclude_rules = config.get("exclude", []) + if not exclude_rules: + return targets_df, X_sparse, target_names + + n_before = len(targets_df) + keep_mask = np.ones(n_before, dtype=bool) + + for rule in exclude_rules: + var = rule["variable"] + geo = rule["geo_level"] + rule_mask = (targets_df["variable"] == var) & ( + targets_df["geo_level"] == geo + ) + if "domain_variable" in rule: + rule_mask = rule_mask & ( + targets_df["domain_variable"] == rule["domain_variable"] + ) + keep_mask &= ~rule_mask + + n_dropped = n_before - keep_mask.sum() + logger.info( + "Target config: kept %d / %d targets (dropped %d)", + keep_mask.sum(), + n_before, + n_dropped, + ) + + idx = np.where(keep_mask)[0] + filtered_df = targets_df.iloc[idx].reset_index(drop=True) + filtered_X = X_sparse[idx, :] + filtered_names = [target_names[i] for i in idx] + + return filtered_df, filtered_X, filtered_names + + +def save_calibration_package( + path: str, + X_sparse, + targets_df: "pd.DataFrame", + target_names: list, + metadata: dict, +) -> None: + """Save calibration package to pickle. + + Args: + path: Output file path. + X_sparse: Sparse matrix. + targets_df: Targets DataFrame. + target_names: Target name list. + metadata: Run metadata dict. + """ + import pickle + + package = { + "X_sparse": X_sparse, + "targets_df": targets_df, + "target_names": target_names, + "metadata": metadata, + } + Path(path).parent.mkdir(parents=True, exist_ok=True) + with open(path, "wb") as f: + pickle.dump(package, f, protocol=pickle.HIGHEST_PROTOCOL) + logger.info("Calibration package saved to %s", path) + + +def load_calibration_package(path: str) -> dict: + """Load calibration package from pickle. + + Args: + path: Path to package file. + + Returns: + Dict with X_sparse, targets_df, target_names, metadata. 
+ """ + import pickle + + with open(path, "rb") as f: + package = pickle.load(f) + logger.info( + "Loaded package: %d targets, %d records", + package["X_sparse"].shape[0], + package["X_sparse"].shape[1], + ) + return package + + def fit_l0_weights( X_sparse, targets: np.ndarray, @@ -281,6 +447,9 @@ def fit_l0_weights( epochs: int = DEFAULT_EPOCHS, device: str = "cpu", verbose_freq: int = None, + beta: float = BETA, + lambda_l2: float = LAMBDA_L2, + learning_rate: float = LEARNING_RATE, ) -> np.ndarray: """Fit L0-regularized calibration weights. @@ -291,6 +460,9 @@ def fit_l0_weights( epochs: Training epochs. device: Torch device. verbose_freq: Print frequency. Defaults to 10%. + beta: L0 gate temperature. + lambda_l2: L2 regularization strength. + learning_rate: Optimizer learning rate. Returns: Weight array of shape (n_records,). @@ -311,16 +483,20 @@ def fit_l0_weights( logger.info( "L0 calibration: %d targets, %d features, " - "lambda_l0=%.1e, epochs=%d", + "lambda_l0=%.1e, beta=%.2f, lambda_l2=%.1e, " + "lr=%.3f, epochs=%d", X_sparse.shape[0], n_total, lambda_l0, + beta, + lambda_l2, + learning_rate, epochs, ) model = SparseCalibrationWeights( n_features=n_total, - beta=BETA, + beta=beta, gamma=GAMMA, zeta=ZETA, init_keep_prob=INIT_KEEP_PROB, @@ -348,8 +524,8 @@ def _flushed_print(*args, **kwargs): y=targets, target_groups=None, lambda_l0=lambda_l0, - lambda_l2=LAMBDA_L2, - lr=LEARNING_RATE, + lambda_l2=lambda_l2, + lr=learning_rate, epochs=epochs, loss_type="relative", verbose=True, @@ -503,6 +679,13 @@ def run_calibration( puf_dataset_path: str = None, skip_puf: bool = False, skip_source_impute: bool = False, + target_config: dict = None, + build_only: bool = False, + package_path: str = None, + package_output_path: str = None, + beta: float = BETA, + lambda_l2: float = LAMBDA_L2, + learning_rate: float = LEARNING_RATE, ): """Run unified calibration pipeline. @@ -521,12 +704,51 @@ def run_calibration( puf_dataset_path: Path to PUF h5 for QRF training. 
skip_puf: Skip PUF clone step. skip_source_impute: Skip ACS/SIPP/SCF imputations. + target_config: Parsed target config dict. + build_only: If True, save package and skip fitting. + package_path: Load pre-built package (skip build). + package_output_path: Where to save calibration package. + beta: L0 gate temperature. + lambda_l2: L2 regularization strength. + learning_rate: Optimizer learning rate. Returns: (weights, targets_df, X_sparse, target_names) + weights is None when build_only=True. """ import time + t0 = time.time() + + # Early exit: load pre-built package + if package_path is not None: + package = load_calibration_package(package_path) + targets_df = package["targets_df"] + X_sparse = package["X_sparse"] + target_names = package["target_names"] + + if target_config: + targets_df, X_sparse, target_names = apply_target_config( + targets_df, X_sparse, target_names, target_config + ) + + targets = targets_df["value"].values + weights = fit_l0_weights( + X_sparse=X_sparse, + targets=targets, + lambda_l0=lambda_l0, + epochs=epochs, + device=device, + beta=beta, + lambda_l2=lambda_l2, + learning_rate=learning_rate, + ) + logger.info( + "Total pipeline (from package): %.1f min", + (time.time() - t0) / 60, + ) + return weights, targets_df, X_sparse, target_names + from policyengine_us import Microsimulation from policyengine_us_data.calibration.clone_and_assign import ( @@ -537,8 +759,6 @@ def run_calibration( UnifiedMatrixBuilder, ) - t0 = time.time() - # Step 1: Load dataset logger.info("Loading dataset from %s", dataset_path) sim = Microsimulation(dataset=dataset_path) @@ -671,6 +891,37 @@ def sim_modifier(s, clone_idx): X_sparse.nnz, ) + # Step 6b: Apply target config filtering + if target_config: + targets_df, X_sparse, target_names = apply_target_config( + targets_df, X_sparse, target_names, target_config + ) + + # Step 6c: Save calibration package + if package_output_path: + import datetime + + metadata = { + "dataset_path": dataset_path, + "db_path": 
db_path, + "n_clones": n_clones, + "n_records": X_sparse.shape[1], + "seed": seed, + "created_at": datetime.datetime.now().isoformat(), + "target_config": target_config, + } + save_calibration_package( + package_output_path, + X_sparse, + targets_df, + target_names, + metadata, + ) + + if build_only: + logger.info("Build-only mode: skipping fitting") + return None, targets_df, X_sparse, target_names + # Step 7: L0 calibration targets = targets_df["value"].values @@ -688,6 +939,9 @@ def sim_modifier(s, clone_idx): lambda_l0=lambda_l0, epochs=epochs, device=device, + beta=beta, + lambda_l2=lambda_l2, + learning_rate=learning_rate, ) logger.info( @@ -748,6 +1002,16 @@ def main(argv=None): puf_dataset_path = getattr(args, "puf_dataset", None) + target_config = None + if args.target_config: + target_config = load_target_config(args.target_config) + + package_output_path = args.package_output + if args.build_only and not package_output_path: + package_output_path = str( + STORAGE_FOLDER / "calibration" / "calibration_package.pkl" + ) + weights, targets_df, X_sparse, target_names = run_calibration( dataset_path=dataset_path, db_path=db_path, @@ -762,8 +1026,19 @@ def main(argv=None): puf_dataset_path=puf_dataset_path, skip_puf=getattr(args, "skip_puf", False), skip_source_impute=getattr(args, "skip_source_impute", False), + target_config=target_config, + build_only=args.build_only, + package_path=args.package_path, + package_output_path=package_output_path, + beta=args.beta, + lambda_l2=args.lambda_l2, + learning_rate=args.learning_rate, ) + if weights is None: + logger.info("Build-only complete. 
Package saved.") + return + # Save weights np.save(output_path, weights) logger.info("Weights saved to %s", output_path) @@ -798,11 +1073,15 @@ def main(argv=None): "skip_source_impute": getattr(args, "skip_source_impute", False), "n_clones": args.n_clones, "lambda_l0": lambda_l0, + "beta": args.beta, + "lambda_l2": args.lambda_l2, + "learning_rate": args.learning_rate, "epochs": args.epochs, "device": args.device, "seed": args.seed, "domain_variables": domain_variables, "hierarchical_domains": hierarchical_domains, + "target_config": args.target_config, "n_targets": len(targets_df), "n_records": X_sparse.shape[1], "weight_sum": float(weights.sum()), diff --git a/policyengine_us_data/tests/test_calibration/test_target_config.py b/policyengine_us_data/tests/test_calibration/test_target_config.py new file mode 100644 index 00000000..9241660c --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_target_config.py @@ -0,0 +1,177 @@ +"""Tests for target config filtering in unified calibration.""" + +import numpy as np +import pandas as pd +import pytest +from scipy import sparse + +from policyengine_us_data.calibration.unified_calibration import ( + apply_target_config, + load_target_config, + save_calibration_package, + load_calibration_package, +) + + +@pytest.fixture +def sample_targets(): + targets_df = pd.DataFrame( + { + "variable": [ + "snap", + "snap", + "eitc", + "eitc", + "rent", + "person_count", + ], + "geo_level": [ + "national", + "state", + "district", + "state", + "national", + "national", + ], + "domain_variable": [ + "snap", + "snap", + "eitc", + "eitc", + "rent", + "person_count", + ], + "geographic_id": ["US", "6", "0601", "6", "US", "US"], + "value": [1000, 500, 200, 300, 800, 5000], + } + ) + n_rows = len(targets_df) + n_cols = 10 + rng = np.random.default_rng(42) + X = sparse.random(n_rows, n_cols, density=0.5, random_state=rng) + X = X.tocsr() + target_names = [ + f"{r.variable}_{r.geo_level}_{r.geographic_id}" + for _, r in 
targets_df.iterrows() + ] + return targets_df, X, target_names + + +class TestApplyTargetConfig: + def test_empty_config_keeps_all(self, sample_targets): + df, X, names = sample_targets + config = {"exclude": []} + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) + assert out_X.shape == X.shape + assert out_names == names + + def test_single_variable_geo_exclusion(self, sample_targets): + df, X, names = sample_targets + config = {"exclude": [{"variable": "rent", "geo_level": "national"}]} + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) - 1 + assert "rent" not in out_df["variable"].values + + def test_multiple_exclusions(self, sample_targets): + df, X, names = sample_targets + config = { + "exclude": [ + {"variable": "rent", "geo_level": "national"}, + {"variable": "eitc", "geo_level": "district"}, + ] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) - 2 + kept = set(zip(out_df["variable"], out_df["geo_level"])) + assert ("rent", "national") not in kept + assert ("eitc", "district") not in kept + assert ("eitc", "state") in kept + + def test_domain_variable_matching(self, sample_targets): + df, X, names = sample_targets + config = { + "exclude": [ + { + "variable": "snap", + "geo_level": "national", + "domain_variable": "snap", + } + ] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) - 1 + + def test_matrix_and_names_stay_in_sync(self, sample_targets): + df, X, names = sample_targets + config = { + "exclude": [{"variable": "person_count", "geo_level": "national"}] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert out_X.shape[0] == len(out_df) + assert len(out_names) == len(out_df) + assert out_X.shape[1] == X.shape[1] + + def test_no_match_keeps_all(self, sample_targets): + df, X, names = sample_targets + config = 
{ + "exclude": [{"variable": "nonexistent", "geo_level": "national"}] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) + assert out_X.shape[0] == X.shape[0] + + +class TestLoadTargetConfig: + def test_load_valid_config(self, tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text( + "exclude:\n" " - variable: snap\n" " geo_level: national\n" + ) + config = load_target_config(str(config_file)) + assert len(config["exclude"]) == 1 + assert config["exclude"][0]["variable"] == "snap" + + def test_load_empty_config(self, tmp_path): + config_file = tmp_path / "empty.yaml" + config_file.write_text("") + config = load_target_config(str(config_file)) + assert config["exclude"] == [] + + +class TestCalibrationPackageRoundTrip: + def test_round_trip(self, sample_targets, tmp_path): + df, X, names = sample_targets + pkg_path = str(tmp_path / "package.pkl") + metadata = { + "dataset_path": "/tmp/test.h5", + "db_path": "/tmp/test.db", + "n_clones": 5, + "n_records": X.shape[1], + "seed": 42, + "created_at": "2024-01-01T00:00:00", + "target_config": None, + } + save_calibration_package(pkg_path, X, df, names, metadata) + loaded = load_calibration_package(pkg_path) + + assert loaded["target_names"] == names + pd.testing.assert_frame_equal(loaded["targets_df"], df) + assert loaded["X_sparse"].shape == X.shape + assert loaded["metadata"]["seed"] == 42 + + def test_package_then_filter(self, sample_targets, tmp_path): + df, X, names = sample_targets + pkg_path = str(tmp_path / "package.pkl") + metadata = {"n_records": X.shape[1]} + save_calibration_package(pkg_path, X, df, names, metadata) + loaded = load_calibration_package(pkg_path) + + config = {"exclude": [{"variable": "rent", "geo_level": "national"}]} + out_df, out_X, out_names = apply_target_config( + loaded["targets_df"], + loaded["X_sparse"], + loaded["target_names"], + config, + ) + assert len(out_df) == len(df) - 1 diff --git 
a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py index 2d3f8061..341ffcc0 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py @@ -85,3 +85,63 @@ def test_expected_count(self): ) assert len(SIMPLE_TAKEUP_VARS) == 8 + + +class TestParseArgsNewFlags: + """Verify new CLI flags are parsed correctly.""" + + def test_target_config_flag(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args(["--target-config", "config.yaml"]) + assert args.target_config == "config.yaml" + + def test_build_only_flag(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args(["--build-only"]) + assert args.build_only is True + + def test_package_path_flag(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args(["--package-path", "pkg.pkl"]) + assert args.package_path == "pkg.pkl" + + def test_hyperparams_flags(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args( + [ + "--beta", + "0.65", + "--lambda-l2", + "1e-8", + "--learning-rate", + "0.2", + ] + ) + assert args.beta == 0.65 + assert args.lambda_l2 == 1e-8 + assert args.learning_rate == 0.2 + + def test_hyperparams_defaults(self): + from policyengine_us_data.calibration.unified_calibration import ( + BETA, + LAMBDA_L2, + LEARNING_RATE, + parse_args, + ) + + args = parse_args([]) + assert args.beta == BETA + assert args.lambda_l2 == LAMBDA_L2 + assert args.learning_rate == LEARNING_RATE From 61523d83d5819bcc18d7ca76209925cd1a9da90e Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 17 Feb 2026 12:23:55 -0500 Subject: [PATCH 4/6] Ignore all calibration run outputs in 
storage/calibration/ Co-Authored-By: Claude Opus 4.6 --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a7ab98c9..6fa185f6 100644 --- a/.gitignore +++ b/.gitignore @@ -30,8 +30,8 @@ docs/.ipynb_checkpoints/ ## ACA PTC state-level uprating factors !policyengine_us_data/storage/aca_ptc_multipliers_2022_2024.csv -## Raw input cache for database pipeline -policyengine_us_data/storage/calibration/raw_inputs/ +## Calibration run outputs (weights, diagnostics, packages, config) +policyengine_us_data/storage/calibration/ ## Batch processing checkpoints completed_*.txt From 4c51b3280fe5e9cbb90ab019dd8982cabc702a2d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 18 Feb 2026 20:06:31 +0530 Subject: [PATCH 5/6] Add category-dependent takeup re-randomization for EITC and WIC EITC takeup rates vary by child count and WIC rates vary by demographic category, both requiring sim-calculated values. Introduce a two-pass flow in _simulate_clone() with a post_sim_modifier that runs after the first cache clear, computes category variables, re-randomizes takeup, then clears caches again before final target calculation. 
Co-Authored-By: Claude Opus 4.6 --- changelog_entry.yaml | 2 + .../calibration/unified_calibration.py | 130 +++++ .../calibration/unified_matrix_builder.py | 24 +- .../test_unified_calibration.py | 454 ++++++++++++++++++ 4 files changed, 607 insertions(+), 3 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 8e9b4fb6..f446f1b1 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,6 +5,8 @@ - ACS/SIPP/SCF geographic re-imputation module (source_impute.py) with state predictor - PUF and source impute integration into unified calibration pipeline (--puf-dataset, --skip-puf, --skip-source-impute flags) - 21 new tests for puf_impute and source_impute modules + - Category-dependent takeup re-randomization for takes_up_eitc (by child count) and would_claim_wic (by WIC category) during calibration cloning + - Two-pass simulation flow in _simulate_clone() with post_sim_modifier for category variables that require sim output changed: - Refactored extended_cps.py to delegate to puf_impute.puf_clone_dataset() (443 -> 75 lines) - unified_calibration.py pipeline now supports optional source imputation and PUF cloning steps diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 5fffa13f..48a268cb 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -100,6 +100,58 @@ ] +def _eitc_category_mapper(categories, rates): + """Map eitc_child_count to per-entity takeup rates. + + Rates loaded from parameters/take_up/eitc.yaml via + load_take_up_rate(). Clamps child count to max key in + rate dict (matches cps.py EITC logic). + + Args: + categories: Array of child counts per tax unit. + rates: Dict mapping child count -> takeup rate. + + Returns: + Array of per-entity takeup rates. 
+ """ + max_key = max(rates.keys()) + return np.array([rates[min(int(c), max_key)] for c in categories]) + + +def _wic_category_mapper(categories, rates): + """Map wic_category_str to per-entity takeup rates. + + Rates loaded from parameters/take_up/wic_takeup.yaml via + load_take_up_rate(). Unknown categories default to 0. + + Args: + categories: Array of WIC category strings per person. + rates: Dict mapping category string -> takeup rate. + + Returns: + Array of per-entity takeup rates. + """ + return np.array([rates.get(c, 0) for c in categories]) + + +CATEGORY_TAKEUP_VARS = [ + { + "variable": "takes_up_eitc", + "entity": "tax_unit", + "rate_key": "eitc", + "category_variable": "eitc_child_count", + "category_mapper": _eitc_category_mapper, + }, + { + "variable": "would_claim_wic", + "entity": "person", + "rate_key": "wic_takeup", + "category_variable": "wic_category_str", + "category_mapper": _wic_category_mapper, + }, +] + + def rerandomize_takeup( sim, clone_block_geoids: np.ndarray, @@ -179,6 +231,76 @@ def rerandomize_takeup( sim.set_input(var_name, time_period, new_values) +def rerandomize_category_takeup( + sim, + clone_block_geoids: np.ndarray, + time_period: int, +) -> None: + """Re-randomize category-dependent takeup variables. + + Called post-simulation (after cache clear) so that category + variables like eitc_child_count and wic_category_str are + computed fresh using the clone's geography. + + For each entry in CATEGORY_TAKEUP_VARS: + 1. Load rates from YAML via load_take_up_rate() + 2. Calculate the category variable from the sim + 3. Map categories to per-entity rates via the mapper + 4. Draw per-block seeded random values + 5. Compare draws < rates to get new boolean takeup + + Args: + sim: Microsimulation instance (post cache-clear). + clone_block_geoids: Block GEOIDs per household. + time_period: Tax year. 
+ """ + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + from policyengine_us_data.utils.randomness import ( + seeded_rng, + ) + + hh_ids = sim.calculate("household_id", map_to="household").values + hh_to_block = dict(zip(hh_ids, clone_block_geoids)) + + for spec in CATEGORY_TAKEUP_VARS: + var_name = spec["variable"] + entity_level = spec["entity"] + rate_key = spec["rate_key"] + cat_var = spec["category_variable"] + mapper = spec["category_mapper"] + + rates_dict = load_take_up_rate(rate_key, time_period) + + # Calculate category variable (uses fresh sim state) + categories = sim.calculate(cat_var, map_to=entity_level).values + + # Map categories to per-entity rates + per_entity_rates = mapper(categories, rates_dict) + + # Map entities to blocks via household + entity_hh_ids = sim.calculate( + "household_id", map_to=entity_level + ).values + n_entities = len(entity_hh_ids) + + entity_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in entity_hh_ids] + ) + + draws = np.zeros(n_entities, dtype=np.float64) + unique_blocks = np.unique(entity_blocks) + for block in unique_blocks: + mask = entity_blocks == block + n_in_block = mask.sum() + rng = seeded_rng(var_name, salt=str(block)) + draws[mask] = rng.random(n_in_block) + + new_values = draws < per_entity_rates + sim.set_input(var_name, time_period, new_values) + + def parse_args(argv=None): parser = argparse.ArgumentParser( description="Unified L0 calibration pipeline" @@ -849,6 +971,7 @@ def run_calibration( # Step 4: Build sim_modifier for takeup rerandomization sim_modifier = None + post_sim_modifier = None if not skip_takeup_rerandomize: time_period = 2024 @@ -859,6 +982,12 @@ def sim_modifier(s, clone_idx): states = geography.state_fips[col_start:col_end] rerandomize_takeup(s, blocks, states, time_period) + def post_sim_modifier(s, clone_idx): + col_start = clone_idx * n_records + col_end = col_start + n_records + blocks = geography.block_geoid[col_start:col_end] + 
rerandomize_category_takeup(s, blocks, time_period) + # Step 5: Build target filter target_filter = {} if domain_variables: @@ -878,6 +1007,7 @@ def sim_modifier(s, clone_idx): target_filter=target_filter, hierarchical_domains=hierarchical_domains, sim_modifier=sim_modifier, + post_sim_modifier=post_sim_modifier, ) builder.print_uprating_summary(targets_df) diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py index ac31c34e..a65cca92 100644 --- a/policyengine_us_data/calibration/unified_matrix_builder.py +++ b/policyengine_us_data/calibration/unified_matrix_builder.py @@ -555,6 +555,7 @@ def _simulate_clone( n_records: int, variables: set, sim_modifier=None, + post_sim_modifier=None, clone_idx: int = 0, ) -> Tuple[Dict[str, np.ndarray], object]: """Simulate one clone with assigned geography. @@ -566,9 +567,14 @@ def _simulate_clone( variables: Target variable names to compute. sim_modifier: Optional callback(sim, clone_idx) called after state_fips is set but before - cache clearing. Used for takeup + cache clearing. Used for simple takeup re-randomization. - clone_idx: Clone index passed to sim_modifier. + post_sim_modifier: Optional callback(sim, clone_idx) + called after the first cache clear. Used for + category-dependent takeup re-randomization + that needs sim-calculated category variables. + Triggers a second cache clear afterwards. + clone_idx: Clone index passed to modifiers. 
Returns: (var_values, sim) where var_values maps variable @@ -587,6 +593,13 @@ def _simulate_clone( for var in get_calculated_variables(sim): sim.delete_arrays(var) + # Two-pass: category-dependent takeup needs fresh + # category variables, then a second cache clear + if post_sim_modifier is not None: + post_sim_modifier(sim, clone_idx) + for var in get_calculated_variables(sim): + sim.delete_arrays(var) + var_values: Dict[str, np.ndarray] = {} for var in variables: if var.endswith("_count"): @@ -614,6 +627,7 @@ def build_matrix( hierarchical_domains: Optional[List[str]] = None, cache_dir: Optional[str] = None, sim_modifier=None, + post_sim_modifier=None, ) -> Tuple[pd.DataFrame, sparse.csr_matrix, List[str]]: """Build sparse calibration matrix. @@ -633,8 +647,11 @@ def build_matrix( If None, COO data held in memory. sim_modifier: Optional callback(sim, clone_idx) called per clone after state_fips is set but - before cache clearing. Use for takeup + before cache clearing. Use for simple takeup re-randomization. + post_sim_modifier: Optional callback(sim, clone_idx) + called after the first cache clear for + category-dependent takeup re-randomization. 
Returns: (targets_df, X_sparse, target_names) @@ -758,6 +775,7 @@ def build_matrix( n_records, unique_variables, sim_modifier=sim_modifier, + post_sim_modifier=post_sim_modifier, clone_idx=clone_idx, ) diff --git a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py index 341ffcc0..6a47dd7c 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py @@ -145,3 +145,457 @@ def test_hyperparams_defaults(self): assert args.beta == BETA assert args.lambda_l2 == LAMBDA_L2 assert args.learning_rate == LEARNING_RATE + + +class TestCategoryTakeupConfig: + """Verify the CATEGORY_TAKEUP_VARS config is well-formed.""" + + def test_all_entries_have_required_keys(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + required = { + "variable", + "entity", + "rate_key", + "category_variable", + "category_mapper", + } + for entry in CATEGORY_TAKEUP_VARS: + for key in required: + assert key in entry, f"Missing key '{key}' in {entry}" + + def test_all_mappers_callable(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + for entry in CATEGORY_TAKEUP_VARS: + assert callable(entry["category_mapper"]) + + def test_all_entities_valid(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + valid = ("person", "tax_unit", "spm_unit") + for entry in CATEGORY_TAKEUP_VARS: + assert entry["entity"] in valid + + def test_expected_count(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + assert len(CATEGORY_TAKEUP_VARS) == 2 + + +class TestCategoryMappers: + """Test mapper functions load rates from YAML and map correctly.""" + + def test_eitc_mapper_standard_counts(self): + from 
policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("eitc") + categories = np.array([0, 1, 2, 3, 5]) + result = _eitc_category_mapper(categories, rates) + max_key = max(rates.keys()) + expected = np.array( + [ + rates[0], + rates[1], + rates[2], + rates[3], + rates[max_key], + ] + ) + np.testing.assert_array_almost_equal(result, expected) + + def test_eitc_mapper_clamps_high_count(self): + from policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("eitc") + max_key = max(rates.keys()) + categories = np.array([99]) + result = _eitc_category_mapper(categories, rates) + assert result[0] == rates[max_key] + + def test_wic_mapper_known_categories(self): + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("wic_takeup", 2022) + categories = np.array(list(rates.keys())) + result = _wic_category_mapper(categories, rates) + expected = np.array([rates[c] for c in categories]) + np.testing.assert_array_almost_equal(result, expected) + + def test_wic_mapper_unknown_defaults_zero(self): + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("wic_takeup", 2022) + categories = np.array(["UNKNOWN"]) + result = _wic_category_mapper(categories, rates) + assert result[0] == 0 + + def test_wic_mapper_none_category(self): + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = 
load_take_up_rate("wic_takeup", 2022) + categories = np.array(["NONE"]) + result = _wic_category_mapper(categories, rates) + assert result[0] == rates["NONE"] + + +class _MockResult: + """Wraps an array to mimic sim.calculate(...).values.""" + + def __init__(self, values): + self.values = values + + +class _MockSim: + """Lightweight mock Microsimulation for rerandomize tests. + + Supports sim.calculate(var, map_to=entity).values and + sim.set_input(var, period, values). No dataset needed. + """ + + def __init__(self, calc_returns: dict): + """ + Args: + calc_returns: Dict mapping (var, entity) -> np.ndarray. + """ + self._calc = calc_returns + self.inputs_set = {} + + def calculate(self, var, period=None, map_to=None): + key = (var, map_to) if map_to else (var, None) + if key not in self._calc: + raise ValueError( + f"MockSim has no return for {key}. " + f"Available: {list(self._calc.keys())}" + ) + return _MockResult(self._calc[key]) + + def set_input(self, var, period, values): + self.inputs_set[var] = np.asarray(values) + + +class TestRerandomizeCategoryTakeup: + """Integration test: call rerandomize_category_takeup() + with a mock sim and verify the outputs end-to-end.""" + + def _build_mock_sim(self): + """Build a mock sim with 3 households, 5 tax units, + 6 persons, known child counts and WIC categories.""" + # 3 households + hh_ids_hh = np.array([100, 200, 300]) + + # 5 tax units: HH 100 has 2 TUs, HH 200 has 2, HH 300 has 1 + tu_hh_ids = np.array([100, 100, 200, 200, 300]) + # child counts: 0, 2, 1, 3, 0 + eitc_child_counts = np.array([0, 2, 1, 3, 0]) + + # 6 persons: HH 100 has 2, HH 200 has 3, HH 300 has 1 + person_hh_ids = np.array([100, 100, 200, 200, 200, 300]) + wic_cats = np.array( + [ + "CHILD", + "NONE", + "INFANT", + "PREGNANT", + "CHILD", + "NONE", + ] + ) + + calc_returns = { + ("household_id", "household"): hh_ids_hh, + ("eitc_child_count", "tax_unit"): eitc_child_counts, + ("household_id", "tax_unit"): tu_hh_ids, + ("wic_category_str", 
"person"): wic_cats, + ("household_id", "person"): person_hh_ids, + } + return _MockSim(calc_returns), { + "hh_ids": hh_ids_hh, + "child_counts": eitc_child_counts, + "tu_hh_ids": tu_hh_ids, + "wic_cats": wic_cats, + "person_hh_ids": person_hh_ids, + } + + def test_sets_both_variables(self): + """rerandomize_category_takeup sets takes_up_eitc + and would_claim_wic.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, _ = self._build_mock_sim() + blocks = np.array( + ["010010001001001", "020020002002002", "030030003003003"] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + assert "takes_up_eitc" in sim.inputs_set + assert "would_claim_wic" in sim.inputs_set + + def test_output_shapes_match_entities(self): + """Output arrays match entity counts: 5 tax units + for EITC, 6 persons for WIC.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + assert len(sim.inputs_set["takes_up_eitc"]) == len( + info["child_counts"] + ) + assert len(sim.inputs_set["would_claim_wic"]) == len(info["wic_cats"]) + + def test_outputs_are_boolean(self): + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, _ = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + assert sim.inputs_set["takes_up_eitc"].dtype == bool + assert sim.inputs_set["would_claim_wic"].dtype == bool + + def test_deterministic_same_blocks(self): + """Same blocks produce identical takeup values.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + + sim1, _ = self._build_mock_sim() + rerandomize_category_takeup(sim1, blocks, 2024) 
+ + sim2, _ = self._build_mock_sim() + rerandomize_category_takeup(sim2, blocks, 2024) + + np.testing.assert_array_equal( + sim1.inputs_set["takes_up_eitc"], + sim2.inputs_set["takes_up_eitc"], + ) + np.testing.assert_array_equal( + sim1.inputs_set["would_claim_wic"], + sim2.inputs_set["would_claim_wic"], + ) + + def test_different_blocks_differ(self): + """Different blocks produce different takeup values.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim1, _ = self._build_mock_sim() + rerandomize_category_takeup( + sim1, + np.array(["aaa", "bbb", "ccc"]), + 2024, + ) + + sim2, _ = self._build_mock_sim() + rerandomize_category_takeup( + sim2, + np.array(["xxx", "yyy", "zzz"]), + 2024, + ) + + # At least one of the two variables should differ + eitc_differ = not np.array_equal( + sim1.inputs_set["takes_up_eitc"], + sim2.inputs_set["takes_up_eitc"], + ) + wic_differ = not np.array_equal( + sim1.inputs_set["would_claim_wic"], + sim2.inputs_set["would_claim_wic"], + ) + assert eitc_differ or wic_differ + + def test_eitc_values_match_manual_calculation(self): + """Verify takes_up_eitc matches a hand-computed + result using the same seeded draws and rates.""" + from policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + rerandomize_category_takeup, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + sim, info = self._build_mock_sim() + blocks = np.array( + [ + "010010001001001", + "020020002002002", + "030030003003003", + ] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + # Reproduce manually + eitc_rates = load_take_up_rate("eitc", 2024) + child_counts = info["child_counts"] + per_tu_rates = _eitc_category_mapper(child_counts, eitc_rates) + + hh_to_block = dict(zip(info["hh_ids"], blocks)) + tu_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in info["tu_hh_ids"]] + ) + draws = np.zeros(len(child_counts), dtype=np.float64) + for 
block in np.unique(tu_blocks): + mask = tu_blocks == block + rng = seeded_rng("takes_up_eitc", salt=str(block)) + draws[mask] = rng.random(mask.sum()) + + expected = draws < per_tu_rates + np.testing.assert_array_equal( + sim.inputs_set["takes_up_eitc"], expected + ) + + def test_wic_values_match_manual_calculation(self): + """Verify would_claim_wic matches a hand-computed + result using the same seeded draws and rates.""" + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + rerandomize_category_takeup, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + sim, info = self._build_mock_sim() + blocks = np.array( + [ + "010010001001001", + "020020002002002", + "030030003003003", + ] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + # Reproduce manually + wic_rates = load_take_up_rate("wic_takeup", 2024) + per_person_rates = _wic_category_mapper(info["wic_cats"], wic_rates) + + hh_to_block = dict(zip(info["hh_ids"], blocks)) + person_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in info["person_hh_ids"]] + ) + draws = np.zeros(len(info["wic_cats"]), dtype=np.float64) + for block in np.unique(person_blocks): + mask = person_blocks == block + rng = seeded_rng("would_claim_wic", salt=str(block)) + draws[mask] = rng.random(mask.sum()) + + expected = draws < per_person_rates + np.testing.assert_array_equal( + sim.inputs_set["would_claim_wic"], expected + ) + + def test_none_wic_never_takes_up(self): + """Persons with wic_category_str='NONE' should never + take up WIC (rate=0 from YAML).""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + wic_result = sim.inputs_set["would_claim_wic"] + # Persons at indices 1 and 5 have category "NONE" + none_indices = [ + i for i, c in enumerate(info["wic_cats"]) 
if c == "NONE" + ] + for idx in none_indices: + assert ( + wic_result[idx] == False + ), f"Person {idx} (NONE) should not claim WIC" + + def test_zero_child_eitc_rate_is_lower(self): + """Tax units with 0 children should have a lower + takeup rate than those with children, verified + statistically over many blocks.""" + from policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + eitc_rates = load_take_up_rate("eitc", 2024) + # 0-child rate should be strictly less than 1-child rate + assert eitc_rates[0] < eitc_rates[1] + + # Simulate 10k draws to verify the rates manifest + n = 10000 + child_counts = np.array([0, 1]) + per_tu_rates = _eitc_category_mapper(child_counts, eitc_rates) + rng = seeded_rng("takes_up_eitc", salt="stat_test") + draws = rng.random(n) + + frac_0 = (draws[:n] < per_tu_rates[0]).mean() + frac_1 = (draws[:n] < per_tu_rates[1]).mean() + assert frac_0 < frac_1 + + +class TestCategoryTakeupSeeding: + """Verify seeded draws for category takeup are deterministic + and vary by block/variable.""" + + def test_same_block_same_draws(self): + var = "takes_up_eitc" + block = "010010001001001" + rng1 = seeded_rng(var, salt=block) + rng2 = seeded_rng(var, salt=block) + np.testing.assert_array_equal(rng1.random(100), rng2.random(100)) + + def test_different_blocks_different_draws(self): + var = "takes_up_eitc" + rng1 = seeded_rng(var, salt="010010001001001") + rng2 = seeded_rng(var, salt="020010001001001") + assert not np.array_equal(rng1.random(100), rng2.random(100)) + + def test_different_category_vars_different_draws(self): + block = "010010001001001" + rng1 = seeded_rng("takes_up_eitc", salt=block) + rng2 = seeded_rng("would_claim_wic", salt=block) + assert not np.array_equal(rng1.random(100), rng2.random(100)) From 5d0ecf60da41a6711c45fe3da26abfde6f0ae785 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 18 Feb 2026 20:55:32 +0530 Subject: 
[PATCH 6/6] Add category-dependent takeup re-randomization for WIC nutritional risk --- changelog_entry.yaml | 2 +- .../calibration/unified_calibration.py | 16 +++ .../test_unified_calibration.py | 112 ++++++++++++++++-- 3 files changed, 122 insertions(+), 8 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index f446f1b1..9445baca 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,7 +5,7 @@ - ACS/SIPP/SCF geographic re-imputation module (source_impute.py) with state predictor - PUF and source impute integration into unified calibration pipeline (--puf-dataset, --skip-puf, --skip-source-impute flags) - 21 new tests for puf_impute and source_impute modules - - Category-dependent takeup re-randomization for takes_up_eitc (by child count) and would_claim_wic (by WIC category) during calibration cloning + - Category-dependent takeup re-randomization for takes_up_eitc (by child count) and would_claim_wic, and is_wic_at_nutritional_risk (by WIC category) during calibration cloning - Two-pass simulation flow in _simulate_clone() with post_sim_modifier for category variables that require sim output changed: - Refactored extended_cps.py to delegate to puf_impute.puf_clone_dataset() (443 -> 75 lines) diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 48a268cb..6a1a2927 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -149,6 +149,14 @@ def _wic_category_mapper(categories, rates): "category_variable": "wic_category_str", "category_mapper": _wic_category_mapper, }, + { + "variable": "is_wic_at_nutritional_risk", + "entity": "person", + "rate_key": "wic_nutritional_risk", + "category_variable": "wic_category_str", + "category_mapper": _wic_category_mapper, + "base_variable": "receives_wic", + }, ] @@ -298,6 +306,14 @@ def rerandomize_category_takeup( draws[mask] = 
rng.random(n_in_block) new_values = draws < per_entity_rates + + base_var = spec.get("base_variable") + if base_var is not None: + base_values = sim.calculate( + base_var, map_to=entity_level + ).values.astype(bool) + new_values = base_values | new_values + sim.set_input(var_name, time_period, new_values) diff --git a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py index 6a47dd7c..250ffc51 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py @@ -188,7 +188,7 @@ def test_expected_count(self): CATEGORY_TAKEUP_VARS, ) - assert len(CATEGORY_TAKEUP_VARS) == 2 + assert len(CATEGORY_TAKEUP_VARS) == 3 class TestCategoryMappers: @@ -334,6 +334,8 @@ def _build_mock_sim(self): "NONE", ] ) + # receives_wic: persons 0, 2, 4 receive WIC + receives_wic = np.array([True, False, True, False, True, False]) calc_returns = { ("household_id", "household"): hh_ids_hh, @@ -341,6 +343,7 @@ def _build_mock_sim(self): ("household_id", "tax_unit"): tu_hh_ids, ("wic_category_str", "person"): wic_cats, ("household_id", "person"): person_hh_ids, + ("receives_wic", "person"): receives_wic, } return _MockSim(calc_returns), { "hh_ids": hh_ids_hh, @@ -348,11 +351,12 @@ def _build_mock_sim(self): "tu_hh_ids": tu_hh_ids, "wic_cats": wic_cats, "person_hh_ids": person_hh_ids, + "receives_wic": receives_wic, } - def test_sets_both_variables(self): - """rerandomize_category_takeup sets takes_up_eitc - and would_claim_wic.""" + def test_sets_all_variables(self): + """rerandomize_category_takeup sets takes_up_eitc, + would_claim_wic, and is_wic_at_nutritional_risk.""" from policyengine_us_data.calibration.unified_calibration import ( rerandomize_category_takeup, ) @@ -365,10 +369,11 @@ def test_sets_both_variables(self): assert "takes_up_eitc" in sim.inputs_set assert "would_claim_wic" in 
sim.inputs_set + assert "is_wic_at_nutritional_risk" in sim.inputs_set def test_output_shapes_match_entities(self): """Output arrays match entity counts: 5 tax units - for EITC, 6 persons for WIC.""" + for EITC, 6 persons for WIC and nutritional risk.""" from policyengine_us_data.calibration.unified_calibration import ( rerandomize_category_takeup, ) @@ -381,6 +386,9 @@ def test_output_shapes_match_entities(self): info["child_counts"] ) assert len(sim.inputs_set["would_claim_wic"]) == len(info["wic_cats"]) + assert len(sim.inputs_set["is_wic_at_nutritional_risk"]) == len( + info["wic_cats"] + ) def test_outputs_are_boolean(self): from policyengine_us_data.calibration.unified_calibration import ( @@ -393,6 +401,7 @@ def test_outputs_are_boolean(self): assert sim.inputs_set["takes_up_eitc"].dtype == bool assert sim.inputs_set["would_claim_wic"].dtype == bool + assert sim.inputs_set["is_wic_at_nutritional_risk"].dtype == bool def test_deterministic_same_blocks(self): """Same blocks produce identical takeup values.""" @@ -416,6 +425,10 @@ def test_deterministic_same_blocks(self): sim1.inputs_set["would_claim_wic"], sim2.inputs_set["would_claim_wic"], ) + np.testing.assert_array_equal( + sim1.inputs_set["is_wic_at_nutritional_risk"], + sim2.inputs_set["is_wic_at_nutritional_risk"], + ) def test_different_blocks_differ(self): """Different blocks produce different takeup values.""" @@ -437,7 +450,7 @@ def test_different_blocks_differ(self): 2024, ) - # At least one of the two variables should differ + # At least one of the three variables should differ eitc_differ = not np.array_equal( sim1.inputs_set["takes_up_eitc"], sim2.inputs_set["takes_up_eitc"], @@ -446,7 +459,11 @@ def test_different_blocks_differ(self): sim1.inputs_set["would_claim_wic"], sim2.inputs_set["would_claim_wic"], ) - assert eitc_differ or wic_differ + risk_differ = not np.array_equal( + sim1.inputs_set["is_wic_at_nutritional_risk"], + sim2.inputs_set["is_wic_at_nutritional_risk"], + ) + assert 
eitc_differ or wic_differ or risk_differ def test_eitc_values_match_manual_calculation(self): """Verify takes_up_eitc matches a hand-computed @@ -550,6 +567,87 @@ def test_none_wic_never_takes_up(self): wic_result[idx] == False ), f"Person {idx} (NONE) should not claim WIC" + def test_nutritional_risk_values_match_manual_calculation(self): + """Verify is_wic_at_nutritional_risk matches + receives_wic | (draws < rates) using seeded draws.""" + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + rerandomize_category_takeup, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + sim, info = self._build_mock_sim() + blocks = np.array( + [ + "010010001001001", + "020020002002002", + "030030003003003", + ] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + # Reproduce manually + risk_rates = load_take_up_rate("wic_nutritional_risk", 2024) + per_person_rates = _wic_category_mapper(info["wic_cats"], risk_rates) + + hh_to_block = dict(zip(info["hh_ids"], blocks)) + person_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in info["person_hh_ids"]] + ) + draws = np.zeros(len(info["wic_cats"]), dtype=np.float64) + for block in np.unique(person_blocks): + mask = person_blocks == block + rng = seeded_rng("is_wic_at_nutritional_risk", salt=str(block)) + draws[mask] = rng.random(mask.sum()) + + expected = info["receives_wic"] | (draws < per_person_rates) + np.testing.assert_array_equal( + sim.inputs_set["is_wic_at_nutritional_risk"], + expected, + ) + + def test_receives_wic_guarantees_nutritional_risk(self): + """Anyone with receives_wic=True must have + is_wic_at_nutritional_risk=True.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + risk = sim.inputs_set["is_wic_at_nutritional_risk"] + for i, recv in 
enumerate(info["receives_wic"]): + if recv: + assert risk[i], ( + f"Person {i} receives WIC but " + f"is_wic_at_nutritional_risk is False" + ) + + def test_none_wic_without_receives_never_at_risk(self): + """Persons with NONE category who don't receive WIC + are never at nutritional risk (rate=0).""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + risk = sim.inputs_set["is_wic_at_nutritional_risk"] + for i, (cat, recv) in enumerate( + zip(info["wic_cats"], info["receives_wic"]) + ): + if cat == "NONE" and not recv: + assert not risk[i], ( + f"Person {i} (NONE, no WIC) should " + f"not be at nutritional risk" + ) + def test_zero_child_eitc_rate_is_lower(self): """Tax units with 0 children should have a lower takeup rate than those with children, verified