From 6ecfc19688516f980cbcc7734991118ae7f83b20 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 16 Feb 2026 08:37:40 -0500 Subject: [PATCH 1/6] Add PUF impute + source impute modules, fix $5.8M AGI ceiling (issue #530) Adds puf_impute.py and source_impute.py from PR #516 (by @MaxGhenis), refactors extended_cps.py to delegate to the new modules, and integrates both into the unified calibration pipeline. The core fix removes the subsample(10_000) call that dropped high-income PUF records before QRF training, which caused a hard AGI ceiling at ~$6.26M after uprating. Co-Authored-By: Max Ghenis Co-Authored-By: Claude Opus 4.6 --- changelog_entry.yaml | 24 +- .../calibration/puf_impute.py | 555 +++++++++++++ .../calibration/source_impute.py | 757 ++++++++++++++++++ .../calibration/unified_calibration.py | 200 ++++- .../datasets/cps/extended_cps.py | 438 +--------- .../tests/test_calibration/test_puf_impute.py | 134 ++++ .../test_calibration/test_source_impute.py | 169 ++++ 7 files changed, 1843 insertions(+), 434 deletions(-) create mode 100644 policyengine_us_data/calibration/puf_impute.py create mode 100644 policyengine_us_data/calibration/source_impute.py create mode 100644 policyengine_us_data/tests/test_calibration/test_puf_impute.py create mode 100644 policyengine_us_data/tests/test_calibration/test_source_impute.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e4c231aa..8e9b4fb6 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,22 +1,12 @@ - bump: minor changes: added: - - Census-block-first calibration pipeline (calibration/ package) ported from PR #516 - - Clone-and-assign module for population-weighted census block sampling - - Unified matrix builder with clone-by-clone simulation, COO caching, and target_overview-based querying - - Unified calibration CLI with L0 optimization and seeded takeup re-randomization - - 28 new tests for the calibration pipeline - - Integration test for build_matrix geographic masking 
(national/state/CD) - - Tests for drop_target_groups utility - - voluntary_filing.yaml takeup rate parameter + - PUF clone + QRF imputation module (puf_impute.py) with state_fips predictor, fixing $5.8M AGI ceiling from subsample bug (issue #530) + - ACS/SIPP/SCF geographic re-imputation module (source_impute.py) with state predictor + - PUF and source impute integration into unified calibration pipeline (--puf-dataset, --skip-puf, --skip-source-impute flags) + - 21 new tests for puf_impute and source_impute modules changed: - - Rewrote local_area_calibration_setup.ipynb for clone-based pipeline - - Renamed _get_geo_level to get_geo_level (now cross-module public API) + - Refactored extended_cps.py to delegate to puf_impute.puf_clone_dataset() (443 -> 75 lines) + - unified_calibration.py pipeline now supports optional source imputation and PUF cloning steps fixed: - - Fix Jupyter import error in unified_calibration.py (OutStream.reconfigure moved to main) - - Fix modal_app/remote_calibration_runner.py referencing deleted fit_calibration_weights.py - - Fix _coo_parts stale state bug on build_matrix re-call after failure - - Remove hardcoded voluntary_filing rate in favor of YAML parameter - removed: - - SparseMatrixBuilder, MatrixTracer, and fit_calibration_weights (replaced by unified pipeline) - - 8 old SparseMatrixBuilder-dependent tests (replaced by new test_calibration suite) + - QRF training now uses ALL PUF records instead of subsample(10_000), removing hard AGI ceiling at ~$6.26M diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py new file mode 100644 index 00000000..9c185c9f --- /dev/null +++ b/policyengine_us_data/calibration/puf_impute.py @@ -0,0 +1,555 @@ +"""PUF clone and QRF imputation for calibration pipeline. + +Doubles CPS records: one half keeps original values, the other half +gets PUF tax variables imputed via Quantile Random Forest. 
Geography +(state_fips) is included as a QRF predictor so imputations vary by +state. + +Usage within the calibration pipeline: + 1. Load raw CPS dataset + 2. Clone 10x and assign geography + 3. Call puf_clone_dataset() to double records and impute PUF vars + 4. Save expanded dataset for matrix building +""" + +import gc +import logging +import time +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +DEMOGRAPHIC_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "tax_unit_count_dependents", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + "state_fips", +] + +IMPUTED_VARIABLES = [ + "employment_income", + "partnership_s_corp_income", + "social_security", + "taxable_pension_income", + "interest_deduction", + "tax_exempt_pension_income", + "long_term_capital_gains", + "unreimbursed_business_employee_expenses", + "pre_tax_contributions", + "taxable_ira_distributions", + "self_employment_income", + "w2_wages_from_qualified_business", + "unadjusted_basis_qualified_property", + "business_is_sstb", + "short_term_capital_gains", + "qualified_dividend_income", + "charitable_cash_donations", + "self_employed_pension_contribution_ald", + "unrecaptured_section_1250_gain", + "taxable_unemployment_compensation", + "taxable_interest_income", + "domestic_production_ald", + "self_employed_health_insurance_ald", + "rental_income", + "non_qualified_dividend_income", + "cdcc_relevant_expenses", + "tax_exempt_interest_income", + "salt_refund_income", + "foreign_tax_credit", + "estate_income", + "charitable_non_cash_donations", + "american_opportunity_credit", + "miscellaneous_income", + "alimony_expense", + "farm_income", + "partnership_se_income", + "alimony_income", + "health_savings_account_ald", + "non_sch_d_capital_gains", + "general_business_credit", + "energy_efficient_home_improvement_credit", + "traditional_ira_contributions", + "amt_foreign_tax_credit", + 
"excess_withheld_payroll_tax", + "savers_credit", + "student_loan_interest", + "investment_income_elected_form_4952", + "early_withdrawal_penalty", + "prior_year_minimum_tax_credit", + "farm_rent_income", + "qualified_tuition_expenses", + "educator_expense", + "long_term_capital_gains_on_collectibles", + "other_credits", + "casualty_loss", + "unreported_payroll_tax", + "recapture_of_investment_credit", + "deductible_mortgage_interest", + "qualified_reit_and_ptp_income", + "qualified_bdc_income", + "farm_operations_income", + "estate_income_would_be_qualified", + "farm_operations_income_would_be_qualified", + "farm_rent_income_would_be_qualified", + "partnership_s_corp_income_would_be_qualified", + "rental_income_would_be_qualified", + "self_employment_income_would_be_qualified", +] + +OVERRIDDEN_IMPUTED_VARIABLES = [ + "partnership_s_corp_income", + "interest_deduction", + "unreimbursed_business_employee_expenses", + "pre_tax_contributions", + "w2_wages_from_qualified_business", + "unadjusted_basis_qualified_property", + "business_is_sstb", + "charitable_cash_donations", + "self_employed_pension_contribution_ald", + "unrecaptured_section_1250_gain", + "taxable_unemployment_compensation", + "domestic_production_ald", + "self_employed_health_insurance_ald", + "cdcc_relevant_expenses", + "salt_refund_income", + "foreign_tax_credit", + "estate_income", + "charitable_non_cash_donations", + "american_opportunity_credit", + "miscellaneous_income", + "alimony_expense", + "health_savings_account_ald", + "non_sch_d_capital_gains", + "general_business_credit", + "energy_efficient_home_improvement_credit", + "amt_foreign_tax_credit", + "excess_withheld_payroll_tax", + "savers_credit", + "student_loan_interest", + "investment_income_elected_form_4952", + "early_withdrawal_penalty", + "prior_year_minimum_tax_credit", + "farm_rent_income", + "qualified_tuition_expenses", + "educator_expense", + "long_term_capital_gains_on_collectibles", + "other_credits", + "casualty_loss", + 
"unreported_payroll_tax", + "recapture_of_investment_credit", + "deductible_mortgage_interest", + "qualified_reit_and_ptp_income", + "qualified_bdc_income", + "farm_operations_income", + "estate_income_would_be_qualified", + "farm_operations_income_would_be_qualified", + "farm_rent_income_would_be_qualified", + "partnership_s_corp_income_would_be_qualified", + "rental_income_would_be_qualified", +] + + +def puf_clone_dataset( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int = 2024, + puf_dataset=None, + skip_qrf: bool = False, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Clone CPS data 2x and impute PUF variables on one half. + + The first half keeps CPS values (with OVERRIDDEN vars QRF'd). + The second half gets full PUF QRF imputation. The second half + has household weights set to zero. + + Args: + data: CPS dataset dict {variable: {time_period: array}}. + state_fips: State FIPS per household, shape (n_households,). + time_period: Tax year. + puf_dataset: PUF dataset class or path for QRF training. + If None, skips QRF (same as skip_qrf=True). + skip_qrf: If True, skip QRF imputation (for testing). + dataset_path: Path to CPS h5 file (needed for QRF to + compute demographic predictors via Microsimulation). + + Returns: + New data dict with doubled records. 
+ """ + household_ids = data["household_id"][time_period] + n_households = len(household_ids) + person_count = len(data["person_id"][time_period]) + + logger.info( + "PUF clone: %d households, %d persons", + n_households, + person_count, + ) + + y_full = None + y_override = None + if not skip_qrf and puf_dataset is not None: + y_full, y_override = _run_qrf_imputation( + data, + state_fips, + time_period, + puf_dataset, + dataset_path=dataset_path, + ) + + cps_sim = None + tbs = None + if (y_full or y_override) and dataset_path is not None: + from policyengine_us import Microsimulation + + cps_sim = Microsimulation(dataset=dataset_path) + tbs = cps_sim.tax_benefit_system + + def _map_to_entity(pred_values, variable_name): + if cps_sim is None or tbs is None: + return pred_values + var_meta = tbs.variables.get(variable_name) + if var_meta is None: + return pred_values + entity = var_meta.entity.key + if entity != "person": + return cps_sim.populations[entity].value_from_first_person( + pred_values + ) + return pred_values + + # Impute weeks_unemployed for PUF half + puf_weeks = None + if y_full is not None and dataset_path is not None: + puf_weeks = _impute_weeks_unemployed( + data, y_full, time_period, dataset_path + ) + + new_data = {} + for variable, time_dict in data.items(): + values = time_dict[time_period] + + if variable in OVERRIDDEN_IMPUTED_VARIABLES and y_override: + pred = _map_to_entity(y_override[variable], variable) + new_data[variable] = {time_period: np.concatenate([pred, pred])} + elif variable in IMPUTED_VARIABLES and y_full: + pred = _map_to_entity(y_full[variable], variable) + new_data[variable] = {time_period: np.concatenate([values, pred])} + elif variable == "person_id": + new_data[variable] = { + time_period: np.concatenate([values, values + values.max()]) + } + elif "_id" in variable: + new_data[variable] = { + time_period: np.concatenate([values, values + values.max()]) + } + elif "_weight" in variable: + new_data[variable] = { + 
time_period: np.concatenate([values, values * 0]) + } + elif variable == "weeks_unemployed" and puf_weeks is not None: + new_data[variable] = { + time_period: np.concatenate([values, puf_weeks]) + } + else: + new_data[variable] = { + time_period: np.concatenate([values, values]) + } + + new_data["state_fips"] = { + time_period: np.concatenate([state_fips, state_fips]).astype(np.int32) + } + + if y_full: + for var in IMPUTED_VARIABLES: + if var not in data: + pred = _map_to_entity(y_full[var], var) + orig = np.zeros_like(pred) + new_data[var] = {time_period: np.concatenate([orig, pred])} + + if cps_sim is not None: + del cps_sim + + logger.info( + "PUF clone complete: %d -> %d households", + n_households, + n_households * 2, + ) + return new_data + + +def _impute_weeks_unemployed( + data: Dict[str, Dict[int, np.ndarray]], + puf_imputations: Dict[str, np.ndarray], + time_period: int, + dataset_path: str, +) -> np.ndarray: + """Impute weeks_unemployed for the PUF half using QRF. + + Uses CPS as training data and imputed PUF demographics as + test data, preserving the joint distribution of weeks with + unemployment compensation. + + Args: + data: CPS data dict. + puf_imputations: Dict of PUF-imputed variable arrays. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Array of imputed weeks for PUF half. 
+ """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + cps_sim = Microsimulation(dataset=dataset_path) + try: + cps_weeks = cps_sim.calculate("weeks_unemployed").values + except (ValueError, KeyError): + logger.warning("weeks_unemployed not in CPS, returning zeros") + n_persons = len(data["person_id"][time_period]) + del cps_sim + return np.zeros(n_persons) + + WEEKS_PREDICTORS = [ + "age", + "is_male", + "tax_unit_is_joint", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + ] + + X_train = cps_sim.calculate_dataframe(WEEKS_PREDICTORS) + X_train["weeks_unemployed"] = cps_weeks + + if "taxable_unemployment_compensation" in puf_imputations: + cps_uc = cps_sim.calculate("unemployment_compensation").values + X_train["unemployment_compensation"] = cps_uc + WEEKS_PREDICTORS = WEEKS_PREDICTORS + ["unemployment_compensation"] + + X_test = cps_sim.calculate_dataframe( + [p for p in WEEKS_PREDICTORS if p != "unemployment_compensation"] + ) + + if "taxable_unemployment_compensation" in puf_imputations: + X_test["unemployment_compensation"] = puf_imputations[ + "taxable_unemployment_compensation" + ] + + del cps_sim + + qrf = QRF(log_level="INFO", memory_efficient=True) + sample_size = min(5000, len(X_train)) + if len(X_train) > sample_size: + X_train_sampled = X_train.sample(n=sample_size, random_state=42) + else: + X_train_sampled = X_train + + fitted = qrf.fit( + X_train=X_train_sampled, + predictors=WEEKS_PREDICTORS, + imputed_variables=["weeks_unemployed"], + n_jobs=1, + ) + predictions = fitted.predict(X_test=X_test) + imputed_weeks = predictions["weeks_unemployed"].values + + imputed_weeks = np.clip(imputed_weeks, 0, 52) + if "unemployment_compensation" in X_test.columns: + imputed_weeks = np.where( + X_test["unemployment_compensation"].values > 0, + imputed_weeks, + 0, + ) + + logger.info( + "Imputed weeks_unemployed for PUF: " "%d with weeks > 0, mean = %.1f", + (imputed_weeks > 0).sum(), + ( + 
imputed_weeks[imputed_weeks > 0].mean() + if (imputed_weeks > 0).any() + else 0 + ), + ) + + del fitted, predictions + gc.collect() + return imputed_weeks + + +def _run_qrf_imputation( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + puf_dataset, + dataset_path: Optional[str] = None, +) -> tuple: + """Run QRF imputation for PUF variables. + + Trains QRF on ALL PUF records (no subsample) with random + state assignment, and predicts on CPS data using + demographics + state_fips. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + puf_dataset: PUF dataset class or path. + dataset_path: Path to CPS h5 for computing + demographic predictors via Microsimulation. + + Returns: + Tuple of (y_full_imputations, y_override_imputations) + as dicts of {variable: np.ndarray}. + """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + logger.info("Running QRF imputation with state predictor") + + puf_sim = Microsimulation(dataset=puf_dataset) + + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, + ) + + _, _, puf_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=99) + n_puf = len(puf_sim.calculate("person_id", map_to="person").values) + puf_state_indices = rng.choice(len(puf_states), size=n_puf, p=block_probs) + puf_state_fips = puf_states[puf_state_indices] + + demo_preds = [p for p in DEMOGRAPHIC_PREDICTORS if p != "state_fips"] + + X_train_full = puf_sim.calculate_dataframe(demo_preds + IMPUTED_VARIABLES) + X_train_full["state_fips"] = puf_state_fips.astype(np.float32) + + X_train_override = puf_sim.calculate_dataframe( + demo_preds + OVERRIDDEN_IMPUTED_VARIABLES + ) + X_train_override["state_fips"] = puf_state_fips.astype(np.float32) + + del puf_sim + + n_hh = len(data["household_id"][time_period]) + person_count = len(data["person_id"][time_period]) + + if 
dataset_path is not None: + cps_sim = Microsimulation(dataset=dataset_path) + X_test = cps_sim.calculate_dataframe(demo_preds) + del cps_sim + else: + X_test = pd.DataFrame() + for pred in demo_preds: + if pred in data: + X_test[pred] = data[pred][time_period].astype(np.float32) + + hh_ids_person = data.get("person_household_id", {}).get(time_period) + if hh_ids_person is not None: + hh_ids = data["household_id"][time_period] + hh_to_idx = {int(hh_id): i for i, hh_id in enumerate(hh_ids)} + person_states = np.array( + [state_fips[hh_to_idx[int(hh_id)]] for hh_id in hh_ids_person] + ) + else: + person_states = np.repeat( + state_fips, + person_count // n_hh, + ) + X_test["state_fips"] = person_states.astype(np.float32) + + predictors = DEMOGRAPHIC_PREDICTORS + + logger.info("Imputing %d PUF variables (full)", len(IMPUTED_VARIABLES)) + y_full = _batch_qrf(X_train_full, X_test, predictors, IMPUTED_VARIABLES) + + logger.info( + "Imputing %d PUF variables (override)", + len(OVERRIDDEN_IMPUTED_VARIABLES), + ) + y_override = _batch_qrf( + X_train_override, + X_test, + predictors, + OVERRIDDEN_IMPUTED_VARIABLES, + ) + + return y_full, y_override + + +def _batch_qrf( + X_train: pd.DataFrame, + X_test: pd.DataFrame, + predictors: List[str], + output_vars: List[str], + batch_size: int = 10, +) -> Dict[str, np.ndarray]: + """Run QRF in batches to control memory. + + Args: + X_train: Training data with predictors + output vars. + X_test: Test data with predictors only. + predictors: Predictor column names. + output_vars: Output variable names to impute. + batch_size: Variables per batch. + + Returns: + Dict mapping variable name to imputed values. 
+ """ + from microimpute.models.qrf import QRF + + available = [c for c in output_vars if c in X_train.columns] + missing = [c for c in output_vars if c not in X_train.columns] + + if missing: + logger.warning( + "%d variables missing from training: %s", + len(missing), + missing[:5], + ) + + result = {} + + for batch_start in range(0, len(available), batch_size): + batch_end = min(batch_start + batch_size, len(available)) + batch_vars = available[batch_start:batch_end] + + gc.collect() + + qrf = QRF( + log_level="INFO", + memory_efficient=True, + batch_size=10, + cleanup_interval=5, + ) + + batch_X_train = X_train[predictors + batch_vars].copy() + + fitted = qrf.fit( + X_train=batch_X_train, + predictors=predictors, + imputed_variables=batch_vars, + n_jobs=1, + ) + + predictions = fitted.predict(X_test=X_test) + + for var in batch_vars: + result[var] = predictions[var].values + + del fitted, predictions, batch_X_train + gc.collect() + + n_test = len(X_test) + for var in missing: + result[var] = np.zeros(n_test) + + return result diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py new file mode 100644 index 00000000..1d5021a2 --- /dev/null +++ b/policyengine_us_data/calibration/source_impute.py @@ -0,0 +1,757 @@ +"""Non-PUF QRF imputations with state_fips as predictor. + +Re-imputes variables from ACS, SIPP, and SCF donor surveys +with state_fips included as a QRF predictor. This runs after +geography assignment so imputations reflect assigned state. + +Sources and variables: + ACS -> rent, real_estate_taxes + SIPP -> tip_income, bank_account_assets, stock_assets, + bond_assets + SCF -> net_worth, auto_loan_balance, auto_loan_interest + +Usage in unified calibration pipeline: + 1. Load raw CPS + 2. Clone Nx, assign geography + 3. impute_source_variables() <-- this module + 4. PUF clone + QRF impute (puf_impute.py) + 5. 
PE simulate, build matrix, calibrate +""" + +import gc +import logging +from typing import Dict, Optional + +import numpy as np +import pandas as pd + +logger = logging.getLogger(__name__) + +ACS_IMPUTED_VARIABLES = [ + "rent", + "real_estate_taxes", +] + +SIPP_IMPUTED_VARIABLES = [ + "tip_income", + "bank_account_assets", + "stock_assets", + "bond_assets", +] + +SCF_IMPUTED_VARIABLES = [ + "net_worth", + "auto_loan_balance", + "auto_loan_interest", +] + +ALL_SOURCE_VARIABLES = ( + ACS_IMPUTED_VARIABLES + SIPP_IMPUTED_VARIABLES + SCF_IMPUTED_VARIABLES +) + +ACS_PREDICTORS = [ + "is_household_head", + "age", + "is_male", + "tenure_type", + "employment_income", + "self_employment_income", + "social_security", + "pension_income", + "household_size", +] + +SIPP_TIPS_PREDICTORS = [ + "employment_income", + "age", + "count_under_18", + "count_under_6", +] + +SIPP_ASSETS_PREDICTORS = [ + "employment_income", + "age", + "is_female", + "is_married", + "count_under_18", +] + +SCF_PREDICTORS = [ + "age", + "is_female", + "cps_race", + "is_married", + "own_children_in_household", + "employment_income", + "interest_dividend_income", + "social_security_pension_income", +] + + +def impute_source_variables( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int = 2024, + dataset_path: Optional[str] = None, + skip_acs: bool = False, + skip_sipp: bool = False, + skip_scf: bool = False, +) -> Dict[str, Dict[int, np.ndarray]]: + """Re-impute ACS/SIPP/SCF variables with state as predictor. + + Overwrites existing imputed values in data with new values + that use assigned state_fips as a QRF predictor. + + Args: + data: CPS dataset dict {variable: {time_period: array}}. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + skip_acs: Skip ACS imputation. + skip_sipp: Skip SIPP imputation. + skip_scf: Skip SCF imputation. + + Returns: + Updated data dict with re-imputed variables. 
+ """ + data["state_fips"] = { + time_period: state_fips.astype(np.int32), + } + + if not skip_acs: + logger.info("Imputing ACS variables with state predictor") + data = _impute_acs(data, state_fips, time_period, dataset_path) + + if not skip_sipp: + logger.info("Imputing SIPP variables with state predictor") + data = _impute_sipp(data, state_fips, time_period, dataset_path) + + if not skip_scf: + logger.info("Imputing SCF variables with state predictor") + data = _impute_scf(data, state_fips, time_period, dataset_path) + + return data + + +def _build_cps_receiver( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, + dataset_path: Optional[str], + pe_variables: list, +) -> pd.DataFrame: + """Build CPS receiver DataFrame from Microsimulation. + + Args: + data: CPS data dict. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + pe_variables: List of PE variable names to compute. + + Returns: + DataFrame with requested columns. + """ + if dataset_path is not None: + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=dataset_path) + tbs = sim.tax_benefit_system + valid_vars = [v for v in pe_variables if v in tbs.variables] + if valid_vars: + df = sim.calculate_dataframe(valid_vars) + else: + df = pd.DataFrame(index=range(len(data["person_id"][time_period]))) + del sim + else: + df = pd.DataFrame() + + for var in pe_variables: + if var not in df.columns and var in data: + df[var] = data[var][time_period].astype(np.float32) + + return df + + +def _get_variable_entity(variable_name: str) -> str: + """Return the entity key for a PE variable.""" + from policyengine_us import CountryTaxBenefitSystem + + tbs = CountryTaxBenefitSystem() + var = tbs.variables.get(variable_name) + if var is None: + return "person" + return var.entity.key + + +def _person_state_fips( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, +) -> np.ndarray: + """Map household-level state_fips to person 
level. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + + Returns: + Person-level state FIPS array. + """ + hh_ids_person = data.get("person_household_id", {}).get(time_period) + if hh_ids_person is not None: + hh_ids = data["household_id"][time_period] + hh_to_idx = {int(hh_id): i for i, hh_id in enumerate(hh_ids)} + return np.array( + [state_fips[hh_to_idx[int(hh_id)]] for hh_id in hh_ids_person] + ) + n_hh = len(data["household_id"][time_period]) + n_persons = len(data["person_id"][time_period]) + return np.repeat(state_fips, n_persons // n_hh) + + +def _impute_acs( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Impute rent and real_estate_taxes from ACS with state. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Updated data dict. 
+ """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + from policyengine_us_data.datasets.acs.acs import ACS_2022 + + acs = Microsimulation(dataset=ACS_2022) + predictors = ACS_PREDICTORS + ["state_fips"] + + acs_df = acs.calculate_dataframe(ACS_PREDICTORS + ACS_IMPUTED_VARIABLES) + acs_df["state_fips"] = acs.calculate( + "state_fips", map_to="person" + ).values.astype(np.float32) + + train_df = acs_df[acs_df.is_household_head].sample(10_000, random_state=42) + if "tenure_type" in train_df.columns: + train_df["tenure_type"] = ( + train_df["tenure_type"] + .astype(str) + .map( + { + "OWNED_WITH_MORTGAGE": 1, + "OWNED_OUTRIGHT": 1, + "RENTED": 2, + "NONE": 0, + } + ) + .fillna(0) + .astype(np.float32) + ) + del acs + + if dataset_path is not None: + cps_sim = Microsimulation(dataset=dataset_path) + cps_df = cps_sim.calculate_dataframe(ACS_PREDICTORS) + del cps_sim + else: + cps_df = pd.DataFrame() + for pred in ACS_PREDICTORS: + if pred in data: + cps_df[pred] = data[pred][time_period].astype(np.float32) + + if "tenure_type" in cps_df.columns: + cps_df["tenure_type"] = ( + cps_df["tenure_type"] + .astype(str) + .map( + { + "OWNED_WITH_MORTGAGE": 1, + "OWNED_OUTRIGHT": 1, + "RENTED": 2, + "NONE": 0, + } + ) + .fillna(0) + .astype(np.float32) + ) + + person_states = _person_state_fips(data, state_fips, time_period) + cps_df["state_fips"] = person_states.astype(np.float32) + + mask = ( + cps_df.is_household_head.values + if "is_household_head" in cps_df.columns + else np.ones(len(cps_df), dtype=bool) + ) + cps_heads = cps_df[mask] + + qrf = QRF() + logger.info( + "ACS QRF: %d train, %d test, %d predictors", + len(train_df), + len(cps_heads), + len(predictors), + ) + fitted = qrf.fit( + X_train=train_df, + predictors=predictors, + imputed_variables=ACS_IMPUTED_VARIABLES, + ) + predictions = fitted.predict(X_test=cps_heads) + + n_persons = len(data["person_id"][time_period]) + for var in ACS_IMPUTED_VARIABLES: + values = 
np.zeros(n_persons, dtype=np.float32) + values[mask] = predictions[var].values + data[var] = {time_period: values} + data["pre_subsidy_rent"] = {time_period: data["rent"][time_period].copy()} + + del fitted, predictions + gc.collect() + + logger.info("ACS imputation complete: rent, real_estate_taxes") + return data + + +def _impute_sipp( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Impute tip_income and liquid assets from SIPP with state. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Updated data dict. + """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, + ) + + _, _, donor_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=88) + + from policyengine_us_data.storage import STORAGE_FOLDER + from huggingface_hub import hf_hub_download + + hf_hub_download( + repo_id="PolicyEngine/policyengine-us-data", + filename="pu2023_slim.csv", + repo_type="model", + local_dir=STORAGE_FOLDER, + ) + sipp_df = pd.read_csv(STORAGE_FOLDER / "pu2023_slim.csv") + + sipp_df["tip_income"] = ( + sipp_df[sipp_df.columns[sipp_df.columns.str.contains("TXAMT")]] + .fillna(0) + .sum(axis=1) + * 12 + ) + sipp_df["employment_income"] = sipp_df.TPTOTINC * 12 + sipp_df["age"] = sipp_df.TAGE + sipp_df["household_weight"] = sipp_df.WPFINWGT + sipp_df["household_id"] = sipp_df.SSUID + + sipp_df["is_under_18"] = sipp_df.TAGE < 18 + sipp_df["is_under_6"] = sipp_df.TAGE < 6 + sipp_df["count_under_18"] = ( + sipp_df.groupby("SSUID")["is_under_18"] + .sum() + .loc[sipp_df.SSUID.values] + .values + ) + sipp_df["count_under_6"] = ( + sipp_df.groupby("SSUID")["is_under_6"] + .sum() + 
.loc[sipp_df.SSUID.values] + .values + ) + + tip_cols = [ + "household_id", + "employment_income", + "tip_income", + "count_under_18", + "count_under_6", + "age", + "household_weight", + ] + tip_train = sipp_df[tip_cols].dropna() + tip_train = tip_train.loc[ + rng.choice( + tip_train.index, + size=min(10_000, len(tip_train)), + replace=True, + p=(tip_train.household_weight / tip_train.household_weight.sum()), + ) + ] + + tip_state_idx = rng.choice( + len(donor_states), size=len(tip_train), p=block_probs + ) + tip_train["state_fips"] = donor_states[tip_state_idx].astype(np.float32) + + cps_tip_df = _build_cps_receiver( + data, time_period, dataset_path, ["employment_income", "age"] + ) + person_ages = data["age"][time_period] + hh_ids_person = data.get("person_household_id", {}).get(time_period) + if hh_ids_person is not None: + age_df = pd.DataFrame({"hh": hh_ids_person, "age": person_ages}) + under_18 = age_df.groupby("hh")["age"].apply(lambda x: (x < 18).sum()) + under_6 = age_df.groupby("hh")["age"].apply(lambda x: (x < 6).sum()) + cps_tip_df["count_under_18"] = under_18.loc[ + hh_ids_person + ].values.astype(np.float32) + cps_tip_df["count_under_6"] = under_6.loc[hh_ids_person].values.astype( + np.float32 + ) + else: + cps_tip_df["count_under_18"] = 0.0 + cps_tip_df["count_under_6"] = 0.0 + + person_states = _person_state_fips(data, state_fips, time_period) + cps_tip_df["state_fips"] = person_states.astype(np.float32) + + tip_predictors = SIPP_TIPS_PREDICTORS + ["state_fips"] + qrf = QRF() + logger.info( + "SIPP tips QRF: %d train, %d test", + len(tip_train), + len(cps_tip_df), + ) + fitted = qrf.fit( + X_train=tip_train, + predictors=tip_predictors, + imputed_variables=["tip_income"], + ) + tip_preds = fitted.predict(X_test=cps_tip_df) + data["tip_income"] = { + time_period: tip_preds["tip_income"].values, + } + del fitted, tip_preds + gc.collect() + + logger.info("SIPP tip imputation complete") + + # Asset imputation + try: + hf_hub_download( + 
repo_id="PolicyEngine/policyengine-us-data", + filename="pu2023.csv", + repo_type="model", + local_dir=STORAGE_FOLDER, + ) + asset_cols = [ + "SSUID", + "PNUM", + "MONTHCODE", + "WPFINWGT", + "TAGE", + "ESEX", + "EMS", + "TPTOTINC", + "TVAL_BANK", + "TVAL_STMF", + "TVAL_BOND", + ] + asset_df = pd.read_csv( + STORAGE_FOLDER / "pu2023.csv", + delimiter="|", + usecols=asset_cols, + ) + asset_df = asset_df[asset_df.MONTHCODE == 12] + + asset_df["bank_account_assets"] = asset_df["TVAL_BANK"].fillna(0) + asset_df["stock_assets"] = asset_df["TVAL_STMF"].fillna(0) + asset_df["bond_assets"] = asset_df["TVAL_BOND"].fillna(0) + asset_df["age"] = asset_df.TAGE + asset_df["is_female"] = asset_df.ESEX == 2 + asset_df["is_married"] = asset_df.EMS == 1 + asset_df["employment_income"] = asset_df.TPTOTINC * 12 + asset_df["household_weight"] = asset_df.WPFINWGT + asset_df["is_under_18"] = asset_df.TAGE < 18 + asset_df["count_under_18"] = ( + asset_df.groupby("SSUID")["is_under_18"] + .sum() + .loc[asset_df.SSUID.values] + .values + ) + + asset_train_cols = [ + "employment_income", + "bank_account_assets", + "stock_assets", + "bond_assets", + "age", + "is_female", + "is_married", + "count_under_18", + "household_weight", + ] + asset_train = asset_df[asset_train_cols].dropna() + asset_train = asset_train.loc[ + rng.choice( + asset_train.index, + size=min(20_000, len(asset_train)), + replace=True, + p=( + asset_train.household_weight + / asset_train.household_weight.sum() + ), + ) + ] + + asset_state_idx = rng.choice( + len(donor_states), + size=len(asset_train), + p=block_probs, + ) + asset_train["state_fips"] = donor_states[asset_state_idx].astype( + np.float32 + ) + + cps_asset_df = _build_cps_receiver( + data, + time_period, + dataset_path, + ["employment_income", "age", "is_male"], + ) + if "is_male" in cps_asset_df.columns: + cps_asset_df["is_female"] = ( + ~cps_asset_df["is_male"].astype(bool) + ).astype(np.float32) + else: + cps_asset_df["is_female"] = 0.0 + if "is_married" in 
data: + cps_asset_df["is_married"] = data["is_married"][ + time_period + ].astype(np.float32) + else: + cps_asset_df["is_married"] = 0.0 + cps_asset_df["count_under_18"] = ( + cps_tip_df["count_under_18"] + if "count_under_18" in cps_tip_df.columns + else 0.0 + ) + + cps_asset_df["state_fips"] = person_states.astype(np.float32) + + asset_predictors = SIPP_ASSETS_PREDICTORS + ["state_fips"] + asset_vars = [ + "bank_account_assets", + "stock_assets", + "bond_assets", + ] + qrf = QRF() + logger.info( + "SIPP assets QRF: %d train, %d test", + len(asset_train), + len(cps_asset_df), + ) + fitted = qrf.fit( + X_train=asset_train, + predictors=asset_predictors, + imputed_variables=asset_vars, + ) + asset_preds = fitted.predict(X_test=cps_asset_df) + + for var in asset_vars: + data[var] = { + time_period: asset_preds[var].values, + } + del fitted, asset_preds + gc.collect() + + logger.info("SIPP asset imputation complete") + + except Exception as e: + logger.warning( + "SIPP asset imputation failed: %s. " "Keeping existing values.", + e, + ) + + return data + + +def _impute_scf( + data: Dict[str, Dict[int, np.ndarray]], + state_fips: np.ndarray, + time_period: int, + dataset_path: Optional[str] = None, +) -> Dict[str, Dict[int, np.ndarray]]: + """Impute net_worth and auto_loan from SCF with state. + + Args: + data: CPS data dict. + state_fips: State FIPS per household. + time_period: Tax year. + dataset_path: Path to CPS h5 for Microsimulation. + + Returns: + Updated data dict. 
+ """ + from microimpute.models.qrf import QRF + from policyengine_us import Microsimulation + + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, + ) + from policyengine_us_data.datasets.scf.scf import SCF_2022 + + _, _, donor_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=77) + + scf_dataset = SCF_2022() + scf_data = scf_dataset.load_dataset() + scf_df = pd.DataFrame({key: scf_data[key] for key in scf_data.keys()}) + + scf_state_idx = rng.choice( + len(donor_states), size=len(scf_df), p=block_probs + ) + scf_df["state_fips"] = donor_states[scf_state_idx].astype(np.float32) + + scf_predictors = SCF_PREDICTORS + ["state_fips"] + + available_preds = [p for p in scf_predictors if p in scf_df.columns] + missing_preds = [p for p in scf_predictors if p not in scf_df.columns] + if missing_preds: + logger.warning("SCF missing predictors: %s", missing_preds) + scf_predictors = available_preds + + scf_vars = SCF_IMPUTED_VARIABLES + if "networth" in scf_df.columns and "net_worth" not in scf_df.columns: + scf_df["net_worth"] = scf_df["networth"] + + available_vars = [v for v in scf_vars if v in scf_df.columns] + if not available_vars: + logger.warning("No SCF imputed variables available. 
Skipping.") + return data + + weights = scf_df.get("wgt") + + donor = scf_df[scf_predictors + available_vars].copy() + if weights is not None: + donor["wgt"] = weights + donor = donor.dropna(subset=scf_predictors) + donor = donor.sample(frac=0.5, random_state=42).reset_index(drop=True) + + pe_vars = [ + "age", + "is_male", + "employment_income", + ] + cps_df = _build_cps_receiver(data, time_period, dataset_path, pe_vars) + + if "is_male" in cps_df.columns: + cps_df["is_female"] = (~cps_df["is_male"].astype(bool)).astype( + np.float32 + ) + else: + cps_df["is_female"] = 0.0 + + for var in [ + "cps_race", + "is_married", + "own_children_in_household", + ]: + if var in data: + cps_df[var] = data[var][time_period].astype(np.float32) + else: + cps_df[var] = 0.0 + + for var in [ + "taxable_interest_income", + "tax_exempt_interest_income", + "qualified_dividend_income", + "non_qualified_dividend_income", + ]: + if var in data: + cps_df[var] = data[var][time_period].astype(np.float32) + cps_df["interest_dividend_income"] = ( + cps_df.get("taxable_interest_income", 0) + + cps_df.get("tax_exempt_interest_income", 0) + + cps_df.get("qualified_dividend_income", 0) + + cps_df.get("non_qualified_dividend_income", 0) + ).astype(np.float32) + + for var in [ + "tax_exempt_private_pension_income", + "taxable_private_pension_income", + "social_security_retirement", + ]: + if var in data: + cps_df[var] = data[var][time_period].astype(np.float32) + cps_df["social_security_pension_income"] = ( + cps_df.get("tax_exempt_private_pension_income", 0) + + cps_df.get("taxable_private_pension_income", 0) + + cps_df.get("social_security_retirement", 0) + ).astype(np.float32) + + person_states = _person_state_fips(data, state_fips, time_period) + cps_df["state_fips"] = person_states.astype(np.float32) + + qrf = QRF() + logger.info( + "SCF QRF: %d train, %d test, vars=%s", + len(donor), + len(cps_df), + available_vars, + ) + fitted = qrf.fit( + X_train=donor, + predictors=scf_predictors, + 
imputed_variables=available_vars, + weight_col="wgt" if weights is not None else None, + tune_hyperparameters=False, + ) + preds = fitted.predict(X_test=cps_df) + + hh_ids = data["household_id"][time_period] + person_hh_ids = data.get("person_household_id", {}).get(time_period) + + for var in available_vars: + person_vals = preds[var].values + entity = _get_variable_entity(var) + if entity == "household" and person_hh_ids is not None: + hh_vals = np.zeros(len(hh_ids), dtype=np.float32) + hh_to_idx = {int(hid): i for i, hid in enumerate(hh_ids)} + seen = set() + for p_idx, p_hh in enumerate(person_hh_ids): + hh_key = int(p_hh) + if hh_key not in seen: + seen.add(hh_key) + hh_vals[hh_to_idx[hh_key]] = person_vals[p_idx] + data[var] = {time_period: hh_vals} + logger.info( + " %s: person(%d) -> household(%d)", + var, + len(person_vals), + len(hh_vals), + ) + else: + data[var] = {time_period: person_vals} + + del fitted, preds + gc.collect() + + logger.info("SCF imputation complete: %s", available_vars) + return data diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index d2759b34..3a95952f 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -4,10 +4,12 @@ Pipeline flow: 1. Load CPS dataset -> get n_records 2. Clone Nx, assign random geography (census block) - 3. Re-randomize simple takeup variables per block - 4. Build sparse calibration matrix (clone-by-clone) - 5. L0-regularized optimization -> calibrated weights - 6. Save weights, diagnostics, run config + 3. (Optional) Source impute ACS/SIPP/SCF vars with state + 4. (Optional) PUF clone (2x) + QRF impute with state + 5. Re-randomize simple takeup variables per block + 6. Build sparse calibration matrix (clone-by-clone) + 7. L0-regularized optimization -> calibrated weights + 8. 
Save weights, diagnostics, run config Two presets control output size via L0 regularization: - local: L0=1e-8, ~3-4M records (for local area dataset) @@ -19,7 +21,8 @@ --db-path path/to/policy_data.db \\ --output path/to/weights.npy \\ --preset local \\ - --epochs 100 + --epochs 100 \\ + --puf-dataset path/to/puf_2024.h5 """ import argparse @@ -253,6 +256,21 @@ def parse_args(argv=None): action="store_true", help="Skip takeup re-randomization", ) + parser.add_argument( + "--puf-dataset", + default=None, + help="Path to PUF h5 file for QRF training", + ) + parser.add_argument( + "--skip-puf", + action="store_true", + help="Skip PUF clone + QRF imputation", + ) + parser.add_argument( + "--skip-source-impute", + action="store_true", + help="Skip ACS/SIPP/SCF re-imputation with state", + ) return parser.parse_args(argv) @@ -389,6 +407,84 @@ def compute_diagnostics( ) +def _build_puf_cloned_dataset( + dataset_path: str, + puf_dataset_path: str, + state_fips: np.ndarray, + time_period: int = 2024, + skip_qrf: bool = False, + skip_source_impute: bool = False, +) -> str: + """Build a PUF-cloned dataset from raw CPS. + + Loads the CPS, optionally runs source imputations + (ACS/SIPP/SCF), then PUF clone + QRF. + + Args: + dataset_path: Path to raw CPS h5 file. + puf_dataset_path: Path to PUF h5 file. + state_fips: State FIPS per household (base records). + time_period: Tax year. + skip_qrf: Skip QRF imputation. + skip_source_impute: Skip ACS/SIPP/SCF imputations. + + Returns: + Path to the PUF-cloned h5 file. 
+ """ + import h5py + + from policyengine_us import Microsimulation + + from policyengine_us_data.calibration.puf_impute import ( + puf_clone_dataset, + ) + + logger.info("Building PUF-cloned dataset from %s", dataset_path) + + sim = Microsimulation(dataset=dataset_path) + data = sim.dataset.load_dataset() + + data_dict = {} + for var in data: + data_dict[var] = {time_period: data[var][...]} + + if not skip_source_impute: + from policyengine_us_data.calibration.source_impute import ( + impute_source_variables, + ) + + data_dict = impute_source_variables( + data=data_dict, + state_fips=state_fips, + time_period=time_period, + dataset_path=dataset_path, + ) + + puf_dataset = puf_dataset_path if not skip_qrf else None + + new_data = puf_clone_dataset( + data=data_dict, + state_fips=state_fips, + time_period=time_period, + puf_dataset=puf_dataset, + skip_qrf=skip_qrf, + dataset_path=dataset_path, + ) + + output_path = str( + Path(dataset_path).parent / f"puf_cloned_{Path(dataset_path).stem}.h5" + ) + + with h5py.File(output_path, "w") as f: + for var, time_dict in new_data.items(): + for tp, values in time_dict.items(): + f.create_dataset(f"{var}/{tp}", data=values) + + del sim + logger.info("PUF-cloned dataset saved to %s", output_path) + return output_path + + def run_calibration( dataset_path: str, db_path: str, @@ -400,6 +496,9 @@ def run_calibration( domain_variables: list = None, hierarchical_domains: list = None, skip_takeup_rerandomize: bool = False, + puf_dataset_path: str = None, + skip_puf: bool = False, + skip_source_impute: bool = False, ): """Run unified calibration pipeline. @@ -415,6 +514,9 @@ def run_calibration( hierarchical_domains: Domains for hierarchical uprating + CD reconciliation. skip_takeup_rerandomize: Skip takeup step. + puf_dataset_path: Path to PUF h5 for QRF training. + skip_puf: Skip PUF clone step. + skip_source_impute: Skip ACS/SIPP/SCF imputations. 
Returns: (weights, targets_df, X_sparse, target_names) @@ -425,6 +527,7 @@ def run_calibration( from policyengine_us_data.calibration.clone_and_assign import ( assign_random_geography, + double_geography_for_puf, ) from policyengine_us_data.calibration.unified_matrix_builder import ( UnifiedMatrixBuilder, @@ -451,7 +554,76 @@ def run_calibration( seed=seed, ) - # Step 3: Build sim_modifier for takeup rerandomization + # Step 3: Source impute + PUF clone (if requested) + dataset_for_matrix = dataset_path + if not skip_puf and puf_dataset_path is not None: + base_states = geography.state_fips[:n_records] + + puf_cloned_path = _build_puf_cloned_dataset( + dataset_path=dataset_path, + puf_dataset_path=puf_dataset_path, + state_fips=base_states, + time_period=2024, + skip_qrf=False, + skip_source_impute=skip_source_impute, + ) + + geography = double_geography_for_puf(geography) + dataset_for_matrix = puf_cloned_path + n_records = n_records * 2 + + # Reload sim from PUF-cloned dataset + del sim + sim = Microsimulation(dataset=puf_cloned_path) + + logger.info( + "After PUF clone: %d records x %d clones = %d", + n_records, + n_clones, + n_records * n_clones, + ) + elif not skip_source_impute: + # Run source imputations without PUF cloning + import h5py + + base_states = geography.state_fips[:n_records] + + source_sim = Microsimulation(dataset=dataset_path) + raw_data = source_sim.dataset.load_dataset() + data_dict = {} + for var in raw_data: + data_dict[var] = {2024: raw_data[var][...]} + del source_sim + + from policyengine_us_data.calibration.source_impute import ( + impute_source_variables, + ) + + data_dict = impute_source_variables( + data=data_dict, + state_fips=base_states, + time_period=2024, + dataset_path=dataset_path, + ) + + source_path = str( + Path(dataset_path).parent + / f"source_imputed_{Path(dataset_path).stem}.h5" + ) + with h5py.File(source_path, "w") as f: + for var, time_dict in data_dict.items(): + for tp, values in time_dict.items(): + 
f.create_dataset(f"{var}/{tp}", data=values) + dataset_for_matrix = source_path + + del sim + sim = Microsimulation(dataset=source_path) + logger.info( + "Source-imputed dataset saved to %s", + source_path, + ) + + # Step 4: Build sim_modifier for takeup rerandomization sim_modifier = None if not skip_takeup_rerandomize: time_period = 2024 @@ -463,18 +635,18 @@ def sim_modifier(s, clone_idx): states = geography.state_fips[col_start:col_end] rerandomize_takeup(s, blocks, states, time_period) - # Step 4: Build target filter + # Step 5: Build target filter target_filter = {} if domain_variables: target_filter["domain_variables"] = domain_variables - # Step 5: Build sparse calibration matrix + # Step 6: Build sparse calibration matrix t_matrix = time.time() db_uri = f"sqlite:///{db_path}" builder = UnifiedMatrixBuilder( db_uri=db_uri, time_period=2024, - dataset_path=dataset_path, + dataset_path=dataset_for_matrix, ) targets_df, X_sparse, target_names = builder.build_matrix( geography=geography, @@ -495,7 +667,7 @@ def sim_modifier(s, clone_idx): X_sparse.nnz, ) - # Step 6: L0 calibration + # Step 7: L0 calibration targets = targets_df["value"].values row_sums = np.array(X_sparse.sum(axis=1)).flatten() @@ -570,6 +742,8 @@ def main(argv=None): t_start = time.time() + puf_dataset_path = getattr(args, "puf_dataset", None) + weights, targets_df, X_sparse, target_names = run_calibration( dataset_path=dataset_path, db_path=db_path, @@ -581,6 +755,9 @@ def main(argv=None): domain_variables=domain_variables, hierarchical_domains=hierarchical_domains, skip_takeup_rerandomize=(args.skip_takeup_rerandomize), + puf_dataset_path=puf_dataset_path, + skip_puf=getattr(args, "skip_puf", False), + skip_source_impute=getattr(args, "skip_source_impute", False), ) # Save weights @@ -612,6 +789,9 @@ def main(argv=None): run_config = { "dataset": dataset_path, "db_path": db_path, + "puf_dataset": puf_dataset_path, + "skip_puf": getattr(args, "skip_puf", False), + "skip_source_impute": 
getattr(args, "skip_source_impute", False), "n_clones": args.n_clones, "lambda_l0": lambda_l0, "epochs": args.epochs, diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index f96384a6..6af86ca0 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -1,140 +1,14 @@ +import logging + +import numpy as np from policyengine_core.data import Dataset -from policyengine_us_data.storage import STORAGE_FOLDER from typing import Type + +from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.datasets.cps.cps import * from policyengine_us_data.datasets.puf import * -import pandas as pd -from microimpute.models.qrf import QRF -import time -import logging -import gc -# These are sorted by magnitude. -# First 15 contain 90%. -# First 7 contain 75%. -# If you're trying to debug this part of the code and don't want to wait ages -# to see if something breaks, try limiting to those. 
-IMPUTED_VARIABLES = [ - "employment_income", - "partnership_s_corp_income", - "social_security", - "taxable_pension_income", - "interest_deduction", - "tax_exempt_pension_income", - "long_term_capital_gains", - "unreimbursed_business_employee_expenses", - "pre_tax_contributions", - "taxable_ira_distributions", - "self_employment_income", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "business_is_sstb", # bool - "short_term_capital_gains", - "qualified_dividend_income", - "charitable_cash_donations", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "taxable_unemployment_compensation", - "taxable_interest_income", - "domestic_production_ald", - "self_employed_health_insurance_ald", - "rental_income", - "non_qualified_dividend_income", - "cdcc_relevant_expenses", - "tax_exempt_interest_income", - "salt_refund_income", - "foreign_tax_credit", - "estate_income", - "charitable_non_cash_donations", - "american_opportunity_credit", - "miscellaneous_income", - "alimony_expense", - "farm_income", - "partnership_se_income", - "alimony_income", - "health_savings_account_ald", - "non_sch_d_capital_gains", - "general_business_credit", - "energy_efficient_home_improvement_credit", - "traditional_ira_contributions", - "amt_foreign_tax_credit", - "excess_withheld_payroll_tax", - "savers_credit", - "student_loan_interest", - "investment_income_elected_form_4952", - "early_withdrawal_penalty", - "prior_year_minimum_tax_credit", - "farm_rent_income", - "qualified_tuition_expenses", - "educator_expense", - "long_term_capital_gains_on_collectibles", - "other_credits", - "casualty_loss", - "unreported_payroll_tax", - "recapture_of_investment_credit", - "deductible_mortgage_interest", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "farm_operations_income", - "estate_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - 
"partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", - "self_employment_income_would_be_qualified", -] - -OVERRIDDEN_IMPUTED_VARIABLES = [ - "partnership_s_corp_income", - "interest_deduction", - "unreimbursed_business_employee_expenses", - "pre_tax_contributions", - "w2_wages_from_qualified_business", - "unadjusted_basis_qualified_property", - "business_is_sstb", - "charitable_cash_donations", - "self_employed_pension_contribution_ald", - "unrecaptured_section_1250_gain", - "taxable_unemployment_compensation", - "domestic_production_ald", - "self_employed_health_insurance_ald", - "cdcc_relevant_expenses", - "salt_refund_income", - "foreign_tax_credit", - "estate_income", - "charitable_non_cash_donations", - "american_opportunity_credit", - "miscellaneous_income", - "alimony_expense", - "health_savings_account_ald", - "non_sch_d_capital_gains", - "general_business_credit", - "energy_efficient_home_improvement_credit", - "amt_foreign_tax_credit", - "excess_withheld_payroll_tax", - "savers_credit", - "student_loan_interest", - "investment_income_elected_form_4952", - "early_withdrawal_penalty", - "prior_year_minimum_tax_credit", - "farm_rent_income", - "qualified_tuition_expenses", - "educator_expense", - "long_term_capital_gains_on_collectibles", - "other_credits", - "casualty_loss", - "unreported_payroll_tax", - "recapture_of_investment_credit", - "deductible_mortgage_interest", - "qualified_reit_and_ptp_income", - "qualified_bdc_income", - "farm_operations_income", - "estate_income_would_be_qualified", - "farm_operations_income_would_be_qualified", - "farm_rent_income_would_be_qualified", - "partnership_s_corp_income_would_be_qualified", - "rental_income_would_be_qualified", -] +logger = logging.getLogger(__name__) class ExtendedCPS(Dataset): @@ -145,290 +19,40 @@ class ExtendedCPS(Dataset): def generate(self): from policyengine_us import Microsimulation - cps_sim = Microsimulation(dataset=self.cps) - puf_sim = 
Microsimulation(dataset=self.puf) - - puf_sim.subsample(10_000) - - INPUTS = [ - "age", - "is_male", - "tax_unit_is_joint", - "tax_unit_count_dependents", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ] - - y_full_imputations = impute_income_variables( - cps_sim, - puf_sim, - predictors=INPUTS, - outputs=IMPUTED_VARIABLES, + from policyengine_us_data.calibration.clone_and_assign import ( + load_global_block_distribution, ) - y_cps_imputations = impute_income_variables( - cps_sim, - puf_sim, - predictors=INPUTS, - outputs=OVERRIDDEN_IMPUTED_VARIABLES, + from policyengine_us_data.calibration.puf_impute import ( + puf_clone_dataset, ) + + logger.info("Loading CPS dataset: %s", self.cps) cps_sim = Microsimulation(dataset=self.cps) data = cps_sim.dataset.load_dataset() - new_data = {} - - # Pre-compute weeks_unemployed imputation for PUF copy - # Preserve relationship between UC and weeks from CPS - puf_weeks_unemployed = impute_weeks_unemployed_for_puf( - cps_sim, y_full_imputations + del cps_sim + + data_dict = {} + for var in data: + data_dict[var] = {self.time_period: data[var][...]} + + n_hh = len(data_dict["household_id"][self.time_period]) + _, _, block_states, block_probs = load_global_block_distribution() + rng = np.random.default_rng(seed=42) + indices = rng.choice(len(block_states), size=n_hh, p=block_probs) + state_fips = block_states[indices] + + logger.info("PUF clone with dataset: %s", self.puf) + new_data = puf_clone_dataset( + data=data_dict, + state_fips=state_fips, + time_period=self.time_period, + puf_dataset=self.puf, + dataset_path=str(self.cps.file_path), ) - for variable in list(data) + IMPUTED_VARIABLES: - variable_metadata = cps_sim.tax_benefit_system.variables.get( - variable - ) - if variable in data: - values = data[variable][...] 
- else: - values = cps_sim.calculate(variable).values - if variable in OVERRIDDEN_IMPUTED_VARIABLES: - pred_values = y_cps_imputations[variable].values - entity = variable_metadata.entity.key - if entity != "person": - pred_values = cps_sim.populations[ - entity - ].value_from_first_person(pred_values) - values = np.concatenate([pred_values, pred_values]) - elif variable in IMPUTED_VARIABLES: - pred_values = y_full_imputations[variable].values - entity = variable_metadata.entity.key - if entity != "person": - pred_values = cps_sim.populations[ - entity - ].value_from_first_person(pred_values) - values = np.concatenate([values, pred_values]) - elif variable == "person_id": - values = np.concatenate([values, values + values.max()]) - elif "_id" in variable: - values = np.concatenate([values, values + values.max()]) - elif "_weight" in variable: - values = np.concatenate([values, values * 0]) - elif variable == "weeks_unemployed": - # Use imputed weeks for PUF copy to preserve UC relationship - values = np.concatenate([values, puf_weeks_unemployed]) - else: - values = np.concatenate([values, values]) - new_data[variable] = { - self.time_period: values, - } - self.save_dataset(new_data) -def impute_income_variables( - cps_sim, - puf_sim, - predictors: list[str] = None, - outputs: list[str] = None, -): - - # Calculate all variables together to preserve dependencies - X_train = puf_sim.calculate_dataframe(predictors + outputs) - - # Check which outputs are actually in the result - available_outputs = [col for col in outputs if col in X_train.columns] - missing_outputs = [col for col in outputs if col not in X_train.columns] - - if missing_outputs: - logging.warning( - f"The following {len(missing_outputs)} variables were not calculated: {missing_outputs}" - ) - # Log the specific missing variable that's causing issues - if "recapture_of_investment_credit" in missing_outputs: - logging.error( - "recapture_of_investment_credit is missing from PUF calculation!" 
- ) - - logging.info( - f"X_train shape: {X_train.shape}, columns: {len(X_train.columns)}" - ) - - X_test = cps_sim.calculate_dataframe(predictors) - - logging.info( - f"Imputing {len(available_outputs)} variables using batched sequential QRF" - ) - total_start = time.time() - - # Batch variables to avoid memory issues with sequential imputation - batch_size = 10 # Reduce to 10 variables at a time - result = pd.DataFrame(index=X_test.index) - - # Sample training data more aggressively upfront - sample_size = min(5000, len(X_train)) # Reduced from 5000 - if len(X_train) > sample_size: - logging.info( - f"Sampling training data from {len(X_train)} to {sample_size} rows" - ) - X_train_sampled = X_train.sample(n=sample_size, random_state=42) - else: - X_train_sampled = X_train - - for batch_start in range(0, len(available_outputs), batch_size): - batch_end = min(batch_start + batch_size, len(available_outputs)) - batch_vars = available_outputs[batch_start:batch_end] - - logging.info( - f"Processing batch {batch_start//batch_size + 1}: variables {batch_start+1}-{batch_end} ({batch_vars})" - ) - - # Force garbage collection before each batch - gc.collect() - - # Create a fresh QRF for each batch - qrf = QRF( - log_level="INFO", - memory_efficient=True, - batch_size=10, - cleanup_interval=5, - ) - - # Use pre-sampled data for this batch - batch_X_train = X_train_sampled[predictors + batch_vars].copy() - - # Fit model for this batch with sequential imputation within the batch - fitted_model = qrf.fit( - X_train=batch_X_train, - predictors=predictors, - imputed_variables=batch_vars, - n_jobs=1, # Single thread to reduce memory overhead - ) - - # Predict for this batch - batch_predictions = fitted_model.predict(X_test=X_test) - - # Extract median predictions and add to result - for var in batch_vars: - result[var] = batch_predictions[var] - - # Clean up batch objects - del fitted_model - del batch_predictions - del batch_X_train - gc.collect() - - logging.info(f"Completed 
batch {batch_start//batch_size + 1}") - - # Add zeros for missing variables - for var in missing_outputs: - result[var] = 0 - - logging.info( - f"Imputing {len(available_outputs)} variables took {time.time() - total_start:.2f} seconds total" - ) - - return result - - -def impute_weeks_unemployed_for_puf(cps_sim, puf_imputations): - """ - Impute weeks_unemployed for the PUF copy using QRF from CPS data. - - Uses microimpute's Quantile Random Forest to impute weeks_unemployed - for PUF records based on CPS data, preserving the joint distribution - of weeks with UC, age, and other predictors. - - This is the reverse of the income imputation (CPS → PUF instead of - PUF → CPS) because weeks_unemployed exists in CPS but not in PUF. - """ - # Get CPS weeks - try: - cps_weeks = cps_sim.calculate("weeks_unemployed").values - except (ValueError, KeyError): - logging.warning( - "weeks_unemployed not available in CPS, " - "returning zeros for PUF copy" - ) - n_persons = len(puf_imputations.index) - return np.zeros(n_persons) - - # Predictors available in both CPS and imputed PUF data - WEEKS_PREDICTORS = [ - "age", - "is_male", - "tax_unit_is_joint", - "is_tax_unit_head", - "is_tax_unit_spouse", - "is_tax_unit_dependent", - ] - - # Build training data from CPS - X_train = cps_sim.calculate_dataframe(WEEKS_PREDICTORS) - X_train["weeks_unemployed"] = cps_weeks - - # Add UC as predictor if available in imputations (strong predictor) - if "taxable_unemployment_compensation" in puf_imputations.columns: - cps_uc = cps_sim.calculate("unemployment_compensation").values - X_train["unemployment_compensation"] = cps_uc - WEEKS_PREDICTORS = WEEKS_PREDICTORS + ["unemployment_compensation"] - - # Build test data for PUF copy - # Use CPS sim to get demographics (same as CPS portion) - X_test = cps_sim.calculate_dataframe( - [p for p in WEEKS_PREDICTORS if p != "unemployment_compensation"] - ) - - # Add imputed UC if available - if "taxable_unemployment_compensation" in 
puf_imputations.columns: - X_test["unemployment_compensation"] = puf_imputations[ - "taxable_unemployment_compensation" - ].values - - logging.info( - f"Imputing weeks_unemployed using QRF with " - f"predictors: {WEEKS_PREDICTORS}" - ) - - # Use QRF to impute weeks - qrf = QRF( - log_level="INFO", - memory_efficient=True, - ) - - # Sample training data for efficiency - sample_size = min(5000, len(X_train)) - if len(X_train) > sample_size: - X_train_sampled = X_train.sample(n=sample_size, random_state=42) - else: - X_train_sampled = X_train - - fitted_model = qrf.fit( - X_train=X_train_sampled, - predictors=WEEKS_PREDICTORS, - imputed_variables=["weeks_unemployed"], - n_jobs=1, - ) - - predictions = fitted_model.predict(X_test=X_test) - imputed_weeks = predictions["weeks_unemployed"].values - - # Enforce constraints: 0-52 weeks, 0 if no UC - imputed_weeks = np.clip(imputed_weeks, 0, 52) - if "unemployment_compensation" in X_test.columns: - imputed_weeks = np.where( - X_test["unemployment_compensation"].values > 0, - imputed_weeks, - 0, - ) - - logging.info( - f"Imputed weeks_unemployed for PUF: " - f"{(imputed_weeks > 0).sum()} with weeks > 0, " - f"mean = {imputed_weeks[imputed_weeks > 0].mean():.1f} weeks" - ) - - return imputed_weeks - - class ExtendedCPS_2024(ExtendedCPS): cps = CPS_2024_Full puf = PUF_2024 diff --git a/policyengine_us_data/tests/test_calibration/test_puf_impute.py b/policyengine_us_data/tests/test_calibration/test_puf_impute.py new file mode 100644 index 00000000..153df144 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_puf_impute.py @@ -0,0 +1,134 @@ +"""Tests for puf_impute module. + +Verifies PUF clone + QRF imputation logic using mock data +so tests don't require real CPS/PUF datasets. 
+""" + +import numpy as np +import pytest + +from policyengine_us_data.calibration.puf_impute import ( + puf_clone_dataset, + DEMOGRAPHIC_PREDICTORS, + IMPUTED_VARIABLES, + OVERRIDDEN_IMPUTED_VARIABLES, +) + + +def _make_mock_data(n_persons=20, n_households=5, time_period=2024): + """Build a minimal mock CPS data dict.""" + person_ids = np.arange(1, n_persons + 1) + household_ids_person = np.repeat( + np.arange(1, n_households + 1), + n_persons // n_households, + ) + tax_unit_ids_person = household_ids_person.copy() + spm_unit_ids_person = household_ids_person.copy() + + rng = np.random.default_rng(42) + ages = rng.integers(18, 80, size=n_persons) + is_male = rng.integers(0, 2, size=n_persons) + + data = { + "person_id": {time_period: person_ids}, + "household_id": {time_period: np.arange(1, n_households + 1)}, + "tax_unit_id": {time_period: np.arange(1, n_households + 1)}, + "spm_unit_id": {time_period: np.arange(1, n_households + 1)}, + "person_household_id": {time_period: household_ids_person}, + "person_tax_unit_id": {time_period: tax_unit_ids_person}, + "person_spm_unit_id": {time_period: spm_unit_ids_person}, + "age": {time_period: ages.astype(np.float32)}, + "is_male": {time_period: is_male.astype(np.float32)}, + "household_weight": {time_period: np.ones(n_households) * 1000}, + "employment_income": { + time_period: rng.uniform(0, 100000, n_persons).astype(np.float32) + }, + } + return data + + +class TestPufCloneDataset: + def test_doubles_records(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + assert len(result["household_id"][2024]) == 10 + assert len(result["person_id"][2024]) == 40 + + def test_ids_are_unique(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + 
time_period=2024, + skip_qrf=True, + ) + + person_ids = result["person_id"][2024] + household_ids = result["household_id"][2024] + assert len(np.unique(person_ids)) == len(person_ids) + assert len(np.unique(household_ids)) == len(household_ids) + + def test_puf_half_weight_zero(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + weights = result["household_weight"][2024] + assert np.all(weights[:5] > 0) + assert np.all(weights[5:] == 0) + + def test_state_fips_preserved(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + result_states = result["state_fips"][2024] + np.testing.assert_array_equal(result_states[:5], state_fips) + np.testing.assert_array_equal(result_states[5:], state_fips) + + def test_demographics_shared(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + ages = result["age"][2024] + n = len(ages) // 2 + np.testing.assert_array_equal(ages[:n], ages[n:]) + + def test_demographic_predictors_includes_state(self): + assert "state_fips" in DEMOGRAPHIC_PREDICTORS + + def test_imputed_variables_not_empty(self): + assert len(IMPUTED_VARIABLES) > 0 + + def test_overridden_subset_of_imputed(self): + for var in OVERRIDDEN_IMPUTED_VARIABLES: + assert var in IMPUTED_VARIABLES diff --git a/policyengine_us_data/tests/test_calibration/test_source_impute.py b/policyengine_us_data/tests/test_calibration/test_source_impute.py new file mode 100644 index 00000000..676e66de --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_source_impute.py @@ -0,0 +1,169 @@ 
+"""Tests for source_impute module. + +Uses skip flags to avoid loading real donor data. +""" + +import numpy as np +import pytest + +from policyengine_us_data.calibration.source_impute import ( + ACS_IMPUTED_VARIABLES, + SIPP_IMPUTED_VARIABLES, + SCF_IMPUTED_VARIABLES, + ALL_SOURCE_VARIABLES, + impute_source_variables, + _impute_acs, + _impute_sipp, + _impute_scf, + _person_state_fips, +) + + +def _make_data_dict(n_persons=20, time_period=2024): + n_hh = n_persons // 2 + rng = np.random.default_rng(42) + return { + "person_id": { + time_period: np.arange(n_persons), + }, + "household_id": { + time_period: np.arange(n_hh), + }, + "person_household_id": { + time_period: np.repeat(np.arange(n_hh), 2), + }, + "age": { + time_period: rng.integers(18, 80, n_persons).astype(np.float32), + }, + "employment_income": { + time_period: rng.uniform(0, 100000, n_persons).astype(np.float32), + }, + "rent": {time_period: np.zeros(n_persons)}, + "real_estate_taxes": {time_period: np.zeros(n_persons)}, + "tip_income": {time_period: np.zeros(n_persons)}, + "bank_account_assets": {time_period: np.zeros(n_persons)}, + "stock_assets": {time_period: np.zeros(n_persons)}, + "bond_assets": {time_period: np.zeros(n_persons)}, + "net_worth": {time_period: np.zeros(n_persons)}, + "auto_loan_balance": {time_period: np.zeros(n_persons)}, + "auto_loan_interest": {time_period: np.zeros(n_persons)}, + } + + +class TestConstants: + def test_acs_variables_defined(self): + assert "rent" in ACS_IMPUTED_VARIABLES + assert "real_estate_taxes" in ACS_IMPUTED_VARIABLES + + def test_sipp_variables_defined(self): + assert "tip_income" in SIPP_IMPUTED_VARIABLES + assert "bank_account_assets" in SIPP_IMPUTED_VARIABLES + assert "stock_assets" in SIPP_IMPUTED_VARIABLES + assert "bond_assets" in SIPP_IMPUTED_VARIABLES + + def test_scf_variables_defined(self): + assert "net_worth" in SCF_IMPUTED_VARIABLES + assert "auto_loan_balance" in SCF_IMPUTED_VARIABLES + assert "auto_loan_interest" in SCF_IMPUTED_VARIABLES 
+ + def test_all_source_variables_defined(self): + expected = ( + ACS_IMPUTED_VARIABLES + + SIPP_IMPUTED_VARIABLES + + SCF_IMPUTED_VARIABLES + ) + assert ALL_SOURCE_VARIABLES == expected + + +class TestImputeSourceVariables: + def test_function_exists(self): + assert callable(impute_source_variables) + + def test_returns_dict(self): + data = _make_data_dict(n_persons=20) + state_fips = np.ones(10, dtype=np.int32) * 6 + + result = impute_source_variables( + data=data, + state_fips=state_fips, + time_period=2024, + skip_acs=True, + skip_sipp=True, + skip_scf=True, + ) + assert isinstance(result, dict) + for key in data: + assert key in result + + def test_skip_flags_preserve_data(self): + data = _make_data_dict(n_persons=20) + state_fips = np.ones(10, dtype=np.int32) * 6 + + result = impute_source_variables( + data=data, + state_fips=state_fips, + time_period=2024, + skip_acs=True, + skip_sipp=True, + skip_scf=True, + ) + + for var in [ + "rent", + "real_estate_taxes", + "tip_income", + "net_worth", + ]: + np.testing.assert_array_equal(result[var][2024], data[var][2024]) + + def test_state_fips_added_to_data(self): + data = _make_data_dict(n_persons=20) + state_fips = np.ones(10, dtype=np.int32) * 6 + + result = impute_source_variables( + data=data, + state_fips=state_fips, + time_period=2024, + skip_acs=True, + skip_sipp=True, + skip_scf=True, + ) + + assert "state_fips" in result + np.testing.assert_array_equal(result["state_fips"][2024], state_fips) + + +class TestPersonStateFips: + def test_maps_correctly(self): + data = { + "household_id": {2024: np.array([10, 20, 30])}, + "person_household_id": {2024: np.array([10, 10, 20, 20, 30])}, + "person_id": {2024: np.arange(5)}, + } + state_fips = np.array([1, 2, 3]) + + result = _person_state_fips(data, state_fips, 2024) + expected = np.array([1, 1, 2, 2, 3]) + np.testing.assert_array_equal(result, expected) + + def test_fallback_repeat(self): + data = { + "household_id": {2024: np.array([10, 20])}, + "person_id": 
{2024: np.arange(4)}, + } + state_fips = np.array([1, 2]) + + result = _person_state_fips(data, state_fips, 2024) + expected = np.array([1, 1, 2, 2]) + np.testing.assert_array_equal(result, expected) + + +class TestSubfunctions: + def test_impute_acs_exists(self): + assert callable(_impute_acs) + + def test_impute_sipp_exists(self): + assert callable(_impute_sipp) + + def test_impute_scf_exists(self): + assert callable(_impute_scf) From 5e65aaaf203bffba5a07a66e3f2292061741e5c7 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 16 Feb 2026 22:46:25 -0500 Subject: [PATCH 2/6] Stratified PUF subsample, remove random states from SIPP/SCF, fix calibration loader Replace weight-proportional PUF subsample with stratified approach that force-includes top 0.5% by AGI and randomly samples rest to 20K, preserving the high-income tail the QRF needs. Remove random state assignment from SIPP and SCF in source_impute.py since these surveys lack state identifiers. Fix unified_calibration.py to handle TIME_PERIOD_ARRAYS dataset format. Add `make calibrate` target. 
Co-Authored-By: Claude Opus 4.6 --- Makefile | 6 +- .../calibration/puf_impute.py | 58 ++++++++++++++++++- .../calibration/source_impute.py | 49 ++-------------- .../calibration/unified_calibration.py | 6 +- .../tests/test_calibration/test_puf_impute.py | 46 +++++++++++++++ 5 files changed, 116 insertions(+), 49 deletions(-) diff --git a/Makefile b/Makefile index 7c2c63ee..20b1d223 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test install download upload docker documentation data publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset +.PHONY: all format test install download upload docker documentation data calibrate publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data @@ -97,6 +97,10 @@ data: download python policyengine_us_data/datasets/cps/small_enhanced_cps.py python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py +calibrate: data + python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 + publish-local-area: python policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index 9c185c9f..3ad452bb 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -22,6 +22,9 @@ logger = logging.getLogger(__name__) +PUF_SUBSAMPLE_TARGET = 20_000 +PUF_TOP_PERCENTILE = 99.5 + DEMOGRAPHIC_PREDICTORS = [ "age", "is_male", @@ -394,9 +397,9 @@ def _run_qrf_imputation( ) -> tuple: """Run QRF imputation for PUF variables. - Trains QRF on ALL PUF records (no subsample) with random - state assignment, and predicts on CPS data using - demographics + state_fips. 
+ Stratified-subsamples PUF records (top 0.5% by AGI kept, + rest randomly sampled to ~20K total) with random state + assignment, trains QRF, and predicts on CPS data. Args: data: CPS data dict. @@ -427,6 +430,10 @@ def _run_qrf_imputation( puf_state_indices = rng.choice(len(puf_states), size=n_puf, p=block_probs) puf_state_fips = puf_states[puf_state_indices] + puf_agi = puf_sim.calculate( + "adjusted_gross_income", map_to="person" + ).values + demo_preds = [p for p in DEMOGRAPHIC_PREDICTORS if p != "state_fips"] X_train_full = puf_sim.calculate_dataframe(demo_preds + IMPUTED_VARIABLES) @@ -439,6 +446,18 @@ def _run_qrf_imputation( del puf_sim + sub_idx = _stratified_subsample_index(puf_agi) + logger.info( + "Stratified PUF subsample: %d -> %d records " + "(top %.1f%% preserved, AGI threshold $%.0f)", + len(puf_agi), + len(sub_idx), + 100 - PUF_TOP_PERCENTILE, + np.percentile(puf_agi, PUF_TOP_PERCENTILE), + ) + X_train_full = X_train_full.iloc[sub_idx].reset_index(drop=True) + X_train_override = X_train_override.iloc[sub_idx].reset_index(drop=True) + n_hh = len(data["household_id"][time_period]) person_count = len(data["person_id"][time_period]) @@ -485,6 +504,39 @@ def _run_qrf_imputation( return y_full, y_override +def _stratified_subsample_index( + income: np.ndarray, + target_n: int = PUF_SUBSAMPLE_TARGET, + top_pct: float = PUF_TOP_PERCENTILE, + seed: int = 42, +) -> np.ndarray: + """Return indices for stratified subsample preserving top earners. + + Keeps ALL records above the top_pct percentile of income, + then randomly samples the rest to reach target_n total. 
+ """ + n = len(income) + if n <= target_n: + return np.arange(n) + + threshold = np.percentile(income, top_pct) + top_idx = np.where(income >= threshold)[0] + bottom_idx = np.where(income < threshold)[0] + + remaining_quota = max(0, target_n - len(top_idx)) + rng = np.random.default_rng(seed=seed) + if remaining_quota >= len(bottom_idx): + selected_bottom = bottom_idx + else: + selected_bottom = rng.choice( + bottom_idx, size=remaining_quota, replace=False + ) + + selected = np.concatenate([top_idx, selected_bottom]) + selected.sort() + return selected + + def _batch_qrf( X_train: pd.DataFrame, X_test: pd.DataFrame, diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 1d5021a2..a7b6ffc7 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -327,7 +327,7 @@ def _impute_sipp( time_period: int, dataset_path: Optional[str] = None, ) -> Dict[str, Dict[int, np.ndarray]]: - """Impute tip_income and liquid assets from SIPP with state. + """Impute tip_income and liquid assets from SIPP. Args: data: CPS data dict. 
@@ -341,11 +341,6 @@ def _impute_sipp( from microimpute.models.qrf import QRF from policyengine_us import Microsimulation - from policyengine_us_data.calibration.clone_and_assign import ( - load_global_block_distribution, - ) - - _, _, donor_states, block_probs = load_global_block_distribution() rng = np.random.default_rng(seed=88) from policyengine_us_data.storage import STORAGE_FOLDER @@ -404,11 +399,6 @@ def _impute_sipp( ) ] - tip_state_idx = rng.choice( - len(donor_states), size=len(tip_train), p=block_probs - ) - tip_train["state_fips"] = donor_states[tip_state_idx].astype(np.float32) - cps_tip_df = _build_cps_receiver( data, time_period, dataset_path, ["employment_income", "age"] ) @@ -428,10 +418,7 @@ def _impute_sipp( cps_tip_df["count_under_18"] = 0.0 cps_tip_df["count_under_6"] = 0.0 - person_states = _person_state_fips(data, state_fips, time_period) - cps_tip_df["state_fips"] = person_states.astype(np.float32) - - tip_predictors = SIPP_TIPS_PREDICTORS + ["state_fips"] + tip_predictors = SIPP_TIPS_PREDICTORS qrf = QRF() logger.info( "SIPP tips QRF: %d train, %d test", @@ -520,15 +507,6 @@ def _impute_sipp( ) ] - asset_state_idx = rng.choice( - len(donor_states), - size=len(asset_train), - p=block_probs, - ) - asset_train["state_fips"] = donor_states[asset_state_idx].astype( - np.float32 - ) - cps_asset_df = _build_cps_receiver( data, time_period, @@ -553,9 +531,7 @@ def _impute_sipp( else 0.0 ) - cps_asset_df["state_fips"] = person_states.astype(np.float32) - - asset_predictors = SIPP_ASSETS_PREDICTORS + ["state_fips"] + asset_predictors = SIPP_ASSETS_PREDICTORS asset_vars = [ "bank_account_assets", "stock_assets", @@ -598,7 +574,7 @@ def _impute_scf( time_period: int, dataset_path: Optional[str] = None, ) -> Dict[str, Dict[int, np.ndarray]]: - """Impute net_worth and auto_loan from SCF with state. + """Impute net_worth and auto_loan from SCF. Args: data: CPS data dict. 
@@ -611,25 +587,13 @@ def _impute_scf( """ from microimpute.models.qrf import QRF from policyengine_us import Microsimulation - - from policyengine_us_data.calibration.clone_and_assign import ( - load_global_block_distribution, - ) from policyengine_us_data.datasets.scf.scf import SCF_2022 - _, _, donor_states, block_probs = load_global_block_distribution() - rng = np.random.default_rng(seed=77) - scf_dataset = SCF_2022() scf_data = scf_dataset.load_dataset() scf_df = pd.DataFrame({key: scf_data[key] for key in scf_data.keys()}) - scf_state_idx = rng.choice( - len(donor_states), size=len(scf_df), p=block_probs - ) - scf_df["state_fips"] = donor_states[scf_state_idx].astype(np.float32) - - scf_predictors = SCF_PREDICTORS + ["state_fips"] + scf_predictors = SCF_PREDICTORS available_preds = [p for p in scf_predictors if p in scf_df.columns] missing_preds = [p for p in scf_predictors if p not in scf_df.columns] @@ -706,9 +670,6 @@ def _impute_scf( + cps_df.get("social_security_retirement", 0) ).astype(np.float32) - person_states = _person_state_fips(data, state_fips, time_period) - cps_df["state_fips"] = person_states.astype(np.float32) - qrf = QRF() logger.info( "SCF QRF: %d train, %d test, vars=%s", diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 3a95952f..84c31538 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -446,7 +446,11 @@ def _build_puf_cloned_dataset( data_dict = {} for var in data: - data_dict[var] = {time_period: data[var][...]} + if isinstance(data[var], dict): + vals = list(data[var].values()) + data_dict[var] = {time_period: vals[0]} + else: + data_dict[var] = {time_period: np.array(data[var])} if not skip_source_impute: from policyengine_us_data.calibration.source_impute import ( diff --git a/policyengine_us_data/tests/test_calibration/test_puf_impute.py 
b/policyengine_us_data/tests/test_calibration/test_puf_impute.py index 153df144..46db7367 100644 --- a/policyengine_us_data/tests/test_calibration/test_puf_impute.py +++ b/policyengine_us_data/tests/test_calibration/test_puf_impute.py @@ -9,9 +9,12 @@ from policyengine_us_data.calibration.puf_impute import ( puf_clone_dataset, + _stratified_subsample_index, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES, OVERRIDDEN_IMPUTED_VARIABLES, + PUF_SUBSAMPLE_TARGET, + PUF_TOP_PERCENTILE, ) @@ -132,3 +135,46 @@ def test_imputed_variables_not_empty(self): def test_overridden_subset_of_imputed(self): for var in OVERRIDDEN_IMPUTED_VARIABLES: assert var in IMPUTED_VARIABLES + + +class TestStratifiedSubsample: + def test_noop_when_small(self): + income = np.random.default_rng(0).normal(50000, 20000, size=100) + idx = _stratified_subsample_index(income, target_n=200) + assert len(idx) == 100 + + def test_reduces_to_target(self): + rng = np.random.default_rng(0) + income = np.concatenate( + [ + rng.normal(50000, 20000, size=50_000), + rng.uniform(500_000, 5_000_000, size=250), + ] + ) + idx = _stratified_subsample_index( + income, target_n=10_000, top_pct=99.5 + ) + assert len(idx) == 10_000 + + def test_preserves_top_earners(self): + rng = np.random.default_rng(0) + income = np.concatenate( + [ + rng.normal(50000, 20000, size=50_000), + rng.uniform(500_000, 5_000_000, size=250), + ] + ) + threshold = np.percentile(income, 99.5) + n_top = (income >= threshold).sum() + + idx = _stratified_subsample_index( + income, target_n=10_000, top_pct=99.5 + ) + selected_income = income[idx] + n_top_selected = (selected_income >= threshold).sum() + assert n_top_selected == n_top + + def test_indices_sorted(self): + income = np.random.default_rng(0).normal(50000, 20000, size=50_000) + idx = _stratified_subsample_index(income, target_n=10_000) + assert np.all(idx[1:] >= idx[:-1]) From 4057da8598eb5851c55bb15c5d07f6c972505ec1 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 17 Feb 2026 
12:19:25 -0500 Subject: [PATCH 3/6] Add calibration package checkpointing, target config, and hyperparameter CLI - Add build-only mode to save calibration matrix as pickle package - Add target config YAML for declarative target exclusion rules - Add CLI flags for beta, lambda_l2, learning_rate hyperparameters - Add streaming subprocess output in Modal runner - Add calibration pipeline documentation - Add tests for target config filtering and CLI arg parsing Co-Authored-By: Claude Opus 4.6 --- Makefile | 11 +- docs/calibration.md | 276 +++++++++++++++++ modal_app/remote_calibration_runner.py | 184 ++++++++--- .../calibration/target_config.yaml | 51 +++ .../calibration/unified_calibration.py | 291 +++++++++++++++++- .../test_calibration/test_target_config.py | 177 +++++++++++ .../test_unified_calibration.py | 60 ++++ 7 files changed, 995 insertions(+), 55 deletions(-) create mode 100644 docs/calibration.md create mode 100644 policyengine_us_data/calibration/target_config.yaml create mode 100644 policyengine_us_data/tests/test_calibration/test_target_config.py diff --git a/Makefile b/Makefile index 20b1d223..b43edde7 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test install download upload docker documentation data calibrate publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset +.PHONY: all format test install download upload docker documentation data calibrate calibrate-build publish-local-area clean build paper clean-paper presentations database database-refresh promote-database promote-dataset HF_CLONE_DIR ?= $(HOME)/huggingface/policyengine-us-data @@ -99,7 +99,14 @@ data: download calibrate: data python -m policyengine_us_data.calibration.unified_calibration \ - --puf-dataset policyengine_us_data/storage/puf_2024.h5 + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml + +calibrate-build: data + 
python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --build-only publish-local-area: python policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py diff --git a/docs/calibration.md b/docs/calibration.md new file mode 100644 index 00000000..8f27baf1 --- /dev/null +++ b/docs/calibration.md @@ -0,0 +1,276 @@ +# Calibration Pipeline User's Manual + +The unified calibration pipeline reweights cloned CPS records to match administrative targets using L0-regularized optimization. This guide covers the three main workflows: full pipeline, build-then-fit, and fitting from a saved package. + +## Quick Start + +```bash +# Full pipeline (build matrix + fit weights): +make calibrate + +# Build matrix only (save package for later fitting): +make calibrate-build +``` + +## Architecture Overview + +The pipeline has two expensive phases: + +1. **Matrix build** (~30 min with PUF): Clone CPS records, assign geography, optionally PUF-impute, compute all target variable values, assemble a sparse calibration matrix. +2. **Weight fitting** (~5-20 min on GPU): L0-regularized optimization to find household weights that reproduce administrative targets. + +The calibration package checkpoint lets you run phase 1 once and iterate on phase 2 with different hyperparameters or target selections---without rebuilding. + +## Workflows + +### 1. 
Single-pass (default) + +Build the matrix and fit weights in one run: + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --epochs 200 \ + --device cuda +``` + +Output: +- `storage/calibration/unified_weights.npy` --- calibrated weight vector +- `storage/calibration/unified_diagnostics.csv` --- per-target error report +- `storage/calibration/unified_run_config.json` --- full run configuration + +### 2. Build-then-fit (recommended for iteration) + +**Step 1: Build the matrix and save a package.** + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --puf-dataset policyengine_us_data/storage/puf_2024.h5 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --build-only +``` + +This saves `storage/calibration/calibration_package.pkl` (default location). Use `--package-output` to specify a different path. + +**Step 2: Fit weights from the package (fast, repeatable).** + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --package-path storage/calibration/calibration_package.pkl \ + --epochs 500 \ + --lambda-l0 1e-8 \ + --beta 0.65 \ + --lambda-l2 1e-8 \ + --device cuda +``` + +You can re-run Step 2 as many times as you want with different hyperparameters. The expensive matrix build only happens once. + +### 3. Re-filtering a saved package + +A saved package contains **all** targets from the database (before target config filtering). You can apply a different target config at fit time: + +```bash +python -m policyengine_us_data.calibration.unified_calibration \ + --package-path storage/calibration/calibration_package.pkl \ + --target-config my_custom_config.yaml \ + --epochs 200 +``` + +This lets you experiment with which targets to include without rebuilding the matrix. + +### 4. 
Running on Modal (GPU cloud) + +```bash +modal run modal_app/remote_calibration_runner.py \ + --branch puf-impute-fix-530 \ + --gpu A10 \ + --epochs 500 \ + --target-config policyengine_us_data/calibration/target_config.yaml \ + --beta 0.65 +``` + +The target config YAML is read from the cloned repo inside the container, so it must be committed to the branch you specify. + +### 5. Portable fitting (Kaggle, Colab, etc.) + +Transfer the package file to any environment with `scipy`, `numpy`, `pandas`, `torch`, and `l0-python` installed: + +```python +from policyengine_us_data.calibration.unified_calibration import ( + load_calibration_package, + apply_target_config, + fit_l0_weights, +) + +package = load_calibration_package("calibration_package.pkl") +targets_df = package["targets_df"] +X_sparse = package["X_sparse"] + +weights = fit_l0_weights( + X_sparse=X_sparse, + targets=targets_df["value"].values, + lambda_l0=1e-8, + epochs=500, + device="cuda", + beta=0.65, + lambda_l2=1e-8, +) +``` + +## Target Config + +The target config controls which targets reach the optimizer. It uses a YAML exclusion list: + +```yaml +exclude: + - variable: rent + geo_level: national + - variable: eitc + geo_level: district + - variable: snap + geo_level: state + domain_variable: snap # optional: further narrow the match +``` + +Each rule drops rows from the calibration matrix where **all** specified fields match. Unrecognized variables silently match nothing. 
+ +### Fields + +| Field | Required | Values | Description | +|---|---|---|---| +| `variable` | Yes | Any variable name in `target_overview` | The calibration target variable | +| `geo_level` | Yes | `national`, `state`, `district` | Geographic aggregation level | +| `domain_variable` | No | Any domain variable in `target_overview` | Narrows match to a specific domain | + +### Default config + +The checked-in config at `policyengine_us_data/calibration/target_config.yaml` reproduces the junkyard notebook's 22 excluded target groups. It drops: + +- **13 national-level variables**: alimony, charitable deduction, child support, interest deduction, medical expense deduction, net worth, person count, real estate taxes, rent, social security dependents/survivors +- **9 district-level variables**: ACA PTC, EITC, income tax before credits, medical expense deduction, net capital gains, rental income, tax unit count, partnership/S-corp income, taxable social security + +Applying this config reduces targets from ~37K to ~21K, matching the junkyard's target selection. 
+ +### Writing a custom config + +To experiment, copy the default and edit: + +```bash +cp policyengine_us_data/calibration/target_config.yaml my_config.yaml +# Edit my_config.yaml to add/remove exclusion rules +python -m policyengine_us_data.calibration.unified_calibration \ + --package-path storage/calibration/calibration_package.pkl \ + --target-config my_config.yaml \ + --epochs 200 +``` + +To see what variables and geo_levels are available in the database: + +```sql +SELECT DISTINCT variable, geo_level +FROM target_overview +ORDER BY variable, geo_level; +``` + +## CLI Reference + +### Core flags + +| Flag | Default | Description | +|---|---|---| +| `--dataset` | `storage/stratified_extended_cps_2024.h5` | Path to CPS h5 file | +| `--db-path` | `storage/calibration/policy_data.db` | Path to target database | +| `--output` | `storage/calibration/unified_weights.npy` | Weight output path | +| `--puf-dataset` | None | Path to PUF h5 (enables PUF cloning) | +| `--preset` | `local` | L0 preset: `local` (1e-8) or `national` (1e-4) | +| `--lambda-l0` | None | Custom L0 penalty (overrides `--preset`) | +| `--epochs` | 100 | Training epochs | +| `--device` | `cpu` | `cpu` or `cuda` | +| `--n-clones` | 10 | Number of dataset clones | +| `--seed` | 42 | Random seed for geography assignment | + +### Target selection + +| Flag | Default | Description | +|---|---|---| +| `--target-config` | None | Path to YAML exclusion config | +| `--domain-variables` | None | Comma-separated domain filter (SQL-level) | +| `--hierarchical-domains` | None | Domains for hierarchical uprating | + +### Checkpoint flags + +| Flag | Default | Description | +|---|---|---| +| `--build-only` | False | Build matrix, save package, skip fitting | +| `--package-path` | None | Load pre-built package (skip matrix build) | +| `--package-output` | Auto (when `--build-only`) | Where to save package | + +### Hyperparameter flags + +| Flag | Default | Junkyard value | Description | +|---|---|---|---| +| 
`--beta` | 0.35 | 0.65 | L0 gate temperature (higher = softer gates) | +| `--lambda-l2` | 1e-12 | 1e-8 | L2 regularization on weights | +| `--learning-rate` | 0.15 | 0.15 | Optimizer learning rate | + +### Skip flags + +| Flag | Description | +|---|---| +| `--skip-puf` | Skip PUF clone + QRF imputation | +| `--skip-source-impute` | Skip ACS/SIPP/SCF re-imputation | +| `--skip-takeup-rerandomize` | Skip takeup re-randomization | + +## Calibration Package Format + +The package is a pickled Python dict: + +```python +{ + "X_sparse": scipy.sparse.csr_matrix, # (n_targets, n_records) + "targets_df": pd.DataFrame, # target metadata + values + "target_names": list[str], # human-readable names + "metadata": { + "dataset_path": str, + "db_path": str, + "n_clones": int, + "n_records": int, + "seed": int, + "created_at": str, # ISO timestamp + "target_config": dict, # config used at build time + }, +} +``` + +The `targets_df` DataFrame has columns: `variable`, `geo_level`, `geographic_id`, `domain_variable`, `value`, and others from the database. + +## Hyperparameter Tuning Guide + +The three key hyperparameters control the tradeoff between target accuracy and sparsity: + +- **`beta`** (L0 gate temperature): Controls how sharply the L0 gates open/close. Higher values (0.5--0.8) give softer decisions and more exploration early in training. Lower values (0.2--0.4) give harder on/off decisions. + +- **`lambda_l0`** (via `--preset` or `--lambda-l0`): Controls how many records survive. `1e-8` (local preset) keeps millions of records for local-area analysis. `1e-4` (national preset) keeps ~50K for the web app. + +- **`lambda_l2`**: Regularizes weight magnitudes. Larger values (1e-8) prevent any single record from having extreme weight. Smaller values (1e-12) allow more weight concentration. 
+ +### Suggested starting points + +For **local-area calibration** (millions of records): +```bash +--lambda-l0 1e-8 --beta 0.65 --lambda-l2 1e-8 --epochs 500 +``` + +For **national web app** (~50K records): +```bash +--lambda-l0 1e-4 --beta 0.35 --lambda-l2 1e-12 --epochs 200 +``` + +## Makefile Targets + +| Target | Description | +|---|---| +| `make calibrate` | Full pipeline with PUF and target config | +| `make calibrate-build` | Build-only mode (saves package, no fitting) | diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py index 689d245d..24583003 100644 --- a/modal_app/remote_calibration_runner.py +++ b/modal_app/remote_calibration_runner.py @@ -15,7 +15,39 @@ REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" -def _fit_weights_impl(branch: str, epochs: int) -> dict: +def _run_streaming(cmd, env=None, label=""): + """Run a subprocess, streaming output line-by-line. + + Returns (returncode, captured_stdout_lines). + """ + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + env=env, + ) + lines = [] + for line in proc.stdout: + line = line.rstrip("\n") + if label: + print(f"[{label}] {line}", flush=True) + else: + print(line, flush=True) + lines.append(line) + proc.wait() + return proc.returncode, lines + + +def _fit_weights_impl( + branch: str, + epochs: int, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: """Shared implementation for weight fitting.""" os.chdir("/root") subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) @@ -23,8 +55,8 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: subprocess.run(["uv", "sync", "--extra", "l0"], check=True) - print("Downloading calibration inputs from HuggingFace...") - download_result = subprocess.run( + print("Downloading calibration inputs from HuggingFace...", flush=True) + dl_rc, dl_lines = 
_run_streaming( [ "uv", "run", @@ -36,52 +68,54 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: "print(f\"DB: {paths['database']}\"); " "print(f\"DATASET: {paths['dataset']}\")", ], - capture_output=True, - text=True, env=os.environ.copy(), + label="download", ) - print(download_result.stdout) - if download_result.stderr: - print("Download STDERR:", download_result.stderr) - if download_result.returncode != 0: - raise RuntimeError(f"Download failed: {download_result.returncode}") + if dl_rc != 0: + raise RuntimeError(f"Download failed with code {dl_rc}") db_path = dataset_path = None - for line in download_result.stdout.split("\n"): - if line.startswith("DB:"): + for line in dl_lines: + if "DB:" in line: db_path = line.split("DB:")[1].strip() - elif line.startswith("DATASET:"): + elif "DATASET:" in line: dataset_path = line.split("DATASET:")[1].strip() script_path = "policyengine_us_data/calibration/unified_calibration.py" - result = subprocess.run( - [ - "uv", - "run", - "python", - script_path, - "--device", - "cuda", - "--epochs", - str(epochs), - "--db-path", - db_path, - "--dataset", - dataset_path, - ], - capture_output=True, - text=True, + cmd = [ + "uv", + "run", + "python", + script_path, + "--device", + "cuda", + "--epochs", + str(epochs), + "--db-path", + db_path, + "--dataset", + dataset_path, + ] + if target_config: + cmd.extend(["--target-config", target_config]) + if beta is not None: + cmd.extend(["--beta", str(beta)]) + if lambda_l2 is not None: + cmd.extend(["--lambda-l2", str(lambda_l2)]) + if learning_rate is not None: + cmd.extend(["--learning-rate", str(learning_rate)]) + + cal_rc, cal_lines = _run_streaming( + cmd, env=os.environ.copy(), + label="calibrate", ) - print(result.stdout) - if result.stderr: - print("STDERR:", result.stderr) - if result.returncode != 0: - raise RuntimeError(f"Script failed with code {result.returncode}") + if cal_rc != 0: + raise RuntimeError(f"Script failed with code {cal_rc}") output_path = None 
log_path = None - for line in result.stdout.split("\n"): + for line in cal_lines: if "OUTPUT_PATH:" in line: output_path = line.split("OUTPUT_PATH:")[1].strip() elif "LOG_PATH:" in line: @@ -106,8 +140,17 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: gpu="T4", timeout=14400, ) -def fit_weights_t4(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_t4( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -118,8 +161,17 @@ def fit_weights_t4(branch: str = "main", epochs: int = 200) -> dict: gpu="A10", timeout=14400, ) -def fit_weights_a10(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_a10( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -130,8 +182,17 @@ def fit_weights_a10(branch: str = "main", epochs: int = 200) -> dict: gpu="A100-40GB", timeout=14400, ) -def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_a100_40( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -142,8 +203,17 @@ def fit_weights_a100_40(branch: str = "main", epochs: int = 200) -> dict: gpu="A100-80GB", timeout=14400, ) -def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> dict: - return 
_fit_weights_impl(branch, epochs) +def fit_weights_a100_80( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) @app.function( @@ -154,8 +224,17 @@ def fit_weights_a100_80(branch: str = "main", epochs: int = 200) -> dict: gpu="H100", timeout=14400, ) -def fit_weights_h100(branch: str = "main", epochs: int = 200) -> dict: - return _fit_weights_impl(branch, epochs) +def fit_weights_h100( + branch: str = "main", + epochs: int = 200, + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, +) -> dict: + return _fit_weights_impl( + branch, epochs, target_config, beta, lambda_l2, learning_rate + ) GPU_FUNCTIONS = { @@ -174,6 +253,10 @@ def main( gpu: str = "T4", output: str = "calibration_weights.npy", log_output: str = "calibration_log.csv", + target_config: str = None, + beta: float = None, + lambda_l2: float = None, + learning_rate: float = None, ): if gpu not in GPU_FUNCTIONS: raise ValueError( @@ -182,7 +265,14 @@ def main( print(f"Running with GPU: {gpu}, epochs: {epochs}, branch: {branch}") func = GPU_FUNCTIONS[gpu] - result = func.remote(branch=branch, epochs=epochs) + result = func.remote( + branch=branch, + epochs=epochs, + target_config=target_config, + beta=beta, + lambda_l2=lambda_l2, + learning_rate=learning_rate, + ) with open(output, "wb") as f: f.write(result["weights"]) diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml new file mode 100644 index 00000000..1e1e287d --- /dev/null +++ b/policyengine_us_data/calibration/target_config.yaml @@ -0,0 +1,51 @@ +# Target exclusion config for unified calibration. +# Each entry excludes targets matching (variable, geo_level). +# Derived from junkyard's 22 excluded target groups. 
+ +exclude: + # National exclusions + - variable: alimony_expense + geo_level: national + - variable: alimony_income + geo_level: national + - variable: charitable_deduction + geo_level: national + - variable: child_support_expense + geo_level: national + - variable: child_support_received + geo_level: national + - variable: interest_deduction + geo_level: national + - variable: medical_expense_deduction + geo_level: national + - variable: net_worth + geo_level: national + - variable: person_count + geo_level: national + - variable: real_estate_taxes + geo_level: national + - variable: rent + geo_level: national + - variable: social_security_dependents + geo_level: national + - variable: social_security_survivors + geo_level: national + # District exclusions + - variable: aca_ptc + geo_level: district + - variable: eitc + geo_level: district + - variable: income_tax_before_credits + geo_level: district + - variable: medical_expense_deduction + geo_level: district + - variable: net_capital_gains + geo_level: district + - variable: rental_income + geo_level: district + - variable: tax_unit_count + geo_level: district + - variable: tax_unit_partnership_s_corp_income + geo_level: district + - variable: taxable_social_security + geo_level: district diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 84c31538..5fffa13f 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -271,9 +271,175 @@ def parse_args(argv=None): action="store_true", help="Skip ACS/SIPP/SCF re-imputation with state", ) + parser.add_argument( + "--target-config", + default=None, + help="Path to target exclusion YAML config", + ) + parser.add_argument( + "--build-only", + action="store_true", + help="Build matrix + save package, skip fitting", + ) + parser.add_argument( + "--package-path", + default=None, + help="Load pre-built calibration 
package (skip matrix build)", + ) + parser.add_argument( + "--package-output", + default=None, + help="Where to save calibration package", + ) + parser.add_argument( + "--beta", + type=float, + default=BETA, + help=f"L0 gate temperature (default: {BETA})", + ) + parser.add_argument( + "--lambda-l2", + type=float, + default=LAMBDA_L2, + help=f"L2 regularization (default: {LAMBDA_L2})", + ) + parser.add_argument( + "--learning-rate", + type=float, + default=LEARNING_RATE, + help=f"Learning rate (default: {LEARNING_RATE})", + ) return parser.parse_args(argv) +def load_target_config(path: str) -> dict: + """Load target exclusion config from YAML. + + Args: + path: Path to YAML config file. + + Returns: + Parsed config dict with 'exclude' list. + """ + import yaml + + with open(path) as f: + config = yaml.safe_load(f) + if config is None: + config = {} + if "exclude" not in config: + config["exclude"] = [] + return config + + +def apply_target_config( + targets_df: "pd.DataFrame", + X_sparse, + target_names: list, + config: dict, +) -> tuple: + """Filter targets based on exclusion config. + + Each exclude rule matches rows where variable and geo_level + both match. Optionally matches domain_variable too. + + Args: + targets_df: DataFrame with target rows. + X_sparse: Sparse matrix (targets x records). + target_names: List of target name strings. + config: Config dict with 'exclude' list. 
+ + Returns: + (filtered_targets_df, filtered_X_sparse, filtered_names) + """ + import pandas as pd + + exclude_rules = config.get("exclude", []) + if not exclude_rules: + return targets_df, X_sparse, target_names + + n_before = len(targets_df) + keep_mask = np.ones(n_before, dtype=bool) + + for rule in exclude_rules: + var = rule["variable"] + geo = rule["geo_level"] + rule_mask = (targets_df["variable"] == var) & ( + targets_df["geo_level"] == geo + ) + if "domain_variable" in rule: + rule_mask = rule_mask & ( + targets_df["domain_variable"] == rule["domain_variable"] + ) + keep_mask &= ~rule_mask + + n_dropped = n_before - keep_mask.sum() + logger.info( + "Target config: kept %d / %d targets (dropped %d)", + keep_mask.sum(), + n_before, + n_dropped, + ) + + idx = np.where(keep_mask)[0] + filtered_df = targets_df.iloc[idx].reset_index(drop=True) + filtered_X = X_sparse[idx, :] + filtered_names = [target_names[i] for i in idx] + + return filtered_df, filtered_X, filtered_names + + +def save_calibration_package( + path: str, + X_sparse, + targets_df: "pd.DataFrame", + target_names: list, + metadata: dict, +) -> None: + """Save calibration package to pickle. + + Args: + path: Output file path. + X_sparse: Sparse matrix. + targets_df: Targets DataFrame. + target_names: Target name list. + metadata: Run metadata dict. + """ + import pickle + + package = { + "X_sparse": X_sparse, + "targets_df": targets_df, + "target_names": target_names, + "metadata": metadata, + } + Path(path).parent.mkdir(parents=True, exist_ok=True) + with open(path, "wb") as f: + pickle.dump(package, f, protocol=pickle.HIGHEST_PROTOCOL) + logger.info("Calibration package saved to %s", path) + + +def load_calibration_package(path: str) -> dict: + """Load calibration package from pickle. + + Args: + path: Path to package file. + + Returns: + Dict with X_sparse, targets_df, target_names, metadata. 
+ """ + import pickle + + with open(path, "rb") as f: + package = pickle.load(f) + logger.info( + "Loaded package: %d targets, %d records", + package["X_sparse"].shape[0], + package["X_sparse"].shape[1], + ) + return package + + def fit_l0_weights( X_sparse, targets: np.ndarray, @@ -281,6 +447,9 @@ def fit_l0_weights( epochs: int = DEFAULT_EPOCHS, device: str = "cpu", verbose_freq: int = None, + beta: float = BETA, + lambda_l2: float = LAMBDA_L2, + learning_rate: float = LEARNING_RATE, ) -> np.ndarray: """Fit L0-regularized calibration weights. @@ -291,6 +460,9 @@ def fit_l0_weights( epochs: Training epochs. device: Torch device. verbose_freq: Print frequency. Defaults to 10%. + beta: L0 gate temperature. + lambda_l2: L2 regularization strength. + learning_rate: Optimizer learning rate. Returns: Weight array of shape (n_records,). @@ -311,16 +483,20 @@ def fit_l0_weights( logger.info( "L0 calibration: %d targets, %d features, " - "lambda_l0=%.1e, epochs=%d", + "lambda_l0=%.1e, beta=%.2f, lambda_l2=%.1e, " + "lr=%.3f, epochs=%d", X_sparse.shape[0], n_total, lambda_l0, + beta, + lambda_l2, + learning_rate, epochs, ) model = SparseCalibrationWeights( n_features=n_total, - beta=BETA, + beta=beta, gamma=GAMMA, zeta=ZETA, init_keep_prob=INIT_KEEP_PROB, @@ -348,8 +524,8 @@ def _flushed_print(*args, **kwargs): y=targets, target_groups=None, lambda_l0=lambda_l0, - lambda_l2=LAMBDA_L2, - lr=LEARNING_RATE, + lambda_l2=lambda_l2, + lr=learning_rate, epochs=epochs, loss_type="relative", verbose=True, @@ -503,6 +679,13 @@ def run_calibration( puf_dataset_path: str = None, skip_puf: bool = False, skip_source_impute: bool = False, + target_config: dict = None, + build_only: bool = False, + package_path: str = None, + package_output_path: str = None, + beta: float = BETA, + lambda_l2: float = LAMBDA_L2, + learning_rate: float = LEARNING_RATE, ): """Run unified calibration pipeline. @@ -521,12 +704,51 @@ def run_calibration( puf_dataset_path: Path to PUF h5 for QRF training. 
skip_puf: Skip PUF clone step. skip_source_impute: Skip ACS/SIPP/SCF imputations. + target_config: Parsed target config dict. + build_only: If True, save package and skip fitting. + package_path: Load pre-built package (skip build). + package_output_path: Where to save calibration package. + beta: L0 gate temperature. + lambda_l2: L2 regularization strength. + learning_rate: Optimizer learning rate. Returns: (weights, targets_df, X_sparse, target_names) + weights is None when build_only=True. """ import time + t0 = time.time() + + # Early exit: load pre-built package + if package_path is not None: + package = load_calibration_package(package_path) + targets_df = package["targets_df"] + X_sparse = package["X_sparse"] + target_names = package["target_names"] + + if target_config: + targets_df, X_sparse, target_names = apply_target_config( + targets_df, X_sparse, target_names, target_config + ) + + targets = targets_df["value"].values + weights = fit_l0_weights( + X_sparse=X_sparse, + targets=targets, + lambda_l0=lambda_l0, + epochs=epochs, + device=device, + beta=beta, + lambda_l2=lambda_l2, + learning_rate=learning_rate, + ) + logger.info( + "Total pipeline (from package): %.1f min", + (time.time() - t0) / 60, + ) + return weights, targets_df, X_sparse, target_names + from policyengine_us import Microsimulation from policyengine_us_data.calibration.clone_and_assign import ( @@ -537,8 +759,6 @@ def run_calibration( UnifiedMatrixBuilder, ) - t0 = time.time() - # Step 1: Load dataset logger.info("Loading dataset from %s", dataset_path) sim = Microsimulation(dataset=dataset_path) @@ -671,6 +891,37 @@ def sim_modifier(s, clone_idx): X_sparse.nnz, ) + # Step 6b: Apply target config filtering + if target_config: + targets_df, X_sparse, target_names = apply_target_config( + targets_df, X_sparse, target_names, target_config + ) + + # Step 6c: Save calibration package + if package_output_path: + import datetime + + metadata = { + "dataset_path": dataset_path, + "db_path": 
db_path, + "n_clones": n_clones, + "n_records": X_sparse.shape[1], + "seed": seed, + "created_at": datetime.datetime.now().isoformat(), + "target_config": target_config, + } + save_calibration_package( + package_output_path, + X_sparse, + targets_df, + target_names, + metadata, + ) + + if build_only: + logger.info("Build-only mode: skipping fitting") + return None, targets_df, X_sparse, target_names + # Step 7: L0 calibration targets = targets_df["value"].values @@ -688,6 +939,9 @@ def sim_modifier(s, clone_idx): lambda_l0=lambda_l0, epochs=epochs, device=device, + beta=beta, + lambda_l2=lambda_l2, + learning_rate=learning_rate, ) logger.info( @@ -748,6 +1002,16 @@ def main(argv=None): puf_dataset_path = getattr(args, "puf_dataset", None) + target_config = None + if args.target_config: + target_config = load_target_config(args.target_config) + + package_output_path = args.package_output + if args.build_only and not package_output_path: + package_output_path = str( + STORAGE_FOLDER / "calibration" / "calibration_package.pkl" + ) + weights, targets_df, X_sparse, target_names = run_calibration( dataset_path=dataset_path, db_path=db_path, @@ -762,8 +1026,19 @@ def main(argv=None): puf_dataset_path=puf_dataset_path, skip_puf=getattr(args, "skip_puf", False), skip_source_impute=getattr(args, "skip_source_impute", False), + target_config=target_config, + build_only=args.build_only, + package_path=args.package_path, + package_output_path=package_output_path, + beta=args.beta, + lambda_l2=args.lambda_l2, + learning_rate=args.learning_rate, ) + if weights is None: + logger.info("Build-only complete. 
Package saved.") + return + # Save weights np.save(output_path, weights) logger.info("Weights saved to %s", output_path) @@ -798,11 +1073,15 @@ def main(argv=None): "skip_source_impute": getattr(args, "skip_source_impute", False), "n_clones": args.n_clones, "lambda_l0": lambda_l0, + "beta": args.beta, + "lambda_l2": args.lambda_l2, + "learning_rate": args.learning_rate, "epochs": args.epochs, "device": args.device, "seed": args.seed, "domain_variables": domain_variables, "hierarchical_domains": hierarchical_domains, + "target_config": args.target_config, "n_targets": len(targets_df), "n_records": X_sparse.shape[1], "weight_sum": float(weights.sum()), diff --git a/policyengine_us_data/tests/test_calibration/test_target_config.py b/policyengine_us_data/tests/test_calibration/test_target_config.py new file mode 100644 index 00000000..9241660c --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_target_config.py @@ -0,0 +1,177 @@ +"""Tests for target config filtering in unified calibration.""" + +import numpy as np +import pandas as pd +import pytest +from scipy import sparse + +from policyengine_us_data.calibration.unified_calibration import ( + apply_target_config, + load_target_config, + save_calibration_package, + load_calibration_package, +) + + +@pytest.fixture +def sample_targets(): + targets_df = pd.DataFrame( + { + "variable": [ + "snap", + "snap", + "eitc", + "eitc", + "rent", + "person_count", + ], + "geo_level": [ + "national", + "state", + "district", + "state", + "national", + "national", + ], + "domain_variable": [ + "snap", + "snap", + "eitc", + "eitc", + "rent", + "person_count", + ], + "geographic_id": ["US", "6", "0601", "6", "US", "US"], + "value": [1000, 500, 200, 300, 800, 5000], + } + ) + n_rows = len(targets_df) + n_cols = 10 + rng = np.random.default_rng(42) + X = sparse.random(n_rows, n_cols, density=0.5, random_state=rng) + X = X.tocsr() + target_names = [ + f"{r.variable}_{r.geo_level}_{r.geographic_id}" + for _, r in 
targets_df.iterrows() + ] + return targets_df, X, target_names + + +class TestApplyTargetConfig: + def test_empty_config_keeps_all(self, sample_targets): + df, X, names = sample_targets + config = {"exclude": []} + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) + assert out_X.shape == X.shape + assert out_names == names + + def test_single_variable_geo_exclusion(self, sample_targets): + df, X, names = sample_targets + config = {"exclude": [{"variable": "rent", "geo_level": "national"}]} + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) - 1 + assert "rent" not in out_df["variable"].values + + def test_multiple_exclusions(self, sample_targets): + df, X, names = sample_targets + config = { + "exclude": [ + {"variable": "rent", "geo_level": "national"}, + {"variable": "eitc", "geo_level": "district"}, + ] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) - 2 + kept = set(zip(out_df["variable"], out_df["geo_level"])) + assert ("rent", "national") not in kept + assert ("eitc", "district") not in kept + assert ("eitc", "state") in kept + + def test_domain_variable_matching(self, sample_targets): + df, X, names = sample_targets + config = { + "exclude": [ + { + "variable": "snap", + "geo_level": "national", + "domain_variable": "snap", + } + ] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) - 1 + + def test_matrix_and_names_stay_in_sync(self, sample_targets): + df, X, names = sample_targets + config = { + "exclude": [{"variable": "person_count", "geo_level": "national"}] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert out_X.shape[0] == len(out_df) + assert len(out_names) == len(out_df) + assert out_X.shape[1] == X.shape[1] + + def test_no_match_keeps_all(self, sample_targets): + df, X, names = sample_targets + config = 
{ + "exclude": [{"variable": "nonexistent", "geo_level": "national"}] + } + out_df, out_X, out_names = apply_target_config(df, X, names, config) + assert len(out_df) == len(df) + assert out_X.shape[0] == X.shape[0] + + +class TestLoadTargetConfig: + def test_load_valid_config(self, tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text( + "exclude:\n" " - variable: snap\n" " geo_level: national\n" + ) + config = load_target_config(str(config_file)) + assert len(config["exclude"]) == 1 + assert config["exclude"][0]["variable"] == "snap" + + def test_load_empty_config(self, tmp_path): + config_file = tmp_path / "empty.yaml" + config_file.write_text("") + config = load_target_config(str(config_file)) + assert config["exclude"] == [] + + +class TestCalibrationPackageRoundTrip: + def test_round_trip(self, sample_targets, tmp_path): + df, X, names = sample_targets + pkg_path = str(tmp_path / "package.pkl") + metadata = { + "dataset_path": "/tmp/test.h5", + "db_path": "/tmp/test.db", + "n_clones": 5, + "n_records": X.shape[1], + "seed": 42, + "created_at": "2024-01-01T00:00:00", + "target_config": None, + } + save_calibration_package(pkg_path, X, df, names, metadata) + loaded = load_calibration_package(pkg_path) + + assert loaded["target_names"] == names + pd.testing.assert_frame_equal(loaded["targets_df"], df) + assert loaded["X_sparse"].shape == X.shape + assert loaded["metadata"]["seed"] == 42 + + def test_package_then_filter(self, sample_targets, tmp_path): + df, X, names = sample_targets + pkg_path = str(tmp_path / "package.pkl") + metadata = {"n_records": X.shape[1]} + save_calibration_package(pkg_path, X, df, names, metadata) + loaded = load_calibration_package(pkg_path) + + config = {"exclude": [{"variable": "rent", "geo_level": "national"}]} + out_df, out_X, out_names = apply_target_config( + loaded["targets_df"], + loaded["X_sparse"], + loaded["target_names"], + config, + ) + assert len(out_df) == len(df) - 1 diff --git 
a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py index 2d3f8061..341ffcc0 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py @@ -85,3 +85,63 @@ def test_expected_count(self): ) assert len(SIMPLE_TAKEUP_VARS) == 8 + + +class TestParseArgsNewFlags: + """Verify new CLI flags are parsed correctly.""" + + def test_target_config_flag(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args(["--target-config", "config.yaml"]) + assert args.target_config == "config.yaml" + + def test_build_only_flag(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args(["--build-only"]) + assert args.build_only is True + + def test_package_path_flag(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args(["--package-path", "pkg.pkl"]) + assert args.package_path == "pkg.pkl" + + def test_hyperparams_flags(self): + from policyengine_us_data.calibration.unified_calibration import ( + parse_args, + ) + + args = parse_args( + [ + "--beta", + "0.65", + "--lambda-l2", + "1e-8", + "--learning-rate", + "0.2", + ] + ) + assert args.beta == 0.65 + assert args.lambda_l2 == 1e-8 + assert args.learning_rate == 0.2 + + def test_hyperparams_defaults(self): + from policyengine_us_data.calibration.unified_calibration import ( + BETA, + LAMBDA_L2, + LEARNING_RATE, + parse_args, + ) + + args = parse_args([]) + assert args.beta == BETA + assert args.lambda_l2 == LAMBDA_L2 + assert args.learning_rate == LEARNING_RATE From 61523d83d5819bcc18d7ca76209925cd1a9da90e Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Tue, 17 Feb 2026 12:23:55 -0500 Subject: [PATCH 4/6] Ignore all calibration run outputs in 
storage/calibration/ Co-Authored-By: Claude Opus 4.6 --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a7ab98c9..6fa185f6 100644 --- a/.gitignore +++ b/.gitignore @@ -30,8 +30,8 @@ docs/.ipynb_checkpoints/ ## ACA PTC state-level uprating factors !policyengine_us_data/storage/aca_ptc_multipliers_2022_2024.csv -## Raw input cache for database pipeline -policyengine_us_data/storage/calibration/raw_inputs/ +## Calibration run outputs (weights, diagnostics, packages, config) +policyengine_us_data/storage/calibration/ ## Batch processing checkpoints completed_*.txt From 4c51b3280fe5e9cbb90ab019dd8982cabc702a2d Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 18 Feb 2026 20:06:31 +0530 Subject: [PATCH 5/6] Add category-dependent takeup re-randomization for EITC and WIC EITC takeup rates vary by child count and WIC rates vary by demographic category, both requiring sim-calculated values. Introduce a two-pass flow in _simulate_clone() with a post_sim_modifier that runs after the first cache clear, computes category variables, re-randomizes takeup, then clears caches again before final target calculation. 
Co-Authored-By: Claude Opus 4.6 --- changelog_entry.yaml | 2 + .../calibration/unified_calibration.py | 130 +++++ .../calibration/unified_matrix_builder.py | 24 +- .../test_unified_calibration.py | 454 ++++++++++++++++++ 4 files changed, 607 insertions(+), 3 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 8e9b4fb6..f446f1b1 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,6 +5,8 @@ - ACS/SIPP/SCF geographic re-imputation module (source_impute.py) with state predictor - PUF and source impute integration into unified calibration pipeline (--puf-dataset, --skip-puf, --skip-source-impute flags) - 21 new tests for puf_impute and source_impute modules + - Category-dependent takeup re-randomization for takes_up_eitc (by child count) and would_claim_wic (by WIC category) during calibration cloning + - Two-pass simulation flow in _simulate_clone() with post_sim_modifier for category variables that require sim output changed: - Refactored extended_cps.py to delegate to puf_impute.puf_clone_dataset() (443 -> 75 lines) - unified_calibration.py pipeline now supports optional source imputation and PUF cloning steps diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 5fffa13f..48a268cb 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -100,6 +100,58 @@ ] +def _eitc_category_mapper(categories, rates): + """Map eitc_child_count to per-entity takeup rates. + + Rates loaded from parameters/take_up/eitc.yaml via + load_take_up_rate(). Clamps child count to max key in + rate dict (matches cps.py EITC logic). + + Args: + categories: Array of child counts per tax unit. + rates: Dict mapping child count -> takeup rate. + + Returns: + Array of per-entity takeup rates. 
+ """ + max_key = max(rates.keys()) + return np.array([rates[min(int(c), max_key)] for c in categories]) + + +def _wic_category_mapper(categories, rates): + """Map wic_category_str to per-entity takeup rates. + + Rates loaded from parameters/take_up/wic_takeup.yaml via + load_take_up_rate(). Unknown categories default to 0. + + Args: + categories: Array of WIC category strings per person. + rates: Dict mapping category string -> takeup rate. + + Returns: + Array of per-entity takeup rates. + """ + return np.array([rates.get(c, 0) for c in categories]) + + +CATEGORY_TAKEUP_VARS = [ + { + "variable": "takes_up_eitc", + "entity": "tax_unit", + "rate_key": "eitc", + "category_variable": "eitc_child_count", + "category_mapper": _eitc_category_mapper, + }, + { + "variable": "would_claim_wic", + "entity": "person", + "rate_key": "wic_takeup", + "category_variable": "wic_category_str", + "category_mapper": _wic_category_mapper, + }, +] + + def rerandomize_takeup( sim, clone_block_geoids: np.ndarray, @@ -179,6 +231,76 @@ def rerandomize_takeup( sim.set_input(var_name, time_period, new_values) +def rerandomize_category_takeup( + sim, + clone_block_geoids: np.ndarray, + time_period: int, +) -> None: + """Re-randomize category-dependent takeup variables. + + Called post-simulation (after cache clear) so that category + variables like eitc_child_count and wic_category_str are + computed fresh using the clone's geography. + + For each entry in CATEGORY_TAKEUP_VARS: + 1. Load rates from YAML via load_take_up_rate() + 2. Calculate the category variable from the sim + 3. Map categories to per-entity rates via the mapper + 4. Draw per-block seeded random values + 5. Compare draws < rates to get new boolean takeup + + Args: + sim: Microsimulation instance (post cache-clear). + clone_block_geoids: Block GEOIDs per household. + time_period: Tax year. 
+ """ + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + from policyengine_us_data.utils.randomness import ( + seeded_rng, + ) + + hh_ids = sim.calculate("household_id", map_to="household").values + hh_to_block = dict(zip(hh_ids, clone_block_geoids)) + + for spec in CATEGORY_TAKEUP_VARS: + var_name = spec["variable"] + entity_level = spec["entity"] + rate_key = spec["rate_key"] + cat_var = spec["category_variable"] + mapper = spec["category_mapper"] + + rates_dict = load_take_up_rate(rate_key, time_period) + + # Calculate category variable (uses fresh sim state) + categories = sim.calculate(cat_var, map_to=entity_level).values + + # Map categories to per-entity rates + per_entity_rates = mapper(categories, rates_dict) + + # Map entities to blocks via household + entity_hh_ids = sim.calculate( + "household_id", map_to=entity_level + ).values + n_entities = len(entity_hh_ids) + + entity_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in entity_hh_ids] + ) + + draws = np.zeros(n_entities, dtype=np.float64) + unique_blocks = np.unique(entity_blocks) + for block in unique_blocks: + mask = entity_blocks == block + n_in_block = mask.sum() + rng = seeded_rng(var_name, salt=str(block)) + draws[mask] = rng.random(n_in_block) + + new_values = draws < per_entity_rates + sim.set_input(var_name, time_period, new_values) + + def parse_args(argv=None): parser = argparse.ArgumentParser( description="Unified L0 calibration pipeline" @@ -849,6 +971,7 @@ def run_calibration( # Step 4: Build sim_modifier for takeup rerandomization sim_modifier = None + post_sim_modifier = None if not skip_takeup_rerandomize: time_period = 2024 @@ -859,6 +982,12 @@ def sim_modifier(s, clone_idx): states = geography.state_fips[col_start:col_end] rerandomize_takeup(s, blocks, states, time_period) + def post_sim_modifier(s, clone_idx): + col_start = clone_idx * n_records + col_end = col_start + n_records + blocks = geography.block_geoid[col_start:col_end] + 
rerandomize_category_takeup(s, blocks, time_period) + # Step 5: Build target filter target_filter = {} if domain_variables: @@ -878,6 +1007,7 @@ def sim_modifier(s, clone_idx): target_filter=target_filter, hierarchical_domains=hierarchical_domains, sim_modifier=sim_modifier, + post_sim_modifier=post_sim_modifier, ) builder.print_uprating_summary(targets_df) diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py index ac31c34e..a65cca92 100644 --- a/policyengine_us_data/calibration/unified_matrix_builder.py +++ b/policyengine_us_data/calibration/unified_matrix_builder.py @@ -555,6 +555,7 @@ def _simulate_clone( n_records: int, variables: set, sim_modifier=None, + post_sim_modifier=None, clone_idx: int = 0, ) -> Tuple[Dict[str, np.ndarray], object]: """Simulate one clone with assigned geography. @@ -566,9 +567,14 @@ def _simulate_clone( variables: Target variable names to compute. sim_modifier: Optional callback(sim, clone_idx) called after state_fips is set but before - cache clearing. Used for takeup + cache clearing. Used for simple takeup re-randomization. - clone_idx: Clone index passed to sim_modifier. + post_sim_modifier: Optional callback(sim, clone_idx) + called after the first cache clear. Used for + category-dependent takeup re-randomization + that needs sim-calculated category variables. + Triggers a second cache clear afterwards. + clone_idx: Clone index passed to modifiers. 
Returns: (var_values, sim) where var_values maps variable @@ -587,6 +593,13 @@ def _simulate_clone( for var in get_calculated_variables(sim): sim.delete_arrays(var) + # Two-pass: category-dependent takeup needs fresh + # category variables, then a second cache clear + if post_sim_modifier is not None: + post_sim_modifier(sim, clone_idx) + for var in get_calculated_variables(sim): + sim.delete_arrays(var) + var_values: Dict[str, np.ndarray] = {} for var in variables: if var.endswith("_count"): @@ -614,6 +627,7 @@ def build_matrix( hierarchical_domains: Optional[List[str]] = None, cache_dir: Optional[str] = None, sim_modifier=None, + post_sim_modifier=None, ) -> Tuple[pd.DataFrame, sparse.csr_matrix, List[str]]: """Build sparse calibration matrix. @@ -633,8 +647,11 @@ def build_matrix( If None, COO data held in memory. sim_modifier: Optional callback(sim, clone_idx) called per clone after state_fips is set but - before cache clearing. Use for takeup + before cache clearing. Use for simple takeup re-randomization. + post_sim_modifier: Optional callback(sim, clone_idx) + called after the first cache clear for + category-dependent takeup re-randomization. 
Returns: (targets_df, X_sparse, target_names) @@ -758,6 +775,7 @@ def build_matrix( n_records, unique_variables, sim_modifier=sim_modifier, + post_sim_modifier=post_sim_modifier, clone_idx=clone_idx, ) diff --git a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py index 341ffcc0..6a47dd7c 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py @@ -145,3 +145,457 @@ def test_hyperparams_defaults(self): assert args.beta == BETA assert args.lambda_l2 == LAMBDA_L2 assert args.learning_rate == LEARNING_RATE + + +class TestCategoryTakeupConfig: + """Verify the CATEGORY_TAKEUP_VARS config is well-formed.""" + + def test_all_entries_have_required_keys(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + required = { + "variable", + "entity", + "rate_key", + "category_variable", + "category_mapper", + } + for entry in CATEGORY_TAKEUP_VARS: + for key in required: + assert key in entry, f"Missing key '{key}' in {entry}" + + def test_all_mappers_callable(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + for entry in CATEGORY_TAKEUP_VARS: + assert callable(entry["category_mapper"]) + + def test_all_entities_valid(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + valid = ("person", "tax_unit", "spm_unit") + for entry in CATEGORY_TAKEUP_VARS: + assert entry["entity"] in valid + + def test_expected_count(self): + from policyengine_us_data.calibration.unified_calibration import ( + CATEGORY_TAKEUP_VARS, + ) + + assert len(CATEGORY_TAKEUP_VARS) == 2 + + +class TestCategoryMappers: + """Test mapper functions load rates from YAML and map correctly.""" + + def test_eitc_mapper_standard_counts(self): + from 
policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("eitc") + categories = np.array([0, 1, 2, 3, 5]) + result = _eitc_category_mapper(categories, rates) + max_key = max(rates.keys()) + expected = np.array( + [ + rates[0], + rates[1], + rates[2], + rates[3], + rates[max_key], + ] + ) + np.testing.assert_array_almost_equal(result, expected) + + def test_eitc_mapper_clamps_high_count(self): + from policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("eitc") + max_key = max(rates.keys()) + categories = np.array([99]) + result = _eitc_category_mapper(categories, rates) + assert result[0] == rates[max_key] + + def test_wic_mapper_known_categories(self): + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("wic_takeup", 2022) + categories = np.array(list(rates.keys())) + result = _wic_category_mapper(categories, rates) + expected = np.array([rates[c] for c in categories]) + np.testing.assert_array_almost_equal(result, expected) + + def test_wic_mapper_unknown_defaults_zero(self): + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = load_take_up_rate("wic_takeup", 2022) + categories = np.array(["UNKNOWN"]) + result = _wic_category_mapper(categories, rates) + assert result[0] == 0 + + def test_wic_mapper_none_category(self): + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + rates = 
load_take_up_rate("wic_takeup", 2022) + categories = np.array(["NONE"]) + result = _wic_category_mapper(categories, rates) + assert result[0] == rates["NONE"] + + +class _MockResult: + """Wraps an array to mimic sim.calculate(...).values.""" + + def __init__(self, values): + self.values = values + + +class _MockSim: + """Lightweight mock Microsimulation for rerandomize tests. + + Supports sim.calculate(var, map_to=entity).values and + sim.set_input(var, period, values). No dataset needed. + """ + + def __init__(self, calc_returns: dict): + """ + Args: + calc_returns: Dict mapping (var, entity) -> np.ndarray. + """ + self._calc = calc_returns + self.inputs_set = {} + + def calculate(self, var, period=None, map_to=None): + key = (var, map_to) if map_to else (var, None) + if key not in self._calc: + raise ValueError( + f"MockSim has no return for {key}. " + f"Available: {list(self._calc.keys())}" + ) + return _MockResult(self._calc[key]) + + def set_input(self, var, period, values): + self.inputs_set[var] = np.asarray(values) + + +class TestRerandomizeCategoryTakeup: + """Integration test: call rerandomize_category_takeup() + with a mock sim and verify the outputs end-to-end.""" + + def _build_mock_sim(self): + """Build a mock sim with 3 households, 5 tax units, + 6 persons, known child counts and WIC categories.""" + # 3 households + hh_ids_hh = np.array([100, 200, 300]) + + # 5 tax units: HH 100 has 2 TUs, HH 200 has 2, HH 300 has 1 + tu_hh_ids = np.array([100, 100, 200, 200, 300]) + # child counts: 0, 2, 1, 3, 0 + eitc_child_counts = np.array([0, 2, 1, 3, 0]) + + # 6 persons: HH 100 has 2, HH 200 has 3, HH 300 has 1 + person_hh_ids = np.array([100, 100, 200, 200, 200, 300]) + wic_cats = np.array( + [ + "CHILD", + "NONE", + "INFANT", + "PREGNANT", + "CHILD", + "NONE", + ] + ) + + calc_returns = { + ("household_id", "household"): hh_ids_hh, + ("eitc_child_count", "tax_unit"): eitc_child_counts, + ("household_id", "tax_unit"): tu_hh_ids, + ("wic_category_str", 
"person"): wic_cats, + ("household_id", "person"): person_hh_ids, + } + return _MockSim(calc_returns), { + "hh_ids": hh_ids_hh, + "child_counts": eitc_child_counts, + "tu_hh_ids": tu_hh_ids, + "wic_cats": wic_cats, + "person_hh_ids": person_hh_ids, + } + + def test_sets_both_variables(self): + """rerandomize_category_takeup sets takes_up_eitc + and would_claim_wic.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, _ = self._build_mock_sim() + blocks = np.array( + ["010010001001001", "020020002002002", "030030003003003"] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + assert "takes_up_eitc" in sim.inputs_set + assert "would_claim_wic" in sim.inputs_set + + def test_output_shapes_match_entities(self): + """Output arrays match entity counts: 5 tax units + for EITC, 6 persons for WIC.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + assert len(sim.inputs_set["takes_up_eitc"]) == len( + info["child_counts"] + ) + assert len(sim.inputs_set["would_claim_wic"]) == len(info["wic_cats"]) + + def test_outputs_are_boolean(self): + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, _ = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + assert sim.inputs_set["takes_up_eitc"].dtype == bool + assert sim.inputs_set["would_claim_wic"].dtype == bool + + def test_deterministic_same_blocks(self): + """Same blocks produce identical takeup values.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + + sim1, _ = self._build_mock_sim() + rerandomize_category_takeup(sim1, blocks, 2024) 
+ + sim2, _ = self._build_mock_sim() + rerandomize_category_takeup(sim2, blocks, 2024) + + np.testing.assert_array_equal( + sim1.inputs_set["takes_up_eitc"], + sim2.inputs_set["takes_up_eitc"], + ) + np.testing.assert_array_equal( + sim1.inputs_set["would_claim_wic"], + sim2.inputs_set["would_claim_wic"], + ) + + def test_different_blocks_differ(self): + """Different blocks produce different takeup values.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim1, _ = self._build_mock_sim() + rerandomize_category_takeup( + sim1, + np.array(["aaa", "bbb", "ccc"]), + 2024, + ) + + sim2, _ = self._build_mock_sim() + rerandomize_category_takeup( + sim2, + np.array(["xxx", "yyy", "zzz"]), + 2024, + ) + + # At least one of the two variables should differ + eitc_differ = not np.array_equal( + sim1.inputs_set["takes_up_eitc"], + sim2.inputs_set["takes_up_eitc"], + ) + wic_differ = not np.array_equal( + sim1.inputs_set["would_claim_wic"], + sim2.inputs_set["would_claim_wic"], + ) + assert eitc_differ or wic_differ + + def test_eitc_values_match_manual_calculation(self): + """Verify takes_up_eitc matches a hand-computed + result using the same seeded draws and rates.""" + from policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + rerandomize_category_takeup, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + sim, info = self._build_mock_sim() + blocks = np.array( + [ + "010010001001001", + "020020002002002", + "030030003003003", + ] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + # Reproduce manually + eitc_rates = load_take_up_rate("eitc", 2024) + child_counts = info["child_counts"] + per_tu_rates = _eitc_category_mapper(child_counts, eitc_rates) + + hh_to_block = dict(zip(info["hh_ids"], blocks)) + tu_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in info["tu_hh_ids"]] + ) + draws = np.zeros(len(child_counts), dtype=np.float64) + for 
block in np.unique(tu_blocks): + mask = tu_blocks == block + rng = seeded_rng("takes_up_eitc", salt=str(block)) + draws[mask] = rng.random(mask.sum()) + + expected = draws < per_tu_rates + np.testing.assert_array_equal( + sim.inputs_set["takes_up_eitc"], expected + ) + + def test_wic_values_match_manual_calculation(self): + """Verify would_claim_wic matches a hand-computed + result using the same seeded draws and rates.""" + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + rerandomize_category_takeup, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + sim, info = self._build_mock_sim() + blocks = np.array( + [ + "010010001001001", + "020020002002002", + "030030003003003", + ] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + # Reproduce manually + wic_rates = load_take_up_rate("wic_takeup", 2024) + per_person_rates = _wic_category_mapper(info["wic_cats"], wic_rates) + + hh_to_block = dict(zip(info["hh_ids"], blocks)) + person_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in info["person_hh_ids"]] + ) + draws = np.zeros(len(info["wic_cats"]), dtype=np.float64) + for block in np.unique(person_blocks): + mask = person_blocks == block + rng = seeded_rng("would_claim_wic", salt=str(block)) + draws[mask] = rng.random(mask.sum()) + + expected = draws < per_person_rates + np.testing.assert_array_equal( + sim.inputs_set["would_claim_wic"], expected + ) + + def test_none_wic_never_takes_up(self): + """Persons with wic_category_str='NONE' should never + take up WIC (rate=0 from YAML).""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + wic_result = sim.inputs_set["would_claim_wic"] + # Persons at indices 1 and 5 have category "NONE" + none_indices = [ + i for i, c in enumerate(info["wic_cats"]) 
if c == "NONE" + ] + for idx in none_indices: + assert ( + wic_result[idx] == False + ), f"Person {idx} (NONE) should not claim WIC" + + def test_zero_child_eitc_rate_is_lower(self): + """Tax units with 0 children should have a lower + takeup rate than those with children, verified + statistically over many blocks.""" + from policyengine_us_data.calibration.unified_calibration import ( + _eitc_category_mapper, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + eitc_rates = load_take_up_rate("eitc", 2024) + # 0-child rate should be strictly less than 1-child rate + assert eitc_rates[0] < eitc_rates[1] + + # Simulate 10k draws to verify the rates manifest + n = 10000 + child_counts = np.array([0, 1]) + per_tu_rates = _eitc_category_mapper(child_counts, eitc_rates) + rng = seeded_rng("takes_up_eitc", salt="stat_test") + draws = rng.random(n) + + frac_0 = (draws[:n] < per_tu_rates[0]).mean() + frac_1 = (draws[:n] < per_tu_rates[1]).mean() + assert frac_0 < frac_1 + + +class TestCategoryTakeupSeeding: + """Verify seeded draws for category takeup are deterministic + and vary by block/variable.""" + + def test_same_block_same_draws(self): + var = "takes_up_eitc" + block = "010010001001001" + rng1 = seeded_rng(var, salt=block) + rng2 = seeded_rng(var, salt=block) + np.testing.assert_array_equal(rng1.random(100), rng2.random(100)) + + def test_different_blocks_different_draws(self): + var = "takes_up_eitc" + rng1 = seeded_rng(var, salt="010010001001001") + rng2 = seeded_rng(var, salt="020010001001001") + assert not np.array_equal(rng1.random(100), rng2.random(100)) + + def test_different_category_vars_different_draws(self): + block = "010010001001001" + rng1 = seeded_rng("takes_up_eitc", salt=block) + rng2 = seeded_rng("would_claim_wic", salt=block) + assert not np.array_equal(rng1.random(100), rng2.random(100)) From 5d0ecf60da41a6711c45fe3da26abfde6f0ae785 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 18 Feb 2026 20:55:32 +0530 Subject: 
[PATCH 6/6] Add category-dependent takeup re-randomization for WIC nutritional risk --- changelog_entry.yaml | 2 +- .../calibration/unified_calibration.py | 16 +++ .../test_unified_calibration.py | 112 ++++++++++++++++-- 3 files changed, 122 insertions(+), 8 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index f446f1b1..9445baca 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -5,7 +5,7 @@ - ACS/SIPP/SCF geographic re-imputation module (source_impute.py) with state predictor - PUF and source impute integration into unified calibration pipeline (--puf-dataset, --skip-puf, --skip-source-impute flags) - 21 new tests for puf_impute and source_impute modules - - Category-dependent takeup re-randomization for takes_up_eitc (by child count) and would_claim_wic (by WIC category) during calibration cloning + - Category-dependent takeup re-randomization for takes_up_eitc (by child count) and would_claim_wic, and is_wic_at_nutritional_risk (by WIC category) during calibration cloning - Two-pass simulation flow in _simulate_clone() with post_sim_modifier for category variables that require sim output changed: - Refactored extended_cps.py to delegate to puf_impute.puf_clone_dataset() (443 -> 75 lines) diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py index 48a268cb..6a1a2927 100644 --- a/policyengine_us_data/calibration/unified_calibration.py +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -149,6 +149,14 @@ def _wic_category_mapper(categories, rates): "category_variable": "wic_category_str", "category_mapper": _wic_category_mapper, }, + { + "variable": "is_wic_at_nutritional_risk", + "entity": "person", + "rate_key": "wic_nutritional_risk", + "category_variable": "wic_category_str", + "category_mapper": _wic_category_mapper, + "base_variable": "receives_wic", + }, ] @@ -298,6 +306,14 @@ def rerandomize_category_takeup( draws[mask] = 
rng.random(n_in_block) new_values = draws < per_entity_rates + + base_var = spec.get("base_variable") + if base_var is not None: + base_values = sim.calculate( + base_var, map_to=entity_level + ).values.astype(bool) + new_values = base_values | new_values + sim.set_input(var_name, time_period, new_values) diff --git a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py index 6a47dd7c..250ffc51 100644 --- a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py @@ -188,7 +188,7 @@ def test_expected_count(self): CATEGORY_TAKEUP_VARS, ) - assert len(CATEGORY_TAKEUP_VARS) == 2 + assert len(CATEGORY_TAKEUP_VARS) == 3 class TestCategoryMappers: @@ -334,6 +334,8 @@ def _build_mock_sim(self): "NONE", ] ) + # receives_wic: persons 0, 2, 4 receive WIC + receives_wic = np.array([True, False, True, False, True, False]) calc_returns = { ("household_id", "household"): hh_ids_hh, @@ -341,6 +343,7 @@ def _build_mock_sim(self): ("household_id", "tax_unit"): tu_hh_ids, ("wic_category_str", "person"): wic_cats, ("household_id", "person"): person_hh_ids, + ("receives_wic", "person"): receives_wic, } return _MockSim(calc_returns), { "hh_ids": hh_ids_hh, @@ -348,11 +351,12 @@ def _build_mock_sim(self): "tu_hh_ids": tu_hh_ids, "wic_cats": wic_cats, "person_hh_ids": person_hh_ids, + "receives_wic": receives_wic, } - def test_sets_both_variables(self): - """rerandomize_category_takeup sets takes_up_eitc - and would_claim_wic.""" + def test_sets_all_variables(self): + """rerandomize_category_takeup sets takes_up_eitc, + would_claim_wic, and is_wic_at_nutritional_risk.""" from policyengine_us_data.calibration.unified_calibration import ( rerandomize_category_takeup, ) @@ -365,10 +369,11 @@ def test_sets_both_variables(self): assert "takes_up_eitc" in sim.inputs_set assert "would_claim_wic" in 
sim.inputs_set + assert "is_wic_at_nutritional_risk" in sim.inputs_set def test_output_shapes_match_entities(self): """Output arrays match entity counts: 5 tax units - for EITC, 6 persons for WIC.""" + for EITC, 6 persons for WIC and nutritional risk.""" from policyengine_us_data.calibration.unified_calibration import ( rerandomize_category_takeup, ) @@ -381,6 +386,9 @@ def test_output_shapes_match_entities(self): info["child_counts"] ) assert len(sim.inputs_set["would_claim_wic"]) == len(info["wic_cats"]) + assert len(sim.inputs_set["is_wic_at_nutritional_risk"]) == len( + info["wic_cats"] + ) def test_outputs_are_boolean(self): from policyengine_us_data.calibration.unified_calibration import ( @@ -393,6 +401,7 @@ def test_outputs_are_boolean(self): assert sim.inputs_set["takes_up_eitc"].dtype == bool assert sim.inputs_set["would_claim_wic"].dtype == bool + assert sim.inputs_set["is_wic_at_nutritional_risk"].dtype == bool def test_deterministic_same_blocks(self): """Same blocks produce identical takeup values.""" @@ -416,6 +425,10 @@ def test_deterministic_same_blocks(self): sim1.inputs_set["would_claim_wic"], sim2.inputs_set["would_claim_wic"], ) + np.testing.assert_array_equal( + sim1.inputs_set["is_wic_at_nutritional_risk"], + sim2.inputs_set["is_wic_at_nutritional_risk"], + ) def test_different_blocks_differ(self): """Different blocks produce different takeup values.""" @@ -437,7 +450,7 @@ def test_different_blocks_differ(self): 2024, ) - # At least one of the two variables should differ + # At least one of the three variables should differ eitc_differ = not np.array_equal( sim1.inputs_set["takes_up_eitc"], sim2.inputs_set["takes_up_eitc"], @@ -446,7 +459,11 @@ def test_different_blocks_differ(self): sim1.inputs_set["would_claim_wic"], sim2.inputs_set["would_claim_wic"], ) - assert eitc_differ or wic_differ + risk_differ = not np.array_equal( + sim1.inputs_set["is_wic_at_nutritional_risk"], + sim2.inputs_set["is_wic_at_nutritional_risk"], + ) + assert 
eitc_differ or wic_differ or risk_differ def test_eitc_values_match_manual_calculation(self): """Verify takes_up_eitc matches a hand-computed @@ -550,6 +567,87 @@ def test_none_wic_never_takes_up(self): wic_result[idx] == False ), f"Person {idx} (NONE) should not claim WIC" + def test_nutritional_risk_values_match_manual_calculation(self): + """Verify is_wic_at_nutritional_risk matches + receives_wic | (draws < rates) using seeded draws.""" + from policyengine_us_data.calibration.unified_calibration import ( + _wic_category_mapper, + rerandomize_category_takeup, + ) + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + + sim, info = self._build_mock_sim() + blocks = np.array( + [ + "010010001001001", + "020020002002002", + "030030003003003", + ] + ) + rerandomize_category_takeup(sim, blocks, 2024) + + # Reproduce manually + risk_rates = load_take_up_rate("wic_nutritional_risk", 2024) + per_person_rates = _wic_category_mapper(info["wic_cats"], risk_rates) + + hh_to_block = dict(zip(info["hh_ids"], blocks)) + person_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in info["person_hh_ids"]] + ) + draws = np.zeros(len(info["wic_cats"]), dtype=np.float64) + for block in np.unique(person_blocks): + mask = person_blocks == block + rng = seeded_rng("is_wic_at_nutritional_risk", salt=str(block)) + draws[mask] = rng.random(mask.sum()) + + expected = info["receives_wic"] | (draws < per_person_rates) + np.testing.assert_array_equal( + sim.inputs_set["is_wic_at_nutritional_risk"], + expected, + ) + + def test_receives_wic_guarantees_nutritional_risk(self): + """Anyone with receives_wic=True must have + is_wic_at_nutritional_risk=True.""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + risk = sim.inputs_set["is_wic_at_nutritional_risk"] + for i, recv in 
enumerate(info["receives_wic"]): + if recv: + assert risk[i], ( + f"Person {i} receives WIC but " + f"is_wic_at_nutritional_risk is False" + ) + + def test_none_wic_without_receives_never_at_risk(self): + """Persons with NONE category who don't receive WIC + are never at nutritional risk (rate=0).""" + from policyengine_us_data.calibration.unified_calibration import ( + rerandomize_category_takeup, + ) + + sim, info = self._build_mock_sim() + blocks = np.array(["blk_a", "blk_b", "blk_c"]) + rerandomize_category_takeup(sim, blocks, 2024) + + risk = sim.inputs_set["is_wic_at_nutritional_risk"] + for i, (cat, recv) in enumerate( + zip(info["wic_cats"], info["receives_wic"]) + ): + if cat == "NONE" and not recv: + assert not risk[i], ( + f"Person {i} (NONE, no WIC) should " + f"not be at nutritional risk" + ) + def test_zero_child_eitc_rate_is_lower(self): """Tax units with 0 children should have a lower takeup rate than those with children, verified