diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29bb..e4c231aa2 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,22 @@
+- bump: minor
+ changes:
+ added:
+ - Census-block-first calibration pipeline (calibration/ package) ported from PR #516
+ - Clone-and-assign module for population-weighted census block sampling
+ - Unified matrix builder with clone-by-clone simulation, COO caching, and target_overview-based querying
+ - Unified calibration CLI with L0 optimization and seeded takeup re-randomization
+ - 28 new tests for the calibration pipeline
+ - Integration test for build_matrix geographic masking (national/state/CD)
+ - Tests for drop_target_groups utility
+ - voluntary_filing.yaml takeup rate parameter
+ changed:
+ - Rewrote local_area_calibration_setup.ipynb for clone-based pipeline
+ - Renamed _get_geo_level to get_geo_level (now cross-module public API)
+ fixed:
+ - Fixed Jupyter import error in unified_calibration.py (OutStream.reconfigure moved to main)
+ - Fixed modal_app/remote_calibration_runner.py reference to the deleted fit_calibration_weights.py
+ - Fixed stale _coo_parts state when build_matrix is re-called after a failure
+ - Removed hardcoded voluntary_filing rate in favor of the YAML parameter
+ removed:
+ - SparseMatrixBuilder, MatrixTracer, and fit_calibration_weights (replaced by unified pipeline)
+ - 8 old SparseMatrixBuilder-dependent tests (replaced by new test_calibration suite)
diff --git a/docs/calibration_matrix.ipynb b/docs/calibration_matrix.ipynb
index b7edbe507..41497b1e8 100644
--- a/docs/calibration_matrix.ipynb
+++ b/docs/calibration_matrix.ipynb
@@ -6,11 +6,13 @@
"source": [
"# The Calibration Matrix\n",
"\n",
- "The calibration pipeline has three stages: (1) compute uprated target values ([`hierarchical_uprating.ipynb`](hierarchical_uprating.ipynb)), (2) assemble the sparse constraint matrix (this notebook), and (3) optimize weights ([`fit_calibration_weights.py`](../policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py)). This notebook is the diagnostic checkpoint between stages 1 and 2 — understand your matrix before you optimize.\n",
+ "The calibration pipeline has three stages: (1) compute uprated target values, (2) assemble the sparse constraint matrix (this notebook), and (3) optimize weights (`unified_calibration.py`). This notebook is the diagnostic checkpoint between stages 1 and 2 — understand your matrix before you optimize.\n",
"\n",
- "We build the full calibration matrix using `SparseMatrixBuilder`, then use `MatrixTracer` to inspect its structure: what rows and columns represent, how target groups partition the loss function, and where sparsity patterns emerge.\n",
+ "We build the full calibration matrix using `UnifiedMatrixBuilder` with clone-based geography from `assign_random_geography`, then inspect its structure: what rows and columns represent, how target groups partition the loss function, and where sparsity patterns emerge.\n",
"\n",
- "**Requirements:** `policy_data.db` and the stratified CPS h5 file in `STORAGE_FOLDER`."
+ "**Column layout:** `col = clone_idx * n_records + record_idx`\n",
+ "\n",
+ "**Requirements:** `policy_data.db`, `block_cd_distributions.csv.gz`, and the stratified CPS h5 file in `STORAGE_FOLDER`."
]
},
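+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick sanity check of the column layout above (a sketch with illustrative numbers; the real `n_records` is computed in the setup cell below):\n",
+ "\n",
+ "```python\n",
+ "n_records = 11_999  # illustrative; matches this notebook's dataset\n",
+ "clone_idx, record_idx = 2, 42\n",
+ "col = clone_idx * n_records + record_idx\n",
+ "# divmod inverts the layout, recovering (clone, record) from a column index\n",
+ "assert divmod(col, n_records) == (clone_idx, record_idx)\n",
+ "```"
+ ]
+ },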
{
@@ -22,39 +24,10 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/baogorek/envs/sep/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "from policyengine_us import Microsimulation\n",
- "from policyengine_us_data.storage import STORAGE_FOLDER\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (\n",
- " SparseMatrixBuilder,\n",
- ")\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n",
- " get_all_cds_from_database,\n",
- " create_target_groups,\n",
- " STATE_CODES,\n",
- ")\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (\n",
- " MatrixTracer,\n",
- ")\n",
- "\n",
- "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
- "db_uri = f\"sqlite:///{db_path}\"\n",
- "dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
- ]
+ "outputs": [],
+ "source": "import numpy as np\nimport pandas as pd\nfrom policyengine_us import Microsimulation\nfrom policyengine_us_data.storage import STORAGE_FOLDER\nfrom policyengine_us_data.calibration.unified_matrix_builder import (\n UnifiedMatrixBuilder,\n)\nfrom policyengine_us_data.calibration.clone_and_assign import (\n assign_random_geography,\n)\nfrom policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n create_target_groups,\n drop_target_groups,\n get_geo_level,\n STATE_CODES,\n)\n\ndb_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\""
},
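+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next cell assigns each cloned record a random census block and builds the matrix. As used throughout this notebook, the `geography` object returned by `assign_random_geography` exposes parallel arrays indexed by column position; a sketch of the access pattern (attribute names as they appear in later cells):\n",
+ "\n",
+ "```python\n",
+ "# One entry per column, where col = clone_idx * n_records + record_idx\n",
+ "geography.state_fips[0]   # state FIPS for clone 0, record 0\n",
+ "geography.cd_geoid[0]     # congressional district GEOID\n",
+ "geography.block_geoid[0]  # 15-digit census block GEOID\n",
+ "```"
+ ]
+ },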
{
"cell_type": "code",
@@ -65,32 +38,34 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Matrix shape: (1411, 5231564)\n",
- "Non-zero entries: 2,199,033\n"
+ "Records: 11,999, Clones: 3, Total columns: 35,997\n",
+ "Matrix shape: (1411, 35997)\n",
+ "Non-zero entries: 14,946\n"
]
}
],
"source": [
"sim = Microsimulation(dataset=str(dataset_path))\n",
- "cds_to_calibrate = get_all_cds_from_database(db_uri)\n",
+ "n_records = sim.calculate(\"household_id\", map_to=\"household\").values.shape[0]\n",
+ "\n",
+ "N_CLONES = 3 # keep small for diagnostics\n",
+ "geography = assign_random_geography(n_records, n_clones=N_CLONES, seed=42)\n",
"\n",
- "builder = SparseMatrixBuilder(\n",
+ "builder = UnifiedMatrixBuilder(\n",
" db_uri=db_uri,\n",
" time_period=2024,\n",
- " cds_to_calibrate=cds_to_calibrate,\n",
" dataset_path=str(dataset_path),\n",
")\n",
"\n",
- "targets_df, X_sparse, household_id_mapping = builder.build_matrix(\n",
+ "targets_df, X_sparse, target_names = builder.build_matrix(\n",
+ " geography,\n",
" sim,\n",
" target_filter={\"domain_variables\": [\"aca_ptc\", \"snap\"]},\n",
" hierarchical_domains=[\"aca_ptc\", \"snap\"],\n",
")\n",
"\n",
- "tracer = MatrixTracer(\n",
- " targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim\n",
- ")\n",
- "\n",
+ "n_total = n_records * N_CLONES\n",
+ "print(f\"Records: {n_records:,}, Clones: {N_CLONES}, Total columns: {n_total:,}\")\n",
"print(f\"Matrix shape: {X_sparse.shape}\")\n",
"print(f\"Non-zero entries: {X_sparse.nnz:,}\")"
]
@@ -104,91 +79,10 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "================================================================================\n",
- "MATRIX STRUCTURE BREAKDOWN\n",
- "================================================================================\n",
- "\n",
- "Matrix dimensions: 1411 rows x 5231564 columns\n",
- " Rows = 1411 targets\n",
- " Columns = 11999 households x 436 CDs\n",
- " = 11,999 x 436 = 5,231,564\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "COLUMN STRUCTURE (Households stacked by CD)\n",
- "--------------------------------------------------------------------------------\n",
- "\n",
- "Showing first and last 5 CDs of 436 total:\n",
- "\n",
- "First 5 CDs:\n",
- "cd_geoid start_col end_col n_households\n",
- " 1001 0 11998 11999\n",
- " 101 11999 23997 11999\n",
- " 102 23998 35996 11999\n",
- " 103 35997 47995 11999\n",
- " 104 47996 59994 11999\n",
- "\n",
- "Last 5 CDs:\n",
- "cd_geoid start_col end_col n_households\n",
- " 901 5171569 5183567 11999\n",
- " 902 5183568 5195566 11999\n",
- " 903 5195567 5207565 11999\n",
- " 904 5207566 5219564 11999\n",
- " 905 5219565 5231563 11999\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "ROW STRUCTURE (Targets)\n",
- "--------------------------------------------------------------------------------\n",
- "\n",
- "Total targets: 1411\n",
- "\n",
- "Targets by domain variable:\n",
- " n_targets n_unique_vars\n",
- "domain_variable \n",
- "aca_ptc 873 3\n",
- "snap 538 2\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "TARGET GROUPS (for loss calculation)\n",
- "--------------------------------------------------------------------------------\n",
- "\n",
- "=== Creating Target Groups ===\n",
- "\n",
- "National targets:\n",
- " Group 0: ACA PTC Person Count = 19,743,689\n",
- "\n",
- "State targets:\n",
- " Group 1: SNAP Household Count (51 targets)\n",
- " Group 2: Snap (51 targets)\n",
- "\n",
- "District targets:\n",
- " Group 3: Aca Ptc (436 targets)\n",
- " Group 4: ACA PTC Tax Unit Count (436 targets)\n",
- " Group 5: SNAP Household Count (436 targets)\n",
- "\n",
- "Total groups created: 6\n",
- "========================================\n",
- " Group 0: National ACA PTC Person Count (1 target, value=19,743,689) - rows [0]\n",
- " Group 1: State SNAP Household Count (51 targets) - rows [1, 2, 3, ..., 50, 51]\n",
- " Group 2: State Snap (51 targets) - rows [52, 53, 54, ..., 101, 102]\n",
- " Group 3: District Aca Ptc (436 targets) - rows [103, 104, 105, ..., 537, 538]\n",
- " Group 4: District ACA PTC Tax Unit Count (436 targets) - rows [975, 976, 977, ..., 1409, 1410]\n",
- " Group 5: District SNAP Household Count (436 targets) - rows [539, 540, 541, ..., 973, 974]\n",
- "\n",
- "================================================================================\n"
- ]
- }
- ],
- "source": [
- "tracer.print_matrix_structure()"
- ]
+ "outputs": [],
+ "source": "print(f\"Targets: {X_sparse.shape[0]}\")\nprint(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\nprint(f\"Non-zeros: {X_sparse.nnz:,}\")\nprint(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nfor level in [0, 1, 2]:\n n = (geo_levels == level).sum()\n if n > 0:\n print(f\" {level_names[level]}: {n} targets\")"
},
{
"cell_type": "markdown",
@@ -196,7 +90,7 @@
"source": [
"## 3. Anatomy of a row\n",
"\n",
- "Each row is one calibration target — a known aggregate (dollar total, household count, person count) that the optimizer tries to match. The row vector's non-zero entries identify which (household, CD) pairs can contribute to that target."
+ "Each row is one calibration target — a known aggregate (dollar total, household count, person count) that the optimizer tries to match. The row vector's non-zero entries identify which cloned records can contribute to that target."
]
},
{
@@ -208,23 +102,24 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Row 705:\n",
- " row_index: 705\n",
+ "Row 705: cd_3402/household_count/[snap>0]\n",
" variable: household_count\n",
- " variable_desc: Households represented\n",
" geographic_id: 3402\n",
- " target_value: 48652.0536866581\n",
- " stratum_id: 9625\n",
- " domain_variable: snap\n"
+ " geo_level: district\n",
+ " target value: 48,652\n",
+ " uprating_factor: 1.0\n"
]
}
],
"source": [
"mid_row = X_sparse.shape[0] // 2\n",
- "row_info = tracer.get_row_info(mid_row)\n",
- "print(f\"Row {mid_row}:\")\n",
- "for k, v in row_info.items():\n",
- " print(f\" {k}: {v}\")"
+ "row = targets_df.iloc[mid_row]\n",
+ "print(f\"Row {mid_row}: {target_names[mid_row]}\")\n",
+ "print(f\" variable: {row['variable']}\")\n",
+ "print(f\" geographic_id: {row['geographic_id']}\")\n",
+ "print(f\" geo_level: {row['geo_level']}\")\n",
+ "print(f\" target value: {row['value']:,.0f}\")\n",
+ "print(f\" uprating_factor: {row.get('uprating_factor', 'N/A')}\")"
]
},
{
@@ -236,21 +131,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Row 705 has 1,841 non-zero columns\n",
- "\n",
- "First non-zero column (1991877):\n",
- " column_index: 1991877\n",
- " cd_geoid: 3402\n",
- " household_id: 952\n",
- " household_index: 43\n",
- "\n",
- "Last non-zero column (2003831):\n",
- " column_index: 2003831\n",
+ "Row 705 has 9 non-zero columns\n",
+ " Spans 3 clone(s)\n",
+ " Spans 9 unique record(s)\n",
+ "\n",
+ "First non-zero column (8000):\n",
+ " clone_idx: 0\n",
+ " record_idx: 8000\n",
+ " state_fips: 34\n",
" cd_geoid: 3402\n",
- " household_id: 177860\n",
- " household_index: 11997\n",
- "\n",
- "Spans 1 CD(s)\n"
+ " value: 1.00\n"
]
}
],
@@ -260,19 +150,18 @@
"print(f\"Row {mid_row} has {len(nz_cols):,} non-zero columns\")\n",
"\n",
"if len(nz_cols) > 0:\n",
- " first_col = tracer.get_column_info(nz_cols[0])\n",
- " last_col = tracer.get_column_info(nz_cols[-1])\n",
- " print(f\"\\nFirst non-zero column ({nz_cols[0]}):\")\n",
- " for k, v in first_col.items():\n",
- " print(f\" {k}: {v}\")\n",
- " print(f\"\\nLast non-zero column ({nz_cols[-1]}):\")\n",
- " for k, v in last_col.items():\n",
- " print(f\" {k}: {v}\")\n",
- "\n",
- " unique_cds = set(\n",
- " tracer.get_column_info(c)[\"cd_geoid\"] for c in nz_cols\n",
- " )\n",
- " print(f\"\\nSpans {len(unique_cds)} CD(s)\")"
+ " clone_indices = nz_cols // n_records\n",
+ " record_indices = nz_cols % n_records\n",
+ " print(f\" Spans {len(np.unique(clone_indices))} clone(s)\")\n",
+ " print(f\" Spans {len(np.unique(record_indices))} unique record(s)\")\n",
+ "\n",
+ " first_col = nz_cols[0]\n",
+ " print(f\"\\nFirst non-zero column ({first_col}):\")\n",
+ " print(f\" clone_idx: {first_col // n_records}\")\n",
+ " print(f\" record_idx: {first_col % n_records}\")\n",
+ " print(f\" state_fips: {geography.state_fips[first_col]}\")\n",
+ " print(f\" cd_geoid: {geography.cd_geoid[first_col]}\")\n",
+ " print(f\" value: {X_sparse[mid_row, first_col]:.2f}\")"
]
},
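+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To summarize a row's support in one shot, the geography arrays can be indexed with the whole `nz_cols` vector (a sketch, assuming they are NumPy arrays, as the scalar indexing above suggests):\n",
+ "\n",
+ "```python\n",
+ "# Assumes the geography attributes are NumPy arrays (fancy indexing)\n",
+ "states = np.unique(geography.state_fips[nz_cols])\n",
+ "cds = np.unique(geography.cd_geoid[nz_cols])\n",
+ "print(f\"states: {states}, CDs: {cds}\")  # a district target should span one CD\n",
+ "```"
+ ]
+ },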
{
@@ -281,9 +170,9 @@
"source": [
"## 4. Anatomy of a column\n",
"\n",
- "Each column represents one (household, CD) pair. The columns are organized in blocks: the first `n_households` columns belong to CD 1, the next to CD 2, and so on. The block formula is:\n",
+ "Each column represents one (record, clone) pair. Columns are organized in clone blocks: the first `n_records` columns belong to clone 0, the next to clone 1, and so on. The block formula is:\n",
"\n",
- "$$\\text{column\\_idx} = \\text{cd\\_block} \\times n_{\\text{households}} + \\text{hh\\_index}$$"
+ "$$\\text{column\\_idx} = \\text{clone\\_idx} \\times n_{\\text{records}} + \\text{record\\_idx}$$"
]
},
{
@@ -295,22 +184,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Column 60037:\n",
- " column_index: 60037\n",
- " cd_geoid: 105\n",
- " household_id: 946\n",
- " household_index: 42\n",
+ "Column 12041:\n",
+ " clone_idx: 1\n",
+ " record_idx: 42\n",
+ " state_fips: 45\n",
+ " cd_geoid: 4507\n",
+ " block_geoid: 450510801013029\n",
"\n",
"This column has non-zero values in 0 target rows\n"
]
}
],
"source": [
- "col_idx = tracer.n_households * 5 + 42\n",
- "col_info = tracer.get_column_info(col_idx)\n",
+ "col_idx = 1 * n_records + 42 # clone 1, record 42\n",
+ "clone_idx = col_idx // n_records\n",
+ "record_idx = col_idx % n_records\n",
"print(f\"Column {col_idx}:\")\n",
- "for k, v in col_info.items():\n",
- " print(f\" {k}: {v}\")\n",
+ "print(f\" clone_idx: {clone_idx}\")\n",
+ "print(f\" record_idx: {record_idx}\")\n",
+ "print(f\" state_fips: {geography.state_fips[col_idx]}\")\n",
+ "print(f\" cd_geoid: {geography.cd_geoid[col_idx]}\")\n",
+ "print(f\" block_geoid: {geography.block_geoid[col_idx]}\")\n",
"\n",
"col_vec = X_sparse[:, col_idx]\n",
"nz_rows = col_vec.nonzero()[0]\n",
@@ -318,10 +212,10 @@
"if len(nz_rows) > 0:\n",
" print(\"First 5 target rows:\")\n",
" for r in nz_rows[:5]:\n",
- " ri = tracer.get_row_info(r)\n",
+ " row = targets_df.iloc[r]\n",
" print(\n",
- " f\" row {r}: {ri['variable']} \"\n",
- " f\"(geo={ri['geographic_id']}, \"\n",
+ " f\" row {r}: {row['variable']} \"\n",
+ " f\"(geo={row['geographic_id']}, \"\n",
" f\"val={X_sparse[r, col_idx]:.2f})\"\n",
" )"
]
@@ -335,16 +229,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Block formula verified: cd_block=5 * n_hh=11999 + hh_idx=42 = 60037\n"
+ "Block formula verified: clone_idx=1 * n_records=11999 + record_idx=42 = 12041\n"
]
}
],
"source": [
- "expected_col = 5 * tracer.n_households + 42\n",
+ "expected_col = 1 * n_records + 42\n",
"assert col_idx == expected_col, f\"{col_idx} != {expected_col}\"\n",
"print(\n",
" f\"Block formula verified: \"\n",
- " f\"cd_block=5 * n_hh={tracer.n_households} + hh_idx=42 = {expected_col}\"\n",
+ " f\"clone_idx=1 * n_records={n_records} + record_idx=42 = {expected_col}\"\n",
")"
]
},
@@ -424,30 +318,30 @@
"text": [
"\n",
"--- Group 0: National ACA PTC Person Count (1 target, value=19,743,689) ---\n",
- " row_index variable variable_desc geographic_id target_value stratum_id domain_variable\n",
- " 0 person_count People represented US 19743689.0 491 aca_ptc\n",
+ " variable geographic_id value\n",
+ "person_count US 19743689.0\n",
"\n",
"--- Group 2: State Snap (51 targets) ---\n",
- " row_index variable variable_desc geographic_id target_value stratum_id domain_variable\n",
- " 52 snap SNAP allotment 1 1733693703.0 9330 snap\n",
- " 53 snap SNAP allotment 10 254854243.0 9337 snap\n",
- " 54 snap SNAP allotment 11 319119173.0 9338 snap\n",
- " 55 snap SNAP allotment 12 6604797454.0 9339 snap\n",
- " 56 snap SNAP allotment 13 3281329856.0 9340 snap\n",
- " 57 snap SNAP allotment 15 731331421.0 9341 snap\n",
- " 58 snap SNAP allotment 16 281230283.0 9342 snap\n",
- " 59 snap SNAP allotment 17 4469341818.0 9343 snap\n",
+ "variable geographic_id value\n",
+ " snap 1 1733693703.0\n",
+ " snap 10 254854243.0\n",
+ " snap 11 319119173.0\n",
+ " snap 12 6604797454.0\n",
+ " snap 13 3281329856.0\n",
+ " snap 15 731331421.0\n",
+ " snap 16 281230283.0\n",
+ " snap 17 4469341818.0\n",
"\n",
"--- Group 4: District ACA PTC Tax Unit Count (436 targets) ---\n",
- " row_index variable variable_desc geographic_id target_value stratum_id domain_variable\n",
- " 975 tax_unit_count Tax units represented 1001 25064.255490 21717 aca_ptc\n",
- " 976 tax_unit_count Tax units represented 101 9794.081624 21631 aca_ptc\n",
- " 977 tax_unit_count Tax units represented 102 11597.544977 21632 aca_ptc\n",
- " 978 tax_unit_count Tax units represented 103 9160.097959 21633 aca_ptc\n",
- " 979 tax_unit_count Tax units represented 104 9786.728220 21634 aca_ptc\n",
- " 980 tax_unit_count Tax units represented 105 18266.234326 21635 aca_ptc\n",
- " 981 tax_unit_count Tax units represented 106 25397.026846 21636 aca_ptc\n",
- " 982 tax_unit_count Tax units represented 107 11798.642968 21637 aca_ptc\n"
+ " variable geographic_id value\n",
+ "tax_unit_count 1001 25064.255490\n",
+ "tax_unit_count 101 9794.081624\n",
+ "tax_unit_count 102 11597.544977\n",
+ "tax_unit_count 103 9160.097959\n",
+ "tax_unit_count 104 9786.728220\n",
+ "tax_unit_count 105 18266.234326\n",
+ "tax_unit_count 106 25397.026846\n",
+ "tax_unit_count 107 11798.642968\n"
]
}
],
@@ -455,18 +349,19 @@
"for gid in [0, 2, 4]:\n",
" if gid >= len(group_info):\n",
" continue\n",
- " rows = tracer.get_group_rows(gid)\n",
+ " mask = target_groups == gid\n",
+ " rows = targets_df[mask][[\"variable\", \"geographic_id\", \"value\"]].head(8)\n",
" print(f\"\\n--- {group_info[gid]} ---\")\n",
- " print(rows.head(8).to_string(index=False))"
+ " print(rows.to_string(index=False))"
]
},
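+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`target_groups` assigns each matrix row a group id, and `group_info` holds the matching labels; a quick census of all groups (a sketch, assuming `target_groups` is a NumPy array aligned with the rows, as the masking above suggests):\n",
+ "\n",
+ "```python\n",
+ "# Assumes target_groups is a NumPy array aligned with matrix rows\n",
+ "ids, counts = np.unique(target_groups, return_counts=True)\n",
+ "for gid, n_rows in zip(ids, counts):\n",
+ "    print(f\"Group {gid} ({n_rows} rows): {group_info[gid]}\")\n",
+ "```"
+ ]
+ },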
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 6. Tracing a household\n",
+ "## 6. Tracing a household across clones\n",
"\n",
- "One CPS household appears in every CD block (once per CD = 436 column positions). But most of those columns are zero — the household only contributes where its characteristics match the target constraints."
+ "One CPS record appears once per clone (N_CLONES column positions). Each clone places it in a different census block/CD/state, so it contributes to different geographic targets depending on the clone."
]
},
{
@@ -478,9 +373,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Example SNAP-receiving household: 654\n",
+ "Example SNAP-receiving household: record index 23\n",
"SNAP value: $70\n",
- "Column positions across CDs: 436\n"
+ "\n",
+ "Column positions across 3 clones:\n",
+ " col 23: TX (state=48, CD=4829) — 0 non-zero rows\n",
+ " col 12022: IL (state=17, CD=1708) — 0 non-zero rows\n",
+ " col 24021: FL (state=12, CD=1220) — 3 non-zero rows\n"
]
}
],
@@ -488,12 +387,20 @@
"snap_values = sim.calculate(\"snap\", map_to=\"household\").values\n",
"hh_ids = sim.calculate(\"household_id\", map_to=\"household\").values\n",
"positive_snap = hh_ids[snap_values > 0]\n",
- "example_hh = int(positive_snap[0])\n",
- "print(f\"Example SNAP-receiving household: {example_hh}\")\n",
- "print(f\"SNAP value: ${snap_values[hh_ids == example_hh][0]:,.0f}\")\n",
- "\n",
- "positions = tracer.get_household_column_positions(example_hh)\n",
- "print(f\"Column positions across CDs: {len(positions)}\")"
+ "example_hh_idx = int(np.where(snap_values > 0)[0][0])\n",
+ "print(f\"Example SNAP-receiving household: record index {example_hh_idx}\")\n",
+ "print(f\"SNAP value: ${snap_values[example_hh_idx]:,.0f}\")\n",
+ "\n",
+ "clone_cols = [c * n_records + example_hh_idx for c in range(N_CLONES)]\n",
+ "print(f\"\\nColumn positions across {N_CLONES} clones:\")\n",
+ "for col in clone_cols:\n",
+ " state = geography.state_fips[col]\n",
+ " cd = geography.cd_geoid[col]\n",
+ " block = geography.block_geoid[col]\n",
+ " col_vec = X_sparse[:, col]\n",
+ " nnz = col_vec.nnz\n",
+ " abbr = STATE_CODES.get(state, \"??\")\n",
+ " print(f\" col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\")"
]
},
{
@@ -505,42 +412,30 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "CDs with non-zero entries: 160\n",
- "CDs with all-zero columns: 276\n",
"\n",
- "Top 10 CDs by activity for household 654:\n",
- " CD 1001 (DE): 3 non-zero rows\n",
- " CD 1101 (DC): 3 non-zero rows\n",
- " CD 1201 (FL): 3 non-zero rows\n",
- " CD 1202 (FL): 3 non-zero rows\n",
- " CD 1203 (FL): 3 non-zero rows\n",
- " CD 1204 (FL): 3 non-zero rows\n",
- " CD 1205 (FL): 3 non-zero rows\n",
- " CD 1206 (FL): 3 non-zero rows\n",
- " CD 1207 (FL): 3 non-zero rows\n",
- " CD 1208 (FL): 3 non-zero rows\n"
+ "Clone 2 (col 24021, CD 1220):\n",
+ " household_count (geo=12): 1.00\n",
+ " snap (geo=12): 70.08\n",
+ " household_count (geo=1220): 1.00\n"
]
}
],
"source": [
- "cd_activity = []\n",
- "for cd_geoid, col_pos in positions.items():\n",
- " col_vec = X_sparse[:, col_pos]\n",
- " nnz = col_vec.nnz\n",
- " cd_activity.append({\"cd_geoid\": cd_geoid, \"col_pos\": col_pos, \"nnz\": nnz})\n",
- "\n",
- "cd_df = pd.DataFrame(cd_activity)\n",
- "n_active = (cd_df[\"nnz\"] > 0).sum()\n",
- "n_zero = (cd_df[\"nnz\"] == 0).sum()\n",
- "print(f\"CDs with non-zero entries: {n_active}\")\n",
- "print(f\"CDs with all-zero columns: {n_zero}\")\n",
- "\n",
- "top10 = cd_df.nlargest(10, \"nnz\")\n",
- "print(f\"\\nTop 10 CDs by activity for household {example_hh}:\")\n",
- "for _, r in top10.iterrows():\n",
- " state_fips = int(r[\"cd_geoid\"]) // 100\n",
- " abbr = STATE_CODES.get(state_fips, \"??\")\n",
- " print(f\" CD {r['cd_geoid']} ({abbr}): {r['nnz']} non-zero rows\")"
+ "for col in clone_cols:\n",
+ " col_vec = X_sparse[:, col]\n",
+ " nz_rows = col_vec.nonzero()[0]\n",
+ " if len(nz_rows) == 0:\n",
+ " continue\n",
+ " clone_i = col // n_records\n",
+ " print(f\"\\nClone {clone_i} (col {col}, CD {geography.cd_geoid[col]}):\")\n",
+ " for r in nz_rows[:5]:\n",
+ " row = targets_df.iloc[r]\n",
+ " print(\n",
+ " f\" {row['variable']} (geo={row['geographic_id']}): \"\n",
+ " f\"{X_sparse[r, col]:.2f}\"\n",
+ " )\n",
+ " if len(nz_rows) > 5:\n",
+ " print(f\" ... and {len(nz_rows) - 5} more\")"
]
},
{
@@ -559,10 +454,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Total cells: 7,381,736,804\n",
- "Non-zero entries: 2,199,033\n",
- "Density: 0.000298\n",
- "Sparsity: 99.9702%\n"
+ "Total cells: 50,791,767\n",
+ "Non-zero entries: 14,946\n",
+ "Density: 0.000294\n",
+ "Sparsity: 99.9706%\n"
]
}
],
@@ -577,52 +472,10 @@
},
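+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The per-row counts in the next cell come straight from the CSR index pointer: row `i`'s entries occupy `indices[indptr[i]:indptr[i+1]]`, so `np.diff(indptr)` yields the non-zero count of every row at once. A minimal sketch:\n",
+ "\n",
+ "```python\n",
+ "from scipy import sparse\n",
+ "demo = sparse.csr_matrix(np.array([[1, 0], [0, 0], [2, 3]]))\n",
+ "assert list(np.diff(demo.indptr)) == [1, 0, 2]  # per-row non-zero counts\n",
+ "```"
+ ]
+ },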
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Non-zeros per row:\n",
- " min: 0\n",
- " median: 0\n",
- " mean: 1,558\n",
- " max: 77,116\n",
- "\n",
- "By geographic level:\n",
- " National : n= 1, median nnz= 0, range=[0, 0]\n",
- " State : n= 102, median nnz= 10,423, range=[1,468, 77,116]\n",
- " District : n=1308, median nnz= 0, range=[0, 1,988]\n"
- ]
- }
- ],
- "source": [
- "nnz_per_row = np.diff(X_sparse.indptr)\n",
- "print(f\"Non-zeros per row:\")\n",
- "print(f\" min: {nnz_per_row.min():,}\")\n",
- "print(f\" median: {int(np.median(nnz_per_row)):,}\")\n",
- "print(f\" mean: {nnz_per_row.mean():,.0f}\")\n",
- "print(f\" max: {nnz_per_row.max():,}\")\n",
- "\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n",
- " _get_geo_level,\n",
- ")\n",
- "\n",
- "geo_levels = targets_df[\"geographic_id\"].apply(_get_geo_level)\n",
- "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n",
- "print(\"\\nBy geographic level:\")\n",
- "for level in [0, 1, 2]:\n",
- " mask = (geo_levels == level).values\n",
- " if mask.any():\n",
- " vals = nnz_per_row[mask]\n",
- " print(\n",
- " f\" {level_names[level]:10s}: \"\n",
- " f\"n={mask.sum():>4d}, \"\n",
- " f\"median nnz={int(np.median(vals)):>7,}, \"\n",
- " f\"range=[{vals.min():,}, {vals.max():,}]\"\n",
- " )"
- ]
+ "outputs": [],
+ "source": "nnz_per_row = np.diff(X_sparse.indptr)\nprint(f\"Non-zeros per row:\")\nprint(f\" min: {nnz_per_row.min():,}\")\nprint(f\" median: {int(np.median(nnz_per_row)):,}\")\nprint(f\" mean: {nnz_per_row.mean():,.0f}\")\nprint(f\" max: {nnz_per_row.max():,}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nprint(\"\\nBy geographic level:\")\nfor level in [0, 1, 2]:\n mask = (geo_levels == level).values\n if mask.any():\n vals = nnz_per_row[mask]\n print(\n f\" {level_names[level]:10s}: \"\n f\"n={mask.sum():>4d}, \"\n f\"median nnz={int(np.median(vals)):>7,}, \"\n f\"range=[{vals.min():,}, {vals.max():,}]\"\n )"
},
{
"cell_type": "code",
@@ -633,38 +486,39 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Non-zeros per CD block:\n",
- " min: 4,326 (CD 2801)\n",
- " median: 4,884\n",
- " max: 5,964 (CD 1101)\n"
+ "Non-zeros per clone block:\n",
+ " clone nnz unique_states\n",
+ " 0 4962 50\n",
+ " 1 4988 50\n",
+ " 2 4996 50\n"
]
}
],
"source": [
- "n_hh = tracer.n_households\n",
- "n_cds = tracer.n_geographies\n",
- "cd_nnz = []\n",
- "for cd_idx in range(n_cds):\n",
- " block = X_sparse[:, cd_idx * n_hh : (cd_idx + 1) * n_hh]\n",
- " cd_nnz.append({\n",
- " \"cd_geoid\": cds_to_calibrate[cd_idx],\n",
+ "clone_nnz = []\n",
+ "for ci in range(N_CLONES):\n",
+ " block = X_sparse[:, ci * n_records : (ci + 1) * n_records]\n",
+ " n_states = len(np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records]))\n",
+ " clone_nnz.append({\n",
+ " \"clone\": ci,\n",
" \"nnz\": block.nnz,\n",
+ " \"unique_states\": n_states,\n",
" })\n",
"\n",
- "cd_nnz_df = pd.DataFrame(cd_nnz)\n",
- "print(f\"Non-zeros per CD block:\")\n",
- "print(f\" min: {cd_nnz_df['nnz'].min():,} (CD {cd_nnz_df.loc[cd_nnz_df['nnz'].idxmin(), 'cd_geoid']})\")\n",
- "print(f\" median: {int(cd_nnz_df['nnz'].median()):,}\")\n",
- "print(f\" max: {cd_nnz_df['nnz'].max():,} (CD {cd_nnz_df.loc[cd_nnz_df['nnz'].idxmax(), 'cd_geoid']})\")"
+ "clone_df = pd.DataFrame(clone_nnz)\n",
+ "print(\"Non-zeros per clone block:\")\n",
+ "print(clone_df.to_string(index=False))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 8. Group exclusion\n",
+ "## 8. Dropping target groups\n",
+ "\n",
+ "Some target groups are redundant after hierarchical uprating. For example, state-level SNAP Household Count (Group 1) is redundant with district-level SNAP Household Count (Group 5) — the district targets were already reconciled to sum to the state totals.\n",
"\n",
- "`GROUPS_TO_EXCLUDE` removes redundant or harmful constraints before training. For example, state-level SNAP household counts (Group 1) are redundant with reconciled district rows (Group 4) and can confuse the optimizer. Group IDs depend on database contents, so always check `print_matrix_structure()` output first."
+ "Specify drops as `(variable_label, geo_level)` pairs. The labels come from the group descriptions above; the geo level is \"National\", \"State\", or \"District\"."
]
},
{
@@ -676,24 +530,27 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Before exclusion: 1411 rows\n",
- "Excluding groups [1]: dropping 51 rows\n",
- "After exclusion: 1360 rows\n"
+ "Matrix before: 1411 rows\n",
+ " DROPPING Group 1: State SNAP Household Count (51 targets) (51 rows)\n",
+ "\n",
+ " KEEPING Group 0: National ACA PTC Person Count (1 target, value=19,743,689) (1 rows)\n",
+ " KEEPING Group 2: State Snap (51 targets) (51 rows)\n",
+ " KEEPING Group 3: District Aca Ptc (436 targets) (436 rows)\n",
+ " KEEPING Group 4: District ACA PTC Tax Unit Count (436 targets) (436 rows)\n",
+ " KEEPING Group 5: District SNAP Household Count (436 targets) (436 rows)\n",
+ "\n",
+ "Matrix after: 1360 rows\n"
]
}
],
"source": [
- "GROUPS_TO_EXCLUDE = [1]\n",
- "\n",
- "print(f\"Before exclusion: {X_sparse.shape[0]} rows\")\n",
+ "GROUPS_TO_DROP = [\n",
+ " (\"SNAP Household Count\", \"State\"),\n",
+ "]\n",
"\n",
- "keep_mask = ~np.isin(tracer.target_groups, GROUPS_TO_EXCLUDE)\n",
- "n_dropped = (~keep_mask).sum()\n",
- "print(f\"Excluding groups {GROUPS_TO_EXCLUDE}: dropping {n_dropped} rows\")\n",
- "\n",
- "X_filtered = X_sparse[keep_mask, :]\n",
- "targets_filtered = targets_df[keep_mask].reset_index(drop=True)\n",
- "print(f\"After exclusion: {X_filtered.shape[0]} rows\")"
+ "targets_filtered, X_filtered = drop_target_groups(\n",
+ " targets_df, X_sparse, target_groups, group_info, GROUPS_TO_DROP\n",
+ ")"
]
},
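+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick alignment check on the filtered outputs (a sketch; the 51 dropped rows are the state SNAP Household Count group above):\n",
+ "\n",
+ "```python\n",
+ "# 1,411 rows minus the 51 state SNAP Household Count rows = 1,360\n",
+ "assert X_filtered.shape[0] == len(targets_filtered) == X_sparse.shape[0] - 51\n",
+ "```"
+ ]
+ },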
{
@@ -756,883 +613,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Achievable targets: 487\n",
- "Impossible targets: 873\n",
- "\n",
- "Impossible targets:\n",
- " aca_ptc/person_count (geo=US)\n",
- " aca_ptc/aca_ptc (geo=1001)\n",
- " aca_ptc/aca_ptc (geo=101)\n",
- " aca_ptc/aca_ptc (geo=102)\n",
- " aca_ptc/aca_ptc (geo=103)\n",
- " aca_ptc/aca_ptc (geo=104)\n",
- " aca_ptc/aca_ptc (geo=105)\n",
- " aca_ptc/aca_ptc (geo=106)\n",
- " aca_ptc/aca_ptc (geo=107)\n",
- " aca_ptc/aca_ptc (geo=1101)\n",
- " aca_ptc/aca_ptc (geo=1201)\n",
- " aca_ptc/aca_ptc (geo=1202)\n",
- " aca_ptc/aca_ptc (geo=1203)\n",
- " aca_ptc/aca_ptc (geo=1204)\n",
- " aca_ptc/aca_ptc (geo=1205)\n",
- " aca_ptc/aca_ptc (geo=1206)\n",
- " aca_ptc/aca_ptc (geo=1207)\n",
- " aca_ptc/aca_ptc (geo=1208)\n",
- " aca_ptc/aca_ptc (geo=1209)\n",
- " aca_ptc/aca_ptc (geo=1210)\n",
- " aca_ptc/aca_ptc (geo=1211)\n",
- " aca_ptc/aca_ptc (geo=1212)\n",
- " aca_ptc/aca_ptc (geo=1213)\n",
- " aca_ptc/aca_ptc (geo=1214)\n",
- " aca_ptc/aca_ptc (geo=1215)\n",
- " aca_ptc/aca_ptc (geo=1216)\n",
- " aca_ptc/aca_ptc (geo=1217)\n",
- " aca_ptc/aca_ptc (geo=1218)\n",
- " aca_ptc/aca_ptc (geo=1219)\n",
- " aca_ptc/aca_ptc (geo=1220)\n",
- " aca_ptc/aca_ptc (geo=1221)\n",
- " aca_ptc/aca_ptc (geo=1222)\n",
- " aca_ptc/aca_ptc (geo=1223)\n",
- " aca_ptc/aca_ptc (geo=1224)\n",
- " aca_ptc/aca_ptc (geo=1225)\n",
- " aca_ptc/aca_ptc (geo=1226)\n",
- " aca_ptc/aca_ptc (geo=1227)\n",
- " aca_ptc/aca_ptc (geo=1228)\n",
- " aca_ptc/aca_ptc (geo=1301)\n",
- " aca_ptc/aca_ptc (geo=1302)\n",
- " aca_ptc/aca_ptc (geo=1303)\n",
- " aca_ptc/aca_ptc (geo=1304)\n",
- " aca_ptc/aca_ptc (geo=1305)\n",
- " aca_ptc/aca_ptc (geo=1306)\n",
- " aca_ptc/aca_ptc (geo=1307)\n",
- " aca_ptc/aca_ptc (geo=1308)\n",
- " aca_ptc/aca_ptc (geo=1309)\n",
- " aca_ptc/aca_ptc (geo=1310)\n",
- " aca_ptc/aca_ptc (geo=1311)\n",
- " aca_ptc/aca_ptc (geo=1312)\n",
- " aca_ptc/aca_ptc (geo=1313)\n",
- " aca_ptc/aca_ptc (geo=1314)\n",
- " aca_ptc/aca_ptc (geo=1501)\n",
- " aca_ptc/aca_ptc (geo=1502)\n",
- " aca_ptc/aca_ptc (geo=1601)\n",
- " aca_ptc/aca_ptc (geo=1602)\n",
- " aca_ptc/aca_ptc (geo=1701)\n",
- " aca_ptc/aca_ptc (geo=1702)\n",
- " aca_ptc/aca_ptc (geo=1703)\n",
- " aca_ptc/aca_ptc (geo=1704)\n",
- " aca_ptc/aca_ptc (geo=1705)\n",
- " aca_ptc/aca_ptc (geo=1706)\n",
- " aca_ptc/aca_ptc (geo=1707)\n",
- " aca_ptc/aca_ptc (geo=1708)\n",
- " aca_ptc/aca_ptc (geo=1709)\n",
- " aca_ptc/aca_ptc (geo=1710)\n",
- " aca_ptc/aca_ptc (geo=1711)\n",
- " aca_ptc/aca_ptc (geo=1712)\n",
- " aca_ptc/aca_ptc (geo=1713)\n",
- " aca_ptc/aca_ptc (geo=1714)\n",
- " aca_ptc/aca_ptc (geo=1715)\n",
- " aca_ptc/aca_ptc (geo=1716)\n",
- " aca_ptc/aca_ptc (geo=1717)\n",
- " aca_ptc/aca_ptc (geo=1801)\n",
- " aca_ptc/aca_ptc (geo=1802)\n",
- " aca_ptc/aca_ptc (geo=1803)\n",
- " aca_ptc/aca_ptc (geo=1804)\n",
- " aca_ptc/aca_ptc (geo=1805)\n",
- " aca_ptc/aca_ptc (geo=1806)\n",
- " aca_ptc/aca_ptc (geo=1807)\n",
- " aca_ptc/aca_ptc (geo=1808)\n",
- " aca_ptc/aca_ptc (geo=1809)\n",
- " aca_ptc/aca_ptc (geo=1901)\n",
- " aca_ptc/aca_ptc (geo=1902)\n",
- " aca_ptc/aca_ptc (geo=1903)\n",
- " aca_ptc/aca_ptc (geo=1904)\n",
- " aca_ptc/aca_ptc (geo=2001)\n",
- " aca_ptc/aca_ptc (geo=2002)\n",
- " aca_ptc/aca_ptc (geo=2003)\n",
- " aca_ptc/aca_ptc (geo=2004)\n",
- " aca_ptc/aca_ptc (geo=201)\n",
- " aca_ptc/aca_ptc (geo=2101)\n",
- " aca_ptc/aca_ptc (geo=2102)\n",
- " aca_ptc/aca_ptc (geo=2103)\n",
- " aca_ptc/aca_ptc (geo=2104)\n",
- " aca_ptc/aca_ptc (geo=2105)\n",
- " aca_ptc/aca_ptc (geo=2106)\n",
- " aca_ptc/aca_ptc (geo=2201)\n",
- " aca_ptc/aca_ptc (geo=2202)\n",
- " aca_ptc/aca_ptc (geo=2203)\n",
- " aca_ptc/aca_ptc (geo=2204)\n",
- " aca_ptc/aca_ptc (geo=2205)\n",
- " aca_ptc/aca_ptc (geo=2206)\n",
- " aca_ptc/aca_ptc (geo=2301)\n",
- " aca_ptc/aca_ptc (geo=2302)\n",
- " aca_ptc/aca_ptc (geo=2401)\n",
- " aca_ptc/aca_ptc (geo=2402)\n",
- " aca_ptc/aca_ptc (geo=2403)\n",
- " aca_ptc/aca_ptc (geo=2404)\n",
- " aca_ptc/aca_ptc (geo=2405)\n",
- " aca_ptc/aca_ptc (geo=2406)\n",
- " aca_ptc/aca_ptc (geo=2407)\n",
- " aca_ptc/aca_ptc (geo=2408)\n",
- " aca_ptc/aca_ptc (geo=2501)\n",
- " aca_ptc/aca_ptc (geo=2502)\n",
- " aca_ptc/aca_ptc (geo=2503)\n",
- " aca_ptc/aca_ptc (geo=2504)\n",
- " aca_ptc/aca_ptc (geo=2505)\n",
- " aca_ptc/aca_ptc (geo=2506)\n",
- " aca_ptc/aca_ptc (geo=2507)\n",
- " aca_ptc/aca_ptc (geo=2508)\n",
- " aca_ptc/aca_ptc (geo=2509)\n",
- " aca_ptc/aca_ptc (geo=2601)\n",
- " aca_ptc/aca_ptc (geo=2602)\n",
- " aca_ptc/aca_ptc (geo=2603)\n",
- " aca_ptc/aca_ptc (geo=2604)\n",
- " aca_ptc/aca_ptc (geo=2605)\n",
- " aca_ptc/aca_ptc (geo=2606)\n",
- " aca_ptc/aca_ptc (geo=2607)\n",
- " aca_ptc/aca_ptc (geo=2608)\n",
- " aca_ptc/aca_ptc (geo=2609)\n",
- " aca_ptc/aca_ptc (geo=2610)\n",
- " aca_ptc/aca_ptc (geo=2611)\n",
- " aca_ptc/aca_ptc (geo=2612)\n",
- " aca_ptc/aca_ptc (geo=2613)\n",
- " aca_ptc/aca_ptc (geo=2701)\n",
- " aca_ptc/aca_ptc (geo=2702)\n",
- " aca_ptc/aca_ptc (geo=2703)\n",
- " aca_ptc/aca_ptc (geo=2704)\n",
- " aca_ptc/aca_ptc (geo=2705)\n",
- " aca_ptc/aca_ptc (geo=2706)\n",
- " aca_ptc/aca_ptc (geo=2707)\n",
- " aca_ptc/aca_ptc (geo=2708)\n",
- " aca_ptc/aca_ptc (geo=2801)\n",
- " aca_ptc/aca_ptc (geo=2802)\n",
- " aca_ptc/aca_ptc (geo=2803)\n",
- " aca_ptc/aca_ptc (geo=2804)\n",
- " aca_ptc/aca_ptc (geo=2901)\n",
- " aca_ptc/aca_ptc (geo=2902)\n",
- " aca_ptc/aca_ptc (geo=2903)\n",
- " aca_ptc/aca_ptc (geo=2904)\n",
- " aca_ptc/aca_ptc (geo=2905)\n",
- " aca_ptc/aca_ptc (geo=2906)\n",
- " aca_ptc/aca_ptc (geo=2907)\n",
- " aca_ptc/aca_ptc (geo=2908)\n",
- " aca_ptc/aca_ptc (geo=3001)\n",
- " aca_ptc/aca_ptc (geo=3002)\n",
- " aca_ptc/aca_ptc (geo=3101)\n",
- " aca_ptc/aca_ptc (geo=3102)\n",
- " aca_ptc/aca_ptc (geo=3103)\n",
- " aca_ptc/aca_ptc (geo=3201)\n",
- " aca_ptc/aca_ptc (geo=3202)\n",
- " aca_ptc/aca_ptc (geo=3203)\n",
- " aca_ptc/aca_ptc (geo=3204)\n",
- " aca_ptc/aca_ptc (geo=3301)\n",
- " aca_ptc/aca_ptc (geo=3302)\n",
- " aca_ptc/aca_ptc (geo=3401)\n",
- " aca_ptc/aca_ptc (geo=3402)\n",
- " aca_ptc/aca_ptc (geo=3403)\n",
- " aca_ptc/aca_ptc (geo=3404)\n",
- " aca_ptc/aca_ptc (geo=3405)\n",
- " aca_ptc/aca_ptc (geo=3406)\n",
- " aca_ptc/aca_ptc (geo=3407)\n",
- " aca_ptc/aca_ptc (geo=3408)\n",
- " aca_ptc/aca_ptc (geo=3409)\n",
- " aca_ptc/aca_ptc (geo=3410)\n",
- " aca_ptc/aca_ptc (geo=3411)\n",
- " aca_ptc/aca_ptc (geo=3412)\n",
- " aca_ptc/aca_ptc (geo=3501)\n",
- " aca_ptc/aca_ptc (geo=3502)\n",
- " aca_ptc/aca_ptc (geo=3503)\n",
- " aca_ptc/aca_ptc (geo=3601)\n",
- " aca_ptc/aca_ptc (geo=3602)\n",
- " aca_ptc/aca_ptc (geo=3603)\n",
- " aca_ptc/aca_ptc (geo=3604)\n",
- " aca_ptc/aca_ptc (geo=3605)\n",
- " aca_ptc/aca_ptc (geo=3606)\n",
- " aca_ptc/aca_ptc (geo=3607)\n",
- " aca_ptc/aca_ptc (geo=3608)\n",
- " aca_ptc/aca_ptc (geo=3609)\n",
- " aca_ptc/aca_ptc (geo=3610)\n",
- " aca_ptc/aca_ptc (geo=3611)\n",
- " aca_ptc/aca_ptc (geo=3612)\n",
- " aca_ptc/aca_ptc (geo=3613)\n",
- " aca_ptc/aca_ptc (geo=3614)\n",
- " aca_ptc/aca_ptc (geo=3615)\n",
- " aca_ptc/aca_ptc (geo=3616)\n",
- " aca_ptc/aca_ptc (geo=3617)\n",
- " aca_ptc/aca_ptc (geo=3618)\n",
- " aca_ptc/aca_ptc (geo=3619)\n",
- " aca_ptc/aca_ptc (geo=3620)\n",
- " aca_ptc/aca_ptc (geo=3621)\n",
- " aca_ptc/aca_ptc (geo=3622)\n",
- " aca_ptc/aca_ptc (geo=3623)\n",
- " aca_ptc/aca_ptc (geo=3624)\n",
- " aca_ptc/aca_ptc (geo=3625)\n",
- " aca_ptc/aca_ptc (geo=3626)\n",
- " aca_ptc/aca_ptc (geo=3701)\n",
- " aca_ptc/aca_ptc (geo=3702)\n",
- " aca_ptc/aca_ptc (geo=3703)\n",
- " aca_ptc/aca_ptc (geo=3704)\n",
- " aca_ptc/aca_ptc (geo=3705)\n",
- " aca_ptc/aca_ptc (geo=3706)\n",
- " aca_ptc/aca_ptc (geo=3707)\n",
- " aca_ptc/aca_ptc (geo=3708)\n",
- " aca_ptc/aca_ptc (geo=3709)\n",
- " aca_ptc/aca_ptc (geo=3710)\n",
- " aca_ptc/aca_ptc (geo=3711)\n",
- " aca_ptc/aca_ptc (geo=3712)\n",
- " aca_ptc/aca_ptc (geo=3713)\n",
- " aca_ptc/aca_ptc (geo=3714)\n",
- " aca_ptc/aca_ptc (geo=3801)\n",
- " aca_ptc/aca_ptc (geo=3901)\n",
- " aca_ptc/aca_ptc (geo=3902)\n",
- " aca_ptc/aca_ptc (geo=3903)\n",
- " aca_ptc/aca_ptc (geo=3904)\n",
- " aca_ptc/aca_ptc (geo=3905)\n",
- " aca_ptc/aca_ptc (geo=3906)\n",
- " aca_ptc/aca_ptc (geo=3907)\n",
- " aca_ptc/aca_ptc (geo=3908)\n",
- " aca_ptc/aca_ptc (geo=3909)\n",
- " aca_ptc/aca_ptc (geo=3910)\n",
- " aca_ptc/aca_ptc (geo=3911)\n",
- " aca_ptc/aca_ptc (geo=3912)\n",
- " aca_ptc/aca_ptc (geo=3913)\n",
- " aca_ptc/aca_ptc (geo=3914)\n",
- " aca_ptc/aca_ptc (geo=3915)\n",
- " aca_ptc/aca_ptc (geo=4001)\n",
- " aca_ptc/aca_ptc (geo=4002)\n",
- " aca_ptc/aca_ptc (geo=4003)\n",
- " aca_ptc/aca_ptc (geo=4004)\n",
- " aca_ptc/aca_ptc (geo=4005)\n",
- " aca_ptc/aca_ptc (geo=401)\n",
- " aca_ptc/aca_ptc (geo=402)\n",
- " aca_ptc/aca_ptc (geo=403)\n",
- " aca_ptc/aca_ptc (geo=404)\n",
- " aca_ptc/aca_ptc (geo=405)\n",
- " aca_ptc/aca_ptc (geo=406)\n",
- " aca_ptc/aca_ptc (geo=407)\n",
- " aca_ptc/aca_ptc (geo=408)\n",
- " aca_ptc/aca_ptc (geo=409)\n",
- " aca_ptc/aca_ptc (geo=4101)\n",
- " aca_ptc/aca_ptc (geo=4102)\n",
- " aca_ptc/aca_ptc (geo=4103)\n",
- " aca_ptc/aca_ptc (geo=4104)\n",
- " aca_ptc/aca_ptc (geo=4105)\n",
- " aca_ptc/aca_ptc (geo=4106)\n",
- " aca_ptc/aca_ptc (geo=4201)\n",
- " aca_ptc/aca_ptc (geo=4202)\n",
- " aca_ptc/aca_ptc (geo=4203)\n",
- " aca_ptc/aca_ptc (geo=4204)\n",
- " aca_ptc/aca_ptc (geo=4205)\n",
- " aca_ptc/aca_ptc (geo=4206)\n",
- " aca_ptc/aca_ptc (geo=4207)\n",
- " aca_ptc/aca_ptc (geo=4208)\n",
- " aca_ptc/aca_ptc (geo=4209)\n",
- " aca_ptc/aca_ptc (geo=4210)\n",
- " aca_ptc/aca_ptc (geo=4211)\n",
- " aca_ptc/aca_ptc (geo=4212)\n",
- " aca_ptc/aca_ptc (geo=4213)\n",
- " aca_ptc/aca_ptc (geo=4214)\n",
- " aca_ptc/aca_ptc (geo=4215)\n",
- " aca_ptc/aca_ptc (geo=4216)\n",
- " aca_ptc/aca_ptc (geo=4217)\n",
- " aca_ptc/aca_ptc (geo=4401)\n",
- " aca_ptc/aca_ptc (geo=4402)\n",
- " aca_ptc/aca_ptc (geo=4501)\n",
- " aca_ptc/aca_ptc (geo=4502)\n",
- " aca_ptc/aca_ptc (geo=4503)\n",
- " aca_ptc/aca_ptc (geo=4504)\n",
- " aca_ptc/aca_ptc (geo=4505)\n",
- " aca_ptc/aca_ptc (geo=4506)\n",
- " aca_ptc/aca_ptc (geo=4507)\n",
- " aca_ptc/aca_ptc (geo=4601)\n",
- " aca_ptc/aca_ptc (geo=4701)\n",
- " aca_ptc/aca_ptc (geo=4702)\n",
- " aca_ptc/aca_ptc (geo=4703)\n",
- " aca_ptc/aca_ptc (geo=4704)\n",
- " aca_ptc/aca_ptc (geo=4705)\n",
- " aca_ptc/aca_ptc (geo=4706)\n",
- " aca_ptc/aca_ptc (geo=4707)\n",
- " aca_ptc/aca_ptc (geo=4708)\n",
- " aca_ptc/aca_ptc (geo=4709)\n",
- " aca_ptc/aca_ptc (geo=4801)\n",
- " aca_ptc/aca_ptc (geo=4802)\n",
- " aca_ptc/aca_ptc (geo=4803)\n",
- " aca_ptc/aca_ptc (geo=4804)\n",
- " aca_ptc/aca_ptc (geo=4805)\n",
- " aca_ptc/aca_ptc (geo=4806)\n",
- " aca_ptc/aca_ptc (geo=4807)\n",
- " aca_ptc/aca_ptc (geo=4808)\n",
- " aca_ptc/aca_ptc (geo=4809)\n",
- " aca_ptc/aca_ptc (geo=4810)\n",
- " aca_ptc/aca_ptc (geo=4811)\n",
- " aca_ptc/aca_ptc (geo=4812)\n",
- " aca_ptc/aca_ptc (geo=4813)\n",
- " aca_ptc/aca_ptc (geo=4814)\n",
- " aca_ptc/aca_ptc (geo=4815)\n",
- " aca_ptc/aca_ptc (geo=4816)\n",
- " aca_ptc/aca_ptc (geo=4817)\n",
- " aca_ptc/aca_ptc (geo=4818)\n",
- " aca_ptc/aca_ptc (geo=4819)\n",
- " aca_ptc/aca_ptc (geo=4820)\n",
- " aca_ptc/aca_ptc (geo=4821)\n",
- " aca_ptc/aca_ptc (geo=4822)\n",
- " aca_ptc/aca_ptc (geo=4823)\n",
- " aca_ptc/aca_ptc (geo=4824)\n",
- " aca_ptc/aca_ptc (geo=4825)\n",
- " aca_ptc/aca_ptc (geo=4826)\n",
- " aca_ptc/aca_ptc (geo=4827)\n",
- " aca_ptc/aca_ptc (geo=4828)\n",
- " aca_ptc/aca_ptc (geo=4829)\n",
- " aca_ptc/aca_ptc (geo=4830)\n",
- " aca_ptc/aca_ptc (geo=4831)\n",
- " aca_ptc/aca_ptc (geo=4832)\n",
- " aca_ptc/aca_ptc (geo=4833)\n",
- " aca_ptc/aca_ptc (geo=4834)\n",
- " aca_ptc/aca_ptc (geo=4835)\n",
- " aca_ptc/aca_ptc (geo=4836)\n",
- " aca_ptc/aca_ptc (geo=4837)\n",
- " aca_ptc/aca_ptc (geo=4838)\n",
- " aca_ptc/aca_ptc (geo=4901)\n",
- " aca_ptc/aca_ptc (geo=4902)\n",
- " aca_ptc/aca_ptc (geo=4903)\n",
- " aca_ptc/aca_ptc (geo=4904)\n",
- " aca_ptc/aca_ptc (geo=5001)\n",
- " aca_ptc/aca_ptc (geo=501)\n",
- " aca_ptc/aca_ptc (geo=502)\n",
- " aca_ptc/aca_ptc (geo=503)\n",
- " aca_ptc/aca_ptc (geo=504)\n",
- " aca_ptc/aca_ptc (geo=5101)\n",
- " aca_ptc/aca_ptc (geo=5102)\n",
- " aca_ptc/aca_ptc (geo=5103)\n",
- " aca_ptc/aca_ptc (geo=5104)\n",
- " aca_ptc/aca_ptc (geo=5105)\n",
- " aca_ptc/aca_ptc (geo=5106)\n",
- " aca_ptc/aca_ptc (geo=5107)\n",
- " aca_ptc/aca_ptc (geo=5108)\n",
- " aca_ptc/aca_ptc (geo=5109)\n",
- " aca_ptc/aca_ptc (geo=5110)\n",
- " aca_ptc/aca_ptc (geo=5111)\n",
- " aca_ptc/aca_ptc (geo=5301)\n",
- " aca_ptc/aca_ptc (geo=5302)\n",
- " aca_ptc/aca_ptc (geo=5303)\n",
- " aca_ptc/aca_ptc (geo=5304)\n",
- " aca_ptc/aca_ptc (geo=5305)\n",
- " aca_ptc/aca_ptc (geo=5306)\n",
- " aca_ptc/aca_ptc (geo=5307)\n",
- " aca_ptc/aca_ptc (geo=5308)\n",
- " aca_ptc/aca_ptc (geo=5309)\n",
- " aca_ptc/aca_ptc (geo=5310)\n",
- " aca_ptc/aca_ptc (geo=5401)\n",
- " aca_ptc/aca_ptc (geo=5402)\n",
- " aca_ptc/aca_ptc (geo=5501)\n",
- " aca_ptc/aca_ptc (geo=5502)\n",
- " aca_ptc/aca_ptc (geo=5503)\n",
- " aca_ptc/aca_ptc (geo=5504)\n",
- " aca_ptc/aca_ptc (geo=5505)\n",
- " aca_ptc/aca_ptc (geo=5506)\n",
- " aca_ptc/aca_ptc (geo=5507)\n",
- " aca_ptc/aca_ptc (geo=5508)\n",
- " aca_ptc/aca_ptc (geo=5601)\n",
- " aca_ptc/aca_ptc (geo=601)\n",
- " aca_ptc/aca_ptc (geo=602)\n",
- " aca_ptc/aca_ptc (geo=603)\n",
- " aca_ptc/aca_ptc (geo=604)\n",
- " aca_ptc/aca_ptc (geo=605)\n",
- " aca_ptc/aca_ptc (geo=606)\n",
- " aca_ptc/aca_ptc (geo=607)\n",
- " aca_ptc/aca_ptc (geo=608)\n",
- " aca_ptc/aca_ptc (geo=609)\n",
- " aca_ptc/aca_ptc (geo=610)\n",
- " aca_ptc/aca_ptc (geo=611)\n",
- " aca_ptc/aca_ptc (geo=612)\n",
- " aca_ptc/aca_ptc (geo=613)\n",
- " aca_ptc/aca_ptc (geo=614)\n",
- " aca_ptc/aca_ptc (geo=615)\n",
- " aca_ptc/aca_ptc (geo=616)\n",
- " aca_ptc/aca_ptc (geo=617)\n",
- " aca_ptc/aca_ptc (geo=618)\n",
- " aca_ptc/aca_ptc (geo=619)\n",
- " aca_ptc/aca_ptc (geo=620)\n",
- " aca_ptc/aca_ptc (geo=621)\n",
- " aca_ptc/aca_ptc (geo=622)\n",
- " aca_ptc/aca_ptc (geo=623)\n",
- " aca_ptc/aca_ptc (geo=624)\n",
- " aca_ptc/aca_ptc (geo=625)\n",
- " aca_ptc/aca_ptc (geo=626)\n",
- " aca_ptc/aca_ptc (geo=627)\n",
- " aca_ptc/aca_ptc (geo=628)\n",
- " aca_ptc/aca_ptc (geo=629)\n",
- " aca_ptc/aca_ptc (geo=630)\n",
- " aca_ptc/aca_ptc (geo=631)\n",
- " aca_ptc/aca_ptc (geo=632)\n",
- " aca_ptc/aca_ptc (geo=633)\n",
- " aca_ptc/aca_ptc (geo=634)\n",
- " aca_ptc/aca_ptc (geo=635)\n",
- " aca_ptc/aca_ptc (geo=636)\n",
- " aca_ptc/aca_ptc (geo=637)\n",
- " aca_ptc/aca_ptc (geo=638)\n",
- " aca_ptc/aca_ptc (geo=639)\n",
- " aca_ptc/aca_ptc (geo=640)\n",
- " aca_ptc/aca_ptc (geo=641)\n",
- " aca_ptc/aca_ptc (geo=642)\n",
- " aca_ptc/aca_ptc (geo=643)\n",
- " aca_ptc/aca_ptc (geo=644)\n",
- " aca_ptc/aca_ptc (geo=645)\n",
- " aca_ptc/aca_ptc (geo=646)\n",
- " aca_ptc/aca_ptc (geo=647)\n",
- " aca_ptc/aca_ptc (geo=648)\n",
- " aca_ptc/aca_ptc (geo=649)\n",
- " aca_ptc/aca_ptc (geo=650)\n",
- " aca_ptc/aca_ptc (geo=651)\n",
- " aca_ptc/aca_ptc (geo=652)\n",
- " aca_ptc/aca_ptc (geo=801)\n",
- " aca_ptc/aca_ptc (geo=802)\n",
- " aca_ptc/aca_ptc (geo=803)\n",
- " aca_ptc/aca_ptc (geo=804)\n",
- " aca_ptc/aca_ptc (geo=805)\n",
- " aca_ptc/aca_ptc (geo=806)\n",
- " aca_ptc/aca_ptc (geo=807)\n",
- " aca_ptc/aca_ptc (geo=808)\n",
- " aca_ptc/aca_ptc (geo=901)\n",
- " aca_ptc/aca_ptc (geo=902)\n",
- " aca_ptc/aca_ptc (geo=903)\n",
- " aca_ptc/aca_ptc (geo=904)\n",
- " aca_ptc/aca_ptc (geo=905)\n",
- " aca_ptc/tax_unit_count (geo=1001)\n",
- " aca_ptc/tax_unit_count (geo=101)\n",
- " aca_ptc/tax_unit_count (geo=102)\n",
- " aca_ptc/tax_unit_count (geo=103)\n",
- " aca_ptc/tax_unit_count (geo=104)\n",
- " aca_ptc/tax_unit_count (geo=105)\n",
- " aca_ptc/tax_unit_count (geo=106)\n",
- " aca_ptc/tax_unit_count (geo=107)\n",
- " aca_ptc/tax_unit_count (geo=1101)\n",
- " aca_ptc/tax_unit_count (geo=1201)\n",
- " aca_ptc/tax_unit_count (geo=1202)\n",
- " aca_ptc/tax_unit_count (geo=1203)\n",
- " aca_ptc/tax_unit_count (geo=1204)\n",
- " aca_ptc/tax_unit_count (geo=1205)\n",
- " aca_ptc/tax_unit_count (geo=1206)\n",
- " aca_ptc/tax_unit_count (geo=1207)\n",
- " aca_ptc/tax_unit_count (geo=1208)\n",
- " aca_ptc/tax_unit_count (geo=1209)\n",
- " aca_ptc/tax_unit_count (geo=1210)\n",
- " aca_ptc/tax_unit_count (geo=1211)\n",
- " aca_ptc/tax_unit_count (geo=1212)\n",
- " aca_ptc/tax_unit_count (geo=1213)\n",
- " aca_ptc/tax_unit_count (geo=1214)\n",
- " aca_ptc/tax_unit_count (geo=1215)\n",
- " aca_ptc/tax_unit_count (geo=1216)\n",
- " aca_ptc/tax_unit_count (geo=1217)\n",
- " aca_ptc/tax_unit_count (geo=1218)\n",
- " aca_ptc/tax_unit_count (geo=1219)\n",
- " aca_ptc/tax_unit_count (geo=1220)\n",
- " aca_ptc/tax_unit_count (geo=1221)\n",
- " aca_ptc/tax_unit_count (geo=1222)\n",
- " aca_ptc/tax_unit_count (geo=1223)\n",
- " aca_ptc/tax_unit_count (geo=1224)\n",
- " aca_ptc/tax_unit_count (geo=1225)\n",
- " aca_ptc/tax_unit_count (geo=1226)\n",
- " aca_ptc/tax_unit_count (geo=1227)\n",
- " aca_ptc/tax_unit_count (geo=1228)\n",
- " aca_ptc/tax_unit_count (geo=1301)\n",
- " aca_ptc/tax_unit_count (geo=1302)\n",
- " aca_ptc/tax_unit_count (geo=1303)\n",
- " aca_ptc/tax_unit_count (geo=1304)\n",
- " aca_ptc/tax_unit_count (geo=1305)\n",
- " aca_ptc/tax_unit_count (geo=1306)\n",
- " aca_ptc/tax_unit_count (geo=1307)\n",
- " aca_ptc/tax_unit_count (geo=1308)\n",
- " aca_ptc/tax_unit_count (geo=1309)\n",
- " aca_ptc/tax_unit_count (geo=1310)\n",
- " aca_ptc/tax_unit_count (geo=1311)\n",
- " aca_ptc/tax_unit_count (geo=1312)\n",
- " aca_ptc/tax_unit_count (geo=1313)\n",
- " aca_ptc/tax_unit_count (geo=1314)\n",
- " aca_ptc/tax_unit_count (geo=1501)\n",
- " aca_ptc/tax_unit_count (geo=1502)\n",
- " aca_ptc/tax_unit_count (geo=1601)\n",
- " aca_ptc/tax_unit_count (geo=1602)\n",
- " aca_ptc/tax_unit_count (geo=1701)\n",
- " aca_ptc/tax_unit_count (geo=1702)\n",
- " aca_ptc/tax_unit_count (geo=1703)\n",
- " aca_ptc/tax_unit_count (geo=1704)\n",
- " aca_ptc/tax_unit_count (geo=1705)\n",
- " aca_ptc/tax_unit_count (geo=1706)\n",
- " aca_ptc/tax_unit_count (geo=1707)\n",
- " aca_ptc/tax_unit_count (geo=1708)\n",
- " aca_ptc/tax_unit_count (geo=1709)\n",
- " aca_ptc/tax_unit_count (geo=1710)\n",
- " aca_ptc/tax_unit_count (geo=1711)\n",
- " aca_ptc/tax_unit_count (geo=1712)\n",
- " aca_ptc/tax_unit_count (geo=1713)\n",
- " aca_ptc/tax_unit_count (geo=1714)\n",
- " aca_ptc/tax_unit_count (geo=1715)\n",
- " aca_ptc/tax_unit_count (geo=1716)\n",
- " aca_ptc/tax_unit_count (geo=1717)\n",
- " aca_ptc/tax_unit_count (geo=1801)\n",
- " aca_ptc/tax_unit_count (geo=1802)\n",
- " aca_ptc/tax_unit_count (geo=1803)\n",
- " aca_ptc/tax_unit_count (geo=1804)\n",
- " aca_ptc/tax_unit_count (geo=1805)\n",
- " aca_ptc/tax_unit_count (geo=1806)\n",
- " aca_ptc/tax_unit_count (geo=1807)\n",
- " aca_ptc/tax_unit_count (geo=1808)\n",
- " aca_ptc/tax_unit_count (geo=1809)\n",
- " aca_ptc/tax_unit_count (geo=1901)\n",
- " aca_ptc/tax_unit_count (geo=1902)\n",
- " aca_ptc/tax_unit_count (geo=1903)\n",
- " aca_ptc/tax_unit_count (geo=1904)\n",
- " aca_ptc/tax_unit_count (geo=2001)\n",
- " aca_ptc/tax_unit_count (geo=2002)\n",
- " aca_ptc/tax_unit_count (geo=2003)\n",
- " aca_ptc/tax_unit_count (geo=2004)\n",
- " aca_ptc/tax_unit_count (geo=201)\n",
- " aca_ptc/tax_unit_count (geo=2101)\n",
- " aca_ptc/tax_unit_count (geo=2102)\n",
- " aca_ptc/tax_unit_count (geo=2103)\n",
- " aca_ptc/tax_unit_count (geo=2104)\n",
- " aca_ptc/tax_unit_count (geo=2105)\n",
- " aca_ptc/tax_unit_count (geo=2106)\n",
- " aca_ptc/tax_unit_count (geo=2201)\n",
- " aca_ptc/tax_unit_count (geo=2202)\n",
- " aca_ptc/tax_unit_count (geo=2203)\n",
- " aca_ptc/tax_unit_count (geo=2204)\n",
- " aca_ptc/tax_unit_count (geo=2205)\n",
- " aca_ptc/tax_unit_count (geo=2206)\n",
- " aca_ptc/tax_unit_count (geo=2301)\n",
- " aca_ptc/tax_unit_count (geo=2302)\n",
- " aca_ptc/tax_unit_count (geo=2401)\n",
- " aca_ptc/tax_unit_count (geo=2402)\n",
- " aca_ptc/tax_unit_count (geo=2403)\n",
- " aca_ptc/tax_unit_count (geo=2404)\n",
- " aca_ptc/tax_unit_count (geo=2405)\n",
- " aca_ptc/tax_unit_count (geo=2406)\n",
- " aca_ptc/tax_unit_count (geo=2407)\n",
- " aca_ptc/tax_unit_count (geo=2408)\n",
- " aca_ptc/tax_unit_count (geo=2501)\n",
- " aca_ptc/tax_unit_count (geo=2502)\n",
- " aca_ptc/tax_unit_count (geo=2503)\n",
- " aca_ptc/tax_unit_count (geo=2504)\n",
- " aca_ptc/tax_unit_count (geo=2505)\n",
- " aca_ptc/tax_unit_count (geo=2506)\n",
- " aca_ptc/tax_unit_count (geo=2507)\n",
- " aca_ptc/tax_unit_count (geo=2508)\n",
- " aca_ptc/tax_unit_count (geo=2509)\n",
- " aca_ptc/tax_unit_count (geo=2601)\n",
- " aca_ptc/tax_unit_count (geo=2602)\n",
- " aca_ptc/tax_unit_count (geo=2603)\n",
- " aca_ptc/tax_unit_count (geo=2604)\n",
- " aca_ptc/tax_unit_count (geo=2605)\n",
- " aca_ptc/tax_unit_count (geo=2606)\n",
- " aca_ptc/tax_unit_count (geo=2607)\n",
- " aca_ptc/tax_unit_count (geo=2608)\n",
- " aca_ptc/tax_unit_count (geo=2609)\n",
- " aca_ptc/tax_unit_count (geo=2610)\n",
- " aca_ptc/tax_unit_count (geo=2611)\n",
- " aca_ptc/tax_unit_count (geo=2612)\n",
- " aca_ptc/tax_unit_count (geo=2613)\n",
- " aca_ptc/tax_unit_count (geo=2701)\n",
- " aca_ptc/tax_unit_count (geo=2702)\n",
- " aca_ptc/tax_unit_count (geo=2703)\n",
- " aca_ptc/tax_unit_count (geo=2704)\n",
- " aca_ptc/tax_unit_count (geo=2705)\n",
- " aca_ptc/tax_unit_count (geo=2706)\n",
- " aca_ptc/tax_unit_count (geo=2707)\n",
- " aca_ptc/tax_unit_count (geo=2708)\n",
- " aca_ptc/tax_unit_count (geo=2801)\n",
- " aca_ptc/tax_unit_count (geo=2802)\n",
- " aca_ptc/tax_unit_count (geo=2803)\n",
- " aca_ptc/tax_unit_count (geo=2804)\n",
- " aca_ptc/tax_unit_count (geo=2901)\n",
- " aca_ptc/tax_unit_count (geo=2902)\n",
- " aca_ptc/tax_unit_count (geo=2903)\n",
- " aca_ptc/tax_unit_count (geo=2904)\n",
- " aca_ptc/tax_unit_count (geo=2905)\n",
- " aca_ptc/tax_unit_count (geo=2906)\n",
- " aca_ptc/tax_unit_count (geo=2907)\n",
- " aca_ptc/tax_unit_count (geo=2908)\n",
- " aca_ptc/tax_unit_count (geo=3001)\n",
- " aca_ptc/tax_unit_count (geo=3002)\n",
- " aca_ptc/tax_unit_count (geo=3101)\n",
- " aca_ptc/tax_unit_count (geo=3102)\n",
- " aca_ptc/tax_unit_count (geo=3103)\n",
- " aca_ptc/tax_unit_count (geo=3201)\n",
- " aca_ptc/tax_unit_count (geo=3202)\n",
- " aca_ptc/tax_unit_count (geo=3203)\n",
- " aca_ptc/tax_unit_count (geo=3204)\n",
- " aca_ptc/tax_unit_count (geo=3301)\n",
- " aca_ptc/tax_unit_count (geo=3302)\n",
- " aca_ptc/tax_unit_count (geo=3401)\n",
- " aca_ptc/tax_unit_count (geo=3402)\n",
- " aca_ptc/tax_unit_count (geo=3403)\n",
- " aca_ptc/tax_unit_count (geo=3404)\n",
- " aca_ptc/tax_unit_count (geo=3405)\n",
- " aca_ptc/tax_unit_count (geo=3406)\n",
- " aca_ptc/tax_unit_count (geo=3407)\n",
- " aca_ptc/tax_unit_count (geo=3408)\n",
- " aca_ptc/tax_unit_count (geo=3409)\n",
- " aca_ptc/tax_unit_count (geo=3410)\n",
- " aca_ptc/tax_unit_count (geo=3411)\n",
- " aca_ptc/tax_unit_count (geo=3412)\n",
- " aca_ptc/tax_unit_count (geo=3501)\n",
- " aca_ptc/tax_unit_count (geo=3502)\n",
- " aca_ptc/tax_unit_count (geo=3503)\n",
- " aca_ptc/tax_unit_count (geo=3601)\n",
- " aca_ptc/tax_unit_count (geo=3602)\n",
- " aca_ptc/tax_unit_count (geo=3603)\n",
- " aca_ptc/tax_unit_count (geo=3604)\n",
- " aca_ptc/tax_unit_count (geo=3605)\n",
- " aca_ptc/tax_unit_count (geo=3606)\n",
- " aca_ptc/tax_unit_count (geo=3607)\n",
- " aca_ptc/tax_unit_count (geo=3608)\n",
- " aca_ptc/tax_unit_count (geo=3609)\n",
- " aca_ptc/tax_unit_count (geo=3610)\n",
- " aca_ptc/tax_unit_count (geo=3611)\n",
- " aca_ptc/tax_unit_count (geo=3612)\n",
- " aca_ptc/tax_unit_count (geo=3613)\n",
- " aca_ptc/tax_unit_count (geo=3614)\n",
- " aca_ptc/tax_unit_count (geo=3615)\n",
- " aca_ptc/tax_unit_count (geo=3616)\n",
- " aca_ptc/tax_unit_count (geo=3617)\n",
- " aca_ptc/tax_unit_count (geo=3618)\n",
- " aca_ptc/tax_unit_count (geo=3619)\n",
- " aca_ptc/tax_unit_count (geo=3620)\n",
- " aca_ptc/tax_unit_count (geo=3621)\n",
- " aca_ptc/tax_unit_count (geo=3622)\n",
- " aca_ptc/tax_unit_count (geo=3623)\n",
- " aca_ptc/tax_unit_count (geo=3624)\n",
- " aca_ptc/tax_unit_count (geo=3625)\n",
- " aca_ptc/tax_unit_count (geo=3626)\n",
- " aca_ptc/tax_unit_count (geo=3701)\n",
- " aca_ptc/tax_unit_count (geo=3702)\n",
- " aca_ptc/tax_unit_count (geo=3703)\n",
- " aca_ptc/tax_unit_count (geo=3704)\n",
- " aca_ptc/tax_unit_count (geo=3705)\n",
- " aca_ptc/tax_unit_count (geo=3706)\n",
- " aca_ptc/tax_unit_count (geo=3707)\n",
- " aca_ptc/tax_unit_count (geo=3708)\n",
- " aca_ptc/tax_unit_count (geo=3709)\n",
- " aca_ptc/tax_unit_count (geo=3710)\n",
- " aca_ptc/tax_unit_count (geo=3711)\n",
- " aca_ptc/tax_unit_count (geo=3712)\n",
- " aca_ptc/tax_unit_count (geo=3713)\n",
- " aca_ptc/tax_unit_count (geo=3714)\n",
- " aca_ptc/tax_unit_count (geo=3801)\n",
- " aca_ptc/tax_unit_count (geo=3901)\n",
- " aca_ptc/tax_unit_count (geo=3902)\n",
- " aca_ptc/tax_unit_count (geo=3903)\n",
- " aca_ptc/tax_unit_count (geo=3904)\n",
- " aca_ptc/tax_unit_count (geo=3905)\n",
- " aca_ptc/tax_unit_count (geo=3906)\n",
- " aca_ptc/tax_unit_count (geo=3907)\n",
- " aca_ptc/tax_unit_count (geo=3908)\n",
- " aca_ptc/tax_unit_count (geo=3909)\n",
- " aca_ptc/tax_unit_count (geo=3910)\n",
- " aca_ptc/tax_unit_count (geo=3911)\n",
- " aca_ptc/tax_unit_count (geo=3912)\n",
- " aca_ptc/tax_unit_count (geo=3913)\n",
- " aca_ptc/tax_unit_count (geo=3914)\n",
- " aca_ptc/tax_unit_count (geo=3915)\n",
- " aca_ptc/tax_unit_count (geo=4001)\n",
- " aca_ptc/tax_unit_count (geo=4002)\n",
- " aca_ptc/tax_unit_count (geo=4003)\n",
- " aca_ptc/tax_unit_count (geo=4004)\n",
- " aca_ptc/tax_unit_count (geo=4005)\n",
- " aca_ptc/tax_unit_count (geo=401)\n",
- " aca_ptc/tax_unit_count (geo=402)\n",
- " aca_ptc/tax_unit_count (geo=403)\n",
- " aca_ptc/tax_unit_count (geo=404)\n",
- " aca_ptc/tax_unit_count (geo=405)\n",
- " aca_ptc/tax_unit_count (geo=406)\n",
- " aca_ptc/tax_unit_count (geo=407)\n",
- " aca_ptc/tax_unit_count (geo=408)\n",
- " aca_ptc/tax_unit_count (geo=409)\n",
- " aca_ptc/tax_unit_count (geo=4101)\n",
- " aca_ptc/tax_unit_count (geo=4102)\n",
- " aca_ptc/tax_unit_count (geo=4103)\n",
- " aca_ptc/tax_unit_count (geo=4104)\n",
- " aca_ptc/tax_unit_count (geo=4105)\n",
- " aca_ptc/tax_unit_count (geo=4106)\n",
- " aca_ptc/tax_unit_count (geo=4201)\n",
- " aca_ptc/tax_unit_count (geo=4202)\n",
- " aca_ptc/tax_unit_count (geo=4203)\n",
- " aca_ptc/tax_unit_count (geo=4204)\n",
- " aca_ptc/tax_unit_count (geo=4205)\n",
- " aca_ptc/tax_unit_count (geo=4206)\n",
- " aca_ptc/tax_unit_count (geo=4207)\n",
- " aca_ptc/tax_unit_count (geo=4208)\n",
- " aca_ptc/tax_unit_count (geo=4209)\n",
- " aca_ptc/tax_unit_count (geo=4210)\n",
- " aca_ptc/tax_unit_count (geo=4211)\n",
- " aca_ptc/tax_unit_count (geo=4212)\n",
- " aca_ptc/tax_unit_count (geo=4213)\n",
- " aca_ptc/tax_unit_count (geo=4214)\n",
- " aca_ptc/tax_unit_count (geo=4215)\n",
- " aca_ptc/tax_unit_count (geo=4216)\n",
- " aca_ptc/tax_unit_count (geo=4217)\n",
- " aca_ptc/tax_unit_count (geo=4401)\n",
- " aca_ptc/tax_unit_count (geo=4402)\n",
- " aca_ptc/tax_unit_count (geo=4501)\n",
- " aca_ptc/tax_unit_count (geo=4502)\n",
- " aca_ptc/tax_unit_count (geo=4503)\n",
- " aca_ptc/tax_unit_count (geo=4504)\n",
- " aca_ptc/tax_unit_count (geo=4505)\n",
- " aca_ptc/tax_unit_count (geo=4506)\n",
- " aca_ptc/tax_unit_count (geo=4507)\n",
- " aca_ptc/tax_unit_count (geo=4601)\n",
- " aca_ptc/tax_unit_count (geo=4701)\n",
- " aca_ptc/tax_unit_count (geo=4702)\n",
- " aca_ptc/tax_unit_count (geo=4703)\n",
- " aca_ptc/tax_unit_count (geo=4704)\n",
- " aca_ptc/tax_unit_count (geo=4705)\n",
- " aca_ptc/tax_unit_count (geo=4706)\n",
- " aca_ptc/tax_unit_count (geo=4707)\n",
- " aca_ptc/tax_unit_count (geo=4708)\n",
- " aca_ptc/tax_unit_count (geo=4709)\n",
- " aca_ptc/tax_unit_count (geo=4801)\n",
- " aca_ptc/tax_unit_count (geo=4802)\n",
- " aca_ptc/tax_unit_count (geo=4803)\n",
- " aca_ptc/tax_unit_count (geo=4804)\n",
- " aca_ptc/tax_unit_count (geo=4805)\n",
- " aca_ptc/tax_unit_count (geo=4806)\n",
- " aca_ptc/tax_unit_count (geo=4807)\n",
- " aca_ptc/tax_unit_count (geo=4808)\n",
- " aca_ptc/tax_unit_count (geo=4809)\n",
- " aca_ptc/tax_unit_count (geo=4810)\n",
- " aca_ptc/tax_unit_count (geo=4811)\n",
- " aca_ptc/tax_unit_count (geo=4812)\n",
- " aca_ptc/tax_unit_count (geo=4813)\n",
- " aca_ptc/tax_unit_count (geo=4814)\n",
- " aca_ptc/tax_unit_count (geo=4815)\n",
- " aca_ptc/tax_unit_count (geo=4816)\n",
- " aca_ptc/tax_unit_count (geo=4817)\n",
- " aca_ptc/tax_unit_count (geo=4818)\n",
- " aca_ptc/tax_unit_count (geo=4819)\n",
- " aca_ptc/tax_unit_count (geo=4820)\n",
- " aca_ptc/tax_unit_count (geo=4821)\n",
- " aca_ptc/tax_unit_count (geo=4822)\n",
- " aca_ptc/tax_unit_count (geo=4823)\n",
- " aca_ptc/tax_unit_count (geo=4824)\n",
- " aca_ptc/tax_unit_count (geo=4825)\n",
- " aca_ptc/tax_unit_count (geo=4826)\n",
- " aca_ptc/tax_unit_count (geo=4827)\n",
- " aca_ptc/tax_unit_count (geo=4828)\n",
- " aca_ptc/tax_unit_count (geo=4829)\n",
- " aca_ptc/tax_unit_count (geo=4830)\n",
- " aca_ptc/tax_unit_count (geo=4831)\n",
- " aca_ptc/tax_unit_count (geo=4832)\n",
- " aca_ptc/tax_unit_count (geo=4833)\n",
- " aca_ptc/tax_unit_count (geo=4834)\n",
- " aca_ptc/tax_unit_count (geo=4835)\n",
- " aca_ptc/tax_unit_count (geo=4836)\n",
- " aca_ptc/tax_unit_count (geo=4837)\n",
- " aca_ptc/tax_unit_count (geo=4838)\n",
- " aca_ptc/tax_unit_count (geo=4901)\n",
- " aca_ptc/tax_unit_count (geo=4902)\n",
- " aca_ptc/tax_unit_count (geo=4903)\n",
- " aca_ptc/tax_unit_count (geo=4904)\n",
- " aca_ptc/tax_unit_count (geo=5001)\n",
- " aca_ptc/tax_unit_count (geo=501)\n",
- " aca_ptc/tax_unit_count (geo=502)\n",
- " aca_ptc/tax_unit_count (geo=503)\n",
- " aca_ptc/tax_unit_count (geo=504)\n",
- " aca_ptc/tax_unit_count (geo=5101)\n",
- " aca_ptc/tax_unit_count (geo=5102)\n",
- " aca_ptc/tax_unit_count (geo=5103)\n",
- " aca_ptc/tax_unit_count (geo=5104)\n",
- " aca_ptc/tax_unit_count (geo=5105)\n",
- " aca_ptc/tax_unit_count (geo=5106)\n",
- " aca_ptc/tax_unit_count (geo=5107)\n",
- " aca_ptc/tax_unit_count (geo=5108)\n",
- " aca_ptc/tax_unit_count (geo=5109)\n",
- " aca_ptc/tax_unit_count (geo=5110)\n",
- " aca_ptc/tax_unit_count (geo=5111)\n",
- " aca_ptc/tax_unit_count (geo=5301)\n",
- " aca_ptc/tax_unit_count (geo=5302)\n",
- " aca_ptc/tax_unit_count (geo=5303)\n",
- " aca_ptc/tax_unit_count (geo=5304)\n",
- " aca_ptc/tax_unit_count (geo=5305)\n",
- " aca_ptc/tax_unit_count (geo=5306)\n",
- " aca_ptc/tax_unit_count (geo=5307)\n",
- " aca_ptc/tax_unit_count (geo=5308)\n",
- " aca_ptc/tax_unit_count (geo=5309)\n",
- " aca_ptc/tax_unit_count (geo=5310)\n",
- " aca_ptc/tax_unit_count (geo=5401)\n",
- " aca_ptc/tax_unit_count (geo=5402)\n",
- " aca_ptc/tax_unit_count (geo=5501)\n",
- " aca_ptc/tax_unit_count (geo=5502)\n",
- " aca_ptc/tax_unit_count (geo=5503)\n",
- " aca_ptc/tax_unit_count (geo=5504)\n",
- " aca_ptc/tax_unit_count (geo=5505)\n",
- " aca_ptc/tax_unit_count (geo=5506)\n",
- " aca_ptc/tax_unit_count (geo=5507)\n",
- " aca_ptc/tax_unit_count (geo=5508)\n",
- " aca_ptc/tax_unit_count (geo=5601)\n",
- " aca_ptc/tax_unit_count (geo=601)\n",
- " aca_ptc/tax_unit_count (geo=602)\n",
- " aca_ptc/tax_unit_count (geo=603)\n",
- " aca_ptc/tax_unit_count (geo=604)\n",
- " aca_ptc/tax_unit_count (geo=605)\n",
- " aca_ptc/tax_unit_count (geo=606)\n",
- " aca_ptc/tax_unit_count (geo=607)\n",
- " aca_ptc/tax_unit_count (geo=608)\n",
- " aca_ptc/tax_unit_count (geo=609)\n",
- " aca_ptc/tax_unit_count (geo=610)\n",
- " aca_ptc/tax_unit_count (geo=611)\n",
- " aca_ptc/tax_unit_count (geo=612)\n",
- " aca_ptc/tax_unit_count (geo=613)\n",
- " aca_ptc/tax_unit_count (geo=614)\n",
- " aca_ptc/tax_unit_count (geo=615)\n",
- " aca_ptc/tax_unit_count (geo=616)\n",
- " aca_ptc/tax_unit_count (geo=617)\n",
- " aca_ptc/tax_unit_count (geo=618)\n",
- " aca_ptc/tax_unit_count (geo=619)\n",
- " aca_ptc/tax_unit_count (geo=620)\n",
- " aca_ptc/tax_unit_count (geo=621)\n",
- " aca_ptc/tax_unit_count (geo=622)\n",
- " aca_ptc/tax_unit_count (geo=623)\n",
- " aca_ptc/tax_unit_count (geo=624)\n",
- " aca_ptc/tax_unit_count (geo=625)\n",
- " aca_ptc/tax_unit_count (geo=626)\n",
- " aca_ptc/tax_unit_count (geo=627)\n",
- " aca_ptc/tax_unit_count (geo=628)\n",
- " aca_ptc/tax_unit_count (geo=629)\n",
- " aca_ptc/tax_unit_count (geo=630)\n",
- " aca_ptc/tax_unit_count (geo=631)\n",
- " aca_ptc/tax_unit_count (geo=632)\n",
- " aca_ptc/tax_unit_count (geo=633)\n",
- " aca_ptc/tax_unit_count (geo=634)\n",
- " aca_ptc/tax_unit_count (geo=635)\n",
- " aca_ptc/tax_unit_count (geo=636)\n",
- " aca_ptc/tax_unit_count (geo=637)\n",
- " aca_ptc/tax_unit_count (geo=638)\n",
- " aca_ptc/tax_unit_count (geo=639)\n",
- " aca_ptc/tax_unit_count (geo=640)\n",
- " aca_ptc/tax_unit_count (geo=641)\n",
- " aca_ptc/tax_unit_count (geo=642)\n",
- " aca_ptc/tax_unit_count (geo=643)\n",
- " aca_ptc/tax_unit_count (geo=644)\n",
- " aca_ptc/tax_unit_count (geo=645)\n",
- " aca_ptc/tax_unit_count (geo=646)\n",
- " aca_ptc/tax_unit_count (geo=647)\n",
- " aca_ptc/tax_unit_count (geo=648)\n",
- " aca_ptc/tax_unit_count (geo=649)\n",
- " aca_ptc/tax_unit_count (geo=650)\n",
- " aca_ptc/tax_unit_count (geo=651)\n",
- " aca_ptc/tax_unit_count (geo=652)\n",
- " aca_ptc/tax_unit_count (geo=801)\n",
- " aca_ptc/tax_unit_count (geo=802)\n",
- " aca_ptc/tax_unit_count (geo=803)\n",
- " aca_ptc/tax_unit_count (geo=804)\n",
- " aca_ptc/tax_unit_count (geo=805)\n",
- " aca_ptc/tax_unit_count (geo=806)\n",
- " aca_ptc/tax_unit_count (geo=807)\n",
- " aca_ptc/tax_unit_count (geo=808)\n",
- " aca_ptc/tax_unit_count (geo=901)\n",
- " aca_ptc/tax_unit_count (geo=902)\n",
- " aca_ptc/tax_unit_count (geo=903)\n",
- " aca_ptc/tax_unit_count (geo=904)\n",
- " aca_ptc/tax_unit_count (geo=905)\n"
+ "Achievable targets: 479\n",
+ "Impossible targets: 881\n",
+ "\n",
+ "Impossible targets by (domain, variable):\n",
+ " aca_ptc/aca_ptc: 436\n",
+ " aca_ptc/tax_unit_count: 436\n",
+ " snap/household_count: 7\n",
+ " aca_ptc/person_count: 1\n",
+ " snap/snap: 1\n"
]
}
],
@@ -1647,12 +636,15 @@
"\n",
"if n_impossible > 0:\n",
" impossible = targets_filtered[~achievable_mask]\n",
- " print(\"\\nImpossible targets:\")\n",
- " for _, r in impossible.iterrows():\n",
- " print(\n",
- " f\" {r.get('domain_variable', '?')}/{r['variable']} \"\n",
- " f\"(geo={r['geographic_id']})\"\n",
- " )"
+ " by_var = (\n",
+ " impossible.groupby([\"domain_variable\", \"variable\"])\n",
+ " .agg(count=(\"value\", \"size\"))\n",
+ " .reset_index()\n",
+ " .sort_values(\"count\", ascending=False)\n",
+ " )\n",
+ " print(\"\\nImpossible targets by (domain, variable):\")\n",
+ " for _, r in by_var.iterrows():\n",
+ " print(f\" {r['domain_variable']}/{r['variable']}: {r['count']}\")"
]
},
{
@@ -1665,11 +657,11 @@
"output_type": "stream",
"text": [
"Hardest targets (lowest row_sum / target_value ratio):\n",
- " snap/household_count (geo=3615): ratio=0.0088, row_sum=1,535, target=173,591\n",
- " snap/household_count (geo=3613): ratio=0.0110, row_sum=1,535, target=139,162\n",
- " snap/household_count (geo=621): ratio=0.0124, row_sum=1,483, target=119,148\n",
- " snap/household_count (geo=3608): ratio=0.0129, row_sum=1,535, target=118,977\n",
- " snap/household_count (geo=634): ratio=0.0130, row_sum=1,483, target=113,916\n"
+ " snap/household_count (geo=621): ratio=0.0000, row_sum=4, target=119,148\n",
+ " snap/household_count (geo=3615): ratio=0.0001, row_sum=9, target=173,591\n",
+ " snap/snap (geo=46): ratio=0.0001, row_sum=9,421, target=180,195,817\n",
+ " snap/household_count (geo=3625): ratio=0.0001, row_sum=4, target=67,315\n",
+ " snap/household_count (geo=1702): ratio=0.0001, row_sum=6, target=97,494\n"
]
}
],
@@ -1700,9 +692,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Final matrix shape: (487, 5231564)\n",
- "Final non-zero entries: 1,466,022\n",
- "Final density: 0.000575\n",
+ "Final matrix shape: (479, 35997)\n",
+ "Final non-zero entries: 9,944\n",
+ "Final density: 0.000577\n",
"\n",
"This is what the optimizer receives.\n"
]
@@ -1724,10 +716,10 @@
"\n",
"The calibration matrix pipeline has five steps:\n",
"\n",
- "1. **Build** — `SparseMatrixBuilder.build_matrix()` queries targets, applies hierarchical uprating, evaluates constraints, and assembles the sparse CSR matrix.\n",
- "2. **Read** — `MatrixTracer` decodes rows (targets) and columns (household-CD pairs) so you can verify the matrix makes sense.\n",
+ "1. **Clone + assign** — `assign_random_geography()` creates N clones of each CPS record, each with a random census block (and derived CD/state).\n",
+ "2. **Build** — `UnifiedMatrixBuilder.build_matrix()` queries targets, applies hierarchical uprating, simulates each clone with its assigned geography, and assembles the sparse CSR matrix.\n",
"3. **Groups** — `create_target_groups()` partitions rows for balanced loss weighting. `GROUPS_TO_EXCLUDE` drops redundant constraints.\n",
- "4. **Sparsity** — Most of the matrix is zero. District-level targets confine non-zeros to single CD blocks; national targets span all blocks.\n",
+ "4. **Sparsity** — Most of the matrix is zero. District-level targets confine non-zeros to clones assigned to that district; national targets span all clones.\n",
"5. **Filter** — Remove impossible targets (row sum = 0) before handing to the optimizer.\n",
"\n",
"When adding new domains or variables to the calibration, re-run this notebook to verify the new targets appear correctly and don't introduce impossible constraints."
@@ -1755,4 +747,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/docs/hierarchical_uprating.ipynb b/docs/hierarchical_uprating.ipynb
index 76530225c..4da30d82c 100644
--- a/docs/hierarchical_uprating.ipynb
+++ b/docs/hierarchical_uprating.ipynb
@@ -51,20 +51,16 @@
"import pandas as pd\n",
"\n",
"from policyengine_us_data.storage import STORAGE_FOLDER\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (\n",
- " SparseMatrixBuilder,\n",
+ "from policyengine_us_data.calibration.unified_matrix_builder import (\n",
+ " UnifiedMatrixBuilder,\n",
")\n",
"from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n",
- " get_all_cds_from_database,\n",
" STATE_CODES,\n",
")\n",
"\n",
"db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
"db_uri = f\"sqlite:///{db_path}\"\n",
- "cds = get_all_cds_from_database(db_uri)\n",
- "builder = SparseMatrixBuilder(\n",
- " db_uri, time_period=2024, cds_to_calibrate=cds\n",
- ")"
+ "builder = UnifiedMatrixBuilder(db_uri, time_period=2024)"
]
},
{
diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb
index c21264e9a..2e8614aa9 100644
--- a/docs/local_area_calibration_setup.ipynb
+++ b/docs/local_area_calibration_setup.ipynb
@@ -7,7 +7,21 @@
"source": [
"# Local Area Calibration Setup\n",
"\n",
- "This notebook demonstrates the sparse matrix construction for local area (congressional district) calibration. It uses a subset of CDs from NC, HI, MT, and AK for manageable runtime."
+ "This notebook demonstrates the clone-based calibration pipeline: how raw CPS records become a calibration matrix and, ultimately, CD-level stacked datasets.\n",
+ "\n",
+ "The paradigm shift from the old approach: instead of replicating every household into every congressional district, we **clone** each record N times and assign each clone a **random census block** drawn from a population-weighted distribution. Each clone inherits a state, CD, and block — and gets re-simulated under the rules of its assigned state.\n",
+ "\n",
+ "We follow one household (`record_idx=8629`, household_id 128694, SNAP \\$18,396) through the entire pipeline:\n",
+ "1. Clone and assign geography\n",
+ "2. Simulate under new state rules (`_simulate_clone`)\n",
+ "3. Geographic column masking\n",
+ "4. Re-randomize takeup per census block\n",
+ "5. Build the calibration matrix\n",
+ "6. Create stacked datasets from calibrated weights\n",
+ "\n",
+ "**Companion notebook:** [calibration_matrix.ipynb](calibration_matrix.ipynb) covers the *finished* matrix — row/column anatomy, target groups, sparsity. This notebook covers the *process* that creates it and what happens after (stacked datasets).\n",
+ "\n",
+ "**Requirements:** `policy_data.db`, `block_cd_distributions.csv.gz`, and the stratified CPS h5 file in `STORAGE_FOLDER`."
]
},
{
@@ -23,24 +37,52 @@
"execution_count": 1,
"id": "cell-2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/baogorek/envs/sep/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
"source": [
- "from sqlalchemy import create_engine, text\n",
- "import pandas as pd\n",
"import numpy as np\n",
+ "import pandas as pd\n",
+ "from collections import defaultdict\n",
"\n",
"from policyengine_us import Microsimulation\n",
"from policyengine_us_data.storage import STORAGE_FOLDER\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (\n",
- " SparseMatrixBuilder,\n",
+ "from policyengine_us_data.calibration.clone_and_assign import (\n",
+ " assign_random_geography,\n",
+ " GeographyAssignment,\n",
+ " load_global_block_distribution,\n",
+ ")\n",
+ "from policyengine_us_data.calibration.unified_matrix_builder import (\n",
+ " UnifiedMatrixBuilder,\n",
")\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (\n",
- " MatrixTracer,\n",
+ "from policyengine_us_data.calibration.unified_calibration import (\n",
+ " rerandomize_takeup,\n",
+ " SIMPLE_TAKEUP_VARS,\n",
")\n",
+ "from policyengine_us_data.utils.randomness import seeded_rng\n",
+ "from policyengine_us_data.parameters import load_take_up_rate\n",
"from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n",
" get_calculated_variables,\n",
- " create_target_groups,\n",
- ")"
+ " STATE_CODES,\n",
+ " get_all_cds_from_database,\n",
+ ")\n",
+ "from policyengine_us_data.datasets.cps.local_area_calibration.stacked_dataset_builder import (\n",
+ " create_sparse_cd_stacked_dataset,\n",
+ ")\n",
+ "\n",
+ "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
+ "db_uri = f\"sqlite:///{db_path}\"\n",
+ "dataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\")\n",
+ "\n",
+ "N_CLONES = 3\n",
+ "SEED = 42"
]
},
{
@@ -48,13 +90,30 @@
"execution_count": 2,
"id": "cell-3",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Base dataset: 11,999 households\n",
+ "Example household: record_idx=8629, household_id=128694, SNAP=$18,396.00\n"
+ ]
+ }
+ ],
"source": [
- "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n",
- "db_uri = f\"sqlite:///{db_path}\"\n",
- "dataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\")\n",
+ "sim = Microsimulation(dataset=dataset_path)\n",
+ "hh_ids = sim.calculate(\"household_id\", map_to=\"household\").values\n",
+ "snap_values = sim.calculate(\"snap\", map_to=\"household\").values\n",
+ "n_records = len(hh_ids)\n",
"\n",
- "engine = create_engine(db_uri)"
+ "record_idx = 8629 # High SNAP ($18k), lands in TX/PA/NY with seed=42\n",
+ "example_hh_id = hh_ids[record_idx]\n",
+ "print(f\"Base dataset: {n_records:,} households\")\n",
+ "print(\n",
+ " f\"Example household: record_idx={record_idx}, \"\n",
+ " f\"household_id={example_hh_id}, \"\n",
+ " f\"SNAP=${snap_values[record_idx]:,.2f}\"\n",
+ ")"
]
},
{
@@ -62,13 +121,9 @@
"id": "cell-4",
"metadata": {},
"source": [
- "## Section 2: Select Test Congressional Districts\n",
+ "## Section 2: Geography Assignment\n",
"\n",
- "We use CDs from 4 states for testing:\n",
- "- **NC (37)**: 14 CDs (3701-3714) - provides same-state different-CD test cases\n",
- "- **HI (15)**: 2 CDs (1501-1502)\n",
- "- **MT (30)**: 2 CDs (3001-3002)\n",
- "- **AK (2)**: 1 CD (200)"
+ "`assign_random_geography` creates `n_records * n_clones` total records, each assigned a random census block from a population-weighted distribution. State and CD are derived from the block GEOID. The result is a `GeographyAssignment` dataclass with arrays indexed as `clone_idx * n_records + record_idx`."
]
},
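+ {
+ "cell_type": "markdown",
+ "id": "cell-5b",
+ "metadata": {},
+ "source": [
+ "A quick sanity check of that layout (a sketch; `to_col` and `from_col` are illustrative helpers, and the column values match the example household table below):\n",
+ "\n",
+ "```python\n",
+ "n_records, n_clones = 11_999, 3\n",
+ "\n",
+ "def to_col(clone_idx, record_idx):\n",
+ "    # forward: col = clone_idx * n_records + record_idx\n",
+ "    return clone_idx * n_records + record_idx\n",
+ "\n",
+ "def from_col(col):\n",
+ "    # inverse: divmod recovers (clone_idx, record_idx)\n",
+ "    return divmod(col, n_records)\n",
+ "\n",
+ "assert to_col(1, 8629) == 20628\n",
+ "assert from_col(32627) == (2, 8629)\n",
+ "```"
+ ]
+ },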
{
@@ -81,557 +136,850 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Testing with 19 congressional districts:\n",
- " NC (37): ['3701', '3702', '3703', '3704', '3705', '3706', '3707', '3708', '3709', '3710', '3711', '3712', '3713', '3714']\n",
- " HI (15): ['1501', '1502']\n",
- " MT (30): ['3001', '3002']\n",
- " AK (2): ['201']\n"
+ "Total cloned records: 35,997\n",
+ "Unique states: 50\n",
+ "Unique CDs: 435\n",
+ "Unique blocks: 35508\n"
]
}
],
"source": [
- "query = \"\"\"\n",
- "SELECT DISTINCT sc.value as cd_geoid\n",
- "FROM stratum_constraints sc\n",
- "WHERE sc.constraint_variable = 'congressional_district_geoid'\n",
- " AND (\n",
- " sc.value LIKE '37__'\n",
- " OR sc.value LIKE '150_'\n",
- " OR sc.value LIKE '300_'\n",
- " OR sc.value = '200' OR sc.value = '201'\n",
- " )\n",
- "ORDER BY sc.value\n",
- "\"\"\"\n",
- "\n",
- "with engine.connect() as conn:\n",
- " result = conn.execute(text(query)).fetchall()\n",
- " test_cds = [row[0] for row in result]\n",
- "\n",
- "print(f\"Testing with {len(test_cds)} congressional districts:\")\n",
- "print(f\" NC (37): {[cd for cd in test_cds if cd.startswith('37')]}\")\n",
- "print(f\" HI (15): {[cd for cd in test_cds if cd.startswith('15')]}\")\n",
- "print(f\" MT (30): {[cd for cd in test_cds if cd.startswith('30')]}\")\n",
- "print(f\" AK (2): {[cd for cd in test_cds if cd.startswith('20')]}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "cell-6",
- "metadata": {},
- "source": [
- "## Section 3: Build the Sparse Matrix\n",
+ "geography = assign_random_geography(n_records, n_clones=N_CLONES, seed=SEED)\n",
+ "n_total = n_records * N_CLONES\n",
"\n",
- "The sparse matrix `X_sparse` has:\n",
- "- **Rows**: Calibration targets (e.g., SNAP totals by geography)\n",
- "- **Columns**: (household × CD) pairs - each household appears once per CD\n",
- "\n",
- "We filter to SNAP targets using the `domain_variables` filter for this demonstration."
+ "print(f\"Total cloned records: {n_total:,}\")\n",
+ "print(f\"Unique states: {len(np.unique(geography.state_fips))}\")\n",
+ "print(f\"Unique CDs: {len(np.unique(geography.cd_geoid))}\")\n",
+ "print(f\"Unique blocks: {len(np.unique(geography.block_geoid))}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
- "id": "cell-7",
+ "id": "cell-6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "X_sparse shape: (539, 227981)\n",
- " Rows (targets): 539\n",
- " Columns (household × CD pairs): 227981\n",
- " Non-zero entries: 141,536\n",
- " Sparsity: 99.88%\n"
+ "Example household (record_idx=8629) across 3 clones:\n",
+ "\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " clone | \n",
+ " col | \n",
+ " state_fips | \n",
+ " abbr | \n",
+ " cd_geoid | \n",
+ " block_geoid | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 8629 | \n",
+ " 48 | \n",
+ " TX | \n",
+ " 4817 | \n",
+ " 481450004002026 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 20628 | \n",
+ " 42 | \n",
+ " PA | \n",
+ " 4201 | \n",
+ " 420171058013029 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 32627 | \n",
+ " 36 | \n",
+ " NY | \n",
+ " 3611 | \n",
+ " 360850208041023 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " clone col state_fips abbr cd_geoid block_geoid\n",
+ "0 0 8629 48 TX 4817 481450004002026\n",
+ "1 1 20628 42 PA 4201 420171058013029\n",
+ "2 2 32627 36 NY 3611 360850208041023"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "sim = Microsimulation(dataset=dataset_path)\n",
- "\n",
- "builder = SparseMatrixBuilder(\n",
- " db_uri,\n",
- " time_period=2024,\n",
- " cds_to_calibrate=test_cds,\n",
- " dataset_path=dataset_path,\n",
+ "print(\n",
+ " f\"Example household (record_idx={record_idx}) across {N_CLONES} clones:\\n\"\n",
")\n",
- "\n",
- "targets_df, X_sparse, household_id_mapping = builder.build_matrix(\n",
- " sim, target_filter={\"domain_variables\": [\"snap\"], \"variables\": [\"snap\"]}\n",
- ")\n",
- "\n",
- "print(f\"X_sparse shape: {X_sparse.shape}\")\n",
- "print(f\" Rows (targets): {X_sparse.shape[0]}\")\n",
- "print(f\" Columns (household × CD pairs): {X_sparse.shape[1]}\")\n",
- "print(f\" Non-zero entries: {X_sparse.nnz:,}\")\n",
- "print(f\" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}\")"
+ "rows = []\n",
+ "for c in range(N_CLONES):\n",
+ " col = c * n_records + record_idx\n",
+ " rows.append(\n",
+ " {\n",
+ " \"clone\": c,\n",
+ " \"col\": col,\n",
+ " \"state_fips\": geography.state_fips[col],\n",
+ " \"abbr\": STATE_CODES.get(geography.state_fips[col], \"??\"),\n",
+ " \"cd_geoid\": geography.cd_geoid[col],\n",
+ " \"block_geoid\": geography.block_geoid[col],\n",
+ " }\n",
+ " )\n",
+ "pd.DataFrame(rows)"
]
},
{
"cell_type": "markdown",
- "id": "cell-8",
+ "id": "cell-7",
"metadata": {},
"source": [
- "## Section 4: Understanding the Matrix Structure with MatrixTracer\n",
+ "One household, three parallel geographic identities. Each clone will be simulated under different state rules, producing different benefit amounts.\n",
"\n",
- "The `MatrixTracer` helps navigate the sparse matrix by providing lookups between:\n",
- "- Column indices ↔ (household_id, CD) pairs\n",
- "- Row indices ↔ target definitions"
+ "**Note:** With only N_CLONES=3 (~36K total samples), small-population areas like DC may not appear in the random draw. The production pipeline uses N_CLONES=10, which covers all 51 state-equivalents and 436 CDs."
]
},
{
"cell_type": "code",
"execution_count": 5,
- "id": "cell-9",
+ "id": "cell-8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "================================================================================\n",
- "MATRIX STRUCTURE BREAKDOWN\n",
- "================================================================================\n",
- "\n",
- "Matrix dimensions: 539 rows x 227981 columns\n",
- " Rows = 539 targets\n",
- " Columns = 11999 households x 19 CDs\n",
- " = 11,999 x 19 = 227,981\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "COLUMN STRUCTURE (Households stacked by CD)\n",
- "--------------------------------------------------------------------------------\n",
- "\n",
- "Showing first and last 5 CDs of 19 total:\n",
- "\n",
- "First 5 CDs:\n",
- "cd_geoid start_col end_col n_households\n",
- " 1501 0 11998 11999\n",
- " 1502 11999 23997 11999\n",
- " 201 23998 35996 11999\n",
- " 3001 35997 47995 11999\n",
- " 3002 47996 59994 11999\n",
- "\n",
- "Last 5 CDs:\n",
- "cd_geoid start_col end_col n_households\n",
- " 3710 167986 179984 11999\n",
- " 3711 179985 191983 11999\n",
- " 3712 191984 203982 11999\n",
- " 3713 203983 215981 11999\n",
- " 3714 215982 227980 11999\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "ROW STRUCTURE (Targets)\n",
- "--------------------------------------------------------------------------------\n",
- "\n",
- "Total targets: 539\n",
- "\n",
- "Targets by domain variable:\n",
- " n_targets n_unique_vars\n",
- "domain_variable \n",
- "snap 538 2\n",
- "\n",
- "--------------------------------------------------------------------------------\n",
- "TARGET GROUPS (for loss calculation)\n",
- "--------------------------------------------------------------------------------\n",
- "\n",
- "=== Creating Target Groups ===\n",
- "\n",
- "National targets:\n",
- " Group 0: Snap = 93,730,290,000\n",
- "\n",
- "State targets:\n",
- " Group 1: SNAP Household Count (51 targets)\n",
- " Group 2: Snap (51 targets)\n",
- "\n",
- "District targets:\n",
- " Group 3: SNAP Household Count (436 targets)\n",
- "\n",
- "Total groups created: 4\n",
- "========================================\n",
- " Group 0: National Snap (1 target, value=93,730,290,000) - rows [0]\n",
- " Group 1: State SNAP Household Count (51 targets) - rows [1, 2, 3, ..., 50, 51]\n",
- " Group 2: State Snap (51 targets) - rows [52, 53, 54, ..., 101, 102]\n",
- " Group 3: District SNAP Household Count (436 targets) - rows [103, 104, 105, ..., 537, 538]\n",
- "\n",
- "================================================================================\n"
+ "Global block distribution: 5,765,442 blocks\n",
+ "Top 5 states by total probability:\n",
+ " CA (6): 11.954%\n",
+ " TX (48): 8.736%\n",
+ " FL (12): 6.437%\n",
+ " NY (36): 5.977%\n",
+ " PA (42): 3.908%\n"
]
}
],
"source": [
- "tracer = MatrixTracer(\n",
- " targets_df, X_sparse, household_id_mapping, test_cds, sim\n",
- ")\n",
+ "blocks, cds, states, probs = load_global_block_distribution()\n",
+ "print(f\"Global block distribution: {len(blocks):,} blocks\")\n",
+ "print(f\"Top 5 states by total probability:\")\n",
+ "state_prob = pd.Series(probs, index=states).groupby(level=0).sum()\n",
+ "top5 = state_prob.nlargest(5)\n",
+ "for fips, p in top5.items():\n",
+ " print(f\" {STATE_CODES.get(fips, '??')} ({fips}): {p:.3%}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-9",
+ "metadata": {},
+ "source": [
+ "## Section 3: Inside `_simulate_clone` — State-Swap\n",
+ "\n",
+ "For each clone, `_simulate_clone` does four things:\n",
+ "1. Creates a **fresh** `Microsimulation` from the base dataset\n",
+ "2. Overwrites `state_fips` with the clone's assigned states\n",
+ "3. Optionally calls a `sim_modifier` (e.g., takeup re-randomization)\n",
+ "4. **Clears cached formulas** via `get_calculated_variables` — preserving survey inputs and IDs while forcing recalculation of state-dependent variables like SNAP\n",
"\n",
- "tracer.print_matrix_structure()"
+ "Let's reproduce this manually for clone 0."
]
},
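+ {
+ "cell_type": "markdown",
+ "id": "cell-9b",
+ "metadata": {},
+ "source": [
+ "For orientation, here are those four steps in one hedged sketch (the function name and signature are illustrative, not the actual implementation; the real callback receives more arguments than shown):\n",
+ "\n",
+ "```python\n",
+ "def simulate_clone_sketch(dataset_path, clone_states, sim_modifier=None):\n",
+ "    s = Microsimulation(dataset=dataset_path)      # 1. fresh simulation\n",
+ "    s.set_input(\"state_fips\", 2024, clone_states)  # 2. assigned states\n",
+ "    if sim_modifier is not None:\n",
+ "        sim_modifier(s)                            # 3. e.g. takeup re-randomization\n",
+ "    for var in get_calculated_variables(s):        # 4. clear cached formulas\n",
+ "        s.delete_arrays(var)\n",
+ "    return s\n",
+ "```"
+ ]
+ },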
{
"cell_type": "code",
"execution_count": 6,
- "id": "cell-11",
+ "id": "cell-10",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "\n",
- "=== Creating Target Groups ===\n",
- "\n",
- "National targets:\n",
- " Group 0: Snap = 93,730,290,000\n",
- "\n",
- "State targets:\n",
- " Group 1: SNAP Household Count (51 targets)\n",
- " Group 2: Snap (51 targets)\n",
- "\n",
- "District targets:\n",
- " Group 3: SNAP Household Count (436 targets)\n",
- "\n",
- "Total groups created: 4\n",
- "========================================\n"
+ "Example household (record_idx=8629):\n",
+ " Original state: NC (37)\n",
+ " Clone 0 state: TX (48)\n",
+ " Original SNAP: $18,396.00\n",
+ " Clone 0 SNAP: $18,396.00\n"
]
}
],
"source": [
- "target_groups, group_info = create_target_groups(targets_df)"
+ "clone_idx = 0\n",
+ "col_start = clone_idx * n_records\n",
+ "col_end = col_start + n_records\n",
+ "clone_states = geography.state_fips[col_start:col_end]\n",
+ "\n",
+ "clone_sim = Microsimulation(dataset=dataset_path)\n",
+ "clone_sim.set_input(\"state_fips\", 2024, clone_states.astype(np.int32))\n",
+ "for var in get_calculated_variables(clone_sim):\n",
+ " clone_sim.delete_arrays(var)\n",
+ "\n",
+ "new_snap = clone_sim.calculate(\"snap\", map_to=\"household\").values\n",
+ "\n",
+ "orig_state = sim.calculate(\"state_fips\", map_to=\"household\").values[record_idx]\n",
+ "new_state = clone_states[record_idx]\n",
+ "\n",
+ "print(f\"Example household (record_idx={record_idx}):\")\n",
+ "print(\n",
+ " f\" Original state: {STATE_CODES.get(int(orig_state), '??')} \"\n",
+ " f\"({int(orig_state)})\"\n",
+ ")\n",
+ "print(\n",
+ " f\" Clone 0 state: {STATE_CODES.get(int(new_state), '??')} \"\n",
+ " f\"({int(new_state)})\"\n",
+ ")\n",
+ "print(f\" Original SNAP: ${snap_values[record_idx]:,.2f}\")\n",
+ "print(f\" Clone 0 SNAP: ${new_snap[record_idx]:,.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
- "id": "7e75756b-a317-4800-bac5-e0fd6bc43b8c",
+ "id": "cell-11",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Row info for North Carolina's SNAP benefit amount:\n",
- "{'row_index': 80, 'variable': 'snap', 'variable_desc': 'SNAP allotment', 'geographic_id': '37', 'target_value': 2934626410.0, 'stratum_id': 9363, 'domain_variable': 'snap'}\n"
+ "SNAP for record_idx=8629 across all 3 clones:\n",
+ "\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " clone | \n",
+ " state | \n",
+ " state_fips | \n",
+ " SNAP | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " TX | \n",
+ " 48 | \n",
+ " $18,396.00 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " PA | \n",
+ " 42 | \n",
+ " $18,396.00 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " NY | \n",
+ " 36 | \n",
+ " $18,396.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " clone state state_fips SNAP\n",
+ "0 0 TX 48 $18,396.00\n",
+ "1 1 PA 42 $18,396.00\n",
+ "2 2 NY 36 $18,396.00"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "target_group = tracer.get_group_rows(2)\n",
- "row_loc = target_group.iloc[28]['row_index'] # Manually found the index value 28\n",
- "row_info = tracer.get_row_info(row_loc)\n",
- "var = row_info['variable']\n",
- "var_desc = row_info['variable_desc']\n",
- "target_geo_id = int(row_info['geographic_id'])\n",
+ "print(f\"SNAP for record_idx={record_idx} across all {N_CLONES} clones:\\n\")\n",
+ "rows = []\n",
+ "for c in range(N_CLONES):\n",
+ " cs = geography.state_fips[c * n_records + record_idx]\n",
+ " s = Microsimulation(dataset=dataset_path)\n",
+ " s.set_input(\n",
+ " \"state_fips\",\n",
+ " 2024,\n",
+ " geography.state_fips[c * n_records : (c + 1) * n_records].astype(\n",
+ " np.int32\n",
+ " ),\n",
+ " )\n",
+ " for var in get_calculated_variables(s):\n",
+ " s.delete_arrays(var)\n",
+ " clone_snap = s.calculate(\"snap\", map_to=\"household\").values\n",
+ " rows.append(\n",
+ " {\n",
+ " \"clone\": c,\n",
+ " \"state\": STATE_CODES.get(int(cs), \"??\"),\n",
+ " \"state_fips\": int(cs),\n",
+ " \"SNAP\": f\"${clone_snap[record_idx]:,.2f}\",\n",
+ " }\n",
+ " )\n",
+ "pd.DataFrame(rows)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-12",
+ "metadata": {},
+ "source": [
+ "`get_calculated_variables` is selective: it identifies variables with formulas (state-dependent computations) while preserving survey-reported inputs and entity IDs. This is what allows the same demographic household to produce different benefit amounts under different state rules."
+ ]
+ },
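+ {
+ "cell_type": "markdown",
+ "id": "cell-12b",
+ "metadata": {},
+ "source": [
+ "A rough sketch of that selection logic, assuming PolicyEngine's `tax_benefit_system.variables` mapping where formula-backed variables carry a non-empty `formulas` dict (the real helper in `calibration_utils` may differ):\n",
+ "\n",
+ "```python\n",
+ "def get_calculated_variables_sketch(sim):\n",
+ "    # Keep only formula-backed variables; survey inputs and entity IDs\n",
+ "    # have no formulas, so they survive the delete_arrays() sweep.\n",
+ "    return [\n",
+ "        name\n",
+ "        for name, variable in sim.tax_benefit_system.variables.items()\n",
+ "        if variable.formulas\n",
+ "    ]\n",
+ "```"
+ ]
+ },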
+ {
+ "cell_type": "markdown",
+ "id": "cell-13",
+ "metadata": {},
+ "source": [
+ "## Section 4: Geographic Column Masking\n",
"\n",
- "print(\"Row info for North Carolina's SNAP benefit amount:\")\n",
- "print(row_info)"
+ "When assembling the calibration matrix, each target row only \"sees\" columns (clones) whose geography matches the target's geography. This is implemented via `state_to_cols` and `cd_to_cols` dictionaries built from the `GeographyAssignment`.\n",
+ "\n",
+ "This is step 3 of `build_matrix` — reproduced here for transparency."
]
},
{
"cell_type": "code",
"execution_count": 8,
- "id": "c2be9721-ff11-4f78-ba0b-03407201dd53",
+ "id": "cell-14",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- " household_id household_weight state_fips snap\n",
- "0 26 1205.310059 23 0.0\n",
- "1 34 2170.419922 23 0.0\n",
- "2 38 587.510010 23 0.0\n",
- "3 46 1010.840027 23 0.0\n",
- "4 71 957.460022 23 0.0\n",
- "... ... ... ... ...\n",
- "11994 177822 0.000000 15 0.0\n",
- "11995 177829 0.000000 15 0.0\n",
- "11996 177831 0.000000 15 0.0\n",
- "11997 177860 0.000000 15 6294.0\n",
- "11998 177861 0.000000 15 0.0\n",
+ "Unique states mapped: 50\n",
+ "Unique CDs mapped: 435\n",
"\n",
- "[11999 rows x 4 columns]\n"
+ "Columns per state: min=62, median=494, max=4311\n"
]
}
],
"source": [
- "hh_snap_df = pd.DataFrame(sim.calculate_dataframe([\n",
- " \"household_id\", \"household_weight\", \"state_fips\", \"snap\"]) \n",
- ")\n",
- "print(hh_snap_df)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "438828ac-df94-4d3e-a9a8-227bb6f64933",
- "metadata": {},
- "source": [
- "If we were to include `congressional_district_geoid` above, they would all be zeros. It's not until we do the calibration, i.e., come back with a vector of weights `w` to multiply `X_sparse` with, that we will set `congressional_district_geoid`.\n",
+ "state_col_lists = defaultdict(list)\n",
+ "cd_col_lists = defaultdict(list)\n",
+ "for col in range(n_total):\n",
+ " state_col_lists[int(geography.state_fips[col])].append(col)\n",
+ " cd_col_lists[str(geography.cd_geoid[col])].append(col)\n",
"\n",
- "However, every household is already a donor to every contressional district. You can get the column positions for every household (remember targets are on the rows, donor households on the columns) by running tracer's get_household_column_positions with the *original* `household_id`."
+ "state_to_cols = {s: np.array(c) for s, c in state_col_lists.items()}\n",
+ "cd_to_cols = {cd: np.array(c) for cd, c in cd_col_lists.items()}\n",
+ "\n",
+ "print(f\"Unique states mapped: {len(state_to_cols)}\")\n",
+ "print(f\"Unique CDs mapped: {len(cd_to_cols)}\")\n",
+ "\n",
+ "state_counts = {s: len(c) for s, c in state_to_cols.items()}\n",
+ "sc_series = pd.Series(state_counts)\n",
+ "print(\n",
+ " f\"\\nColumns per state: min={sc_series.min()}, \"\n",
+ " f\"median={sc_series.median():.0f}, max={sc_series.max()}\"\n",
+ ")"
]
},
{
"cell_type": "code",
"execution_count": 9,
- "id": "cell-12",
+ "id": "cell-15",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- " household_id household_weight state_fips snap\n",
- "23 654 1550.660034 23 70.080002\n",
+ "Example household clone visibility:\n",
"\n",
- "Evaluating the tracer.get_household_column_positions dictionary:\n",
+ "Clone 0 (TX, CD 4817):\n",
+ " Visible to TX state targets: col 8629 in state_to_cols[48]? True\n",
+ " Visible to CD 4817 targets: col 8629 in cd_to_cols['4817']? True\n",
+ " Visible to NC (37) targets: False\n",
"\n",
- "{'1501': 23, '1502': 12022, '201': 24021, '3001': 36020, '3002': 48019, '3701': 60018, '3702': 72017, '3703': 84016, '3704': 96015, '3705': 108014, '3706': 120013, '3707': 132012, '3708': 144011, '3709': 156010, '3710': 168009, '3711': 180008, '3712': 192007, '3713': 204006, '3714': 216005}\n"
+ "Clone 1 (PA, CD 4201):\n",
+ " Visible to PA state targets: col 20628 in state_to_cols[42]? True\n",
+ " Visible to CD 4201 targets: col 20628 in cd_to_cols['4201']? True\n",
+ " Visible to NC (37) targets: False\n",
+ "\n",
+ "Clone 2 (NY, CD 3611):\n",
+ " Visible to NY state targets: col 32627 in state_to_cols[36]? True\n",
+ " Visible to CD 3611 targets: col 32627 in cd_to_cols['3611']? True\n",
+ " Visible to NC (37) targets: False\n",
+ "\n"
]
}
],
"source": [
- "# Reverse lookup: get all column positions for a specific household\n",
- "hh_id = hh_snap_df.loc[hh_snap_df.snap > 0].household_id.values[0]\n",
- "print(hh_snap_df.loc[hh_snap_df.household_id == hh_id])\n",
- "\n",
- "print(\"\\nEvaluating the tracer.get_household_column_positions dictionary:\\n\")\n",
- "positions = tracer.get_household_column_positions(hh_id)\n",
- "print(positions)"
+ "print(f\"Example household clone visibility:\\n\")\n",
+ "for c in range(N_CLONES):\n",
+ " col = c * n_records + record_idx\n",
+ " state = int(geography.state_fips[col])\n",
+ " cd = str(geography.cd_geoid[col])\n",
+ " abbr = STATE_CODES.get(state, \"??\")\n",
+ " print(f\"Clone {c} ({abbr}, CD {cd}):\")\n",
+ " print(\n",
+ " f\" Visible to {abbr} state targets: \"\n",
+ " f\"col {col} in state_to_cols[{state}]? \"\n",
+ " f\"{col in state_to_cols.get(state, [])}\"\n",
+ " )\n",
+ " print(\n",
+ " f\" Visible to CD {cd} targets: \"\n",
+ " f\"col {col} in cd_to_cols['{cd}']? \"\n",
+ " f\"{col in cd_to_cols.get(cd, [])}\"\n",
+ " )\n",
+ " # Check an unrelated state\n",
+ " print(\n",
+ " f\" Visible to NC (37) targets: \" f\"{col in state_to_cols.get(37, [])}\"\n",
+ " )\n",
+ " print()"
]
},
{
"cell_type": "markdown",
- "id": "cell-13",
+ "id": "cell-16",
+ "metadata": {},
+ "source": [
+ "This is the mechanism behind the sparsity pattern in `calibration_matrix.ipynb`: a household clone assigned to TX can contribute to TX state targets and TX CD targets, but produces a zero entry for NC or AK targets. The matrix is sparse because each clone only intersects a small fraction of all geographic targets."
+ ]
+ },
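+ {
+ "cell_type": "markdown",
+ "id": "cell-16b",
+ "metadata": {},
+ "source": [
+ "A one-line partition check makes the point: every column lands in exactly one state bucket and exactly one CD bucket, so a geographic target row can only ever touch its own bucket (a sketch using the dictionaries built above):\n",
+ "\n",
+ "```python\n",
+ "assert sum(len(c) for c in state_to_cols.values()) == n_total\n",
+ "assert sum(len(c) for c in cd_to_cols.values()) == n_total\n",
+ "```"
+ ]
+ },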
+ {
+ "cell_type": "markdown",
+ "id": "cell-17",
"metadata": {},
"source": [
- "## Section 5: Understanding the cells of the X_Sparse matrix and Target vector"
+ "## Section 5: Takeup Re-randomization\n",
+ "\n",
+ "The base CPS has fixed takeup decisions (e.g., \"this household takes up SNAP\"). But when we clone a household into different census blocks, each block should have independently drawn takeup — otherwise every clone of a SNAP-participating household would still participate, regardless of geography.\n",
+ "\n",
+ "`rerandomize_takeup` solves this: for each census block, it uses `seeded_rng(variable_name, salt=block_geoid)` to draw new takeup booleans. The seed is deterministic per (variable, block) pair, so results are reproducible."
]
},
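+ {
+ "cell_type": "markdown",
+ "id": "cell-17b",
+ "metadata": {},
+ "source": [
+ "The per-block draw can be pictured like this (a minimal sketch with illustrative names; the real function also handles entity mapping and state-specific rate dictionaries):\n",
+ "\n",
+ "```python\n",
+ "def redraw_takeup_sketch(blocks, rate, variable):\n",
+ "    # blocks: one block GEOID per entity; one RNG per (variable, block)\n",
+ "    takeup = np.zeros(len(blocks), dtype=bool)\n",
+ "    for block in np.unique(blocks):\n",
+ "        mask = blocks == block\n",
+ "        rng = seeded_rng(variable, salt=block)\n",
+ "        takeup[mask] = rng.random(mask.sum()) < rate\n",
+ "    return takeup\n",
+ "```"
+ ]
+ },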
{
"cell_type": "code",
"execution_count": 10,
- "id": "e05aaeab-3786-4ff0-a50b-34577065d2e0",
+ "id": "cell-18",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Remember, this is a North Carolina target:\n",
- "\n",
- "target_id 8942\n",
- "stratum_id 9363\n",
- "variable snap\n",
- "value 2934626410.0\n",
- "period 2024\n",
- "geo_level state\n",
- "geographic_id 37\n",
- "domain_variable snap\n",
- "original_value 2934626410.0\n",
- "uprating_factor 1.0\n",
- "Name: 80, dtype: object\n",
+ "8 takeup variables:\n",
"\n",
- "NC State target. Household donated to NC's 2nd district, 2024 SNAP dollars:\n",
- "70.08\n",
- "\n",
- "Same target, same household, donated to AK's at Large district, 2024 SNAP dollars:\n",
- "0.0\n"
+ " takes_up_snap_if_eligible entity=spm_unit rate=82.00%\n",
+ " takes_up_aca_if_eligible entity=tax_unit rate=67.20%\n",
+ " takes_up_dc_ptc entity=tax_unit rate=32.00%\n",
+ " takes_up_head_start_if_eligible entity=person rate=30.00%\n",
+ " takes_up_early_head_start_if_eligible entity=person rate=9.00%\n",
+ " takes_up_ssi_if_eligible entity=person rate=50.00%\n",
+ " would_file_taxes_voluntarily entity=tax_unit rate=5.00%\n",
+ " takes_up_medicaid_if_eligible entity=person rate=dict (51 entries)\n"
]
}
],
"source": [
- "print(\"Remember, this is a North Carolina target:\\n\")\n",
- "print(targets_df.iloc[row_loc])\n",
- "\n",
- "print(\"\\nNC State target. Household donated to NC's 2nd district, 2024 SNAP dollars:\")\n",
- "print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n",
- "\n",
- "print(\"\\nSame target, same household, donated to AK's at Large district, 2024 SNAP dollars:\")\n",
- "print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District"
+ "print(f\"{len(SIMPLE_TAKEUP_VARS)} takeup variables:\\n\")\n",
+ "for spec in SIMPLE_TAKEUP_VARS:\n",
+ " rate_key = spec[\"rate_key\"]\n",
+ " if rate_key == \"voluntary_filing\":\n",
+ " rate = 0.05\n",
+ " else:\n",
+ " rate = load_take_up_rate(rate_key, 2024)\n",
+ " rate_str = (\n",
+ " f\"{rate:.2%}\"\n",
+ " if isinstance(rate, float)\n",
+ " else f\"dict ({len(rate)} entries)\"\n",
+ " )\n",
+ " print(\n",
+ " f\" {spec['variable']:40s} \"\n",
+ " f\"entity={spec['entity']:10s} rate={rate_str}\"\n",
+ " )"
]
},
{
- "cell_type": "markdown",
- "id": "cell-16",
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "cell-19",
"metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Same block + same var (reproducible):\n",
+ " [0.50514599 0.75213437 0.9703409 0.18048868 0.31969517]\n",
+ " [0.50514599 0.75213437 0.9703409 0.18048868 0.31969517]\n",
+ " Match: True\n",
+ "\n",
+ "Different block, same var:\n",
+ " [0.15503168 0.96707026 0.79019745 0.67544525 0.85245009]\n",
+ " Match: False\n",
+ "\n",
+ "Same block, different var:\n",
+ " [0.93155876 0.8912794 0.50838888 0.32192278 0.01005173]\n",
+ " Match: False\n"
+ ]
+ }
+ ],
"source": [
- "Key property: For state-level targets, only CDs in that state should have non-zero values.\n",
+ "block_a = \"482011234567890\"\n",
+ "block_b = \"170311234567890\"\n",
+ "var = \"takes_up_snap_if_eligible\"\n",
"\n",
- "Example: A NC state SNAP target should have zeros for HI, MT, and AK CD columns.\n",
+ "rng_a1 = seeded_rng(var, salt=block_a)\n",
+ "rng_a2 = seeded_rng(var, salt=block_a)\n",
+ "rng_b = seeded_rng(var, salt=block_b)\n",
+ "rng_other = seeded_rng(\"takes_up_aca_if_eligible\", salt=block_a)\n",
"\n",
- "So let's see that same household's value for the Alaska state target:"
+ "draws_a1 = rng_a1.random(5)\n",
+ "draws_a2 = rng_a2.random(5)\n",
+ "draws_b = rng_b.random(5)\n",
+ "draws_other = rng_other.random(5)\n",
+ "\n",
+ "print(\"Same block + same var (reproducible):\")\n",
+ "print(f\" {draws_a1}\")\n",
+ "print(f\" {draws_a2}\")\n",
+ "print(f\" Match: {np.allclose(draws_a1, draws_a2)}\")\n",
+ "print(f\"\\nDifferent block, same var:\")\n",
+ "print(f\" {draws_b}\")\n",
+ "print(f\" Match: {np.allclose(draws_a1, draws_b)}\")\n",
+ "print(f\"\\nSame block, different var:\")\n",
+ "print(f\" {draws_other}\")\n",
+ "print(f\" Match: {np.allclose(draws_a1, draws_other)}\")"
]
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "8cdc264c-8335-40eb-afd9-4c4d023ec303",
+ "execution_count": 12,
+ "id": "cell-20",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Row info for Alaska's SNAP benefit amount:\n",
- "{'row_index': 80, 'variable': 'snap', 'variable_desc': 'SNAP allotment', 'geographic_id': '37', 'target_value': 2934626410.0, 'stratum_id': 9363, 'domain_variable': 'snap'}\n"
+ "Takeup rates before/after re-randomization (clone 0):\n",
+ "\n",
+ " takes_up_snap_if_eligible before=82.333% after=82.381%\n",
+ " takes_up_aca_if_eligible before=66.718% after=67.486%\n",
+ " takes_up_dc_ptc before=31.483% after=32.044%\n",
+ " takes_up_head_start_if_eligible before=29.963% after=29.689%\n",
+ " takes_up_early_head_start_if_eligible before=8.869% after=8.721%\n",
+ " takes_up_ssi_if_eligible before=100.000% after=49.776%\n",
+ " would_file_taxes_voluntarily before=0.000% after=4.905%\n",
+ " takes_up_medicaid_if_eligible before=84.496% after=80.051%\n"
]
}
],
"source": [
- "target_group = tracer.get_group_rows(2)\n",
- "new_row_loc = target_group.iloc[10]['row_index'] # Manually found the index value 10\n",
- "row_info = tracer.get_row_info(row_loc)\n",
- "var = row_info['variable']\n",
- "var_desc = row_info['variable_desc']\n",
- "target_geo_id = int(row_info['geographic_id'])\n",
+ "test_sim = Microsimulation(dataset=dataset_path)\n",
+ "clone_0_states = geography.state_fips[:n_records]\n",
+ "clone_0_blocks = geography.block_geoid[:n_records]\n",
+ "test_sim.set_input(\"state_fips\", 2024, clone_0_states.astype(np.int32))\n",
+ "\n",
+ "before = {}\n",
+ "for spec in SIMPLE_TAKEUP_VARS:\n",
+ " v = spec[\"variable\"]\n",
+ " vals = test_sim.calculate(v, map_to=spec[\"entity\"]).values\n",
+ " before[v] = vals.mean()\n",
+ "\n",
+ "rerandomize_takeup(test_sim, clone_0_blocks, clone_0_states, 2024)\n",
"\n",
- "print(\"Row info for Alaska's SNAP benefit amount:\")\n",
- "print(row_info)"
+ "print(\"Takeup rates before/after re-randomization (clone 0):\\n\")\n",
+ "for spec in SIMPLE_TAKEUP_VARS:\n",
+ " v = spec[\"variable\"]\n",
+ " vals = test_sim.calculate(v, map_to=spec[\"entity\"]).values\n",
+ " after = vals.mean()\n",
+ " print(f\" {v:40s} before={before[v]:.3%} after={after:.3%}\")"
]
},
{
"cell_type": "code",
- "execution_count": 12,
- "id": "ac59b6f1-859f-4246-8a05-8cb26384c882",
+ "execution_count": 13,
+ "id": "cell-21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
+ "Medicaid takeup rates (state-specific), first 10 states:\n",
"\n",
- "Household donated to AK's 1st district, 2024 SNAP dollars:\n",
- "0.0\n"
+ " AK: 88.00%\n",
+ " AL: 92.00%\n",
+ " AR: 79.00%\n",
+ " AZ: 95.00%\n",
+ " CA: 78.00%\n",
+ " CO: 99.00%\n",
+ " CT: 89.00%\n",
+ " DC: 99.00%\n",
+ " DE: 86.00%\n",
+ " FL: 98.00%\n"
]
}
],
"source": [
- "print(\"\\nHousehold donated to AK's 1st district, 2024 SNAP dollars:\")\n",
- "print(X_sparse[new_row_loc, positions['201']]) # Household donated to AK's at Large District"
+ "medicaid_rates = load_take_up_rate(\"medicaid\", 2024)\n",
+ "print(\"Medicaid takeup rates (state-specific), first 10 states:\\n\")\n",
+ "for state, rate in sorted(medicaid_rates.items())[:10]:\n",
+ " print(f\" {state}: {rate:.2%}\")"
]
},
{
"cell_type": "markdown",
- "id": "cell-18",
+ "id": "cell-22",
+ "metadata": {},
+ "source": [
+ "In the full pipeline, `rerandomize_takeup` is passed to `build_matrix` as a `sim_modifier` callback. For each clone, after `state_fips` is set but before formula caches are cleared, the callback draws new takeup booleans per census block. This means the same household in block A might take up SNAP while in block B it doesn't — matching the statistical reality that takeup varies by geography."
+ ]
+ },
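+ {
+ "cell_type": "markdown",
+ "id": "cell-22b",
+ "metadata": {},
+ "source": [
+ "A sketch of that wiring (not run here; the exact way `build_matrix` invokes the callback is assumed to mirror the direct call above):\n",
+ "\n",
+ "```python\n",
+ "targets_df, X_sparse, target_names = builder.build_matrix(\n",
+ "    geography,\n",
+ "    sim,\n",
+ "    target_filter={\"domain_variables\": [\"snap\"]},\n",
+ "    sim_modifier=rerandomize_takeup,  # called per clone with its blocks/states\n",
+ ")\n",
+ "```"
+ ]
+ },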
+ {
+ "cell_type": "markdown",
+ "id": "cell-23",
"metadata": {},
"source": [
- "## Section 6: Simulating State-Swapped Calculations\n",
+ "## Section 6: Matrix Build Verification\n",
"\n",
- "When a household is \"transplanted\" to a different state, state-dependent benefits like SNAP are recalculated under the destination state's rules."
+ "Let's run the full `build_matrix` pipeline and verify the example household's pattern matches our Section 4 predictions. We use the same `target_filter` as in `calibration_matrix.ipynb` but *without* `sim_modifier` to match that notebook's output."
]
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "cell-19",
+ "execution_count": 14,
+ "id": "cell-24",
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-02-13 17:11:22,384 - INFO - Processing clone 1/3 (cols 0-11998, 50 unique states)...\n",
+ "2026-02-13 17:11:23,509 - INFO - Processing clone 2/3 (cols 11999-23997, 50 unique states)...\n",
+ "2026-02-13 17:11:24,645 - INFO - Processing clone 3/3 (cols 23998-35996, 50 unique states)...\n",
+ "2026-02-13 17:11:25,769 - INFO - Assembling matrix from 3 clones...\n",
+ "2026-02-13 17:11:25,771 - INFO - Matrix: 538 targets x 35997 cols, 14946 nnz\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "SNAP values for first 5 households under different state rules:\n",
- " NC rules: [0. 0. 0. 0. 0.]\n",
- " AK rules: [0. 0. 0. 0. 0.]\n",
- " Difference: [0. 0. 0. 0. 0.]\n"
+ "Matrix shape: (538, 35997)\n",
+ "Non-zero entries: 14,946\n",
+ "Density: 0.000772\n"
]
}
],
"source": [
- "def create_state_simulation(state_fips):\n",
- " \"\"\"Create a simulation with all households assigned to a specific state.\"\"\"\n",
- " s = Microsimulation(dataset=dataset_path)\n",
- " s.set_input(\n",
- " \"state_fips\", 2024, np.full(hh_snap_df.shape[0], state_fips, dtype=np.int32)\n",
- " )\n",
- " for var in get_calculated_variables(s):\n",
- " s.delete_arrays(var)\n",
- " return s\n",
- "\n",
- "# Compare SNAP for first 5 households under NC vs AK rules\n",
- "nc_sim = create_state_simulation(37) # NC\n",
- "ak_sim = create_state_simulation(2) # AK\n",
+ "builder = UnifiedMatrixBuilder(\n",
+ " db_uri=db_uri,\n",
+ " time_period=2024,\n",
+ " dataset_path=dataset_path,\n",
+ ")\n",
"\n",
- "nc_snap = nc_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n",
- "ak_snap = ak_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n",
+ "targets_df, X_sparse, target_names = builder.build_matrix(\n",
+ " geography,\n",
+ " sim,\n",
+ " target_filter={\"domain_variables\": [\"snap\"]},\n",
+ ")\n",
"\n",
- "print(\"SNAP values for first 5 households under different state rules:\")\n",
- "print(f\" NC rules: {nc_snap}\")\n",
- "print(f\" AK rules: {ak_snap}\")\n",
- "print(f\" Difference: {ak_snap - nc_snap}\")"
+ "print(f\"Matrix shape: {X_sparse.shape}\")\n",
+ "print(f\"Non-zero entries: {X_sparse.nnz:,}\")\n",
+ "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "cell-25",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Example household non-zero pattern across clones:\n",
+ "\n",
+ "Clone 0 (TX, CD 4817): 3 non-zero rows\n",
+ " row 39: household_count (geo=48): 1.00\n",
+ " row 90: snap (geo=48): 18396.00\n",
+ " row 410: household_count (geo=4817): 1.00\n",
+ "Clone 1 (PA, CD 4201): 3 non-zero rows\n",
+ " row 34: household_count (geo=42): 1.00\n",
+ " row 85: snap (geo=42): 18396.00\n",
+ " row 358: household_count (geo=4201): 1.00\n",
+ "Clone 2 (NY, CD 3611): 3 non-zero rows\n",
+ " row 27: household_count (geo=36): 1.00\n",
+ " row 78: snap (geo=36): 18396.00\n",
+ " row 292: household_count (geo=3611): 1.00\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"Example household non-zero pattern across clones:\\n\")\n",
+ "for c in range(N_CLONES):\n",
+ " col = c * n_records + record_idx\n",
+ " col_vec = X_sparse[:, col]\n",
+ " nz_rows = col_vec.nonzero()[0]\n",
+ " state = int(geography.state_fips[col])\n",
+ " cd = geography.cd_geoid[col]\n",
+ " abbr = STATE_CODES.get(state, \"??\")\n",
+ " print(f\"Clone {c} ({abbr}, CD {cd}): {len(nz_rows)} non-zero rows\")\n",
+ " for r in nz_rows:\n",
+ " row = targets_df.iloc[r]\n",
+ " print(\n",
+ " f\" row {r}: {row['variable']} \"\n",
+ " f\"(geo={row['geographic_id']}): \"\n",
+ " f\"{X_sparse[r, col]:.2f}\"\n",
+ " )"
]
},
{
"cell_type": "markdown",
- "id": "a7a3b4f3-dabc-4160-a781-a529018e889f",
+ "id": "cell-26",
"metadata": {},
"source": [
- "## Section 7: Creating the h5 files\n",
- "\n",
- " `w` (required)\n",
- " - The calibrated weight vector from L0 calibration\n",
- " - Shape: (n_cds * n_households,) — a flattened matrix where each CD has weights for all households\n",
- " - Gets reshaped to (n_cds, n_households) internally\n",
+ "## Section 7: From Weights to Datasets\n",
"\n",
- " `cds_to_calibrate` (required)\n",
- " - The ordered list of CD GEOIDs used when building w\n",
- " - Serves two purposes:\n",
- " a. Tells us how to reshape w (via its length)\n",
- " b. Provides the index mapping so we can extract the right rows for any cd_subset\n",
+ "`create_sparse_cd_stacked_dataset` takes calibrated weights and builds an h5 file with only the non-zero-weight households, reindexed per CD. Internally it does its own state-swap simulation — loading the base dataset, assigning `state_fips` for the target CD's state, and recalculating benefits from scratch. This means SNAP values in the output reflect the destination state's rules (e.g., a $70 SNAP household from ME may get $0 under AK rules).\n",
"\n",
- " `cd_subset` (optional, default None)\n",
- " - Which CDs to actually include in the output dataset\n",
- " - Must be a subset of cds_to_calibrate\n",
- " - If None, all CDs are included\n",
- " - Use cases: build a single-state file, a single-CD file for testing, etc.\n",
- "\n",
- " `output_path` (optional but effectively required — raises if None)\n",
- " - Where to save the resulting .h5 file\n",
- " - Creates parent directories if needed\n",
+ "**Format gap:** The calibration produces weights in clone layout `(n_records * n_clones,)` where each clone maps to one specific CD via the `GeographyAssignment`. The stacked dataset builder expects CD layout `(n_cds * n_households,)` where every CD has a weight slot for every household. Converting between these — accumulating clone weights into their assigned CDs — is a separate step not yet implemented. The demo below constructs artificial CD-layout weights directly to show how the builder works."
+ ]
+ },
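+ {
+ "cell_type": "markdown",
+ "id": "cell-26b",
+ "metadata": {},
+ "source": [
+ "A hedged sketch of what that missing conversion could look like (`clone_to_cd_layout` is a hypothetical helper; it accumulates each clone's weight into its assigned CD's slot):\n",
+ "\n",
+ "```python\n",
+ "def clone_to_cd_layout(w_clone, geography, all_cds, n_records):\n",
+ "    # (n_records * n_clones,) -> (n_cds * n_records,)\n",
+ "    cd_index = {cd: i for i, cd in enumerate(all_cds)}\n",
+ "    w_cd = np.zeros(len(all_cds) * n_records)\n",
+ "    for col in np.flatnonzero(w_clone):\n",
+ "        record_idx = col % n_records\n",
+ "        cd_i = cd_index[str(geography.cd_geoid[col])]\n",
+ "        w_cd[cd_i * n_records + record_idx] += w_clone[col]\n",
+ "    return w_cd\n",
+ "```"
+ ]
+ },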
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "cell-27",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dimension mismatch:\n",
+ " Calibration output: (11999 * 3,) = 35,997 (clone layout)\n",
+ " Stacked builder expects: (436 * 11999,) = 5,231,564 (CD layout)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Dimension mismatch:\")\n",
+ "print(\n",
+ " f\" Calibration output: ({n_records} * {N_CLONES},) \"\n",
+ " f\"= {n_records * N_CLONES:,} (clone layout)\"\n",
+ ")\n",
"\n",
- " `dataset_path` (optional, default None)\n",
- " - Path to the base .h5 dataset that was used during calibration\n",
- " - This is the \"template\" — household structure, demographics, etc.\n",
- " - The function loads this, reweights households per CD, updates geography, and stacks"
+ "all_cds = get_all_cds_from_database(db_uri)\n",
+ "n_cds = len(all_cds)\n",
+ "print(\n",
+ " f\" Stacked builder expects: ({n_cds} * {n_records},) \"\n",
+ " f\"= {n_cds * n_records:,} (CD layout)\"\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 14,
- "id": "e1f8b237-ba42-4fca-8d43-f253f587d49b",
+ "execution_count": 17,
+ "id": "cell-28",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Weight vector: 23,998 entries (2 CDs x 11,999 HH)\n",
+ "Non-zero weights: 277\n",
+ "Example HH weight in CD 3701: 2.5\n",
+ "Example HH weight in CD 201: 3.5\n"
+ ]
+ }
+ ],
"source": [
"import os\n",
"\n",
- "from policyengine_us_data.datasets.cps.local_area_calibration.stacked_dataset_builder import create_sparse_cd_stacked_dataset\n",
+ "demo_cds = [\"3701\", \"201\"]\n",
+ "n_demo_cds = len(demo_cds)\n",
"\n",
- "# Initialize the weights w for demonstration\n",
- "# We can't allow too many w cells to be positive for a given state, or the reindexing will fail\n",
- "w = np.random.binomial(n=1, p=0.01, size=X_sparse.shape[1]).astype(float)\n",
+ "w = (\n",
+ " np.random.default_rng(42)\n",
+ " .binomial(n=1, p=0.01, size=n_demo_cds * n_records)\n",
+ " .astype(float)\n",
+ ")\n",
"\n",
- "# We'll make sure our earlier household is included:\n",
- "household_ids = sim.calculate(\"household_id\", map_to=\"household\").values\n",
- "hh_idx = np.where(household_ids == hh_id)[0][0]\n",
+ "# Seed our example household into both CDs\n",
+ "cd_idx_3701 = demo_cds.index(\"3701\")\n",
+ "w[cd_idx_3701 * n_records + record_idx] = 2.5\n",
"\n",
- "cd_idx = test_cds.index('3701')\n",
- "flat_idx = cd_idx * len(household_ids) + hh_idx\n",
- "w[flat_idx] = 2.5\n",
+ "cd_idx_201 = demo_cds.index(\"201\")\n",
+ "w[cd_idx_201 * n_records + record_idx] = 3.5\n",
"\n",
- "cd_idx = test_cds.index('201')\n",
- "flat_idx = cd_idx * len(household_ids) + hh_idx\n",
- "w[flat_idx] = 3.5\n",
+ "output_dir = \"calibration_output\"\n",
+ "os.makedirs(output_dir, exist_ok=True)\n",
+ "output_path = os.path.join(output_dir, \"results.h5\")\n",
"\n",
- "# Create a folder for the outputs of the function that is to come.\n",
- "new_folder_name = \"calibration_output\"\n",
- "os.makedirs(new_folder_name, exist_ok=True)\n",
- "output_path = os.path.join(new_folder_name, \"results.h5\")"
+ "print(\n",
+ " f\"Weight vector: {len(w):,} entries \"\n",
+ " f\"({n_demo_cds} CDs x {n_records:,} HH)\"\n",
+ ")\n",
+ "print(f\"Non-zero weights: {(w > 0).sum()}\")\n",
+ "print(\n",
+ " f\"Example HH weight in CD 3701: {w[cd_idx_3701 * n_records + record_idx]}\"\n",
+ ")\n",
+ "print(f\"Example HH weight in CD 201: {w[cd_idx_201 * n_records + record_idx]}\")"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "650b807d-3d20-48e0-b512-43922ca2aace",
+ "execution_count": 18,
+ "id": "cell-29",
"metadata": {},
"outputs": [
{
@@ -643,27 +991,43 @@
"\n",
"Original dataset has 11,999 households\n",
"Extracted weights for 2 CDs from full weight matrix\n",
- "Total active household-CD pairs: 230\n",
- "Total weight in W matrix: 234\n",
- "Processing CD 201 (2/2)...\n",
+ "Total active household-CD pairs: 277\n",
+ "Total weight in W matrix: 281\n",
+ "Processing CD 201 (2/2)...\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2026-02-13 17:11:40,873 - INFO - HTTP Request: GET https://huggingface.co/api/models/policyengine/policyengine-us-data \"HTTP/1.1 200 OK\"\n",
+ "2026-02-13 17:11:40,899 - INFO - HTTP Request: HEAD https://huggingface.co/policyengine/policyengine-us-data/resolve/main/enhanced_cps_2024.h5 \"HTTP/1.1 302 Found\"\n",
+ "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n",
+ "2026-02-13 17:11:40,899 - WARNING - Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"\n",
"Combining 2 CD DataFrames...\n",
- "Total households across all CDs: 230\n",
- "Combined DataFrame shape: (578, 222)\n",
+ "Total households across all CDs: 277\n",
+ "Combined DataFrame shape: (726, 222)\n",
"\n",
"Reindexing all entity IDs using 25k ranges per CD...\n",
- " Created 230 unique households across 2 CDs\n",
+ " Created 277 unique households across 2 CDs\n",
" Reindexing persons using 25k ranges...\n",
" Reindexing tax units...\n",
" Reindexing SPM units...\n",
" Reindexing marital units...\n",
" Reindexing families...\n",
- " Final persons: 578\n",
- " Final households: 230\n",
- " Final tax units: 314\n",
- " Final SPM units: 236\n",
- " Final marital units: 461\n",
- " Final families: 249\n",
+ " Final persons: 726\n",
+ " Final households: 277\n",
+ " Final tax units: 373\n",
+ " Final SPM units: 291\n",
+ " Final marital units: 586\n",
+ " Final families: 309\n",
"\n",
"Weights in combined_df AFTER reindexing:\n",
" HH weight sum: 0.00M\n",
@@ -671,8 +1035,8 @@
" Ratio: 1.00\n",
"\n",
"Overflow check:\n",
- " Max person ID after reindexing: 5,125,285\n",
- " Max person ID × 100: 512,528,500\n",
+ " Max person ID after reindexing: 5,025,335\n",
+ " Max person ID × 100: 502,533,500\n",
" int32 max: 2,147,483,647\n",
" ✓ No overflow risk!\n",
"\n",
@@ -687,9 +1051,9 @@
"Household mapping saved to calibration_output/mappings/results_household_mapping.csv\n",
"\n",
"Verifying saved file...\n",
- " Final households: 230\n",
- " Final persons: 578\n",
- " Total population (from household weights): 234\n"
+ " Final households: 277\n",
+ " Final persons: 726\n",
+ " Total population (from household weights): 281\n"
]
},
{
@@ -698,17 +1062,16 @@
"'calibration_output/results.h5'"
]
},
- "execution_count": 15,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "cd_subset = ['3701', '201']\n",
"create_sparse_cd_stacked_dataset(\n",
" w,\n",
- " test_cds, # cds_to_calibrate - Defines the structure of the weight vector w\n",
- " cd_subset=cd_subset, # cd_subset - Specifies which CDs to actually include in the output dataset (optional, defaults to all).\n",
+ " demo_cds,\n",
+ " cd_subset=demo_cds,\n",
" dataset_path=dataset_path,\n",
" output_path=output_path,\n",
")"
@@ -716,280 +1079,101 @@
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "f8d449b4-6069-44e0-8d21-e73944a1a1d2",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[34mmappings\u001b[m\u001b[m/ results.h5\n"
- ]
- }
- ],
- "source": [
- "%ls calibration_output"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "04d7b733-bec5-49cb-9272-d167ae9c4693",
- "metadata": {},
- "source": [
- "Note that there is a *mappings* directory that has also been created by create_sparse_cd_stacked_dataset. This contains the CSV file that links the original households to the donor households. The reason it's a seperate folder is to keep the h5 files and the mapping CSVs organized when this function is run for all districts or states."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "5fd7f7cc-6517-4f39-9a14-9cb147af38e7",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "results_household_mapping.csv\n"
- ]
- }
- ],
- "source": [
- "%ls calibration_output/mappings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "578e8a69-b7ec-46bf-82ec-8020a46fd9cf",
+ "execution_count": 19,
+ "id": "cell-30",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- " household_id congressional_district_geoid \\\n",
- "0 50000 201 \n",
- "1 50001 201 \n",
- "2 50002 201 \n",
- "3 50003 201 \n",
- "4 50004 201 \n",
- ".. ... ... \n",
- "225 125113 3701 \n",
- "226 125114 3701 \n",
- "227 125115 3701 \n",
- "228 125116 3701 \n",
- "229 125117 3701 \n",
+ "Stacked dataset: 277 households\n",
"\n",
- " county household_weight state_fips \\\n",
- "0 NORTH_SLOPE_BOROUGH_AK 3.5 2 \n",
- "1 ALEUTIANS_WEST_CENSUS_AREA_AK 1.0 2 \n",
- "2 FAIRBANKS_NORTH_STAR_BOROUGH_AK 1.0 2 \n",
- "3 KENAI_PENINSULA_BOROUGH_AK 1.0 2 \n",
- "4 HOONAH_ANGOON_CENSUS_AREA_AK 1.0 2 \n",
- ".. ... ... ... \n",
- "225 TYRRELL_COUNTY_NC 1.0 37 \n",
- "226 WILSON_COUNTY_NC 1.0 37 \n",
- "227 WARREN_COUNTY_NC 1.0 37 \n",
- "228 WILSON_COUNTY_NC 1.0 37 \n",
- "229 GREENE_COUNTY_NC 1.0 37 \n",
+ "Example household (original_id=128694) in mapping:\n",
"\n",
- " snap \n",
- "0 0.000000 \n",
- "1 0.000000 \n",
- "2 0.000000 \n",
- "3 0.000000 \n",
- "4 0.000000 \n",
- ".. ... \n",
- "225 0.000000 \n",
- "226 3438.300293 \n",
- "227 0.000000 \n",
- "228 0.000000 \n",
- "229 885.599792 \n",
+ " new_household_id original_household_id congressional_district state_fips\n",
+ " 108 128694 201 2\n",
+ " 25097 128694 3701 37\n",
"\n",
- "[230 rows x 6 columns]\n"
+ "In stacked dataset:\n",
+ "\n",
+ " household_id congressional_district_geoid household_weight state_fips snap\n",
+ " 108 201 3.5 2 23640.0\n",
+ " 25097 3701 2.5 37 18396.0\n"
]
}
],
"source": [
- "sim_after = Microsimulation(dataset=\"./calibration_output/results.h5\")\n",
+ "sim_after = Microsimulation(dataset=f\"./{output_path}\")\n",
+ "hh_after_df = pd.DataFrame(\n",
+ " sim_after.calculate_dataframe(\n",
+ " [\n",
+ " \"household_id\",\n",
+ " \"congressional_district_geoid\",\n",
+ " \"household_weight\",\n",
+ " \"state_fips\",\n",
+ " \"snap\",\n",
+ " ]\n",
+ " )\n",
+ ")\n",
+ "print(f\"Stacked dataset: {len(hh_after_df)} households\\n\")\n",
"\n",
- "hh_after_df = pd.DataFrame(sim_after.calculate_dataframe([\n",
- " \"household_id\", \"congressional_district_geoid\", \"county\", \"household_weight\", \"state_fips\", \"snap\"]) \n",
+ "mapping_df = pd.read_csv(\n",
+ " f\"{output_dir}/mappings/results_household_mapping.csv\"\n",
")\n",
- "print(hh_after_df)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "83769d86-91e1-41bb-b718-01ee09cc7e2a",
- "metadata": {},
- "source": [
- "We can see one of the correct instances above but let's confirm that this new household id does in fact link back to the original in both cases."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "27baf521-1bd6-4ef0-9f70-4381fd842b52",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " new_household_id | \n",
- " original_household_id | \n",
- " congressional_district | \n",
- " state_fips | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 50000 | \n",
- " 654 | \n",
- " 201 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 125000 | \n",
- " 654 | \n",
- " 3701 | \n",
- " 37 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " new_household_id original_household_id congressional_district state_fips\n",
- "0 50000 654 201 2\n",
- "1 125000 654 3701 37"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "mapping_df = pd.read_csv(\"calibration_output/mappings/results_household_mapping.csv\")\n",
- "mapping_df.loc[mapping_df.original_household_id == hh_id]"
+ "example_mapping = mapping_df.loc[\n",
+ " mapping_df.original_household_id == example_hh_id\n",
+ "]\n",
+ "print(f\"Example household (original_id={example_hh_id}) in mapping:\\n\")\n",
+ "print(example_mapping.to_string(index=False))\n",
+ "\n",
+ "new_ids = example_mapping.new_household_id\n",
+ "print(\"\\nIn stacked dataset:\\n\")\n",
+ "print(\n",
+ " hh_after_df.loc[hh_after_df.household_id.isin(new_ids)].to_string(\n",
+ " index=False\n",
+ " )\n",
+ ")"
]
},
{
"cell_type": "code",
"execution_count": 20,
- "id": "36be0858-33f4-4c65-a74f-e18a76ce8eea",
+ "id": "cell-31",
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " household_id | \n",
- " congressional_district_geoid | \n",
- " county | \n",
- " household_weight | \n",
- " state_fips | \n",
- " snap | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 50000 | \n",
- " 201 | \n",
- " NORTH_SLOPE_BOROUGH_AK | \n",
- " 3.5 | \n",
- " 2 | \n",
- " 0.000000 | \n",
- "
\n",
- " \n",
- " | 112 | \n",
- " 125000 | \n",
- " 3701 | \n",
- " HALIFAX_COUNTY_NC | \n",
- " 2.5 | \n",
- " 37 | \n",
- " 70.080002 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " household_id congressional_district_geoid county \\\n",
- "0 50000 201 NORTH_SLOPE_BOROUGH_AK \n",
- "112 125000 3701 HALIFAX_COUNTY_NC \n",
- "\n",
- " household_weight state_fips snap \n",
- "0 3.5 2 0.000000 \n",
- "112 2.5 37 70.080002 "
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Cleaned up calibration_output/\n"
+ ]
}
],
"source": [
- "new_hh_ids = mapping_df.loc[mapping_df.original_household_id == hh_id].new_household_id\n",
- "hh_after_df.loc[hh_after_df.household_id.isin(new_hh_ids)]"
+ "import shutil\n",
+ "\n",
+ "shutil.rmtree(output_dir)\n",
+ "print(f\"Cleaned up {output_dir}/\")"
]
},
{
"cell_type": "markdown",
- "id": "96fa8407-008f-4eaa-8f22-a803b72e71e4",
+ "id": "cell-32",
"metadata": {},
"source": [
- "And we can see that the snap numbers still match their values from the different US state systems. However note that due to the use of policyengine-core's random function in a component of snap_gross_income, for some households, the value in the final simulation will not match the one used in creating the X matrix (`X_sparse` here). This is outlined in [Issue 412](https://github.com/PolicyEngine/policyengine-core/issues/412)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "90ee3a8b-d529-41f2-83ee-d543c53b5492",
- "metadata": {},
- "outputs": [],
- "source": [
- "%rm -r calibration_output"
+ "## Summary\n",
+ "\n",
+ "The clone-based calibration pipeline has six stages:\n",
+ "\n",
+ "1. **Clone + assign geography** — `assign_random_geography()` creates N copies of each CPS record, each with a population-weighted random census block.\n",
+ "2. **Simulate** — `_simulate_clone()` sets each clone's `state_fips` and recalculates state-dependent benefits.\n",
+ "3. **Geographic masking** — `state_to_cols` / `cd_to_cols` restrict each target row to geographically relevant columns.\n",
+ "4. **Re-randomize takeup** — `rerandomize_takeup()` draws new takeup per census block, breaking the fixed-takeup assumption.\n",
+ "5. **Build matrix** — `UnifiedMatrixBuilder.build_matrix()` assembles the sparse CSR matrix from all clones.\n",
+ "6. **Stacked datasets** — `create_sparse_cd_stacked_dataset()` converts calibrated weights into CD-level h5 files.\n",
+ "\n",
+ "For matrix diagnostics (row/column anatomy, target groups, sparsity analysis), see [calibration_matrix.ipynb](calibration_matrix.ipynb)."
]
}
],
diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py
index 43e354456..689d245dd 100644
--- a/modal_app/remote_calibration_runner.py
+++ b/modal_app/remote_calibration_runner.py
@@ -53,10 +53,7 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict:
elif line.startswith("DATASET:"):
dataset_path = line.split("DATASET:")[1].strip()
- script_path = (
- "policyengine_us_data/datasets/cps/"
- "local_area_calibration/fit_calibration_weights.py"
- )
+ script_path = "policyengine_us_data/calibration/unified_calibration.py"
result = subprocess.run(
[
"uv",
@@ -69,7 +66,7 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict:
str(epochs),
"--db-path",
db_path,
- "--dataset-path",
+ "--dataset",
dataset_path,
],
capture_output=True,
diff --git a/policyengine_us_data/calibration/__init__.py b/policyengine_us_data/calibration/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/policyengine_us_data/calibration/clone_and_assign.py b/policyengine_us_data/calibration/clone_and_assign.py
new file mode 100644
index 000000000..9aa64cbbc
--- /dev/null
+++ b/policyengine_us_data/calibration/clone_and_assign.py
@@ -0,0 +1,145 @@
+"""Clone CPS records and assign random geography."""
+
+import logging
+from functools import lru_cache
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+
+from policyengine_us_data.storage import STORAGE_FOLDER
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class GeographyAssignment:
+ """Random geography assignment for cloned CPS records.
+
+ All arrays have length n_records * n_clones.
+ Index i corresponds to clone i // n_records,
+ record i % n_records.
+ """
+
+ block_geoid: np.ndarray # str array, 15-char block GEOIDs
+ cd_geoid: np.ndarray # str array of CD GEOIDs
+ state_fips: np.ndarray # int array of 2-digit state FIPS
+ n_records: int
+ n_clones: int
+
+
+@lru_cache(maxsize=1)
+def load_global_block_distribution():
+ """Load block_cd_distributions.csv.gz and build
+ global distribution.
+
+ Returns:
+ Tuple of (block_geoids, cd_geoids, state_fips,
+ probabilities) where each is a numpy array indexed
+ by block row. Probabilities are normalized to sum
+ to 1 globally.
+
+ Raises:
+ FileNotFoundError: If the CSV file does not exist.
+ """
+ csv_path = STORAGE_FOLDER / "block_cd_distributions.csv.gz"
+ if not csv_path.exists():
+ raise FileNotFoundError(
+ f"{csv_path} not found. "
+ "Run make_block_cd_distributions.py to generate."
+ )
+
+ df = pd.read_csv(csv_path, dtype={"block_geoid": str})
+
+ block_geoids = df["block_geoid"].values
+ cd_geoids = df["cd_geoid"].astype(str).values
+ state_fips = np.array([int(b[:2]) for b in block_geoids])
+
+ probs = df["probability"].values.astype(np.float64)
+ probs = probs / probs.sum()
+
+ return block_geoids, cd_geoids, state_fips, probs
+
+
+def assign_random_geography(
+ n_records: int,
+ n_clones: int = 10,
+ seed: int = 42,
+) -> GeographyAssignment:
+ """Assign random census block geography to cloned
+ CPS records.
+
+ Each of n_records * n_clones total records gets a
+ random census block sampled from the global
+ population-weighted distribution. State and CD are
+ derived from the block GEOID.
+
+ Args:
+ n_records: Number of households in the base CPS
+ dataset.
+ n_clones: Number of clones (default 10).
+ seed: Random seed for reproducibility.
+
+ Returns:
+ GeographyAssignment with arrays of length
+ n_records * n_clones.
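+
+    Example (illustrative sizes)::
+
+        geo = assign_random_geography(n_records=4, n_clones=2)
+        geo.block_geoid.shape      # (8,)
+        # column i belongs to clone i // 4, record i % 4, and
+        # state_fips is the block GEOID's first two digits:
+        int(geo.block_geoid[0][:2]) == geo.state_fips[0]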
+ """
+ blocks, cds, states, probs = load_global_block_distribution()
+
+ n_total = n_records * n_clones
+ rng = np.random.default_rng(seed)
+ indices = rng.choice(len(blocks), size=n_total, p=probs)
+
+ return GeographyAssignment(
+ block_geoid=blocks[indices],
+ cd_geoid=cds[indices],
+ state_fips=states[indices],
+ n_records=n_records,
+ n_clones=n_clones,
+ )
+
+
+def double_geography_for_puf(
+ geography: GeographyAssignment,
+) -> GeographyAssignment:
+ """Double geography arrays for PUF clone step.
+
+ After PUF cloning doubles the base records, the geography
+ assignment must also double: each record and its PUF copy
+ share the same geographic assignment.
+
+ The output has n_records = 2 * geography.n_records, with
+ the first half being the CPS records and the second half
+ being the PUF copies.
+
+ Args:
+ geography: Original geography assignment.
+
+ Returns:
+ New GeographyAssignment with doubled n_records.
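+
+    Layout sketch (n_records=2, n_clones=2):
+        before: [c0r0, c0r1, c1r0, c1r1]
+        after:  [c0r0, c0r1, c0r0, c0r1, c1r0, c1r1, c1r0, c1r1]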
+ """
+ n_old = geography.n_records
+ n_new = n_old * 2
+ n_clones = geography.n_clones
+
+ new_blocks = []
+ new_cds = []
+ new_states = []
+
+ for c in range(n_clones):
+ start = c * n_old
+ end = start + n_old
+ clone_blocks = geography.block_geoid[start:end]
+ clone_cds = geography.cd_geoid[start:end]
+ clone_states = geography.state_fips[start:end]
+ new_blocks.append(np.concatenate([clone_blocks, clone_blocks]))
+ new_cds.append(np.concatenate([clone_cds, clone_cds]))
+ new_states.append(np.concatenate([clone_states, clone_states]))
+
+ return GeographyAssignment(
+ block_geoid=np.concatenate(new_blocks),
+ cd_geoid=np.concatenate(new_cds),
+ state_fips=np.concatenate(new_states),
+ n_records=n_new,
+ n_clones=n_clones,
+ )
diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py
new file mode 100644
index 000000000..d2759b34b
--- /dev/null
+++ b/policyengine_us_data/calibration/unified_calibration.py
@@ -0,0 +1,637 @@
+"""
+Unified L0 calibration pipeline.
+
+Pipeline flow:
+ 1. Load CPS dataset -> get n_records
+ 2. Clone Nx, assign random geography (census block)
+ 3. Re-randomize simple takeup variables per block
+ 4. Build sparse calibration matrix (clone-by-clone)
+ 5. L0-regularized optimization -> calibrated weights
+ 6. Save weights, diagnostics, run config
+
+Two presets control output size via L0 regularization:
+- local: L0=1e-8, ~3-4M records (for local area dataset)
+- national: L0=1e-4, ~50K records (for web app)
+
+Usage:
+ python -m policyengine_us_data.calibration.unified_calibration \\
+ --dataset path/to/cps_2024.h5 \\
+ --db-path path/to/policy_data.db \\
+ --output path/to/weights.npy \\
+ --preset local \\
+ --epochs 100
+"""
+
+import argparse
+import builtins
+import logging
+import sys
+from pathlib import Path
+
+import numpy as np
+
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s",
+ stream=sys.stderr,
+)
+logger = logging.getLogger(__name__)
+
+PRESETS = {
+ "local": 1e-8,
+ "national": 1e-4,
+}
+
+BETA = 0.35
+GAMMA = -0.1
+ZETA = 1.1
+INIT_KEEP_PROB = 0.999
+LOG_WEIGHT_JITTER_SD = 0.05
+LOG_ALPHA_JITTER_SD = 0.01
+LAMBDA_L2 = 1e-12
+LEARNING_RATE = 0.15
+DEFAULT_EPOCHS = 100
+DEFAULT_N_CLONES = 10
+
+SIMPLE_TAKEUP_VARS = [
+ {
+ "variable": "takes_up_snap_if_eligible",
+ "entity": "spm_unit",
+ "rate_key": "snap",
+ },
+ {
+ "variable": "takes_up_aca_if_eligible",
+ "entity": "tax_unit",
+ "rate_key": "aca",
+ },
+ {
+ "variable": "takes_up_dc_ptc",
+ "entity": "tax_unit",
+ "rate_key": "dc_ptc",
+ },
+ {
+ "variable": "takes_up_head_start_if_eligible",
+ "entity": "person",
+ "rate_key": "head_start",
+ },
+ {
+ "variable": "takes_up_early_head_start_if_eligible",
+ "entity": "person",
+ "rate_key": "early_head_start",
+ },
+ {
+ "variable": "takes_up_ssi_if_eligible",
+ "entity": "person",
+ "rate_key": "ssi",
+ },
+ {
+ "variable": "would_file_taxes_voluntarily",
+ "entity": "tax_unit",
+ "rate_key": "voluntary_filing",
+ },
+ {
+ "variable": "takes_up_medicaid_if_eligible",
+ "entity": "person",
+ "rate_key": "medicaid",
+ },
+]
+
+
+def rerandomize_takeup(
+ sim,
+ clone_block_geoids: np.ndarray,
+ clone_state_fips: np.ndarray,
+ time_period: int,
+) -> None:
+ """Re-randomize simple takeup variables per census block.
+
+ Groups entities by their household's block GEOID and draws
+ new takeup booleans using seeded_rng(var_name, salt=block).
+ Overrides the simulation's stored inputs.
+
+ Args:
+ sim: Microsimulation instance (already has state_fips).
+ clone_block_geoids: Block GEOIDs per household.
+ clone_state_fips: State FIPS per household.
+ time_period: Tax year.
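+
+    Note:
+        Because draws come from seeded_rng(var_name, salt=block),
+        they are deterministic per (variable, block): a record's
+        takeup draw changes only when its assigned block changes.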
+ """
+ from policyengine_us_data.parameters import (
+ load_take_up_rate,
+ )
+ from policyengine_us_data.utils.randomness import (
+ seeded_rng,
+ )
+
+ n_households = len(clone_block_geoids)
+ hh_ids = sim.calculate("household_id", map_to="household").values
+ hh_to_block = dict(zip(hh_ids, clone_block_geoids))
+ hh_to_state = dict(zip(hh_ids, clone_state_fips))
+
+ for spec in SIMPLE_TAKEUP_VARS:
+ var_name = spec["variable"]
+ entity_level = spec["entity"]
+ rate_key = spec["rate_key"]
+
+ rate_or_dict = load_take_up_rate(rate_key, time_period)
+
+ is_state_specific = isinstance(rate_or_dict, dict)
+
+ entity_ids = sim.calculate(
+ f"{entity_level}_id", map_to=entity_level
+ ).values
+ entity_hh_ids = sim.calculate(
+ "household_id", map_to=entity_level
+ ).values
+ n_entities = len(entity_ids)
+
+ draws = np.zeros(n_entities, dtype=np.float64)
+ rates = np.zeros(n_entities, dtype=np.float64)
+
+ entity_blocks = np.array(
+ [hh_to_block.get(hid, "0") for hid in entity_hh_ids]
+ )
+
+ unique_blocks = np.unique(entity_blocks)
+ for block in unique_blocks:
+ mask = entity_blocks == block
+ n_in_block = mask.sum()
+ rng = seeded_rng(var_name, salt=str(block))
+ draws[mask] = rng.random(n_in_block)
+
+            if is_state_specific:
+                block_hh_ids = entity_hh_ids[mask]
+                # Compute the block's member indices once;
+                # np.where is O(n_entities) per call.
+                mask_indices = np.where(mask)[0]
+                for i, hid in enumerate(block_hh_ids):
+                    state = int(hh_to_state.get(hid, 0))
+                    state_str = str(state)
+                    r = rate_or_dict.get(
+                        state_str,
+                        rate_or_dict.get(state, 0.8),
+                    )
+                    rates[mask_indices[i]] = r
+ else:
+ rates[mask] = rate_or_dict
+
+ new_values = draws < rates
+ sim.set_input(var_name, time_period, new_values)
+
+
+def parse_args(argv=None):
+ parser = argparse.ArgumentParser(
+ description="Unified L0 calibration pipeline"
+ )
+ parser.add_argument(
+ "--dataset",
+ default=None,
+ help="Path to CPS h5 file",
+ )
+ parser.add_argument(
+ "--db-path",
+ default=None,
+ help="Path to policy_data.db",
+ )
+ parser.add_argument(
+ "--output",
+ default=None,
+ help="Path to save weights (.npy)",
+ )
+ parser.add_argument(
+ "--n-clones",
+ type=int,
+ default=DEFAULT_N_CLONES,
+ help=f"Number of clones (default: {DEFAULT_N_CLONES})",
+ )
+ parser.add_argument(
+ "--preset",
+ choices=list(PRESETS.keys()),
+ default=None,
+ help="L0 preset: local or national",
+ )
+ parser.add_argument(
+ "--lambda-l0",
+ type=float,
+ default=None,
+ help="Custom L0 penalty (overrides preset)",
+ )
+ parser.add_argument(
+ "--epochs",
+ type=int,
+ default=DEFAULT_EPOCHS,
+ help=f"Training epochs (default: {DEFAULT_EPOCHS})",
+ )
+ parser.add_argument(
+ "--device",
+ default="cpu",
+ choices=["cpu", "cuda"],
+ help="Device for training",
+ )
+ parser.add_argument(
+ "--seed",
+ type=int,
+ default=42,
+ help="Random seed for geography assignment",
+ )
+ parser.add_argument(
+ "--domain-variables",
+ type=str,
+ default=None,
+ help=(
+ "Comma-separated domain variables for " "target_overview filtering"
+ ),
+ )
+ parser.add_argument(
+ "--hierarchical-domains",
+ type=str,
+ default=None,
+ help=(
+ "Comma-separated domains for hierarchical "
+ "uprating + CD reconciliation"
+ ),
+ )
+ parser.add_argument(
+ "--skip-takeup-rerandomize",
+ action="store_true",
+ help="Skip takeup re-randomization",
+ )
+ return parser.parse_args(argv)
+
+
+def fit_l0_weights(
+ X_sparse,
+ targets: np.ndarray,
+ lambda_l0: float,
+ epochs: int = DEFAULT_EPOCHS,
+ device: str = "cpu",
+ verbose_freq: int = None,
+) -> np.ndarray:
+ """Fit L0-regularized calibration weights.
+
+ Args:
+ X_sparse: Sparse matrix (targets x records).
+ targets: Target values array.
+ lambda_l0: L0 regularization strength.
+ epochs: Training epochs.
+ device: Torch device.
+ verbose_freq: Print frequency. Defaults to 10%.
+
+ Returns:
+ Weight array of shape (n_records,).
+ """
+ import time
+
+    try:
+        from l0.calibration import SparseCalibrationWeights
+    except ImportError as exc:
+        raise ImportError(
+            "l0-python required. Install: pip install l0-python"
+        ) from exc
+
+ import torch
+
+ n_total = X_sparse.shape[1]
+ initial_weights = np.ones(n_total) * 100
+
+ logger.info(
+ "L0 calibration: %d targets, %d features, "
+ "lambda_l0=%.1e, epochs=%d",
+ X_sparse.shape[0],
+ n_total,
+ lambda_l0,
+ epochs,
+ )
+
+ model = SparseCalibrationWeights(
+ n_features=n_total,
+ beta=BETA,
+ gamma=GAMMA,
+ zeta=ZETA,
+ init_keep_prob=INIT_KEEP_PROB,
+ init_weights=initial_weights,
+ log_weight_jitter_sd=LOG_WEIGHT_JITTER_SD,
+ log_alpha_jitter_sd=LOG_ALPHA_JITTER_SD,
+ device=device,
+ )
+
+ if verbose_freq is None:
+ verbose_freq = max(1, epochs // 10)
+
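+    # Wrap print so the l0 library's per-epoch progress output is
+    # flushed immediately (useful when stdout is captured, e.g. by
+    # the Modal runner's subprocess).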
+ _builtin_print = builtins.print
+
+ def _flushed_print(*args, **kwargs):
+ _builtin_print(*args, **kwargs)
+ sys.stdout.flush()
+
+ builtins.print = _flushed_print
+
+ t0 = time.time()
+ try:
+ model.fit(
+ M=X_sparse,
+ y=targets,
+ target_groups=None,
+ lambda_l0=lambda_l0,
+ lambda_l2=LAMBDA_L2,
+ lr=LEARNING_RATE,
+ epochs=epochs,
+ loss_type="relative",
+ verbose=True,
+ verbose_freq=verbose_freq,
+ )
+ finally:
+ builtins.print = _builtin_print
+
+ elapsed = time.time() - t0
+ logger.info(
+ "L0 done in %.1f min (%.1f sec/epoch)",
+ elapsed / 60,
+ elapsed / epochs,
+ )
+
+ with torch.no_grad():
+ weights = model.get_weights(deterministic=True).cpu().numpy()
+
+ n_nz = (weights > 0).sum()
+ logger.info(
+ "Non-zero: %d / %d (%.1f%% sparsity)",
+ n_nz,
+ n_total,
+ (1 - n_nz / n_total) * 100,
+ )
+ return weights
+
+
+def compute_diagnostics(
+ weights: np.ndarray,
+ X_sparse,
+ targets_df,
+ target_names: list,
+) -> "pd.DataFrame":
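+    """Per-target diagnostics: estimate vs. target value, relative
+    error, and achievability (whether the matrix row has any
+    nonzero entry)."""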
+ import pandas as pd
+
+ estimates = X_sparse.dot(weights)
+ true_values = targets_df["value"].values
+ row_sums = np.array(X_sparse.sum(axis=1)).flatten()
+
+    # Zero-valued targets would hit divide-by-zero inside a plain
+    # np.where (both branches are evaluated); np.divide with a
+    # where-mask leaves those entries at 0.
+    denom = np.abs(true_values)
+    rel_errors = np.divide(
+        estimates - true_values,
+        denom,
+        out=np.zeros_like(estimates, dtype=np.float64),
+        where=denom > 0,
+    )
+ return pd.DataFrame(
+ {
+ "target": target_names,
+ "true_value": true_values,
+ "estimate": estimates,
+ "rel_error": rel_errors,
+ "abs_rel_error": np.abs(rel_errors),
+ "achievable": row_sums > 0,
+ }
+ )
+
+
+def run_calibration(
+ dataset_path: str,
+ db_path: str,
+ n_clones: int = DEFAULT_N_CLONES,
+ lambda_l0: float = 1e-8,
+ epochs: int = DEFAULT_EPOCHS,
+ device: str = "cpu",
+ seed: int = 42,
+ domain_variables: list = None,
+ hierarchical_domains: list = None,
+ skip_takeup_rerandomize: bool = False,
+):
+ """Run unified calibration pipeline.
+
+ Args:
+ dataset_path: Path to CPS h5 file.
+ db_path: Path to policy_data.db.
+ n_clones: Number of dataset clones.
+ lambda_l0: L0 regularization strength.
+ epochs: Training epochs.
+ device: Torch device.
+ seed: Random seed.
+ domain_variables: Filter targets by domain variable.
+ hierarchical_domains: Domains for hierarchical
+ uprating + CD reconciliation.
+ skip_takeup_rerandomize: Skip takeup step.
+
+ Returns:
+ (weights, targets_df, X_sparse, target_names)
+ """
+ import time
+
+ from policyengine_us import Microsimulation
+
+ from policyengine_us_data.calibration.clone_and_assign import (
+ assign_random_geography,
+ )
+ from policyengine_us_data.calibration.unified_matrix_builder import (
+ UnifiedMatrixBuilder,
+ )
+
+ t0 = time.time()
+
+ # Step 1: Load dataset
+ logger.info("Loading dataset from %s", dataset_path)
+ sim = Microsimulation(dataset=dataset_path)
+ n_records = len(sim.calculate("household_id", map_to="household").values)
+ logger.info("Loaded %d households", n_records)
+
+ # Step 2: Clone and assign geography
+ logger.info(
+ "Assigning geography: %d x %d = %d total",
+ n_records,
+ n_clones,
+ n_records * n_clones,
+ )
+ geography = assign_random_geography(
+ n_records=n_records,
+ n_clones=n_clones,
+ seed=seed,
+ )
+
+ # Step 3: Build sim_modifier for takeup rerandomization
+ sim_modifier = None
+ if not skip_takeup_rerandomize:
+ time_period = 2024
+
+ def sim_modifier(s, clone_idx):
+ col_start = clone_idx * n_records
+ col_end = col_start + n_records
+ blocks = geography.block_geoid[col_start:col_end]
+ states = geography.state_fips[col_start:col_end]
+ rerandomize_takeup(s, blocks, states, time_period)
+
+ # Step 4: Build target filter
+ target_filter = {}
+ if domain_variables:
+ target_filter["domain_variables"] = domain_variables
+
+ # Step 5: Build sparse calibration matrix
+ t_matrix = time.time()
+ db_uri = f"sqlite:///{db_path}"
+ builder = UnifiedMatrixBuilder(
+ db_uri=db_uri,
+ time_period=2024,
+ dataset_path=dataset_path,
+ )
+ targets_df, X_sparse, target_names = builder.build_matrix(
+ geography=geography,
+ sim=sim,
+ target_filter=target_filter,
+ hierarchical_domains=hierarchical_domains,
+ sim_modifier=sim_modifier,
+ )
+
+ builder.print_uprating_summary(targets_df)
+ logger.info(
+ "Matrix built in %.1f min",
+ (time.time() - t_matrix) / 60,
+ )
+ logger.info(
+ "Matrix shape: %s, nnz: %d",
+ X_sparse.shape,
+ X_sparse.nnz,
+ )
+
+ # Step 6: L0 calibration
+ targets = targets_df["value"].values
+
+ row_sums = np.array(X_sparse.sum(axis=1)).flatten()
+ achievable = row_sums > 0
+ logger.info(
+ "Achievable: %d / %d targets",
+ achievable.sum(),
+ len(achievable),
+ )
+
+ weights = fit_l0_weights(
+ X_sparse=X_sparse,
+ targets=targets,
+ lambda_l0=lambda_l0,
+ epochs=epochs,
+ device=device,
+ )
+
+ logger.info(
+ "Total pipeline: %.1f min",
+ (time.time() - t0) / 60,
+ )
+ return weights, targets_df, X_sparse, target_names
+
+
+def main(argv=None):
+ import json
+ import time
+
+ import pandas as pd
+
+ try:
+ if not sys.stderr.isatty():
+ sys.stderr.reconfigure(line_buffering=True)
+ if not sys.stdout.isatty():
+ sys.stdout.reconfigure(line_buffering=True)
+ except AttributeError:
+ pass
+
+ args = parse_args(argv)
+
+ from policyengine_us_data.storage import STORAGE_FOLDER
+
+ dataset_path = args.dataset or str(
+ STORAGE_FOLDER / "stratified_extended_cps_2024.h5"
+ )
+ db_path = args.db_path or str(
+ STORAGE_FOLDER / "calibration" / "policy_data.db"
+ )
+ output_path = args.output or str(
+ STORAGE_FOLDER / "calibration" / "unified_weights.npy"
+ )
+
+ if args.lambda_l0 is not None:
+ lambda_l0 = args.lambda_l0
+ elif args.preset is not None:
+ lambda_l0 = PRESETS[args.preset]
+ else:
+ lambda_l0 = PRESETS["local"]
+
+ domain_variables = None
+ if args.domain_variables:
+ domain_variables = [
+ x.strip() for x in args.domain_variables.split(",")
+ ]
+
+ hierarchical_domains = None
+ if args.hierarchical_domains:
+ hierarchical_domains = [
+ x.strip() for x in args.hierarchical_domains.split(",")
+ ]
+
+ t_start = time.time()
+
+ weights, targets_df, X_sparse, target_names = run_calibration(
+ dataset_path=dataset_path,
+ db_path=db_path,
+ n_clones=args.n_clones,
+ lambda_l0=lambda_l0,
+ epochs=args.epochs,
+ device=args.device,
+ seed=args.seed,
+ domain_variables=domain_variables,
+ hierarchical_domains=hierarchical_domains,
+        skip_takeup_rerandomize=args.skip_takeup_rerandomize,
+ )
+
+ # Save weights
+ np.save(output_path, weights)
+ logger.info("Weights saved to %s", output_path)
+ print(f"OUTPUT_PATH:{output_path}")
+
+ # Save diagnostics
+ output_dir = Path(output_path).parent
+ diag_df = compute_diagnostics(weights, X_sparse, targets_df, target_names)
+ diag_path = output_dir / "unified_diagnostics.csv"
+ diag_df.to_csv(diag_path, index=False)
+
+ ach = diag_df[diag_df.achievable]
+ err_pct = ach.abs_rel_error * 100
+ logger.info(
+ "Diagnostics: %d targets, "
+ "mean=%.1f%%, median=%.1f%%, "
+ "<10%%=%.1f%%, <25%%=%.1f%%",
+ len(ach),
+ err_pct.mean(),
+ err_pct.median(),
+ (err_pct < 10).mean() * 100,
+ (err_pct < 25).mean() * 100,
+ )
+
+ # Save run config
+ t_end = time.time()
+ run_config = {
+ "dataset": dataset_path,
+ "db_path": db_path,
+ "n_clones": args.n_clones,
+ "lambda_l0": lambda_l0,
+ "epochs": args.epochs,
+ "device": args.device,
+ "seed": args.seed,
+ "domain_variables": domain_variables,
+ "hierarchical_domains": hierarchical_domains,
+ "n_targets": len(targets_df),
+ "n_records": X_sparse.shape[1],
+ "weight_sum": float(weights.sum()),
+ "weight_nonzero": int((weights > 0).sum()),
+ "mean_error_pct": float(err_pct.mean()),
+ "elapsed_seconds": round(t_end - t_start, 1),
+ }
+ config_path = output_dir / "unified_run_config.json"
+ with open(config_path, "w") as f:
+ json.dump(run_config, f, indent=2)
+ logger.info("Config saved to %s", config_path)
+ print(f"LOG_PATH:{diag_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py
new file mode 100644
index 000000000..ac31c34e1
--- /dev/null
+++ b/policyengine_us_data/calibration/unified_matrix_builder.py
@@ -0,0 +1,906 @@
+"""
+Unified sparse matrix builder for clone-based calibration.
+
+Builds a sparse calibration matrix for cloned+geography-assigned CPS
+records. Processes clone-by-clone: for each clone, sets each
+record's state_fips to its assigned value, simulates, and extracts
+variable values.
+
+Matrix shape: (n_targets, n_records * n_clones)
+Column ordering: index i = clone_idx * n_records + record_idx
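+The inverse mapping is clone_idx, record_idx = divmod(i, n_records).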
+"""
+
+import logging
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+from scipy import sparse
+from sqlalchemy import create_engine, text
+
+from policyengine_us_data.storage import STORAGE_FOLDER
+from policyengine_us_data.utils.census import STATE_NAME_TO_FIPS
+from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
+ get_calculated_variables,
+ apply_op,
+ get_geo_level,
+)
+
+logger = logging.getLogger(__name__)
+
+_GEO_VARS = {
+ "state_fips",
+ "state_code",
+ "congressional_district_geoid",
+}
+
+
+class UnifiedMatrixBuilder:
+ """Build sparse calibration matrix for cloned CPS records.
+
+ Processes clone-by-clone: each clone's records get their
+ assigned geography, are simulated, and the results fill
+ the corresponding columns.
+
+ Args:
+ db_uri: SQLAlchemy database URI.
+ time_period: Tax year for calibration (e.g. 2024).
+ dataset_path: Path to the base extended CPS h5 file.
+ """
+
+ def __init__(
+ self,
+ db_uri: str,
+ time_period: int,
+ dataset_path: Optional[str] = None,
+ ):
+ self.db_uri = db_uri
+ self.engine = create_engine(db_uri)
+ self.time_period = time_period
+ self.dataset_path = dataset_path
+ self._entity_rel_cache = None
+
+ # ---------------------------------------------------------------
+ # Entity relationships
+ # ---------------------------------------------------------------
+
+ def _build_entity_relationship(self, sim) -> pd.DataFrame:
+ if self._entity_rel_cache is not None:
+ return self._entity_rel_cache
+
+ self._entity_rel_cache = pd.DataFrame(
+ {
+ "person_id": sim.calculate(
+ "person_id", map_to="person"
+ ).values,
+ "household_id": sim.calculate(
+ "household_id", map_to="person"
+ ).values,
+ "tax_unit_id": sim.calculate(
+ "tax_unit_id", map_to="person"
+ ).values,
+ "spm_unit_id": sim.calculate(
+ "spm_unit_id", map_to="person"
+ ).values,
+ }
+ )
+ return self._entity_rel_cache
+
+ # ---------------------------------------------------------------
+ # Constraint evaluation
+ # ---------------------------------------------------------------
+
+ def _evaluate_constraints_entity_aware(
+ self,
+ sim,
+ constraints: List[dict],
+ n_households: int,
+ ) -> np.ndarray:
+ """Evaluate constraints at person level, aggregate to
+ household level via .any()."""
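+        # Semantics: a household passes if at least one of its
+        # persons satisfies ALL the constraints simultaneously.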
+ if not constraints:
+ return np.ones(n_households, dtype=bool)
+
+ entity_rel = self._build_entity_relationship(sim)
+ n_persons = len(entity_rel)
+ person_mask = np.ones(n_persons, dtype=bool)
+
+ for c in constraints:
+ try:
+ vals = sim.calculate(
+ c["variable"],
+ self.time_period,
+ map_to="person",
+ ).values
+ except Exception as exc:
+ logger.warning(
+ "Cannot evaluate constraint '%s': %s",
+ c["variable"],
+ exc,
+ )
+ return np.zeros(n_households, dtype=bool)
+ person_mask &= apply_op(vals, c["operation"], c["value"])
+
+ df = entity_rel.copy()
+ df["satisfies"] = person_mask
+ hh_mask = df.groupby("household_id")["satisfies"].any()
+
+ household_ids = sim.calculate(
+ "household_id", map_to="household"
+ ).values
+ return np.array([hh_mask.get(hid, False) for hid in household_ids])
+
+ # ---------------------------------------------------------------
+ # Database queries
+ # ---------------------------------------------------------------
+
+ def _get_stratum_constraints(self, stratum_id: int) -> List[dict]:
+ query = """
+ SELECT constraint_variable AS variable, operation, value
+ FROM stratum_constraints
+ WHERE stratum_id = :stratum_id
+ """
+ with self.engine.connect() as conn:
+ df = pd.read_sql(
+ query,
+ conn,
+ params={"stratum_id": int(stratum_id)},
+ )
+ return df.to_dict("records")
+
+ def _query_targets(self, target_filter: dict) -> pd.DataFrame:
+ """Query targets via target_overview view with
+ best-period selection."""
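+        # Best-period rule: the latest period <= time_period when
+        # one exists, else the earliest available. E.g. for
+        # time_period=2024, periods {2021, 2022, 2026} -> 2022,
+        # while {2026, 2027} -> 2026.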
+ or_conditions = []
+
+ if "domain_variables" in target_filter:
+ dvs = target_filter["domain_variables"]
+ ph = ",".join(f"'{dv}'" for dv in dvs)
+ or_conditions.append(f"tv.domain_variable IN ({ph})")
+
+ if "variables" in target_filter:
+ vs = ",".join(f"'{v}'" for v in target_filter["variables"])
+ or_conditions.append(f"tv.variable IN ({vs})")
+
+ if "target_ids" in target_filter:
+ ids = ",".join(map(str, target_filter["target_ids"]))
+ or_conditions.append(f"tv.target_id IN ({ids})")
+
+ if "stratum_ids" in target_filter:
+ ids = ",".join(map(str, target_filter["stratum_ids"]))
+ or_conditions.append(f"tv.stratum_id IN ({ids})")
+
+ if not or_conditions:
+ where_clause = "1=1"
+ else:
+ where_clause = " OR ".join(f"({c})" for c in or_conditions)
+
+ query = f"""
+ WITH filtered_targets AS (
+ SELECT tv.target_id, tv.stratum_id, tv.variable,
+ tv.value, tv.period, tv.geo_level,
+ tv.geographic_id, tv.domain_variable
+ FROM target_overview tv
+ WHERE {where_clause}
+ ),
+ best_periods AS (
+ SELECT stratum_id, variable,
+ CASE
+ WHEN MAX(CASE WHEN period <= :time_period
+ THEN period END) IS NOT NULL
+ THEN MAX(CASE WHEN period <= :time_period
+ THEN period END)
+ ELSE MIN(period)
+ END as best_period
+ FROM filtered_targets
+ GROUP BY stratum_id, variable
+ )
+ SELECT ft.*
+ FROM filtered_targets ft
+ JOIN best_periods bp
+ ON ft.stratum_id = bp.stratum_id
+ AND ft.variable = bp.variable
+ AND ft.period = bp.best_period
+ ORDER BY ft.target_id
+ """
+
+ with self.engine.connect() as conn:
+ return pd.read_sql(
+ query,
+ conn,
+ params={"time_period": self.time_period},
+ )
+
+ # ---------------------------------------------------------------
+ # Uprating
+ # ---------------------------------------------------------------
+
+ def _calculate_uprating_factors(self, params) -> dict:
+ factors = {}
+ query = (
+ "SELECT DISTINCT period FROM targets "
+ "WHERE period IS NOT NULL ORDER BY period"
+ )
+ with self.engine.connect() as conn:
+ result = conn.execute(text(query))
+ years_needed = [row[0] for row in result]
+
+ for from_year in years_needed:
+ if from_year == self.time_period:
+ factors[(from_year, "cpi")] = 1.0
+ factors[(from_year, "pop")] = 1.0
+ continue
+
+ try:
+ cpi_from = params.gov.bls.cpi.cpi_u(from_year)
+ cpi_to = params.gov.bls.cpi.cpi_u(self.time_period)
+ factors[(from_year, "cpi")] = float(cpi_to / cpi_from)
+ except Exception:
+ factors[(from_year, "cpi")] = 1.0
+
+ try:
+ pop_from = params.calibration.gov.census.populations.total(
+ from_year
+ )
+ pop_to = params.calibration.gov.census.populations.total(
+ self.time_period
+ )
+ factors[(from_year, "pop")] = float(pop_to / pop_from)
+ except Exception:
+ factors[(from_year, "pop")] = 1.0
+
+ return factors
+
+ def _get_uprating_info(
+ self,
+ variable: str,
+ period: int,
+ factors: dict,
+ ) -> Tuple[float, str]:
+ if period == self.time_period:
+ return 1.0, "none"
+
+ count_indicators = [
+ "count",
+ "person",
+ "people",
+ "households",
+ "tax_units",
+ ]
+ is_count = any(ind in variable.lower() for ind in count_indicators)
+ uprating_type = "pop" if is_count else "cpi"
+ factor = factors.get((period, uprating_type), 1.0)
+ return factor, uprating_type
+
+ def _load_aca_ptc_factors(
+ self,
+ ) -> Dict[int, Dict[str, float]]:
+ csv_path = STORAGE_FOLDER / "aca_ptc_multipliers_2022_2024.csv"
+ df = pd.read_csv(csv_path)
+ result = {}
+ for _, row in df.iterrows():
+ fips_str = STATE_NAME_TO_FIPS.get(row["state"])
+ if fips_str is None:
+ continue
+ fips_int = int(fips_str)
+ result[fips_int] = {
+ "tax_unit_count": row["vol_mult"],
+ "aca_ptc": row["vol_mult"] * row["val_mult"],
+ }
+ return result
+
+ def _get_state_uprating_factors(
+ self,
+ domain: str,
+ targets_df: pd.DataFrame,
+ national_factors: dict,
+ ) -> Dict[int, Dict[str, float]]:
+ state_rows = targets_df[
+ (targets_df["domain_variable"] == domain)
+ & (targets_df["geo_level"] == "state")
+ ]
+ state_fips_list = state_rows["geographic_id"].unique()
+ variables = state_rows["variable"].unique()
+
+ if domain == "aca_ptc":
+ csv_factors = self._load_aca_ptc_factors()
+ else:
+ csv_factors = None
+
+ result = {}
+ for sf in state_fips_list:
+ state_int = int(sf)
+ var_factors = {}
+
+ if csv_factors and state_int in csv_factors:
+ for var in variables:
+ var_factors[var] = csv_factors[state_int].get(var, 1.0)
+ else:
+ for var in variables:
+ row = state_rows[
+ (state_rows["geographic_id"] == sf)
+ & (state_rows["variable"] == var)
+ ]
+ if row.empty:
+ var_factors[var] = 1.0
+ continue
+ period = row.iloc[0]["period"]
+ factor, _ = self._get_uprating_info(
+ var, period, national_factors
+ )
+ var_factors[var] = factor
+
+ result[state_int] = var_factors
+
+ return result
+
+ def _apply_hierarchical_uprating(
+ self,
+ targets_df: pd.DataFrame,
+ hierarchical_domains: List[str],
+ national_factors: dict,
+ ) -> pd.DataFrame:
+ """Apply state-level uprating and reconcile CDs.
+
+ Two factors per CD row:
+ - hif: state_original / sum(cd_originals)
+ - uprating_factor: state-specific scaling
+
+ Final CD value = original * hif * uprating_factor.
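+
+        Worked example: if the state target is 100 and its CDs'
+        original values sum to 80, hif = 1.25; a CD with original
+        value 20 and uprating_factor 1.04 gets
+        20 * 1.25 * 1.04 = 26.0.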
+ """
+ df = targets_df.copy()
+ df["hif"] = np.nan
+ df["state_uprating_factor"] = np.nan
+ rows_to_drop = []
+
+ for domain in hierarchical_domains:
+ domain_mask = df["domain_variable"] == domain
+ state_factors = self._get_state_uprating_factors(
+ domain, df, national_factors
+ )
+ state_mask = domain_mask & (df["geo_level"] == "state")
+ district_mask = domain_mask & (df["geo_level"] == "district")
+
+ for sf, var_factors in state_factors.items():
+ for var, uf in var_factors.items():
+ state_row = df[
+ state_mask
+ & (df["geographic_id"] == str(sf))
+ & (df["variable"] == var)
+ ]
+ if state_row.empty:
+ continue
+ state_original = state_row.iloc[0]["original_value"]
+
+ def _cd_in_state(g, s=sf):
+ try:
+ return int(g) // 100 == s
+ except (ValueError, TypeError):
+ return False
+
+ cd_mask = (
+ district_mask
+ & (df["variable"] == var)
+ & df["geographic_id"].apply(_cd_in_state)
+ )
+ cd_rows = df[cd_mask]
+ if cd_rows.empty:
+ continue
+
+ cd_original_sum = cd_rows["original_value"].sum()
+ if cd_original_sum == 0:
+ continue
+
+ hif = state_original / cd_original_sum
+ for cd_idx in cd_rows.index:
+ df.at[cd_idx, "hif"] = hif
+ df.at[cd_idx, "state_uprating_factor"] = uf
+ df.at[cd_idx, "value"] = (
+ df.at[cd_idx, "original_value"] * hif * uf
+ )
+
+ # Drop national/state rows used for reconciliation
+ national_mask = domain_mask & (df["geo_level"] == "national")
+ for idx in df[national_mask | state_mask].index:
+ row = df.loc[idx]
+ if row["period"] != self.time_period:
+ rows_to_drop.append(idx)
+
+ if rows_to_drop:
+ df = df.drop(index=rows_to_drop).reset_index(drop=True)
+
+ df["target_period"] = self.time_period
+ return df
+
+ def print_uprating_summary(self, targets_df: pd.DataFrame) -> None:
+ has_state_uf = "state_uprating_factor" in targets_df.columns
+ if has_state_uf:
+ eff = targets_df["state_uprating_factor"].fillna(
+ targets_df["uprating_factor"]
+ )
+ else:
+ eff = targets_df["uprating_factor"]
+
+ uprated = targets_df[eff != 1.0]
+ if len(uprated) == 0:
+ print("No targets were uprated.")
+ return
+
+ print("\n" + "=" * 60)
+ print("UPRATING SUMMARY")
+ print("=" * 60)
+    print(f"Uprated {len(uprated)} of {len(targets_df)} targets")
+ period_counts = uprated["period"].value_counts().sort_index()
+ for period, count in period_counts.items():
+ print(f" Period {period}: {count} targets")
+ factors = eff[eff != 1.0]
+ print(
+ f" Factor range: [{factors.min():.4f}, " f"{factors.max():.4f}]"
+ )
+
+ # ---------------------------------------------------------------
+ # Target naming
+ # ---------------------------------------------------------------
+
+ @staticmethod
+ def _make_target_name(
+ variable: str,
+ constraints: List[dict],
+ reform_id: int = 0,
+ ) -> str:
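+        """Build a readable target name, e.g. constraints
+        [state_fips==37, age>=65] on variable "snap" yield
+        "state_37/snap/[age>=65]" (illustrative)."""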
+ geo_parts: List[str] = []
+ for c in constraints:
+ if c["variable"] == "state_fips":
+ geo_parts.append(f"state_{c['value']}")
+ elif c["variable"] == "congressional_district_geoid":
+ geo_parts.append(f"cd_{c['value']}")
+
+ parts: List[str] = []
+ parts.append("/".join(geo_parts) if geo_parts else "national")
+ if reform_id > 0:
+ parts.append(f"{variable}_expenditure")
+ else:
+ parts.append(variable)
+
+ non_geo = [c for c in constraints if c["variable"] not in _GEO_VARS]
+ if non_geo:
+ strs = [
+ f"{c['variable']}{c['operation']}{c['value']}" for c in non_geo
+ ]
+ parts.append("[" + ",".join(strs) + "]")
+
+ return "/".join(parts)
+
+ # ---------------------------------------------------------------
+ # Target value calculation
+ # ---------------------------------------------------------------
+
+ def _calculate_target_values(
+ self,
+ sim,
+ target_variable: str,
+ non_geo_constraints: List[dict],
+ n_households: int,
+ ) -> np.ndarray:
+ """Calculate per-household target values.
+
+ For count targets (*_count): count entities per HH
+ satisfying constraints.
+ For value targets: multiply values by constraint mask.
+ """
+ is_count = target_variable.endswith("_count")
+
+ if not is_count:
+ mask = self._evaluate_constraints_entity_aware(
+ sim, non_geo_constraints, n_households
+ )
+ vals = sim.calculate(target_variable, map_to="household").values
+ return (vals * mask).astype(np.float32)
+
+ # Count target: entity-aware counting
+ entity_rel = self._build_entity_relationship(sim)
+ n_persons = len(entity_rel)
+ person_mask = np.ones(n_persons, dtype=bool)
+
+ for c in non_geo_constraints:
+ try:
+ cv = sim.calculate(c["variable"], map_to="person").values
+ except Exception:
+ return np.zeros(n_households, dtype=np.float32)
+ person_mask &= apply_op(cv, c["operation"], c["value"])
+
+ target_entity = sim.tax_benefit_system.variables[
+ target_variable
+ ].entity.key
+ household_ids = sim.calculate(
+ "household_id", map_to="household"
+ ).values
+
+ if target_entity == "household":
+ if non_geo_constraints:
+ mask = self._evaluate_constraints_entity_aware(
+ sim, non_geo_constraints, n_households
+ )
+ return mask.astype(np.float32)
+ return np.ones(n_households, dtype=np.float32)
+
+ if target_entity == "person":
+ er = entity_rel.copy()
+ er["satisfies"] = person_mask
+ filtered = er[er["satisfies"]]
+ counts = filtered.groupby("household_id")["person_id"].nunique()
+ else:
+ eid_col = f"{target_entity}_id"
+ er = entity_rel.copy()
+ er["satisfies"] = person_mask
+ entity_ok = er.groupby(eid_col)["satisfies"].any()
+ unique = er[["household_id", eid_col]].drop_duplicates()
+ unique["entity_ok"] = unique[eid_col].map(entity_ok)
+ filtered = unique[unique["entity_ok"]]
+ counts = filtered.groupby("household_id")[eid_col].nunique()
+
+ return np.array(
+ [counts.get(hid, 0) for hid in household_ids],
+ dtype=np.float32,
+ )
+
+ # ---------------------------------------------------------------
+ # Clone simulation
+ # ---------------------------------------------------------------
+
+ def _simulate_clone(
+ self,
+ clone_state_fips: np.ndarray,
+ n_records: int,
+ variables: set,
+ sim_modifier=None,
+ clone_idx: int = 0,
+ ) -> Tuple[Dict[str, np.ndarray], object]:
+ """Simulate one clone with assigned geography.
+
+ Args:
+ clone_state_fips: State FIPS per record, shape
+ (n_records,).
+ n_records: Number of base records.
+ variables: Target variable names to compute.
+ sim_modifier: Optional callback(sim, clone_idx)
+ called after state_fips is set but before
+ cache clearing. Used for takeup
+ re-randomization.
+ clone_idx: Clone index passed to sim_modifier.
+
+ Returns:
+ (var_values, sim) where var_values maps variable
+ name to household-level float32 array.
+ """
+ from policyengine_us import Microsimulation
+
+ sim = Microsimulation(dataset=self.dataset_path)
+ sim.set_input(
+ "state_fips",
+ self.time_period,
+ clone_state_fips.astype(np.int32),
+ )
+ if sim_modifier is not None:
+ sim_modifier(sim, clone_idx)
+ for var in get_calculated_variables(sim):
+ sim.delete_arrays(var)
+
+ var_values: Dict[str, np.ndarray] = {}
+ for var in variables:
+ if var.endswith("_count"):
+ continue
+ try:
+ var_values[var] = sim.calculate(
+ var,
+ self.time_period,
+ map_to="household",
+ ).values.astype(np.float32)
+ except Exception as exc:
+ logger.warning("Cannot calculate '%s': %s", var, exc)
+
+ return var_values, sim
+
+ # ---------------------------------------------------------------
+ # Main build method
+ # ---------------------------------------------------------------
+
+ def build_matrix(
+ self,
+ geography,
+ sim,
+ target_filter: Optional[dict] = None,
+ hierarchical_domains: Optional[List[str]] = None,
+ cache_dir: Optional[str] = None,
+ sim_modifier=None,
+ ) -> Tuple[pd.DataFrame, sparse.csr_matrix, List[str]]:
+ """Build sparse calibration matrix.
+
+ Two-phase build: (1) simulate each clone and save
+ COO entries to disk, (2) assemble CSR from caches.
+
+ Args:
+ geography: GeographyAssignment with state_fips,
+ cd_geoid, block_geoid arrays and n_records,
+ n_clones attributes.
+ sim: Microsimulation for parameters and entity
+ relationships.
+ target_filter: Dict for target_overview filtering.
+ hierarchical_domains: Domain names for
+ hierarchical uprating + CD reconciliation.
+ cache_dir: Directory for per-clone COO caches.
+ If None, COO data held in memory.
+ sim_modifier: Optional callback(sim, clone_idx)
+ called per clone after state_fips is set but
+ before cache clearing. Use for takeup
+ re-randomization.
+
+ Returns:
+ (targets_df, X_sparse, target_names)
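+
+        Example (sketch; paths and names are placeholders)::
+
+            builder = UnifiedMatrixBuilder(
+                db_uri="sqlite:///policy_data.db",
+                time_period=2024,
+                dataset_path="cps_2024.h5",
+            )
+            targets_df, X, names = builder.build_matrix(
+                geography=geography,  # from assign_random_geography
+                sim=sim,
+            )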
+ """
+ n_records = geography.n_records
+ n_clones = geography.n_clones
+ n_total = n_records * n_clones
+ self._coo_parts = ([], [], [])
+
+ # 1. Query and uprate targets
+ targets_df = self._query_targets(target_filter or {})
+ if len(targets_df) == 0:
+ raise ValueError("No targets found matching filter")
+
+ params = sim.tax_benefit_system.parameters
+ uprating_factors = self._calculate_uprating_factors(params)
+ targets_df["original_value"] = targets_df["value"].copy()
+ targets_df["uprating_factor"] = targets_df.apply(
+ lambda row: self._get_uprating_info(
+ row["variable"],
+ row["period"],
+ uprating_factors,
+ )[0],
+ axis=1,
+ )
+ targets_df["value"] = (
+ targets_df["original_value"] * targets_df["uprating_factor"]
+ )
+
+ if hierarchical_domains:
+ targets_df = self._apply_hierarchical_uprating(
+ targets_df,
+ hierarchical_domains,
+ uprating_factors,
+ )
+
+ n_targets = len(targets_df)
+
+ # 2. Sort targets by geographic level
+ targets_df["_geo_level"] = targets_df["geographic_id"].apply(
+ get_geo_level
+ )
+ targets_df = targets_df.sort_values(
+ ["_geo_level", "variable", "geographic_id"]
+ )
+ targets_df = targets_df.drop(columns=["_geo_level"]).reset_index(
+ drop=True
+ )
+
+ # 3. Build column index structures from geography
+ state_col_lists: Dict[int, list] = defaultdict(list)
+ cd_col_lists: Dict[str, list] = defaultdict(list)
+ for col in range(n_total):
+ state_col_lists[int(geography.state_fips[col])].append(col)
+ cd_col_lists[str(geography.cd_geoid[col])].append(col)
+ state_to_cols = {s: np.array(c) for s, c in state_col_lists.items()}
+ cd_to_cols = {cd: np.array(c) for cd, c in cd_col_lists.items()}
+
+ # 4. Pre-process targets: resolve constraints
+ constraint_cache: Dict[int, List[dict]] = {}
+ target_geo_info: List[Tuple[str, str]] = []
+ target_names: List[str] = []
+ non_geo_constraints_list: List[List[dict]] = []
+
+ for _, row in targets_df.iterrows():
+ sid = int(row["stratum_id"])
+ if sid not in constraint_cache:
+ constraint_cache[sid] = self._get_stratum_constraints(sid)
+ constraints = constraint_cache[sid]
+
+ geo_level = row["geo_level"]
+ geo_id = row["geographic_id"]
+ target_geo_info.append((geo_level, geo_id))
+
+ non_geo = [
+ c for c in constraints if c["variable"] not in _GEO_VARS
+ ]
+ non_geo_constraints_list.append(non_geo)
+
+ target_names.append(
+ self._make_target_name(str(row["variable"]), constraints)
+ )
+
+ unique_variables = set(targets_df["variable"].values)
+
+ # 5. Clone loop
+ from pathlib import Path
+
+ clone_dir = Path(cache_dir) if cache_dir else None
+ if clone_dir:
+ clone_dir.mkdir(parents=True, exist_ok=True)
+
+ self._entity_rel_cache = None
+
+ for clone_idx in range(n_clones):
+ if clone_dir:
+ coo_path = clone_dir / f"clone_{clone_idx:04d}.npz"
+ if coo_path.exists():
+ logger.info(
+ "Clone %d/%d cached, skipping.",
+ clone_idx + 1,
+ n_clones,
+ )
+ continue
+
+ col_start = clone_idx * n_records
+ col_end = col_start + n_records
+ clone_states = geography.state_fips[col_start:col_end]
+
+ logger.info(
+            "Processing clone %d/%d (cols %d-%d, %d unique states)...",
+ clone_idx + 1,
+ n_clones,
+ col_start,
+ col_end - 1,
+ len(np.unique(clone_states)),
+ )
+
+ var_values, clone_sim = self._simulate_clone(
+ clone_states,
+ n_records,
+ unique_variables,
+ sim_modifier=sim_modifier,
+ clone_idx=clone_idx,
+ )
+
+ mask_cache: Dict[tuple, np.ndarray] = {}
+ count_cache: Dict[tuple, np.ndarray] = {}
+
+ rows_list: list = []
+ cols_list: list = []
+ vals_list: list = []
+
+ for row_idx in range(n_targets):
+ variable = str(targets_df.iloc[row_idx]["variable"])
+ geo_level, geo_id = target_geo_info[row_idx]
+ non_geo = non_geo_constraints_list[row_idx]
+
+ # Geographic column selection
+ if geo_level == "district":
+ all_geo_cols = cd_to_cols.get(
+ str(geo_id),
+ np.array([], dtype=np.int64),
+ )
+ elif geo_level == "state":
+ all_geo_cols = state_to_cols.get(
+ int(geo_id),
+ np.array([], dtype=np.int64),
+ )
+ else:
+ all_geo_cols = np.arange(n_total)
+
+ clone_cols = all_geo_cols[
+ (all_geo_cols >= col_start) & (all_geo_cols < col_end)
+ ]
+ if len(clone_cols) == 0:
+ continue
+
+ rec_indices = clone_cols - col_start
+
+ constraint_key = tuple(
+ sorted(
+ (
+ c["variable"],
+ c["operation"],
+ c["value"],
+ )
+ for c in non_geo
+ )
+ )
+
+ if variable.endswith("_count"):
+ vkey = (variable, constraint_key)
+ if vkey not in count_cache:
+ count_cache[vkey] = self._calculate_target_values(
+ clone_sim,
+ variable,
+ non_geo,
+ n_records,
+ )
+ values = count_cache[vkey]
+ else:
+ if variable not in var_values:
+ continue
+ if constraint_key not in mask_cache:
+ mask_cache[constraint_key] = (
+ self._evaluate_constraints_entity_aware(
+ clone_sim,
+ non_geo,
+ n_records,
+ )
+ )
+ mask = mask_cache[constraint_key]
+ values = var_values[variable] * mask
+
+ vals = values[rec_indices]
+ nonzero = vals != 0
+ if nonzero.any():
+ rows_list.append(
+ np.full(
+ nonzero.sum(),
+ row_idx,
+ dtype=np.int32,
+ )
+ )
+ cols_list.append(clone_cols[nonzero].astype(np.int32))
+ vals_list.append(vals[nonzero])
+
+ # Save COO entries
+ if rows_list:
+ cr = np.concatenate(rows_list)
+ cc = np.concatenate(cols_list)
+ cv = np.concatenate(vals_list)
+ else:
+ cr = np.array([], dtype=np.int32)
+ cc = np.array([], dtype=np.int32)
+ cv = np.array([], dtype=np.float32)
+
+ if clone_dir:
+ np.savez_compressed(
+ str(coo_path),
+ rows=cr,
+ cols=cc,
+ vals=cv,
+ )
+ logger.info(
+ "Clone %d: %d nonzero entries saved.",
+ clone_idx + 1,
+ len(cv),
+ )
+ del var_values, clone_sim
+ else:
+ self._coo_parts[0].append(cr)
+ self._coo_parts[1].append(cc)
+ self._coo_parts[2].append(cv)
+
+ # 6. Assemble sparse matrix from COO data
+ logger.info("Assembling matrix from %d clones...", n_clones)
+ if clone_dir:
+ all_r, all_c, all_v = [], [], []
+ for ci in range(n_clones):
+ p = clone_dir / f"clone_{ci:04d}.npz"
+ data = np.load(str(p))
+ all_r.append(data["rows"])
+ all_c.append(data["cols"])
+ all_v.append(data["vals"])
+ rows = np.concatenate(all_r)
+ cols = np.concatenate(all_c)
+ vals = np.concatenate(all_v)
+ else:
+ rows = np.concatenate(self._coo_parts[0])
+ cols = np.concatenate(self._coo_parts[1])
+ vals = np.concatenate(self._coo_parts[2])
+ del self._coo_parts
+
+ X_csr = sparse.csr_matrix(
+ (vals, (rows, cols)),
+ shape=(n_targets, n_total),
+ dtype=np.float32,
+ )
+
+ logger.info(
+ "Matrix: %d targets x %d cols, %d nnz",
+ X_csr.shape[0],
+ X_csr.shape[1],
+ X_csr.nnz,
+ )
+
+ return targets_df, X_csr, target_names
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py
index 3dcab0e9f..97c82360d 100644
--- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py
+++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py
@@ -277,7 +277,7 @@ def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray:
return np.ones(len(values), dtype=bool)
-def _get_geo_level(geo_id) -> int:
+def get_geo_level(geo_id) -> int:
"""Return geographic level: 0=National, 1=State, 2=District."""
if geo_id == "US":
return 0
@@ -324,9 +324,7 @@ def create_target_groups(
# Add geo_level column for sorting
targets_df = targets_df.copy()
- targets_df["_geo_level"] = targets_df["geographic_id"].apply(
- _get_geo_level
- )
+ targets_df["_geo_level"] = targets_df["geographic_id"].apply(get_geo_level)
geo_level_names = {0: "National", 1: "State", 2: "District"}
@@ -401,6 +399,70 @@ def create_target_groups(
return target_groups, group_info
+_GEO_LEVEL_NAMES = {0: "National", 1: "State", 2: "District"}
+
+
+def drop_target_groups(
+ targets_df: pd.DataFrame,
+ X_sparse,
+ target_groups: np.ndarray,
+ group_info: List[str],
+ drop_specs: List[Tuple[str, str]],
+) -> Tuple[pd.DataFrame, "sparse.csr_matrix"]:
+ """Drop target groups by (label_substring, geo_level_name).
+
+ Args:
+ targets_df: Target metadata from build_matrix.
+ X_sparse: Sparse calibration matrix (n_targets x n_cols).
+ target_groups: Group ID per row from create_target_groups.
+ group_info: Group descriptions from create_target_groups.
+ drop_specs: List of (label_substring, geo_level_name)
+ tuples. geo_level_name is "National", "State", or
+ "District". label_substring is matched case-insensitive
+ against group descriptions.
+
+ Returns:
+ (filtered_targets_df, filtered_X_sparse)
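+
+    Example (group labels are illustrative; match against group_info):
+        >>> targets_df, X_sparse = drop_target_groups(
+        ...     targets_df, X_sparse, target_groups, group_info,
+        ...     drop_specs=[("snap", "State")],
+        ... )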
+ """
+ geo_levels = targets_df["geographic_id"].apply(get_geo_level)
+ name_to_level = {v: k for k, v in _GEO_LEVEL_NAMES.items()}
+ drop_ids = set()
+
+ for label_substr, geo_name in drop_specs:
+ level = name_to_level[geo_name]
+ matched = False
+ for gid, info in enumerate(group_info):
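+            # A group matches only if every row in it sits at the
+            # requested geo level; mixed-level groups are skipped.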
+ group_mask = target_groups == gid
+ group_geo = geo_levels[group_mask]
+ if not (group_geo == level).all():
+ continue
+ if label_substr.lower() in info.lower():
+ drop_ids.add(gid)
+ matched = True
+ if not matched:
+            print(
+                f" WARNING: no match for ({label_substr!r}, {geo_name!r})"
+            )
+
+ keep_mask = ~np.isin(target_groups, list(drop_ids))
+
+ print(f"Matrix before: {X_sparse.shape[0]} rows")
+ for gid in sorted(drop_ids):
+ n = (target_groups == gid).sum()
+ print(f" DROPPING {group_info[gid]} ({n} rows)")
+ print()
+
+ kept_ids = sorted(set(range(len(group_info))) - drop_ids)
+ for gid in kept_ids:
+ n = (target_groups == gid).sum()
+ print(f" KEEPING {group_info[gid]} ({n} rows)")
+
+ X_out = X_sparse[keep_mask, :]
+ targets_out = targets_df[keep_mask].reset_index(drop=True)
+ print(f"\nMatrix after: {X_out.shape[0]} rows")
+ return targets_out, X_out
+
+
def get_all_cds_from_database(db_uri: str) -> List[str]:
"""
Get ordered list of all CD GEOIDs from database.
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py
deleted file mode 100644
index 7185c7dc1..000000000
--- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py
+++ /dev/null
@@ -1,284 +0,0 @@
-"""
-Fit calibration weights using L0-regularized optimization.
-Prototype script for weight calibration using the l0-python package.
-"""
-
-import argparse
-import logging
-from datetime import datetime
-from pathlib import Path
-
-logging.basicConfig(
- level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-
-parser = argparse.ArgumentParser(description="Fit calibration weights")
-parser.add_argument(
- "--device",
- default="cpu",
- choices=["cpu", "cuda"],
- help="Device for training (cpu or cuda)",
-)
-parser.add_argument(
- "--epochs", type=int, default=100, help="Total epochs for training"
-)
-parser.add_argument(
- "--db-path",
- default=None,
- help="Path to policy_data.db (default: STORAGE_FOLDER/calibration/policy_data.db)",
-)
-parser.add_argument(
- "--dataset-path", default=None, help="Path to stratified CPS h5 file"
-)
-args = parser.parse_args()
-
-import numpy as np
-import pandas as pd
-from policyengine_us import Microsimulation
-from policyengine_us_data.storage import STORAGE_FOLDER
-from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
- SparseMatrixBuilder,
-)
-from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
- get_all_cds_from_database,
-)
-from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (
- MatrixTracer,
-)
-
-try:
- import torch
- from l0.calibration import SparseCalibrationWeights
-except ImportError:
- raise ImportError(
- "l0-python is required for weight fitting. "
- "Install with: pip install policyengine-us-data[l0]"
- )
-
-# ============================================================================
-# CONFIGURATION
-# ============================================================================
-DEVICE = args.device
-TOTAL_EPOCHS = args.epochs
-EPOCHS_PER_CHUNK = 500 # TODO: need a better way to set this. Remember it can blow up the Vercel app
-
-# Groups to exclude from the matrix (by group ID from tracer output).
-# Set to [] to keep all groups. Review tracer.print_matrix_structure()
-# output to decide. E.g., drop state-level rows that are linearly
-# redundant with reconciled district rows — or keep them to steer
-# the optimizer.
-GROUPS_TO_EXCLUDE = [1] # drop state SNAP HH counts (redundant with Group 4)
-
-# Hyperparameters
-BETA = 0.35
-GAMMA = -0.1
-ZETA = 1.1
-INIT_KEEP_PROB = 0.999
-LOG_WEIGHT_JITTER_SD = 0.05
-LOG_ALPHA_JITTER_SD = 0.01
-LAMBDA_L0 = 1e-8
-LAMBDA_L2 = 1e-12
-LEARNING_RATE = 0.15
-
-# Data paths
-if args.db_path:
- db_path = Path(args.db_path)
-else:
- db_path = STORAGE_FOLDER / "calibration" / "policy_data.db"
-db_uri = f"sqlite:///{db_path}"
-
-if args.dataset_path:
- dataset_path = Path(args.dataset_path)
-else:
- dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2024.h5"
-
-output_dir = STORAGE_FOLDER / "calibration"
-output_dir.mkdir(parents=True, exist_ok=True)
-time_period = 2024
-
-# Get all CDs from database
-cds_to_calibrate = get_all_cds_from_database(db_uri)
-print(f"Found {len(cds_to_calibrate)} congressional districts")
-
-# ============================================================================
-# STEP 1: BUILD CALIBRATION MATRIX
-# ============================================================================
-print(f"Loading simulation from {dataset_path}...")
-sim = Microsimulation(dataset=str(dataset_path))
-n_households = len(sim.calculate("household_id", map_to="household").values)
-print(f"Loaded {n_households:,} households")
-
-print("\nBuilding sparse matrix...")
-builder = SparseMatrixBuilder(
- db_uri=db_uri,
- time_period=time_period,
- cds_to_calibrate=cds_to_calibrate,
- dataset_path=str(dataset_path),
-)
-
-targets_df, X_sparse, household_id_mapping = builder.build_matrix(
- sim,
- target_filter={
- "domain_variables": ["aca_ptc", "snap"],
- },
- hierarchical_domains=["aca_ptc", "snap"],
-)
-
-builder.print_uprating_summary(targets_df)
-
-tracer = MatrixTracer(
- targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim
-)
-tracer.print_matrix_structure()
-
-print(f"\nMatrix shape: {X_sparse.shape}")
-print(f"Targets: {len(targets_df)}")
-
-# ============================================================================
-# STEP 2: FILTER GROUPS AND ACHIEVABLE TARGETS
-# ============================================================================
-if GROUPS_TO_EXCLUDE:
- keep_mask = ~np.isin(tracer.target_groups, GROUPS_TO_EXCLUDE)
- n_dropped = (~keep_mask).sum()
- print("\n" + "=" * 60)
- print("GROUP EXCLUSION")
- print("=" * 60)
- print(
- f"Excluding groups {GROUPS_TO_EXCLUDE}: "
- f"dropping {n_dropped} of {len(targets_df)} rows"
- )
- targets_df = targets_df[keep_mask].reset_index(drop=True)
- X_sparse = X_sparse[keep_mask, :]
- print(f"Matrix after exclusion: {X_sparse.shape}")
-else:
- print("\nNo groups excluded (GROUPS_TO_EXCLUDE is empty)")
-
-# Filter to achievable targets (rows with non-zero data)
-row_sums = np.array(X_sparse.sum(axis=1)).flatten()
-achievable_mask = row_sums > 0
-n_achievable = achievable_mask.sum()
-n_impossible = (~achievable_mask).sum()
-
-print(f"\nAchievable targets: {n_achievable}")
-print(f"Impossible targets (filtered out): {n_impossible}")
-
-targets_df = targets_df[achievable_mask].reset_index(drop=True)
-X_sparse = X_sparse[achievable_mask, :]
-
-print(f"Final matrix shape: {X_sparse.shape}")
-
-# Extract target vector and names
-targets = targets_df["value"].values
-target_names = [
- f"{row['geographic_id']}/{row['variable']}"
- for _, row in targets_df.iterrows()
-]
-
-# ============================================================================
-# STEP 3: INITIALIZE WEIGHTS
-# ============================================================================
-initial_weights = np.ones(X_sparse.shape[1]) * 100
-print(f"\nInitial weights shape: {initial_weights.shape}")
-print(f"Initial weights sum: {initial_weights.sum():,.0f}")
-
-# ============================================================================
-# STEP 4: CREATE MODEL
-# ============================================================================
-print("\nCreating SparseCalibrationWeights model...")
-model = SparseCalibrationWeights(
- n_features=X_sparse.shape[1],
- beta=BETA,
- gamma=GAMMA,
- zeta=ZETA,
- init_keep_prob=INIT_KEEP_PROB,
- init_weights=initial_weights,
- log_weight_jitter_sd=LOG_WEIGHT_JITTER_SD,
- log_alpha_jitter_sd=LOG_ALPHA_JITTER_SD,
- device=DEVICE,
-)
-
-# ============================================================================
-# STEP 5: TRAIN IN CHUNKS
-# ============================================================================
-timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-calibration_log = pd.DataFrame()
-
-for chunk_start in range(0, TOTAL_EPOCHS, EPOCHS_PER_CHUNK):
- chunk_epochs = min(EPOCHS_PER_CHUNK, TOTAL_EPOCHS - chunk_start)
- current_epoch = chunk_start + chunk_epochs
-
- print(f"\nTraining epochs {chunk_start + 1} to {current_epoch}...")
-
- model.fit(
- M=X_sparse,
- y=targets,
- target_groups=None,
- lambda_l0=LAMBDA_L0,
- lambda_l2=LAMBDA_L2,
- lr=LEARNING_RATE,
- epochs=chunk_epochs,
- loss_type="relative",
- verbose=True,
- verbose_freq=chunk_epochs,
- )
-
- with torch.no_grad():
- predictions = model.predict(X_sparse).cpu().numpy()
-
- chunk_df = pd.DataFrame(
- {
- "target_name": target_names,
- "estimate": predictions,
- "target": targets,
- }
- )
- chunk_df["epoch"] = current_epoch
- chunk_df["error"] = chunk_df.estimate - chunk_df.target
- chunk_df["rel_error"] = chunk_df.error / chunk_df.target
- chunk_df["abs_error"] = chunk_df.error.abs()
- chunk_df["rel_abs_error"] = chunk_df.rel_error.abs()
- chunk_df["loss"] = chunk_df.rel_abs_error**2
- calibration_log = pd.concat([calibration_log, chunk_df], ignore_index=True)
-
-# ============================================================================
-# STEP 6: EXTRACT AND SAVE WEIGHTS
-# ============================================================================
-with torch.no_grad():
- w = model.get_weights(deterministic=True).cpu().numpy()
-
-print(f"\nFinal weights shape: {w.shape}")
-print(f"Final weights sum: {w.sum():,.0f}")
-print(f"Non-zero weights: {(w > 0).sum():,}")
-
-output_path = output_dir / f"calibration_weights_{timestamp}.npy"
-np.save(output_path, w)
-print(f"\nWeights saved to: {output_path}")
-print(f"OUTPUT_PATH:{output_path}")
-
-log_path = output_dir / f"calibration_log_{timestamp}.csv"
-calibration_log.to_csv(log_path, index=False)
-print(f"Calibration log saved to: {log_path}")
-print(f"LOG_PATH:{log_path}")
-
-# ============================================================================
-# STEP 7: VERIFY PREDICTIONS
-# ============================================================================
-print("\n" + "=" * 60)
-print("PREDICTION VERIFICATION")
-print("=" * 60)
-
-with torch.no_grad():
- predictions = model.predict(X_sparse).cpu().numpy()
-
-for i in range(len(targets)):
- rel_error = (predictions[i] - targets[i]) / targets[i] * 100
- print(
- f"{target_names[i][:50]:50} | "
- f"pred: {predictions[i]:>12,.0f} | "
- f"target: {targets[i]:>12,.0f} | "
- f"err: {rel_error:>6.2f}%"
- )
-
-print("\n" + "=" * 60)
-print("FITTING COMPLETED")
-print("=" * 60)
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py b/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py
deleted file mode 100644
index 4fbe6e78f..000000000
--- a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py
+++ /dev/null
@@ -1,382 +0,0 @@
-"""
-Matrix tracer utility for debugging geo-stacking sparse matrices.
-
-This utility allows tracing through the complex stacked matrix structure
-to verify values match simulation results.
-
-USAGE
-=====
-
-Basic Setup:
-
- from matrix_tracer import MatrixTracer
-
- tracer = MatrixTracer(
- targets_df, X_sparse, household_id_mapping,
- cds_to_calibrate, sim
- )
-
-Common Operations:
-
- # 1. Understand what a column represents
- col_info = tracer.get_column_info(100)
-
- # 2. Find where a household appears across all CDs
- positions = tracer.get_household_column_positions(565)
-
- # 3. View matrix structure
- tracer.print_matrix_structure()
-
-Matrix Structure:
-
- Columns are organized as: [CD1_households | CD2_households | ... | CD436_households]
- Each CD block has n_households columns (e.g., 10,580 households)
-
- Formula to find column index:
- column_idx = cd_block_number * n_households + household_index
-"""
-
-import logging
-import pandas as pd
-import numpy as np
-from typing import Dict, List
-from scipy import sparse
-
-from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
- create_target_groups,
-)
-
-logger = logging.getLogger(__name__)
-
-
-class MatrixTracer:
- """Trace through geo-stacked sparse matrices for debugging."""
-
- def __init__(
- self,
- targets_df: pd.DataFrame,
- matrix: sparse.csr_matrix,
- household_id_mapping: Dict[str, List[str]],
- geographic_ids: List[str],
- sim,
- ):
- """
- Initialize tracer with matrix components.
-
- Args:
- targets_df: DataFrame of all targets
- matrix: The final stacked sparse matrix
- household_id_mapping: Mapping from geo keys to household ID lists
- geographic_ids: List of geographic IDs in order
- sim: Microsimulation instance
- """
- self.targets_df = targets_df
- self.matrix = matrix
- self.household_id_mapping = household_id_mapping
- self.geographic_ids = geographic_ids
- self.sim = sim
-
- # Get original household info
- self.original_household_ids = sim.calculate("household_id").values
- self.n_households = len(self.original_household_ids)
- self.n_geographies = len(geographic_ids)
-
- # Build reverse lookup: original_hh_id -> index in original data
- self.hh_id_to_index = {
- hh_id: idx for idx, hh_id in enumerate(self.original_household_ids)
- }
-
- # Build column catalog: maps column index -> (cd_geoid, household_id, household_index)
- self.column_catalog = self._build_column_catalog()
-
- # Build row catalog: maps row index -> target info
- self.row_catalog = self._build_row_catalog()
-
- logger.info(
- f"Tracer initialized: {self.n_households} households x {self.n_geographies} geographies"
- )
- logger.info(f"Matrix shape: {matrix.shape}")
-
- def _build_column_catalog(self) -> pd.DataFrame:
- """Build a complete catalog of all matrix columns."""
- catalog = []
- col_idx = 0
-
- for geo_id in self.geographic_ids:
- for hh_idx, hh_id in enumerate(self.original_household_ids):
- catalog.append(
- {
- "column_index": col_idx,
- "cd_geoid": geo_id,
- "household_id": hh_id,
- "household_index": hh_idx,
- }
- )
- col_idx += 1
-
- return pd.DataFrame(catalog)
-
- def _build_row_catalog(self) -> pd.DataFrame:
- """Build a complete catalog of all matrix rows (targets)."""
- catalog = []
-
- for row_idx, (_, target) in enumerate(self.targets_df.iterrows()):
- var_name = target["variable"]
- var_desc = ""
- if var_name in self.sim.tax_benefit_system.variables:
- var_obj = self.sim.tax_benefit_system.variables[var_name]
- var_desc = getattr(var_obj, "label", var_name)
-
- catalog.append(
- {
- "row_index": row_idx,
- "variable": var_name,
- "variable_desc": var_desc,
- "geographic_id": target.get("geographic_id", "unknown"),
- "target_value": target["value"],
- "stratum_id": target.get("stratum_id"),
- "domain_variable": target.get(
- "domain_variable", "unknown"
- ),
- }
- )
-
- return pd.DataFrame(catalog)
-
- def get_column_info(self, col_idx: int) -> Dict:
- """Get information about a specific column."""
- if col_idx >= len(self.column_catalog):
- raise ValueError(
- f"Column index {col_idx} out of range (max: {len(self.column_catalog)-1})"
- )
- return self.column_catalog.iloc[col_idx].to_dict()
-
- def get_row_info(self, row_idx: int) -> Dict:
- """Get information about a specific row (target)."""
- if row_idx >= len(self.row_catalog):
- raise ValueError(
- f"Row index {row_idx} out of range (max: {len(self.row_catalog)-1})"
- )
- return self.row_catalog.iloc[row_idx].to_dict()
-
- def lookup_matrix_cell(self, row_idx: int, col_idx: int) -> Dict:
- """
- Look up a specific matrix cell and return complete context.
-
- Args:
- row_idx: Row index in matrix
- col_idx: Column index in matrix
-
- Returns:
- Dict with row info, column info, and matrix value
- """
- row_info = self.get_row_info(row_idx)
- col_info = self.get_column_info(col_idx)
- matrix_value = self.matrix[row_idx, col_idx]
-
- return {
- "row_index": row_idx,
- "column_index": col_idx,
- "matrix_value": float(matrix_value),
- "target": row_info,
- "household": col_info,
- }
-
- def get_household_column_positions(
- self, original_hh_id: int
- ) -> Dict[str, int]:
- """
- Get all column positions for a household across all geographies.
-
- Args:
- original_hh_id: Original household ID from simulation
-
- Returns:
- Dict mapping geo_id to column position in stacked matrix
- """
- if original_hh_id not in self.hh_id_to_index:
- raise ValueError(
- f"Household {original_hh_id} not found in original data"
- )
-
- # Get the household's index in the original data
- hh_index = self.hh_id_to_index[original_hh_id]
-
- # Calculate column positions for each geography
- positions = {}
- for geo_idx, geo_id in enumerate(self.geographic_ids):
- # Each geography gets a block of n_households columns
- col_position = geo_idx * self.n_households + hh_index
- positions[geo_id] = col_position
-
- return positions
-
- def print_matrix_structure(self, show_groups=True):
- """Print a comprehensive breakdown of the matrix structure."""
- print("\n" + "=" * 80)
- print("MATRIX STRUCTURE BREAKDOWN")
- print("=" * 80)
-
- print(
- f"\nMatrix dimensions: {self.matrix.shape[0]} rows x "
- f"{self.matrix.shape[1]} columns"
- )
- print(f" Rows = {len(self.row_catalog)} targets")
- print(
- f" Columns = {self.n_households} households x "
- f"{self.n_geographies} CDs"
- )
- print(
- f" = {self.n_households:,} x {self.n_geographies} "
- f"= {self.matrix.shape[1]:,}"
- )
-
- print("\n" + "-" * 80)
- print("COLUMN STRUCTURE (Households stacked by CD)")
- print("-" * 80)
-
- # Build column ranges by CD
- col_ranges = []
- cumulative = 0
- for geo_id in self.geographic_ids:
- start_col = cumulative
- end_col = cumulative + self.n_households - 1
- col_ranges.append(
- {
- "cd_geoid": geo_id,
- "start_col": start_col,
- "end_col": end_col,
- "n_households": self.n_households,
- }
- )
- cumulative += self.n_households
-
- ranges_df = pd.DataFrame(col_ranges)
- print(f"\nShowing first and last 5 CDs of {len(ranges_df)} total:")
- print("\nFirst 5 CDs:")
- print(ranges_df.head(5).to_string(index=False))
- print("\nLast 5 CDs:")
- print(ranges_df.tail(5).to_string(index=False))
-
- print("\n" + "-" * 80)
- print("ROW STRUCTURE (Targets)")
- print("-" * 80)
-
- print(f"\nTotal targets: {len(self.row_catalog)}")
-
- # Summarize by geographic level if column exists
- if "geographic_level" in self.row_catalog.columns:
- print("\nTargets by geographic level:")
- geo_level_summary = (
- self.row_catalog.groupby("geographic_level")
- .size()
- .reset_index(name="n_targets")
- )
- print(geo_level_summary.to_string(index=False))
-
- print("\nTargets by domain variable:")
- domain_summary = (
- self.row_catalog.groupby("domain_variable")
- .agg({"row_index": "count", "variable": lambda x: len(set(x))})
- .rename(
- columns={"row_index": "n_targets", "variable": "n_unique_vars"}
- )
- )
- print(domain_summary.to_string())
-
- # Create and display target groups with row indices
- if show_groups:
- print("\n" + "-" * 80)
- print("TARGET GROUPS (for loss calculation)")
- print("-" * 80)
-
- target_groups, group_info = create_target_groups(self.targets_df)
-
- # Store for later use
- self.target_groups = target_groups
-
- # Print each group with row indices
- for group_id, info in enumerate(group_info):
- group_mask = target_groups == group_id
- row_indices = np.where(group_mask)[0]
-
- # Format row indices for display
- if len(row_indices) > 6:
- row_display = (
- f"[{row_indices[0]}, {row_indices[1]}, "
- f"{row_indices[2]}, ..., {row_indices[-2]}, "
- f"{row_indices[-1]}]"
- )
- else:
- row_display = str(row_indices.tolist())
-
- print(f" {info} - rows {row_display}")
-
- print("\n" + "=" * 80)
-
- def print_column_catalog(self, max_rows: int = 50):
- """Print a sample of the column catalog."""
- print(
- f"\nColumn Catalog (showing first {max_rows} of {len(self.column_catalog)}):"
- )
- print(self.column_catalog.head(max_rows).to_string(index=False))
-
- def print_row_catalog(self, max_rows: int = 50):
- """Print a sample of the row catalog."""
- print(
- f"\nRow Catalog (showing first {max_rows} of {len(self.row_catalog)}):"
- )
- print(self.row_catalog.head(max_rows).to_string(index=False))
-
- def get_group_rows(self, group_id: int) -> pd.DataFrame:
- """
- Get all rows belonging to a specific target group.
-
- Args:
- group_id: The group ID to filter by
-
- Returns:
- DataFrame of row catalog entries for this group
- """
- if not hasattr(self, "target_groups"):
- self.target_groups, self.group_info = create_target_groups(
- self.targets_df
- )
-
- group_mask = self.target_groups == group_id
- return self.row_catalog[group_mask].copy()
-
- def trace_household_targets(self, original_hh_id: int) -> pd.DataFrame:
- """
- Extract all target values for a household across all geographies.
-
- Args:
- original_hh_id: Original household ID to trace
-
- Returns:
- DataFrame with target details and values for this household
- """
- positions = self.get_household_column_positions(original_hh_id)
-
- results = []
-
- for target_idx, (_, target) in enumerate(self.targets_df.iterrows()):
- target_result = {
- "target_idx": target_idx,
- "variable": target["variable"],
- "target_value": target["value"],
- "geographic_id": target.get("geographic_id", "unknown"),
- "domain_variable": target.get("domain_variable", "unknown"),
- }
-
- # Extract values for this target across all geographies
- for geo_id, col_pos in positions.items():
- if col_pos < self.matrix.shape[1]:
- matrix_value = self.matrix[target_idx, col_pos]
- target_result[f"matrix_value_{geo_id}"] = matrix_value
- else:
- target_result[f"matrix_value_{geo_id}"] = np.nan
-
- results.append(target_result)
-
- return pd.DataFrame(results)
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py
deleted file mode 100644
index 74b2e2cee..000000000
--- a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py
+++ /dev/null
@@ -1,838 +0,0 @@
-"""
-Sparse matrix builder for geo-stacking calibration.
-
-Generic, database-driven approach where all constraints (including geographic)
-are evaluated as masks. Geographic constraints work because we SET state_fips
-before evaluating constraints.
-"""
-
-import logging
-from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
-import numpy as np
-import pandas as pd
-from scipy import sparse
-from sqlalchemy import create_engine, text
-
-logger = logging.getLogger(__name__)
-
-from policyengine_us_data.storage import STORAGE_FOLDER
-from policyengine_us_data.utils.census import STATE_NAME_TO_FIPS
-from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
- get_calculated_variables,
- apply_op,
- _get_geo_level,
-)
-
-
-class SparseMatrixBuilder:
- """Build sparse calibration matrices for geo-stacking."""
-
- def __init__(
- self,
- db_uri: str,
- time_period: int,
- cds_to_calibrate: List[str],
- dataset_path: Optional[str] = None,
- ):
- self.db_uri = db_uri
- self.engine = create_engine(db_uri)
- self.time_period = time_period
- self.cds_to_calibrate = cds_to_calibrate
- self.dataset_path = dataset_path
- self._entity_rel_cache = None
-
- def _build_entity_relationship(self, sim) -> pd.DataFrame:
- """
- Build entity relationship DataFrame mapping persons to all entity IDs.
-
- This is used to evaluate constraints at the person level and then
- aggregate to household level, handling variables defined at different
- entity levels (person, tax_unit, household, spm_unit).
-
- Returns:
- DataFrame with person_id, household_id, tax_unit_id, spm_unit_id
- """
- if self._entity_rel_cache is not None:
- return self._entity_rel_cache
-
- self._entity_rel_cache = pd.DataFrame(
- {
- "person_id": sim.calculate(
- "person_id", map_to="person"
- ).values,
- "household_id": sim.calculate(
- "household_id", map_to="person"
- ).values,
- "tax_unit_id": sim.calculate(
- "tax_unit_id", map_to="person"
- ).values,
- "spm_unit_id": sim.calculate(
- "spm_unit_id", map_to="person"
- ).values,
- }
- )
- return self._entity_rel_cache
-
- def _evaluate_constraints_entity_aware(
- self, state_sim, constraints: List[dict], n_households: int
- ) -> np.ndarray:
- """
- Evaluate non-geographic constraints at person level, aggregate to
- household level using .any().
-
- This properly handles constraints on variables defined at different
- entity levels (e.g., tax_unit_is_filer at tax_unit level). Instead of
- summing values at household level (which would give 2, 3, etc. for
- households with multiple tax units), we evaluate at person level and
- use .any() aggregation ("does this household have at least one person
- satisfying all constraints?").
-
- Args:
- state_sim: Microsimulation with state_fips set
- constraints: List of constraint dicts with variable, operation,
- value keys (geographic constraints should be pre-filtered)
- n_households: Number of households
-
- Returns:
- Boolean mask array of length n_households
- """
- if not constraints:
- return np.ones(n_households, dtype=bool)
-
- entity_rel = self._build_entity_relationship(state_sim)
- n_persons = len(entity_rel)
-
- person_mask = np.ones(n_persons, dtype=bool)
-
- for c in constraints:
- var = c["variable"]
- op = c["operation"]
- val = c["value"]
-
- # Calculate constraint variable at person level
- constraint_values = state_sim.calculate(
- var, self.time_period, map_to="person"
- ).values
-
- # Apply operation at person level
- person_mask &= apply_op(constraint_values, op, val)
-
- # Aggregate to household level using .any()
- # "At least one person in this household satisfies ALL constraints"
- entity_rel_with_mask = entity_rel.copy()
- entity_rel_with_mask["satisfies"] = person_mask
-
- household_mask_series = entity_rel_with_mask.groupby("household_id")[
- "satisfies"
- ].any()
-
- # Ensure we return a mask aligned with household order
- household_ids = state_sim.calculate(
- "household_id", map_to="household"
- ).values
- household_mask = np.array(
- [
- household_mask_series.get(hh_id, False)
- for hh_id in household_ids
- ]
- )
-
- return household_mask
-
- def _calculate_target_values_entity_aware(
- self,
- state_sim,
- target_variable: str,
- non_geo_constraints: List[dict],
- geo_mask: np.ndarray,
- n_households: int,
- ) -> np.ndarray:
- """
- Calculate target values at household level, handling count targets.
-
- For count targets (*_count): Count entities per household satisfying
- constraints
- For value targets: Sum values at household level (existing behavior)
-
- Args:
- state_sim: Microsimulation with state_fips set
- target_variable: The target variable name (e.g., "snap",
- "person_count")
- non_geo_constraints: List of constraint dicts (geographic
- constraints should be pre-filtered)
- geo_mask: Boolean mask array for geographic filtering (household
- level)
- n_households: Number of households
-
- Returns:
- Float array of target values at household level
- """
- is_count_target = target_variable.endswith("_count")
-
- if not is_count_target:
- # Value target: use existing entity-aware constraint evaluation
- entity_mask = self._evaluate_constraints_entity_aware(
- state_sim, non_geo_constraints, n_households
- )
- mask = geo_mask & entity_mask
-
- target_values = state_sim.calculate(
- target_variable, map_to="household"
- ).values
- return (target_values * mask).astype(np.float32)
-
- # Count target: need to count entities satisfying constraints
- entity_rel = self._build_entity_relationship(state_sim)
- n_persons = len(entity_rel)
-
- # Evaluate constraints at person level (don't aggregate to HH yet)
- person_mask = np.ones(n_persons, dtype=bool)
- for c in non_geo_constraints:
- constraint_values = state_sim.calculate(
- c["variable"], map_to="person"
- ).values
- person_mask &= apply_op(
- constraint_values, c["operation"], c["value"]
- )
-
- # Get target entity from variable definition
- target_entity = state_sim.tax_benefit_system.variables[
- target_variable
- ].entity.key
-
- household_ids = state_sim.calculate(
- "household_id", map_to="household"
- ).values
- geo_mask_map = dict(zip(household_ids, geo_mask))
-
- if target_entity == "household":
- # household_count: 1 per qualifying household
- if non_geo_constraints:
- entity_mask = self._evaluate_constraints_entity_aware(
- state_sim, non_geo_constraints, n_households
- )
- return (geo_mask & entity_mask).astype(np.float32)
- return geo_mask.astype(np.float32)
-
- if target_entity == "person":
- # Count persons satisfying constraints per household
- entity_rel["satisfies"] = person_mask
- entity_rel["geo_ok"] = entity_rel["household_id"].map(geo_mask_map)
- filtered = entity_rel[
- entity_rel["satisfies"] & entity_rel["geo_ok"]
- ]
- counts = filtered.groupby("household_id")["person_id"].nunique()
- else:
- # For tax_unit, spm_unit: aggregate person mask to entity, then
- # count
- entity_id_col = f"{target_entity}_id"
- entity_rel["satisfies"] = person_mask
- entity_satisfies = entity_rel.groupby(entity_id_col)[
- "satisfies"
- ].any()
-
- entity_rel_unique = entity_rel[
- ["household_id", entity_id_col]
- ].drop_duplicates()
- entity_rel_unique["entity_ok"] = entity_rel_unique[
- entity_id_col
- ].map(entity_satisfies)
- entity_rel_unique["geo_ok"] = entity_rel_unique[
- "household_id"
- ].map(geo_mask_map)
- filtered = entity_rel_unique[
- entity_rel_unique["entity_ok"] & entity_rel_unique["geo_ok"]
- ]
- counts = filtered.groupby("household_id")[entity_id_col].nunique()
-
- # Build result aligned with household order
- return np.array(
- [counts.get(hh_id, 0) for hh_id in household_ids], dtype=np.float32
- )
-
- def _query_targets(self, target_filter: dict) -> pd.DataFrame:
- """Query targets via target_overview view.
-
- Best period: most recent period <= self.time_period, or closest
- future period if none exists.
-
- Returns DataFrame with geo_level, geographic_id, and
- domain_variable columns.
-
- Supports filters: domain_variables, variables, target_ids,
- stratum_ids.
- """
- or_conditions = []
-
- if "domain_variables" in target_filter:
- dvs = target_filter["domain_variables"]
- placeholders = ",".join(f"'{dv}'" for dv in dvs)
- or_conditions.append(f"tv.domain_variable IN ({placeholders})")
-
- if "variables" in target_filter:
- vars_str = ",".join(f"'{v}'" for v in target_filter["variables"])
- or_conditions.append(f"tv.variable IN ({vars_str})")
-
- if "target_ids" in target_filter:
- ids = ",".join(map(str, target_filter["target_ids"]))
- or_conditions.append(f"tv.target_id IN ({ids})")
-
- if "stratum_ids" in target_filter:
- ids = ",".join(map(str, target_filter["stratum_ids"]))
- or_conditions.append(f"tv.stratum_id IN ({ids})")
-
- if not or_conditions:
- where_clause = "1=1"
- else:
- where_clause = " OR ".join(f"({c})" for c in or_conditions)
-
- query = f"""
- WITH filtered_targets AS (
- SELECT tv.target_id, tv.stratum_id, tv.variable, tv.value,
- tv.period, tv.geo_level, tv.geographic_id,
- tv.domain_variable
- FROM target_overview tv
- WHERE {where_clause}
- ),
- best_periods AS (
- SELECT stratum_id, variable,
- CASE
- WHEN MAX(CASE WHEN period <= :time_period
- THEN period END) IS NOT NULL
- THEN MAX(CASE WHEN period <= :time_period
- THEN period END)
- ELSE MIN(period)
- END as best_period
- FROM filtered_targets
- GROUP BY stratum_id, variable
- )
- SELECT ft.*
- FROM filtered_targets ft
- JOIN best_periods bp
- ON ft.stratum_id = bp.stratum_id
- AND ft.variable = bp.variable
- AND ft.period = bp.best_period
- ORDER BY ft.target_id
- """
-
- with self.engine.connect() as conn:
- return pd.read_sql(
- query, conn, params={"time_period": self.time_period}
- )
-
- def _get_constraints(self, stratum_id: int) -> List[dict]:
- """Get all constraints for a stratum (including geographic)."""
- query = """
- SELECT constraint_variable as variable, operation, value
- FROM stratum_constraints
- WHERE stratum_id = :stratum_id
- """
- with self.engine.connect() as conn:
- df = pd.read_sql(query, conn, params={"stratum_id": stratum_id})
- return df.to_dict("records")
-
- def _get_geographic_id(self, stratum_id: int) -> str:
- """Extract geographic_id from constraints for targets_df."""
- constraints = self._get_constraints(stratum_id)
- for c in constraints:
- if c["variable"] == "state_fips":
- return c["value"]
- if c["variable"] == "congressional_district_geoid":
- return c["value"]
- return "US"
-
- def _calculate_uprating_factors(self, params) -> dict:
- """Calculate CPI and population uprating factors for all periods."""
- factors = {}
-
- query = "SELECT DISTINCT period FROM targets WHERE period IS NOT NULL ORDER BY period"
- with self.engine.connect() as conn:
- result = conn.execute(text(query))
- years_needed = [row[0] for row in result]
-
- logger.info(
- f"Calculating uprating factors for years "
- f"{years_needed} to {self.time_period}"
- )
-
- for from_year in years_needed:
- if from_year == self.time_period:
- factors[(from_year, "cpi")] = 1.0
- factors[(from_year, "pop")] = 1.0
- continue
-
- try:
- cpi_from = params.gov.bls.cpi.cpi_u(from_year)
- cpi_to = params.gov.bls.cpi.cpi_u(self.time_period)
- factors[(from_year, "cpi")] = float(cpi_to / cpi_from)
- except Exception as e:
- logger.warning(
- f"Could not calculate CPI factor for " f"{from_year}: {e}"
- )
- factors[(from_year, "cpi")] = 1.0
-
- try:
- pop_from = params.calibration.gov.census.populations.total(
- from_year
- )
- pop_to = params.calibration.gov.census.populations.total(
- self.time_period
- )
- factors[(from_year, "pop")] = float(pop_to / pop_from)
- except Exception as e:
- logger.warning(
- f"Could not calculate population factor for "
- f"{from_year}: {e}"
- )
- factors[(from_year, "pop")] = 1.0
-
- for (year, type_), factor in sorted(factors.items()):
- if factor != 1.0:
- logger.info(
- f" {year} -> {self.time_period} "
- f"({type_}): {factor:.4f}"
- )
-
- return factors
-
- def _get_uprating_info(
- self,
- variable: str,
- period: int,
- factors: dict,
- ) -> Tuple[float, str]:
- """Get uprating factor and type for a variable at a given period."""
- if period == self.time_period:
- return 1.0, "none"
-
- count_indicators = [
- "count",
- "person",
- "people",
- "households",
- "tax_units",
- ]
- is_count = any(ind in variable.lower() for ind in count_indicators)
- uprating_type = "pop" if is_count else "cpi"
-
- factor = factors.get((period, uprating_type), 1.0)
- return factor, uprating_type
-
- def _load_aca_ptc_factors(self) -> Dict[int, Dict[str, float]]:
- """Load state-level ACA PTC uprating factors from CSV.
-
- Returns:
- {state_fips_int: {"tax_unit_count": vol_mult,
- "aca_ptc": vol_mult * val_mult}}
- """
- csv_path = STORAGE_FOLDER / "aca_ptc_multipliers_2022_2024.csv"
- df = pd.read_csv(csv_path)
- result = {}
- for _, row in df.iterrows():
- fips_str = STATE_NAME_TO_FIPS.get(row["state"])
- if fips_str is None:
- continue
- fips_int = int(fips_str)
- result[fips_int] = {
- "tax_unit_count": row["vol_mult"],
- "aca_ptc": row["vol_mult"] * row["val_mult"],
- }
- return result
-
- def _get_state_uprating_factors(
- self,
- domain: str,
- targets_df: pd.DataFrame,
- national_factors: dict,
- ) -> Dict[int, Dict[str, float]]:
- """Get per-state uprating factors for a hierarchical domain.
-
- For aca_ptc: loads real state-level enrollment/APTC factors
- from CSV. For other domains: returns uniform national CPI/pop
- factors.
-
- Returns:
- {state_fips: {variable: factor}} for each state in the
- domain's state-level targets.
- """
- state_rows = targets_df[
- (targets_df["domain_variable"] == domain)
- & (targets_df["geo_level"] == "state")
- ]
- state_fips_list = state_rows["geographic_id"].unique()
- variables = state_rows["variable"].unique()
-
- if domain == "aca_ptc":
- csv_factors = self._load_aca_ptc_factors()
- logger.info(
- f" [{domain}] Using CSV state-level factors "
- f"({len(csv_factors)} states)"
- )
- else:
- csv_factors = None
- logger.info(f" [{domain}] Using national CPI/pop factors")
-
- result = {}
- n_csv = 0
- n_fallback = 0
- for sf in state_fips_list:
- state_int = int(sf)
- var_factors = {}
-
- if csv_factors and state_int in csv_factors:
- n_csv += 1
- for var in variables:
- var_factors[var] = csv_factors[state_int].get(var, 1.0)
- else:
- n_fallback += 1
- for var in variables:
- row = state_rows[
- (state_rows["geographic_id"] == sf)
- & (state_rows["variable"] == var)
- ]
- if row.empty:
- var_factors[var] = 1.0
- continue
- period = row.iloc[0]["period"]
- factor, _ = self._get_uprating_info(
- var, period, national_factors
- )
- var_factors[var] = factor
-
- result[state_int] = var_factors
-
- if csv_factors:
- all_factors = [f for vf in result.values() for f in vf.values()]
- logger.info(
- f" {n_csv} states from CSV, "
- f"{n_fallback} national fallback"
- )
- for var in variables:
- vf = [result[s][var] for s in result]
- logger.info(f" {var}: [{min(vf):.4f}, {max(vf):.4f}]")
-
- return result
-
- def _apply_hierarchical_uprating(
- self,
- targets_df: pd.DataFrame,
- hierarchical_domains: List[str],
- national_factors: dict,
- ) -> pd.DataFrame:
- """Apply state-level uprating and reconcile CDs to state totals.
-
- Two separable factors per CD row:
- - hif (hierarchy inconsistency factor): base-year correction
- so that sum(CDs) == state total in the source data.
- hif = state_original / sum(cd_originals). Pure geometry,
- no time dimension.
- - uprating_factor: state-specific (or national fallback)
- scaling from base year to target year. Pure time, no
- geography correction.
-
- Final CD value = original_value * hif * uprating_factor.
-
- Also drops national/state rows used for reconciliation
- (keeps rows like CMS person_count at period == time_period).
- """
- df = targets_df.copy()
- df["hif"] = np.nan
- df["state_uprating_factor"] = np.nan
-
- rows_to_drop = []
-
- for domain in hierarchical_domains:
- domain_mask = df["domain_variable"] == domain
-
- state_factors = self._get_state_uprating_factors(
- domain, df, national_factors
- )
-
- state_mask = domain_mask & (df["geo_level"] == "state")
- district_mask = domain_mask & (df["geo_level"] == "district")
-
- for sf, var_factors in state_factors.items():
- for var, uf in var_factors.items():
- state_row = df[
- state_mask
- & (df["geographic_id"] == str(sf))
- & (df["variable"] == var)
- ]
- if state_row.empty:
- continue
- state_original = state_row.iloc[0]["original_value"]
-
- def _cd_in_state(g, s=sf):
- try:
- return int(g) // 100 == s
- except (ValueError, TypeError):
- return False
-
- cd_mask = (
- district_mask
- & (df["variable"] == var)
- & df["geographic_id"].apply(_cd_in_state)
- )
- cd_rows = df[cd_mask]
- if cd_rows.empty:
- continue
-
- cd_original_sum = cd_rows["original_value"].sum()
- if cd_original_sum == 0:
- continue
-
- hif = state_original / cd_original_sum
-
- for cd_idx in cd_rows.index:
- df.at[cd_idx, "hif"] = hif
- df.at[cd_idx, "state_uprating_factor"] = uf
- df.at[cd_idx, "value"] = (
- df.at[cd_idx, "original_value"] * hif * uf
- )
-
- # Log HIF and UF summary for this domain
- cd_domain = df[district_mask & df["hif"].notna()]
- if not cd_domain.empty:
- for var in cd_domain["variable"].unique():
- vrows = cd_domain[cd_domain["variable"] == var]
- hifs = vrows["hif"]
- ufs = vrows["state_uprating_factor"]
- logger.info(
- f" [{domain}] {var}: "
- f"{len(vrows)} CDs, "
- f"HIF=[{hifs.min():.4f}, {hifs.max():.4f}], "
- f"UF=[{ufs.min():.4f}, {ufs.max():.4f}]"
- )
-
- # Drop national/state rows used for reconciliation
- # Keep rows like CMS person_count (period == time_period)
- national_mask = domain_mask & (df["geo_level"] == "national")
- for idx in df[national_mask | state_mask].index:
- row = df.loc[idx]
- if row["period"] != self.time_period:
- rows_to_drop.append(idx)
-
- if rows_to_drop:
- dropped = df.loc[rows_to_drop]
- logger.info(
- f"Hierarchical uprating: dropping "
- f"{len(rows_to_drop)} national/state rows "
- f"(used only for reconciliation)"
- )
- for domain in hierarchical_domains:
- d = dropped[dropped["domain_variable"] == domain]
- if d.empty:
- continue
- by_level = d["geo_level"].value_counts().to_dict()
- parts = [f"{n} {lvl}" for lvl, n in sorted(by_level.items())]
- logger.info(f" {domain}: {', '.join(parts)}")
- df = df.drop(index=rows_to_drop).reset_index(drop=True)
-
- df["target_period"] = self.time_period
-
- return df
-
- def print_uprating_summary(self, targets_df: pd.DataFrame) -> None:
- """Print summary of uprating applied to targets."""
- has_state_uf = "state_uprating_factor" in targets_df.columns
-
- # Effective factor: use state_uprating_factor where set,
- # otherwise fall back to uprating_factor
- if has_state_uf:
- eff = targets_df["state_uprating_factor"].fillna(
- targets_df["uprating_factor"]
- )
- else:
- eff = targets_df["uprating_factor"]
-
- uprated = targets_df[eff != 1.0]
- if len(uprated) == 0:
- print("No targets were uprated.")
- return
-
- print("\n" + "=" * 60)
- print("UPRATING SUMMARY")
- print("=" * 60)
- print(f"Uprated {len(uprated)} of {len(targets_df)} targets")
-
- period_counts = uprated["period"].value_counts().sort_index()
- for period, count in period_counts.items():
- print(f" Period {period}: {count} targets")
-
- factors = eff[eff != 1.0]
- print(
- f" Factor range: [{factors.min():.4f}, " f"{factors.max():.4f}]"
- )
-
- def _create_state_sim(self, state: int, n_households: int):
- """Create a fresh simulation with state_fips set to given state."""
- from policyengine_us import Microsimulation
-
- state_sim = Microsimulation(dataset=self.dataset_path)
- state_sim.set_input(
- "state_fips",
- self.time_period,
- np.full(n_households, state, dtype=np.int32),
- )
- for var in get_calculated_variables(state_sim):
- state_sim.delete_arrays(var)
- return state_sim
-
- def build_matrix(
- self,
- sim,
- target_filter: dict,
- hierarchical_domains: Optional[List[str]] = None,
- ) -> Tuple[pd.DataFrame, sparse.csr_matrix, Dict[str, List[str]]]:
- """
- Build sparse calibration matrix.
-
- Args:
- sim: Microsimulation instance (used for household_ids, or
- as template)
- target_filter: Dict specifying which targets to include
- - {"domain_variables": ["aca_ptc"]} via target_overview
- - {"target_ids": [123, 456]} for specific targets
- - an empty dict {} will fetch all targets
- hierarchical_domains: Optional list of domain_variable
- names for state-level uprating + CD reconciliation.
- Requires domain_variables in target_filter.
-
- Returns:
- Tuple of (targets_df, X_sparse, household_id_mapping)
- """
- household_ids = sim.calculate(
- "household_id", map_to="household"
- ).values
- n_households = len(household_ids)
- n_cds = len(self.cds_to_calibrate)
- n_cols = n_households * n_cds
-
- targets_df = self._query_targets(target_filter)
-
- if len(targets_df) == 0:
- raise ValueError("No targets found matching filter")
-
- # Uprate targets from their original period to self.time_period
- params = sim.tax_benefit_system.parameters
- uprating_factors = self._calculate_uprating_factors(params)
- targets_df["original_value"] = targets_df["value"].copy()
- targets_df["uprating_factor"] = targets_df.apply(
- lambda row: self._get_uprating_info(
- row["variable"], row["period"], uprating_factors
- )[0],
- axis=1,
- )
- targets_df["value"] = (
- targets_df["original_value"] * targets_df["uprating_factor"]
- )
-
- # Hierarchical uprating: state-level uprating + CD reconciliation
- if hierarchical_domains:
- targets_df = self._apply_hierarchical_uprating(
- targets_df, hierarchical_domains, uprating_factors
- )
-
- n_targets = len(targets_df)
-
- # Sort by (geo_level, variable, geographic_id) for contiguous group
- targets_df["_geo_level"] = targets_df["geographic_id"].apply(
- _get_geo_level
- )
- targets_df = targets_df.sort_values(
- ["_geo_level", "variable", "geographic_id"]
- )
- targets_df = targets_df.drop(columns=["_geo_level"]).reset_index(
- drop=True
- )
-
- X = sparse.lil_matrix((n_targets, n_cols), dtype=np.float32)
-
- # Group CDs by state. CD GEOIDs follow format SSCCC where SS is state
- # FIPS (2 digits) and CCC is CD number (2-3 digits), so state = CD // 100
- cds_by_state = defaultdict(list)
- for cd_idx, cd in enumerate(self.cds_to_calibrate):
- state = int(cd) // 100
- cds_by_state[state].append((cd_idx, cd))
-
- for state, cd_list in cds_by_state.items():
- # Clear entity relationship cache when creating new simulation
- self._entity_rel_cache = None
-
- if self.dataset_path:
- state_sim = self._create_state_sim(state, n_households)
- else:
- state_sim = sim
- state_sim.set_input(
- "state_fips",
- self.time_period,
- np.full(n_households, state, dtype=np.int32),
- )
- for var in get_calculated_variables(state_sim):
- state_sim.delete_arrays(var)
-
- for cd_idx, cd in cd_list:
- col_start = cd_idx * n_households
-
- for row_idx, (_, target) in enumerate(targets_df.iterrows()):
- constraints = self._get_constraints(target["stratum_id"])
-
- geo_constraints = []
- non_geo_constraints = []
- for c in constraints:
- if c["variable"] in (
- "state_fips",
- "congressional_district_geoid",
- ):
- geo_constraints.append(c)
- else:
- non_geo_constraints.append(c)
-
- # Check geographic constraints first (quick fail)
- geo_mask = np.ones(n_households, dtype=bool)
- for c in geo_constraints:
- if c["variable"] == "congressional_district_geoid":
- if (
- c["operation"] in ("==", "=")
- and c["value"] != cd
- ):
- geo_mask[:] = False
- elif c["variable"] == "state_fips":
- if (
- c["operation"] in ("==", "=")
- and int(c["value"]) != state
- ):
- geo_mask[:] = False
-
- if not geo_mask.any():
- continue
-
- # Calculate target values with entity-aware handling
- # This properly handles count targets (*_count) by counting
- # entities rather than summing values
- masked_values = self._calculate_target_values_entity_aware(
- state_sim,
- target["variable"],
- non_geo_constraints,
- geo_mask,
- n_households,
- )
-
- if not masked_values.any():
- continue
-
- nonzero = np.where(masked_values != 0)[0]
- if len(nonzero) > 0:
- X[row_idx, col_start + nonzero] = masked_values[
- nonzero
- ]
-
- household_id_mapping = {}
- for cd in self.cds_to_calibrate:
- key = f"cd{cd}"
- household_id_mapping[key] = [
- f"{hh_id}_{key}" for hh_id in household_ids
- ]
-
- return targets_df, X.tocsr(), household_id_mapping
diff --git a/policyengine_us_data/parameters/take_up/voluntary_filing.yaml b/policyengine_us_data/parameters/take_up/voluntary_filing.yaml
new file mode 100644
index 000000000..46d23e504
--- /dev/null
+++ b/policyengine_us_data/parameters/take_up/voluntary_filing.yaml
@@ -0,0 +1,6 @@
+description: Share of tax units, among those not taking up the EITC, that file taxes voluntarily.
+metadata:
+ label: Voluntary filing rate
+ unit: /1
+values:
+ 2018-01-01: 0.05
diff --git a/policyengine_us_data/tests/test_calibration/__init__.py b/policyengine_us_data/tests/test_calibration/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py b/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py
new file mode 100644
index 000000000..8db56ddcb
--- /dev/null
+++ b/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py
@@ -0,0 +1,207 @@
+"""Integration test for build_matrix geographic masking.
+
+Traces one household through the matrix with 2 clones, verifying:
+- National targets: both clones can contribute (non-zero)
+- State targets: only the clone assigned to that state contributes
+- CD targets: only the clone assigned to that CD contributes;
+ a different CD in the same state gets zero
+"""
+
+import os
+
+import numpy as np
+import pytest
+from scipy import sparse
+
+from policyengine_us_data.storage import STORAGE_FOLDER
+
+DATASET_PATH = str(STORAGE_FOLDER / "stratified_extended_cps_2024.h5")
+DB_PATH = str(STORAGE_FOLDER / "calibration" / "policy_data.db")
+DB_URI = f"sqlite:///{DB_PATH}"
+
+N_CLONES = 2
+SEED = 42
+RECORD_IDX = 8629 # High SNAP ($18k), lands in TX/PA with seed=42
+
+
+def _data_available():
+ return os.path.exists(DATASET_PATH) and os.path.exists(DB_PATH)
+
+
+@pytest.fixture(scope="module")
+def matrix_result():
+ if not _data_available():
+ pytest.skip("Calibration data not available")
+
+ from policyengine_us import Microsimulation
+ from policyengine_us_data.calibration.clone_and_assign import (
+ assign_random_geography,
+ )
+ from policyengine_us_data.calibration.unified_matrix_builder import (
+ UnifiedMatrixBuilder,
+ )
+
+ sim = Microsimulation(dataset=DATASET_PATH)
+ n_records = sim.calculate("household_id").values.shape[0]
+ geography = assign_random_geography(
+ n_records, n_clones=N_CLONES, seed=SEED
+ )
+ builder = UnifiedMatrixBuilder(
+ db_uri=DB_URI,
+ time_period=2024,
+ dataset_path=DATASET_PATH,
+ )
+ targets_df, X_sparse, target_names = builder.build_matrix(
+ geography=geography,
+ sim=sim,
+ target_filter={"domain_variables": ["snap", "medicaid"]},
+ )
+ return {
+ "geography": geography,
+ "targets_df": targets_df,
+ "X": X_sparse,
+ "target_names": target_names,
+ "n_records": n_records,
+ }
+
+
+def _clone_col(n_records, clone_idx, record_idx):
+ return clone_idx * n_records + record_idx
+
+
+class TestMatrixShape:
+ def test_columns_equal_clones_times_records(self, matrix_result):
+ X = matrix_result["X"]
+ n_records = matrix_result["n_records"]
+ assert X.shape[1] == N_CLONES * n_records
+
+ def test_rows_equal_targets(self, matrix_result):
+ X = matrix_result["X"]
+ assert X.shape[0] == len(matrix_result["targets_df"])
+
+ def test_matrix_is_sparse(self, matrix_result):
+ X = matrix_result["X"]
+ density = X.nnz / (X.shape[0] * X.shape[1])
+ assert density < 0.1
+
+
+class TestNationalMasking:
+ def test_both_clones_visible_to_national_target(self, matrix_result):
+ X = matrix_result["X"]
+ targets_df = matrix_result["targets_df"]
+ n_records = matrix_result["n_records"]
+
+ national_rows = targets_df[targets_df["geo_level"] == "national"].index
+ assert len(national_rows) > 0
+
+ col_0 = _clone_col(n_records, 0, RECORD_IDX)
+ col_1 = _clone_col(n_records, 1, RECORD_IDX)
+ X_csc = X.tocsc()
+
+ visible_0 = X_csc[:, col_0].toarray().ravel()
+ visible_1 = X_csc[:, col_1].toarray().ravel()
+
+ for row_idx in national_rows:
+ if visible_0[row_idx] != 0 or visible_1[row_idx] != 0:
+ return
+ pytest.fail(
+ "Household has zero value for all national targets "
+ "in both clones — cannot verify masking"
+ )
+
+
+class TestStateMasking:
+ def test_clone_visible_only_to_own_state(self, matrix_result):
+ X = matrix_result["X"]
+ targets_df = matrix_result["targets_df"]
+ geography = matrix_result["geography"]
+ n_records = matrix_result["n_records"]
+
+ col_0 = _clone_col(n_records, 0, RECORD_IDX)
+ col_1 = _clone_col(n_records, 1, RECORD_IDX)
+ state_0 = str(int(geography.state_fips[col_0]))
+ state_1 = str(int(geography.state_fips[col_1]))
+
+ if state_0 == state_1:
+ pytest.skip(
+ "Both clones landed in the same state — "
+ "cannot test cross-state masking"
+ )
+
+ state_targets = targets_df[targets_df["geo_level"] == "state"]
+ X_csc = X.tocsc()
+ vals_0 = X_csc[:, col_0].toarray().ravel()
+ vals_1 = X_csc[:, col_1].toarray().ravel()
+
+ for _, row in state_targets.iterrows():
+ row_idx = row.name
+            # Normalize so "01" and "1" compare equal.
+            geo_id = str(int(row["geographic_id"]))
+ if geo_id == state_0:
+ assert vals_1[row_idx] == 0, (
+ f"Clone 1 (state {state_1}) should be zero "
+ f"for state {state_0} target row {row_idx}"
+ )
+ elif geo_id == state_1:
+ assert vals_0[row_idx] == 0, (
+ f"Clone 0 (state {state_0}) should be zero "
+ f"for state {state_1} target row {row_idx}"
+ )
+
+
+class TestDistrictMasking:
+ def test_clone_visible_only_to_own_cd(self, matrix_result):
+ X = matrix_result["X"]
+ targets_df = matrix_result["targets_df"]
+ geography = matrix_result["geography"]
+ n_records = matrix_result["n_records"]
+
+ col_0 = _clone_col(n_records, 0, RECORD_IDX)
+ cd_0 = str(geography.cd_geoid[col_0])
+ state_0 = str(int(geography.state_fips[col_0]))
+
+ district_targets = targets_df[targets_df["geo_level"] == "district"]
+ X_csc = X.tocsc()
+ vals_0 = X_csc[:, col_0].toarray().ravel()
+
+        same_state_other_cd = district_targets[
+            (
+                district_targets["geographic_id"].apply(
+                    # CD GEOIDs encode the state as int(cd) // 100; a
+                    # string-prefix test can collide across states
+                    # (e.g. "1" matches both "101" and "1001").
+                    lambda g: int(g) // 100 == int(state_0)
+                )
+            )
+            & (district_targets["geographic_id"] != cd_0)
+        ]
+
+ for _, row in same_state_other_cd.iterrows():
+ row_idx = row.name
+ assert vals_0[row_idx] == 0, (
+ f"Clone 0 (CD {cd_0}) should be zero for "
+ f"CD {row['geographic_id']} target row {row_idx}"
+ )
+
+ def test_clone_nonzero_for_own_cd(self, matrix_result):
+ X = matrix_result["X"]
+ targets_df = matrix_result["targets_df"]
+ geography = matrix_result["geography"]
+ n_records = matrix_result["n_records"]
+
+ col_0 = _clone_col(n_records, 0, RECORD_IDX)
+ cd_0 = str(geography.cd_geoid[col_0])
+
+ own_cd_targets = targets_df[
+ (targets_df["geo_level"] == "district")
+ & (targets_df["geographic_id"] == cd_0)
+ ]
+ if len(own_cd_targets) == 0:
+ pytest.skip(f"No district targets for CD {cd_0}")
+
+ X_csc = X.tocsc()
+ vals_0 = X_csc[:, col_0].toarray().ravel()
+
+ any_nonzero = any(
+ vals_0[row.name] != 0 for _, row in own_cd_targets.iterrows()
+ )
+ assert any_nonzero, (
+ f"Clone 0 should have at least one non-zero entry "
+ f"for its own CD {cd_0}"
+ )
diff --git a/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py b/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py
new file mode 100644
index 000000000..0ba330549
--- /dev/null
+++ b/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py
@@ -0,0 +1,189 @@
+"""Tests for clone_and_assign module.
+
+Uses mock CSV data so tests don't require the real
+block_cd_distributions.csv.gz file.
+"""
+
+import numpy as np
+import pandas as pd
+import pytest
+from unittest.mock import patch
+
+from policyengine_us_data.calibration.clone_and_assign import (
+ GeographyAssignment,
+ load_global_block_distribution,
+ assign_random_geography,
+ double_geography_for_puf,
+)
+
+MOCK_BLOCKS = pd.DataFrame(
+ {
+ "cd_geoid": [101, 101, 101, 102, 102, 103, 103, 103, 103],
+ "block_geoid": [
+ "010010001001001",
+ "010010001001002",
+ "010010001001003",
+ "020010001001001",
+ "020010001001002",
+ "360100001001001",
+ "360100001001002",
+ "360100001001003",
+ "360100001001004",
+ ],
+ "probability": [
+ 0.4,
+ 0.3,
+ 0.3,
+ 0.6,
+ 0.4,
+ 0.25,
+ 0.25,
+ 0.25,
+ 0.25,
+ ],
+ }
+)
+
+
+@pytest.fixture(autouse=True)
+def _clear_lru_cache():
+ load_global_block_distribution.cache_clear()
+ yield
+ load_global_block_distribution.cache_clear()
+
+
+def _mock_distribution():
+ blocks = MOCK_BLOCKS["block_geoid"].values
+ cds = MOCK_BLOCKS["cd_geoid"].astype(str).values
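+    # State FIPS is the first two digits of the 15-digit block GEOID.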
+ states = np.array([int(b[:2]) for b in blocks])
+ probs = MOCK_BLOCKS["probability"].values.astype(np.float64)
+ probs = probs / probs.sum()
+ return blocks, cds, states, probs
+
+
+class TestLoadGlobalBlockDistribution:
+ def test_loads_and_normalizes(self, tmp_path):
+ csv_path = tmp_path / "block_cd_distributions.csv.gz"
+ MOCK_BLOCKS.to_csv(csv_path, index=False, compression="gzip")
+ with patch(
+ "policyengine_us_data.calibration"
+ ".clone_and_assign.STORAGE_FOLDER",
+ tmp_path,
+ ):
+ blocks, cds, states, probs = (
+ load_global_block_distribution.__wrapped__()
+ )
+ assert len(blocks) == 9
+ np.testing.assert_almost_equal(probs.sum(), 1.0)
+
+ def test_state_fips_extracted(self, tmp_path):
+ csv_path = tmp_path / "block_cd_distributions.csv.gz"
+ MOCK_BLOCKS.to_csv(csv_path, index=False, compression="gzip")
+ with patch(
+ "policyengine_us_data.calibration"
+ ".clone_and_assign.STORAGE_FOLDER",
+ tmp_path,
+ ):
+ _, _, states, _ = load_global_block_distribution.__wrapped__()
+ assert states[0] == 1
+ assert states[3] == 2
+ assert states[5] == 36
+
+
+class TestAssignRandomGeography:
+ @patch(
+ "policyengine_us_data.calibration.clone_and_assign"
+ ".load_global_block_distribution"
+ )
+ def test_shape(self, mock_load):
+ mock_load.return_value = _mock_distribution()
+ r = assign_random_geography(n_records=10, n_clones=3, seed=42)
+ assert len(r.block_geoid) == 30
+ assert r.n_records == 10
+ assert r.n_clones == 3
+
+ @patch(
+ "policyengine_us_data.calibration.clone_and_assign"
+ ".load_global_block_distribution"
+ )
+ def test_deterministic(self, mock_load):
+ mock_load.return_value = _mock_distribution()
+ r1 = assign_random_geography(n_records=10, n_clones=3, seed=99)
+ r2 = assign_random_geography(n_records=10, n_clones=3, seed=99)
+ np.testing.assert_array_equal(r1.block_geoid, r2.block_geoid)
+
+ @patch(
+ "policyengine_us_data.calibration.clone_and_assign"
+ ".load_global_block_distribution"
+ )
+ def test_different_seeds_differ(self, mock_load):
+ mock_load.return_value = _mock_distribution()
+ r1 = assign_random_geography(n_records=100, n_clones=3, seed=1)
+ r2 = assign_random_geography(n_records=100, n_clones=3, seed=2)
+ assert not np.array_equal(r1.block_geoid, r2.block_geoid)
+
+ @patch(
+ "policyengine_us_data.calibration.clone_and_assign"
+ ".load_global_block_distribution"
+ )
+ def test_state_from_block(self, mock_load):
+ mock_load.return_value = _mock_distribution()
+ r = assign_random_geography(n_records=20, n_clones=5, seed=42)
+ for i in range(len(r.block_geoid)):
+ expected = int(r.block_geoid[i][:2])
+ assert r.state_fips[i] == expected
+
+ def test_missing_file_raises(self, tmp_path):
+ fake = tmp_path / "nonexistent"
+ fake.mkdir()
+ with patch(
+ "policyengine_us_data.calibration"
+ ".clone_and_assign.STORAGE_FOLDER",
+ fake,
+ ):
+ with pytest.raises(FileNotFoundError):
+ load_global_block_distribution.__wrapped__()
+
+
+class TestDoubleGeographyForPuf:
+ def test_doubles_n_records(self):
+ geo = GeographyAssignment(
+ block_geoid=np.array(["010010001001001", "020010001001001"] * 3),
+ cd_geoid=np.array(["101", "202"] * 3),
+ state_fips=np.array([1, 2] * 3),
+ n_records=2,
+ n_clones=3,
+ )
+ r = double_geography_for_puf(geo)
+ assert r.n_records == 4
+ assert r.n_clones == 3
+ assert len(r.block_geoid) == 12
+
+ def test_puf_half_matches_cps_half(self):
+ geo = GeographyAssignment(
+ block_geoid=np.array(
+ [
+ "010010001001001",
+ "020010001001001",
+ "360100001001001",
+ "060100001001001",
+ "480100001001001",
+ "120100001001001",
+ ]
+ ),
+ cd_geoid=np.array(["101", "202", "1036", "653", "4831", "1227"]),
+ state_fips=np.array([1, 2, 36, 6, 48, 12]),
+ n_records=3,
+ n_clones=2,
+ )
+ r = double_geography_for_puf(geo)
+ n_new = r.n_records
+
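+        # Within each clone block, the appended PUF half should mirror the
+        # CPS half's geography, so the two halves match state by state.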
+ for c in range(r.n_clones):
+ start = c * n_new
+ mid = start + n_new // 2
+ end = start + n_new
+ np.testing.assert_array_equal(
+ r.state_fips[start:mid],
+ r.state_fips[mid:end],
+ )
diff --git a/policyengine_us_data/tests/test_calibration/test_drop_target_groups.py b/policyengine_us_data/tests/test_calibration/test_drop_target_groups.py
new file mode 100644
index 000000000..daade621d
--- /dev/null
+++ b/policyengine_us_data/tests/test_calibration/test_drop_target_groups.py
@@ -0,0 +1,142 @@
+"""Tests for drop_target_groups in calibration_utils."""
+
+import numpy as np
+import pandas as pd
+import pytest
+from scipy import sparse
+
+from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
+ drop_target_groups,
+ create_target_groups,
+)
+
+
+@pytest.fixture
+def sample_data():
+ targets_df = pd.DataFrame(
+ {
+ "variable": [
+ "snap",
+ "snap",
+ "snap",
+ "household_count",
+ "household_count",
+ ],
+ "domain_variable": [
+ "snap",
+ "snap",
+ "snap",
+ "snap",
+ "snap",
+ ],
+ "geographic_id": ["US", "6", "37", "6", "37"],
+ "value": [1000, 500, 300, 200, 100],
+ }
+ )
+ n_rows = len(targets_df)
+ n_cols = 10
+ rng = np.random.default_rng(42)
+ X = sparse.random(n_rows, n_cols, density=0.5, random_state=rng)
+ X = X.tocsr()
+ target_groups, group_info = create_target_groups(targets_df)
+ return targets_df, X, target_groups, group_info
+
+
+class TestDropTargetGroups:
+ def test_drops_matching_group(self, sample_data):
+ targets_df, X, target_groups, group_info = sample_data
+ n_before = len(targets_df)
+ out_df, out_X = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [("household count", "State")],
+ )
+ assert len(out_df) < n_before
+ assert out_X.shape[0] == len(out_df)
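+        # The state-level household_count group should be gone: either no
+        # household_count rows remain, or none with a state geographic_id.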
+ assert "household_count" not in out_df["variable"].values or not (
+ out_df[out_df["variable"] == "household_count"]["geographic_id"]
+ .isin(["6", "37"])
+ .any()
+ )
+
+ def test_keeps_unmatched_groups(self, sample_data):
+ targets_df, X, target_groups, group_info = sample_data
+ out_df, out_X = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [("household count", "State")],
+ )
+ assert "snap" in out_df["variable"].values
+
+ def test_matrix_rows_match_df(self, sample_data):
+ targets_df, X, target_groups, group_info = sample_data
+ out_df, out_X = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [("snap", "National")],
+ )
+ assert out_X.shape[0] == len(out_df)
+ assert out_X.shape[1] == X.shape[1]
+
+ def test_no_match_keeps_all(self, sample_data):
+ targets_df, X, target_groups, group_info = sample_data
+ out_df, out_X = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [("nonexistent", "National")],
+ )
+ assert len(out_df) == len(targets_df)
+ assert out_X.shape[0] == X.shape[0]
+
+ def test_drop_all_groups(self, sample_data):
+ targets_df, X, target_groups, group_info = sample_data
+ out_df, out_X = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [
+ ("snap", "National"),
+ ("snap", "State"),
+ ("household count", "State"),
+ ],
+ )
+ assert len(out_df) == 0
+ assert out_X.shape[0] == 0
+
+ def test_columns_preserved(self, sample_data):
+ targets_df, X, target_groups, group_info = sample_data
+ out_df, out_X = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [("snap", "National")],
+ )
+ assert out_X.shape[1] == X.shape[1]
+
+ def test_case_insensitive_match(self, sample_data):
+ targets_df, X, target_groups, group_info = sample_data
+ out_df, _ = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [("SNAP", "State")],
+ )
+ out_df2, _ = drop_target_groups(
+ targets_df,
+ X,
+ target_groups,
+ group_info,
+ [("snap", "State")],
+ )
+ assert len(out_df) == len(out_df2)
diff --git a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py
new file mode 100644
index 000000000..2d3f80619
--- /dev/null
+++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py
@@ -0,0 +1,87 @@
+"""Tests for unified_calibration module.
+
+Focuses on rerandomize_takeup: verifies draws differ by
+block and are reproducible within the same block.
+"""
+
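+# A minimal sketch of the seeding pattern these tests pin down (assuming
+# rerandomize_takeup draws per-block uniforms and compares them to a
+# takeup rate; the names below are illustrative, not the module's API):
+#
+#     rng = seeded_rng("takes_up_snap_if_eligible", salt=block_geoid)
+#     takes_up = rng.random(n_persons) < rate
+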
+import numpy as np
+import pytest
+
+from policyengine_us_data.utils.randomness import seeded_rng
+
+
+class TestRerandomizeTakeupSeeding:
+ """Verify seeded_rng(var, salt=block) produces
+ reproducible, block-dependent draws."""
+
+ def test_same_block_same_draws(self):
+ var = "takes_up_snap_if_eligible"
+ block = "010010001001001"
+ rng1 = seeded_rng(var, salt=block)
+ rng2 = seeded_rng(var, salt=block)
+ draws1 = rng1.random(100)
+ draws2 = rng2.random(100)
+ np.testing.assert_array_equal(draws1, draws2)
+
+ def test_different_blocks_different_draws(self):
+ var = "takes_up_snap_if_eligible"
+ rng1 = seeded_rng(var, salt="010010001001001")
+ rng2 = seeded_rng(var, salt="020010001001001")
+ draws1 = rng1.random(100)
+ draws2 = rng2.random(100)
+ assert not np.array_equal(draws1, draws2)
+
+ def test_different_vars_different_draws(self):
+ block = "010010001001001"
+ rng1 = seeded_rng("takes_up_snap_if_eligible", salt=block)
+ rng2 = seeded_rng("takes_up_aca_if_eligible", salt=block)
+ draws1 = rng1.random(100)
+ draws2 = rng2.random(100)
+ assert not np.array_equal(draws1, draws2)
+
+ def test_draws_in_unit_interval(self):
+ rng = seeded_rng(
+ "takes_up_snap_if_eligible",
+ salt="010010001001001",
+ )
+ draws = rng.random(10000)
+ assert draws.min() >= 0.0
+ assert draws.max() < 1.0
+
+ def test_rate_comparison_produces_booleans(self):
+ rng = seeded_rng(
+ "takes_up_snap_if_eligible",
+ salt="010010001001001",
+ )
+ draws = rng.random(10000)
+ rate = 0.75
+ result = draws < rate
+ assert result.dtype == bool
+ frac = result.mean()
+ assert 0.70 < frac < 0.80
+
+
+class TestSimpleTakeupConfig:
+ """Verify the SIMPLE_TAKEUP_VARS config is well-formed."""
+
+ def test_all_entries_have_required_keys(self):
+ from policyengine_us_data.calibration.unified_calibration import (
+ SIMPLE_TAKEUP_VARS,
+ )
+
+ for entry in SIMPLE_TAKEUP_VARS:
+ assert "variable" in entry
+ assert "entity" in entry
+ assert "rate_key" in entry
+ assert entry["entity"] in (
+ "person",
+ "tax_unit",
+ "spm_unit",
+ )
+
+ def test_expected_count(self):
+ from policyengine_us_data.calibration.unified_calibration import (
+ SIMPLE_TAKEUP_VARS,
+ )
+
+ assert len(SIMPLE_TAKEUP_VARS) == 8
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_hierarchical_uprating.py b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py
similarity index 53%
rename from policyengine_us_data/tests/test_local_area_calibration/test_hierarchical_uprating.py
rename to policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py
index 918e6ac86..ea2d49c5c 100644
--- a/policyengine_us_data/tests/test_local_area_calibration/test_hierarchical_uprating.py
+++ b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py
@@ -1,15 +1,18 @@
-"""
-Tests for hierarchical uprating and CD reconciliation.
+"""Tests for UnifiedMatrixBuilder.
+
+Ports uprating/hierarchical tests from test_hierarchical_uprating.py.
+Uses temporary on-disk SQLite DBs, so the tests are self-contained.
"""
import unittest
import tempfile
import os
+
import pandas as pd
from sqlalchemy import create_engine, text
-from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
- SparseMatrixBuilder,
+from policyengine_us_data.calibration.unified_matrix_builder import (
+ UnifiedMatrixBuilder,
)
from policyengine_us_data.db.create_database_tables import (
TARGET_OVERVIEW_VIEW,
@@ -17,13 +20,18 @@
def _create_test_db(db_path):
- """Create test DB with target_overview view and sample data."""
db_uri = f"sqlite:///{db_path}"
engine = create_engine(db_uri)
with engine.connect() as conn:
conn.execute(
- text("CREATE TABLE strata (" "stratum_id INTEGER PRIMARY KEY)")
+ text(
+ "CREATE TABLE strata ("
+ "stratum_id INTEGER PRIMARY KEY, "
+ "definition_hash VARCHAR(64), "
+ "parent_stratum_id INTEGER, "
+ "notes VARCHAR)"
+ )
)
conn.execute(
text(
@@ -46,7 +54,6 @@ def _create_test_db(db_path):
"active INTEGER DEFAULT 1)"
)
)
-
conn.execute(text(TARGET_OVERVIEW_VIEW))
conn.commit()
@@ -54,51 +61,37 @@ def _create_test_db(db_path):
def _insert_aca_ptc_data(engine):
- """Insert ACA PTC test data at national/state/district levels.
-
- State 6 (CA): 3 CDs (601, 602, 603)
- State 37 (NC): 2 CDs (3701, 3702)
-
- All IRS data at period=2022.
- One CMS national person_count at period=2024.
- """
with engine.connect() as conn:
- # Strata: national(1), state CA(2), state NC(3),
- # CDs: 601(4), 602(5), 603(6), 3701(7), 3702(8)
- # CMS national(9)
strata = [1, 2, 3, 4, 5, 6, 7, 8, 9]
for sid in strata:
conn.execute(
- text("INSERT INTO strata VALUES (:sid)"),
- {"sid": sid},
+ text(
+ "INSERT INTO strata "
+ "(stratum_id, parent_stratum_id) "
+ "VALUES (:sid, :parent)"
+ ),
+ {
+ "sid": sid,
+ "parent": None if sid == 1 else 1,
+ },
)
- # Constraints
constraints = [
- # National: aca_ptc > 0
(1, 1, "aca_ptc", ">", "0"),
- # State CA: aca_ptc > 0, state_fips=6
(2, 2, "aca_ptc", ">", "0"),
(3, 2, "state_fips", "=", "6"),
- # State NC: aca_ptc > 0, state_fips=37
(4, 3, "aca_ptc", ">", "0"),
(5, 3, "state_fips", "=", "37"),
- # CD 601
(6, 4, "aca_ptc", ">", "0"),
(7, 4, "congressional_district_geoid", "=", "601"),
- # CD 602
(8, 5, "aca_ptc", ">", "0"),
(9, 5, "congressional_district_geoid", "=", "602"),
- # CD 603
(10, 6, "aca_ptc", ">", "0"),
(11, 6, "congressional_district_geoid", "=", "603"),
- # CD 3701
(12, 7, "aca_ptc", ">", "0"),
(13, 7, "congressional_district_geoid", "=", "3701"),
- # CD 3702
(14, 8, "aca_ptc", ">", "0"),
(15, 8, "congressional_district_geoid", "=", "3702"),
- # CMS national: aca_ptc > 0
(16, 9, "aca_ptc", ">", "0"),
]
for cid, sid, var, op, val in constraints:
@@ -116,48 +109,31 @@ def _insert_aca_ptc_data(engine):
},
)
- # Targets
targets = [
- # National aca_ptc 2022
(1, 1, "aca_ptc", 10000.0, 2022),
- # National tax_unit_count 2022
(2, 1, "tax_unit_count", 500.0, 2022),
- # State CA aca_ptc 2022: 6000
(3, 2, "aca_ptc", 6000.0, 2022),
- # State CA tax_unit_count 2022: 300
(4, 2, "tax_unit_count", 300.0, 2022),
- # State NC aca_ptc 2022: 4000
(5, 3, "aca_ptc", 4000.0, 2022),
- # State NC tax_unit_count 2022: 200
(6, 3, "tax_unit_count", 200.0, 2022),
- # CD 601 aca_ptc 2022: 2000
(7, 4, "aca_ptc", 2000.0, 2022),
- # CD 602 aca_ptc 2022: 2500
(8, 5, "aca_ptc", 2500.0, 2022),
- # CD 603 aca_ptc 2022: 1500
(9, 6, "aca_ptc", 1500.0, 2022),
- # CD 601 tax_unit_count 2022: 100
(10, 4, "tax_unit_count", 100.0, 2022),
- # CD 602 tax_unit_count 2022: 120
(11, 5, "tax_unit_count", 120.0, 2022),
- # CD 603 tax_unit_count 2022: 80
(12, 6, "tax_unit_count", 80.0, 2022),
- # CD 3701 aca_ptc 2022: 2200
(13, 7, "aca_ptc", 2200.0, 2022),
- # CD 3702 aca_ptc 2022: 1800
(14, 8, "aca_ptc", 1800.0, 2022),
- # CD 3701 tax_unit_count 2022: 110
(15, 7, "tax_unit_count", 110.0, 2022),
- # CD 3702 tax_unit_count 2022: 90
(16, 8, "tax_unit_count", 90.0, 2022),
- # CMS national person_count 2024
(17, 9, "person_count", 19743689.0, 2024),
]
for tid, sid, var, val, period in targets:
conn.execute(
text(
"INSERT INTO targets "
- "VALUES (:tid, :sid, :var, :val, :period, 1)"
+ "VALUES (:tid, :sid, :var, :val, "
+ ":period, 1)"
),
{
"tid": tid,
@@ -170,9 +146,7 @@ def _insert_aca_ptc_data(engine):
conn.commit()
-class TestQueryTargetsOverview(unittest.TestCase):
- """Test _query_targets with target_overview view."""
-
+class TestQueryTargets(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
@@ -186,57 +160,46 @@ def tearDownClass(cls):
os.unlink(cls.db_path)
def _make_builder(self, time_period=2024):
- return SparseMatrixBuilder(
+ return UnifiedMatrixBuilder(
db_uri=self.db_uri,
time_period=time_period,
- cds_to_calibrate=["601", "602", "603", "3701", "3702"],
)
def test_domain_variables_filter(self):
- builder = self._make_builder()
- df = builder._query_targets({"domain_variables": ["aca_ptc"]})
+ b = self._make_builder()
+ df = b._query_targets({"domain_variables": ["aca_ptc"]})
self.assertGreater(len(df), 0)
self.assertIn("geo_level", df.columns)
self.assertIn("geographic_id", df.columns)
self.assertIn("domain_variable", df.columns)
def test_all_geo_levels_returned(self):
- builder = self._make_builder()
- df = builder._query_targets({"domain_variables": ["aca_ptc"]})
+ b = self._make_builder()
+ df = b._query_targets({"domain_variables": ["aca_ptc"]})
geo_levels = set(df["geo_level"].unique())
self.assertEqual(geo_levels, {"national", "state", "district"})
def test_best_period_selection(self):
- """All aca_ptc targets at 2022, CMS at 2024."""
- builder = self._make_builder(time_period=2024)
- df = builder._query_targets({"domain_variables": ["aca_ptc"]})
- aca_rows = df[df["variable"] == "aca_ptc"]
- self.assertTrue((aca_rows["period"] == 2022).all())
-
- cms_rows = df[df["variable"] == "person_count"]
- self.assertEqual(len(cms_rows), 1)
- self.assertEqual(cms_rows.iloc[0]["period"], 2024)
+ b = self._make_builder(time_period=2024)
+ df = b._query_targets({"domain_variables": ["aca_ptc"]})
+ aca = df[df["variable"] == "aca_ptc"]
+ self.assertTrue((aca["period"] == 2022).all())
+ cms = df[df["variable"] == "person_count"]
+ self.assertEqual(len(cms), 1)
+ self.assertEqual(cms.iloc[0]["period"], 2024)
def test_geographic_id_populated(self):
- builder = self._make_builder()
- df = builder._query_targets({"domain_variables": ["aca_ptc"]})
+ b = self._make_builder()
+ df = b._query_targets({"domain_variables": ["aca_ptc"]})
national = df[df["geo_level"] == "national"]
self.assertTrue((national["geographic_id"] == "US").all())
-
state_ca = df[
(df["geo_level"] == "state") & (df["geographic_id"] == "6")
]
self.assertGreater(len(state_ca), 0)
- district_601 = df[
- (df["geo_level"] == "district") & (df["geographic_id"] == "601")
- ]
- self.assertGreater(len(district_601), 0)
-
class TestHierarchicalUprating(unittest.TestCase):
- """Test _apply_hierarchical_uprating logic."""
-
@classmethod
def setUpClass(cls):
cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
@@ -250,141 +213,91 @@ def tearDownClass(cls):
os.unlink(cls.db_path)
def _make_builder(self, time_period=2024):
- return SparseMatrixBuilder(
+ return UnifiedMatrixBuilder(
db_uri=self.db_uri,
time_period=time_period,
- cds_to_calibrate=["601", "602", "603", "3701", "3702"],
)
def _get_targets_with_uprating(self, cpi_factor=1.1, pop_factor=1.02):
- builder = self._make_builder(time_period=2024)
- df = builder._query_targets({"domain_variables": ["aca_ptc"]})
+ b = self._make_builder(time_period=2024)
+ df = b._query_targets({"domain_variables": ["aca_ptc"]})
factors = {
(2022, "cpi"): cpi_factor,
(2022, "pop"): pop_factor,
}
df["original_value"] = df["value"].copy()
df["uprating_factor"] = df.apply(
- lambda row: builder._get_uprating_info(
+ lambda row: b._get_uprating_info(
row["variable"], row["period"], factors
)[0],
axis=1,
)
df["value"] = df["original_value"] * df["uprating_factor"]
- return builder, df, factors
+ return b, df, factors
- def test_cd_sums_match_uprated_state_totals(self):
- """After reconciliation, CD sums must equal state * UF."""
- builder, df, factors = self._get_targets_with_uprating(
+ def test_cd_sums_match_uprated_state(self):
+ b, df, factors = self._get_targets_with_uprating(
cpi_factor=1.1, pop_factor=1.02
)
+ result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
+ csv_factors = b._load_aca_ptc_factors()
- result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
-
- # Get the CSV-based uprating factors used
- csv_factors = builder._load_aca_ptc_factors()
-
- # Expected: state_original * csv_factor
- for var, state_fips, state_original in [
+ for var, sf, orig in [
("aca_ptc", 6, 6000.0),
("aca_ptc", 37, 4000.0),
("tax_unit_count", 6, 300.0),
("tax_unit_count", 37, 200.0),
]:
- expected_total = state_original * csv_factors[state_fips][var]
+ expected = orig * csv_factors[sf][var]
cd_rows = result[
(result["variable"] == var)
& (result["geo_level"] == "district")
& (
result["geographic_id"].apply(
- lambda g, s=state_fips: (
+ lambda g, s=sf: (
int(g) // 100 == s if g.isdigit() else False
)
)
)
]
- cd_sum = cd_rows["value"].sum()
self.assertAlmostEqual(
- cd_sum,
- expected_total,
+ cd_rows["value"].sum(),
+ expected,
places=2,
- msg=f"CD sum for {var} state {state_fips}",
+ msg=f"{var} state {sf}",
)
def test_national_and_state_rows_dropped(self):
- """IRS national and state rows (period!=2024) are dropped."""
- builder, df, factors = self._get_targets_with_uprating()
- result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
-
+ b, df, factors = self._get_targets_with_uprating()
+ result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
irs_national = result[
(result["geo_level"] == "national") & (result["period"] != 2024)
]
self.assertEqual(len(irs_national), 0)
-
state_rows = result[result["geo_level"] == "state"]
self.assertEqual(len(state_rows), 0)
def test_cms_person_count_preserved(self):
- """CMS national person_count (period=2024) is NOT dropped."""
- builder, df, factors = self._get_targets_with_uprating()
- result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
-
+ b, df, factors = self._get_targets_with_uprating()
+ result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
cms = result[
(result["variable"] == "person_count") & (result["period"] == 2024)
]
self.assertEqual(len(cms), 1)
self.assertAlmostEqual(cms.iloc[0]["value"], 19743689.0, places=0)
- def test_hif_and_uprating_columns(self):
- """Diagnostic hif and state_uprating_factor columns populated."""
- builder, df, factors = self._get_targets_with_uprating(cpi_factor=1.1)
- result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
-
- cd_aca = result[
- (result["variable"] == "aca_ptc")
- & (result["geo_level"] == "district")
- ]
- self.assertTrue(cd_aca["hif"].notna().all())
- self.assertTrue(cd_aca["state_uprating_factor"].notna().all())
-
def test_hif_is_one_when_cds_sum_to_state(self):
- """HIF == 1.0 when CDs already sum to state total.
-
- The uprating factor now comes from the CSV (state-specific),
- not from national CPI, so we just check HIF and that a
- nonzero uprating factor is set.
- """
- builder, df, factors = self._get_targets_with_uprating(cpi_factor=1.15)
- result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
-
+ b, df, factors = self._get_targets_with_uprating(cpi_factor=1.15)
+ result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors)
cd_aca = result[
(result["variable"] == "aca_ptc")
& (result["geo_level"] == "district")
]
for _, row in cd_aca.iterrows():
- self.assertAlmostEqual(
- row["hif"],
- 1.0,
- places=6,
- msg=(
- f"CD {row['geographic_id']} HIF "
- f"should be 1.0 (CDs sum to state)"
- ),
- )
- self.assertGreater(
- row["state_uprating_factor"],
- 0,
- msg=(
- f"CD {row['geographic_id']} should "
- f"have a positive uprating factor"
- ),
- )
-
- def test_no_data_loss_for_non_hierarchical_rows(self):
- """Rows not in hierarchical_domains are untouched."""
- builder, df, factors = self._get_targets_with_uprating()
+ self.assertAlmostEqual(row["hif"], 1.0, places=6)
- # Add a non-hierarchical row
+ def test_non_hierarchical_rows_untouched(self):
+ b, df, factors = self._get_targets_with_uprating()
extra = pd.DataFrame(
[
{
@@ -401,20 +314,14 @@ def test_no_data_loss_for_non_hierarchical_rows(self):
}
]
)
- df_with_snap = pd.concat([df, extra], ignore_index=True)
-
- result = builder._apply_hierarchical_uprating(
- df_with_snap, ["aca_ptc"], factors
- )
-
- snap_rows = result[result["domain_variable"] == "snap"]
- self.assertEqual(len(snap_rows), 1)
- self.assertEqual(snap_rows.iloc[0]["value"], 5000.0)
+ df2 = pd.concat([df, extra], ignore_index=True)
+ result = b._apply_hierarchical_uprating(df2, ["aca_ptc"], factors)
+ snap = result[result["domain_variable"] == "snap"]
+ self.assertEqual(len(snap), 1)
+ self.assertEqual(snap.iloc[0]["value"], 5000.0)
class TestGetStateUpratingFactors(unittest.TestCase):
- """Test _get_state_uprating_factors."""
-
@classmethod
def setUpClass(cls):
cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
@@ -428,56 +335,17 @@ def tearDownClass(cls):
os.unlink(cls.db_path)
def test_aca_ptc_uses_csv_factors(self):
- """aca_ptc domain loads real state-level factors from CSV."""
- builder = SparseMatrixBuilder(
- db_uri=self.db_uri,
- time_period=2024,
- cds_to_calibrate=["601"],
- )
- df = builder._query_targets({"domain_variables": ["aca_ptc"]})
- national_factors = {
- (2022, "cpi"): 1.08,
- (2022, "pop"): 1.015,
- }
+ b = UnifiedMatrixBuilder(db_uri=self.db_uri, time_period=2024)
+ df = b._query_targets({"domain_variables": ["aca_ptc"]})
+ nf = {(2022, "cpi"): 1.08, (2022, "pop"): 1.015}
df["original_value"] = df["value"].copy()
- result = builder._get_state_uprating_factors(
- "aca_ptc", df, national_factors
- )
-
+ result = b._get_state_uprating_factors("aca_ptc", df, nf)
self.assertIn(6, result)
self.assertIn(37, result)
- # CA: vol_mult ~1.0554, val_mult ~1.1460
- # aca_ptc factor = vol_mult * val_mult
- self.assertAlmostEqual(
- result[6]["aca_ptc"],
- 1.0554375137756227 * 1.1459694989106755,
- places=5,
- )
- # tax_unit_count factor = vol_mult only
- self.assertAlmostEqual(
- result[6]["tax_unit_count"], 1.0554375137756227, places=5
- )
-
- # NC: vol_mult ~1.4784, val_mult ~0.9571
- self.assertAlmostEqual(
- result[37]["aca_ptc"],
- 1.4784049241899557 * 0.9571183533447685,
- places=5,
- )
- self.assertAlmostEqual(
- result[37]["tax_unit_count"], 1.4784049241899557, places=5
- )
-
- def test_non_aca_domain_uses_national_factors(self):
- """Non-aca_ptc domains fall back to national CPI/pop factors."""
- builder = SparseMatrixBuilder(
- db_uri=self.db_uri,
- time_period=2024,
- cds_to_calibrate=["601"],
- )
- # Build a fake targets_df with domain="snap"
+ def test_non_aca_uses_national_factors(self):
+ b = UnifiedMatrixBuilder(db_uri=self.db_uri, time_period=2024)
df = pd.DataFrame(
[
{
@@ -500,21 +368,32 @@ def test_non_aca_domain_uses_national_factors(self):
},
]
)
- national_factors = {
- (2022, "cpi"): 1.08,
- (2022, "pop"): 1.015,
- }
-
- result = builder._get_state_uprating_factors(
- "snap", df, national_factors
- )
-
+ nf = {(2022, "cpi"): 1.08, (2022, "pop"): 1.015}
+ result = b._get_state_uprating_factors("snap", df, nf)
self.assertIn(6, result)
- # snap is dollar -> CPI
self.assertAlmostEqual(result[6]["snap"], 1.08)
- # household_count -> pop
self.assertAlmostEqual(result[6]["household_count"], 1.015)
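+# The builder is assumed to treat variables ending in "_count" as count
+# targets rather than dollar aggregates; these tests pin that convention.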
+class TestCountTargetDetection(unittest.TestCase):
+ def test_endswith_count(self):
+ count_vars = [
+ "person_count",
+ "tax_unit_count",
+ "household_count",
+ ]
+ value_vars = ["snap", "aca_ptc", "income_tax"]
+ for v in count_vars:
+ self.assertTrue(
+ v.endswith("_count"),
+ f"{v} should be detected as count",
+ )
+ for v in value_vars:
+ self.assertFalse(
+ v.endswith("_count"),
+ f"{v} should not be a count target",
+ )
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py
index ce36157cc..dfede8002 100644
--- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py
+++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py
@@ -1,57 +1,8 @@
-"""Shared fixtures for local area calibration tests.
-
-Importantly, this file determines which variables will be included in the sparse matrix and calibrating routine.
-"""
+"""Shared fixtures for local area calibration tests."""
import pytest
-import numpy as np
-from sqlalchemy import create_engine, text
-from policyengine_us import Microsimulation
from policyengine_us_data.storage import STORAGE_FOLDER
-from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
- SparseMatrixBuilder,
-)
-from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (
- MatrixTracer,
-)
-from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
- get_calculated_variables,
-)
-
-# Variables to test for state-level value matching (CI uses subset for speed)
-# Format: (variable_name, rtol)
-# variable_name as per the targets in policy_data.db
-# rtol is relative tolerance for comparison
-#
-# NOTE: Count targets (person_count, tax_unit_count) are excluded because
-# they have constraints (e.g., age>=5|age<18) that make the X_sparse values
-# different from raw sim.calculate() values. Count targets are tested
-# separately in test_count_targets.py with controlled mock data.
-VARIABLES_TO_TEST = [
- ("snap", 1e-2),
- ("income_tax", 1e-2),
- ("eitc", 1e-2),
-]
-
-# CI filter config - minimal subset for fast CI runs
-# Tests 3 representative variables covering benefits, taxes, and credits
-COMBINED_FILTER_CONFIG = {
- "domain_variables": [
- "snap",
- ],
- "variables": [
- "snap",
- "income_tax",
- "eitc",
- ],
-}
-
-# Maximum allowed mismatch rate for state-level value comparison
-MAX_MISMATCH_RATE = 0.02
-
-# Number of samples for cell-level verification tests
-N_VERIFICATION_SAMPLES = 500
@pytest.fixture(scope="module")
@@ -63,92 +14,3 @@ def db_uri():
@pytest.fixture(scope="module")
def dataset_path():
return str(STORAGE_FOLDER / "stratified_extended_cps_2024.h5")
-
-
-@pytest.fixture(scope="module")
-def test_cds(db_uri):
- """CDs from NC, HI, MT, AK (manageable size for CI, multiple same-state CDs)."""
- engine = create_engine(db_uri)
- query = """
- SELECT DISTINCT sc.value as cd_geoid
- FROM stratum_constraints sc
- WHERE sc.constraint_variable = 'congressional_district_geoid'
- AND (
- sc.value LIKE '37__'
- OR sc.value LIKE '150_'
- OR sc.value LIKE '300_'
- OR sc.value = '200' OR sc.value = '201'
- )
- ORDER BY sc.value
- """
- with engine.connect() as conn:
- result = conn.execute(text(query)).fetchall()
- return [row[0] for row in result]
-
-
-@pytest.fixture(scope="module")
-def sim(dataset_path):
- return Microsimulation(dataset=dataset_path)
-
-
-@pytest.fixture(scope="module")
-def matrix_data(db_uri, dataset_path, test_cds, sim):
- """Build sparse matrix with all configured variables."""
- builder = SparseMatrixBuilder(
- db_uri,
- time_period=2023,
- cds_to_calibrate=test_cds,
- dataset_path=dataset_path,
- )
- targets_df, X_sparse, household_id_mapping = builder.build_matrix(
- sim, target_filter=COMBINED_FILTER_CONFIG
- )
- return targets_df, X_sparse, household_id_mapping
-
-
-@pytest.fixture(scope="module")
-def targets_df(matrix_data):
- return matrix_data[0]
-
-
-@pytest.fixture(scope="module")
-def X_sparse(matrix_data):
- return matrix_data[1]
-
-
-@pytest.fixture(scope="module")
-def household_id_mapping(matrix_data):
- return matrix_data[2]
-
-
-@pytest.fixture(scope="module")
-def tracer(targets_df, X_sparse, household_id_mapping, test_cds, sim):
- return MatrixTracer(
- targets_df, X_sparse, household_id_mapping, test_cds, sim
- )
-
-
-@pytest.fixture(scope="module")
-def n_households(tracer):
- return tracer.n_households
-
-
-@pytest.fixture(scope="module")
-def household_ids(tracer):
- return tracer.original_household_ids
-
-
-@pytest.fixture(scope="module")
-def household_states(sim):
- return sim.calculate("state_fips", map_to="household").values
-
-
-def create_state_simulation(dataset_path, n_households, state):
- """Create simulation with all households assigned to a specific state."""
- s = Microsimulation(dataset=dataset_path)
- s.set_input(
- "state_fips", 2023, np.full(n_households, state, dtype=np.int32)
- )
- for var in get_calculated_variables(s):
- s.delete_arrays(var)
- return s
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_column_indexing.py b/policyengine_us_data/tests/test_local_area_calibration/test_column_indexing.py
deleted file mode 100644
index 2e23763bc..000000000
--- a/policyengine_us_data/tests/test_local_area_calibration/test_column_indexing.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""Test column indexing in sparse matrix."""
-
-import pytest
-
-
-def test_column_indexing_roundtrip(X_sparse, tracer, test_cds):
- """
- Verify column index = cd_idx * n_households + household_index.
-
- This is pure math - if this fails, everything else is unreliable.
- """
- n_hh = tracer.n_households
- hh_ids = tracer.original_household_ids
- errors = []
-
- test_cases = []
- for cd_idx in [0, len(test_cds) // 2, len(test_cds) - 1]:
- for hh_idx in [0, 100, n_hh - 1]:
- test_cases.append((cd_idx, hh_idx))
-
- for cd_idx, hh_idx in test_cases:
- cd = test_cds[cd_idx]
- hh_id = hh_ids[hh_idx]
- expected_col = cd_idx * n_hh + hh_idx
- col_info = tracer.get_column_info(expected_col)
- positions = tracer.get_household_column_positions(hh_id)
- pos_col = positions[cd]
-
- if col_info["cd_geoid"] != cd:
- errors.append(f"CD mismatch at col {expected_col}")
- if col_info["household_index"] != hh_idx:
- errors.append(f"HH index mismatch at col {expected_col}")
- if col_info["household_id"] != hh_id:
- errors.append(f"HH ID mismatch at col {expected_col}")
- if pos_col != expected_col:
- errors.append(f"Position mismatch for hh {hh_id}, cd {cd}")
-
- assert not errors, f"Column indexing errors: {errors}"
-
-
-def test_matrix_dimensions(X_sparse, tracer, test_cds):
- """Verify matrix width matches expected CD x household count."""
- n_hh = tracer.n_households
- expected_cols = len(test_cds) * n_hh
- assert (
- X_sparse.shape[1] == expected_cols
- ), f"Matrix width mismatch: expected {expected_cols}, got {X_sparse.shape[1]}"
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_count_targets.py b/policyengine_us_data/tests/test_local_area_calibration/test_count_targets.py
deleted file mode 100644
index 46eae4ebb..000000000
--- a/policyengine_us_data/tests/test_local_area_calibration/test_count_targets.py
+++ /dev/null
@@ -1,415 +0,0 @@
-"""
-Tests for count target handling in SparseMatrixBuilder.
-
-These tests verify that count targets (e.g., person_count, tax_unit_count)
-are correctly handled by counting entities that satisfy constraints, rather
-than summing values.
-"""
-
-import pytest
-import numpy as np
-from dataclasses import dataclass
-
-from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
- SparseMatrixBuilder,
-)
-
-
-@dataclass
-class MockEntity:
- """Mock entity with a key attribute."""
-
- key: str
-
-
-@dataclass
-class MockVariable:
- """Mock variable with entity information."""
-
- entity: MockEntity
-
- @classmethod
- def create(cls, entity_key: str) -> "MockVariable":
- return cls(entity=MockEntity(key=entity_key))
-
-
-class MockTaxBenefitSystem:
- """Mock tax benefit system with variable definitions."""
-
- def __init__(self):
- self.variables = {
- "person_count": MockVariable.create("person"),
- "tax_unit_count": MockVariable.create("tax_unit"),
- "household_count": MockVariable.create("household"),
- "spm_unit_count": MockVariable.create("spm_unit"),
- "snap": MockVariable.create("spm_unit"),
- }
-
-
-@dataclass
-class MockCalculationResult:
- """Mock result from simulation.calculate()."""
-
- values: np.ndarray
-
-
-class MockSimulation:
- """Mock simulation for testing count target calculations."""
-
- def __init__(self, entity_data: dict, variable_values: dict):
- """
- Args:
- entity_data: Dict with person_id, household_id, tax_unit_id,
- spm_unit_id arrays (all at person level)
- variable_values: Dict mapping variable names to their values
- at the appropriate entity level
- """
- self.entity_data = entity_data
- self.variable_values = variable_values
- self.tax_benefit_system = MockTaxBenefitSystem()
-
- def calculate(self, variable: str, map_to: str = None):
- """Return mock calculation result."""
- if variable in self.entity_data:
- # Entity ID variables
- if map_to == "person":
- values = np.array(self.entity_data[variable])
- elif map_to == "household":
- # Return unique household IDs
- values = np.array(
- sorted(set(self.entity_data["household_id"]))
- )
- else:
- values = np.array(self.entity_data[variable])
- elif variable in self.variable_values:
- # Regular variables - return at requested level
- val_data = self.variable_values[variable]
- if map_to == "person":
- values = np.array(val_data["person"])
- elif map_to == "household":
- values = np.array(val_data["household"])
- else:
- values = np.array(val_data.get("default", []))
- else:
- values = np.array([])
-
- return MockCalculationResult(values=values)
-
-
-@pytest.fixture
-def basic_entity_data():
- """
- Create mock entity relationships with known household compositions.
-
- Household 1 (id=100): 3 people (ages 5, 12, 40) -> 2 aged 5-17
- Household 2 (id=200): 2 people (ages 3, 25) -> 0 aged 5-17
- Household 3 (id=300): 4 people (ages 6, 8, 10, 45) -> 3 aged 5-17
- """
- return {
- "person_id": [1, 2, 3, 4, 5, 6, 7, 8, 9],
- "household_id": [100, 100, 100, 200, 200, 300, 300, 300, 300],
- "tax_unit_id": [10, 10, 10, 20, 20, 30, 30, 30, 30],
- "spm_unit_id": [
- 1000,
- 1000,
- 1000,
- 2000,
- 2000,
- 3000,
- 3000,
- 3000,
- 3000,
- ],
- }
-
-
-@pytest.fixture
-def basic_variable_values():
- """Variable values for basic household composition tests."""
- return {
- "age": {
- "person": [5, 12, 40, 3, 25, 6, 8, 10, 45],
- "household": [40, 25, 45], # Not used for age constraints
- },
- "person_count": {
- "person": [1, 1, 1, 1, 1, 1, 1, 1, 1],
- "household": [3, 2, 4], # Sum per household
- },
- "snap": {
- "person": [100, 100, 100, 0, 0, 200, 200, 200, 200],
- "household": [300, 0, 800],
- },
- }
-
-
-@pytest.fixture
-def basic_sim(basic_entity_data, basic_variable_values):
- """Mock simulation with basic household compositions."""
- return MockSimulation(basic_entity_data, basic_variable_values)
-
-
-@pytest.fixture
-def builder():
- """Create a minimal SparseMatrixBuilder (won't use DB for unit tests)."""
- return SparseMatrixBuilder(
- db_uri="sqlite:///:memory:",
- time_period=2023,
- cds_to_calibrate=["101"],
- )
-
-
-# Tests for basic count target calculation
-class TestCountTargetCalculation:
- """Test _calculate_target_values_entity_aware for count targets."""
-
- def test_person_count_with_age_constraints(self, builder, basic_sim):
- """Test person_count correctly counts persons in age range per HH."""
- # Constraints: age >= 5 AND age < 18
- constraints = [
- {"variable": "age", "operation": ">=", "value": 5},
- {"variable": "age", "operation": "<", "value": 18},
- ]
-
- geo_mask = np.array([True, True, True]) # All households included
- n_households = 3
-
- result = builder._calculate_target_values_entity_aware(
- basic_sim,
- "person_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: HH1 has 2 people (ages 5, 12), HH2 has 0, HH3 has 3 (6,8,10)
- expected = np.array([2, 0, 3], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_person_count_no_constraints(self, builder, basic_sim):
- """Test person_count without constraints returns all persons per HH."""
- constraints = []
- geo_mask = np.array([True, True, True])
- n_households = 3
-
- result = builder._calculate_target_values_entity_aware(
- basic_sim,
- "person_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: HH1 has 3 people, HH2 has 2, HH3 has 4
- expected = np.array([3, 2, 4], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_person_count_with_geo_mask(self, builder, basic_sim):
- """Test person_count respects geographic mask."""
- constraints = [
- {"variable": "age", "operation": ">=", "value": 5},
- {"variable": "age", "operation": "<", "value": 18},
- ]
-
- # Only include households 1 and 3
- geo_mask = np.array([True, False, True])
- n_households = 3
-
- result = builder._calculate_target_values_entity_aware(
- basic_sim,
- "person_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: HH1=2, HH2=0 (masked out), HH3=3
- expected = np.array([2, 0, 3], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_value_target_uses_sum(self, builder, basic_sim):
- """Test that non-count targets sum values (existing behavior)."""
- # SNAP is a value target, not a count target
- constraints = []
- geo_mask = np.array([True, True, True])
- n_households = 3
-
- result = builder._calculate_target_values_entity_aware(
- basic_sim,
- "snap",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: Sum of snap values per household
- expected = np.array([300, 0, 800], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_household_count_no_constraints(self, builder, basic_sim):
- """Test household_count returns 1 for each qualifying household."""
- constraints = []
- geo_mask = np.array([True, True, True])
- n_households = 3
-
- result = builder._calculate_target_values_entity_aware(
- basic_sim,
- "household_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: 1 for each household in geo_mask
- expected = np.array([1, 1, 1], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_household_count_with_geo_mask(self, builder, basic_sim):
- """Test household_count respects geographic mask."""
- constraints = []
- geo_mask = np.array([True, False, True])
- n_households = 3
-
- result = builder._calculate_target_values_entity_aware(
- basic_sim,
- "household_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: 1 for HH1, 0 for HH2 (masked), 1 for HH3
- expected = np.array([1, 0, 1], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
-
-# Fixtures for complex entity relationship tests
-@pytest.fixture
-def complex_entity_data():
- """
- Create entity data with multiple tax units per household.
-
- Household 1 (id=100): 4 people in 2 tax units
- Tax unit 10: person 1 (age 30, filer), person 2 (age 28)
- Tax unit 11: person 3 (age 65, filer), person 4 (age 62)
- Household 2 (id=200): 2 people in 1 tax unit
- Tax unit 20: person 5 (age 45, filer), person 6 (age 16)
- """
- return {
- "person_id": [1, 2, 3, 4, 5, 6],
- "household_id": [100, 100, 100, 100, 200, 200],
- "tax_unit_id": [10, 10, 11, 11, 20, 20],
- "spm_unit_id": [1000, 1000, 1000, 1000, 2000, 2000],
- }
-
-
-@pytest.fixture
-def complex_variable_values():
- """Variable values for complex entity relationship tests."""
- return {
- "age": {
- "person": [30, 28, 65, 62, 45, 16],
- "household": [65, 45],
- },
- "is_tax_unit_head": {
- "person": [True, False, True, False, True, False],
- "household": [2, 1], # count of heads per HH
- },
- "tax_unit_count": {
- "person": [1, 1, 1, 1, 1, 1],
- "household": [2, 1],
- },
- "person_count": {
- "person": [1, 1, 1, 1, 1, 1],
- "household": [4, 2],
- },
- }
-
-
-@pytest.fixture
-def complex_sim(complex_entity_data, complex_variable_values):
- """Mock simulation with complex entity relationships."""
- return MockSimulation(complex_entity_data, complex_variable_values)
-
-
-# Tests for complex entity relationships
-class TestCountTargetWithRealEntities:
- """Test count targets with more complex entity relationships."""
-
- def test_tax_unit_count_no_constraints(self, builder, complex_sim):
- """Test tax_unit_count counts all tax units per household."""
- constraints = []
- geo_mask = np.array([True, True])
- n_households = 2
-
- result = builder._calculate_target_values_entity_aware(
- complex_sim,
- "tax_unit_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: HH1 has 2 tax units, HH2 has 1
- expected = np.array([2, 1], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_tax_unit_count_with_age_constraint(self, builder, complex_sim):
- """Test tax_unit_count with age constraint on members."""
- # Count tax units that have at least one person aged >= 65
- constraints = [
- {"variable": "age", "operation": ">=", "value": 65},
- ]
- geo_mask = np.array([True, True])
- n_households = 2
-
- result = builder._calculate_target_values_entity_aware(
- complex_sim,
- "tax_unit_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: HH1 has 1 tax unit (TU 11) with person >=65, HH2 has 0
- expected = np.array([1, 0], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_person_count_seniors(self, builder, complex_sim):
- """Test person_count for seniors (age >= 65)."""
- constraints = [
- {"variable": "age", "operation": ">=", "value": 65},
- ]
- geo_mask = np.array([True, True])
- n_households = 2
-
- result = builder._calculate_target_values_entity_aware(
- complex_sim,
- "person_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: HH1 has 1 senior (age 65), HH2 has 0
- expected = np.array([1, 0], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
-
- def test_person_count_children(self, builder, complex_sim):
- """Test person_count for children (age < 18)."""
- constraints = [
- {"variable": "age", "operation": "<", "value": 18},
- ]
- geo_mask = np.array([True, True])
- n_households = 2
-
- result = builder._calculate_target_values_entity_aware(
- complex_sim,
- "person_count",
- constraints,
- geo_mask,
- n_households,
- )
-
- # Expected: HH1 has 0 children, HH2 has 1 (age 16)
- expected = np.array([0, 1], dtype=np.float32)
- np.testing.assert_array_equal(result, expected)
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py
deleted file mode 100644
index 2f44428c5..000000000
--- a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""Test cross-state values match state-swapped simulations."""
-
-import pytest
-import numpy as np
-from collections import defaultdict
-
-from policyengine_us import Microsimulation
-from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (
- get_calculated_variables,
-)
-
-from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES
-
-
-@pytest.mark.skip(
- reason="Sparse matrix builder not used in production; test needs rework after time_period fix"
-)
-def test_cross_state_matches_swapped_sim(
- X_sparse,
- targets_df,
- test_cds,
- dataset_path,
- n_households,
- household_ids,
- household_states,
-):
- """
- Cross-state non-zero cells must match state-swapped simulation.
-
- When household moves to different state, X_sparse should contain the
- value calculated from a fresh simulation with state_fips set to
- destination state.
-
- Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST
- are covered with approximately equal samples per variable.
- """
- seed = 42
- rng = np.random.default_rng(seed)
- n_hh = n_households
- hh_ids = household_ids
- hh_states = household_states
-
- state_sims = {}
-
- def get_state_sim(state):
- if state not in state_sims:
- s = Microsimulation(dataset=dataset_path)
- s.set_input(
- "state_fips", 2023, np.full(n_hh, state, dtype=np.int32)
- )
- for var in get_calculated_variables(s):
- s.delete_arrays(var)
- state_sims[state] = s
- return state_sims[state]
-
- nonzero_rows, nonzero_cols = X_sparse.nonzero()
-
- # Group cross-state cells by variable for stratified sampling
- variable_to_indices = defaultdict(list)
- variables_to_test = {v[0] for v in VARIABLES_TO_TEST}
-
- for i in range(len(nonzero_rows)):
- row_idx = nonzero_rows[i]
- col_idx = nonzero_cols[i]
- cd_idx = col_idx // n_hh
- hh_idx = col_idx % n_hh
- cd = test_cds[cd_idx]
- dest_state = int(cd) // 100
- orig_state = int(hh_states[hh_idx])
-
- # Only include cross-state cells
- if dest_state == orig_state:
- continue
-
- # Get variable for this row
- variable = targets_df.iloc[row_idx]["variable"]
- if variable in variables_to_test:
- variable_to_indices[variable].append(i)
-
- if not variable_to_indices:
- pytest.skip("No cross-state non-zero cells found for test variables")
-
- # Stratified sampling: sample proportionally from each variable
- samples_per_var = max(
- 1, N_VERIFICATION_SAMPLES // len(variable_to_indices)
- )
- sample_indices = []
-
- for variable, indices in variable_to_indices.items():
- n_to_sample = min(samples_per_var, len(indices))
- sampled = rng.choice(indices, n_to_sample, replace=False)
- sample_indices.extend(sampled)
-
- errors = []
- variables_tested = set()
-
- for idx in sample_indices:
- row_idx = nonzero_rows[idx]
- col_idx = nonzero_cols[idx]
- cd_idx = col_idx // n_hh
- hh_idx = col_idx % n_hh
- cd = test_cds[cd_idx]
- dest_state = int(cd) // 100
- variable = targets_df.iloc[row_idx]["variable"]
- actual = float(X_sparse[row_idx, col_idx])
- state_sim = get_state_sim(dest_state)
- expected = float(
- state_sim.calculate(variable, map_to="household").values[hh_idx]
- )
-
- variables_tested.add(variable)
-
- if not np.isclose(actual, expected, atol=0.5):
- errors.append(
- {
- "hh_id": hh_ids[hh_idx],
- "orig_state": int(hh_states[hh_idx]),
- "dest_state": dest_state,
- "variable": variable,
- "actual": actual,
- "expected": expected,
- }
- )
-
- # Report which variables were tested
- missing_vars = variables_to_test - variables_tested
- if missing_vars:
- print(f"Warning: No cross-state cells found for: {missing_vars}")
-
- assert not errors, (
- f"Cross-state verification failed: {len(errors)}/{len(sample_indices)} "
- f"mismatches across {len(variables_tested)} variables. "
- f"First 5: {errors[:5]}"
- )
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_geo_masking.py b/policyengine_us_data/tests/test_local_area_calibration/test_geo_masking.py
deleted file mode 100644
index 9f0033733..000000000
--- a/policyengine_us_data/tests/test_local_area_calibration/test_geo_masking.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""Test geographic masking behavior in sparse matrix."""
-
-import pytest
-import numpy as np
-
-
-def test_state_level_zero_masking(
- X_sparse, targets_df, tracer, test_cds, n_households
-):
- """
- State-level targets have zeros for wrong-state CD columns.
-
- For a target with geographic_id=37 (NC), columns for CDs in other states
- (HI, MT, AK) should all be zero.
- """
- seed = 42
- rng = np.random.default_rng(seed)
- n_hh = n_households
-
- state_targets = []
- for row_idx in range(len(targets_df)):
- geo_id = targets_df.iloc[row_idx].get("geographic_id", "US")
- if geo_id != "US":
- try:
- val = int(geo_id)
- if val < 100:
- state_targets.append((row_idx, val))
- except (ValueError, TypeError):
- pass
-
- if not state_targets:
- pytest.skip("No state-level targets found")
-
- errors = []
- checked = 0
- sample_targets = rng.choice(
- len(state_targets), min(20, len(state_targets)), replace=False
- )
-
- for idx in sample_targets:
- row_idx, target_state = state_targets[idx]
- other_state_cds = [
- (i, cd)
- for i, cd in enumerate(test_cds)
- if int(cd) // 100 != target_state
- ]
- if not other_state_cds:
- continue
-
- sample_cds = rng.choice(
- len(other_state_cds), min(5, len(other_state_cds)), replace=False
- )
- for cd_sample_idx in sample_cds:
- cd_idx, cd = other_state_cds[cd_sample_idx]
- sample_hh = rng.choice(n_hh, min(5, n_hh), replace=False)
- for hh_idx in sample_hh:
- col_idx = cd_idx * n_hh + hh_idx
- actual = X_sparse[row_idx, col_idx]
- checked += 1
- if actual != 0:
- errors.append(
- {"row": row_idx, "cd": cd, "value": float(actual)}
- )
-
- assert (
- not errors
- ), f"State-level masking failed: {len(errors)}/{checked} should be zero"
-
-
-def test_cd_level_zero_masking(
- X_sparse, targets_df, tracer, test_cds, n_households
-):
- """
- CD-level targets have zeros for other CDs, even same-state.
-
- For a target with geographic_id=3707, columns for CDs 3701-3706, 3708-3714
- should all be zero, even though they're all in NC (state 37).
- """
- seed = 42
- rng = np.random.default_rng(seed)
- n_hh = n_households
-
- cd_targets_with_same_state = []
- for row_idx in range(len(targets_df)):
- geo_id = targets_df.iloc[row_idx].get("geographic_id", "US")
- if geo_id != "US":
- try:
- val = int(geo_id)
- if val >= 100:
- target_state = val // 100
- same_state_other_cds = [
- cd
- for cd in test_cds
- if int(cd) // 100 == target_state and cd != geo_id
- ]
- if same_state_other_cds:
- cd_targets_with_same_state.append(
- (row_idx, geo_id, same_state_other_cds)
- )
- except (ValueError, TypeError):
- pass
-
- if not cd_targets_with_same_state:
- pytest.skip(
- "No CD-level targets with same-state other CDs in test_cds"
- )
-
- errors = []
- same_state_checks = 0
-
- for row_idx, target_cd, other_cds in cd_targets_with_same_state[:10]:
- for cd in other_cds:
- cd_idx = test_cds.index(cd)
- for hh_idx in rng.choice(n_hh, 3, replace=False):
- col_idx = cd_idx * n_hh + hh_idx
- actual = X_sparse[row_idx, col_idx]
- same_state_checks += 1
- if actual != 0:
- errors.append(
- {
- "target_cd": target_cd,
- "other_cd": cd,
- "value": float(actual),
- }
- )
-
- assert not errors, (
- f"CD-level masking failed: {len(errors)} same-state-different-CD "
- f"non-zero values. First 5: {errors[:5]}"
- )
-
-
-@pytest.mark.skip(
- reason="Sparse matrix builder not used in production; test needs rework after time_period fix"
-)
-def test_national_no_geo_masking(
- X_sparse, targets_df, tracer, sim, test_cds, dataset_path, n_households
-):
- """
- National targets have no geographic masking.
-
- National targets (geographic_id='US') can have non-zero values for ANY CD.
- Values differ by destination state because benefits are recalculated
- under each state's rules.
- """
- seed = 42
- rng = np.random.default_rng(seed)
- n_hh = n_households
- hh_ids = tracer.original_household_ids
-
- national_rows = [
- i
- for i in range(len(targets_df))
- if targets_df.iloc[i].get("geographic_id", "US") == "US"
- ]
-
- if not national_rows:
- pytest.skip("No national targets found")
-
- states_in_test = sorted(set(int(cd) // 100 for cd in test_cds))
- cds_by_state = {
- state: [cd for cd in test_cds if int(cd) // 100 == state]
- for state in states_in_test
- }
-
- for row_idx in national_rows:
- variable = targets_df.iloc[row_idx]["variable"]
-
- row_data = X_sparse.getrow(row_idx)
- nonzero_cols = row_data.nonzero()[1]
-
- assert (
- len(nonzero_cols) > 0
- ), f"National target row {row_idx} ({variable}) has no non-zero values"
-
- sample_cols = rng.choice(
- nonzero_cols, min(5, len(nonzero_cols)), replace=False
- )
-
- households_checked = 0
- households_with_multi_state_values = 0
-
- for col_idx in sample_cols:
- hh_idx = col_idx % n_hh
-
- values_by_state = {}
- for state, cds in cds_by_state.items():
- cd = cds[0]
- cd_idx = test_cds.index(cd)
- state_col = cd_idx * n_hh + hh_idx
- val = float(X_sparse[row_idx, state_col])
- if val != 0:
- values_by_state[state] = val
-
- households_checked += 1
- if len(values_by_state) > 1:
- households_with_multi_state_values += 1
-
- assert households_with_multi_state_values > 0, (
- f"National target {variable}: no households have values in "
- f"multiple states"
- )
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py
deleted file mode 100644
index 53760834c..000000000
--- a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py
+++ /dev/null
@@ -1,488 +0,0 @@
-"""
-Tests for correctness in the sparse matrix builder, particularly for national level contributions.
-
-These tests verify that:
-1. Matrix shape and structure are correct
-2. Variable aggregation (person to household) preserves totals
-3. National-level targets receive contributions from all states (no geographic
- bias)
-4. Cross-state recalculation applies state-specific rules
-"""
-
-import pytest
-import numpy as np
-import pandas as pd
-from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
- SparseMatrixBuilder,
-)
-
-from .conftest import (
- VARIABLES_TO_TEST,
- COMBINED_FILTER_CONFIG,
-)
-
-# Variables with state-specific variation (e.g., SNAP eligibility)
-VARIABLES_WITH_STATE_VARIATION = [
- "snap",
-]
-
-
-@pytest.fixture(scope="module")
-def builder(db_uri, dataset_path, test_cds):
- """SparseMatrixBuilder configured with test CDs."""
- return SparseMatrixBuilder(
- db_uri=db_uri,
- time_period=2023,
- cds_to_calibrate=test_cds,
- dataset_path=dataset_path,
- )
-
-
-def _get_geo_level(geo_id) -> str:
- """Determine geographic level from geographic_id."""
- if geo_id == "US":
- return "national"
- try:
- val = int(geo_id)
- if 1 <= val <= 56:
- return "state"
- else:
- return "district"
- except (ValueError, TypeError):
- return "unknown"
-
-
-def test_person_level_aggregation_preserves_totals(sim):
- """Health insurance premiums (person-level) sum correctly to household."""
- var = "health_insurance_premiums_without_medicare_part_b"
- person_total = sim.calculate(var, 2023, map_to="person").values.sum()
- household_total = sim.calculate(var, 2023, map_to="household").values.sum()
- assert np.isclose(person_total, household_total, rtol=1e-6)
-
-
-def test_matrix_shape(sim, builder):
- """Matrix should have (n_targets, n_households * n_cds) shape."""
- targets_df, X_sparse, _ = builder.build_matrix(
- sim,
- target_filter={
- "variables": ["health_insurance_premiums_without_medicare_part_b"]
- },
- )
- n_households = len(
- sim.calculate("household_id", map_to="household").values
- )
- n_cds = len(builder.cds_to_calibrate)
- assert X_sparse.shape[1] == n_households * n_cds
-
-
-def test_combined_variables_in_matrix(sim, builder):
- """Matrix should include all configured variables."""
- targets_df, X_sparse, _ = builder.build_matrix(
- sim,
- target_filter=COMBINED_FILTER_CONFIG,
- )
- variables = targets_df["variable"].unique()
-
- for var_name, _ in VARIABLES_TO_TEST:
- assert var_name in variables, f"Missing variable: {var_name}"
-
-
-class TestNationalLevelContributions:
- """
- Tests verifying that national-level targets receive contributions from
- households across all states, not just a geographic subset.
-
- The key insight: for a national target, when we look at a single CD's
- column block, households from ALL original states should potentially
- contribute (subject to meeting eligibility constraints). There should
- be no systematic geographic bias where only households from certain
- states contribute to the national total.
- """
-
- def test_national_targets_receive_multistate_contributions(
- self, targets_df, X_sparse, household_states, n_households, test_cds
- ):
- """
- Verify that national-level targets have contributions from households
- originally from multiple states.
-
- For each national target:
- 1. Look at the matrix row
- 2. For EACH CD's column block, identify which original states have
- non-zero contributions
- 3. Verify contributions come from multiple states (not geographically
- biased)
- """
- state_fips = household_states
- cds = test_cds
-
- # Find national-level targets
- national_targets = targets_df[
- targets_df["geographic_id"].apply(
- lambda x: _get_geo_level(x) == "national"
- )
- ]
-
- if len(national_targets) == 0:
- pytest.skip("No national-level targets found")
-
- results = []
-
- for _, target in national_targets.iterrows():
- row_idx = target.name
- variable = target["variable"]
- row = X_sparse[row_idx, :].toarray().flatten()
-
- # For each CD block, check which original states contribute
- cd_contribution_stats = []
-
- for cd_idx, cd in enumerate(cds):
- col_start = cd_idx * n_households
- col_end = col_start + n_households
- cd_values = row[col_start:col_end]
-
- # Find households with non-zero values in this CD block
- nonzero_mask = cd_values != 0
- nonzero_indices = np.where(nonzero_mask)[0]
-
- if len(nonzero_indices) == 0:
- continue
-
- # Get original states of contributing households
- contributing_states = set(state_fips[nonzero_indices])
-
- cd_contribution_stats.append(
- {
- "cd": cd,
- "cd_state": int(cd) // 100,
- "n_contributing": len(nonzero_indices),
- "n_states": len(contributing_states),
- "contributing_states": contributing_states,
- }
- )
-
- if not cd_contribution_stats:
- results.append(
- {
- "variable": variable,
- "status": "NO_CONTRIBUTIONS",
- "details": "No non-zero values in any CD block",
- }
- )
- continue
-
- # Aggregate stats
- stats_df = pd.DataFrame(cd_contribution_stats)
- avg_states = stats_df["n_states"].mean()
- min_states = stats_df["n_states"].min()
-
- # Check: on average, contributions should come from multiple states
- # (at least 2, since we have CDs from 4 different states)
- passed = avg_states >= 2 and min_states >= 1
-
- results.append(
- {
- "variable": variable,
- "status": "PASSED" if passed else "FAILED",
- "avg_contributing_states": avg_states,
- "min_contributing_states": min_states,
- "n_cd_blocks_with_data": len(stats_df),
- }
- )
-
- # Assert no geographic bias
- failed = [r for r in results if r["status"] == "FAILED"]
- assert len(failed) == 0, (
- f"Geographic bias detected in national targets: "
- f"{[r['variable'] for r in failed]}"
- )
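The block-slicing pattern in the test above condenses to a few lines; the toy row, CD list, and state array below are hypothetical:

```python
# Per-CD-block scan for contributing original states (toy inputs)
import numpy as np

n_households = 5
test_cds = ["601", "3601"]
household_states = np.array([6, 6, 36, 48, 36])  # original state FIPS per household

row = np.array([1.0, 0, 2.0, 0, 3.0,   # CD 601 column block
                0, 4.0, 0, 5.0, 0])    # CD 3601 column block

for cd_idx, cd in enumerate(test_cds):
    block = row[cd_idx * n_households:(cd_idx + 1) * n_households]
    contributing = sorted(set(household_states[block != 0]))
    print(f"CD {cd}: contributing states {contributing}")
# CD 601 draws from states [6, 36] and CD 3601 from [6, 48]:
# no single-origin-state bias in either block.
```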
-
- def test_state_distribution_in_national_targets(
- self, targets_df, X_sparse, household_states, n_households, test_cds
- ):
- """
- Verify the distribution of contributing states in national targets
- roughly matches the original data distribution.
-
- This catches cases where one state dominates the contributions
- disproportionately.
- """
- state_fips = household_states
- cds = test_cds
-
- # Get original state distribution (count of households per state)
- unique_states, original_counts = np.unique(
- state_fips, return_counts=True
- )
- original_dist = dict(zip(unique_states, original_counts))
- total_hh = len(state_fips)
-
- # Find national-level targets
- national_targets = targets_df[
- targets_df["geographic_id"].apply(
- lambda x: _get_geo_level(x) == "national"
- )
- ]
-
- if len(national_targets) == 0:
- pytest.skip("No national-level targets found")
-
- for _, target in national_targets.iterrows():
- row_idx = target.name
- variable = target["variable"]
- row = X_sparse[row_idx, :].toarray().flatten()
-
- # Count contributions by original state across ALL CD blocks
- state_contribution_counts = {}
-
- for cd_idx, cd in enumerate(cds):
- col_start = cd_idx * n_households
- col_end = col_start + n_households
- cd_values = row[col_start:col_end]
-
- nonzero_mask = cd_values != 0
- nonzero_indices = np.where(nonzero_mask)[0]
-
- for hh_idx in nonzero_indices:
- orig_state = state_fips[hh_idx]
- state_contribution_counts[orig_state] = (
- state_contribution_counts.get(orig_state, 0) + 1
- )
-
- if not state_contribution_counts:
- continue
-
- # Check that no single state dominates excessively
- total_contributions = sum(state_contribution_counts.values())
- max_contribution = max(state_contribution_counts.values())
- max_state = max(
- state_contribution_counts, key=state_contribution_counts.get
- )
- max_share = max_contribution / total_contributions
-
-            # The max share should not exceed 70%, nor the state's original
-            # share plus a 20-point margin (whichever cap is tighter)
- original_max_share = original_dist.get(max_state, 0) / total_hh
-
- # Allow 20% margin above original share
- threshold = min(0.7, original_max_share + 0.2)
-
- assert max_share <= threshold, (
- f"State {max_state} dominates national {variable} target with "
- f"{max_share:.1%} of contributions "
- f"(original share: {original_max_share:.1%})"
- )
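In numbers, the dominance check above works like this (all counts hypothetical):

```python
# Worked example of the dominance threshold (hypothetical counts)
original_dist = {6: 400, 36: 300, 48: 200, 12: 100}  # households per state
total_hh = sum(original_dist.values())               # 1000

contribution_counts = {6: 550, 36: 250, 48: 150, 12: 50}
total = sum(contribution_counts.values())            # 1000
max_state = max(contribution_counts, key=contribution_counts.get)  # 6
max_share = contribution_counts[max_state] / total   # 0.55

original_max_share = original_dist[max_state] / total_hh  # 0.40
threshold = min(0.7, original_max_share + 0.2)            # 0.60
assert max_share <= threshold  # 0.55 <= 0.60: no excessive dominance
```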
-
-
-@pytest.mark.skip(
- reason="Sparse matrix builder not used in production; test needs rework after time_period fix"
-)
-class TestCrossStateRecalculation:
- """
- Tests verifying that household values change when borrowed to different
- states, confirming state-specific rules are being applied.
-
- The key insight: for national-level targets (no state constraint), each
- household appears in every CD block. The value in each CD block represents
- what the variable would be if that household lived in that CD's state.
- For state-dependent variables (like SNAP), values should differ across
- states for at least some households.
-
- NOTE: This complements test_cross_state.py which verifies exact values.
- These tests verify that variation exists (state rules are applied).
- """
-
- def test_values_change_across_states_for_national_targets(
- self, targets_df, X_sparse, n_households, test_cds
- ):
- """
- Verify that for national targets, household values vary across CD
- blocks from different states.
-
- This confirms the matrix builder is correctly recalculating variables
- with state-specific rules when households are "borrowed" to different
- geographic areas.
-
- The test checks:
- 1. For each national target, examine households with non-zero values
- 2. Compare each household's value across CD blocks from different states
- 3. At least some households should have different values in different
- states (confirming recalculation with different state rules)
- """
- cds = test_cds
-
- # Group CDs by state
- cds_by_state = {}
- for cd_idx, cd in enumerate(cds):
- state = int(cd) // 100
- if state not in cds_by_state:
- cds_by_state[state] = []
- cds_by_state[state].append((cd_idx, cd))
-
- states = list(cds_by_state.keys())
- if len(states) < 2:
- pytest.skip("Need at least 2 states to test cross-state variation")
-
- # Find national-level targets
- national_targets = targets_df[
- targets_df["geographic_id"].apply(
- lambda x: _get_geo_level(x) == "national"
- )
- ]
-
- if len(national_targets) == 0:
- pytest.skip("No national-level targets found")
-
- results = []
-
- for _, target in national_targets.iterrows():
- if target["variable"] not in VARIABLES_WITH_STATE_VARIATION:
- continue
- row_idx = target.name
- variable = target["variable"]
- row = X_sparse[row_idx, :].toarray().flatten()
-
- # For each household, collect values from different states
- households_with_variation = 0
- households_checked = 0
-
- # Sample households (check every 10th to keep test fast)
- for hh_idx in range(0, n_households, 10):
- # Get this household's value in each state (use first CD of
- # each state)
- state_values = {}
- for state, cd_list in cds_by_state.items():
- cd_idx, _ = cd_list[0] # First CD in this state
- col_idx = cd_idx * n_households + hh_idx
- state_values[state] = row[col_idx]
-
- # Skip if all values are zero (household doesn't qualify for
- # this variable)
- nonzero_values = [v for v in state_values.values() if v != 0]
- if len(nonzero_values) < 2:
- continue
-
- households_checked += 1
-
- # Check if values differ across states
- unique_values = set(nonzero_values)
- if len(unique_values) > 1:
- households_with_variation += 1
-
- variation_rate = (
- households_with_variation / households_checked
- if households_checked > 0
- else 0
- )
-
- results.append(
- {
- "variable": variable,
- "households_checked": households_checked,
- "households_with_variation": households_with_variation,
- "variation_rate": variation_rate,
- }
- )
-
- # For state-dependent variables, we expect SOME variation
- # (not all households will vary - some may have $0 or max benefits
- # regardless of state)
- # The key is that variation exists, confirming recalculation occurs
- for r in results:
- if r["households_checked"] > 0:
- # At least 10% of households should show variation for
- # state-dependent variables
- assert (
- r["variation_rate"] > 0.1 or r["households_checked"] < 10
- ), (
- f"No cross-state variation found for {r['variable']}. "
- f"This suggests state-specific rules may not be applied "
- f"when households are borrowed to different states."
- )
-
- def test_same_household_different_states_shows_rule_changes(
- self, targets_df, X_sparse, household_states, n_households, test_cds
- ):
- """
- Deep dive test: pick specific households and verify their values
- differ across states in a way consistent with state-specific rules.
-
- For SNAP specifically, different states have different:
- - Standard deductions
- - Shelter deduction caps
- - Vehicle allowances
- - Categorical eligibility rules
-
- This test finds households where we can verify the recalculation
- is applying different state rules.
- """
- state_fips_orig = household_states
- cds = test_cds
-
- # Group CDs by state
- cds_by_state = {}
- for cd_idx, cd in enumerate(cds):
- state = int(cd) // 100
- if state not in cds_by_state:
- cds_by_state[state] = []
- cds_by_state[state].append((cd_idx, cd))
-
- states = sorted(cds_by_state.keys())
- if len(states) < 2:
- pytest.skip("Need at least 2 states")
-
- # Find national SNAP target (most state-dependent)
- snap_national = targets_df[
- (targets_df["variable"] == "snap")
- & (
- targets_df["geographic_id"].apply(
- lambda x: _get_geo_level(x) == "national"
- )
- )
- ]
-
- if len(snap_national) == 0:
- pytest.skip("No national SNAP target found")
-
- row_idx = snap_national.iloc[0].name
- row = X_sparse[row_idx, :].toarray().flatten()
-
- # Find households with interesting variation patterns
- example_households = []
-
- for hh_idx in range(n_households):
- state_values = {}
- for state, cd_list in cds_by_state.items():
- cd_idx, _ = cd_list[0]
- col_idx = cd_idx * n_households + hh_idx
- state_values[state] = row[col_idx]
-
- # Look for households where:
- # 1. At least 2 states have non-zero SNAP
- # 2. The values differ significantly (>10% relative difference)
- nonzero_states = {s: v for s, v in state_values.items() if v > 0}
-
- if len(nonzero_states) >= 2:
- values = list(nonzero_states.values())
- max_val = max(values)
- min_val = min(values)
- if min_val > 0 and (max_val - min_val) / min_val > 0.1:
- example_households.append(
- {
- "hh_idx": hh_idx,
- "original_state": state_fips_orig[hh_idx],
- "state_values": nonzero_states,
- "max_val": max_val,
- "min_val": min_val,
- "variation": (max_val - min_val) / min_val,
- }
- )
-
- if len(example_households) >= 5:
- break
-
- # Assert we found at least one household with variation
- assert len(example_households) > 0, (
- "Expected to find households with >10% SNAP variation across "
- "states, confirming state-specific rules are applied"
- )
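The state-grouping idiom both tests above rely on is easy to see with toy inputs; the CD geoids and SNAP values here are hypothetical:

```python
# Cross-state variation check for one household (toy values)
test_cds = ["601", "652", "3601", "4801"]

# Group CD column-block indices by state FIPS (geoid // 100)
cds_by_state = {}
for cd_idx, cd in enumerate(test_cds):
    cds_by_state.setdefault(int(cd) // 100, []).append(cd_idx)

# Hypothetical SNAP values for one household, one per CD block
values = {0: 3000.0, 1: 3000.0, 2: 3400.0, 3: 0.0}

# Use the first CD of each state, as the tests do
state_values = {s: values[idxs[0]] for s, idxs in cds_by_state.items()}
nonzero = [v for v in state_values.values() if v != 0]
print(state_values)                       # {6: 3000.0, 36: 3400.0, 48: 0.0}
print(len(set(nonzero)) > 1)              # True -> state rules were applied
```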
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_period_selection_and_uprating.py b/policyengine_us_data/tests/test_local_area_calibration/test_period_selection_and_uprating.py
deleted file mode 100644
index b6523f91b..000000000
--- a/policyengine_us_data/tests/test_local_area_calibration/test_period_selection_and_uprating.py
+++ /dev/null
@@ -1,246 +0,0 @@
-"""
-Tests for best-period selection and uprating in SparseMatrixBuilder.
-"""
-
-import unittest
-import tempfile
-import os
-import pandas as pd
-from sqlalchemy import create_engine, text
-
-from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (
- SparseMatrixBuilder,
-)
-from policyengine_us_data.db.create_database_tables import (
- TARGET_OVERVIEW_VIEW,
-)
-
-
-class TestPeriodSelectionAndUprating(unittest.TestCase):
- """Test best-period SQL CTE and uprating logic."""
-
- @classmethod
- def setUpClass(cls):
- cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
- cls.db_path = cls.temp_db.name
- cls.temp_db.close()
-
- cls.db_uri = f"sqlite:///{cls.db_path}"
- engine = create_engine(cls.db_uri)
-
- with engine.connect() as conn:
- conn.execute(
-                text("CREATE TABLE strata (stratum_id INTEGER PRIMARY KEY)")
- )
- conn.execute(
- text(
- "CREATE TABLE stratum_constraints ("
- "constraint_id INTEGER PRIMARY KEY, "
- "stratum_id INTEGER, "
- "constraint_variable TEXT, "
- "operation TEXT, "
- "value TEXT)"
- )
- )
- conn.execute(
- text(
- "CREATE TABLE targets ("
- "target_id INTEGER PRIMARY KEY, "
- "stratum_id INTEGER, "
- "variable TEXT, "
- "value REAL, "
- "period INTEGER, "
- "active INTEGER DEFAULT 1)"
- )
- )
-
- conn.execute(text(TARGET_OVERVIEW_VIEW))
- conn.commit()
-
- @classmethod
- def tearDownClass(cls):
- os.unlink(cls.db_path)
-
- def setUp(self):
- engine = create_engine(self.db_uri)
- with engine.connect() as conn:
- conn.execute(text("DELETE FROM targets"))
- conn.execute(text("DELETE FROM stratum_constraints"))
- conn.execute(text("DELETE FROM strata"))
- conn.commit()
-
- def _insert_test_data(self, strata, constraints, targets):
- engine = create_engine(self.db_uri)
- with engine.connect() as conn:
- for stratum_id, group_id in strata:
- conn.execute(
- text("INSERT INTO strata VALUES (:sid)"),
- {"sid": stratum_id},
- )
- for i, (stratum_id, var, op, val) in enumerate(constraints):
- conn.execute(
- text(
- "INSERT INTO stratum_constraints "
- "VALUES (:cid, :sid, :var, :op, :val)"
- ),
- {
- "cid": i + 1,
- "sid": stratum_id,
- "var": var,
- "op": op,
- "val": val,
- },
- )
- for i, (
- stratum_id,
- variable,
- value,
- period,
- ) in enumerate(targets):
- conn.execute(
- text(
- "INSERT INTO targets "
- "(target_id, stratum_id, variable, "
- "value, period) "
- "VALUES (:tid, :sid, :var, :val, :period)"
- ),
- {
- "tid": i + 1,
- "sid": stratum_id,
- "var": variable,
- "val": value,
- "period": period,
- },
- )
- conn.commit()
-
- def _make_builder(self, time_period=2024):
- return SparseMatrixBuilder(
- db_uri=self.db_uri,
- time_period=time_period,
- cds_to_calibrate=["601"],
- )
-
- # ---- Period selection tests ----
-
- def test_best_period_prefers_past(self):
- """Targets at 2022 and 2026 -> picks 2022 for time_period=2024."""
- self._insert_test_data(
- strata=[(1, 1)],
- constraints=[
- (1, "congressional_district_geoid", "=", "601"),
- ],
- targets=[
- (1, "snap", 1000, 2022),
- (1, "snap", 2000, 2026),
- ],
- )
- builder = self._make_builder(time_period=2024)
- df = builder._query_targets({"stratum_ids": [1]})
- self.assertEqual(len(df), 1)
- self.assertEqual(df.iloc[0]["period"], 2022)
- self.assertEqual(df.iloc[0]["value"], 1000)
-
- def test_best_period_uses_future_when_no_past(self):
- """Target only at 2026 -> picks 2026 for time_period=2024."""
- self._insert_test_data(
- strata=[(1, 1)],
- constraints=[
- (1, "congressional_district_geoid", "=", "601"),
- ],
- targets=[
- (1, "snap", 5000, 2026),
- ],
- )
- builder = self._make_builder(time_period=2024)
- df = builder._query_targets({"stratum_ids": [1]})
- self.assertEqual(len(df), 1)
- self.assertEqual(df.iloc[0]["period"], 2026)
-
- def test_best_period_exact_match(self):
- """Targets at 2022, 2024, 2026 -> picks 2024 exactly."""
- self._insert_test_data(
- strata=[(1, 1)],
- constraints=[
- (1, "congressional_district_geoid", "=", "601"),
- ],
- targets=[
- (1, "snap", 1000, 2022),
- (1, "snap", 1500, 2024),
- (1, "snap", 2000, 2026),
- ],
- )
- builder = self._make_builder(time_period=2024)
- df = builder._query_targets({"stratum_ids": [1]})
- self.assertEqual(len(df), 1)
- self.assertEqual(df.iloc[0]["period"], 2024)
- self.assertEqual(df.iloc[0]["value"], 1500)
-
- def test_independent_per_stratum_and_variable(self):
- """Different strata/variables select independently."""
- self._insert_test_data(
- strata=[(1, 1), (2, 1)],
- constraints=[
- (1, "congressional_district_geoid", "=", "601"),
- (2, "congressional_district_geoid", "=", "601"),
- ],
- targets=[
- (1, "snap", 1000, 2024),
- (1, "snap", 800, 2022),
- (2, "person_count", 500, 2022),
- (2, "person_count", 600, 2026),
- ],
- )
- builder = self._make_builder(time_period=2024)
- df = builder._query_targets({"stratum_ids": [1, 2]})
- self.assertEqual(len(df), 2)
- snap_row = df[df["variable"] == "snap"].iloc[0]
- self.assertEqual(snap_row["period"], 2024)
- count_row = df[df["variable"] == "person_count"].iloc[0]
- self.assertEqual(count_row["period"], 2022)
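The selection rule these four tests pin down can be restated in a few lines of Python. Per the class docstring, the production implementation is a SQL CTE, so this is only an illustrative equivalent:

```python
# Best-period rule implied by the tests above: prefer the most recent
# period at or before time_period; otherwise fall back to the earliest
# future period.
def best_period(periods, time_period):
    past = [p for p in periods if p <= time_period]
    if past:
        return max(past)
    return min(periods)  # all remaining periods are in the future

assert best_period([2022, 2026], 2024) == 2022        # prefers past
assert best_period([2026], 2024) == 2026              # future when no past
assert best_period([2022, 2024, 2026], 2024) == 2024  # exact match wins
```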
-
- # ---- Uprating info tests ----
-
- def test_cpi_uprating_for_dollar_vars(self):
- builder = self._make_builder(time_period=2024)
- factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.01}
- factor, type_ = builder._get_uprating_info("snap", 2022, factors)
- self.assertAlmostEqual(factor, 1.06)
- self.assertEqual(type_, "cpi")
-
- def test_pop_uprating_for_count_vars(self):
- builder = self._make_builder(time_period=2024)
- factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.01}
- factor, type_ = builder._get_uprating_info(
- "person_count", 2022, factors
- )
- self.assertAlmostEqual(factor, 1.01)
- self.assertEqual(type_, "pop")
-
- def test_no_uprating_for_current_period(self):
- builder = self._make_builder(time_period=2024)
- factors = {(2024, "cpi"): 1.0, (2024, "pop"): 1.0}
- factor, type_ = builder._get_uprating_info("snap", 2024, factors)
- self.assertAlmostEqual(factor, 1.0)
- self.assertEqual(type_, "none")
-
- def test_pop_uprating_households_variable(self):
- builder = self._make_builder(time_period=2024)
- factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.02}
- factor, type_ = builder._get_uprating_info("households", 2022, factors)
- self.assertAlmostEqual(factor, 1.02)
- self.assertEqual(type_, "pop")
-
- def test_pop_uprating_tax_units_variable(self):
- builder = self._make_builder(time_period=2024)
- factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.02}
- factor, type_ = builder._get_uprating_info("tax_units", 2022, factors)
- self.assertAlmostEqual(factor, 1.02)
- self.assertEqual(type_, "pop")
-
- def test_missing_factor_defaults_to_1(self):
- builder = self._make_builder(time_period=2024)
- factors = {}
- factor, type_ = builder._get_uprating_info("snap", 2020, factors)
- self.assertAlmostEqual(factor, 1.0)
- self.assertEqual(type_, "cpi")
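Similarly, the uprating dispatch can be sketched as below; the `COUNT_VARIABLES` set and the function shape are inferred from the test cases above, not taken from the production code:

```python
# Uprating dispatch implied by the tests: pop factors for count-like
# variables, cpi for dollar variables, no uprating for the current period,
# and a default factor of 1.0 when no factor is available.
COUNT_VARIABLES = {"person_count", "households", "tax_units"}  # inferred set

def get_uprating_info(variable, period, factors, time_period=2024):
    if period == time_period:
        return 1.0, "none"
    kind = "pop" if variable in COUNT_VARIABLES else "cpi"
    return factors.get((period, kind), 1.0), kind

factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.01}
assert get_uprating_info("snap", 2022, factors) == (1.06, "cpi")
assert get_uprating_info("person_count", 2022, factors) == (1.01, "pop")
assert get_uprating_info("snap", 2024, factors) == (1.0, "none")
assert get_uprating_info("snap", 2020, factors) == (1.0, "cpi")  # missing -> 1.0
```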
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py
deleted file mode 100644
index 065b99201..000000000
--- a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""Test same-state values match original simulation values."""
-
-import pytest
-import numpy as np
-from collections import defaultdict
-
-from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES
-
-
-@pytest.mark.skip(
- reason="Sparse matrix builder not used in production; test needs rework after time_period fix"
-)
-def test_same_state_matches_original(
- sim,
- X_sparse,
- targets_df,
- test_cds,
- n_households,
- household_ids,
- household_states,
-):
- """
- Same-state non-zero cells must match ORIGINAL simulation values.
-    When a household stays in its original state, X_sparse should contain
-    the value from the original simulation (ground truth from the H5 dataset).
- from the original simulation (ground truth from H5 dataset).
-
- Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST
- are covered with approximately equal samples per variable.
- """
- seed = 42
- rng = np.random.default_rng(seed)
- n_hh = n_households
- hh_ids = household_ids
- hh_states = household_states
-
- nonzero_rows, nonzero_cols = X_sparse.nonzero()
-
- # Group same-state cells by variable for stratified sampling
- variable_to_indices = defaultdict(list)
- variables_to_test = {v[0] for v in VARIABLES_TO_TEST}
-
- for i in range(len(nonzero_rows)):
- row_idx = nonzero_rows[i]
- col_idx = nonzero_cols[i]
- cd_idx = col_idx // n_hh
- hh_idx = col_idx % n_hh
- cd = test_cds[cd_idx]
- dest_state = int(cd) // 100
- orig_state = int(hh_states[hh_idx])
-
- # Only include same-state cells
- if dest_state != orig_state:
- continue
-
- variable = targets_df.iloc[row_idx]["variable"]
- if variable in variables_to_test:
- variable_to_indices[variable].append(i)
-
- if not variable_to_indices:
- pytest.skip("No same-state non-zero cells found for test variables")
-
-    # Stratified sampling: give each variable an equal share of the budget
- samples_per_var = max(
- 1, N_VERIFICATION_SAMPLES // len(variable_to_indices)
- )
- sample_indices = []
-
- for variable, indices in variable_to_indices.items():
- n_to_sample = min(samples_per_var, len(indices))
- sampled = rng.choice(indices, n_to_sample, replace=False)
- sample_indices.extend(sampled)
-
- # Cache original values per variable to avoid repeated calculations
- original_values_cache = {}
-
- def get_original_values(variable):
- if variable not in original_values_cache:
- original_values_cache[variable] = sim.calculate(
- variable, map_to="household"
- ).values
- return original_values_cache[variable]
-
- errors = []
- variables_tested = set()
-
- for idx in sample_indices:
- row_idx = nonzero_rows[idx]
- col_idx = nonzero_cols[idx]
- cd_idx = col_idx // n_hh
- hh_idx = col_idx % n_hh
- variable = targets_df.iloc[row_idx]["variable"]
- actual = float(X_sparse[row_idx, col_idx])
-
- # Compare to ORIGINAL simulation values (ground truth)
- original_values = get_original_values(variable)
- expected = float(original_values[hh_idx])
-
- variables_tested.add(variable)
-
- if not np.isclose(actual, expected, atol=0.5):
- errors.append(
- {
- "hh_id": hh_ids[hh_idx],
- "hh_idx": hh_idx,
- "variable": variable,
- "actual": actual,
- "expected": expected,
- "diff": actual - expected,
- "rel_diff": (
- (actual - expected) / expected
- if expected != 0
- else np.inf
- ),
- }
- )
-
- missing_vars = variables_to_test - variables_tested
- if missing_vars:
- print(f"Warning: No same-state cells found for: {missing_vars}")
-
- assert not errors, (
- f"Same-state verification failed: {len(errors)}/{len(sample_indices)} "
- f"mismatches across {len(variables_tested)} variables. "
- f"First 5: {errors[:5]}"
- )
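The equal-allocation sampling used above is worth seeing end to end; the sample budget and index pools below are hypothetical toy values:

```python
# Stratified sampling sketch: each variable gets an equal share of the
# verification budget, capped by how many cells it actually has.
import numpy as np

rng = np.random.default_rng(42)
N_VERIFICATION_SAMPLES = 6  # hypothetical budget

variable_to_indices = {
    "snap": list(range(0, 100)),
    "employment_income": list(range(100, 110)),
    "person_count": list(range(110, 112)),
}

samples_per_var = max(1, N_VERIFICATION_SAMPLES // len(variable_to_indices))
sample_indices = []
for variable, indices in variable_to_indices.items():
    n_to_sample = min(samples_per_var, len(indices))
    sample_indices.extend(rng.choice(indices, n_to_sample, replace=False))

# Each variable contributes up to 2 samples, so rare variables
# (like person_count here) are still covered.
print(len(sample_indices))  # 6
```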