diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..e4c231aa2 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,22 @@ +- bump: minor + changes: + added: + - Census-block-first calibration pipeline (calibration/ package) ported from PR #516 + - Clone-and-assign module for population-weighted census block sampling + - Unified matrix builder with clone-by-clone simulation, COO caching, and target_overview-based querying + - Unified calibration CLI with L0 optimization and seeded takeup re-randomization + - 28 new tests for the calibration pipeline + - Integration test for build_matrix geographic masking (national/state/CD) + - Tests for drop_target_groups utility + - voluntary_filing.yaml takeup rate parameter + changed: + - Rewrote local_area_calibration_setup.ipynb for clone-based pipeline + - Renamed _get_geo_level to get_geo_level (now cross-module public API) + fixed: + - Fix Jupyter import error in unified_calibration.py (OutStream.reconfigure moved to main) + - Fix modal_app/remote_calibration_runner.py referencing deleted fit_calibration_weights.py + - Fix _coo_parts stale state bug on build_matrix re-call after failure + - Remove hardcoded voluntary_filing rate in favor of YAML parameter + removed: + - SparseMatrixBuilder, MatrixTracer, and fit_calibration_weights (replaced by unified pipeline) + - 8 old SparseMatrixBuilder-dependent tests (replaced by new test_calibration suite) diff --git a/docs/calibration_matrix.ipynb b/docs/calibration_matrix.ipynb index b7edbe507..41497b1e8 100644 --- a/docs/calibration_matrix.ipynb +++ b/docs/calibration_matrix.ipynb @@ -6,11 +6,13 @@ "source": [ "# The Calibration Matrix\n", "\n", - "The calibration pipeline has three stages: (1) compute uprated target values ([`hierarchical_uprating.ipynb`](hierarchical_uprating.ipynb)), (2) assemble the sparse constraint matrix (this notebook), and (3) optimize weights 
([`fit_calibration_weights.py`](../policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py)). This notebook is the diagnostic checkpoint between stages 1 and 2 — understand your matrix before you optimize.\n", + "The calibration pipeline has three stages: (1) compute uprated target values, (2) assemble the sparse constraint matrix (this notebook), and (3) optimize weights (`unified_calibration.py`). This notebook is the diagnostic checkpoint between stages 2 and 3 — understand your matrix before you optimize.\n", "\n", - "We build the full calibration matrix using `SparseMatrixBuilder`, then use `MatrixTracer` to inspect its structure: what rows and columns represent, how target groups partition the loss function, and where sparsity patterns emerge.\n", + "We build the full calibration matrix using `UnifiedMatrixBuilder` with clone-based geography from `assign_random_geography`, then inspect its structure: what rows and columns represent, how target groups partition the loss function, and where sparsity patterns emerge.\n", + "\n", + "**Column layout:** `col = clone_idx * n_records + record_idx`\n", + "\n", - "**Requirements:** `policy_data.db` and the stratified CPS h5 file in `STORAGE_FOLDER`." + "**Requirements:** `policy_data.db`, `block_cd_distributions.csv.gz`, and the stratified CPS h5 file in `STORAGE_FOLDER`." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/baogorek/envs/sep/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from policyengine_us import Microsimulation\n", - "from policyengine_us_data.storage import STORAGE_FOLDER\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (\n", - " SparseMatrixBuilder,\n", - ")\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n", - " get_all_cds_from_database,\n", - " create_target_groups,\n", - " STATE_CODES,\n", - ")\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (\n", - " MatrixTracer,\n", - ")\n", - "\n", - "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n", - "db_uri = f\"sqlite:///{db_path}\"\n", - "dataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\"" - ] + "outputs": [], + "source": "import numpy as np\nimport pandas as pd\nfrom policyengine_us import Microsimulation\nfrom policyengine_us_data.storage import STORAGE_FOLDER\nfrom policyengine_us_data.calibration.unified_matrix_builder import (\n UnifiedMatrixBuilder,\n)\nfrom policyengine_us_data.calibration.clone_and_assign import (\n assign_random_geography,\n)\nfrom policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n create_target_groups,\n drop_target_groups,\n get_geo_level,\n STATE_CODES,\n)\n\ndb_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\ndb_uri = f\"sqlite:///{db_path}\"\ndataset_path = STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\"" }, { "cell_type": "code", @@ -65,32 +38,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Matrix shape: (1411, 5231564)\n", - "Non-zero entries: 2,199,033\n" + "Records: 11,999, Clones: 3, Total columns: 35,997\n", + "Matrix shape: (1411, 35997)\n", + "Non-zero entries: 14,946\n" ] } ], "source": [ "sim = 
Microsimulation(dataset=str(dataset_path))\n", - "cds_to_calibrate = get_all_cds_from_database(db_uri)\n", + "n_records = sim.calculate(\"household_id\", map_to=\"household\").values.shape[0]\n", + "\n", + "N_CLONES = 3 # keep small for diagnostics\n", + "geography = assign_random_geography(n_records, n_clones=N_CLONES, seed=42)\n", "\n", - "builder = SparseMatrixBuilder(\n", + "builder = UnifiedMatrixBuilder(\n", " db_uri=db_uri,\n", " time_period=2024,\n", - " cds_to_calibrate=cds_to_calibrate,\n", " dataset_path=str(dataset_path),\n", ")\n", "\n", - "targets_df, X_sparse, household_id_mapping = builder.build_matrix(\n", + "targets_df, X_sparse, target_names = builder.build_matrix(\n", + " geography,\n", " sim,\n", " target_filter={\"domain_variables\": [\"aca_ptc\", \"snap\"]},\n", " hierarchical_domains=[\"aca_ptc\", \"snap\"],\n", ")\n", "\n", - "tracer = MatrixTracer(\n", - " targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim\n", - ")\n", - "\n", + "n_total = n_records * N_CLONES\n", + "print(f\"Records: {n_records:,}, Clones: {N_CLONES}, Total columns: {n_total:,}\")\n", "print(f\"Matrix shape: {X_sparse.shape}\")\n", "print(f\"Non-zero entries: {X_sparse.nnz:,}\")" ] @@ -104,91 +79,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "================================================================================\n", - "MATRIX STRUCTURE BREAKDOWN\n", - "================================================================================\n", - "\n", - "Matrix dimensions: 1411 rows x 5231564 columns\n", - " Rows = 1411 targets\n", - " Columns = 11999 households x 436 CDs\n", - " = 11,999 x 436 = 5,231,564\n", - "\n", - "--------------------------------------------------------------------------------\n", - "COLUMN STRUCTURE (Households stacked by CD)\n", - 
"--------------------------------------------------------------------------------\n", - "\n", - "Showing first and last 5 CDs of 436 total:\n", - "\n", - "First 5 CDs:\n", - "cd_geoid start_col end_col n_households\n", - " 1001 0 11998 11999\n", - " 101 11999 23997 11999\n", - " 102 23998 35996 11999\n", - " 103 35997 47995 11999\n", - " 104 47996 59994 11999\n", - "\n", - "Last 5 CDs:\n", - "cd_geoid start_col end_col n_households\n", - " 901 5171569 5183567 11999\n", - " 902 5183568 5195566 11999\n", - " 903 5195567 5207565 11999\n", - " 904 5207566 5219564 11999\n", - " 905 5219565 5231563 11999\n", - "\n", - "--------------------------------------------------------------------------------\n", - "ROW STRUCTURE (Targets)\n", - "--------------------------------------------------------------------------------\n", - "\n", - "Total targets: 1411\n", - "\n", - "Targets by domain variable:\n", - " n_targets n_unique_vars\n", - "domain_variable \n", - "aca_ptc 873 3\n", - "snap 538 2\n", - "\n", - "--------------------------------------------------------------------------------\n", - "TARGET GROUPS (for loss calculation)\n", - "--------------------------------------------------------------------------------\n", - "\n", - "=== Creating Target Groups ===\n", - "\n", - "National targets:\n", - " Group 0: ACA PTC Person Count = 19,743,689\n", - "\n", - "State targets:\n", - " Group 1: SNAP Household Count (51 targets)\n", - " Group 2: Snap (51 targets)\n", - "\n", - "District targets:\n", - " Group 3: Aca Ptc (436 targets)\n", - " Group 4: ACA PTC Tax Unit Count (436 targets)\n", - " Group 5: SNAP Household Count (436 targets)\n", - "\n", - "Total groups created: 6\n", - "========================================\n", - " Group 0: National ACA PTC Person Count (1 target, value=19,743,689) - rows [0]\n", - " Group 1: State SNAP Household Count (51 targets) - rows [1, 2, 3, ..., 50, 51]\n", - " Group 2: State Snap (51 targets) - rows [52, 53, 54, ..., 101, 102]\n", - " Group 3: 
District Aca Ptc (436 targets) - rows [103, 104, 105, ..., 537, 538]\n", - " Group 4: District ACA PTC Tax Unit Count (436 targets) - rows [975, 976, 977, ..., 1409, 1410]\n", - " Group 5: District SNAP Household Count (436 targets) - rows [539, 540, 541, ..., 973, 974]\n", - "\n", - "================================================================================\n" - ] - } - ], - "source": [ - "tracer.print_matrix_structure()" - ] + "outputs": [], + "source": "print(f\"Targets: {X_sparse.shape[0]}\")\nprint(f\"Columns: {X_sparse.shape[1]:,} ({N_CLONES} clones x {n_records:,} records)\")\nprint(f\"Non-zeros: {X_sparse.nnz:,}\")\nprint(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nfor level in [0, 1, 2]:\n n = (geo_levels == level).sum()\n if n > 0:\n print(f\" {level_names[level]}: {n} targets\")" }, { "cell_type": "markdown", @@ -196,7 +90,7 @@ "source": [ "## 3. Anatomy of a row\n", "\n", - "Each row is one calibration target — a known aggregate (dollar total, household count, person count) that the optimizer tries to match. The row vector's non-zero entries identify which (household, CD) pairs can contribute to that target." + "Each row is one calibration target — a known aggregate (dollar total, household count, person count) that the optimizer tries to match. The row vector's non-zero entries identify which cloned records can contribute to that target." 
] }, { @@ -208,23 +102,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "Row 705:\n", - " row_index: 705\n", + "Row 705: cd_3402/household_count/[snap>0]\n", " variable: household_count\n", - " variable_desc: Households represented\n", " geographic_id: 3402\n", - " target_value: 48652.0536866581\n", - " stratum_id: 9625\n", - " domain_variable: snap\n" + " geo_level: district\n", + " target value: 48,652\n", + " uprating_factor: 1.0\n" ] } ], "source": [ "mid_row = X_sparse.shape[0] // 2\n", - "row_info = tracer.get_row_info(mid_row)\n", - "print(f\"Row {mid_row}:\")\n", - "for k, v in row_info.items():\n", - " print(f\" {k}: {v}\")" + "row = targets_df.iloc[mid_row]\n", + "print(f\"Row {mid_row}: {target_names[mid_row]}\")\n", + "print(f\" variable: {row['variable']}\")\n", + "print(f\" geographic_id: {row['geographic_id']}\")\n", + "print(f\" geo_level: {row['geo_level']}\")\n", + "print(f\" target value: {row['value']:,.0f}\")\n", + "print(f\" uprating_factor: {row.get('uprating_factor', 'N/A')}\")" ] }, { @@ -236,21 +131,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Row 705 has 1,841 non-zero columns\n", - "\n", - "First non-zero column (1991877):\n", - " column_index: 1991877\n", - " cd_geoid: 3402\n", - " household_id: 952\n", - " household_index: 43\n", - "\n", - "Last non-zero column (2003831):\n", - " column_index: 2003831\n", + "Row 705 has 9 non-zero columns\n", + " Spans 3 clone(s)\n", + " Spans 9 unique record(s)\n", + "\n", + "First non-zero column (8000):\n", + " clone_idx: 0\n", + " record_idx: 8000\n", + " state_fips: 34\n", " cd_geoid: 3402\n", - " household_id: 177860\n", - " household_index: 11997\n", - "\n", - "Spans 1 CD(s)\n" + " value: 1.00\n" ] } ], @@ -260,19 +150,18 @@ "print(f\"Row {mid_row} has {len(nz_cols):,} non-zero columns\")\n", "\n", "if len(nz_cols) > 0:\n", - " first_col = tracer.get_column_info(nz_cols[0])\n", - " last_col = tracer.get_column_info(nz_cols[-1])\n", - " print(f\"\\nFirst non-zero 
column ({nz_cols[0]}):\")\n", - " for k, v in first_col.items():\n", - " print(f\" {k}: {v}\")\n", - " print(f\"\\nLast non-zero column ({nz_cols[-1]}):\")\n", - " for k, v in last_col.items():\n", - " print(f\" {k}: {v}\")\n", - "\n", - " unique_cds = set(\n", - " tracer.get_column_info(c)[\"cd_geoid\"] for c in nz_cols\n", - " )\n", - " print(f\"\\nSpans {len(unique_cds)} CD(s)\")" + " clone_indices = nz_cols // n_records\n", + " record_indices = nz_cols % n_records\n", + " print(f\" Spans {len(np.unique(clone_indices))} clone(s)\")\n", + " print(f\" Spans {len(np.unique(record_indices))} unique record(s)\")\n", + "\n", + " first_col = nz_cols[0]\n", + " print(f\"\\nFirst non-zero column ({first_col}):\")\n", + " print(f\" clone_idx: {first_col // n_records}\")\n", + " print(f\" record_idx: {first_col % n_records}\")\n", + " print(f\" state_fips: {geography.state_fips[first_col]}\")\n", + " print(f\" cd_geoid: {geography.cd_geoid[first_col]}\")\n", + " print(f\" value: {X_sparse[mid_row, first_col]:.2f}\")" ] }, { @@ -281,9 +170,9 @@ "source": [ "## 4. Anatomy of a column\n", "\n", - "Each column represents one (household, CD) pair. The columns are organized in blocks: the first `n_households` columns belong to CD 1, the next to CD 2, and so on. The block formula is:\n", + "Each column represents one (record, clone) pair. Columns are organized in clone blocks: the first `n_records` columns belong to clone 0, the next to clone 1, and so on. 
The block formula is:\n", "\n", - "$$\\text{column\\_idx} = \\text{cd\\_block} \\times n_{\\text{households}} + \\text{hh\\_index}$$" + "$$\\text{column\\_idx} = \\text{clone\\_idx} \\times n_{\\text{records}} + \\text{record\\_idx}$$" ] }, { @@ -295,22 +184,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "Column 60037:\n", - " column_index: 60037\n", - " cd_geoid: 105\n", - " household_id: 946\n", - " household_index: 42\n", + "Column 12041:\n", + " clone_idx: 1\n", + " record_idx: 42\n", + " state_fips: 45\n", + " cd_geoid: 4507\n", + " block_geoid: 450510801013029\n", "\n", "This column has non-zero values in 0 target rows\n" ] } ], "source": [ - "col_idx = tracer.n_households * 5 + 42\n", - "col_info = tracer.get_column_info(col_idx)\n", + "col_idx = 1 * n_records + 42 # clone 1, record 42\n", + "clone_idx = col_idx // n_records\n", + "record_idx = col_idx % n_records\n", "print(f\"Column {col_idx}:\")\n", - "for k, v in col_info.items():\n", - " print(f\" {k}: {v}\")\n", + "print(f\" clone_idx: {clone_idx}\")\n", + "print(f\" record_idx: {record_idx}\")\n", + "print(f\" state_fips: {geography.state_fips[col_idx]}\")\n", + "print(f\" cd_geoid: {geography.cd_geoid[col_idx]}\")\n", + "print(f\" block_geoid: {geography.block_geoid[col_idx]}\")\n", "\n", "col_vec = X_sparse[:, col_idx]\n", "nz_rows = col_vec.nonzero()[0]\n", @@ -318,10 +212,10 @@ "if len(nz_rows) > 0:\n", " print(\"First 5 target rows:\")\n", " for r in nz_rows[:5]:\n", - " ri = tracer.get_row_info(r)\n", + " row = targets_df.iloc[r]\n", " print(\n", - " f\" row {r}: {ri['variable']} \"\n", - " f\"(geo={ri['geographic_id']}, \"\n", + " f\" row {r}: {row['variable']} \"\n", + " f\"(geo={row['geographic_id']}, \"\n", " f\"val={X_sparse[r, col_idx]:.2f})\"\n", " )" ] @@ -335,16 +229,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Block formula verified: cd_block=5 * n_hh=11999 + hh_idx=42 = 60037\n" + "Block formula verified: clone_idx=1 * n_records=11999 + record_idx=42 = 
12041\n" ] } ], "source": [ - "expected_col = 5 * tracer.n_households + 42\n", + "expected_col = 1 * n_records + 42\n", "assert col_idx == expected_col, f\"{col_idx} != {expected_col}\"\n", "print(\n", " f\"Block formula verified: \"\n", - " f\"cd_block=5 * n_hh={tracer.n_households} + hh_idx=42 = {expected_col}\"\n", + " f\"clone_idx=1 * n_records={n_records} + record_idx=42 = {expected_col}\"\n", ")" ] }, @@ -424,30 +318,30 @@ "text": [ "\n", "--- Group 0: National ACA PTC Person Count (1 target, value=19,743,689) ---\n", - " row_index variable variable_desc geographic_id target_value stratum_id domain_variable\n", - " 0 person_count People represented US 19743689.0 491 aca_ptc\n", + " variable geographic_id value\n", + "person_count US 19743689.0\n", "\n", "--- Group 2: State Snap (51 targets) ---\n", - " row_index variable variable_desc geographic_id target_value stratum_id domain_variable\n", - " 52 snap SNAP allotment 1 1733693703.0 9330 snap\n", - " 53 snap SNAP allotment 10 254854243.0 9337 snap\n", - " 54 snap SNAP allotment 11 319119173.0 9338 snap\n", - " 55 snap SNAP allotment 12 6604797454.0 9339 snap\n", - " 56 snap SNAP allotment 13 3281329856.0 9340 snap\n", - " 57 snap SNAP allotment 15 731331421.0 9341 snap\n", - " 58 snap SNAP allotment 16 281230283.0 9342 snap\n", - " 59 snap SNAP allotment 17 4469341818.0 9343 snap\n", + "variable geographic_id value\n", + " snap 1 1733693703.0\n", + " snap 10 254854243.0\n", + " snap 11 319119173.0\n", + " snap 12 6604797454.0\n", + " snap 13 3281329856.0\n", + " snap 15 731331421.0\n", + " snap 16 281230283.0\n", + " snap 17 4469341818.0\n", "\n", "--- Group 4: District ACA PTC Tax Unit Count (436 targets) ---\n", - " row_index variable variable_desc geographic_id target_value stratum_id domain_variable\n", - " 975 tax_unit_count Tax units represented 1001 25064.255490 21717 aca_ptc\n", - " 976 tax_unit_count Tax units represented 101 9794.081624 21631 aca_ptc\n", - " 977 tax_unit_count Tax units represented 
102 11597.544977 21632 aca_ptc\n", - " 978 tax_unit_count Tax units represented 103 9160.097959 21633 aca_ptc\n", - " 979 tax_unit_count Tax units represented 104 9786.728220 21634 aca_ptc\n", - " 980 tax_unit_count Tax units represented 105 18266.234326 21635 aca_ptc\n", - " 981 tax_unit_count Tax units represented 106 25397.026846 21636 aca_ptc\n", - " 982 tax_unit_count Tax units represented 107 11798.642968 21637 aca_ptc\n" + " variable geographic_id value\n", + "tax_unit_count 1001 25064.255490\n", + "tax_unit_count 101 9794.081624\n", + "tax_unit_count 102 11597.544977\n", + "tax_unit_count 103 9160.097959\n", + "tax_unit_count 104 9786.728220\n", + "tax_unit_count 105 18266.234326\n", + "tax_unit_count 106 25397.026846\n", + "tax_unit_count 107 11798.642968\n" ] } ], @@ -455,18 +349,19 @@ "for gid in [0, 2, 4]:\n", " if gid >= len(group_info):\n", " continue\n", - " rows = tracer.get_group_rows(gid)\n", + " mask = target_groups == gid\n", + " rows = targets_df[mask][[\"variable\", \"geographic_id\", \"value\"]].head(8)\n", " print(f\"\\n--- {group_info[gid]} ---\")\n", - " print(rows.head(8).to_string(index=False))" + " print(rows.to_string(index=False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 6. Tracing a household\n", + "## 6. Tracing a household across clones\n", "\n", - "One CPS household appears in every CD block (once per CD = 436 column positions). But most of those columns are zero — the household only contributes where its characteristics match the target constraints." + "One CPS record appears once per clone (N_CLONES column positions). Each clone places it in a different census block/CD/state, so it contributes to different geographic targets depending on the clone." 
] }, { @@ -478,9 +373,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Example SNAP-receiving household: 654\n", + "Example SNAP-receiving household: record index 23\n", "SNAP value: $70\n", - "Column positions across CDs: 436\n" + "\n", + "Column positions across 3 clones:\n", + " col 23: TX (state=48, CD=4829) — 0 non-zero rows\n", + " col 12022: IL (state=17, CD=1708) — 0 non-zero rows\n", + " col 24021: FL (state=12, CD=1220) — 3 non-zero rows\n" ] } ], @@ -488,12 +387,20 @@ "snap_values = sim.calculate(\"snap\", map_to=\"household\").values\n", "hh_ids = sim.calculate(\"household_id\", map_to=\"household\").values\n", "positive_snap = hh_ids[snap_values > 0]\n", - "example_hh = int(positive_snap[0])\n", - "print(f\"Example SNAP-receiving household: {example_hh}\")\n", - "print(f\"SNAP value: ${snap_values[hh_ids == example_hh][0]:,.0f}\")\n", - "\n", - "positions = tracer.get_household_column_positions(example_hh)\n", - "print(f\"Column positions across CDs: {len(positions)}\")" + "example_hh_idx = int(np.where(snap_values > 0)[0][0])\n", + "print(f\"Example SNAP-receiving household: record index {example_hh_idx}\")\n", + "print(f\"SNAP value: ${snap_values[example_hh_idx]:,.0f}\")\n", + "\n", + "clone_cols = [c * n_records + example_hh_idx for c in range(N_CLONES)]\n", + "print(f\"\\nColumn positions across {N_CLONES} clones:\")\n", + "for col in clone_cols:\n", + " state = geography.state_fips[col]\n", + " cd = geography.cd_geoid[col]\n", + " block = geography.block_geoid[col]\n", + " col_vec = X_sparse[:, col]\n", + " nnz = col_vec.nnz\n", + " abbr = STATE_CODES.get(state, \"??\")\n", + " print(f\" col {col}: {abbr} (state={state}, CD={cd}) — {nnz} non-zero rows\")" ] }, { @@ -505,42 +412,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "CDs with non-zero entries: 160\n", - "CDs with all-zero columns: 276\n", "\n", - "Top 10 CDs by activity for household 654:\n", - " CD 1001 (DE): 3 non-zero rows\n", - " CD 1101 (DC): 3 non-zero 
rows\n", - " CD 1201 (FL): 3 non-zero rows\n", - " CD 1202 (FL): 3 non-zero rows\n", - " CD 1203 (FL): 3 non-zero rows\n", - " CD 1204 (FL): 3 non-zero rows\n", - " CD 1205 (FL): 3 non-zero rows\n", - " CD 1206 (FL): 3 non-zero rows\n", - " CD 1207 (FL): 3 non-zero rows\n", - " CD 1208 (FL): 3 non-zero rows\n" + "Clone 2 (col 24021, CD 1220):\n", + " household_count (geo=12): 1.00\n", + " snap (geo=12): 70.08\n", + " household_count (geo=1220): 1.00\n" ] } ], "source": [ - "cd_activity = []\n", - "for cd_geoid, col_pos in positions.items():\n", - " col_vec = X_sparse[:, col_pos]\n", - " nnz = col_vec.nnz\n", - " cd_activity.append({\"cd_geoid\": cd_geoid, \"col_pos\": col_pos, \"nnz\": nnz})\n", - "\n", - "cd_df = pd.DataFrame(cd_activity)\n", - "n_active = (cd_df[\"nnz\"] > 0).sum()\n", - "n_zero = (cd_df[\"nnz\"] == 0).sum()\n", - "print(f\"CDs with non-zero entries: {n_active}\")\n", - "print(f\"CDs with all-zero columns: {n_zero}\")\n", - "\n", - "top10 = cd_df.nlargest(10, \"nnz\")\n", - "print(f\"\\nTop 10 CDs by activity for household {example_hh}:\")\n", - "for _, r in top10.iterrows():\n", - " state_fips = int(r[\"cd_geoid\"]) // 100\n", - " abbr = STATE_CODES.get(state_fips, \"??\")\n", - " print(f\" CD {r['cd_geoid']} ({abbr}): {r['nnz']} non-zero rows\")" + "for col in clone_cols:\n", + " col_vec = X_sparse[:, col]\n", + " nz_rows = col_vec.nonzero()[0]\n", + " if len(nz_rows) == 0:\n", + " continue\n", + " clone_i = col // n_records\n", + " print(f\"\\nClone {clone_i} (col {col}, CD {geography.cd_geoid[col]}):\")\n", + " for r in nz_rows[:5]:\n", + " row = targets_df.iloc[r]\n", + " print(\n", + " f\" {row['variable']} (geo={row['geographic_id']}): \"\n", + " f\"{X_sparse[r, col]:.2f}\"\n", + " )\n", + " if len(nz_rows) > 5:\n", + " print(f\" ... 
and {len(nz_rows) - 5} more\")" ] }, { @@ -559,10 +454,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Total cells: 7,381,736,804\n", - "Non-zero entries: 2,199,033\n", - "Density: 0.000298\n", - "Sparsity: 99.9702%\n" + "Total cells: 50,791,767\n", + "Non-zero entries: 14,946\n", + "Density: 0.000294\n", + "Sparsity: 99.9706%\n" ] } ], @@ -577,52 +472,10 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Non-zeros per row:\n", - " min: 0\n", - " median: 0\n", - " mean: 1,558\n", - " max: 77,116\n", - "\n", - "By geographic level:\n", - " National : n= 1, median nnz= 0, range=[0, 0]\n", - " State : n= 102, median nnz= 10,423, range=[1,468, 77,116]\n", - " District : n=1308, median nnz= 0, range=[0, 1,988]\n" - ] - } - ], - "source": [ - "nnz_per_row = np.diff(X_sparse.indptr)\n", - "print(f\"Non-zeros per row:\")\n", - "print(f\" min: {nnz_per_row.min():,}\")\n", - "print(f\" median: {int(np.median(nnz_per_row)):,}\")\n", - "print(f\" mean: {nnz_per_row.mean():,.0f}\")\n", - "print(f\" max: {nnz_per_row.max():,}\")\n", - "\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n", - " _get_geo_level,\n", - ")\n", - "\n", - "geo_levels = targets_df[\"geographic_id\"].apply(_get_geo_level)\n", - "level_names = {0: \"National\", 1: \"State\", 2: \"District\"}\n", - "print(\"\\nBy geographic level:\")\n", - "for level in [0, 1, 2]:\n", - " mask = (geo_levels == level).values\n", - " if mask.any():\n", - " vals = nnz_per_row[mask]\n", - " print(\n", - " f\" {level_names[level]:10s}: \"\n", - " f\"n={mask.sum():>4d}, \"\n", - " f\"median nnz={int(np.median(vals)):>7,}, \"\n", - " f\"range=[{vals.min():,}, {vals.max():,}]\"\n", - " )" - ] + "outputs": [], + "source": "nnz_per_row = np.diff(X_sparse.indptr)\nprint(f\"Non-zeros per row:\")\nprint(f\" min: {nnz_per_row.min():,}\")\nprint(f\" 
median: {int(np.median(nnz_per_row)):,}\")\nprint(f\" mean: {nnz_per_row.mean():,.0f}\")\nprint(f\" max: {nnz_per_row.max():,}\")\n\ngeo_levels = targets_df[\"geographic_id\"].apply(get_geo_level)\nlevel_names = {0: \"National\", 1: \"State\", 2: \"District\"}\nprint(\"\\nBy geographic level:\")\nfor level in [0, 1, 2]:\n mask = (geo_levels == level).values\n if mask.any():\n vals = nnz_per_row[mask]\n print(\n f\" {level_names[level]:10s}: \"\n f\"n={mask.sum():>4d}, \"\n f\"median nnz={int(np.median(vals)):>7,}, \"\n f\"range=[{vals.min():,}, {vals.max():,}]\"\n )" }, { "cell_type": "code", @@ -633,38 +486,39 @@ "name": "stdout", "output_type": "stream", "text": [ - "Non-zeros per CD block:\n", - " min: 4,326 (CD 2801)\n", - " median: 4,884\n", - " max: 5,964 (CD 1101)\n" + "Non-zeros per clone block:\n", + " clone nnz unique_states\n", + " 0 4962 50\n", + " 1 4988 50\n", + " 2 4996 50\n" ] } ], "source": [ - "n_hh = tracer.n_households\n", - "n_cds = tracer.n_geographies\n", - "cd_nnz = []\n", - "for cd_idx in range(n_cds):\n", - " block = X_sparse[:, cd_idx * n_hh : (cd_idx + 1) * n_hh]\n", - " cd_nnz.append({\n", - " \"cd_geoid\": cds_to_calibrate[cd_idx],\n", + "clone_nnz = []\n", + "for ci in range(N_CLONES):\n", + " block = X_sparse[:, ci * n_records : (ci + 1) * n_records]\n", + " n_states = len(np.unique(geography.state_fips[ci * n_records : (ci + 1) * n_records]))\n", + " clone_nnz.append({\n", + " \"clone\": ci,\n", " \"nnz\": block.nnz,\n", + " \"unique_states\": n_states,\n", " })\n", "\n", - "cd_nnz_df = pd.DataFrame(cd_nnz)\n", - "print(f\"Non-zeros per CD block:\")\n", - "print(f\" min: {cd_nnz_df['nnz'].min():,} (CD {cd_nnz_df.loc[cd_nnz_df['nnz'].idxmin(), 'cd_geoid']})\")\n", - "print(f\" median: {int(cd_nnz_df['nnz'].median()):,}\")\n", - "print(f\" max: {cd_nnz_df['nnz'].max():,} (CD {cd_nnz_df.loc[cd_nnz_df['nnz'].idxmax(), 'cd_geoid']})\")" + "clone_df = pd.DataFrame(clone_nnz)\n", + "print(\"Non-zeros per clone block:\")\n", + 
"print(clone_df.to_string(index=False))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. Group exclusion\n", + "## 8. Dropping target groups\n", + "\n", + "Some target groups are redundant after hierarchical uprating. For example, state-level SNAP Household Count (Group 1) is redundant with district-level SNAP Household Count (Group 5) — the district targets were already reconciled to sum to the state totals.\n", "\n", - "`GROUPS_TO_EXCLUDE` removes redundant or harmful constraints before training. For example, state-level SNAP household counts (Group 1) are redundant with reconciled district rows (Group 4) and can confuse the optimizer. Group IDs depend on database contents, so always check `print_matrix_structure()` output first." + "Specify drops as `(variable_label, geo_level)` pairs. The labels come from the group descriptions above; the geo level is \"National\", \"State\", or \"District\"." ] }, { @@ -676,24 +530,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "Before exclusion: 1411 rows\n", - "Excluding groups [1]: dropping 51 rows\n", - "After exclusion: 1360 rows\n" + "Matrix before: 1411 rows\n", + " DROPPING Group 1: State SNAP Household Count (51 targets) (51 rows)\n", + "\n", + " KEEPING Group 0: National ACA PTC Person Count (1 target, value=19,743,689) (1 rows)\n", + " KEEPING Group 2: State Snap (51 targets) (51 rows)\n", + " KEEPING Group 3: District Aca Ptc (436 targets) (436 rows)\n", + " KEEPING Group 4: District ACA PTC Tax Unit Count (436 targets) (436 rows)\n", + " KEEPING Group 5: District SNAP Household Count (436 targets) (436 rows)\n", + "\n", + "Matrix after: 1360 rows\n" ] } ], "source": [ - "GROUPS_TO_EXCLUDE = [1]\n", - "\n", - "print(f\"Before exclusion: {X_sparse.shape[0]} rows\")\n", + "GROUPS_TO_DROP = [\n", + " (\"SNAP Household Count\", \"State\"),\n", + "]\n", "\n", - "keep_mask = ~np.isin(tracer.target_groups, GROUPS_TO_EXCLUDE)\n", - "n_dropped = (~keep_mask).sum()\n", - "print(f\"Excluding 
groups {GROUPS_TO_EXCLUDE}: dropping {n_dropped} rows\")\n", - "\n", - "X_filtered = X_sparse[keep_mask, :]\n", - "targets_filtered = targets_df[keep_mask].reset_index(drop=True)\n", - "print(f\"After exclusion: {X_filtered.shape[0]} rows\")" + "targets_filtered, X_filtered = drop_target_groups(\n", + " targets_df, X_sparse, target_groups, group_info, GROUPS_TO_DROP\n", + ")" ] }, { @@ -756,883 +613,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Achievable targets: 487\n", - "Impossible targets: 873\n", - "\n", - "Impossible targets:\n", - " aca_ptc/person_count (geo=US)\n", - " aca_ptc/aca_ptc (geo=1001)\n", - " aca_ptc/aca_ptc (geo=101)\n", - " aca_ptc/aca_ptc (geo=102)\n", - " aca_ptc/aca_ptc (geo=103)\n", - " aca_ptc/aca_ptc (geo=104)\n", - " aca_ptc/aca_ptc (geo=105)\n", - " aca_ptc/aca_ptc (geo=106)\n", - " aca_ptc/aca_ptc (geo=107)\n", - " aca_ptc/aca_ptc (geo=1101)\n", - " aca_ptc/aca_ptc (geo=1201)\n", - " aca_ptc/aca_ptc (geo=1202)\n", - " aca_ptc/aca_ptc (geo=1203)\n", - " aca_ptc/aca_ptc (geo=1204)\n", - " aca_ptc/aca_ptc (geo=1205)\n", - " aca_ptc/aca_ptc (geo=1206)\n", - " aca_ptc/aca_ptc (geo=1207)\n", - " aca_ptc/aca_ptc (geo=1208)\n", - " aca_ptc/aca_ptc (geo=1209)\n", - " aca_ptc/aca_ptc (geo=1210)\n", - " aca_ptc/aca_ptc (geo=1211)\n", - " aca_ptc/aca_ptc (geo=1212)\n", - " aca_ptc/aca_ptc (geo=1213)\n", - " aca_ptc/aca_ptc (geo=1214)\n", - " aca_ptc/aca_ptc (geo=1215)\n", - " aca_ptc/aca_ptc (geo=1216)\n", - " aca_ptc/aca_ptc (geo=1217)\n", - " aca_ptc/aca_ptc (geo=1218)\n", - " aca_ptc/aca_ptc (geo=1219)\n", - " aca_ptc/aca_ptc (geo=1220)\n", - " aca_ptc/aca_ptc (geo=1221)\n", - " aca_ptc/aca_ptc (geo=1222)\n", - " aca_ptc/aca_ptc (geo=1223)\n", - " aca_ptc/aca_ptc (geo=1224)\n", - " aca_ptc/aca_ptc (geo=1225)\n", - " aca_ptc/aca_ptc (geo=1226)\n", - " aca_ptc/aca_ptc (geo=1227)\n", - " aca_ptc/aca_ptc (geo=1228)\n", - " aca_ptc/aca_ptc (geo=1301)\n", - " aca_ptc/aca_ptc (geo=1302)\n", - " aca_ptc/aca_ptc (geo=1303)\n", - " 
aca_ptc/aca_ptc (geo=1304)\n", - " aca_ptc/aca_ptc (geo=1305)\n", - " aca_ptc/aca_ptc (geo=1306)\n", - " aca_ptc/aca_ptc (geo=1307)\n", - " aca_ptc/aca_ptc (geo=1308)\n", - " aca_ptc/aca_ptc (geo=1309)\n", - " aca_ptc/aca_ptc (geo=1310)\n", - " aca_ptc/aca_ptc (geo=1311)\n", - " aca_ptc/aca_ptc (geo=1312)\n", - " aca_ptc/aca_ptc (geo=1313)\n", - " aca_ptc/aca_ptc (geo=1314)\n", - " aca_ptc/aca_ptc (geo=1501)\n", - " aca_ptc/aca_ptc (geo=1502)\n", - " aca_ptc/aca_ptc (geo=1601)\n", - " aca_ptc/aca_ptc (geo=1602)\n", - " aca_ptc/aca_ptc (geo=1701)\n", - " aca_ptc/aca_ptc (geo=1702)\n", - " aca_ptc/aca_ptc (geo=1703)\n", - " aca_ptc/aca_ptc (geo=1704)\n", - " aca_ptc/aca_ptc (geo=1705)\n", - " aca_ptc/aca_ptc (geo=1706)\n", - " aca_ptc/aca_ptc (geo=1707)\n", - " aca_ptc/aca_ptc (geo=1708)\n", - " aca_ptc/aca_ptc (geo=1709)\n", - " aca_ptc/aca_ptc (geo=1710)\n", - " aca_ptc/aca_ptc (geo=1711)\n", - " aca_ptc/aca_ptc (geo=1712)\n", - " aca_ptc/aca_ptc (geo=1713)\n", - " aca_ptc/aca_ptc (geo=1714)\n", - " aca_ptc/aca_ptc (geo=1715)\n", - " aca_ptc/aca_ptc (geo=1716)\n", - " aca_ptc/aca_ptc (geo=1717)\n", - " aca_ptc/aca_ptc (geo=1801)\n", - " aca_ptc/aca_ptc (geo=1802)\n", - " aca_ptc/aca_ptc (geo=1803)\n", - " aca_ptc/aca_ptc (geo=1804)\n", - " aca_ptc/aca_ptc (geo=1805)\n", - " aca_ptc/aca_ptc (geo=1806)\n", - " aca_ptc/aca_ptc (geo=1807)\n", - " aca_ptc/aca_ptc (geo=1808)\n", - " aca_ptc/aca_ptc (geo=1809)\n", - " aca_ptc/aca_ptc (geo=1901)\n", - " aca_ptc/aca_ptc (geo=1902)\n", - " aca_ptc/aca_ptc (geo=1903)\n", - " aca_ptc/aca_ptc (geo=1904)\n", - " aca_ptc/aca_ptc (geo=2001)\n", - " aca_ptc/aca_ptc (geo=2002)\n", - " aca_ptc/aca_ptc (geo=2003)\n", - " aca_ptc/aca_ptc (geo=2004)\n", - " aca_ptc/aca_ptc (geo=201)\n", - " aca_ptc/aca_ptc (geo=2101)\n", - " aca_ptc/aca_ptc (geo=2102)\n", - " aca_ptc/aca_ptc (geo=2103)\n", - " aca_ptc/aca_ptc (geo=2104)\n", - " aca_ptc/aca_ptc (geo=2105)\n", - " aca_ptc/aca_ptc (geo=2106)\n", - " aca_ptc/aca_ptc (geo=2201)\n", - " 
aca_ptc/aca_ptc (geo=2202)\n", - " aca_ptc/aca_ptc (geo=2203)\n", - " aca_ptc/aca_ptc (geo=2204)\n", - " aca_ptc/aca_ptc (geo=2205)\n", - " aca_ptc/aca_ptc (geo=2206)\n", - " aca_ptc/aca_ptc (geo=2301)\n", - " aca_ptc/aca_ptc (geo=2302)\n", - " aca_ptc/aca_ptc (geo=2401)\n", - " aca_ptc/aca_ptc (geo=2402)\n", - " aca_ptc/aca_ptc (geo=2403)\n", - " aca_ptc/aca_ptc (geo=2404)\n", - " aca_ptc/aca_ptc (geo=2405)\n", - " aca_ptc/aca_ptc (geo=2406)\n", - " aca_ptc/aca_ptc (geo=2407)\n", - " aca_ptc/aca_ptc (geo=2408)\n", - " aca_ptc/aca_ptc (geo=2501)\n", - " aca_ptc/aca_ptc (geo=2502)\n", - " aca_ptc/aca_ptc (geo=2503)\n", - " aca_ptc/aca_ptc (geo=2504)\n", - " aca_ptc/aca_ptc (geo=2505)\n", - " aca_ptc/aca_ptc (geo=2506)\n", - " aca_ptc/aca_ptc (geo=2507)\n", - " aca_ptc/aca_ptc (geo=2508)\n", - " aca_ptc/aca_ptc (geo=2509)\n", - " aca_ptc/aca_ptc (geo=2601)\n", - " aca_ptc/aca_ptc (geo=2602)\n", - " aca_ptc/aca_ptc (geo=2603)\n", - " aca_ptc/aca_ptc (geo=2604)\n", - " aca_ptc/aca_ptc (geo=2605)\n", - " aca_ptc/aca_ptc (geo=2606)\n", - " aca_ptc/aca_ptc (geo=2607)\n", - " aca_ptc/aca_ptc (geo=2608)\n", - " aca_ptc/aca_ptc (geo=2609)\n", - " aca_ptc/aca_ptc (geo=2610)\n", - " aca_ptc/aca_ptc (geo=2611)\n", - " aca_ptc/aca_ptc (geo=2612)\n", - " aca_ptc/aca_ptc (geo=2613)\n", - " aca_ptc/aca_ptc (geo=2701)\n", - " aca_ptc/aca_ptc (geo=2702)\n", - " aca_ptc/aca_ptc (geo=2703)\n", - " aca_ptc/aca_ptc (geo=2704)\n", - " aca_ptc/aca_ptc (geo=2705)\n", - " aca_ptc/aca_ptc (geo=2706)\n", - " aca_ptc/aca_ptc (geo=2707)\n", - " aca_ptc/aca_ptc (geo=2708)\n", - " aca_ptc/aca_ptc (geo=2801)\n", - " aca_ptc/aca_ptc (geo=2802)\n", - " aca_ptc/aca_ptc (geo=2803)\n", - " aca_ptc/aca_ptc (geo=2804)\n", - " aca_ptc/aca_ptc (geo=2901)\n", - " aca_ptc/aca_ptc (geo=2902)\n", - " aca_ptc/aca_ptc (geo=2903)\n", - " aca_ptc/aca_ptc (geo=2904)\n", - " aca_ptc/aca_ptc (geo=2905)\n", - " aca_ptc/aca_ptc (geo=2906)\n", - " aca_ptc/aca_ptc (geo=2907)\n", - " aca_ptc/aca_ptc (geo=2908)\n", - " 
aca_ptc/aca_ptc (geo=3001)\n", - " aca_ptc/aca_ptc (geo=3002)\n", - " aca_ptc/aca_ptc (geo=3101)\n", - " aca_ptc/aca_ptc (geo=3102)\n", - " aca_ptc/aca_ptc (geo=3103)\n", - " aca_ptc/aca_ptc (geo=3201)\n", - " aca_ptc/aca_ptc (geo=3202)\n", - " aca_ptc/aca_ptc (geo=3203)\n", - " aca_ptc/aca_ptc (geo=3204)\n", - " aca_ptc/aca_ptc (geo=3301)\n", - " aca_ptc/aca_ptc (geo=3302)\n", - " aca_ptc/aca_ptc (geo=3401)\n", - " aca_ptc/aca_ptc (geo=3402)\n", - " aca_ptc/aca_ptc (geo=3403)\n", - " aca_ptc/aca_ptc (geo=3404)\n", - " aca_ptc/aca_ptc (geo=3405)\n", - " aca_ptc/aca_ptc (geo=3406)\n", - " aca_ptc/aca_ptc (geo=3407)\n", - " aca_ptc/aca_ptc (geo=3408)\n", - " aca_ptc/aca_ptc (geo=3409)\n", - " aca_ptc/aca_ptc (geo=3410)\n", - " aca_ptc/aca_ptc (geo=3411)\n", - " aca_ptc/aca_ptc (geo=3412)\n", - " aca_ptc/aca_ptc (geo=3501)\n", - " aca_ptc/aca_ptc (geo=3502)\n", - " aca_ptc/aca_ptc (geo=3503)\n", - " aca_ptc/aca_ptc (geo=3601)\n", - " aca_ptc/aca_ptc (geo=3602)\n", - " aca_ptc/aca_ptc (geo=3603)\n", - " aca_ptc/aca_ptc (geo=3604)\n", - " aca_ptc/aca_ptc (geo=3605)\n", - " aca_ptc/aca_ptc (geo=3606)\n", - " aca_ptc/aca_ptc (geo=3607)\n", - " aca_ptc/aca_ptc (geo=3608)\n", - " aca_ptc/aca_ptc (geo=3609)\n", - " aca_ptc/aca_ptc (geo=3610)\n", - " aca_ptc/aca_ptc (geo=3611)\n", - " aca_ptc/aca_ptc (geo=3612)\n", - " aca_ptc/aca_ptc (geo=3613)\n", - " aca_ptc/aca_ptc (geo=3614)\n", - " aca_ptc/aca_ptc (geo=3615)\n", - " aca_ptc/aca_ptc (geo=3616)\n", - " aca_ptc/aca_ptc (geo=3617)\n", - " aca_ptc/aca_ptc (geo=3618)\n", - " aca_ptc/aca_ptc (geo=3619)\n", - " aca_ptc/aca_ptc (geo=3620)\n", - " aca_ptc/aca_ptc (geo=3621)\n", - " aca_ptc/aca_ptc (geo=3622)\n", - " aca_ptc/aca_ptc (geo=3623)\n", - " aca_ptc/aca_ptc (geo=3624)\n", - " aca_ptc/aca_ptc (geo=3625)\n", - " aca_ptc/aca_ptc (geo=3626)\n", - " aca_ptc/aca_ptc (geo=3701)\n", - " aca_ptc/aca_ptc (geo=3702)\n", - " aca_ptc/aca_ptc (geo=3703)\n", - " aca_ptc/aca_ptc (geo=3704)\n", - " aca_ptc/aca_ptc (geo=3705)\n", - " 
aca_ptc/aca_ptc (geo=3706)\n", - " aca_ptc/aca_ptc (geo=3707)\n", - " aca_ptc/aca_ptc (geo=3708)\n", - " aca_ptc/aca_ptc (geo=3709)\n", - " aca_ptc/aca_ptc (geo=3710)\n", - " aca_ptc/aca_ptc (geo=3711)\n", - " aca_ptc/aca_ptc (geo=3712)\n", - " aca_ptc/aca_ptc (geo=3713)\n", - " aca_ptc/aca_ptc (geo=3714)\n", - " aca_ptc/aca_ptc (geo=3801)\n", - " aca_ptc/aca_ptc (geo=3901)\n", - " aca_ptc/aca_ptc (geo=3902)\n", - " aca_ptc/aca_ptc (geo=3903)\n", - " aca_ptc/aca_ptc (geo=3904)\n", - " aca_ptc/aca_ptc (geo=3905)\n", - " aca_ptc/aca_ptc (geo=3906)\n", - " aca_ptc/aca_ptc (geo=3907)\n", - " aca_ptc/aca_ptc (geo=3908)\n", - " aca_ptc/aca_ptc (geo=3909)\n", - " aca_ptc/aca_ptc (geo=3910)\n", - " aca_ptc/aca_ptc (geo=3911)\n", - " aca_ptc/aca_ptc (geo=3912)\n", - " aca_ptc/aca_ptc (geo=3913)\n", - " aca_ptc/aca_ptc (geo=3914)\n", - " aca_ptc/aca_ptc (geo=3915)\n", - " aca_ptc/aca_ptc (geo=4001)\n", - " aca_ptc/aca_ptc (geo=4002)\n", - " aca_ptc/aca_ptc (geo=4003)\n", - " aca_ptc/aca_ptc (geo=4004)\n", - " aca_ptc/aca_ptc (geo=4005)\n", - " aca_ptc/aca_ptc (geo=401)\n", - " aca_ptc/aca_ptc (geo=402)\n", - " aca_ptc/aca_ptc (geo=403)\n", - " aca_ptc/aca_ptc (geo=404)\n", - " aca_ptc/aca_ptc (geo=405)\n", - " aca_ptc/aca_ptc (geo=406)\n", - " aca_ptc/aca_ptc (geo=407)\n", - " aca_ptc/aca_ptc (geo=408)\n", - " aca_ptc/aca_ptc (geo=409)\n", - " aca_ptc/aca_ptc (geo=4101)\n", - " aca_ptc/aca_ptc (geo=4102)\n", - " aca_ptc/aca_ptc (geo=4103)\n", - " aca_ptc/aca_ptc (geo=4104)\n", - " aca_ptc/aca_ptc (geo=4105)\n", - " aca_ptc/aca_ptc (geo=4106)\n", - " aca_ptc/aca_ptc (geo=4201)\n", - " aca_ptc/aca_ptc (geo=4202)\n", - " aca_ptc/aca_ptc (geo=4203)\n", - " aca_ptc/aca_ptc (geo=4204)\n", - " aca_ptc/aca_ptc (geo=4205)\n", - " aca_ptc/aca_ptc (geo=4206)\n", - " aca_ptc/aca_ptc (geo=4207)\n", - " aca_ptc/aca_ptc (geo=4208)\n", - " aca_ptc/aca_ptc (geo=4209)\n", - " aca_ptc/aca_ptc (geo=4210)\n", - " aca_ptc/aca_ptc (geo=4211)\n", - " aca_ptc/aca_ptc (geo=4212)\n", - " 
aca_ptc/aca_ptc (geo=4213)\n", - " aca_ptc/aca_ptc (geo=4214)\n", - " aca_ptc/aca_ptc (geo=4215)\n", - " aca_ptc/aca_ptc (geo=4216)\n", - " aca_ptc/aca_ptc (geo=4217)\n", - " aca_ptc/aca_ptc (geo=4401)\n", - " aca_ptc/aca_ptc (geo=4402)\n", - " aca_ptc/aca_ptc (geo=4501)\n", - " aca_ptc/aca_ptc (geo=4502)\n", - " aca_ptc/aca_ptc (geo=4503)\n", - " aca_ptc/aca_ptc (geo=4504)\n", - " aca_ptc/aca_ptc (geo=4505)\n", - " aca_ptc/aca_ptc (geo=4506)\n", - " aca_ptc/aca_ptc (geo=4507)\n", - " aca_ptc/aca_ptc (geo=4601)\n", - " aca_ptc/aca_ptc (geo=4701)\n", - " aca_ptc/aca_ptc (geo=4702)\n", - " aca_ptc/aca_ptc (geo=4703)\n", - " aca_ptc/aca_ptc (geo=4704)\n", - " aca_ptc/aca_ptc (geo=4705)\n", - " aca_ptc/aca_ptc (geo=4706)\n", - " aca_ptc/aca_ptc (geo=4707)\n", - " aca_ptc/aca_ptc (geo=4708)\n", - " aca_ptc/aca_ptc (geo=4709)\n", - " aca_ptc/aca_ptc (geo=4801)\n", - " aca_ptc/aca_ptc (geo=4802)\n", - " aca_ptc/aca_ptc (geo=4803)\n", - " aca_ptc/aca_ptc (geo=4804)\n", - " aca_ptc/aca_ptc (geo=4805)\n", - " aca_ptc/aca_ptc (geo=4806)\n", - " aca_ptc/aca_ptc (geo=4807)\n", - " aca_ptc/aca_ptc (geo=4808)\n", - " aca_ptc/aca_ptc (geo=4809)\n", - " aca_ptc/aca_ptc (geo=4810)\n", - " aca_ptc/aca_ptc (geo=4811)\n", - " aca_ptc/aca_ptc (geo=4812)\n", - " aca_ptc/aca_ptc (geo=4813)\n", - " aca_ptc/aca_ptc (geo=4814)\n", - " aca_ptc/aca_ptc (geo=4815)\n", - " aca_ptc/aca_ptc (geo=4816)\n", - " aca_ptc/aca_ptc (geo=4817)\n", - " aca_ptc/aca_ptc (geo=4818)\n", - " aca_ptc/aca_ptc (geo=4819)\n", - " aca_ptc/aca_ptc (geo=4820)\n", - " aca_ptc/aca_ptc (geo=4821)\n", - " aca_ptc/aca_ptc (geo=4822)\n", - " aca_ptc/aca_ptc (geo=4823)\n", - " aca_ptc/aca_ptc (geo=4824)\n", - " aca_ptc/aca_ptc (geo=4825)\n", - " aca_ptc/aca_ptc (geo=4826)\n", - " aca_ptc/aca_ptc (geo=4827)\n", - " aca_ptc/aca_ptc (geo=4828)\n", - " aca_ptc/aca_ptc (geo=4829)\n", - " aca_ptc/aca_ptc (geo=4830)\n", - " aca_ptc/aca_ptc (geo=4831)\n", - " aca_ptc/aca_ptc (geo=4832)\n", - " aca_ptc/aca_ptc (geo=4833)\n", - " 
aca_ptc/aca_ptc (geo=4834)\n", - " aca_ptc/aca_ptc (geo=4835)\n", - " aca_ptc/aca_ptc (geo=4836)\n", - " aca_ptc/aca_ptc (geo=4837)\n", - " aca_ptc/aca_ptc (geo=4838)\n", - " aca_ptc/aca_ptc (geo=4901)\n", - " aca_ptc/aca_ptc (geo=4902)\n", - " aca_ptc/aca_ptc (geo=4903)\n", - " aca_ptc/aca_ptc (geo=4904)\n", - " aca_ptc/aca_ptc (geo=5001)\n", - " aca_ptc/aca_ptc (geo=501)\n", - " aca_ptc/aca_ptc (geo=502)\n", - " aca_ptc/aca_ptc (geo=503)\n", - " aca_ptc/aca_ptc (geo=504)\n", - " aca_ptc/aca_ptc (geo=5101)\n", - " aca_ptc/aca_ptc (geo=5102)\n", - " aca_ptc/aca_ptc (geo=5103)\n", - " aca_ptc/aca_ptc (geo=5104)\n", - " aca_ptc/aca_ptc (geo=5105)\n", - " aca_ptc/aca_ptc (geo=5106)\n", - " aca_ptc/aca_ptc (geo=5107)\n", - " aca_ptc/aca_ptc (geo=5108)\n", - " aca_ptc/aca_ptc (geo=5109)\n", - " aca_ptc/aca_ptc (geo=5110)\n", - " aca_ptc/aca_ptc (geo=5111)\n", - " aca_ptc/aca_ptc (geo=5301)\n", - " aca_ptc/aca_ptc (geo=5302)\n", - " aca_ptc/aca_ptc (geo=5303)\n", - " aca_ptc/aca_ptc (geo=5304)\n", - " aca_ptc/aca_ptc (geo=5305)\n", - " aca_ptc/aca_ptc (geo=5306)\n", - " aca_ptc/aca_ptc (geo=5307)\n", - " aca_ptc/aca_ptc (geo=5308)\n", - " aca_ptc/aca_ptc (geo=5309)\n", - " aca_ptc/aca_ptc (geo=5310)\n", - " aca_ptc/aca_ptc (geo=5401)\n", - " aca_ptc/aca_ptc (geo=5402)\n", - " aca_ptc/aca_ptc (geo=5501)\n", - " aca_ptc/aca_ptc (geo=5502)\n", - " aca_ptc/aca_ptc (geo=5503)\n", - " aca_ptc/aca_ptc (geo=5504)\n", - " aca_ptc/aca_ptc (geo=5505)\n", - " aca_ptc/aca_ptc (geo=5506)\n", - " aca_ptc/aca_ptc (geo=5507)\n", - " aca_ptc/aca_ptc (geo=5508)\n", - " aca_ptc/aca_ptc (geo=5601)\n", - " aca_ptc/aca_ptc (geo=601)\n", - " aca_ptc/aca_ptc (geo=602)\n", - " aca_ptc/aca_ptc (geo=603)\n", - " aca_ptc/aca_ptc (geo=604)\n", - " aca_ptc/aca_ptc (geo=605)\n", - " aca_ptc/aca_ptc (geo=606)\n", - " aca_ptc/aca_ptc (geo=607)\n", - " aca_ptc/aca_ptc (geo=608)\n", - " aca_ptc/aca_ptc (geo=609)\n", - " aca_ptc/aca_ptc (geo=610)\n", - " aca_ptc/aca_ptc (geo=611)\n", - " aca_ptc/aca_ptc 
(geo=612)\n", - " aca_ptc/aca_ptc (geo=613)\n", - " aca_ptc/aca_ptc (geo=614)\n", - " aca_ptc/aca_ptc (geo=615)\n", - " aca_ptc/aca_ptc (geo=616)\n", - " aca_ptc/aca_ptc (geo=617)\n", - " aca_ptc/aca_ptc (geo=618)\n", - " aca_ptc/aca_ptc (geo=619)\n", - " aca_ptc/aca_ptc (geo=620)\n", - " aca_ptc/aca_ptc (geo=621)\n", - " aca_ptc/aca_ptc (geo=622)\n", - " aca_ptc/aca_ptc (geo=623)\n", - " aca_ptc/aca_ptc (geo=624)\n", - " aca_ptc/aca_ptc (geo=625)\n", - " aca_ptc/aca_ptc (geo=626)\n", - " aca_ptc/aca_ptc (geo=627)\n", - " aca_ptc/aca_ptc (geo=628)\n", - " aca_ptc/aca_ptc (geo=629)\n", - " aca_ptc/aca_ptc (geo=630)\n", - " aca_ptc/aca_ptc (geo=631)\n", - " aca_ptc/aca_ptc (geo=632)\n", - " aca_ptc/aca_ptc (geo=633)\n", - " aca_ptc/aca_ptc (geo=634)\n", - " aca_ptc/aca_ptc (geo=635)\n", - " aca_ptc/aca_ptc (geo=636)\n", - " aca_ptc/aca_ptc (geo=637)\n", - " aca_ptc/aca_ptc (geo=638)\n", - " aca_ptc/aca_ptc (geo=639)\n", - " aca_ptc/aca_ptc (geo=640)\n", - " aca_ptc/aca_ptc (geo=641)\n", - " aca_ptc/aca_ptc (geo=642)\n", - " aca_ptc/aca_ptc (geo=643)\n", - " aca_ptc/aca_ptc (geo=644)\n", - " aca_ptc/aca_ptc (geo=645)\n", - " aca_ptc/aca_ptc (geo=646)\n", - " aca_ptc/aca_ptc (geo=647)\n", - " aca_ptc/aca_ptc (geo=648)\n", - " aca_ptc/aca_ptc (geo=649)\n", - " aca_ptc/aca_ptc (geo=650)\n", - " aca_ptc/aca_ptc (geo=651)\n", - " aca_ptc/aca_ptc (geo=652)\n", - " aca_ptc/aca_ptc (geo=801)\n", - " aca_ptc/aca_ptc (geo=802)\n", - " aca_ptc/aca_ptc (geo=803)\n", - " aca_ptc/aca_ptc (geo=804)\n", - " aca_ptc/aca_ptc (geo=805)\n", - " aca_ptc/aca_ptc (geo=806)\n", - " aca_ptc/aca_ptc (geo=807)\n", - " aca_ptc/aca_ptc (geo=808)\n", - " aca_ptc/aca_ptc (geo=901)\n", - " aca_ptc/aca_ptc (geo=902)\n", - " aca_ptc/aca_ptc (geo=903)\n", - " aca_ptc/aca_ptc (geo=904)\n", - " aca_ptc/aca_ptc (geo=905)\n", - " aca_ptc/tax_unit_count (geo=1001)\n", - " aca_ptc/tax_unit_count (geo=101)\n", - " aca_ptc/tax_unit_count (geo=102)\n", - " aca_ptc/tax_unit_count (geo=103)\n", - " 
aca_ptc/tax_unit_count (geo=104)\n", - " aca_ptc/tax_unit_count (geo=105)\n", - " aca_ptc/tax_unit_count (geo=106)\n", - " aca_ptc/tax_unit_count (geo=107)\n", - " aca_ptc/tax_unit_count (geo=1101)\n", - " aca_ptc/tax_unit_count (geo=1201)\n", - " aca_ptc/tax_unit_count (geo=1202)\n", - " aca_ptc/tax_unit_count (geo=1203)\n", - " aca_ptc/tax_unit_count (geo=1204)\n", - " aca_ptc/tax_unit_count (geo=1205)\n", - " aca_ptc/tax_unit_count (geo=1206)\n", - " aca_ptc/tax_unit_count (geo=1207)\n", - " aca_ptc/tax_unit_count (geo=1208)\n", - " aca_ptc/tax_unit_count (geo=1209)\n", - " aca_ptc/tax_unit_count (geo=1210)\n", - " aca_ptc/tax_unit_count (geo=1211)\n", - " aca_ptc/tax_unit_count (geo=1212)\n", - " aca_ptc/tax_unit_count (geo=1213)\n", - " aca_ptc/tax_unit_count (geo=1214)\n", - " aca_ptc/tax_unit_count (geo=1215)\n", - " aca_ptc/tax_unit_count (geo=1216)\n", - " aca_ptc/tax_unit_count (geo=1217)\n", - " aca_ptc/tax_unit_count (geo=1218)\n", - " aca_ptc/tax_unit_count (geo=1219)\n", - " aca_ptc/tax_unit_count (geo=1220)\n", - " aca_ptc/tax_unit_count (geo=1221)\n", - " aca_ptc/tax_unit_count (geo=1222)\n", - " aca_ptc/tax_unit_count (geo=1223)\n", - " aca_ptc/tax_unit_count (geo=1224)\n", - " aca_ptc/tax_unit_count (geo=1225)\n", - " aca_ptc/tax_unit_count (geo=1226)\n", - " aca_ptc/tax_unit_count (geo=1227)\n", - " aca_ptc/tax_unit_count (geo=1228)\n", - " aca_ptc/tax_unit_count (geo=1301)\n", - " aca_ptc/tax_unit_count (geo=1302)\n", - " aca_ptc/tax_unit_count (geo=1303)\n", - " aca_ptc/tax_unit_count (geo=1304)\n", - " aca_ptc/tax_unit_count (geo=1305)\n", - " aca_ptc/tax_unit_count (geo=1306)\n", - " aca_ptc/tax_unit_count (geo=1307)\n", - " aca_ptc/tax_unit_count (geo=1308)\n", - " aca_ptc/tax_unit_count (geo=1309)\n", - " aca_ptc/tax_unit_count (geo=1310)\n", - " aca_ptc/tax_unit_count (geo=1311)\n", - " aca_ptc/tax_unit_count (geo=1312)\n", - " aca_ptc/tax_unit_count (geo=1313)\n", - " aca_ptc/tax_unit_count (geo=1314)\n", - " aca_ptc/tax_unit_count 
(geo=1501)\n", - " aca_ptc/tax_unit_count (geo=1502)\n", - " aca_ptc/tax_unit_count (geo=1601)\n", - " aca_ptc/tax_unit_count (geo=1602)\n", - " aca_ptc/tax_unit_count (geo=1701)\n", - " aca_ptc/tax_unit_count (geo=1702)\n", - " aca_ptc/tax_unit_count (geo=1703)\n", - " aca_ptc/tax_unit_count (geo=1704)\n", - " aca_ptc/tax_unit_count (geo=1705)\n", - " aca_ptc/tax_unit_count (geo=1706)\n", - " aca_ptc/tax_unit_count (geo=1707)\n", - " aca_ptc/tax_unit_count (geo=1708)\n", - " aca_ptc/tax_unit_count (geo=1709)\n", - " aca_ptc/tax_unit_count (geo=1710)\n", - " aca_ptc/tax_unit_count (geo=1711)\n", - " aca_ptc/tax_unit_count (geo=1712)\n", - " aca_ptc/tax_unit_count (geo=1713)\n", - " aca_ptc/tax_unit_count (geo=1714)\n", - " aca_ptc/tax_unit_count (geo=1715)\n", - " aca_ptc/tax_unit_count (geo=1716)\n", - " aca_ptc/tax_unit_count (geo=1717)\n", - " aca_ptc/tax_unit_count (geo=1801)\n", - " aca_ptc/tax_unit_count (geo=1802)\n", - " aca_ptc/tax_unit_count (geo=1803)\n", - " aca_ptc/tax_unit_count (geo=1804)\n", - " aca_ptc/tax_unit_count (geo=1805)\n", - " aca_ptc/tax_unit_count (geo=1806)\n", - " aca_ptc/tax_unit_count (geo=1807)\n", - " aca_ptc/tax_unit_count (geo=1808)\n", - " aca_ptc/tax_unit_count (geo=1809)\n", - " aca_ptc/tax_unit_count (geo=1901)\n", - " aca_ptc/tax_unit_count (geo=1902)\n", - " aca_ptc/tax_unit_count (geo=1903)\n", - " aca_ptc/tax_unit_count (geo=1904)\n", - " aca_ptc/tax_unit_count (geo=2001)\n", - " aca_ptc/tax_unit_count (geo=2002)\n", - " aca_ptc/tax_unit_count (geo=2003)\n", - " aca_ptc/tax_unit_count (geo=2004)\n", - " aca_ptc/tax_unit_count (geo=201)\n", - " aca_ptc/tax_unit_count (geo=2101)\n", - " aca_ptc/tax_unit_count (geo=2102)\n", - " aca_ptc/tax_unit_count (geo=2103)\n", - " aca_ptc/tax_unit_count (geo=2104)\n", - " aca_ptc/tax_unit_count (geo=2105)\n", - " aca_ptc/tax_unit_count (geo=2106)\n", - " aca_ptc/tax_unit_count (geo=2201)\n", - " aca_ptc/tax_unit_count (geo=2202)\n", - " aca_ptc/tax_unit_count (geo=2203)\n", - " 
aca_ptc/tax_unit_count (geo=2204)\n", - " aca_ptc/tax_unit_count (geo=2205)\n", - " aca_ptc/tax_unit_count (geo=2206)\n", - " aca_ptc/tax_unit_count (geo=2301)\n", - " aca_ptc/tax_unit_count (geo=2302)\n", - " aca_ptc/tax_unit_count (geo=2401)\n", - " aca_ptc/tax_unit_count (geo=2402)\n", - " aca_ptc/tax_unit_count (geo=2403)\n", - " aca_ptc/tax_unit_count (geo=2404)\n", - " aca_ptc/tax_unit_count (geo=2405)\n", - " aca_ptc/tax_unit_count (geo=2406)\n", - " aca_ptc/tax_unit_count (geo=2407)\n", - " aca_ptc/tax_unit_count (geo=2408)\n", - " aca_ptc/tax_unit_count (geo=2501)\n", - " aca_ptc/tax_unit_count (geo=2502)\n", - " aca_ptc/tax_unit_count (geo=2503)\n", - " aca_ptc/tax_unit_count (geo=2504)\n", - " aca_ptc/tax_unit_count (geo=2505)\n", - " aca_ptc/tax_unit_count (geo=2506)\n", - " aca_ptc/tax_unit_count (geo=2507)\n", - " aca_ptc/tax_unit_count (geo=2508)\n", - " aca_ptc/tax_unit_count (geo=2509)\n", - " aca_ptc/tax_unit_count (geo=2601)\n", - " aca_ptc/tax_unit_count (geo=2602)\n", - " aca_ptc/tax_unit_count (geo=2603)\n", - " aca_ptc/tax_unit_count (geo=2604)\n", - " aca_ptc/tax_unit_count (geo=2605)\n", - " aca_ptc/tax_unit_count (geo=2606)\n", - " aca_ptc/tax_unit_count (geo=2607)\n", - " aca_ptc/tax_unit_count (geo=2608)\n", - " aca_ptc/tax_unit_count (geo=2609)\n", - " aca_ptc/tax_unit_count (geo=2610)\n", - " aca_ptc/tax_unit_count (geo=2611)\n", - " aca_ptc/tax_unit_count (geo=2612)\n", - " aca_ptc/tax_unit_count (geo=2613)\n", - " aca_ptc/tax_unit_count (geo=2701)\n", - " aca_ptc/tax_unit_count (geo=2702)\n", - " aca_ptc/tax_unit_count (geo=2703)\n", - " aca_ptc/tax_unit_count (geo=2704)\n", - " aca_ptc/tax_unit_count (geo=2705)\n", - " aca_ptc/tax_unit_count (geo=2706)\n", - " aca_ptc/tax_unit_count (geo=2707)\n", - " aca_ptc/tax_unit_count (geo=2708)\n", - " aca_ptc/tax_unit_count (geo=2801)\n", - " aca_ptc/tax_unit_count (geo=2802)\n", - " aca_ptc/tax_unit_count (geo=2803)\n", - " aca_ptc/tax_unit_count (geo=2804)\n", - " aca_ptc/tax_unit_count 
(geo=2901)\n", - " aca_ptc/tax_unit_count (geo=2902)\n", - " aca_ptc/tax_unit_count (geo=2903)\n", - " aca_ptc/tax_unit_count (geo=2904)\n", - " aca_ptc/tax_unit_count (geo=2905)\n", - " aca_ptc/tax_unit_count (geo=2906)\n", - " aca_ptc/tax_unit_count (geo=2907)\n", - " aca_ptc/tax_unit_count (geo=2908)\n", - " aca_ptc/tax_unit_count (geo=3001)\n", - " aca_ptc/tax_unit_count (geo=3002)\n", - " aca_ptc/tax_unit_count (geo=3101)\n", - " aca_ptc/tax_unit_count (geo=3102)\n", - " aca_ptc/tax_unit_count (geo=3103)\n", - " aca_ptc/tax_unit_count (geo=3201)\n", - " aca_ptc/tax_unit_count (geo=3202)\n", - " aca_ptc/tax_unit_count (geo=3203)\n", - " aca_ptc/tax_unit_count (geo=3204)\n", - " aca_ptc/tax_unit_count (geo=3301)\n", - " aca_ptc/tax_unit_count (geo=3302)\n", - " aca_ptc/tax_unit_count (geo=3401)\n", - " aca_ptc/tax_unit_count (geo=3402)\n", - " aca_ptc/tax_unit_count (geo=3403)\n", - " aca_ptc/tax_unit_count (geo=3404)\n", - " aca_ptc/tax_unit_count (geo=3405)\n", - " aca_ptc/tax_unit_count (geo=3406)\n", - " aca_ptc/tax_unit_count (geo=3407)\n", - " aca_ptc/tax_unit_count (geo=3408)\n", - " aca_ptc/tax_unit_count (geo=3409)\n", - " aca_ptc/tax_unit_count (geo=3410)\n", - " aca_ptc/tax_unit_count (geo=3411)\n", - " aca_ptc/tax_unit_count (geo=3412)\n", - " aca_ptc/tax_unit_count (geo=3501)\n", - " aca_ptc/tax_unit_count (geo=3502)\n", - " aca_ptc/tax_unit_count (geo=3503)\n", - " aca_ptc/tax_unit_count (geo=3601)\n", - " aca_ptc/tax_unit_count (geo=3602)\n", - " aca_ptc/tax_unit_count (geo=3603)\n", - " aca_ptc/tax_unit_count (geo=3604)\n", - " aca_ptc/tax_unit_count (geo=3605)\n", - " aca_ptc/tax_unit_count (geo=3606)\n", - " aca_ptc/tax_unit_count (geo=3607)\n", - " aca_ptc/tax_unit_count (geo=3608)\n", - " aca_ptc/tax_unit_count (geo=3609)\n", - " aca_ptc/tax_unit_count (geo=3610)\n", - " aca_ptc/tax_unit_count (geo=3611)\n", - " aca_ptc/tax_unit_count (geo=3612)\n", - " aca_ptc/tax_unit_count (geo=3613)\n", - " aca_ptc/tax_unit_count (geo=3614)\n", - " 
aca_ptc/tax_unit_count (geo=3615)\n", - " aca_ptc/tax_unit_count (geo=3616)\n", - " aca_ptc/tax_unit_count (geo=3617)\n", - " aca_ptc/tax_unit_count (geo=3618)\n", - " aca_ptc/tax_unit_count (geo=3619)\n", - " aca_ptc/tax_unit_count (geo=3620)\n", - " aca_ptc/tax_unit_count (geo=3621)\n", - " aca_ptc/tax_unit_count (geo=3622)\n", - " aca_ptc/tax_unit_count (geo=3623)\n", - " aca_ptc/tax_unit_count (geo=3624)\n", - " aca_ptc/tax_unit_count (geo=3625)\n", - " aca_ptc/tax_unit_count (geo=3626)\n", - " aca_ptc/tax_unit_count (geo=3701)\n", - " aca_ptc/tax_unit_count (geo=3702)\n", - " aca_ptc/tax_unit_count (geo=3703)\n", - " aca_ptc/tax_unit_count (geo=3704)\n", - " aca_ptc/tax_unit_count (geo=3705)\n", - " aca_ptc/tax_unit_count (geo=3706)\n", - " aca_ptc/tax_unit_count (geo=3707)\n", - " aca_ptc/tax_unit_count (geo=3708)\n", - " aca_ptc/tax_unit_count (geo=3709)\n", - " aca_ptc/tax_unit_count (geo=3710)\n", - " aca_ptc/tax_unit_count (geo=3711)\n", - " aca_ptc/tax_unit_count (geo=3712)\n", - " aca_ptc/tax_unit_count (geo=3713)\n", - " aca_ptc/tax_unit_count (geo=3714)\n", - " aca_ptc/tax_unit_count (geo=3801)\n", - " aca_ptc/tax_unit_count (geo=3901)\n", - " aca_ptc/tax_unit_count (geo=3902)\n", - " aca_ptc/tax_unit_count (geo=3903)\n", - " aca_ptc/tax_unit_count (geo=3904)\n", - " aca_ptc/tax_unit_count (geo=3905)\n", - " aca_ptc/tax_unit_count (geo=3906)\n", - " aca_ptc/tax_unit_count (geo=3907)\n", - " aca_ptc/tax_unit_count (geo=3908)\n", - " aca_ptc/tax_unit_count (geo=3909)\n", - " aca_ptc/tax_unit_count (geo=3910)\n", - " aca_ptc/tax_unit_count (geo=3911)\n", - " aca_ptc/tax_unit_count (geo=3912)\n", - " aca_ptc/tax_unit_count (geo=3913)\n", - " aca_ptc/tax_unit_count (geo=3914)\n", - " aca_ptc/tax_unit_count (geo=3915)\n", - " aca_ptc/tax_unit_count (geo=4001)\n", - " aca_ptc/tax_unit_count (geo=4002)\n", - " aca_ptc/tax_unit_count (geo=4003)\n", - " aca_ptc/tax_unit_count (geo=4004)\n", - " aca_ptc/tax_unit_count (geo=4005)\n", - " aca_ptc/tax_unit_count 
(geo=401)\n", - " aca_ptc/tax_unit_count (geo=402)\n", - " aca_ptc/tax_unit_count (geo=403)\n", - " aca_ptc/tax_unit_count (geo=404)\n", - " aca_ptc/tax_unit_count (geo=405)\n", - " aca_ptc/tax_unit_count (geo=406)\n", - " aca_ptc/tax_unit_count (geo=407)\n", - " aca_ptc/tax_unit_count (geo=408)\n", - " aca_ptc/tax_unit_count (geo=409)\n", - " aca_ptc/tax_unit_count (geo=4101)\n", - " aca_ptc/tax_unit_count (geo=4102)\n", - " aca_ptc/tax_unit_count (geo=4103)\n", - " aca_ptc/tax_unit_count (geo=4104)\n", - " aca_ptc/tax_unit_count (geo=4105)\n", - " aca_ptc/tax_unit_count (geo=4106)\n", - " aca_ptc/tax_unit_count (geo=4201)\n", - " aca_ptc/tax_unit_count (geo=4202)\n", - " aca_ptc/tax_unit_count (geo=4203)\n", - " aca_ptc/tax_unit_count (geo=4204)\n", - " aca_ptc/tax_unit_count (geo=4205)\n", - " aca_ptc/tax_unit_count (geo=4206)\n", - " aca_ptc/tax_unit_count (geo=4207)\n", - " aca_ptc/tax_unit_count (geo=4208)\n", - " aca_ptc/tax_unit_count (geo=4209)\n", - " aca_ptc/tax_unit_count (geo=4210)\n", - " aca_ptc/tax_unit_count (geo=4211)\n", - " aca_ptc/tax_unit_count (geo=4212)\n", - " aca_ptc/tax_unit_count (geo=4213)\n", - " aca_ptc/tax_unit_count (geo=4214)\n", - " aca_ptc/tax_unit_count (geo=4215)\n", - " aca_ptc/tax_unit_count (geo=4216)\n", - " aca_ptc/tax_unit_count (geo=4217)\n", - " aca_ptc/tax_unit_count (geo=4401)\n", - " aca_ptc/tax_unit_count (geo=4402)\n", - " aca_ptc/tax_unit_count (geo=4501)\n", - " aca_ptc/tax_unit_count (geo=4502)\n", - " aca_ptc/tax_unit_count (geo=4503)\n", - " aca_ptc/tax_unit_count (geo=4504)\n", - " aca_ptc/tax_unit_count (geo=4505)\n", - " aca_ptc/tax_unit_count (geo=4506)\n", - " aca_ptc/tax_unit_count (geo=4507)\n", - " aca_ptc/tax_unit_count (geo=4601)\n", - " aca_ptc/tax_unit_count (geo=4701)\n", - " aca_ptc/tax_unit_count (geo=4702)\n", - " aca_ptc/tax_unit_count (geo=4703)\n", - " aca_ptc/tax_unit_count (geo=4704)\n", - " aca_ptc/tax_unit_count (geo=4705)\n", - " aca_ptc/tax_unit_count (geo=4706)\n", - " 
aca_ptc/tax_unit_count (geo=4707)\n", - " aca_ptc/tax_unit_count (geo=4708)\n", - " aca_ptc/tax_unit_count (geo=4709)\n", - " aca_ptc/tax_unit_count (geo=4801)\n", - " aca_ptc/tax_unit_count (geo=4802)\n", - " aca_ptc/tax_unit_count (geo=4803)\n", - " aca_ptc/tax_unit_count (geo=4804)\n", - " aca_ptc/tax_unit_count (geo=4805)\n", - " aca_ptc/tax_unit_count (geo=4806)\n", - " aca_ptc/tax_unit_count (geo=4807)\n", - " aca_ptc/tax_unit_count (geo=4808)\n", - " aca_ptc/tax_unit_count (geo=4809)\n", - " aca_ptc/tax_unit_count (geo=4810)\n", - " aca_ptc/tax_unit_count (geo=4811)\n", - " aca_ptc/tax_unit_count (geo=4812)\n", - " aca_ptc/tax_unit_count (geo=4813)\n", - " aca_ptc/tax_unit_count (geo=4814)\n", - " aca_ptc/tax_unit_count (geo=4815)\n", - " aca_ptc/tax_unit_count (geo=4816)\n", - " aca_ptc/tax_unit_count (geo=4817)\n", - " aca_ptc/tax_unit_count (geo=4818)\n", - " aca_ptc/tax_unit_count (geo=4819)\n", - " aca_ptc/tax_unit_count (geo=4820)\n", - " aca_ptc/tax_unit_count (geo=4821)\n", - " aca_ptc/tax_unit_count (geo=4822)\n", - " aca_ptc/tax_unit_count (geo=4823)\n", - " aca_ptc/tax_unit_count (geo=4824)\n", - " aca_ptc/tax_unit_count (geo=4825)\n", - " aca_ptc/tax_unit_count (geo=4826)\n", - " aca_ptc/tax_unit_count (geo=4827)\n", - " aca_ptc/tax_unit_count (geo=4828)\n", - " aca_ptc/tax_unit_count (geo=4829)\n", - " aca_ptc/tax_unit_count (geo=4830)\n", - " aca_ptc/tax_unit_count (geo=4831)\n", - " aca_ptc/tax_unit_count (geo=4832)\n", - " aca_ptc/tax_unit_count (geo=4833)\n", - " aca_ptc/tax_unit_count (geo=4834)\n", - " aca_ptc/tax_unit_count (geo=4835)\n", - " aca_ptc/tax_unit_count (geo=4836)\n", - " aca_ptc/tax_unit_count (geo=4837)\n", - " aca_ptc/tax_unit_count (geo=4838)\n", - " aca_ptc/tax_unit_count (geo=4901)\n", - " aca_ptc/tax_unit_count (geo=4902)\n", - " aca_ptc/tax_unit_count (geo=4903)\n", - " aca_ptc/tax_unit_count (geo=4904)\n", - " aca_ptc/tax_unit_count (geo=5001)\n", - " aca_ptc/tax_unit_count (geo=501)\n", - " aca_ptc/tax_unit_count 
(geo=502)\n", - " aca_ptc/tax_unit_count (geo=503)\n", - " aca_ptc/tax_unit_count (geo=504)\n", - " aca_ptc/tax_unit_count (geo=5101)\n", - " aca_ptc/tax_unit_count (geo=5102)\n", - " aca_ptc/tax_unit_count (geo=5103)\n", - " aca_ptc/tax_unit_count (geo=5104)\n", - " aca_ptc/tax_unit_count (geo=5105)\n", - " aca_ptc/tax_unit_count (geo=5106)\n", - " aca_ptc/tax_unit_count (geo=5107)\n", - " aca_ptc/tax_unit_count (geo=5108)\n", - " aca_ptc/tax_unit_count (geo=5109)\n", - " aca_ptc/tax_unit_count (geo=5110)\n", - " aca_ptc/tax_unit_count (geo=5111)\n", - " aca_ptc/tax_unit_count (geo=5301)\n", - " aca_ptc/tax_unit_count (geo=5302)\n", - " aca_ptc/tax_unit_count (geo=5303)\n", - " aca_ptc/tax_unit_count (geo=5304)\n", - " aca_ptc/tax_unit_count (geo=5305)\n", - " aca_ptc/tax_unit_count (geo=5306)\n", - " aca_ptc/tax_unit_count (geo=5307)\n", - " aca_ptc/tax_unit_count (geo=5308)\n", - " aca_ptc/tax_unit_count (geo=5309)\n", - " aca_ptc/tax_unit_count (geo=5310)\n", - " aca_ptc/tax_unit_count (geo=5401)\n", - " aca_ptc/tax_unit_count (geo=5402)\n", - " aca_ptc/tax_unit_count (geo=5501)\n", - " aca_ptc/tax_unit_count (geo=5502)\n", - " aca_ptc/tax_unit_count (geo=5503)\n", - " aca_ptc/tax_unit_count (geo=5504)\n", - " aca_ptc/tax_unit_count (geo=5505)\n", - " aca_ptc/tax_unit_count (geo=5506)\n", - " aca_ptc/tax_unit_count (geo=5507)\n", - " aca_ptc/tax_unit_count (geo=5508)\n", - " aca_ptc/tax_unit_count (geo=5601)\n", - " aca_ptc/tax_unit_count (geo=601)\n", - " aca_ptc/tax_unit_count (geo=602)\n", - " aca_ptc/tax_unit_count (geo=603)\n", - " aca_ptc/tax_unit_count (geo=604)\n", - " aca_ptc/tax_unit_count (geo=605)\n", - " aca_ptc/tax_unit_count (geo=606)\n", - " aca_ptc/tax_unit_count (geo=607)\n", - " aca_ptc/tax_unit_count (geo=608)\n", - " aca_ptc/tax_unit_count (geo=609)\n", - " aca_ptc/tax_unit_count (geo=610)\n", - " aca_ptc/tax_unit_count (geo=611)\n", - " aca_ptc/tax_unit_count (geo=612)\n", - " aca_ptc/tax_unit_count (geo=613)\n", - " aca_ptc/tax_unit_count 
(geo=614)\n", - " aca_ptc/tax_unit_count (geo=615)\n", - " aca_ptc/tax_unit_count (geo=616)\n", - " aca_ptc/tax_unit_count (geo=617)\n", - " aca_ptc/tax_unit_count (geo=618)\n", - " aca_ptc/tax_unit_count (geo=619)\n", - " aca_ptc/tax_unit_count (geo=620)\n", - " aca_ptc/tax_unit_count (geo=621)\n", - " aca_ptc/tax_unit_count (geo=622)\n", - " aca_ptc/tax_unit_count (geo=623)\n", - " aca_ptc/tax_unit_count (geo=624)\n", - " aca_ptc/tax_unit_count (geo=625)\n", - " aca_ptc/tax_unit_count (geo=626)\n", - " aca_ptc/tax_unit_count (geo=627)\n", - " aca_ptc/tax_unit_count (geo=628)\n", - " aca_ptc/tax_unit_count (geo=629)\n", - " aca_ptc/tax_unit_count (geo=630)\n", - " aca_ptc/tax_unit_count (geo=631)\n", - " aca_ptc/tax_unit_count (geo=632)\n", - " aca_ptc/tax_unit_count (geo=633)\n", - " aca_ptc/tax_unit_count (geo=634)\n", - " aca_ptc/tax_unit_count (geo=635)\n", - " aca_ptc/tax_unit_count (geo=636)\n", - " aca_ptc/tax_unit_count (geo=637)\n", - " aca_ptc/tax_unit_count (geo=638)\n", - " aca_ptc/tax_unit_count (geo=639)\n", - " aca_ptc/tax_unit_count (geo=640)\n", - " aca_ptc/tax_unit_count (geo=641)\n", - " aca_ptc/tax_unit_count (geo=642)\n", - " aca_ptc/tax_unit_count (geo=643)\n", - " aca_ptc/tax_unit_count (geo=644)\n", - " aca_ptc/tax_unit_count (geo=645)\n", - " aca_ptc/tax_unit_count (geo=646)\n", - " aca_ptc/tax_unit_count (geo=647)\n", - " aca_ptc/tax_unit_count (geo=648)\n", - " aca_ptc/tax_unit_count (geo=649)\n", - " aca_ptc/tax_unit_count (geo=650)\n", - " aca_ptc/tax_unit_count (geo=651)\n", - " aca_ptc/tax_unit_count (geo=652)\n", - " aca_ptc/tax_unit_count (geo=801)\n", - " aca_ptc/tax_unit_count (geo=802)\n", - " aca_ptc/tax_unit_count (geo=803)\n", - " aca_ptc/tax_unit_count (geo=804)\n", - " aca_ptc/tax_unit_count (geo=805)\n", - " aca_ptc/tax_unit_count (geo=806)\n", - " aca_ptc/tax_unit_count (geo=807)\n", - " aca_ptc/tax_unit_count (geo=808)\n", - " aca_ptc/tax_unit_count (geo=901)\n", - " aca_ptc/tax_unit_count (geo=902)\n", - " 
aca_ptc/tax_unit_count (geo=903)\n", - " aca_ptc/tax_unit_count (geo=904)\n", - " aca_ptc/tax_unit_count (geo=905)\n" + "Achievable targets: 479\n", + "Impossible targets: 881\n", + "\n", + "Impossible targets by (domain, variable):\n", + " aca_ptc/aca_ptc: 436\n", + " aca_ptc/tax_unit_count: 436\n", + " snap/household_count: 7\n", + " aca_ptc/person_count: 1\n", + " snap/snap: 1\n" ] } ], @@ -1647,12 +636,15 @@ "\n", "if n_impossible > 0:\n", " impossible = targets_filtered[~achievable_mask]\n", - " print(\"\\nImpossible targets:\")\n", - " for _, r in impossible.iterrows():\n", - " print(\n", - " f\" {r.get('domain_variable', '?')}/{r['variable']} \"\n", - " f\"(geo={r['geographic_id']})\"\n", - " )" + " by_var = (\n", + " impossible.groupby([\"domain_variable\", \"variable\"])\n", + " .agg(count=(\"value\", \"size\"))\n", + " .reset_index()\n", + " .sort_values(\"count\", ascending=False)\n", + " )\n", + " print(\"\\nImpossible targets by (domain, variable):\")\n", + " for _, r in by_var.iterrows():\n", + " print(f\" {r['domain_variable']}/{r['variable']}: {r['count']}\")" ] }, { @@ -1665,11 +657,11 @@ "output_type": "stream", "text": [ "Hardest targets (lowest row_sum / target_value ratio):\n", - " snap/household_count (geo=3615): ratio=0.0088, row_sum=1,535, target=173,591\n", - " snap/household_count (geo=3613): ratio=0.0110, row_sum=1,535, target=139,162\n", - " snap/household_count (geo=621): ratio=0.0124, row_sum=1,483, target=119,148\n", - " snap/household_count (geo=3608): ratio=0.0129, row_sum=1,535, target=118,977\n", - " snap/household_count (geo=634): ratio=0.0130, row_sum=1,483, target=113,916\n" + " snap/household_count (geo=621): ratio=0.0000, row_sum=4, target=119,148\n", + " snap/household_count (geo=3615): ratio=0.0001, row_sum=9, target=173,591\n", + " snap/snap (geo=46): ratio=0.0001, row_sum=9,421, target=180,195,817\n", + " snap/household_count (geo=3625): ratio=0.0001, row_sum=4, target=67,315\n", + " snap/household_count (geo=1702): 
ratio=0.0001, row_sum=6, target=97,494\n" ] } ], @@ -1700,9 +692,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Final matrix shape: (487, 5231564)\n", - "Final non-zero entries: 1,466,022\n", - "Final density: 0.000575\n", + "Final matrix shape: (479, 35997)\n", + "Final non-zero entries: 9,944\n", + "Final density: 0.000577\n", "\n", "This is what the optimizer receives.\n" ] @@ -1724,10 +716,10 @@ "\n", "The calibration matrix pipeline has five steps:\n", "\n", - "1. **Build** — `SparseMatrixBuilder.build_matrix()` queries targets, applies hierarchical uprating, evaluates constraints, and assembles the sparse CSR matrix.\n", - "2. **Read** — `MatrixTracer` decodes rows (targets) and columns (household-CD pairs) so you can verify the matrix makes sense.\n", + "1. **Clone + assign** — `assign_random_geography()` creates N clones of each CPS record, each with a random census block (and derived CD/state).\n", + "2. **Build** — `UnifiedMatrixBuilder.build_matrix()` queries targets, applies hierarchical uprating, simulates each clone with its assigned geography, and assembles the sparse CSR matrix.\n", "3. **Groups** — `create_target_groups()` partitions rows for balanced loss weighting. `GROUPS_TO_EXCLUDE` drops redundant constraints.\n", - "4. **Sparsity** — Most of the matrix is zero. District-level targets confine non-zeros to single CD blocks; national targets span all blocks.\n", + "4. **Sparsity** — Most of the matrix is zero. District-level targets confine non-zeros to clones assigned to that district; national targets span all clones.\n", "5. **Filter** — Remove impossible targets (row sum = 0) before handing to the optimizer.\n", "\n", "When adding new domains or variables to the calibration, re-run this notebook to verify the new targets appear correctly and don't introduce impossible constraints." 
@@ -1755,4 +747,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docs/hierarchical_uprating.ipynb b/docs/hierarchical_uprating.ipynb index 76530225c..4da30d82c 100644 --- a/docs/hierarchical_uprating.ipynb +++ b/docs/hierarchical_uprating.ipynb @@ -51,20 +51,16 @@ "import pandas as pd\n", "\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (\n", - " SparseMatrixBuilder,\n", + "from policyengine_us_data.calibration.unified_matrix_builder import (\n", + " UnifiedMatrixBuilder,\n", ")\n", "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n", - " get_all_cds_from_database,\n", " STATE_CODES,\n", ")\n", "\n", "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n", "db_uri = f\"sqlite:///{db_path}\"\n", - "cds = get_all_cds_from_database(db_uri)\n", - "builder = SparseMatrixBuilder(\n", - " db_uri, time_period=2024, cds_to_calibrate=cds\n", - ")" + "builder = UnifiedMatrixBuilder(db_uri, time_period=2024)" ] }, { diff --git a/docs/local_area_calibration_setup.ipynb b/docs/local_area_calibration_setup.ipynb index c21264e9a..2e8614aa9 100644 --- a/docs/local_area_calibration_setup.ipynb +++ b/docs/local_area_calibration_setup.ipynb @@ -7,7 +7,21 @@ "source": [ "# Local Area Calibration Setup\n", "\n", - "This notebook demonstrates the sparse matrix construction for local area (congressional district) calibration. It uses a subset of CDs from NC, HI, MT, and AK for manageable runtime." 
+ "This notebook demonstrates the clone-based calibration pipeline: how raw CPS records become a calibration matrix and, ultimately, CD-level stacked datasets.\n", + "\n", + "The paradigm shift from the old approach: instead of replicating every household into every congressional district, we **clone** each record N times and assign each clone a **random census block** drawn from a population-weighted distribution. Each clone inherits a state, CD, and block — and gets re-simulated under the rules of its assigned state.\n", + "\n", + "We follow one household (`record_idx=8629`, household_id 128694, SNAP \\$18,396) through the entire pipeline:\n", + "1. Clone and assign geography\n", + "2. Simulate under new state rules (`_simulate_clone`)\n", + "3. Geographic column masking\n", + "4. Re-randomize takeup per census block\n", + "5. Build the calibration matrix\n", + "6. Create stacked datasets from calibrated weights\n", + "\n", + "**Companion notebook:** [calibration_matrix.ipynb](calibration_matrix.ipynb) covers the *finished* matrix — row/column anatomy, target groups, sparsity. This notebook covers the *process* that creates it and what happens after (stacked datasets).\n", + "\n", + "**Requirements:** `policy_data.db`, `block_cd_distributions.csv.gz`, and the stratified CPS h5 file in `STORAGE_FOLDER`." ] }, { @@ -23,24 +37,52 @@ "execution_count": 1, "id": "cell-2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/baogorek/envs/sep/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ - "from sqlalchemy import create_engine, text\n", - "import pandas as pd\n", "import numpy as np\n", + "import pandas as pd\n", + "from collections import defaultdict\n", "\n", "from policyengine_us import Microsimulation\n", "from policyengine_us_data.storage import STORAGE_FOLDER\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import (\n", - " SparseMatrixBuilder,\n", + "from policyengine_us_data.calibration.clone_and_assign import (\n", + " assign_random_geography,\n", + " GeographyAssignment,\n", + " load_global_block_distribution,\n", + ")\n", + "from policyengine_us_data.calibration.unified_matrix_builder import (\n", + " UnifiedMatrixBuilder,\n", ")\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import (\n", - " MatrixTracer,\n", + "from policyengine_us_data.calibration.unified_calibration import (\n", + " rerandomize_takeup,\n", + " SIMPLE_TAKEUP_VARS,\n", ")\n", + "from policyengine_us_data.utils.randomness import seeded_rng\n", + "from policyengine_us_data.parameters import load_take_up_rate\n", "from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import (\n", " get_calculated_variables,\n", - " create_target_groups,\n", - ")" + " STATE_CODES,\n", + " get_all_cds_from_database,\n", + ")\n", + "from policyengine_us_data.datasets.cps.local_area_calibration.stacked_dataset_builder import (\n", + " create_sparse_cd_stacked_dataset,\n", + ")\n", + "\n", + "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n", + "db_uri = f\"sqlite:///{db_path}\"\n", + "dataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\")\n", + "\n", + "N_CLONES = 3\n", + "SEED = 42" ] }, { @@ -48,13 +90,30 @@ "execution_count": 2, "id": "cell-3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Base dataset: 11,999 households\n", + "Example household: record_idx=8629, household_id=128694, SNAP=$18,396.00\n" + ] + } + ], "source": [ - "db_path = STORAGE_FOLDER / \"calibration\" / \"policy_data.db\"\n", - "db_uri = f\"sqlite:///{db_path}\"\n", - "dataset_path = str(STORAGE_FOLDER / \"stratified_extended_cps_2024.h5\")\n", + "sim = Microsimulation(dataset=dataset_path)\n", + "hh_ids = sim.calculate(\"household_id\", map_to=\"household\").values\n", + "snap_values = sim.calculate(\"snap\", map_to=\"household\").values\n", + "n_records = len(hh_ids)\n", "\n", - "engine = create_engine(db_uri)" + "record_idx = 8629 # High SNAP ($18k), lands in TX/PA/NY with seed=42\n", + "example_hh_id = hh_ids[record_idx]\n", + "print(f\"Base dataset: {n_records:,} households\")\n", + "print(\n", + " f\"Example household: record_idx={record_idx}, \"\n", + " f\"household_id={example_hh_id}, \"\n", + " f\"SNAP=${snap_values[record_idx]:,.2f}\"\n", + ")" ] }, { @@ -62,13 +121,9 @@ "id": "cell-4", "metadata": {}, "source": [ - "## Section 2: Select Test Congressional Districts\n", + "## Section 2: Geography Assignment\n", "\n", - "We use CDs from 4 states for testing:\n", - "- **NC (37)**: 14 CDs (3701-3714) - provides same-state different-CD test cases\n", - "- **HI (15)**: 2 CDs (1501-1502)\n", - "- **MT (30)**: 2 CDs (3001-3002)\n", - "- **AK (2)**: 1 CD (200)" + "`assign_random_geography` creates `n_records * n_clones` total records, each assigned a random census block from a population-weighted distribution. State and CD are derived from the block GEOID. The result is a `GeographyAssignment` dataclass with arrays indexed as `clone_idx * n_records + record_idx`." 
] }, { @@ -81,557 +136,850 @@ "name": "stdout", "output_type": "stream", "text": [ - "Testing with 19 congressional districts:\n", - " NC (37): ['3701', '3702', '3703', '3704', '3705', '3706', '3707', '3708', '3709', '3710', '3711', '3712', '3713', '3714']\n", - " HI (15): ['1501', '1502']\n", - " MT (30): ['3001', '3002']\n", - " AK (2): ['201']\n" + "Total cloned records: 35,997\n", + "Unique states: 50\n", + "Unique CDs: 435\n", + "Unique blocks: 35508\n" ] } ], "source": [ - "query = \"\"\"\n", - "SELECT DISTINCT sc.value as cd_geoid\n", - "FROM stratum_constraints sc\n", - "WHERE sc.constraint_variable = 'congressional_district_geoid'\n", - " AND (\n", - " sc.value LIKE '37__'\n", - " OR sc.value LIKE '150_'\n", - " OR sc.value LIKE '300_'\n", - " OR sc.value = '200' OR sc.value = '201'\n", - " )\n", - "ORDER BY sc.value\n", - "\"\"\"\n", - "\n", - "with engine.connect() as conn:\n", - " result = conn.execute(text(query)).fetchall()\n", - " test_cds = [row[0] for row in result]\n", - "\n", - "print(f\"Testing with {len(test_cds)} congressional districts:\")\n", - "print(f\" NC (37): {[cd for cd in test_cds if cd.startswith('37')]}\")\n", - "print(f\" HI (15): {[cd for cd in test_cds if cd.startswith('15')]}\")\n", - "print(f\" MT (30): {[cd for cd in test_cds if cd.startswith('30')]}\")\n", - "print(f\" AK (2): {[cd for cd in test_cds if cd.startswith('20')]}\")" - ] - }, - { - "cell_type": "markdown", - "id": "cell-6", - "metadata": {}, - "source": [ - "## Section 3: Build the Sparse Matrix\n", + "geography = assign_random_geography(n_records, n_clones=N_CLONES, seed=SEED)\n", + "n_total = n_records * N_CLONES\n", "\n", - "The sparse matrix `X_sparse` has:\n", - "- **Rows**: Calibration targets (e.g., SNAP totals by geography)\n", - "- **Columns**: (household × CD) pairs - each household appears once per CD\n", - "\n", - "We filter to SNAP targets using the `domain_variables` filter for this demonstration." 
+ "print(f\"Total cloned records: {n_total:,}\")\n", + "print(f\"Unique states: {len(np.unique(geography.state_fips))}\")\n", + "print(f\"Unique CDs: {len(np.unique(geography.cd_geoid))}\")\n", + "print(f\"Unique blocks: {len(np.unique(geography.block_geoid))}\")" ] }, { "cell_type": "code", "execution_count": 4, - "id": "cell-7", + "id": "cell-6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "X_sparse shape: (539, 227981)\n", - " Rows (targets): 539\n", - " Columns (household × CD pairs): 227981\n", - " Non-zero entries: 141,536\n", - " Sparsity: 99.88%\n" + "Example household (record_idx=8629) across 3 clones:\n", + "\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clonecolstate_fipsabbrcd_geoidblock_geoid
00862948TX4817481450004002026
112062842PA4201420171058013029
223262736NY3611360850208041023
\n", + "
" + ], + "text/plain": [ + " clone col state_fips abbr cd_geoid block_geoid\n", + "0 0 8629 48 TX 4817 481450004002026\n", + "1 1 20628 42 PA 4201 420171058013029\n", + "2 2 32627 36 NY 3611 360850208041023" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "sim = Microsimulation(dataset=dataset_path)\n", - "\n", - "builder = SparseMatrixBuilder(\n", - " db_uri,\n", - " time_period=2024,\n", - " cds_to_calibrate=test_cds,\n", - " dataset_path=dataset_path,\n", + "print(\n", + " f\"Example household (record_idx={record_idx}) across {N_CLONES} clones:\\n\"\n", ")\n", - "\n", - "targets_df, X_sparse, household_id_mapping = builder.build_matrix(\n", - " sim, target_filter={\"domain_variables\": [\"snap\"], \"variables\": [\"snap\"]}\n", - ")\n", - "\n", - "print(f\"X_sparse shape: {X_sparse.shape}\")\n", - "print(f\" Rows (targets): {X_sparse.shape[0]}\")\n", - "print(f\" Columns (household × CD pairs): {X_sparse.shape[1]}\")\n", - "print(f\" Non-zero entries: {X_sparse.nnz:,}\")\n", - "print(f\" Sparsity: {1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.2%}\")" + "rows = []\n", + "for c in range(N_CLONES):\n", + " col = c * n_records + record_idx\n", + " rows.append(\n", + " {\n", + " \"clone\": c,\n", + " \"col\": col,\n", + " \"state_fips\": geography.state_fips[col],\n", + " \"abbr\": STATE_CODES.get(geography.state_fips[col], \"??\"),\n", + " \"cd_geoid\": geography.cd_geoid[col],\n", + " \"block_geoid\": geography.block_geoid[col],\n", + " }\n", + " )\n", + "pd.DataFrame(rows)" ] }, { "cell_type": "markdown", - "id": "cell-8", + "id": "cell-7", "metadata": {}, "source": [ - "## Section 4: Understanding the Matrix Structure with MatrixTracer\n", + "One household, three parallel geographic identities. 
Each clone will be simulated under different state rules, producing different benefit amounts.\n", "\n", - "The `MatrixTracer` helps navigate the sparse matrix by providing lookups between:\n", - "- Column indices ↔ (household_id, CD) pairs\n", - "- Row indices ↔ target definitions" + "**Note:** With only N_CLONES=3 (~36K total samples), small-population areas like DC may not appear in the random draw. The production pipeline uses N_CLONES=10, which covers all 51 state-equivalents and 436 CDs." ] }, { "cell_type": "code", "execution_count": 5, - "id": "cell-9", + "id": "cell-8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "================================================================================\n", - "MATRIX STRUCTURE BREAKDOWN\n", - "================================================================================\n", - "\n", - "Matrix dimensions: 539 rows x 227981 columns\n", - " Rows = 539 targets\n", - " Columns = 11999 households x 19 CDs\n", - " = 11,999 x 19 = 227,981\n", - "\n", - "--------------------------------------------------------------------------------\n", - "COLUMN STRUCTURE (Households stacked by CD)\n", - "--------------------------------------------------------------------------------\n", - "\n", - "Showing first and last 5 CDs of 19 total:\n", - "\n", - "First 5 CDs:\n", - "cd_geoid start_col end_col n_households\n", - " 1501 0 11998 11999\n", - " 1502 11999 23997 11999\n", - " 201 23998 35996 11999\n", - " 3001 35997 47995 11999\n", - " 3002 47996 59994 11999\n", - "\n", - "Last 5 CDs:\n", - "cd_geoid start_col end_col n_households\n", - " 3710 167986 179984 11999\n", - " 3711 179985 191983 11999\n", - " 3712 191984 203982 11999\n", - " 3713 203983 215981 11999\n", - " 3714 215982 227980 11999\n", - "\n", - "--------------------------------------------------------------------------------\n", - "ROW STRUCTURE (Targets)\n", - 
"--------------------------------------------------------------------------------\n", - "\n", - "Total targets: 539\n", - "\n", - "Targets by domain variable:\n", - " n_targets n_unique_vars\n", - "domain_variable \n", - "snap 538 2\n", - "\n", - "--------------------------------------------------------------------------------\n", - "TARGET GROUPS (for loss calculation)\n", - "--------------------------------------------------------------------------------\n", - "\n", - "=== Creating Target Groups ===\n", - "\n", - "National targets:\n", - " Group 0: Snap = 93,730,290,000\n", - "\n", - "State targets:\n", - " Group 1: SNAP Household Count (51 targets)\n", - " Group 2: Snap (51 targets)\n", - "\n", - "District targets:\n", - " Group 3: SNAP Household Count (436 targets)\n", - "\n", - "Total groups created: 4\n", - "========================================\n", - " Group 0: National Snap (1 target, value=93,730,290,000) - rows [0]\n", - " Group 1: State SNAP Household Count (51 targets) - rows [1, 2, 3, ..., 50, 51]\n", - " Group 2: State Snap (51 targets) - rows [52, 53, 54, ..., 101, 102]\n", - " Group 3: District SNAP Household Count (436 targets) - rows [103, 104, 105, ..., 537, 538]\n", - "\n", - "================================================================================\n" + "Global block distribution: 5,765,442 blocks\n", + "Top 5 states by total probability:\n", + " CA (6): 11.954%\n", + " TX (48): 8.736%\n", + " FL (12): 6.437%\n", + " NY (36): 5.977%\n", + " PA (42): 3.908%\n" ] } ], "source": [ - "tracer = MatrixTracer(\n", - " targets_df, X_sparse, household_id_mapping, test_cds, sim\n", - ")\n", + "blocks, cds, states, probs = load_global_block_distribution()\n", + "print(f\"Global block distribution: {len(blocks):,} blocks\")\n", + "print(f\"Top 5 states by total probability:\")\n", + "state_prob = pd.Series(probs, index=states).groupby(level=0).sum()\n", + "top5 = state_prob.nlargest(5)\n", + "for fips, p in top5.items():\n", + " print(f\" 
{STATE_CODES.get(fips, '??')} ({fips}): {p:.3%}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-9", + "metadata": {}, + "source": [ + "## Section 3: Inside `_simulate_clone` — State-Swap\n", + "\n", + "For each clone, `_simulate_clone` does four things:\n", + "1. Creates a **fresh** `Microsimulation` from the base dataset\n", + "2. Overwrites `state_fips` with the clone's assigned states\n", + "3. Optionally calls a `sim_modifier` (e.g., takeup re-randomization)\n", + "4. **Clears cached formulas** via `get_calculated_variables` — preserving survey inputs and IDs while forcing recalculation of state-dependent variables like SNAP\n", "\n", - "tracer.print_matrix_structure()" + "Let's reproduce this manually for clone 0." ] }, { "cell_type": "code", "execution_count": 6, - "id": "cell-11", + "id": "cell-10", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "=== Creating Target Groups ===\n", - "\n", - "National targets:\n", - " Group 0: Snap = 93,730,290,000\n", - "\n", - "State targets:\n", - " Group 1: SNAP Household Count (51 targets)\n", - " Group 2: Snap (51 targets)\n", - "\n", - "District targets:\n", - " Group 3: SNAP Household Count (436 targets)\n", - "\n", - "Total groups created: 4\n", - "========================================\n" + "Example household (record_idx=8629):\n", + " Original state: NC (37)\n", + " Clone 0 state: TX (48)\n", + " Original SNAP: $18,396.00\n", + " Clone 0 SNAP: $18,396.00\n" ] } ], "source": [ - "target_groups, group_info = create_target_groups(targets_df)" + "clone_idx = 0\n", + "col_start = clone_idx * n_records\n", + "col_end = col_start + n_records\n", + "clone_states = geography.state_fips[col_start:col_end]\n", + "\n", + "clone_sim = Microsimulation(dataset=dataset_path)\n", + "clone_sim.set_input(\"state_fips\", 2024, clone_states.astype(np.int32))\n", + "for var in get_calculated_variables(clone_sim):\n", + " clone_sim.delete_arrays(var)\n", + "\n", + "new_snap = 
clone_sim.calculate(\"snap\", map_to=\"household\").values\n", + "\n", + "orig_state = sim.calculate(\"state_fips\", map_to=\"household\").values[record_idx]\n", + "new_state = clone_states[record_idx]\n", + "\n", + "print(f\"Example household (record_idx={record_idx}):\")\n", + "print(\n", + " f\" Original state: {STATE_CODES.get(int(orig_state), '??')} \"\n", + " f\"({int(orig_state)})\"\n", + ")\n", + "print(\n", + " f\" Clone 0 state: {STATE_CODES.get(int(new_state), '??')} \"\n", + " f\"({int(new_state)})\"\n", + ")\n", + "print(f\" Original SNAP: ${snap_values[record_idx]:,.2f}\")\n", + "print(f\" Clone 0 SNAP: ${new_snap[record_idx]:,.2f}\")" ] }, { "cell_type": "code", "execution_count": 7, - "id": "7e75756b-a317-4800-bac5-e0fd6bc43b8c", + "id": "cell-11", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Row info for North Carolina's SNAP benefit amount:\n", - "{'row_index': 80, 'variable': 'snap', 'variable_desc': 'SNAP allotment', 'geographic_id': '37', 'target_value': 2934626410.0, 'stratum_id': 9363, 'domain_variable': 'snap'}\n" + "SNAP for record_idx=8629 across all 3 clones:\n", + "\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clonestatestate_fipsSNAP
00TX48$18,396.00
11PA42$18,396.00
22NY36$18,396.00
\n", + "
" + ], + "text/plain": [ + " clone state state_fips SNAP\n", + "0 0 TX 48 $18,396.00\n", + "1 1 PA 42 $18,396.00\n", + "2 2 NY 36 $18,396.00" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "target_group = tracer.get_group_rows(2)\n", - "row_loc = target_group.iloc[28]['row_index'] # Manually found the index value 28\n", - "row_info = tracer.get_row_info(row_loc)\n", - "var = row_info['variable']\n", - "var_desc = row_info['variable_desc']\n", - "target_geo_id = int(row_info['geographic_id'])\n", + "print(f\"SNAP for record_idx={record_idx} across all {N_CLONES} clones:\\n\")\n", + "rows = []\n", + "for c in range(N_CLONES):\n", + " cs = geography.state_fips[c * n_records + record_idx]\n", + " s = Microsimulation(dataset=dataset_path)\n", + " s.set_input(\n", + " \"state_fips\",\n", + " 2024,\n", + " geography.state_fips[c * n_records : (c + 1) * n_records].astype(\n", + " np.int32\n", + " ),\n", + " )\n", + " for var in get_calculated_variables(s):\n", + " s.delete_arrays(var)\n", + " clone_snap = s.calculate(\"snap\", map_to=\"household\").values\n", + " rows.append(\n", + " {\n", + " \"clone\": c,\n", + " \"state\": STATE_CODES.get(int(cs), \"??\"),\n", + " \"state_fips\": int(cs),\n", + " \"SNAP\": f\"${clone_snap[record_idx]:,.2f}\",\n", + " }\n", + " )\n", + "pd.DataFrame(rows)" + ] + }, + { + "cell_type": "markdown", + "id": "cell-12", + "metadata": {}, + "source": [ + "`get_calculated_variables` is selective: it identifies variables with formulas (state-dependent computations) while preserving survey-reported inputs and entity IDs. This is what allows the same demographic household to produce different benefit amounts under different state rules." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cell-13", + "metadata": {}, + "source": [ + "## Section 4: Geographic Column Masking\n", "\n", - "print(\"Row info for North Carolina's SNAP benefit amount:\")\n", - "print(row_info)" + "When assembling the calibration matrix, each target row only \"sees\" columns (clones) whose geography matches the target's geography. This is implemented via `state_to_cols` and `cd_to_cols` dictionaries built from the `GeographyAssignment`.\n", + "\n", + "This is step 3 of `build_matrix` — reproduced here for transparency." ] }, { "cell_type": "code", "execution_count": 8, - "id": "c2be9721-ff11-4f78-ba0b-03407201dd53", + "id": "cell-14", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " household_id household_weight state_fips snap\n", - "0 26 1205.310059 23 0.0\n", - "1 34 2170.419922 23 0.0\n", - "2 38 587.510010 23 0.0\n", - "3 46 1010.840027 23 0.0\n", - "4 71 957.460022 23 0.0\n", - "... ... ... ... ...\n", - "11994 177822 0.000000 15 0.0\n", - "11995 177829 0.000000 15 0.0\n", - "11996 177831 0.000000 15 0.0\n", - "11997 177860 0.000000 15 6294.0\n", - "11998 177861 0.000000 15 0.0\n", + "Unique states mapped: 50\n", + "Unique CDs mapped: 435\n", "\n", - "[11999 rows x 4 columns]\n" + "Columns per state: min=62, median=494, max=4311\n" ] } ], "source": [ - "hh_snap_df = pd.DataFrame(sim.calculate_dataframe([\n", - " \"household_id\", \"household_weight\", \"state_fips\", \"snap\"]) \n", - ")\n", - "print(hh_snap_df)" - ] - }, - { - "cell_type": "markdown", - "id": "438828ac-df94-4d3e-a9a8-227bb6f64933", - "metadata": {}, - "source": [ - "If we were to include `congressional_district_geoid` above, they would all be zeros. 
It's not until we do the calibration, i.e., come back with a vector of weights `w` to multiply `X_sparse` with, that we will set `congressional_district_geoid`.\n", + "state_col_lists = defaultdict(list)\n", + "cd_col_lists = defaultdict(list)\n", + "for col in range(n_total):\n", + " state_col_lists[int(geography.state_fips[col])].append(col)\n", + " cd_col_lists[str(geography.cd_geoid[col])].append(col)\n", "\n", - "However, every household is already a donor to every contressional district. You can get the column positions for every household (remember targets are on the rows, donor households on the columns) by running tracer's get_household_column_positions with the *original* `household_id`." + "state_to_cols = {s: np.array(c) for s, c in state_col_lists.items()}\n", + "cd_to_cols = {cd: np.array(c) for cd, c in cd_col_lists.items()}\n", + "\n", + "print(f\"Unique states mapped: {len(state_to_cols)}\")\n", + "print(f\"Unique CDs mapped: {len(cd_to_cols)}\")\n", + "\n", + "state_counts = {s: len(c) for s, c in state_to_cols.items()}\n", + "sc_series = pd.Series(state_counts)\n", + "print(\n", + " f\"\\nColumns per state: min={sc_series.min()}, \"\n", + " f\"median={sc_series.median():.0f}, max={sc_series.max()}\"\n", + ")" ] }, { "cell_type": "code", "execution_count": 9, - "id": "cell-12", + "id": "cell-15", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " household_id household_weight state_fips snap\n", - "23 654 1550.660034 23 70.080002\n", + "Example household clone visibility:\n", "\n", - "Evaluating the tracer.get_household_column_positions dictionary:\n", + "Clone 0 (TX, CD 4817):\n", + " Visible to TX state targets: col 8629 in state_to_cols[48]? True\n", + " Visible to CD 4817 targets: col 8629 in cd_to_cols['4817']? 
True\n", + " Visible to NC (37) targets: False\n", "\n", - "{'1501': 23, '1502': 12022, '201': 24021, '3001': 36020, '3002': 48019, '3701': 60018, '3702': 72017, '3703': 84016, '3704': 96015, '3705': 108014, '3706': 120013, '3707': 132012, '3708': 144011, '3709': 156010, '3710': 168009, '3711': 180008, '3712': 192007, '3713': 204006, '3714': 216005}\n" + "Clone 1 (PA, CD 4201):\n", + " Visible to PA state targets: col 20628 in state_to_cols[42]? True\n", + " Visible to CD 4201 targets: col 20628 in cd_to_cols['4201']? True\n", + " Visible to NC (37) targets: False\n", + "\n", + "Clone 2 (NY, CD 3611):\n", + " Visible to NY state targets: col 32627 in state_to_cols[36]? True\n", + " Visible to CD 3611 targets: col 32627 in cd_to_cols['3611']? True\n", + " Visible to NC (37) targets: False\n", + "\n" ] } ], "source": [ - "# Reverse lookup: get all column positions for a specific household\n", - "hh_id = hh_snap_df.loc[hh_snap_df.snap > 0].household_id.values[0]\n", - "print(hh_snap_df.loc[hh_snap_df.household_id == hh_id])\n", - "\n", - "print(\"\\nEvaluating the tracer.get_household_column_positions dictionary:\\n\")\n", - "positions = tracer.get_household_column_positions(hh_id)\n", - "print(positions)" + "print(f\"Example household clone visibility:\\n\")\n", + "for c in range(N_CLONES):\n", + " col = c * n_records + record_idx\n", + " state = int(geography.state_fips[col])\n", + " cd = str(geography.cd_geoid[col])\n", + " abbr = STATE_CODES.get(state, \"??\")\n", + " print(f\"Clone {c} ({abbr}, CD {cd}):\")\n", + " print(\n", + " f\" Visible to {abbr} state targets: \"\n", + " f\"col {col} in state_to_cols[{state}]? \"\n", + " f\"{col in state_to_cols.get(state, [])}\"\n", + " )\n", + " print(\n", + " f\" Visible to CD {cd} targets: \"\n", + " f\"col {col} in cd_to_cols['{cd}']? 
\"\n", + " f\"{col in cd_to_cols.get(cd, [])}\"\n", + " )\n", + " # Check an unrelated state\n", + " print(\n", + " f\" Visible to NC (37) targets: \" f\"{col in state_to_cols.get(37, [])}\"\n", + " )\n", + " print()" ] }, { "cell_type": "markdown", - "id": "cell-13", + "id": "cell-16", + "metadata": {}, + "source": [ + "This is the mechanism behind the sparsity pattern in `calibration_matrix.ipynb`: a household clone assigned to TX can contribute to TX state targets and TX CD targets, but produces a zero entry for NC or AK targets. The matrix is sparse because each clone only intersects a small fraction of all geographic targets." + ] + }, + { + "cell_type": "markdown", + "id": "cell-17", "metadata": {}, "source": [ - "## Section 5: Understanding the cells of the X_Sparse matrix and Target vector" + "## Section 5: Takeup Re-randomization\n", + "\n", + "The base CPS has fixed takeup decisions (e.g., \"this household takes up SNAP\"). But when we clone a household into different census blocks, each block should have independently drawn takeup — otherwise every clone of a SNAP-participating household would still participate, regardless of geography.\n", + "\n", + "`rerandomize_takeup` solves this: for each census block, it uses `seeded_rng(variable_name, salt=block_geoid)` to draw new takeup booleans. The seed is deterministic per (variable, block) pair, so results are reproducible." ] }, { "cell_type": "code", "execution_count": 10, - "id": "e05aaeab-3786-4ff0-a50b-34577065d2e0", + "id": "cell-18", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Remember, this is a North Carolina target:\n", - "\n", - "target_id 8942\n", - "stratum_id 9363\n", - "variable snap\n", - "value 2934626410.0\n", - "period 2024\n", - "geo_level state\n", - "geographic_id 37\n", - "domain_variable snap\n", - "original_value 2934626410.0\n", - "uprating_factor 1.0\n", - "Name: 80, dtype: object\n", + "8 takeup variables:\n", "\n", - "NC State target. 
Household donated to NC's 2nd district, 2024 SNAP dollars:\n", - "70.08\n", - "\n", - "Same target, same household, donated to AK's at Large district, 2024 SNAP dollars:\n", - "0.0\n" + " takes_up_snap_if_eligible entity=spm_unit rate=82.00%\n", + " takes_up_aca_if_eligible entity=tax_unit rate=67.20%\n", + " takes_up_dc_ptc entity=tax_unit rate=32.00%\n", + " takes_up_head_start_if_eligible entity=person rate=30.00%\n", + " takes_up_early_head_start_if_eligible entity=person rate=9.00%\n", + " takes_up_ssi_if_eligible entity=person rate=50.00%\n", + " would_file_taxes_voluntarily entity=tax_unit rate=5.00%\n", + " takes_up_medicaid_if_eligible entity=person rate=dict (51 entries)\n" ] } ], "source": [ - "print(\"Remember, this is a North Carolina target:\\n\")\n", - "print(targets_df.iloc[row_loc])\n", - "\n", - "print(\"\\nNC State target. Household donated to NC's 2nd district, 2024 SNAP dollars:\")\n", - "print(X_sparse[row_loc, positions['3702']]) # Household donated to NC's 2nd district\n", - "\n", - "print(\"\\nSame target, same household, donated to AK's at Large district, 2024 SNAP dollars:\")\n", - "print(X_sparse[row_loc, positions['201']]) # Household donated to AK's at Large District" + "print(f\"{len(SIMPLE_TAKEUP_VARS)} takeup variables:\\n\")\n", + "for spec in SIMPLE_TAKEUP_VARS:\n", + " rate_key = spec[\"rate_key\"]\n", + " if rate_key == \"voluntary_filing\":\n", + " rate = 0.05\n", + " else:\n", + " rate = load_take_up_rate(rate_key, 2024)\n", + " rate_str = (\n", + " f\"{rate:.2%}\"\n", + " if isinstance(rate, float)\n", + " else f\"dict ({len(rate)} entries)\"\n", + " )\n", + " print(\n", + " f\" {spec['variable']:40s} \"\n", + " f\"entity={spec['entity']:10s} rate={rate_str}\"\n", + " )" ] }, { - "cell_type": "markdown", - "id": "cell-16", + "cell_type": "code", + "execution_count": 11, + "id": "cell-19", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Same block + same var (reproducible):\n", 
+ " [0.50514599 0.75213437 0.9703409 0.18048868 0.31969517]\n", + " [0.50514599 0.75213437 0.9703409 0.18048868 0.31969517]\n", + " Match: True\n", + "\n", + "Different block, same var:\n", + " [0.15503168 0.96707026 0.79019745 0.67544525 0.85245009]\n", + " Match: False\n", + "\n", + "Same block, different var:\n", + " [0.93155876 0.8912794 0.50838888 0.32192278 0.01005173]\n", + " Match: False\n" + ] + } + ], "source": [ - "Key property: For state-level targets, only CDs in that state should have non-zero values.\n", + "block_a = \"482011234567890\"\n", + "block_b = \"170311234567890\"\n", + "var = \"takes_up_snap_if_eligible\"\n", "\n", - "Example: A NC state SNAP target should have zeros for HI, MT, and AK CD columns.\n", + "rng_a1 = seeded_rng(var, salt=block_a)\n", + "rng_a2 = seeded_rng(var, salt=block_a)\n", + "rng_b = seeded_rng(var, salt=block_b)\n", + "rng_other = seeded_rng(\"takes_up_aca_if_eligible\", salt=block_a)\n", "\n", - "So let's see that same household's value for the Alaska state target:" + "draws_a1 = rng_a1.random(5)\n", + "draws_a2 = rng_a2.random(5)\n", + "draws_b = rng_b.random(5)\n", + "draws_other = rng_other.random(5)\n", + "\n", + "print(\"Same block + same var (reproducible):\")\n", + "print(f\" {draws_a1}\")\n", + "print(f\" {draws_a2}\")\n", + "print(f\" Match: {np.allclose(draws_a1, draws_a2)}\")\n", + "print(f\"\\nDifferent block, same var:\")\n", + "print(f\" {draws_b}\")\n", + "print(f\" Match: {np.allclose(draws_a1, draws_b)}\")\n", + "print(f\"\\nSame block, different var:\")\n", + "print(f\" {draws_other}\")\n", + "print(f\" Match: {np.allclose(draws_a1, draws_other)}\")" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "8cdc264c-8335-40eb-afd9-4c4d023ec303", + "execution_count": 12, + "id": "cell-20", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Row info for Alaska's SNAP benefit amount:\n", - "{'row_index': 80, 'variable': 'snap', 'variable_desc': 'SNAP allotment', 
'geographic_id': '37', 'target_value': 2934626410.0, 'stratum_id': 9363, 'domain_variable': 'snap'}\n" + "Takeup rates before/after re-randomization (clone 0):\n", + "\n", + " takes_up_snap_if_eligible before=82.333% after=82.381%\n", + " takes_up_aca_if_eligible before=66.718% after=67.486%\n", + " takes_up_dc_ptc before=31.483% after=32.044%\n", + " takes_up_head_start_if_eligible before=29.963% after=29.689%\n", + " takes_up_early_head_start_if_eligible before=8.869% after=8.721%\n", + " takes_up_ssi_if_eligible before=100.000% after=49.776%\n", + " would_file_taxes_voluntarily before=0.000% after=4.905%\n", + " takes_up_medicaid_if_eligible before=84.496% after=80.051%\n" ] } ], "source": [ - "target_group = tracer.get_group_rows(2)\n", - "new_row_loc = target_group.iloc[10]['row_index'] # Manually found the index value 10\n", - "row_info = tracer.get_row_info(row_loc)\n", - "var = row_info['variable']\n", - "var_desc = row_info['variable_desc']\n", - "target_geo_id = int(row_info['geographic_id'])\n", + "test_sim = Microsimulation(dataset=dataset_path)\n", + "clone_0_states = geography.state_fips[:n_records]\n", + "clone_0_blocks = geography.block_geoid[:n_records]\n", + "test_sim.set_input(\"state_fips\", 2024, clone_0_states.astype(np.int32))\n", + "\n", + "before = {}\n", + "for spec in SIMPLE_TAKEUP_VARS:\n", + " v = spec[\"variable\"]\n", + " vals = test_sim.calculate(v, map_to=spec[\"entity\"]).values\n", + " before[v] = vals.mean()\n", + "\n", + "rerandomize_takeup(test_sim, clone_0_blocks, clone_0_states, 2024)\n", "\n", - "print(\"Row info for Alaska's SNAP benefit amount:\")\n", - "print(row_info)" + "print(\"Takeup rates before/after re-randomization (clone 0):\\n\")\n", + "for spec in SIMPLE_TAKEUP_VARS:\n", + " v = spec[\"variable\"]\n", + " vals = test_sim.calculate(v, map_to=spec[\"entity\"]).values\n", + " after = vals.mean()\n", + " print(f\" {v:40s} before={before[v]:.3%} after={after:.3%}\")" ] }, { "cell_type": "code", - "execution_count": 
12, - "id": "ac59b6f1-859f-4246-8a05-8cb26384c882", + "execution_count": 13, + "id": "cell-21", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "Medicaid takeup rates (state-specific), first 10 states:\n", "\n", - "Household donated to AK's 1st district, 2024 SNAP dollars:\n", - "0.0\n" + " AK: 88.00%\n", + " AL: 92.00%\n", + " AR: 79.00%\n", + " AZ: 95.00%\n", + " CA: 78.00%\n", + " CO: 99.00%\n", + " CT: 89.00%\n", + " DC: 99.00%\n", + " DE: 86.00%\n", + " FL: 98.00%\n" ] } ], "source": [ - "print(\"\\nHousehold donated to AK's 1st district, 2024 SNAP dollars:\")\n", - "print(X_sparse[new_row_loc, positions['201']]) # Household donated to AK's at Large District" + "medicaid_rates = load_take_up_rate(\"medicaid\", 2024)\n", + "print(\"Medicaid takeup rates (state-specific), first 10 states:\\n\")\n", + "for state, rate in sorted(medicaid_rates.items())[:10]:\n", + " print(f\" {state}: {rate:.2%}\")" ] }, { "cell_type": "markdown", - "id": "cell-18", + "id": "cell-22", + "metadata": {}, + "source": [ + "In the full pipeline, `rerandomize_takeup` is passed to `build_matrix` as a `sim_modifier` callback. For each clone, after `state_fips` is set but before formula caches are cleared, the callback draws new takeup booleans per census block. This means the same household in block A might take up SNAP while in block B it doesn't — matching the statistical reality that takeup varies by geography." + ] + }, + { + "cell_type": "markdown", + "id": "cell-23", "metadata": {}, "source": [ - "## Section 6: Simulating State-Swapped Calculations\n", + "## Section 6: Matrix Build Verification\n", "\n", - "When a household is \"transplanted\" to a different state, state-dependent benefits like SNAP are recalculated under the destination state's rules." + "Let's run the full `build_matrix` pipeline and verify the example household's pattern matches our Section 4 predictions. 
We use the same `target_filter` as in `calibration_matrix.ipynb` but *without* `sim_modifier` to match that notebook's output." ] }, { "cell_type": "code", - "execution_count": 13, - "id": "cell-19", + "execution_count": 14, + "id": "cell-24", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-02-13 17:11:22,384 - INFO - Processing clone 1/3 (cols 0-11998, 50 unique states)...\n", + "2026-02-13 17:11:23,509 - INFO - Processing clone 2/3 (cols 11999-23997, 50 unique states)...\n", + "2026-02-13 17:11:24,645 - INFO - Processing clone 3/3 (cols 23998-35996, 50 unique states)...\n", + "2026-02-13 17:11:25,769 - INFO - Assembling matrix from 3 clones...\n", + "2026-02-13 17:11:25,771 - INFO - Matrix: 538 targets x 35997 cols, 14946 nnz\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "SNAP values for first 5 households under different state rules:\n", - " NC rules: [0. 0. 0. 0. 0.]\n", - " AK rules: [0. 0. 0. 0. 0.]\n", - " Difference: [0. 0. 0. 0. 
0.]\n" + "Matrix shape: (538, 35997)\n", + "Non-zero entries: 14,946\n", + "Density: 0.000772\n" ] } ], "source": [ - "def create_state_simulation(state_fips):\n", - " \"\"\"Create a simulation with all households assigned to a specific state.\"\"\"\n", - " s = Microsimulation(dataset=dataset_path)\n", - " s.set_input(\n", - " \"state_fips\", 2024, np.full(hh_snap_df.shape[0], state_fips, dtype=np.int32)\n", - " )\n", - " for var in get_calculated_variables(s):\n", - " s.delete_arrays(var)\n", - " return s\n", - "\n", - "# Compare SNAP for first 5 households under NC vs AK rules\n", - "nc_sim = create_state_simulation(37) # NC\n", - "ak_sim = create_state_simulation(2) # AK\n", + "builder = UnifiedMatrixBuilder(\n", + " db_uri=db_uri,\n", + " time_period=2024,\n", + " dataset_path=dataset_path,\n", + ")\n", "\n", - "nc_snap = nc_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n", - "ak_snap = ak_sim.calculate(\"snap\", map_to=\"household\").values[:5]\n", + "targets_df, X_sparse, target_names = builder.build_matrix(\n", + " geography,\n", + " sim,\n", + " target_filter={\"domain_variables\": [\"snap\"]},\n", + ")\n", "\n", - "print(\"SNAP values for first 5 households under different state rules:\")\n", - "print(f\" NC rules: {nc_snap}\")\n", - "print(f\" AK rules: {ak_snap}\")\n", - "print(f\" Difference: {ak_snap - nc_snap}\")" + "print(f\"Matrix shape: {X_sparse.shape}\")\n", + "print(f\"Non-zero entries: {X_sparse.nnz:,}\")\n", + "print(f\"Density: {X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]):.6f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "cell-25", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Example household non-zero pattern across clones:\n", + "\n", + "Clone 0 (TX, CD 4817): 3 non-zero rows\n", + " row 39: household_count (geo=48): 1.00\n", + " row 90: snap (geo=48): 18396.00\n", + " row 410: household_count (geo=4817): 1.00\n", + "Clone 1 (PA, CD 
4201): 3 non-zero rows\n", + " row 34: household_count (geo=42): 1.00\n", + " row 85: snap (geo=42): 18396.00\n", + " row 358: household_count (geo=4201): 1.00\n", + "Clone 2 (NY, CD 3611): 3 non-zero rows\n", + " row 27: household_count (geo=36): 1.00\n", + " row 78: snap (geo=36): 18396.00\n", + " row 292: household_count (geo=3611): 1.00\n" + ] + } + ], + "source": [ + "print(f\"Example household non-zero pattern across clones:\\n\")\n", + "for c in range(N_CLONES):\n", + " col = c * n_records + record_idx\n", + " col_vec = X_sparse[:, col]\n", + " nz_rows = col_vec.nonzero()[0]\n", + " state = int(geography.state_fips[col])\n", + " cd = geography.cd_geoid[col]\n", + " abbr = STATE_CODES.get(state, \"??\")\n", + " print(f\"Clone {c} ({abbr}, CD {cd}): {len(nz_rows)} non-zero rows\")\n", + " for r in nz_rows:\n", + " row = targets_df.iloc[r]\n", + " print(\n", + " f\" row {r}: {row['variable']} \"\n", + " f\"(geo={row['geographic_id']}): \"\n", + " f\"{X_sparse[r, col]:.2f}\"\n", + " )" ] }, { "cell_type": "markdown", - "id": "a7a3b4f3-dabc-4160-a781-a529018e889f", + "id": "cell-26", "metadata": {}, "source": [ - "## Section 7: Creating the h5 files\n", - "\n", - " `w` (required)\n", - " - The calibrated weight vector from L0 calibration\n", - " - Shape: (n_cds * n_households,) — a flattened matrix where each CD has weights for all households\n", - " - Gets reshaped to (n_cds, n_households) internally\n", + "## Section 7: From Weights to Datasets\n", "\n", - " `cds_to_calibrate` (required)\n", - " - The ordered list of CD GEOIDs used when building w\n", - " - Serves two purposes:\n", - " a. Tells us how to reshape w (via its length)\n", - " b. Provides the index mapping so we can extract the right rows for any cd_subset\n", + "`create_sparse_cd_stacked_dataset` takes calibrated weights and builds an h5 file with only the non-zero-weight households, reindexed per CD. 
Internally it does its own state-swap simulation — loading the base dataset, assigning `state_fips` for the target CD's state, and recalculating benefits from scratch. This means SNAP values in the output reflect the destination state's rules (e.g., a $70 SNAP household from NC may get $0 under AK rules).\n", "\n", "**Format gap:** The calibration produces weights in clone layout `(n_records * n_clones,)` where each cloned record maps to one specific CD via the `GeographyAssignment`. The stacked dataset builder expects CD layout `(n_cds * n_households,)` where every CD has a weight slot for every household. Converting between these — accumulating clone weights into their assigned CDs — is a separate step not yet implemented. The demo below constructs artificial CD-layout weights directly to show how the builder works."
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "cell-27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dimension mismatch:\n", + " Calibration output: (11999 * 3,) = 35,997 (clone layout)\n", + " Stacked builder expects: (436 * 11999,) = 5,231,564 (CD layout)\n" + ] + } + ], + "source": [ + "print(\"Dimension mismatch:\")\n", + "print(\n", + " f\" Calibration output: ({n_records} * {N_CLONES},) \"\n", + " f\"= {n_records * N_CLONES:,} (clone layout)\"\n", + ")\n", "\n", - " `dataset_path` (optional, default None)\n", - " - Path to the base .h5 dataset that was used during calibration\n", - " - This is the \"template\" — household structure, demographics, etc.\n", - " - The function loads this, reweights households per CD, updates geography, and stacks" + "all_cds = get_all_cds_from_database(db_uri)\n", + "n_cds = len(all_cds)\n", + "print(\n", + " f\" Stacked builder expects: ({n_cds} * {n_records},) \"\n", + " f\"= {n_cds * n_records:,} (CD layout)\"\n", + ")" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "e1f8b237-ba42-4fca-8d43-f253f587d49b", + "execution_count": 17, + "id": "cell-28", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Weight vector: 23,998 entries (2 CDs x 11,999 HH)\n", + "Non-zero weights: 277\n", + "Example HH weight in CD 3701: 2.5\n", + "Example HH weight in CD 201: 3.5\n" + ] + } + ], "source": [ "import os\n", "\n", - "from policyengine_us_data.datasets.cps.local_area_calibration.stacked_dataset_builder import create_sparse_cd_stacked_dataset\n", + "demo_cds = [\"3701\", \"201\"]\n", + "n_demo_cds = len(demo_cds)\n", "\n", - "# Initialize the weights w for demonstration\n", - "# We can't allow too many w cells to be positive for a given state, or the reindexing will fail\n", - "w = np.random.binomial(n=1, p=0.01, size=X_sparse.shape[1]).astype(float)\n", + "w = (\n", + " 
np.random.default_rng(42)\n", + " .binomial(n=1, p=0.01, size=n_demo_cds * n_records)\n", + " .astype(float)\n", + ")\n", "\n", - "# We'll make sure our earlier household is included:\n", - "household_ids = sim.calculate(\"household_id\", map_to=\"household\").values\n", - "hh_idx = np.where(household_ids == hh_id)[0][0]\n", + "# Seed our example household into both CDs\n", + "cd_idx_3701 = demo_cds.index(\"3701\")\n", + "w[cd_idx_3701 * n_records + record_idx] = 2.5\n", "\n", - "cd_idx = test_cds.index('3701')\n", - "flat_idx = cd_idx * len(household_ids) + hh_idx\n", - "w[flat_idx] = 2.5\n", + "cd_idx_201 = demo_cds.index(\"201\")\n", + "w[cd_idx_201 * n_records + record_idx] = 3.5\n", "\n", - "cd_idx = test_cds.index('201')\n", - "flat_idx = cd_idx * len(household_ids) + hh_idx\n", - "w[flat_idx] = 3.5\n", + "output_dir = \"calibration_output\"\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "output_path = os.path.join(output_dir, \"results.h5\")\n", "\n", - "# Create a folder for the outputs of the function that is to come.\n", - "new_folder_name = \"calibration_output\"\n", - "os.makedirs(new_folder_name, exist_ok=True)\n", - "output_path = os.path.join(new_folder_name, \"results.h5\")" + "print(\n", + " f\"Weight vector: {len(w):,} entries \"\n", + " f\"({n_demo_cds} CDs x {n_records:,} HH)\"\n", + ")\n", + "print(f\"Non-zero weights: {(w > 0).sum()}\")\n", + "print(\n", + " f\"Example HH weight in CD 3701: {w[cd_idx_3701 * n_records + record_idx]}\"\n", + ")\n", + "print(f\"Example HH weight in CD 201: {w[cd_idx_201 * n_records + record_idx]}\")" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "650b807d-3d20-48e0-b512-43922ca2aace", + "execution_count": 18, + "id": "cell-29", "metadata": {}, "outputs": [ { @@ -643,27 +991,43 @@ "\n", "Original dataset has 11,999 households\n", "Extracted weights for 2 CDs from full weight matrix\n", - "Total active household-CD pairs: 230\n", - "Total weight in W matrix: 234\n", - "Processing CD 201 
(2/2)...\n", + "Total active household-CD pairs: 277\n", + "Total weight in W matrix: 281\n", + "Processing CD 201 (2/2)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-02-13 17:11:40,873 - INFO - HTTP Request: GET https://huggingface.co/api/models/policyengine/policyengine-us-data \"HTTP/1.1 200 OK\"\n", + "2026-02-13 17:11:40,899 - INFO - HTTP Request: HEAD https://huggingface.co/policyengine/policyengine-us-data/resolve/main/enhanced_cps_2024.h5 \"HTTP/1.1 302 Found\"\n", + "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n", + "2026-02-13 17:11:40,899 - WARNING - Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\n", "Combining 2 CD DataFrames...\n", - "Total households across all CDs: 230\n", - "Combined DataFrame shape: (578, 222)\n", + "Total households across all CDs: 277\n", + "Combined DataFrame shape: (726, 222)\n", "\n", "Reindexing all entity IDs using 25k ranges per CD...\n", - " Created 230 unique households across 2 CDs\n", + " Created 277 unique households across 2 CDs\n", " Reindexing persons using 25k ranges...\n", " Reindexing tax units...\n", " Reindexing SPM units...\n", " Reindexing marital units...\n", " Reindexing families...\n", - " Final persons: 578\n", - " Final households: 230\n", - " Final tax units: 314\n", - " Final SPM units: 236\n", - " Final marital units: 461\n", - " Final families: 249\n", + " Final persons: 726\n", + " Final households: 277\n", + " Final tax units: 373\n", + " Final SPM units: 291\n", + " Final marital units: 586\n", + " Final families: 309\n", "\n", "Weights in combined_df AFTER reindexing:\n", " HH weight sum: 0.00M\n", @@ -671,8 +1035,8 @@ " Ratio: 1.00\n", "\n", "Overflow check:\n", - " Max person ID after reindexing: 
5,125,285\n", - " Max person ID × 100: 512,528,500\n", + " Max person ID after reindexing: 5,025,335\n", + " Max person ID × 100: 502,533,500\n", " int32 max: 2,147,483,647\n", " ✓ No overflow risk!\n", "\n", @@ -687,9 +1051,9 @@ "Household mapping saved to calibration_output/mappings/results_household_mapping.csv\n", "\n", "Verifying saved file...\n", - " Final households: 230\n", - " Final persons: 578\n", - " Total population (from household weights): 234\n" + " Final households: 277\n", + " Final persons: 726\n", + " Total population (from household weights): 281\n" ] }, { @@ -698,17 +1062,16 @@ "'calibration_output/results.h5'" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cd_subset = ['3701', '201']\n", "create_sparse_cd_stacked_dataset(\n", " w,\n", - " test_cds, # cds_to_calibrate - Defines the structure of the weight vector w\n", - " cd_subset=cd_subset, # cd_subset - Specifies which CDs to actually include in the output dataset (optional, defaults to all).\n", + " demo_cds,\n", + " cd_subset=demo_cds,\n", " dataset_path=dataset_path,\n", " output_path=output_path,\n", ")" @@ -716,280 +1079,101 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "f8d449b4-6069-44e0-8d21-e73944a1a1d2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[34mmappings\u001b[m\u001b[m/ results.h5\n" - ] - } - ], - "source": [ - "%ls calibration_output" - ] - }, - { - "cell_type": "markdown", - "id": "04d7b733-bec5-49cb-9272-d167ae9c4693", - "metadata": {}, - "source": [ - "Note that there is a *mappings* directory that has also been created by create_sparse_cd_stacked_dataset. This contains the CSV file that links the original households to the donor households. The reason it's a seperate folder is to keep the h5 files and the mapping CSVs organized when this function is run for all districts or states." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "5fd7f7cc-6517-4f39-9a14-9cb147af38e7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "results_household_mapping.csv\n" - ] - } - ], - "source": [ - "%ls calibration_output/mappings" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "578e8a69-b7ec-46bf-82ec-8020a46fd9cf", + "execution_count": 19, + "id": "cell-30", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " household_id congressional_district_geoid \\\n", - "0 50000 201 \n", - "1 50001 201 \n", - "2 50002 201 \n", - "3 50003 201 \n", - "4 50004 201 \n", - ".. ... ... \n", - "225 125113 3701 \n", - "226 125114 3701 \n", - "227 125115 3701 \n", - "228 125116 3701 \n", - "229 125117 3701 \n", + "Stacked dataset: 277 households\n", "\n", - " county household_weight state_fips \\\n", - "0 NORTH_SLOPE_BOROUGH_AK 3.5 2 \n", - "1 ALEUTIANS_WEST_CENSUS_AREA_AK 1.0 2 \n", - "2 FAIRBANKS_NORTH_STAR_BOROUGH_AK 1.0 2 \n", - "3 KENAI_PENINSULA_BOROUGH_AK 1.0 2 \n", - "4 HOONAH_ANGOON_CENSUS_AREA_AK 1.0 2 \n", - ".. ... ... ... \n", - "225 TYRRELL_COUNTY_NC 1.0 37 \n", - "226 WILSON_COUNTY_NC 1.0 37 \n", - "227 WARREN_COUNTY_NC 1.0 37 \n", - "228 WILSON_COUNTY_NC 1.0 37 \n", - "229 GREENE_COUNTY_NC 1.0 37 \n", + "Example household (original_id=128694) in mapping:\n", "\n", - " snap \n", - "0 0.000000 \n", - "1 0.000000 \n", - "2 0.000000 \n", - "3 0.000000 \n", - "4 0.000000 \n", - ".. ... 
\n", - "225 0.000000 \n", - "226 3438.300293 \n", - "227 0.000000 \n", - "228 0.000000 \n", - "229 885.599792 \n", + " new_household_id original_household_id congressional_district state_fips\n", + " 108 128694 201 2\n", + " 25097 128694 3701 37\n", "\n", - "[230 rows x 6 columns]\n" + "In stacked dataset:\n", + "\n", + " household_id congressional_district_geoid household_weight state_fips snap\n", + " 108 201 3.5 2 23640.0\n", + " 25097 3701 2.5 37 18396.0\n" ] } ], "source": [ - "sim_after = Microsimulation(dataset=\"./calibration_output/results.h5\")\n", + "sim_after = Microsimulation(dataset=f\"./{output_path}\")\n", + "hh_after_df = pd.DataFrame(\n", + " sim_after.calculate_dataframe(\n", + " [\n", + " \"household_id\",\n", + " \"congressional_district_geoid\",\n", + " \"household_weight\",\n", + " \"state_fips\",\n", + " \"snap\",\n", + " ]\n", + " )\n", + ")\n", + "print(f\"Stacked dataset: {len(hh_after_df)} households\\n\")\n", "\n", - "hh_after_df = pd.DataFrame(sim_after.calculate_dataframe([\n", - " \"household_id\", \"congressional_district_geoid\", \"county\", \"household_weight\", \"state_fips\", \"snap\"]) \n", + "mapping_df = pd.read_csv(\n", + " f\"{output_dir}/mappings/results_household_mapping.csv\"\n", ")\n", - "print(hh_after_df)" - ] - }, - { - "cell_type": "markdown", - "id": "83769d86-91e1-41bb-b718-01ee09cc7e2a", - "metadata": {}, - "source": [ - "We can see one of the correct instances above but let's confirm that this new household id does in fact link back to the original in both cases." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "27baf521-1bd6-4ef0-9f70-4381fd842b52", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
new_household_idoriginal_household_idcongressional_districtstate_fips
0500006542012
1125000654370137
\n", - "
" - ], - "text/plain": [ - " new_household_id original_household_id congressional_district state_fips\n", - "0 50000 654 201 2\n", - "1 125000 654 3701 37" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping_df = pd.read_csv(\"calibration_output/mappings/results_household_mapping.csv\")\n", - "mapping_df.loc[mapping_df.original_household_id == hh_id]" + "example_mapping = mapping_df.loc[\n", + " mapping_df.original_household_id == example_hh_id\n", + "]\n", + "print(f\"Example household (original_id={example_hh_id}) \" f\"in mapping:\\n\")\n", + "print(example_mapping.to_string(index=False))\n", + "\n", + "new_ids = example_mapping.new_household_id\n", + "print(f\"\\nIn stacked dataset:\\n\")\n", + "print(\n", + " hh_after_df.loc[hh_after_df.household_id.isin(new_ids)].to_string(\n", + " index=False\n", + " )\n", + ")" ] }, { "cell_type": "code", "execution_count": 20, - "id": "36be0858-33f4-4c65-a74f-e18a76ce8eea", + "id": "cell-31", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
household_idcongressional_district_geoidcountyhousehold_weightstate_fipssnap
050000201NORTH_SLOPE_BOROUGH_AK3.520.000000
1121250003701HALIFAX_COUNTY_NC2.53770.080002
\n", - "
" - ], - "text/plain": [ - " household_id congressional_district_geoid county \\\n", - "0 50000 201 NORTH_SLOPE_BOROUGH_AK \n", - "112 125000 3701 HALIFAX_COUNTY_NC \n", - "\n", - " household_weight state_fips snap \n", - "0 3.5 2 0.000000 \n", - "112 2.5 37 70.080002 " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleaned up calibration_output/\n" + ] } ], "source": [ - "new_hh_ids = mapping_df.loc[mapping_df.original_household_id == hh_id].new_household_id\n", - "hh_after_df.loc[hh_after_df.household_id.isin(new_hh_ids)]" + "import shutil\n", + "\n", + "shutil.rmtree(output_dir)\n", + "print(f\"Cleaned up {output_dir}/\")" ] }, { "cell_type": "markdown", - "id": "96fa8407-008f-4eaa-8f22-a803b72e71e4", + "id": "cell-32", "metadata": {}, "source": [ - "And we can see that the snap numbers still match their values from the different US state systems. However note that due to the use of policyengine-core's random function in a component of snap_gross_income, for some households, the value in the final simulation will not match the one used in creating the X matrix (`X_sparse` here). This is outlined in [Issue 412](https://github.com/PolicyEngine/policyengine-core/issues/412)." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "90ee3a8b-d529-41f2-83ee-d543c53b5492", - "metadata": {}, - "outputs": [], - "source": [ - "%rm -r calibration_output" + "## Summary\n", + "\n", + "The clone-based calibration pipeline has six stages:\n", + "\n", + "1. **Clone + assign geography** — `assign_random_geography()` creates N copies of each CPS record, each with a population-weighted random census block.\n", + "2. **Simulate** — `_simulate_clone()` sets each clone's `state_fips` and recalculates state-dependent benefits.\n", + "3. **Geographic masking** — `state_to_cols` / `cd_to_cols` restrict each target row to geographically relevant columns.\n", + "4. 
**Re-randomize takeup** — `rerandomize_takeup()` draws new takeup per census block, breaking the fixed-takeup assumption.\n", + "5. **Build matrix** — `UnifiedMatrixBuilder.build_matrix()` assembles the sparse CSR matrix from all clones.\n", + "6. **Stacked datasets** — `create_sparse_cd_stacked_dataset()` converts calibrated weights into CD-level h5 files.\n", + "\n", + "For matrix diagnostics (row/column anatomy, target groups, sparsity analysis), see [calibration_matrix.ipynb](calibration_matrix.ipynb)." ] } ], diff --git a/modal_app/remote_calibration_runner.py b/modal_app/remote_calibration_runner.py index 43e354456..689d245dd 100644 --- a/modal_app/remote_calibration_runner.py +++ b/modal_app/remote_calibration_runner.py @@ -53,10 +53,7 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: elif line.startswith("DATASET:"): dataset_path = line.split("DATASET:")[1].strip() - script_path = ( - "policyengine_us_data/datasets/cps/" - "local_area_calibration/fit_calibration_weights.py" - ) + script_path = "policyengine_us_data/calibration/unified_calibration.py" result = subprocess.run( [ "uv", @@ -69,7 +66,7 @@ def _fit_weights_impl(branch: str, epochs: int) -> dict: str(epochs), "--db-path", db_path, - "--dataset-path", + "--dataset", dataset_path, ], capture_output=True, diff --git a/policyengine_us_data/calibration/__init__.py b/policyengine_us_data/calibration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/policyengine_us_data/calibration/clone_and_assign.py b/policyengine_us_data/calibration/clone_and_assign.py new file mode 100644 index 000000000..9aa64cbbc --- /dev/null +++ b/policyengine_us_data/calibration/clone_and_assign.py @@ -0,0 +1,145 @@ +"""Clone CPS records and assign random geography.""" + +import logging +from functools import lru_cache +from dataclasses import dataclass + +import numpy as np +import pandas as pd + +from policyengine_us_data.storage import STORAGE_FOLDER + +logger = logging.getLogger(__name__) + + 
+@dataclass +class GeographyAssignment: + """Random geography assignment for cloned CPS records. + + All arrays have length n_records * n_clones. + Index i corresponds to clone i // n_records, + record i % n_records. + """ + + block_geoid: np.ndarray # str array, 15-char block GEOIDs + cd_geoid: np.ndarray # str array of CD GEOIDs + state_fips: np.ndarray # int array of 2-digit state FIPS + n_records: int + n_clones: int + + +@lru_cache(maxsize=1) +def load_global_block_distribution(): + """Load block_cd_distributions.csv.gz and build + global distribution. + + Returns: + Tuple of (block_geoids, cd_geoids, state_fips, + probabilities) where each is a numpy array indexed + by block row. Probabilities are normalized to sum + to 1 globally. + + Raises: + FileNotFoundError: If the CSV file does not exist. + """ + csv_path = STORAGE_FOLDER / "block_cd_distributions.csv.gz" + if not csv_path.exists(): + raise FileNotFoundError( + f"{csv_path} not found. " + "Run make_block_cd_distributions.py to generate." + ) + + df = pd.read_csv(csv_path, dtype={"block_geoid": str}) + + block_geoids = df["block_geoid"].values + cd_geoids = df["cd_geoid"].astype(str).values + state_fips = np.array([int(b[:2]) for b in block_geoids]) + + probs = df["probability"].values.astype(np.float64) + probs = probs / probs.sum() + + return block_geoids, cd_geoids, state_fips, probs + + +def assign_random_geography( + n_records: int, + n_clones: int = 10, + seed: int = 42, +) -> GeographyAssignment: + """Assign random census block geography to cloned + CPS records. + + Each of n_records * n_clones total records gets a + random census block sampled from the global + population-weighted distribution. State and CD are + derived from the block GEOID. + + Args: + n_records: Number of households in the base CPS + dataset. + n_clones: Number of clones (default 10). + seed: Random seed for reproducibility. + + Returns: + GeographyAssignment with arrays of length + n_records * n_clones. 
+ """ + blocks, cds, states, probs = load_global_block_distribution() + + n_total = n_records * n_clones + rng = np.random.default_rng(seed) + indices = rng.choice(len(blocks), size=n_total, p=probs) + + return GeographyAssignment( + block_geoid=blocks[indices], + cd_geoid=cds[indices], + state_fips=states[indices], + n_records=n_records, + n_clones=n_clones, + ) + + +def double_geography_for_puf( + geography: GeographyAssignment, +) -> GeographyAssignment: + """Double geography arrays for PUF clone step. + + After PUF cloning doubles the base records, the geography + assignment must also double: each record and its PUF copy + share the same geographic assignment. + + The output has n_records = 2 * geography.n_records, with + the first half being the CPS records and the second half + being the PUF copies. + + Args: + geography: Original geography assignment. + + Returns: + New GeographyAssignment with doubled n_records. + """ + n_old = geography.n_records + n_new = n_old * 2 + n_clones = geography.n_clones + + new_blocks = [] + new_cds = [] + new_states = [] + + for c in range(n_clones): + start = c * n_old + end = start + n_old + clone_blocks = geography.block_geoid[start:end] + clone_cds = geography.cd_geoid[start:end] + clone_states = geography.state_fips[start:end] + new_blocks.append(np.concatenate([clone_blocks, clone_blocks])) + new_cds.append(np.concatenate([clone_cds, clone_cds])) + new_states.append(np.concatenate([clone_states, clone_states])) + + return GeographyAssignment( + block_geoid=np.concatenate(new_blocks), + cd_geoid=np.concatenate(new_cds), + state_fips=np.concatenate(new_states), + n_records=n_new, + n_clones=n_clones, + ) diff --git a/policyengine_us_data/calibration/unified_calibration.py b/policyengine_us_data/calibration/unified_calibration.py new file mode 100644 index 000000000..d2759b34b --- /dev/null +++ b/policyengine_us_data/calibration/unified_calibration.py @@ -0,0 +1,637 @@ +""" +Unified L0 calibration pipeline. 
+ +Pipeline flow: + 1. Load CPS dataset -> get n_records + 2. Clone Nx, assign random geography (census block) + 3. Re-randomize simple takeup variables per block + 4. Build sparse calibration matrix (clone-by-clone) + 5. L0-regularized optimization -> calibrated weights + 6. Save weights, diagnostics, run config + +Two presets control output size via L0 regularization: +- local: L0=1e-8, ~3-4M records (for local area dataset) +- national: L0=1e-4, ~50K records (for web app) + +Usage: + python -m policyengine_us_data.calibration.unified_calibration \\ + --dataset path/to/cps_2024.h5 \\ + --db-path path/to/policy_data.db \\ + --output path/to/weights.npy \\ + --preset local \\ + --epochs 100 +""" + +import argparse +import builtins +import logging +import sys +from pathlib import Path + +import numpy as np + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + stream=sys.stderr, +) +logger = logging.getLogger(__name__) + +PRESETS = { + "local": 1e-8, + "national": 1e-4, +} + +BETA = 0.35 +GAMMA = -0.1 +ZETA = 1.1 +INIT_KEEP_PROB = 0.999 +LOG_WEIGHT_JITTER_SD = 0.05 +LOG_ALPHA_JITTER_SD = 0.01 +LAMBDA_L2 = 1e-12 +LEARNING_RATE = 0.15 +DEFAULT_EPOCHS = 100 +DEFAULT_N_CLONES = 10 + +SIMPLE_TAKEUP_VARS = [ + { + "variable": "takes_up_snap_if_eligible", + "entity": "spm_unit", + "rate_key": "snap", + }, + { + "variable": "takes_up_aca_if_eligible", + "entity": "tax_unit", + "rate_key": "aca", + }, + { + "variable": "takes_up_dc_ptc", + "entity": "tax_unit", + "rate_key": "dc_ptc", + }, + { + "variable": "takes_up_head_start_if_eligible", + "entity": "person", + "rate_key": "head_start", + }, + { + "variable": "takes_up_early_head_start_if_eligible", + "entity": "person", + "rate_key": "early_head_start", + }, + { + "variable": "takes_up_ssi_if_eligible", + "entity": "person", + "rate_key": "ssi", + }, + { + "variable": "would_file_taxes_voluntarily", + "entity": "tax_unit", + "rate_key": "voluntary_filing", + }, + { + 
"variable": "takes_up_medicaid_if_eligible", + "entity": "person", + "rate_key": "medicaid", + }, +] + + +def rerandomize_takeup( + sim, + clone_block_geoids: np.ndarray, + clone_state_fips: np.ndarray, + time_period: int, +) -> None: + """Re-randomize simple takeup variables per census block. + + Groups entities by their household's block GEOID and draws + new takeup booleans using seeded_rng(var_name, salt=block). + Overrides the simulation's stored inputs. + + Args: + sim: Microsimulation instance (already has state_fips). + clone_block_geoids: Block GEOIDs per household. + clone_state_fips: State FIPS per household. + time_period: Tax year. + """ + from policyengine_us_data.parameters import ( + load_take_up_rate, + ) + from policyengine_us_data.utils.randomness import ( + seeded_rng, + ) + + n_households = len(clone_block_geoids) + hh_ids = sim.calculate("household_id", map_to="household").values + hh_to_block = dict(zip(hh_ids, clone_block_geoids)) + hh_to_state = dict(zip(hh_ids, clone_state_fips)) + + for spec in SIMPLE_TAKEUP_VARS: + var_name = spec["variable"] + entity_level = spec["entity"] + rate_key = spec["rate_key"] + + rate_or_dict = load_take_up_rate(rate_key, time_period) + + is_state_specific = isinstance(rate_or_dict, dict) + + entity_ids = sim.calculate( + f"{entity_level}_id", map_to=entity_level + ).values + entity_hh_ids = sim.calculate( + "household_id", map_to=entity_level + ).values + n_entities = len(entity_ids) + + draws = np.zeros(n_entities, dtype=np.float64) + rates = np.zeros(n_entities, dtype=np.float64) + + entity_blocks = np.array( + [hh_to_block.get(hid, "0") for hid in entity_hh_ids] + ) + + unique_blocks = np.unique(entity_blocks) + for block in unique_blocks: + mask = entity_blocks == block + n_in_block = mask.sum() + rng = seeded_rng(var_name, salt=str(block)) + draws[mask] = rng.random(n_in_block) + + if is_state_specific: + block_hh_ids = entity_hh_ids[mask] + for i, hid in enumerate(block_hh_ids): + state = 
int(hh_to_state.get(hid, 0)) + state_str = str(state) + r = rate_or_dict.get( + state_str, + rate_or_dict.get(state, 0.8), + ) + idx = np.where(mask)[0][i] + rates[idx] = r + else: + rates[mask] = rate_or_dict + + new_values = draws < rates + sim.set_input(var_name, time_period, new_values) + + +def parse_args(argv=None): + parser = argparse.ArgumentParser( + description="Unified L0 calibration pipeline" + ) + parser.add_argument( + "--dataset", + default=None, + help="Path to CPS h5 file", + ) + parser.add_argument( + "--db-path", + default=None, + help="Path to policy_data.db", + ) + parser.add_argument( + "--output", + default=None, + help="Path to save weights (.npy)", + ) + parser.add_argument( + "--n-clones", + type=int, + default=DEFAULT_N_CLONES, + help=f"Number of clones (default: {DEFAULT_N_CLONES})", + ) + parser.add_argument( + "--preset", + choices=list(PRESETS.keys()), + default=None, + help="L0 preset: local or national", + ) + parser.add_argument( + "--lambda-l0", + type=float, + default=None, + help="Custom L0 penalty (overrides preset)", + ) + parser.add_argument( + "--epochs", + type=int, + default=DEFAULT_EPOCHS, + help=f"Training epochs (default: {DEFAULT_EPOCHS})", + ) + parser.add_argument( + "--device", + default="cpu", + choices=["cpu", "cuda"], + help="Device for training", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for geography assignment", + ) + parser.add_argument( + "--domain-variables", + type=str, + default=None, + help=( + "Comma-separated domain variables for " "target_overview filtering" + ), + ) + parser.add_argument( + "--hierarchical-domains", + type=str, + default=None, + help=( + "Comma-separated domains for hierarchical " + "uprating + CD reconciliation" + ), + ) + parser.add_argument( + "--skip-takeup-rerandomize", + action="store_true", + help="Skip takeup re-randomization", + ) + return parser.parse_args(argv) + + +def fit_l0_weights( + X_sparse, + targets: np.ndarray, + 
lambda_l0: float, + epochs: int = DEFAULT_EPOCHS, + device: str = "cpu", + verbose_freq: int = None, +) -> np.ndarray: + """Fit L0-regularized calibration weights. + + Args: + X_sparse: Sparse matrix (targets x records). + targets: Target values array. + lambda_l0: L0 regularization strength. + epochs: Training epochs. + device: Torch device. + verbose_freq: Print frequency. Defaults to 10%. + + Returns: + Weight array of shape (n_records,). + """ + import time + + try: + from l0.calibration import SparseCalibrationWeights + except ImportError: + raise ImportError( + "l0-python required. " "Install: pip install l0-python" + ) + + import torch + + n_total = X_sparse.shape[1] + initial_weights = np.ones(n_total) * 100 + + logger.info( + "L0 calibration: %d targets, %d features, " + "lambda_l0=%.1e, epochs=%d", + X_sparse.shape[0], + n_total, + lambda_l0, + epochs, + ) + + model = SparseCalibrationWeights( + n_features=n_total, + beta=BETA, + gamma=GAMMA, + zeta=ZETA, + init_keep_prob=INIT_KEEP_PROB, + init_weights=initial_weights, + log_weight_jitter_sd=LOG_WEIGHT_JITTER_SD, + log_alpha_jitter_sd=LOG_ALPHA_JITTER_SD, + device=device, + ) + + if verbose_freq is None: + verbose_freq = max(1, epochs // 10) + + _builtin_print = builtins.print + + def _flushed_print(*args, **kwargs): + _builtin_print(*args, **kwargs) + sys.stdout.flush() + + builtins.print = _flushed_print + + t0 = time.time() + try: + model.fit( + M=X_sparse, + y=targets, + target_groups=None, + lambda_l0=lambda_l0, + lambda_l2=LAMBDA_L2, + lr=LEARNING_RATE, + epochs=epochs, + loss_type="relative", + verbose=True, + verbose_freq=verbose_freq, + ) + finally: + builtins.print = _builtin_print + + elapsed = time.time() - t0 + logger.info( + "L0 done in %.1f min (%.1f sec/epoch)", + elapsed / 60, + elapsed / epochs, + ) + + with torch.no_grad(): + weights = model.get_weights(deterministic=True).cpu().numpy() + + n_nz = (weights > 0).sum() + logger.info( + "Non-zero: %d / %d (%.1f%% sparsity)", + n_nz, + 
n_total, + (1 - n_nz / n_total) * 100, + ) + return weights + + +def compute_diagnostics( + weights: np.ndarray, + X_sparse, + targets_df, + target_names: list, +) -> "pd.DataFrame": + import pandas as pd + + estimates = X_sparse.dot(weights) + true_values = targets_df["value"].values + row_sums = np.array(X_sparse.sum(axis=1)).flatten() + + rel_errors = np.where( + np.abs(true_values) > 0, + (estimates - true_values) / np.abs(true_values), + 0.0, + ) + return pd.DataFrame( + { + "target": target_names, + "true_value": true_values, + "estimate": estimates, + "rel_error": rel_errors, + "abs_rel_error": np.abs(rel_errors), + "achievable": row_sums > 0, + } + ) + + +def run_calibration( + dataset_path: str, + db_path: str, + n_clones: int = DEFAULT_N_CLONES, + lambda_l0: float = 1e-8, + epochs: int = DEFAULT_EPOCHS, + device: str = "cpu", + seed: int = 42, + domain_variables: list = None, + hierarchical_domains: list = None, + skip_takeup_rerandomize: bool = False, +): + """Run unified calibration pipeline. + + Args: + dataset_path: Path to CPS h5 file. + db_path: Path to policy_data.db. + n_clones: Number of dataset clones. + lambda_l0: L0 regularization strength. + epochs: Training epochs. + device: Torch device. + seed: Random seed. + domain_variables: Filter targets by domain variable. + hierarchical_domains: Domains for hierarchical + uprating + CD reconciliation. + skip_takeup_rerandomize: Skip takeup step. 
+ + Returns: + (weights, targets_df, X_sparse, target_names) + """ + import time + + from policyengine_us import Microsimulation + + from policyengine_us_data.calibration.clone_and_assign import ( + assign_random_geography, + ) + from policyengine_us_data.calibration.unified_matrix_builder import ( + UnifiedMatrixBuilder, + ) + + t0 = time.time() + + # Step 1: Load dataset + logger.info("Loading dataset from %s", dataset_path) + sim = Microsimulation(dataset=dataset_path) + n_records = len(sim.calculate("household_id", map_to="household").values) + logger.info("Loaded %d households", n_records) + + # Step 2: Clone and assign geography + logger.info( + "Assigning geography: %d x %d = %d total", + n_records, + n_clones, + n_records * n_clones, + ) + geography = assign_random_geography( + n_records=n_records, + n_clones=n_clones, + seed=seed, + ) + + # Step 3: Build sim_modifier for takeup rerandomization + sim_modifier = None + if not skip_takeup_rerandomize: + time_period = 2024 + + def sim_modifier(s, clone_idx): + col_start = clone_idx * n_records + col_end = col_start + n_records + blocks = geography.block_geoid[col_start:col_end] + states = geography.state_fips[col_start:col_end] + rerandomize_takeup(s, blocks, states, time_period) + + # Step 4: Build target filter + target_filter = {} + if domain_variables: + target_filter["domain_variables"] = domain_variables + + # Step 5: Build sparse calibration matrix + t_matrix = time.time() + db_uri = f"sqlite:///{db_path}" + builder = UnifiedMatrixBuilder( + db_uri=db_uri, + time_period=2024, + dataset_path=dataset_path, + ) + targets_df, X_sparse, target_names = builder.build_matrix( + geography=geography, + sim=sim, + target_filter=target_filter, + hierarchical_domains=hierarchical_domains, + sim_modifier=sim_modifier, + ) + + builder.print_uprating_summary(targets_df) + logger.info( + "Matrix built in %.1f min", + (time.time() - t_matrix) / 60, + ) + logger.info( + "Matrix shape: %s, nnz: %d", + X_sparse.shape, + 
X_sparse.nnz, + ) + + # Step 6: L0 calibration + targets = targets_df["value"].values + + row_sums = np.array(X_sparse.sum(axis=1)).flatten() + achievable = row_sums > 0 + logger.info( + "Achievable: %d / %d targets", + achievable.sum(), + len(achievable), + ) + + weights = fit_l0_weights( + X_sparse=X_sparse, + targets=targets, + lambda_l0=lambda_l0, + epochs=epochs, + device=device, + ) + + logger.info( + "Total pipeline: %.1f min", + (time.time() - t0) / 60, + ) + return weights, targets_df, X_sparse, target_names + + +def main(argv=None): + import json + import time + + import pandas as pd + + try: + if not sys.stderr.isatty(): + sys.stderr.reconfigure(line_buffering=True) + if not sys.stdout.isatty(): + sys.stdout.reconfigure(line_buffering=True) + except AttributeError: + pass + + args = parse_args(argv) + + from policyengine_us_data.storage import STORAGE_FOLDER + + dataset_path = args.dataset or str( + STORAGE_FOLDER / "stratified_extended_cps_2024.h5" + ) + db_path = args.db_path or str( + STORAGE_FOLDER / "calibration" / "policy_data.db" + ) + output_path = args.output or str( + STORAGE_FOLDER / "calibration" / "unified_weights.npy" + ) + + if args.lambda_l0 is not None: + lambda_l0 = args.lambda_l0 + elif args.preset is not None: + lambda_l0 = PRESETS[args.preset] + else: + lambda_l0 = PRESETS["local"] + + domain_variables = None + if args.domain_variables: + domain_variables = [ + x.strip() for x in args.domain_variables.split(",") + ] + + hierarchical_domains = None + if args.hierarchical_domains: + hierarchical_domains = [ + x.strip() for x in args.hierarchical_domains.split(",") + ] + + t_start = time.time() + + weights, targets_df, X_sparse, target_names = run_calibration( + dataset_path=dataset_path, + db_path=db_path, + n_clones=args.n_clones, + lambda_l0=lambda_l0, + epochs=args.epochs, + device=args.device, + seed=args.seed, + domain_variables=domain_variables, + hierarchical_domains=hierarchical_domains, + 
skip_takeup_rerandomize=(args.skip_takeup_rerandomize), + ) + + # Save weights + np.save(output_path, weights) + logger.info("Weights saved to %s", output_path) + print(f"OUTPUT_PATH:{output_path}") + + # Save diagnostics + output_dir = Path(output_path).parent + diag_df = compute_diagnostics(weights, X_sparse, targets_df, target_names) + diag_path = output_dir / "unified_diagnostics.csv" + diag_df.to_csv(diag_path, index=False) + + ach = diag_df[diag_df.achievable] + err_pct = ach.abs_rel_error * 100 + logger.info( + "Diagnostics: %d targets, " + "mean=%.1f%%, median=%.1f%%, " + "<10%%=%.1f%%, <25%%=%.1f%%", + len(ach), + err_pct.mean(), + err_pct.median(), + (err_pct < 10).mean() * 100, + (err_pct < 25).mean() * 100, + ) + + # Save run config + t_end = time.time() + run_config = { + "dataset": dataset_path, + "db_path": db_path, + "n_clones": args.n_clones, + "lambda_l0": lambda_l0, + "epochs": args.epochs, + "device": args.device, + "seed": args.seed, + "domain_variables": domain_variables, + "hierarchical_domains": hierarchical_domains, + "n_targets": len(targets_df), + "n_records": X_sparse.shape[1], + "weight_sum": float(weights.sum()), + "weight_nonzero": int((weights > 0).sum()), + "mean_error_pct": float(err_pct.mean()), + "elapsed_seconds": round(t_end - t_start, 1), + } + config_path = output_dir / "unified_run_config.json" + with open(config_path, "w") as f: + json.dump(run_config, f, indent=2) + logger.info("Config saved to %s", config_path) + print(f"LOG_PATH:{diag_path}") + + +if __name__ == "__main__": + main() diff --git a/policyengine_us_data/calibration/unified_matrix_builder.py b/policyengine_us_data/calibration/unified_matrix_builder.py new file mode 100644 index 000000000..ac31c34e1 --- /dev/null +++ b/policyengine_us_data/calibration/unified_matrix_builder.py @@ -0,0 +1,906 @@ +""" +Unified sparse matrix builder for clone-based calibration. + +Builds a sparse calibration matrix for cloned+geography-assigned CPS +records. 
Processes clone-by-clone: for each clone, sets each +record's state_fips to its assigned value, simulates, and extracts +variable values. + +Matrix shape: (n_targets, n_records * n_clones) +Column ordering: index i = clone_idx * n_records + record_idx +""" + +import logging +from collections import defaultdict +from typing import Dict, List, Optional, Tuple + +import numpy as np +import pandas as pd +from scipy import sparse +from sqlalchemy import create_engine, text + +from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.utils.census import STATE_NAME_TO_FIPS +from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( + get_calculated_variables, + apply_op, + get_geo_level, +) + +logger = logging.getLogger(__name__) + +_GEO_VARS = { + "state_fips", + "state_code", + "congressional_district_geoid", +} + + +class UnifiedMatrixBuilder: + """Build sparse calibration matrix for cloned CPS records. + + Processes clone-by-clone: each clone's records get their + assigned geography, are simulated, and the results fill + the corresponding columns. + + Args: + db_uri: SQLAlchemy database URI. + time_period: Tax year for calibration (e.g. 2024). + dataset_path: Path to the base extended CPS h5 file. 
+ """ + + def __init__( + self, + db_uri: str, + time_period: int, + dataset_path: Optional[str] = None, + ): + self.db_uri = db_uri + self.engine = create_engine(db_uri) + self.time_period = time_period + self.dataset_path = dataset_path + self._entity_rel_cache = None + + # --------------------------------------------------------------- + # Entity relationships + # --------------------------------------------------------------- + + def _build_entity_relationship(self, sim) -> pd.DataFrame: + if self._entity_rel_cache is not None: + return self._entity_rel_cache + + self._entity_rel_cache = pd.DataFrame( + { + "person_id": sim.calculate( + "person_id", map_to="person" + ).values, + "household_id": sim.calculate( + "household_id", map_to="person" + ).values, + "tax_unit_id": sim.calculate( + "tax_unit_id", map_to="person" + ).values, + "spm_unit_id": sim.calculate( + "spm_unit_id", map_to="person" + ).values, + } + ) + return self._entity_rel_cache + + # --------------------------------------------------------------- + # Constraint evaluation + # --------------------------------------------------------------- + + def _evaluate_constraints_entity_aware( + self, + sim, + constraints: List[dict], + n_households: int, + ) -> np.ndarray: + """Evaluate constraints at person level, aggregate to + household level via .any().""" + if not constraints: + return np.ones(n_households, dtype=bool) + + entity_rel = self._build_entity_relationship(sim) + n_persons = len(entity_rel) + person_mask = np.ones(n_persons, dtype=bool) + + for c in constraints: + try: + vals = sim.calculate( + c["variable"], + self.time_period, + map_to="person", + ).values + except Exception as exc: + logger.warning( + "Cannot evaluate constraint '%s': %s", + c["variable"], + exc, + ) + return np.zeros(n_households, dtype=bool) + person_mask &= apply_op(vals, c["operation"], c["value"]) + + df = entity_rel.copy() + df["satisfies"] = person_mask + hh_mask = df.groupby("household_id")["satisfies"].any() + 
+ household_ids = sim.calculate( + "household_id", map_to="household" + ).values + return np.array([hh_mask.get(hid, False) for hid in household_ids]) + + # --------------------------------------------------------------- + # Database queries + # --------------------------------------------------------------- + + def _get_stratum_constraints(self, stratum_id: int) -> List[dict]: + query = """ + SELECT constraint_variable AS variable, operation, value + FROM stratum_constraints + WHERE stratum_id = :stratum_id + """ + with self.engine.connect() as conn: + df = pd.read_sql( + query, + conn, + params={"stratum_id": int(stratum_id)}, + ) + return df.to_dict("records") + + def _query_targets(self, target_filter: dict) -> pd.DataFrame: + """Query targets via target_overview view with + best-period selection.""" + or_conditions = [] + + if "domain_variables" in target_filter: + dvs = target_filter["domain_variables"] + ph = ",".join(f"'{dv}'" for dv in dvs) + or_conditions.append(f"tv.domain_variable IN ({ph})") + + if "variables" in target_filter: + vs = ",".join(f"'{v}'" for v in target_filter["variables"]) + or_conditions.append(f"tv.variable IN ({vs})") + + if "target_ids" in target_filter: + ids = ",".join(map(str, target_filter["target_ids"])) + or_conditions.append(f"tv.target_id IN ({ids})") + + if "stratum_ids" in target_filter: + ids = ",".join(map(str, target_filter["stratum_ids"])) + or_conditions.append(f"tv.stratum_id IN ({ids})") + + if not or_conditions: + where_clause = "1=1" + else: + where_clause = " OR ".join(f"({c})" for c in or_conditions) + + query = f""" + WITH filtered_targets AS ( + SELECT tv.target_id, tv.stratum_id, tv.variable, + tv.value, tv.period, tv.geo_level, + tv.geographic_id, tv.domain_variable + FROM target_overview tv + WHERE {where_clause} + ), + best_periods AS ( + SELECT stratum_id, variable, + CASE + WHEN MAX(CASE WHEN period <= :time_period + THEN period END) IS NOT NULL + THEN MAX(CASE WHEN period <= :time_period + THEN period 
END) + ELSE MIN(period) + END as best_period + FROM filtered_targets + GROUP BY stratum_id, variable + ) + SELECT ft.* + FROM filtered_targets ft + JOIN best_periods bp + ON ft.stratum_id = bp.stratum_id + AND ft.variable = bp.variable + AND ft.period = bp.best_period + ORDER BY ft.target_id + """ + + with self.engine.connect() as conn: + return pd.read_sql( + query, + conn, + params={"time_period": self.time_period}, + ) + + # --------------------------------------------------------------- + # Uprating + # --------------------------------------------------------------- + + def _calculate_uprating_factors(self, params) -> dict: + factors = {} + query = ( + "SELECT DISTINCT period FROM targets " + "WHERE period IS NOT NULL ORDER BY period" + ) + with self.engine.connect() as conn: + result = conn.execute(text(query)) + years_needed = [row[0] for row in result] + + for from_year in years_needed: + if from_year == self.time_period: + factors[(from_year, "cpi")] = 1.0 + factors[(from_year, "pop")] = 1.0 + continue + + try: + cpi_from = params.gov.bls.cpi.cpi_u(from_year) + cpi_to = params.gov.bls.cpi.cpi_u(self.time_period) + factors[(from_year, "cpi")] = float(cpi_to / cpi_from) + except Exception: + factors[(from_year, "cpi")] = 1.0 + + try: + pop_from = params.calibration.gov.census.populations.total( + from_year + ) + pop_to = params.calibration.gov.census.populations.total( + self.time_period + ) + factors[(from_year, "pop")] = float(pop_to / pop_from) + except Exception: + factors[(from_year, "pop")] = 1.0 + + return factors + + def _get_uprating_info( + self, + variable: str, + period: int, + factors: dict, + ) -> Tuple[float, str]: + if period == self.time_period: + return 1.0, "none" + + count_indicators = [ + "count", + "person", + "people", + "households", + "tax_units", + ] + is_count = any(ind in variable.lower() for ind in count_indicators) + uprating_type = "pop" if is_count else "cpi" + factor = factors.get((period, uprating_type), 1.0) + return factor, 
uprating_type + + def _load_aca_ptc_factors( + self, + ) -> Dict[int, Dict[str, float]]: + csv_path = STORAGE_FOLDER / "aca_ptc_multipliers_2022_2024.csv" + df = pd.read_csv(csv_path) + result = {} + for _, row in df.iterrows(): + fips_str = STATE_NAME_TO_FIPS.get(row["state"]) + if fips_str is None: + continue + fips_int = int(fips_str) + result[fips_int] = { + "tax_unit_count": row["vol_mult"], + "aca_ptc": row["vol_mult"] * row["val_mult"], + } + return result + + def _get_state_uprating_factors( + self, + domain: str, + targets_df: pd.DataFrame, + national_factors: dict, + ) -> Dict[int, Dict[str, float]]: + state_rows = targets_df[ + (targets_df["domain_variable"] == domain) + & (targets_df["geo_level"] == "state") + ] + state_fips_list = state_rows["geographic_id"].unique() + variables = state_rows["variable"].unique() + + if domain == "aca_ptc": + csv_factors = self._load_aca_ptc_factors() + else: + csv_factors = None + + result = {} + for sf in state_fips_list: + state_int = int(sf) + var_factors = {} + + if csv_factors and state_int in csv_factors: + for var in variables: + var_factors[var] = csv_factors[state_int].get(var, 1.0) + else: + for var in variables: + row = state_rows[ + (state_rows["geographic_id"] == sf) + & (state_rows["variable"] == var) + ] + if row.empty: + var_factors[var] = 1.0 + continue + period = row.iloc[0]["period"] + factor, _ = self._get_uprating_info( + var, period, national_factors + ) + var_factors[var] = factor + + result[state_int] = var_factors + + return result + + def _apply_hierarchical_uprating( + self, + targets_df: pd.DataFrame, + hierarchical_domains: List[str], + national_factors: dict, + ) -> pd.DataFrame: + """Apply state-level uprating and reconcile CDs. + + Two factors per CD row: + - hif: state_original / sum(cd_originals) + - uprating_factor: state-specific scaling + + Final CD value = original * hif * uprating_factor. 
+ """ + df = targets_df.copy() + df["hif"] = np.nan + df["state_uprating_factor"] = np.nan + rows_to_drop = [] + + for domain in hierarchical_domains: + domain_mask = df["domain_variable"] == domain + state_factors = self._get_state_uprating_factors( + domain, df, national_factors + ) + state_mask = domain_mask & (df["geo_level"] == "state") + district_mask = domain_mask & (df["geo_level"] == "district") + + for sf, var_factors in state_factors.items(): + for var, uf in var_factors.items(): + state_row = df[ + state_mask + & (df["geographic_id"] == str(sf)) + & (df["variable"] == var) + ] + if state_row.empty: + continue + state_original = state_row.iloc[0]["original_value"] + + def _cd_in_state(g, s=sf): + try: + return int(g) // 100 == s + except (ValueError, TypeError): + return False + + cd_mask = ( + district_mask + & (df["variable"] == var) + & df["geographic_id"].apply(_cd_in_state) + ) + cd_rows = df[cd_mask] + if cd_rows.empty: + continue + + cd_original_sum = cd_rows["original_value"].sum() + if cd_original_sum == 0: + continue + + hif = state_original / cd_original_sum + for cd_idx in cd_rows.index: + df.at[cd_idx, "hif"] = hif + df.at[cd_idx, "state_uprating_factor"] = uf + df.at[cd_idx, "value"] = ( + df.at[cd_idx, "original_value"] * hif * uf + ) + + # Drop national/state rows used for reconciliation + national_mask = domain_mask & (df["geo_level"] == "national") + for idx in df[national_mask | state_mask].index: + row = df.loc[idx] + if row["period"] != self.time_period: + rows_to_drop.append(idx) + + if rows_to_drop: + df = df.drop(index=rows_to_drop).reset_index(drop=True) + + df["target_period"] = self.time_period + return df + + def print_uprating_summary(self, targets_df: pd.DataFrame) -> None: + has_state_uf = "state_uprating_factor" in targets_df.columns + if has_state_uf: + eff = targets_df["state_uprating_factor"].fillna( + targets_df["uprating_factor"] + ) + else: + eff = targets_df["uprating_factor"] + + uprated = targets_df[eff != 1.0] + 
if len(uprated) == 0: + print("No targets were uprated.") + return + + print("\n" + "=" * 60) + print("UPRATING SUMMARY") + print("=" * 60) + print(f"Uprated {len(uprated)} of " f"{len(targets_df)} targets") + period_counts = uprated["period"].value_counts().sort_index() + for period, count in period_counts.items(): + print(f" Period {period}: {count} targets") + factors = eff[eff != 1.0] + print( + f" Factor range: [{factors.min():.4f}, " f"{factors.max():.4f}]" + ) + + # --------------------------------------------------------------- + # Target naming + # --------------------------------------------------------------- + + @staticmethod + def _make_target_name( + variable: str, + constraints: List[dict], + reform_id: int = 0, + ) -> str: + geo_parts: List[str] = [] + for c in constraints: + if c["variable"] == "state_fips": + geo_parts.append(f"state_{c['value']}") + elif c["variable"] == "congressional_district_geoid": + geo_parts.append(f"cd_{c['value']}") + + parts: List[str] = [] + parts.append("/".join(geo_parts) if geo_parts else "national") + if reform_id > 0: + parts.append(f"{variable}_expenditure") + else: + parts.append(variable) + + non_geo = [c for c in constraints if c["variable"] not in _GEO_VARS] + if non_geo: + strs = [ + f"{c['variable']}{c['operation']}{c['value']}" for c in non_geo + ] + parts.append("[" + ",".join(strs) + "]") + + return "/".join(parts) + + # --------------------------------------------------------------- + # Target value calculation + # --------------------------------------------------------------- + + def _calculate_target_values( + self, + sim, + target_variable: str, + non_geo_constraints: List[dict], + n_households: int, + ) -> np.ndarray: + """Calculate per-household target values. + + For count targets (*_count): count entities per HH + satisfying constraints. + For value targets: multiply values by constraint mask. 
+ """ + is_count = target_variable.endswith("_count") + + if not is_count: + mask = self._evaluate_constraints_entity_aware( + sim, non_geo_constraints, n_households + ) + vals = sim.calculate(target_variable, map_to="household").values + return (vals * mask).astype(np.float32) + + # Count target: entity-aware counting + entity_rel = self._build_entity_relationship(sim) + n_persons = len(entity_rel) + person_mask = np.ones(n_persons, dtype=bool) + + for c in non_geo_constraints: + try: + cv = sim.calculate(c["variable"], map_to="person").values + except Exception: + return np.zeros(n_households, dtype=np.float32) + person_mask &= apply_op(cv, c["operation"], c["value"]) + + target_entity = sim.tax_benefit_system.variables[ + target_variable + ].entity.key + household_ids = sim.calculate( + "household_id", map_to="household" + ).values + + if target_entity == "household": + if non_geo_constraints: + mask = self._evaluate_constraints_entity_aware( + sim, non_geo_constraints, n_households + ) + return mask.astype(np.float32) + return np.ones(n_households, dtype=np.float32) + + if target_entity == "person": + er = entity_rel.copy() + er["satisfies"] = person_mask + filtered = er[er["satisfies"]] + counts = filtered.groupby("household_id")["person_id"].nunique() + else: + eid_col = f"{target_entity}_id" + er = entity_rel.copy() + er["satisfies"] = person_mask + entity_ok = er.groupby(eid_col)["satisfies"].any() + unique = er[["household_id", eid_col]].drop_duplicates() + unique["entity_ok"] = unique[eid_col].map(entity_ok) + filtered = unique[unique["entity_ok"]] + counts = filtered.groupby("household_id")[eid_col].nunique() + + return np.array( + [counts.get(hid, 0) for hid in household_ids], + dtype=np.float32, + ) + + # --------------------------------------------------------------- + # Clone simulation + # --------------------------------------------------------------- + + def _simulate_clone( + self, + clone_state_fips: np.ndarray, + n_records: int, + variables: 
set, + sim_modifier=None, + clone_idx: int = 0, + ) -> Tuple[Dict[str, np.ndarray], object]: + """Simulate one clone with assigned geography. + + Args: + clone_state_fips: State FIPS per record, shape + (n_records,). + n_records: Number of base records. + variables: Target variable names to compute. + sim_modifier: Optional callback(sim, clone_idx) + called after state_fips is set but before + cache clearing. Used for takeup + re-randomization. + clone_idx: Clone index passed to sim_modifier. + + Returns: + (var_values, sim) where var_values maps variable + name to household-level float32 array. + """ + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=self.dataset_path) + sim.set_input( + "state_fips", + self.time_period, + clone_state_fips.astype(np.int32), + ) + if sim_modifier is not None: + sim_modifier(sim, clone_idx) + for var in get_calculated_variables(sim): + sim.delete_arrays(var) + + var_values: Dict[str, np.ndarray] = {} + for var in variables: + if var.endswith("_count"): + continue + try: + var_values[var] = sim.calculate( + var, + self.time_period, + map_to="household", + ).values.astype(np.float32) + except Exception as exc: + logger.warning("Cannot calculate '%s': %s", var, exc) + + return var_values, sim + + # --------------------------------------------------------------- + # Main build method + # --------------------------------------------------------------- + + def build_matrix( + self, + geography, + sim, + target_filter: Optional[dict] = None, + hierarchical_domains: Optional[List[str]] = None, + cache_dir: Optional[str] = None, + sim_modifier=None, + ) -> Tuple[pd.DataFrame, sparse.csr_matrix, List[str]]: + """Build sparse calibration matrix. + + Two-phase build: (1) simulate each clone and save + COO entries to disk, (2) assemble CSR from caches. + + Args: + geography: GeographyAssignment with state_fips, + cd_geoid, block_geoid arrays and n_records, + n_clones attributes. 
+ sim: Microsimulation for parameters and entity + relationships. + target_filter: Dict for target_overview filtering. + hierarchical_domains: Domain names for + hierarchical uprating + CD reconciliation. + cache_dir: Directory for per-clone COO caches. + If None, COO data held in memory. + sim_modifier: Optional callback(sim, clone_idx) + called per clone after state_fips is set but + before cache clearing. Use for takeup + re-randomization. + + Returns: + (targets_df, X_sparse, target_names) + """ + n_records = geography.n_records + n_clones = geography.n_clones + n_total = n_records * n_clones + self._coo_parts = ([], [], []) + + # 1. Query and uprate targets + targets_df = self._query_targets(target_filter or {}) + if len(targets_df) == 0: + raise ValueError("No targets found matching filter") + + params = sim.tax_benefit_system.parameters + uprating_factors = self._calculate_uprating_factors(params) + targets_df["original_value"] = targets_df["value"].copy() + targets_df["uprating_factor"] = targets_df.apply( + lambda row: self._get_uprating_info( + row["variable"], + row["period"], + uprating_factors, + )[0], + axis=1, + ) + targets_df["value"] = ( + targets_df["original_value"] * targets_df["uprating_factor"] + ) + + if hierarchical_domains: + targets_df = self._apply_hierarchical_uprating( + targets_df, + hierarchical_domains, + uprating_factors, + ) + + n_targets = len(targets_df) + + # 2. Sort targets by geographic level + targets_df["_geo_level"] = targets_df["geographic_id"].apply( + get_geo_level + ) + targets_df = targets_df.sort_values( + ["_geo_level", "variable", "geographic_id"] + ) + targets_df = targets_df.drop(columns=["_geo_level"]).reset_index( + drop=True + ) + + # 3. 
Build column index structures from geography + state_col_lists: Dict[int, list] = defaultdict(list) + cd_col_lists: Dict[str, list] = defaultdict(list) + for col in range(n_total): + state_col_lists[int(geography.state_fips[col])].append(col) + cd_col_lists[str(geography.cd_geoid[col])].append(col) + state_to_cols = {s: np.array(c) for s, c in state_col_lists.items()} + cd_to_cols = {cd: np.array(c) for cd, c in cd_col_lists.items()} + + # 4. Pre-process targets: resolve constraints + constraint_cache: Dict[int, List[dict]] = {} + target_geo_info: List[Tuple[str, str]] = [] + target_names: List[str] = [] + non_geo_constraints_list: List[List[dict]] = [] + + for _, row in targets_df.iterrows(): + sid = int(row["stratum_id"]) + if sid not in constraint_cache: + constraint_cache[sid] = self._get_stratum_constraints(sid) + constraints = constraint_cache[sid] + + geo_level = row["geo_level"] + geo_id = row["geographic_id"] + target_geo_info.append((geo_level, geo_id)) + + non_geo = [ + c for c in constraints if c["variable"] not in _GEO_VARS + ] + non_geo_constraints_list.append(non_geo) + + target_names.append( + self._make_target_name(str(row["variable"]), constraints) + ) + + unique_variables = set(targets_df["variable"].values) + + # 5. 
Clone loop + from pathlib import Path + + clone_dir = Path(cache_dir) if cache_dir else None + if clone_dir: + clone_dir.mkdir(parents=True, exist_ok=True) + + self._entity_rel_cache = None + + for clone_idx in range(n_clones): + if clone_dir: + coo_path = clone_dir / f"clone_{clone_idx:04d}.npz" + if coo_path.exists(): + logger.info( + "Clone %d/%d cached, skipping.", + clone_idx + 1, + n_clones, + ) + continue + + col_start = clone_idx * n_records + col_end = col_start + n_records + clone_states = geography.state_fips[col_start:col_end] + + logger.info( + "Processing clone %d/%d " "(cols %d-%d, %d unique states)...", + clone_idx + 1, + n_clones, + col_start, + col_end - 1, + len(np.unique(clone_states)), + ) + + var_values, clone_sim = self._simulate_clone( + clone_states, + n_records, + unique_variables, + sim_modifier=sim_modifier, + clone_idx=clone_idx, + ) + + mask_cache: Dict[tuple, np.ndarray] = {} + count_cache: Dict[tuple, np.ndarray] = {} + + rows_list: list = [] + cols_list: list = [] + vals_list: list = [] + + for row_idx in range(n_targets): + variable = str(targets_df.iloc[row_idx]["variable"]) + geo_level, geo_id = target_geo_info[row_idx] + non_geo = non_geo_constraints_list[row_idx] + + # Geographic column selection + if geo_level == "district": + all_geo_cols = cd_to_cols.get( + str(geo_id), + np.array([], dtype=np.int64), + ) + elif geo_level == "state": + all_geo_cols = state_to_cols.get( + int(geo_id), + np.array([], dtype=np.int64), + ) + else: + all_geo_cols = np.arange(n_total) + + clone_cols = all_geo_cols[ + (all_geo_cols >= col_start) & (all_geo_cols < col_end) + ] + if len(clone_cols) == 0: + continue + + rec_indices = clone_cols - col_start + + constraint_key = tuple( + sorted( + ( + c["variable"], + c["operation"], + c["value"], + ) + for c in non_geo + ) + ) + + if variable.endswith("_count"): + vkey = (variable, constraint_key) + if vkey not in count_cache: + count_cache[vkey] = self._calculate_target_values( + clone_sim, + 
variable, + non_geo, + n_records, + ) + values = count_cache[vkey] + else: + if variable not in var_values: + continue + if constraint_key not in mask_cache: + mask_cache[constraint_key] = ( + self._evaluate_constraints_entity_aware( + clone_sim, + non_geo, + n_records, + ) + ) + mask = mask_cache[constraint_key] + values = var_values[variable] * mask + + vals = values[rec_indices] + nonzero = vals != 0 + if nonzero.any(): + rows_list.append( + np.full( + nonzero.sum(), + row_idx, + dtype=np.int32, + ) + ) + cols_list.append(clone_cols[nonzero].astype(np.int32)) + vals_list.append(vals[nonzero]) + + # Save COO entries + if rows_list: + cr = np.concatenate(rows_list) + cc = np.concatenate(cols_list) + cv = np.concatenate(vals_list) + else: + cr = np.array([], dtype=np.int32) + cc = np.array([], dtype=np.int32) + cv = np.array([], dtype=np.float32) + + if clone_dir: + np.savez_compressed( + str(coo_path), + rows=cr, + cols=cc, + vals=cv, + ) + logger.info( + "Clone %d: %d nonzero entries saved.", + clone_idx + 1, + len(cv), + ) + del var_values, clone_sim + else: + self._coo_parts[0].append(cr) + self._coo_parts[1].append(cc) + self._coo_parts[2].append(cv) + + # 6. 
Assemble sparse matrix from COO data + logger.info("Assembling matrix from %d clones...", n_clones) + if clone_dir: + all_r, all_c, all_v = [], [], [] + for ci in range(n_clones): + p = clone_dir / f"clone_{ci:04d}.npz" + data = np.load(str(p)) + all_r.append(data["rows"]) + all_c.append(data["cols"]) + all_v.append(data["vals"]) + rows = np.concatenate(all_r) + cols = np.concatenate(all_c) + vals = np.concatenate(all_v) + else: + rows = np.concatenate(self._coo_parts[0]) + cols = np.concatenate(self._coo_parts[1]) + vals = np.concatenate(self._coo_parts[2]) + del self._coo_parts + + X_csr = sparse.csr_matrix( + (vals, (rows, cols)), + shape=(n_targets, n_total), + dtype=np.float32, + ) + + logger.info( + "Matrix: %d targets x %d cols, %d nnz", + X_csr.shape[0], + X_csr.shape[1], + X_csr.nnz, + ) + + return targets_df, X_csr, target_names diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py index 3dcab0e9f..97c82360d 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/calibration_utils.py @@ -277,7 +277,7 @@ def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray: return np.ones(len(values), dtype=bool) -def _get_geo_level(geo_id) -> int: +def get_geo_level(geo_id) -> int: """Return geographic level: 0=National, 1=State, 2=District.""" if geo_id == "US": return 0 @@ -324,9 +324,7 @@ def create_target_groups( # Add geo_level column for sorting targets_df = targets_df.copy() - targets_df["_geo_level"] = targets_df["geographic_id"].apply( - _get_geo_level - ) + targets_df["_geo_level"] = targets_df["geographic_id"].apply(get_geo_level) geo_level_names = {0: "National", 1: "State", 2: "District"} @@ -401,6 +399,70 @@ def create_target_groups( return target_groups, group_info +_GEO_LEVEL_NAMES = {0: "National", 1: "State", 2: "District"} + + 
+def drop_target_groups( + targets_df: pd.DataFrame, + X_sparse, + target_groups: np.ndarray, + group_info: List[str], + drop_specs: List[Tuple[str, str]], +) -> Tuple[pd.DataFrame, "sparse.csr_matrix"]: + """Drop target groups by (label_substring, geo_level_name). + + Args: + targets_df: Target metadata from build_matrix. + X_sparse: Sparse calibration matrix (n_targets x n_cols). + target_groups: Group ID per row from create_target_groups. + group_info: Group descriptions from create_target_groups. + drop_specs: List of (label_substring, geo_level_name) + tuples. geo_level_name is "National", "State", or + "District". label_substring is matched case-insensitive + against group descriptions. + + Returns: + (filtered_targets_df, filtered_X_sparse) + """ + geo_levels = targets_df["geographic_id"].apply(get_geo_level) + name_to_level = {v: k for k, v in _GEO_LEVEL_NAMES.items()} + drop_ids = set() + + for label_substr, geo_name in drop_specs: + level = name_to_level[geo_name] + matched = False + for gid, info in enumerate(group_info): + group_mask = target_groups == gid + group_geo = geo_levels[group_mask] + if not (group_geo == level).all(): + continue + if label_substr.lower() in info.lower(): + drop_ids.add(gid) + matched = True + if not matched: + print( + f" WARNING: no match for " f"({label_substr!r}, {geo_name!r})" + ) + + keep_mask = ~np.isin(target_groups, list(drop_ids)) + + print(f"Matrix before: {X_sparse.shape[0]} rows") + for gid in sorted(drop_ids): + n = (target_groups == gid).sum() + print(f" DROPPING {group_info[gid]} ({n} rows)") + print() + + kept_ids = sorted(set(range(len(group_info))) - drop_ids) + for gid in kept_ids: + n = (target_groups == gid).sum() + print(f" KEEPING {group_info[gid]} ({n} rows)") + + X_out = X_sparse[keep_mask, :] + targets_out = targets_df[keep_mask].reset_index(drop=True) + print(f"\nMatrix after: {X_out.shape[0]} rows") + return targets_out, X_out + + def get_all_cds_from_database(db_uri: str) -> List[str]: """ Get 
ordered list of all CD GEOIDs from database. diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py b/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py deleted file mode 100644 index 7185c7dc1..000000000 --- a/policyengine_us_data/datasets/cps/local_area_calibration/fit_calibration_weights.py +++ /dev/null @@ -1,284 +0,0 @@ -""" -Fit calibration weights using L0-regularized optimization. -Prototype script for weight calibration using the l0-python package. -""" - -import argparse -import logging -from datetime import datetime -from pathlib import Path - -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) - -parser = argparse.ArgumentParser(description="Fit calibration weights") -parser.add_argument( - "--device", - default="cpu", - choices=["cpu", "cuda"], - help="Device for training (cpu or cuda)", -) -parser.add_argument( - "--epochs", type=int, default=100, help="Total epochs for training" -) -parser.add_argument( - "--db-path", - default=None, - help="Path to policy_data.db (default: STORAGE_FOLDER/calibration/policy_data.db)", -) -parser.add_argument( - "--dataset-path", default=None, help="Path to stratified CPS h5 file" -) -args = parser.parse_args() - -import numpy as np -import pandas as pd -from policyengine_us import Microsimulation -from policyengine_us_data.storage import STORAGE_FOLDER -from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( - SparseMatrixBuilder, -) -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_all_cds_from_database, -) -from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import ( - MatrixTracer, -) - -try: - import torch - from l0.calibration import SparseCalibrationWeights -except ImportError: - raise ImportError( - "l0-python is required for weight fitting. 
" - "Install with: pip install policyengine-us-data[l0]" - ) - -# ============================================================================ -# CONFIGURATION -# ============================================================================ -DEVICE = args.device -TOTAL_EPOCHS = args.epochs -EPOCHS_PER_CHUNK = 500 # TODO: need a better way to set this. Remember it can blow up the Vercel app - -# Groups to exclude from the matrix (by group ID from tracer output). -# Set to [] to keep all groups. Review tracer.print_matrix_structure() -# output to decide. E.g., drop state-level rows that are linearly -# redundant with reconciled district rows — or keep them to steer -# the optimizer. -GROUPS_TO_EXCLUDE = [1] # drop state SNAP HH counts (redundant with Group 4) - -# Hyperparameters -BETA = 0.35 -GAMMA = -0.1 -ZETA = 1.1 -INIT_KEEP_PROB = 0.999 -LOG_WEIGHT_JITTER_SD = 0.05 -LOG_ALPHA_JITTER_SD = 0.01 -LAMBDA_L0 = 1e-8 -LAMBDA_L2 = 1e-12 -LEARNING_RATE = 0.15 - -# Data paths -if args.db_path: - db_path = Path(args.db_path) -else: - db_path = STORAGE_FOLDER / "calibration" / "policy_data.db" -db_uri = f"sqlite:///{db_path}" - -if args.dataset_path: - dataset_path = Path(args.dataset_path) -else: - dataset_path = STORAGE_FOLDER / "stratified_extended_cps_2024.h5" - -output_dir = STORAGE_FOLDER / "calibration" -output_dir.mkdir(parents=True, exist_ok=True) -time_period = 2024 - -# Get all CDs from database -cds_to_calibrate = get_all_cds_from_database(db_uri) -print(f"Found {len(cds_to_calibrate)} congressional districts") - -# ============================================================================ -# STEP 1: BUILD CALIBRATION MATRIX -# ============================================================================ -print(f"Loading simulation from {dataset_path}...") -sim = Microsimulation(dataset=str(dataset_path)) -n_households = len(sim.calculate("household_id", map_to="household").values) -print(f"Loaded {n_households:,} households") - -print("\nBuilding sparse 
matrix...") -builder = SparseMatrixBuilder( - db_uri=db_uri, - time_period=time_period, - cds_to_calibrate=cds_to_calibrate, - dataset_path=str(dataset_path), -) - -targets_df, X_sparse, household_id_mapping = builder.build_matrix( - sim, - target_filter={ - "domain_variables": ["aca_ptc", "snap"], - }, - hierarchical_domains=["aca_ptc", "snap"], -) - -builder.print_uprating_summary(targets_df) - -tracer = MatrixTracer( - targets_df, X_sparse, household_id_mapping, cds_to_calibrate, sim -) -tracer.print_matrix_structure() - -print(f"\nMatrix shape: {X_sparse.shape}") -print(f"Targets: {len(targets_df)}") - -# ============================================================================ -# STEP 2: FILTER GROUPS AND ACHIEVABLE TARGETS -# ============================================================================ -if GROUPS_TO_EXCLUDE: - keep_mask = ~np.isin(tracer.target_groups, GROUPS_TO_EXCLUDE) - n_dropped = (~keep_mask).sum() - print("\n" + "=" * 60) - print("GROUP EXCLUSION") - print("=" * 60) - print( - f"Excluding groups {GROUPS_TO_EXCLUDE}: " - f"dropping {n_dropped} of {len(targets_df)} rows" - ) - targets_df = targets_df[keep_mask].reset_index(drop=True) - X_sparse = X_sparse[keep_mask, :] - print(f"Matrix after exclusion: {X_sparse.shape}") -else: - print("\nNo groups excluded (GROUPS_TO_EXCLUDE is empty)") - -# Filter to achievable targets (rows with non-zero data) -row_sums = np.array(X_sparse.sum(axis=1)).flatten() -achievable_mask = row_sums > 0 -n_achievable = achievable_mask.sum() -n_impossible = (~achievable_mask).sum() - -print(f"\nAchievable targets: {n_achievable}") -print(f"Impossible targets (filtered out): {n_impossible}") - -targets_df = targets_df[achievable_mask].reset_index(drop=True) -X_sparse = X_sparse[achievable_mask, :] - -print(f"Final matrix shape: {X_sparse.shape}") - -# Extract target vector and names -targets = targets_df["value"].values -target_names = [ - f"{row['geographic_id']}/{row['variable']}" - for _, row in 
targets_df.iterrows() -] - -# ============================================================================ -# STEP 3: INITIALIZE WEIGHTS -# ============================================================================ -initial_weights = np.ones(X_sparse.shape[1]) * 100 -print(f"\nInitial weights shape: {initial_weights.shape}") -print(f"Initial weights sum: {initial_weights.sum():,.0f}") - -# ============================================================================ -# STEP 4: CREATE MODEL -# ============================================================================ -print("\nCreating SparseCalibrationWeights model...") -model = SparseCalibrationWeights( - n_features=X_sparse.shape[1], - beta=BETA, - gamma=GAMMA, - zeta=ZETA, - init_keep_prob=INIT_KEEP_PROB, - init_weights=initial_weights, - log_weight_jitter_sd=LOG_WEIGHT_JITTER_SD, - log_alpha_jitter_sd=LOG_ALPHA_JITTER_SD, - device=DEVICE, -) - -# ============================================================================ -# STEP 5: TRAIN IN CHUNKS -# ============================================================================ -timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -calibration_log = pd.DataFrame() - -for chunk_start in range(0, TOTAL_EPOCHS, EPOCHS_PER_CHUNK): - chunk_epochs = min(EPOCHS_PER_CHUNK, TOTAL_EPOCHS - chunk_start) - current_epoch = chunk_start + chunk_epochs - - print(f"\nTraining epochs {chunk_start + 1} to {current_epoch}...") - - model.fit( - M=X_sparse, - y=targets, - target_groups=None, - lambda_l0=LAMBDA_L0, - lambda_l2=LAMBDA_L2, - lr=LEARNING_RATE, - epochs=chunk_epochs, - loss_type="relative", - verbose=True, - verbose_freq=chunk_epochs, - ) - - with torch.no_grad(): - predictions = model.predict(X_sparse).cpu().numpy() - - chunk_df = pd.DataFrame( - { - "target_name": target_names, - "estimate": predictions, - "target": targets, - } - ) - chunk_df["epoch"] = current_epoch - chunk_df["error"] = chunk_df.estimate - chunk_df.target - chunk_df["rel_error"] = chunk_df.error / 
chunk_df.target - chunk_df["abs_error"] = chunk_df.error.abs() - chunk_df["rel_abs_error"] = chunk_df.rel_error.abs() - chunk_df["loss"] = chunk_df.rel_abs_error**2 - calibration_log = pd.concat([calibration_log, chunk_df], ignore_index=True) - -# ============================================================================ -# STEP 6: EXTRACT AND SAVE WEIGHTS -# ============================================================================ -with torch.no_grad(): - w = model.get_weights(deterministic=True).cpu().numpy() - -print(f"\nFinal weights shape: {w.shape}") -print(f"Final weights sum: {w.sum():,.0f}") -print(f"Non-zero weights: {(w > 0).sum():,}") - -output_path = output_dir / f"calibration_weights_{timestamp}.npy" -np.save(output_path, w) -print(f"\nWeights saved to: {output_path}") -print(f"OUTPUT_PATH:{output_path}") - -log_path = output_dir / f"calibration_log_{timestamp}.csv" -calibration_log.to_csv(log_path, index=False) -print(f"Calibration log saved to: {log_path}") -print(f"LOG_PATH:{log_path}") - -# ============================================================================ -# STEP 7: VERIFY PREDICTIONS -# ============================================================================ -print("\n" + "=" * 60) -print("PREDICTION VERIFICATION") -print("=" * 60) - -with torch.no_grad(): - predictions = model.predict(X_sparse).cpu().numpy() - -for i in range(len(targets)): - rel_error = (predictions[i] - targets[i]) / targets[i] * 100 - print( - f"{target_names[i][:50]:50} | " - f"pred: {predictions[i]:>12,.0f} | " - f"target: {targets[i]:>12,.0f} | " - f"err: {rel_error:>6.2f}%" - ) - -print("\n" + "=" * 60) -print("FITTING COMPLETED") -print("=" * 60) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py b/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py deleted file mode 100644 index 4fbe6e78f..000000000 --- a/policyengine_us_data/datasets/cps/local_area_calibration/matrix_tracer.py +++ /dev/null 
@@ -1,382 +0,0 @@ -""" -Matrix tracer utility for debugging geo-stacking sparse matrices. - -This utility allows tracing through the complex stacked matrix structure -to verify values match simulation results. - -USAGE -===== - -Basic Setup: - - from matrix_tracer import MatrixTracer - - tracer = MatrixTracer( - targets_df, X_sparse, household_id_mapping, - cds_to_calibrate, sim - ) - -Common Operations: - - # 1. Understand what a column represents - col_info = tracer.get_column_info(100) - - # 2. Find where a household appears across all CDs - positions = tracer.get_household_column_positions(565) - - # 3. View matrix structure - tracer.print_matrix_structure() - -Matrix Structure: - - Columns are organized as: [CD1_households | CD2_households | ... | CD436_households] - Each CD block has n_households columns (e.g., 10,580 households) - - Formula to find column index: - column_idx = cd_block_number * n_households + household_index -""" - -import logging -import pandas as pd -import numpy as np -from typing import Dict, List -from scipy import sparse - -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - create_target_groups, -) - -logger = logging.getLogger(__name__) - - -class MatrixTracer: - """Trace through geo-stacked sparse matrices for debugging.""" - - def __init__( - self, - targets_df: pd.DataFrame, - matrix: sparse.csr_matrix, - household_id_mapping: Dict[str, List[str]], - geographic_ids: List[str], - sim, - ): - """ - Initialize tracer with matrix components. 
- - Args: - targets_df: DataFrame of all targets - matrix: The final stacked sparse matrix - household_id_mapping: Mapping from geo keys to household ID lists - geographic_ids: List of geographic IDs in order - sim: Microsimulation instance - """ - self.targets_df = targets_df - self.matrix = matrix - self.household_id_mapping = household_id_mapping - self.geographic_ids = geographic_ids - self.sim = sim - - # Get original household info - self.original_household_ids = sim.calculate("household_id").values - self.n_households = len(self.original_household_ids) - self.n_geographies = len(geographic_ids) - - # Build reverse lookup: original_hh_id -> index in original data - self.hh_id_to_index = { - hh_id: idx for idx, hh_id in enumerate(self.original_household_ids) - } - - # Build column catalog: maps column index -> (cd_geoid, household_id, household_index) - self.column_catalog = self._build_column_catalog() - - # Build row catalog: maps row index -> target info - self.row_catalog = self._build_row_catalog() - - logger.info( - f"Tracer initialized: {self.n_households} households x {self.n_geographies} geographies" - ) - logger.info(f"Matrix shape: {matrix.shape}") - - def _build_column_catalog(self) -> pd.DataFrame: - """Build a complete catalog of all matrix columns.""" - catalog = [] - col_idx = 0 - - for geo_id in self.geographic_ids: - for hh_idx, hh_id in enumerate(self.original_household_ids): - catalog.append( - { - "column_index": col_idx, - "cd_geoid": geo_id, - "household_id": hh_id, - "household_index": hh_idx, - } - ) - col_idx += 1 - - return pd.DataFrame(catalog) - - def _build_row_catalog(self) -> pd.DataFrame: - """Build a complete catalog of all matrix rows (targets).""" - catalog = [] - - for row_idx, (_, target) in enumerate(self.targets_df.iterrows()): - var_name = target["variable"] - var_desc = "" - if var_name in self.sim.tax_benefit_system.variables: - var_obj = self.sim.tax_benefit_system.variables[var_name] - var_desc = getattr(var_obj, 
"label", var_name) - - catalog.append( - { - "row_index": row_idx, - "variable": var_name, - "variable_desc": var_desc, - "geographic_id": target.get("geographic_id", "unknown"), - "target_value": target["value"], - "stratum_id": target.get("stratum_id"), - "domain_variable": target.get( - "domain_variable", "unknown" - ), - } - ) - - return pd.DataFrame(catalog) - - def get_column_info(self, col_idx: int) -> Dict: - """Get information about a specific column.""" - if col_idx >= len(self.column_catalog): - raise ValueError( - f"Column index {col_idx} out of range (max: {len(self.column_catalog)-1})" - ) - return self.column_catalog.iloc[col_idx].to_dict() - - def get_row_info(self, row_idx: int) -> Dict: - """Get information about a specific row (target).""" - if row_idx >= len(self.row_catalog): - raise ValueError( - f"Row index {row_idx} out of range (max: {len(self.row_catalog)-1})" - ) - return self.row_catalog.iloc[row_idx].to_dict() - - def lookup_matrix_cell(self, row_idx: int, col_idx: int) -> Dict: - """ - Look up a specific matrix cell and return complete context. - - Args: - row_idx: Row index in matrix - col_idx: Column index in matrix - - Returns: - Dict with row info, column info, and matrix value - """ - row_info = self.get_row_info(row_idx) - col_info = self.get_column_info(col_idx) - matrix_value = self.matrix[row_idx, col_idx] - - return { - "row_index": row_idx, - "column_index": col_idx, - "matrix_value": float(matrix_value), - "target": row_info, - "household": col_info, - } - - def get_household_column_positions( - self, original_hh_id: int - ) -> Dict[str, int]: - """ - Get all column positions for a household across all geographies. 
- - Args: - original_hh_id: Original household ID from simulation - - Returns: - Dict mapping geo_id to column position in stacked matrix - """ - if original_hh_id not in self.hh_id_to_index: - raise ValueError( - f"Household {original_hh_id} not found in original data" - ) - - # Get the household's index in the original data - hh_index = self.hh_id_to_index[original_hh_id] - - # Calculate column positions for each geography - positions = {} - for geo_idx, geo_id in enumerate(self.geographic_ids): - # Each geography gets a block of n_households columns - col_position = geo_idx * self.n_households + hh_index - positions[geo_id] = col_position - - return positions - - def print_matrix_structure(self, show_groups=True): - """Print a comprehensive breakdown of the matrix structure.""" - print("\n" + "=" * 80) - print("MATRIX STRUCTURE BREAKDOWN") - print("=" * 80) - - print( - f"\nMatrix dimensions: {self.matrix.shape[0]} rows x " - f"{self.matrix.shape[1]} columns" - ) - print(f" Rows = {len(self.row_catalog)} targets") - print( - f" Columns = {self.n_households} households x " - f"{self.n_geographies} CDs" - ) - print( - f" = {self.n_households:,} x {self.n_geographies} " - f"= {self.matrix.shape[1]:,}" - ) - - print("\n" + "-" * 80) - print("COLUMN STRUCTURE (Households stacked by CD)") - print("-" * 80) - - # Build column ranges by CD - col_ranges = [] - cumulative = 0 - for geo_id in self.geographic_ids: - start_col = cumulative - end_col = cumulative + self.n_households - 1 - col_ranges.append( - { - "cd_geoid": geo_id, - "start_col": start_col, - "end_col": end_col, - "n_households": self.n_households, - } - ) - cumulative += self.n_households - - ranges_df = pd.DataFrame(col_ranges) - print(f"\nShowing first and last 5 CDs of {len(ranges_df)} total:") - print("\nFirst 5 CDs:") - print(ranges_df.head(5).to_string(index=False)) - print("\nLast 5 CDs:") - print(ranges_df.tail(5).to_string(index=False)) - - print("\n" + "-" * 80) - print("ROW STRUCTURE (Targets)") 
- print("-" * 80) - - print(f"\nTotal targets: {len(self.row_catalog)}") - - # Summarize by geographic level if column exists - if "geographic_level" in self.row_catalog.columns: - print("\nTargets by geographic level:") - geo_level_summary = ( - self.row_catalog.groupby("geographic_level") - .size() - .reset_index(name="n_targets") - ) - print(geo_level_summary.to_string(index=False)) - - print("\nTargets by domain variable:") - domain_summary = ( - self.row_catalog.groupby("domain_variable") - .agg({"row_index": "count", "variable": lambda x: len(set(x))}) - .rename( - columns={"row_index": "n_targets", "variable": "n_unique_vars"} - ) - ) - print(domain_summary.to_string()) - - # Create and display target groups with row indices - if show_groups: - print("\n" + "-" * 80) - print("TARGET GROUPS (for loss calculation)") - print("-" * 80) - - target_groups, group_info = create_target_groups(self.targets_df) - - # Store for later use - self.target_groups = target_groups - - # Print each group with row indices - for group_id, info in enumerate(group_info): - group_mask = target_groups == group_id - row_indices = np.where(group_mask)[0] - - # Format row indices for display - if len(row_indices) > 6: - row_display = ( - f"[{row_indices[0]}, {row_indices[1]}, " - f"{row_indices[2]}, ..., {row_indices[-2]}, " - f"{row_indices[-1]}]" - ) - else: - row_display = str(row_indices.tolist()) - - print(f" {info} - rows {row_display}") - - print("\n" + "=" * 80) - - def print_column_catalog(self, max_rows: int = 50): - """Print a sample of the column catalog.""" - print( - f"\nColumn Catalog (showing first {max_rows} of {len(self.column_catalog)}):" - ) - print(self.column_catalog.head(max_rows).to_string(index=False)) - - def print_row_catalog(self, max_rows: int = 50): - """Print a sample of the row catalog.""" - print( - f"\nRow Catalog (showing first {max_rows} of {len(self.row_catalog)}):" - ) - print(self.row_catalog.head(max_rows).to_string(index=False)) - - def 
get_group_rows(self, group_id: int) -> pd.DataFrame: - """ - Get all rows belonging to a specific target group. - - Args: - group_id: The group ID to filter by - - Returns: - DataFrame of row catalog entries for this group - """ - if not hasattr(self, "target_groups"): - self.target_groups, self.group_info = create_target_groups( - self.targets_df - ) - - group_mask = self.target_groups == group_id - return self.row_catalog[group_mask].copy() - - def trace_household_targets(self, original_hh_id: int) -> pd.DataFrame: - """ - Extract all target values for a household across all geographies. - - Args: - original_hh_id: Original household ID to trace - - Returns: - DataFrame with target details and values for this household - """ - positions = self.get_household_column_positions(original_hh_id) - - results = [] - - for target_idx, (_, target) in enumerate(self.targets_df.iterrows()): - target_result = { - "target_idx": target_idx, - "variable": target["variable"], - "target_value": target["value"], - "geographic_id": target.get("geographic_id", "unknown"), - "domain_variable": target.get("domain_variable", "unknown"), - } - - # Extract values for this target across all geographies - for geo_id, col_pos in positions.items(): - if col_pos < self.matrix.shape[1]: - matrix_value = self.matrix[target_idx, col_pos] - target_result[f"matrix_value_{geo_id}"] = matrix_value - else: - target_result[f"matrix_value_{geo_id}"] = np.nan - - results.append(target_result) - - return pd.DataFrame(results) diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py deleted file mode 100644 index 74b2e2cee..000000000 --- a/policyengine_us_data/datasets/cps/local_area_calibration/sparse_matrix_builder.py +++ /dev/null @@ -1,838 +0,0 @@ -""" -Sparse matrix builder for geo-stacking calibration. 
- -Generic, database-driven approach where all constraints (including geographic) -are evaluated as masks. Geographic constraints work because we SET state_fips -before evaluating constraints. -""" - -import logging -from collections import defaultdict -from typing import Dict, List, Optional, Tuple -import numpy as np -import pandas as pd -from scipy import sparse -from sqlalchemy import create_engine, text - -logger = logging.getLogger(__name__) - -from policyengine_us_data.storage import STORAGE_FOLDER -from policyengine_us_data.utils.census import STATE_NAME_TO_FIPS -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_calculated_variables, - apply_op, - _get_geo_level, -) - - -class SparseMatrixBuilder: - """Build sparse calibration matrices for geo-stacking.""" - - def __init__( - self, - db_uri: str, - time_period: int, - cds_to_calibrate: List[str], - dataset_path: Optional[str] = None, - ): - self.db_uri = db_uri - self.engine = create_engine(db_uri) - self.time_period = time_period - self.cds_to_calibrate = cds_to_calibrate - self.dataset_path = dataset_path - self._entity_rel_cache = None - - def _build_entity_relationship(self, sim) -> pd.DataFrame: - """ - Build entity relationship DataFrame mapping persons to all entity IDs. - - This is used to evaluate constraints at the person level and then - aggregate to household level, handling variables defined at different - entity levels (person, tax_unit, household, spm_unit). 
- - Returns: - DataFrame with person_id, household_id, tax_unit_id, spm_unit_id - """ - if self._entity_rel_cache is not None: - return self._entity_rel_cache - - self._entity_rel_cache = pd.DataFrame( - { - "person_id": sim.calculate( - "person_id", map_to="person" - ).values, - "household_id": sim.calculate( - "household_id", map_to="person" - ).values, - "tax_unit_id": sim.calculate( - "tax_unit_id", map_to="person" - ).values, - "spm_unit_id": sim.calculate( - "spm_unit_id", map_to="person" - ).values, - } - ) - return self._entity_rel_cache - - def _evaluate_constraints_entity_aware( - self, state_sim, constraints: List[dict], n_households: int - ) -> np.ndarray: - """ - Evaluate non-geographic constraints at person level, aggregate to - household level using .any(). - - This properly handles constraints on variables defined at different - entity levels (e.g., tax_unit_is_filer at tax_unit level). Instead of - summing values at household level (which would give 2, 3, etc. for - households with multiple tax units), we evaluate at person level and - use .any() aggregation ("does this household have at least one person - satisfying all constraints?"). 
- - Args: - state_sim: Microsimulation with state_fips set - constraints: List of constraint dicts with variable, operation, - value keys (geographic constraints should be pre-filtered) - n_households: Number of households - - Returns: - Boolean mask array of length n_households - """ - if not constraints: - return np.ones(n_households, dtype=bool) - - entity_rel = self._build_entity_relationship(state_sim) - n_persons = len(entity_rel) - - person_mask = np.ones(n_persons, dtype=bool) - - for c in constraints: - var = c["variable"] - op = c["operation"] - val = c["value"] - - # Calculate constraint variable at person level - constraint_values = state_sim.calculate( - var, self.time_period, map_to="person" - ).values - - # Apply operation at person level - person_mask &= apply_op(constraint_values, op, val) - - # Aggregate to household level using .any() - # "At least one person in this household satisfies ALL constraints" - entity_rel_with_mask = entity_rel.copy() - entity_rel_with_mask["satisfies"] = person_mask - - household_mask_series = entity_rel_with_mask.groupby("household_id")[ - "satisfies" - ].any() - - # Ensure we return a mask aligned with household order - household_ids = state_sim.calculate( - "household_id", map_to="household" - ).values - household_mask = np.array( - [ - household_mask_series.get(hh_id, False) - for hh_id in household_ids - ] - ) - - return household_mask - - def _calculate_target_values_entity_aware( - self, - state_sim, - target_variable: str, - non_geo_constraints: List[dict], - geo_mask: np.ndarray, - n_households: int, - ) -> np.ndarray: - """ - Calculate target values at household level, handling count targets. 
- - For count targets (*_count): Count entities per household satisfying - constraints - For value targets: Sum values at household level (existing behavior) - - Args: - state_sim: Microsimulation with state_fips set - target_variable: The target variable name (e.g., "snap", - "person_count") - non_geo_constraints: List of constraint dicts (geographic - constraints should be pre-filtered) - geo_mask: Boolean mask array for geographic filtering (household - level) - n_households: Number of households - - Returns: - Float array of target values at household level - """ - is_count_target = target_variable.endswith("_count") - - if not is_count_target: - # Value target: use existing entity-aware constraint evaluation - entity_mask = self._evaluate_constraints_entity_aware( - state_sim, non_geo_constraints, n_households - ) - mask = geo_mask & entity_mask - - target_values = state_sim.calculate( - target_variable, map_to="household" - ).values - return (target_values * mask).astype(np.float32) - - # Count target: need to count entities satisfying constraints - entity_rel = self._build_entity_relationship(state_sim) - n_persons = len(entity_rel) - - # Evaluate constraints at person level (don't aggregate to HH yet) - person_mask = np.ones(n_persons, dtype=bool) - for c in non_geo_constraints: - constraint_values = state_sim.calculate( - c["variable"], map_to="person" - ).values - person_mask &= apply_op( - constraint_values, c["operation"], c["value"] - ) - - # Get target entity from variable definition - target_entity = state_sim.tax_benefit_system.variables[ - target_variable - ].entity.key - - household_ids = state_sim.calculate( - "household_id", map_to="household" - ).values - geo_mask_map = dict(zip(household_ids, geo_mask)) - - if target_entity == "household": - # household_count: 1 per qualifying household - if non_geo_constraints: - entity_mask = self._evaluate_constraints_entity_aware( - state_sim, non_geo_constraints, n_households - ) - return (geo_mask & 
entity_mask).astype(np.float32) - return geo_mask.astype(np.float32) - - if target_entity == "person": - # Count persons satisfying constraints per household - entity_rel["satisfies"] = person_mask - entity_rel["geo_ok"] = entity_rel["household_id"].map(geo_mask_map) - filtered = entity_rel[ - entity_rel["satisfies"] & entity_rel["geo_ok"] - ] - counts = filtered.groupby("household_id")["person_id"].nunique() - else: - # For tax_unit, spm_unit: aggregate person mask to entity, then - # count - entity_id_col = f"{target_entity}_id" - entity_rel["satisfies"] = person_mask - entity_satisfies = entity_rel.groupby(entity_id_col)[ - "satisfies" - ].any() - - entity_rel_unique = entity_rel[ - ["household_id", entity_id_col] - ].drop_duplicates() - entity_rel_unique["entity_ok"] = entity_rel_unique[ - entity_id_col - ].map(entity_satisfies) - entity_rel_unique["geo_ok"] = entity_rel_unique[ - "household_id" - ].map(geo_mask_map) - filtered = entity_rel_unique[ - entity_rel_unique["entity_ok"] & entity_rel_unique["geo_ok"] - ] - counts = filtered.groupby("household_id")[entity_id_col].nunique() - - # Build result aligned with household order - return np.array( - [counts.get(hh_id, 0) for hh_id in household_ids], dtype=np.float32 - ) - - def _query_targets(self, target_filter: dict) -> pd.DataFrame: - """Query targets via target_overview view. - - Best period: most recent period <= self.time_period, or closest - future period if none exists. - - Returns DataFrame with geo_level, geographic_id, and - domain_variable columns. - - Supports filters: domain_variables, variables, target_ids, - stratum_ids. 
- """ - or_conditions = [] - - if "domain_variables" in target_filter: - dvs = target_filter["domain_variables"] - placeholders = ",".join(f"'{dv}'" for dv in dvs) - or_conditions.append(f"tv.domain_variable IN ({placeholders})") - - if "variables" in target_filter: - vars_str = ",".join(f"'{v}'" for v in target_filter["variables"]) - or_conditions.append(f"tv.variable IN ({vars_str})") - - if "target_ids" in target_filter: - ids = ",".join(map(str, target_filter["target_ids"])) - or_conditions.append(f"tv.target_id IN ({ids})") - - if "stratum_ids" in target_filter: - ids = ",".join(map(str, target_filter["stratum_ids"])) - or_conditions.append(f"tv.stratum_id IN ({ids})") - - if not or_conditions: - where_clause = "1=1" - else: - where_clause = " OR ".join(f"({c})" for c in or_conditions) - - query = f""" - WITH filtered_targets AS ( - SELECT tv.target_id, tv.stratum_id, tv.variable, tv.value, - tv.period, tv.geo_level, tv.geographic_id, - tv.domain_variable - FROM target_overview tv - WHERE {where_clause} - ), - best_periods AS ( - SELECT stratum_id, variable, - CASE - WHEN MAX(CASE WHEN period <= :time_period - THEN period END) IS NOT NULL - THEN MAX(CASE WHEN period <= :time_period - THEN period END) - ELSE MIN(period) - END as best_period - FROM filtered_targets - GROUP BY stratum_id, variable - ) - SELECT ft.* - FROM filtered_targets ft - JOIN best_periods bp - ON ft.stratum_id = bp.stratum_id - AND ft.variable = bp.variable - AND ft.period = bp.best_period - ORDER BY ft.target_id - """ - - with self.engine.connect() as conn: - return pd.read_sql( - query, conn, params={"time_period": self.time_period} - ) - - def _get_constraints(self, stratum_id: int) -> List[dict]: - """Get all constraints for a stratum (including geographic).""" - query = """ - SELECT constraint_variable as variable, operation, value - FROM stratum_constraints - WHERE stratum_id = :stratum_id - """ - with self.engine.connect() as conn: - df = pd.read_sql(query, conn, 
params={"stratum_id": stratum_id}) - return df.to_dict("records") - - def _get_geographic_id(self, stratum_id: int) -> str: - """Extract geographic_id from constraints for targets_df.""" - constraints = self._get_constraints(stratum_id) - for c in constraints: - if c["variable"] == "state_fips": - return c["value"] - if c["variable"] == "congressional_district_geoid": - return c["value"] - return "US" - - def _calculate_uprating_factors(self, params) -> dict: - """Calculate CPI and population uprating factors for all periods.""" - factors = {} - - query = "SELECT DISTINCT period FROM targets WHERE period IS NOT NULL ORDER BY period" - with self.engine.connect() as conn: - result = conn.execute(text(query)) - years_needed = [row[0] for row in result] - - logger.info( - f"Calculating uprating factors for years " - f"{years_needed} to {self.time_period}" - ) - - for from_year in years_needed: - if from_year == self.time_period: - factors[(from_year, "cpi")] = 1.0 - factors[(from_year, "pop")] = 1.0 - continue - - try: - cpi_from = params.gov.bls.cpi.cpi_u(from_year) - cpi_to = params.gov.bls.cpi.cpi_u(self.time_period) - factors[(from_year, "cpi")] = float(cpi_to / cpi_from) - except Exception as e: - logger.warning( - f"Could not calculate CPI factor for " f"{from_year}: {e}" - ) - factors[(from_year, "cpi")] = 1.0 - - try: - pop_from = params.calibration.gov.census.populations.total( - from_year - ) - pop_to = params.calibration.gov.census.populations.total( - self.time_period - ) - factors[(from_year, "pop")] = float(pop_to / pop_from) - except Exception as e: - logger.warning( - f"Could not calculate population factor for " - f"{from_year}: {e}" - ) - factors[(from_year, "pop")] = 1.0 - - for (year, type_), factor in sorted(factors.items()): - if factor != 1.0: - logger.info( - f" {year} -> {self.time_period} " - f"({type_}): {factor:.4f}" - ) - - return factors - - def _get_uprating_info( - self, - variable: str, - period: int, - factors: dict, - ) -> 
Tuple[float, str]: - """Get uprating factor and type for a variable at a given period.""" - if period == self.time_period: - return 1.0, "none" - - count_indicators = [ - "count", - "person", - "people", - "households", - "tax_units", - ] - is_count = any(ind in variable.lower() for ind in count_indicators) - uprating_type = "pop" if is_count else "cpi" - - factor = factors.get((period, uprating_type), 1.0) - return factor, uprating_type - - def _load_aca_ptc_factors(self) -> Dict[int, Dict[str, float]]: - """Load state-level ACA PTC uprating factors from CSV. - - Returns: - {state_fips_int: {"tax_unit_count": vol_mult, - "aca_ptc": vol_mult * val_mult}} - """ - csv_path = STORAGE_FOLDER / "aca_ptc_multipliers_2022_2024.csv" - df = pd.read_csv(csv_path) - result = {} - for _, row in df.iterrows(): - fips_str = STATE_NAME_TO_FIPS.get(row["state"]) - if fips_str is None: - continue - fips_int = int(fips_str) - result[fips_int] = { - "tax_unit_count": row["vol_mult"], - "aca_ptc": row["vol_mult"] * row["val_mult"], - } - return result - - def _get_state_uprating_factors( - self, - domain: str, - targets_df: pd.DataFrame, - national_factors: dict, - ) -> Dict[int, Dict[str, float]]: - """Get per-state uprating factors for a hierarchical domain. - - For aca_ptc: loads real state-level enrollment/APTC factors - from CSV. For other domains: returns uniform national CPI/pop - factors. - - Returns: - {state_fips: {variable: factor}} for each state in the - domain's state-level targets. 
- """ - state_rows = targets_df[ - (targets_df["domain_variable"] == domain) - & (targets_df["geo_level"] == "state") - ] - state_fips_list = state_rows["geographic_id"].unique() - variables = state_rows["variable"].unique() - - if domain == "aca_ptc": - csv_factors = self._load_aca_ptc_factors() - logger.info( - f" [{domain}] Using CSV state-level factors " - f"({len(csv_factors)} states)" - ) - else: - csv_factors = None - logger.info(f" [{domain}] Using national CPI/pop factors") - - result = {} - n_csv = 0 - n_fallback = 0 - for sf in state_fips_list: - state_int = int(sf) - var_factors = {} - - if csv_factors and state_int in csv_factors: - n_csv += 1 - for var in variables: - var_factors[var] = csv_factors[state_int].get(var, 1.0) - else: - n_fallback += 1 - for var in variables: - row = state_rows[ - (state_rows["geographic_id"] == sf) - & (state_rows["variable"] == var) - ] - if row.empty: - var_factors[var] = 1.0 - continue - period = row.iloc[0]["period"] - factor, _ = self._get_uprating_info( - var, period, national_factors - ) - var_factors[var] = factor - - result[state_int] = var_factors - - if csv_factors: - all_factors = [f for vf in result.values() for f in vf.values()] - logger.info( - f" {n_csv} states from CSV, " - f"{n_fallback} national fallback" - ) - for var in variables: - vf = [result[s][var] for s in result] - logger.info(f" {var}: [{min(vf):.4f}, {max(vf):.4f}]") - - return result - - def _apply_hierarchical_uprating( - self, - targets_df: pd.DataFrame, - hierarchical_domains: List[str], - national_factors: dict, - ) -> pd.DataFrame: - """Apply state-level uprating and reconcile CDs to state totals. - - Two separable factors per CD row: - - hif (hierarchy inconsistency factor): base-year correction - so that sum(CDs) == state total in the source data. - hif = state_original / sum(cd_originals). Pure geometry, - no time dimension. - - uprating_factor: state-specific (or national fallback) - scaling from base year to target year. 
Pure time, no - geography correction. - - Final CD value = original_value * hif * uprating_factor. - - Also drops national/state rows used for reconciliation - (keeps rows like CMS person_count at period == time_period). - """ - df = targets_df.copy() - df["hif"] = np.nan - df["state_uprating_factor"] = np.nan - - rows_to_drop = [] - - for domain in hierarchical_domains: - domain_mask = df["domain_variable"] == domain - - state_factors = self._get_state_uprating_factors( - domain, df, national_factors - ) - - state_mask = domain_mask & (df["geo_level"] == "state") - district_mask = domain_mask & (df["geo_level"] == "district") - - for sf, var_factors in state_factors.items(): - for var, uf in var_factors.items(): - state_row = df[ - state_mask - & (df["geographic_id"] == str(sf)) - & (df["variable"] == var) - ] - if state_row.empty: - continue - state_original = state_row.iloc[0]["original_value"] - - def _cd_in_state(g, s=sf): - try: - return int(g) // 100 == s - except (ValueError, TypeError): - return False - - cd_mask = ( - district_mask - & (df["variable"] == var) - & df["geographic_id"].apply(_cd_in_state) - ) - cd_rows = df[cd_mask] - if cd_rows.empty: - continue - - cd_original_sum = cd_rows["original_value"].sum() - if cd_original_sum == 0: - continue - - hif = state_original / cd_original_sum - - for cd_idx in cd_rows.index: - df.at[cd_idx, "hif"] = hif - df.at[cd_idx, "state_uprating_factor"] = uf - df.at[cd_idx, "value"] = ( - df.at[cd_idx, "original_value"] * hif * uf - ) - - # Log HIF and UF summary for this domain - cd_domain = df[district_mask & df["hif"].notna()] - if not cd_domain.empty: - for var in cd_domain["variable"].unique(): - vrows = cd_domain[cd_domain["variable"] == var] - hifs = vrows["hif"] - ufs = vrows["state_uprating_factor"] - logger.info( - f" [{domain}] {var}: " - f"{len(vrows)} CDs, " - f"HIF=[{hifs.min():.4f}, {hifs.max():.4f}], " - f"UF=[{ufs.min():.4f}, {ufs.max():.4f}]" - ) - - # Drop national/state rows used for 
reconciliation - # Keep rows like CMS person_count (period == time_period) - national_mask = domain_mask & (df["geo_level"] == "national") - for idx in df[national_mask | state_mask].index: - row = df.loc[idx] - if row["period"] != self.time_period: - rows_to_drop.append(idx) - - if rows_to_drop: - dropped = df.loc[rows_to_drop] - logger.info( - f"Hierarchical uprating: dropping " - f"{len(rows_to_drop)} national/state rows " - f"(used only for reconciliation)" - ) - for domain in hierarchical_domains: - d = dropped[dropped["domain_variable"] == domain] - if d.empty: - continue - by_level = d["geo_level"].value_counts().to_dict() - parts = [f"{n} {lvl}" for lvl, n in sorted(by_level.items())] - logger.info(f" {domain}: {', '.join(parts)}") - df = df.drop(index=rows_to_drop).reset_index(drop=True) - - df["target_period"] = self.time_period - - return df - - def print_uprating_summary(self, targets_df: pd.DataFrame) -> None: - """Print summary of uprating applied to targets.""" - has_state_uf = "state_uprating_factor" in targets_df.columns - - # Effective factor: use state_uprating_factor where set, - # otherwise fall back to uprating_factor - if has_state_uf: - eff = targets_df["state_uprating_factor"].fillna( - targets_df["uprating_factor"] - ) - else: - eff = targets_df["uprating_factor"] - - uprated = targets_df[eff != 1.0] - if len(uprated) == 0: - print("No targets were uprated.") - return - - print("\n" + "=" * 60) - print("UPRATING SUMMARY") - print("=" * 60) - print(f"Uprated {len(uprated)} of {len(targets_df)} targets") - - period_counts = uprated["period"].value_counts().sort_index() - for period, count in period_counts.items(): - print(f" Period {period}: {count} targets") - - factors = eff[eff != 1.0] - print( - f" Factor range: [{factors.min():.4f}, " f"{factors.max():.4f}]" - ) - - def _create_state_sim(self, state: int, n_households: int): - """Create a fresh simulation with state_fips set to given state.""" - from policyengine_us import 
Microsimulation - - state_sim = Microsimulation(dataset=self.dataset_path) - state_sim.set_input( - "state_fips", - self.time_period, - np.full(n_households, state, dtype=np.int32), - ) - for var in get_calculated_variables(state_sim): - state_sim.delete_arrays(var) - return state_sim - - def build_matrix( - self, - sim, - target_filter: dict, - hierarchical_domains: Optional[List[str]] = None, - ) -> Tuple[pd.DataFrame, sparse.csr_matrix, Dict[str, List[str]]]: - """ - Build sparse calibration matrix. - - Args: - sim: Microsimulation instance (used for household_ids, or - as template) - target_filter: Dict specifying which targets to include - - {"domain_variables": ["aca_ptc"]} via target_overview - - {"target_ids": [123, 456]} for specific targets - - an empty dict {} will fetch all targets - hierarchical_domains: Optional list of domain_variable - names for state-level uprating + CD reconciliation. - Requires domain_variables in target_filter. - - Returns: - Tuple of (targets_df, X_sparse, household_id_mapping) - """ - household_ids = sim.calculate( - "household_id", map_to="household" - ).values - n_households = len(household_ids) - n_cds = len(self.cds_to_calibrate) - n_cols = n_households * n_cds - - targets_df = self._query_targets(target_filter) - - if len(targets_df) == 0: - raise ValueError("No targets found matching filter") - - # Uprate targets from their original period to self.time_period - params = sim.tax_benefit_system.parameters - uprating_factors = self._calculate_uprating_factors(params) - targets_df["original_value"] = targets_df["value"].copy() - targets_df["uprating_factor"] = targets_df.apply( - lambda row: self._get_uprating_info( - row["variable"], row["period"], uprating_factors - )[0], - axis=1, - ) - targets_df["value"] = ( - targets_df["original_value"] * targets_df["uprating_factor"] - ) - - # Hierarchical uprating: state-level uprating + CD reconciliation - if hierarchical_domains: - targets_df = self._apply_hierarchical_uprating( - 
targets_df, hierarchical_domains, uprating_factors - ) - - n_targets = len(targets_df) - - # Sort by (geo_level, variable, geographic_id) for contiguous group - targets_df["_geo_level"] = targets_df["geographic_id"].apply( - _get_geo_level - ) - targets_df = targets_df.sort_values( - ["_geo_level", "variable", "geographic_id"] - ) - targets_df = targets_df.drop(columns=["_geo_level"]).reset_index( - drop=True - ) - - X = sparse.lil_matrix((n_targets, n_cols), dtype=np.float32) - - # Group CDs by state. CD GEOIDs follow format SSCCC where SS is state - # FIPS (2 digits) and CCC is CD number (2-3 digits), so state = CD // 100 - cds_by_state = defaultdict(list) - for cd_idx, cd in enumerate(self.cds_to_calibrate): - state = int(cd) // 100 - cds_by_state[state].append((cd_idx, cd)) - - for state, cd_list in cds_by_state.items(): - # Clear entity relationship cache when creating new simulation - self._entity_rel_cache = None - - if self.dataset_path: - state_sim = self._create_state_sim(state, n_households) - else: - state_sim = sim - state_sim.set_input( - "state_fips", - self.time_period, - np.full(n_households, state, dtype=np.int32), - ) - for var in get_calculated_variables(state_sim): - state_sim.delete_arrays(var) - - for cd_idx, cd in cd_list: - col_start = cd_idx * n_households - - for row_idx, (_, target) in enumerate(targets_df.iterrows()): - constraints = self._get_constraints(target["stratum_id"]) - - geo_constraints = [] - non_geo_constraints = [] - for c in constraints: - if c["variable"] in ( - "state_fips", - "congressional_district_geoid", - ): - geo_constraints.append(c) - else: - non_geo_constraints.append(c) - - # Check geographic constraints first (quick fail) - geo_mask = np.ones(n_households, dtype=bool) - for c in geo_constraints: - if c["variable"] == "congressional_district_geoid": - if ( - c["operation"] in ("==", "=") - and c["value"] != cd - ): - geo_mask[:] = False - elif c["variable"] == "state_fips": - if ( - c["operation"] in ("==", 
"=") - and int(c["value"]) != state - ): - geo_mask[:] = False - - if not geo_mask.any(): - continue - - # Calculate target values with entity-aware handling - # This properly handles count targets (*_count) by counting - # entities rather than summing values - masked_values = self._calculate_target_values_entity_aware( - state_sim, - target["variable"], - non_geo_constraints, - geo_mask, - n_households, - ) - - if not masked_values.any(): - continue - - nonzero = np.where(masked_values != 0)[0] - if len(nonzero) > 0: - X[row_idx, col_start + nonzero] = masked_values[ - nonzero - ] - - household_id_mapping = {} - for cd in self.cds_to_calibrate: - key = f"cd{cd}" - household_id_mapping[key] = [ - f"{hh_id}_{key}" for hh_id in household_ids - ] - - return targets_df, X.tocsr(), household_id_mapping diff --git a/policyengine_us_data/parameters/take_up/voluntary_filing.yaml b/policyengine_us_data/parameters/take_up/voluntary_filing.yaml new file mode 100644 index 000000000..46d23e504 --- /dev/null +++ b/policyengine_us_data/parameters/take_up/voluntary_filing.yaml @@ -0,0 +1,6 @@ +description: Percentage of tax units (not taking up EITC) who file taxes voluntarily. +metadata: + label: Voluntary filing rate + unit: /1 +values: + 2018-01-01: 0.05 diff --git a/policyengine_us_data/tests/test_calibration/__init__.py b/policyengine_us_data/tests/test_calibration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py b/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py new file mode 100644 index 000000000..8db56ddcb --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_build_matrix_masking.py @@ -0,0 +1,207 @@ +"""Integration test for build_matrix geographic masking. 
+ +Traces one household through the matrix with 2 clones, verifying: +- National targets: both clones can contribute (non-zero) +- State targets: only the clone assigned to that state contributes +- CD targets: only the clone assigned to that CD contributes; + a different CD in the same state gets zero +""" + +import os + +import numpy as np +import pytest +from scipy import sparse + +from policyengine_us_data.storage import STORAGE_FOLDER + +DATASET_PATH = str(STORAGE_FOLDER / "stratified_extended_cps_2024.h5") +DB_PATH = str(STORAGE_FOLDER / "calibration" / "policy_data.db") +DB_URI = f"sqlite:///{DB_PATH}" + +N_CLONES = 2 +SEED = 42 +RECORD_IDX = 8629 # High SNAP ($18k), lands in TX/PA with seed=42 + + +def _data_available(): + return os.path.exists(DATASET_PATH) and os.path.exists(DB_PATH) + + +@pytest.fixture(scope="module") +def matrix_result(): + if not _data_available(): + pytest.skip("Calibration data not available") + + from policyengine_us import Microsimulation + from policyengine_us_data.calibration.clone_and_assign import ( + assign_random_geography, + ) + from policyengine_us_data.calibration.unified_matrix_builder import ( + UnifiedMatrixBuilder, + ) + + sim = Microsimulation(dataset=DATASET_PATH) + n_records = sim.calculate("household_id").values.shape[0] + geography = assign_random_geography( + n_records, n_clones=N_CLONES, seed=SEED + ) + builder = UnifiedMatrixBuilder( + db_uri=DB_URI, + time_period=2024, + dataset_path=DATASET_PATH, + ) + targets_df, X_sparse, target_names = builder.build_matrix( + geography=geography, + sim=sim, + target_filter={"domain_variables": ["snap", "medicaid"]}, + ) + return { + "geography": geography, + "targets_df": targets_df, + "X": X_sparse, + "target_names": target_names, + "n_records": n_records, + } + + +def _clone_col(n_records, clone_idx, record_idx): + return clone_idx * n_records + record_idx + + +class TestMatrixShape: + def test_columns_equal_clones_times_records(self, matrix_result): + X = 
matrix_result["X"] + n_records = matrix_result["n_records"] + assert X.shape[1] == N_CLONES * n_records + + def test_rows_equal_targets(self, matrix_result): + X = matrix_result["X"] + assert X.shape[0] == len(matrix_result["targets_df"]) + + def test_matrix_is_sparse(self, matrix_result): + X = matrix_result["X"] + density = X.nnz / (X.shape[0] * X.shape[1]) + assert density < 0.1 + + +class TestNationalMasking: + def test_both_clones_visible_to_national_target(self, matrix_result): + X = matrix_result["X"] + targets_df = matrix_result["targets_df"] + n_records = matrix_result["n_records"] + + national_rows = targets_df[targets_df["geo_level"] == "national"].index + assert len(national_rows) > 0 + + col_0 = _clone_col(n_records, 0, RECORD_IDX) + col_1 = _clone_col(n_records, 1, RECORD_IDX) + X_csc = X.tocsc() + + visible_0 = X_csc[:, col_0].toarray().ravel() + visible_1 = X_csc[:, col_1].toarray().ravel() + + for row_idx in national_rows: + if visible_0[row_idx] != 0 or visible_1[row_idx] != 0: + return + pytest.fail( + "Household has zero value for all national targets " + "in both clones — cannot verify masking" + ) + + +class TestStateMasking: + def test_clone_visible_only_to_own_state(self, matrix_result): + X = matrix_result["X"] + targets_df = matrix_result["targets_df"] + geography = matrix_result["geography"] + n_records = matrix_result["n_records"] + + col_0 = _clone_col(n_records, 0, RECORD_IDX) + col_1 = _clone_col(n_records, 1, RECORD_IDX) + state_0 = str(int(geography.state_fips[col_0])) + state_1 = str(int(geography.state_fips[col_1])) + + if state_0 == state_1: + pytest.skip( + "Both clones landed in the same state — " + "cannot test cross-state masking" + ) + + state_targets = targets_df[targets_df["geo_level"] == "state"] + X_csc = X.tocsc() + vals_0 = X_csc[:, col_0].toarray().ravel() + vals_1 = X_csc[:, col_1].toarray().ravel() + + for _, row in state_targets.iterrows(): + row_idx = row.name + geo_id = str(row["geographic_id"]) + if geo_id == 
state_0: + assert vals_1[row_idx] == 0, ( + f"Clone 1 (state {state_1}) should be zero " + f"for state {state_0} target row {row_idx}" + ) + elif geo_id == state_1: + assert vals_0[row_idx] == 0, ( + f"Clone 0 (state {state_0}) should be zero " + f"for state {state_1} target row {row_idx}" + ) + + +class TestDistrictMasking: + def test_clone_visible_only_to_own_cd(self, matrix_result): + X = matrix_result["X"] + targets_df = matrix_result["targets_df"] + geography = matrix_result["geography"] + n_records = matrix_result["n_records"] + + col_0 = _clone_col(n_records, 0, RECORD_IDX) + cd_0 = str(geography.cd_geoid[col_0]) + state_0 = str(int(geography.state_fips[col_0])) + + district_targets = targets_df[targets_df["geo_level"] == "district"] + X_csc = X.tocsc() + vals_0 = X_csc[:, col_0].toarray().ravel() + + same_state_other_cd = district_targets[ + ( + district_targets["geographic_id"].apply( + lambda g: g.startswith(state_0) + ) + ) + & (district_targets["geographic_id"] != cd_0) + ] + + for _, row in same_state_other_cd.iterrows(): + row_idx = row.name + assert vals_0[row_idx] == 0, ( + f"Clone 0 (CD {cd_0}) should be zero for " + f"CD {row['geographic_id']} target row {row_idx}" + ) + + def test_clone_nonzero_for_own_cd(self, matrix_result): + X = matrix_result["X"] + targets_df = matrix_result["targets_df"] + geography = matrix_result["geography"] + n_records = matrix_result["n_records"] + + col_0 = _clone_col(n_records, 0, RECORD_IDX) + cd_0 = str(geography.cd_geoid[col_0]) + + own_cd_targets = targets_df[ + (targets_df["geo_level"] == "district") + & (targets_df["geographic_id"] == cd_0) + ] + if len(own_cd_targets) == 0: + pytest.skip(f"No district targets for CD {cd_0}") + + X_csc = X.tocsc() + vals_0 = X_csc[:, col_0].toarray().ravel() + + any_nonzero = any( + vals_0[row.name] != 0 for _, row in own_cd_targets.iterrows() + ) + assert any_nonzero, ( + f"Clone 0 should have at least one non-zero entry " + f"for its own CD {cd_0}" + ) diff --git 
a/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py b/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py new file mode 100644 index 000000000..0ba330549 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_clone_and_assign.py @@ -0,0 +1,189 @@ +"""Tests for clone_and_assign module. + +Uses mock CSV data so tests don't require the real +block_cd_distributions.csv.gz file. +""" + +import numpy as np +import pandas as pd +import pytest +from unittest.mock import patch + +from policyengine_us_data.calibration.clone_and_assign import ( + GeographyAssignment, + load_global_block_distribution, + assign_random_geography, + double_geography_for_puf, +) + +MOCK_BLOCKS = pd.DataFrame( + { + "cd_geoid": [101, 101, 101, 102, 102, 103, 103, 103, 103], + "block_geoid": [ + "010010001001001", + "010010001001002", + "010010001001003", + "020010001001001", + "020010001001002", + "360100001001001", + "360100001001002", + "360100001001003", + "360100001001004", + ], + "probability": [ + 0.4, + 0.3, + 0.3, + 0.6, + 0.4, + 0.25, + 0.25, + 0.25, + 0.25, + ], + } +) + + +@pytest.fixture(autouse=True) +def _clear_lru_cache(): + load_global_block_distribution.cache_clear() + yield + load_global_block_distribution.cache_clear() + + +def _mock_distribution(): + blocks = MOCK_BLOCKS["block_geoid"].values + cds = MOCK_BLOCKS["cd_geoid"].astype(str).values + states = np.array([int(b[:2]) for b in blocks]) + probs = MOCK_BLOCKS["probability"].values.astype(np.float64) + probs = probs / probs.sum() + return blocks, cds, states, probs + + +class TestLoadGlobalBlockDistribution: + def test_loads_and_normalizes(self, tmp_path): + csv_path = tmp_path / "block_cd_distributions.csv.gz" + MOCK_BLOCKS.to_csv(csv_path, index=False, compression="gzip") + with patch( + "policyengine_us_data.calibration" + ".clone_and_assign.STORAGE_FOLDER", + tmp_path, + ): + blocks, cds, states, probs = ( + load_global_block_distribution.__wrapped__() + ) + assert 
len(blocks) == 9 + np.testing.assert_almost_equal(probs.sum(), 1.0) + + def test_state_fips_extracted(self, tmp_path): + csv_path = tmp_path / "block_cd_distributions.csv.gz" + MOCK_BLOCKS.to_csv(csv_path, index=False, compression="gzip") + with patch( + "policyengine_us_data.calibration" + ".clone_and_assign.STORAGE_FOLDER", + tmp_path, + ): + _, _, states, _ = load_global_block_distribution.__wrapped__() + assert states[0] == 1 + assert states[3] == 2 + assert states[5] == 36 + + +class TestAssignRandomGeography: + @patch( + "policyengine_us_data.calibration.clone_and_assign" + ".load_global_block_distribution" + ) + def test_shape(self, mock_load): + mock_load.return_value = _mock_distribution() + r = assign_random_geography(n_records=10, n_clones=3, seed=42) + assert len(r.block_geoid) == 30 + assert r.n_records == 10 + assert r.n_clones == 3 + + @patch( + "policyengine_us_data.calibration.clone_and_assign" + ".load_global_block_distribution" + ) + def test_deterministic(self, mock_load): + mock_load.return_value = _mock_distribution() + r1 = assign_random_geography(n_records=10, n_clones=3, seed=99) + r2 = assign_random_geography(n_records=10, n_clones=3, seed=99) + np.testing.assert_array_equal(r1.block_geoid, r2.block_geoid) + + @patch( + "policyengine_us_data.calibration.clone_and_assign" + ".load_global_block_distribution" + ) + def test_different_seeds_differ(self, mock_load): + mock_load.return_value = _mock_distribution() + r1 = assign_random_geography(n_records=100, n_clones=3, seed=1) + r2 = assign_random_geography(n_records=100, n_clones=3, seed=2) + assert not np.array_equal(r1.block_geoid, r2.block_geoid) + + @patch( + "policyengine_us_data.calibration.clone_and_assign" + ".load_global_block_distribution" + ) + def test_state_from_block(self, mock_load): + mock_load.return_value = _mock_distribution() + r = assign_random_geography(n_records=20, n_clones=5, seed=42) + for i in range(len(r.block_geoid)): + expected = int(r.block_geoid[i][:2]) + 
assert r.state_fips[i] == expected + + def test_missing_file_raises(self, tmp_path): + fake = tmp_path / "nonexistent" + fake.mkdir() + with patch( + "policyengine_us_data.calibration" + ".clone_and_assign.STORAGE_FOLDER", + fake, + ): + with pytest.raises(FileNotFoundError): + load_global_block_distribution.__wrapped__() + + +class TestDoubleGeographyForPuf: + def test_doubles_n_records(self): + geo = GeographyAssignment( + block_geoid=np.array(["010010001001001", "020010001001001"] * 3), + cd_geoid=np.array(["101", "202"] * 3), + state_fips=np.array([1, 2] * 3), + n_records=2, + n_clones=3, + ) + r = double_geography_for_puf(geo) + assert r.n_records == 4 + assert r.n_clones == 3 + assert len(r.block_geoid) == 12 + + def test_puf_half_matches_cps_half(self): + geo = GeographyAssignment( + block_geoid=np.array( + [ + "010010001001001", + "020010001001001", + "360100001001001", + "060100001001001", + "480100001001001", + "120100001001001", + ] + ), + cd_geoid=np.array(["101", "202", "1036", "653", "4831", "1227"]), + state_fips=np.array([1, 2, 36, 6, 48, 12]), + n_records=3, + n_clones=2, + ) + r = double_geography_for_puf(geo) + n_new = r.n_records + + for c in range(r.n_clones): + start = c * n_new + mid = start + n_new // 2 + end = start + n_new + np.testing.assert_array_equal( + r.state_fips[start:mid], + r.state_fips[mid:end], + ) diff --git a/policyengine_us_data/tests/test_calibration/test_drop_target_groups.py b/policyengine_us_data/tests/test_calibration/test_drop_target_groups.py new file mode 100644 index 000000000..daade621d --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_drop_target_groups.py @@ -0,0 +1,142 @@ +"""Tests for drop_target_groups in calibration_utils.""" + +import numpy as np +import pandas as pd +import pytest +from scipy import sparse + +from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( + drop_target_groups, + create_target_groups, +) + + +@pytest.fixture +def sample_data(): + 
targets_df = pd.DataFrame( + { + "variable": [ + "snap", + "snap", + "snap", + "household_count", + "household_count", + ], + "domain_variable": [ + "snap", + "snap", + "snap", + "snap", + "snap", + ], + "geographic_id": ["US", "6", "37", "6", "37"], + "value": [1000, 500, 300, 200, 100], + } + ) + n_rows = len(targets_df) + n_cols = 10 + rng = np.random.default_rng(42) + X = sparse.random(n_rows, n_cols, density=0.5, random_state=rng) + X = X.tocsr() + target_groups, group_info = create_target_groups(targets_df) + return targets_df, X, target_groups, group_info + + +class TestDropTargetGroups: + def test_drops_matching_group(self, sample_data): + targets_df, X, target_groups, group_info = sample_data + n_before = len(targets_df) + out_df, out_X = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [("household count", "State")], + ) + assert len(out_df) < n_before + assert out_X.shape[0] == len(out_df) + assert "household_count" not in out_df["variable"].values or not ( + out_df[out_df["variable"] == "household_count"]["geographic_id"] + .isin(["6", "37"]) + .any() + ) + + def test_keeps_unmatched_groups(self, sample_data): + targets_df, X, target_groups, group_info = sample_data + out_df, out_X = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [("household count", "State")], + ) + assert "snap" in out_df["variable"].values + + def test_matrix_rows_match_df(self, sample_data): + targets_df, X, target_groups, group_info = sample_data + out_df, out_X = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [("snap", "National")], + ) + assert out_X.shape[0] == len(out_df) + assert out_X.shape[1] == X.shape[1] + + def test_no_match_keeps_all(self, sample_data): + targets_df, X, target_groups, group_info = sample_data + out_df, out_X = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [("nonexistent", "National")], + ) + assert len(out_df) == len(targets_df) + assert out_X.shape[0] == 
X.shape[0] + + def test_drop_all_groups(self, sample_data): + targets_df, X, target_groups, group_info = sample_data + out_df, out_X = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [ + ("snap", "National"), + ("snap", "State"), + ("household count", "State"), + ], + ) + assert len(out_df) == 0 + assert out_X.shape[0] == 0 + + def test_columns_preserved(self, sample_data): + targets_df, X, target_groups, group_info = sample_data + out_df, out_X = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [("snap", "National")], + ) + assert out_X.shape[1] == X.shape[1] + + def test_case_insensitive_match(self, sample_data): + targets_df, X, target_groups, group_info = sample_data + out_df, _ = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [("SNAP", "State")], + ) + out_df2, _ = drop_target_groups( + targets_df, + X, + target_groups, + group_info, + [("snap", "State")], + ) + assert len(out_df) == len(out_df2) diff --git a/policyengine_us_data/tests/test_calibration/test_unified_calibration.py b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py new file mode 100644 index 000000000..2d3f80619 --- /dev/null +++ b/policyengine_us_data/tests/test_calibration/test_unified_calibration.py @@ -0,0 +1,87 @@ +"""Tests for unified_calibration module. + +Focuses on rerandomize_takeup: verifies draws differ by +block and are reproducible within the same block. 
+""" + +import numpy as np +import pytest + +from policyengine_us_data.utils.randomness import seeded_rng + + +class TestRerandomizeTakeupSeeding: + """Verify seeded_rng(var, salt=block) produces + reproducible, block-dependent draws.""" + + def test_same_block_same_draws(self): + var = "takes_up_snap_if_eligible" + block = "010010001001001" + rng1 = seeded_rng(var, salt=block) + rng2 = seeded_rng(var, salt=block) + draws1 = rng1.random(100) + draws2 = rng2.random(100) + np.testing.assert_array_equal(draws1, draws2) + + def test_different_blocks_different_draws(self): + var = "takes_up_snap_if_eligible" + rng1 = seeded_rng(var, salt="010010001001001") + rng2 = seeded_rng(var, salt="020010001001001") + draws1 = rng1.random(100) + draws2 = rng2.random(100) + assert not np.array_equal(draws1, draws2) + + def test_different_vars_different_draws(self): + block = "010010001001001" + rng1 = seeded_rng("takes_up_snap_if_eligible", salt=block) + rng2 = seeded_rng("takes_up_aca_if_eligible", salt=block) + draws1 = rng1.random(100) + draws2 = rng2.random(100) + assert not np.array_equal(draws1, draws2) + + def test_draws_in_unit_interval(self): + rng = seeded_rng( + "takes_up_snap_if_eligible", + salt="010010001001001", + ) + draws = rng.random(10000) + assert draws.min() >= 0.0 + assert draws.max() < 1.0 + + def test_rate_comparison_produces_booleans(self): + rng = seeded_rng( + "takes_up_snap_if_eligible", + salt="010010001001001", + ) + draws = rng.random(10000) + rate = 0.75 + result = draws < rate + assert result.dtype == bool + frac = result.mean() + assert 0.70 < frac < 0.80 + + +class TestSimpleTakeupConfig: + """Verify the SIMPLE_TAKEUP_VARS config is well-formed.""" + + def test_all_entries_have_required_keys(self): + from policyengine_us_data.calibration.unified_calibration import ( + SIMPLE_TAKEUP_VARS, + ) + + for entry in SIMPLE_TAKEUP_VARS: + assert "variable" in entry + assert "entity" in entry + assert "rate_key" in entry + assert entry["entity"] in ( + 
"person", + "tax_unit", + "spm_unit", + ) + + def test_expected_count(self): + from policyengine_us_data.calibration.unified_calibration import ( + SIMPLE_TAKEUP_VARS, + ) + + assert len(SIMPLE_TAKEUP_VARS) == 8 diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_hierarchical_uprating.py b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py similarity index 53% rename from policyengine_us_data/tests/test_local_area_calibration/test_hierarchical_uprating.py rename to policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py index 918e6ac86..ea2d49c5c 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_hierarchical_uprating.py +++ b/policyengine_us_data/tests/test_calibration/test_unified_matrix_builder.py @@ -1,15 +1,18 @@ -""" -Tests for hierarchical uprating and CD reconciliation. +"""Tests for UnifiedMatrixBuilder. + +Ports uprating/hierarchical tests from test_hierarchical_uprating.py. +Uses in-memory SQLite DBs, self-contained. 
""" import unittest import tempfile import os + import pandas as pd from sqlalchemy import create_engine, text -from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( - SparseMatrixBuilder, +from policyengine_us_data.calibration.unified_matrix_builder import ( + UnifiedMatrixBuilder, ) from policyengine_us_data.db.create_database_tables import ( TARGET_OVERVIEW_VIEW, @@ -17,13 +20,18 @@ def _create_test_db(db_path): - """Create test DB with target_overview view and sample data.""" db_uri = f"sqlite:///{db_path}" engine = create_engine(db_uri) with engine.connect() as conn: conn.execute( - text("CREATE TABLE strata (" "stratum_id INTEGER PRIMARY KEY)") + text( + "CREATE TABLE strata (" + "stratum_id INTEGER PRIMARY KEY, " + "definition_hash VARCHAR(64), " + "parent_stratum_id INTEGER, " + "notes VARCHAR)" + ) ) conn.execute( text( @@ -46,7 +54,6 @@ def _create_test_db(db_path): "active INTEGER DEFAULT 1)" ) ) - conn.execute(text(TARGET_OVERVIEW_VIEW)) conn.commit() @@ -54,51 +61,37 @@ def _create_test_db(db_path): def _insert_aca_ptc_data(engine): - """Insert ACA PTC test data at national/state/district levels. - - State 6 (CA): 3 CDs (601, 602, 603) - State 37 (NC): 2 CDs (3701, 3702) - - All IRS data at period=2022. - One CMS national person_count at period=2024. 
- """ with engine.connect() as conn: - # Strata: national(1), state CA(2), state NC(3), - # CDs: 601(4), 602(5), 603(6), 3701(7), 3702(8) - # CMS national(9) strata = [1, 2, 3, 4, 5, 6, 7, 8, 9] for sid in strata: conn.execute( - text("INSERT INTO strata VALUES (:sid)"), - {"sid": sid}, + text( + "INSERT INTO strata " + "(stratum_id, parent_stratum_id) " + "VALUES (:sid, :parent)" + ), + { + "sid": sid, + "parent": None if sid == 1 else 1, + }, ) - # Constraints constraints = [ - # National: aca_ptc > 0 (1, 1, "aca_ptc", ">", "0"), - # State CA: aca_ptc > 0, state_fips=6 (2, 2, "aca_ptc", ">", "0"), (3, 2, "state_fips", "=", "6"), - # State NC: aca_ptc > 0, state_fips=37 (4, 3, "aca_ptc", ">", "0"), (5, 3, "state_fips", "=", "37"), - # CD 601 (6, 4, "aca_ptc", ">", "0"), (7, 4, "congressional_district_geoid", "=", "601"), - # CD 602 (8, 5, "aca_ptc", ">", "0"), (9, 5, "congressional_district_geoid", "=", "602"), - # CD 603 (10, 6, "aca_ptc", ">", "0"), (11, 6, "congressional_district_geoid", "=", "603"), - # CD 3701 (12, 7, "aca_ptc", ">", "0"), (13, 7, "congressional_district_geoid", "=", "3701"), - # CD 3702 (14, 8, "aca_ptc", ">", "0"), (15, 8, "congressional_district_geoid", "=", "3702"), - # CMS national: aca_ptc > 0 (16, 9, "aca_ptc", ">", "0"), ] for cid, sid, var, op, val in constraints: @@ -116,48 +109,31 @@ def _insert_aca_ptc_data(engine): }, ) - # Targets targets = [ - # National aca_ptc 2022 (1, 1, "aca_ptc", 10000.0, 2022), - # National tax_unit_count 2022 (2, 1, "tax_unit_count", 500.0, 2022), - # State CA aca_ptc 2022: 6000 (3, 2, "aca_ptc", 6000.0, 2022), - # State CA tax_unit_count 2022: 300 (4, 2, "tax_unit_count", 300.0, 2022), - # State NC aca_ptc 2022: 4000 (5, 3, "aca_ptc", 4000.0, 2022), - # State NC tax_unit_count 2022: 200 (6, 3, "tax_unit_count", 200.0, 2022), - # CD 601 aca_ptc 2022: 2000 (7, 4, "aca_ptc", 2000.0, 2022), - # CD 602 aca_ptc 2022: 2500 (8, 5, "aca_ptc", 2500.0, 2022), - # CD 603 aca_ptc 2022: 1500 (9, 6, "aca_ptc", 1500.0, 
2022), - # CD 601 tax_unit_count 2022: 100 (10, 4, "tax_unit_count", 100.0, 2022), - # CD 602 tax_unit_count 2022: 120 (11, 5, "tax_unit_count", 120.0, 2022), - # CD 603 tax_unit_count 2022: 80 (12, 6, "tax_unit_count", 80.0, 2022), - # CD 3701 aca_ptc 2022: 2200 (13, 7, "aca_ptc", 2200.0, 2022), - # CD 3702 aca_ptc 2022: 1800 (14, 8, "aca_ptc", 1800.0, 2022), - # CD 3701 tax_unit_count 2022: 110 (15, 7, "tax_unit_count", 110.0, 2022), - # CD 3702 tax_unit_count 2022: 90 (16, 8, "tax_unit_count", 90.0, 2022), - # CMS national person_count 2024 (17, 9, "person_count", 19743689.0, 2024), ] for tid, sid, var, val, period in targets: conn.execute( text( "INSERT INTO targets " - "VALUES (:tid, :sid, :var, :val, :period, 1)" + "VALUES (:tid, :sid, :var, :val, " + ":period, 1)" ), { "tid": tid, @@ -170,9 +146,7 @@ def _insert_aca_ptc_data(engine): conn.commit() -class TestQueryTargetsOverview(unittest.TestCase): - """Test _query_targets with target_overview view.""" - +class TestQueryTargets(unittest.TestCase): @classmethod def setUpClass(cls): cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False) @@ -186,57 +160,46 @@ def tearDownClass(cls): os.unlink(cls.db_path) def _make_builder(self, time_period=2024): - return SparseMatrixBuilder( + return UnifiedMatrixBuilder( db_uri=self.db_uri, time_period=time_period, - cds_to_calibrate=["601", "602", "603", "3701", "3702"], ) def test_domain_variables_filter(self): - builder = self._make_builder() - df = builder._query_targets({"domain_variables": ["aca_ptc"]}) + b = self._make_builder() + df = b._query_targets({"domain_variables": ["aca_ptc"]}) self.assertGreater(len(df), 0) self.assertIn("geo_level", df.columns) self.assertIn("geographic_id", df.columns) self.assertIn("domain_variable", df.columns) def test_all_geo_levels_returned(self): - builder = self._make_builder() - df = builder._query_targets({"domain_variables": ["aca_ptc"]}) + b = self._make_builder() + df = b._query_targets({"domain_variables": 
["aca_ptc"]}) geo_levels = set(df["geo_level"].unique()) self.assertEqual(geo_levels, {"national", "state", "district"}) def test_best_period_selection(self): - """All aca_ptc targets at 2022, CMS at 2024.""" - builder = self._make_builder(time_period=2024) - df = builder._query_targets({"domain_variables": ["aca_ptc"]}) - aca_rows = df[df["variable"] == "aca_ptc"] - self.assertTrue((aca_rows["period"] == 2022).all()) - - cms_rows = df[df["variable"] == "person_count"] - self.assertEqual(len(cms_rows), 1) - self.assertEqual(cms_rows.iloc[0]["period"], 2024) + b = self._make_builder(time_period=2024) + df = b._query_targets({"domain_variables": ["aca_ptc"]}) + aca = df[df["variable"] == "aca_ptc"] + self.assertTrue((aca["period"] == 2022).all()) + cms = df[df["variable"] == "person_count"] + self.assertEqual(len(cms), 1) + self.assertEqual(cms.iloc[0]["period"], 2024) def test_geographic_id_populated(self): - builder = self._make_builder() - df = builder._query_targets({"domain_variables": ["aca_ptc"]}) + b = self._make_builder() + df = b._query_targets({"domain_variables": ["aca_ptc"]}) national = df[df["geo_level"] == "national"] self.assertTrue((national["geographic_id"] == "US").all()) - state_ca = df[ (df["geo_level"] == "state") & (df["geographic_id"] == "6") ] self.assertGreater(len(state_ca), 0) - district_601 = df[ - (df["geo_level"] == "district") & (df["geographic_id"] == "601") - ] - self.assertGreater(len(district_601), 0) - class TestHierarchicalUprating(unittest.TestCase): - """Test _apply_hierarchical_uprating logic.""" - @classmethod def setUpClass(cls): cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False) @@ -250,141 +213,91 @@ def tearDownClass(cls): os.unlink(cls.db_path) def _make_builder(self, time_period=2024): - return SparseMatrixBuilder( + return UnifiedMatrixBuilder( db_uri=self.db_uri, time_period=time_period, - cds_to_calibrate=["601", "602", "603", "3701", "3702"], ) def _get_targets_with_uprating(self, cpi_factor=1.1, 
pop_factor=1.02): - builder = self._make_builder(time_period=2024) - df = builder._query_targets({"domain_variables": ["aca_ptc"]}) + b = self._make_builder(time_period=2024) + df = b._query_targets({"domain_variables": ["aca_ptc"]}) factors = { (2022, "cpi"): cpi_factor, (2022, "pop"): pop_factor, } df["original_value"] = df["value"].copy() df["uprating_factor"] = df.apply( - lambda row: builder._get_uprating_info( + lambda row: b._get_uprating_info( row["variable"], row["period"], factors )[0], axis=1, ) df["value"] = df["original_value"] * df["uprating_factor"] - return builder, df, factors + return b, df, factors - def test_cd_sums_match_uprated_state_totals(self): - """After reconciliation, CD sums must equal state * UF.""" - builder, df, factors = self._get_targets_with_uprating( + def test_cd_sums_match_uprated_state(self): + b, df, factors = self._get_targets_with_uprating( cpi_factor=1.1, pop_factor=1.02 ) + result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors) + csv_factors = b._load_aca_ptc_factors() - result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors) - - # Get the CSV-based uprating factors used - csv_factors = builder._load_aca_ptc_factors() - - # Expected: state_original * csv_factor - for var, state_fips, state_original in [ + for var, sf, orig in [ ("aca_ptc", 6, 6000.0), ("aca_ptc", 37, 4000.0), ("tax_unit_count", 6, 300.0), ("tax_unit_count", 37, 200.0), ]: - expected_total = state_original * csv_factors[state_fips][var] + expected = orig * csv_factors[sf][var] cd_rows = result[ (result["variable"] == var) & (result["geo_level"] == "district") & ( result["geographic_id"].apply( - lambda g, s=state_fips: ( + lambda g, s=sf: ( int(g) // 100 == s if g.isdigit() else False ) ) ) ] - cd_sum = cd_rows["value"].sum() self.assertAlmostEqual( - cd_sum, - expected_total, + cd_rows["value"].sum(), + expected, places=2, - msg=f"CD sum for {var} state {state_fips}", + msg=f"{var} state {sf}", ) def 
test_national_and_state_rows_dropped(self): - """IRS national and state rows (period!=2024) are dropped.""" - builder, df, factors = self._get_targets_with_uprating() - result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors) - + b, df, factors = self._get_targets_with_uprating() + result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors) irs_national = result[ (result["geo_level"] == "national") & (result["period"] != 2024) ] self.assertEqual(len(irs_national), 0) - state_rows = result[result["geo_level"] == "state"] self.assertEqual(len(state_rows), 0) def test_cms_person_count_preserved(self): - """CMS national person_count (period=2024) is NOT dropped.""" - builder, df, factors = self._get_targets_with_uprating() - result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors) - + b, df, factors = self._get_targets_with_uprating() + result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors) cms = result[ (result["variable"] == "person_count") & (result["period"] == 2024) ] self.assertEqual(len(cms), 1) self.assertAlmostEqual(cms.iloc[0]["value"], 19743689.0, places=0) - def test_hif_and_uprating_columns(self): - """Diagnostic hif and state_uprating_factor columns populated.""" - builder, df, factors = self._get_targets_with_uprating(cpi_factor=1.1) - result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors) - - cd_aca = result[ - (result["variable"] == "aca_ptc") - & (result["geo_level"] == "district") - ] - self.assertTrue(cd_aca["hif"].notna().all()) - self.assertTrue(cd_aca["state_uprating_factor"].notna().all()) - def test_hif_is_one_when_cds_sum_to_state(self): - """HIF == 1.0 when CDs already sum to state total. - - The uprating factor now comes from the CSV (state-specific), - not from national CPI, so we just check HIF and that a - nonzero uprating factor is set. 
- """ - builder, df, factors = self._get_targets_with_uprating(cpi_factor=1.15) - result = builder._apply_hierarchical_uprating(df, ["aca_ptc"], factors) - + b, df, factors = self._get_targets_with_uprating(cpi_factor=1.15) + result = b._apply_hierarchical_uprating(df, ["aca_ptc"], factors) cd_aca = result[ (result["variable"] == "aca_ptc") & (result["geo_level"] == "district") ] for _, row in cd_aca.iterrows(): - self.assertAlmostEqual( - row["hif"], - 1.0, - places=6, - msg=( - f"CD {row['geographic_id']} HIF " - f"should be 1.0 (CDs sum to state)" - ), - ) - self.assertGreater( - row["state_uprating_factor"], - 0, - msg=( - f"CD {row['geographic_id']} should " - f"have a positive uprating factor" - ), - ) - - def test_no_data_loss_for_non_hierarchical_rows(self): - """Rows not in hierarchical_domains are untouched.""" - builder, df, factors = self._get_targets_with_uprating() + self.assertAlmostEqual(row["hif"], 1.0, places=6) - # Add a non-hierarchical row + def test_non_hierarchical_rows_untouched(self): + b, df, factors = self._get_targets_with_uprating() extra = pd.DataFrame( [ { @@ -401,20 +314,14 @@ def test_no_data_loss_for_non_hierarchical_rows(self): } ] ) - df_with_snap = pd.concat([df, extra], ignore_index=True) - - result = builder._apply_hierarchical_uprating( - df_with_snap, ["aca_ptc"], factors - ) - - snap_rows = result[result["domain_variable"] == "snap"] - self.assertEqual(len(snap_rows), 1) - self.assertEqual(snap_rows.iloc[0]["value"], 5000.0) + df2 = pd.concat([df, extra], ignore_index=True) + result = b._apply_hierarchical_uprating(df2, ["aca_ptc"], factors) + snap = result[result["domain_variable"] == "snap"] + self.assertEqual(len(snap), 1) + self.assertEqual(snap.iloc[0]["value"], 5000.0) class TestGetStateUpratingFactors(unittest.TestCase): - """Test _get_state_uprating_factors.""" - @classmethod def setUpClass(cls): cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False) @@ -428,56 +335,17 @@ def tearDownClass(cls): 
os.unlink(cls.db_path) def test_aca_ptc_uses_csv_factors(self): - """aca_ptc domain loads real state-level factors from CSV.""" - builder = SparseMatrixBuilder( - db_uri=self.db_uri, - time_period=2024, - cds_to_calibrate=["601"], - ) - df = builder._query_targets({"domain_variables": ["aca_ptc"]}) - national_factors = { - (2022, "cpi"): 1.08, - (2022, "pop"): 1.015, - } + b = UnifiedMatrixBuilder(db_uri=self.db_uri, time_period=2024) + df = b._query_targets({"domain_variables": ["aca_ptc"]}) + nf = {(2022, "cpi"): 1.08, (2022, "pop"): 1.015} df["original_value"] = df["value"].copy() - result = builder._get_state_uprating_factors( - "aca_ptc", df, national_factors - ) - + result = b._get_state_uprating_factors("aca_ptc", df, nf) self.assertIn(6, result) self.assertIn(37, result) - # CA: vol_mult ~1.0554, val_mult ~1.1460 - # aca_ptc factor = vol_mult * val_mult - self.assertAlmostEqual( - result[6]["aca_ptc"], - 1.0554375137756227 * 1.1459694989106755, - places=5, - ) - # tax_unit_count factor = vol_mult only - self.assertAlmostEqual( - result[6]["tax_unit_count"], 1.0554375137756227, places=5 - ) - - # NC: vol_mult ~1.4784, val_mult ~0.9571 - self.assertAlmostEqual( - result[37]["aca_ptc"], - 1.4784049241899557 * 0.9571183533447685, - places=5, - ) - self.assertAlmostEqual( - result[37]["tax_unit_count"], 1.4784049241899557, places=5 - ) - - def test_non_aca_domain_uses_national_factors(self): - """Non-aca_ptc domains fall back to national CPI/pop factors.""" - builder = SparseMatrixBuilder( - db_uri=self.db_uri, - time_period=2024, - cds_to_calibrate=["601"], - ) - # Build a fake targets_df with domain="snap" + def test_non_aca_uses_national_factors(self): + b = UnifiedMatrixBuilder(db_uri=self.db_uri, time_period=2024) df = pd.DataFrame( [ { @@ -500,21 +368,32 @@ def test_non_aca_domain_uses_national_factors(self): }, ] ) - national_factors = { - (2022, "cpi"): 1.08, - (2022, "pop"): 1.015, - } - - result = builder._get_state_uprating_factors( - "snap", df, 
national_factors - ) - + nf = {(2022, "cpi"): 1.08, (2022, "pop"): 1.015} + result = b._get_state_uprating_factors("snap", df, nf) self.assertIn(6, result) - # snap is dollar -> CPI self.assertAlmostEqual(result[6]["snap"], 1.08) - # household_count -> pop self.assertAlmostEqual(result[6]["household_count"], 1.015) +class TestCountTargetDetection(unittest.TestCase): + def test_endswith_count(self): + count_vars = [ + "person_count", + "tax_unit_count", + "household_count", + ] + value_vars = ["snap", "aca_ptc", "income_tax"] + for v in count_vars: + self.assertTrue( + v.endswith("_count"), + f"{v} should be detected as count", + ) + for v in value_vars: + self.assertFalse( + v.endswith("_count"), + f"{v} should not be a count target", + ) + + if __name__ == "__main__": unittest.main() diff --git a/policyengine_us_data/tests/test_local_area_calibration/conftest.py b/policyengine_us_data/tests/test_local_area_calibration/conftest.py index ce36157cc..dfede8002 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/conftest.py +++ b/policyengine_us_data/tests/test_local_area_calibration/conftest.py @@ -1,57 +1,8 @@ -"""Shared fixtures for local area calibration tests. - -Importantly, this file determines which variables will be included in the sparse matrix and calibrating routine. 
-""" +"""Shared fixtures for local area calibration tests.""" import pytest -import numpy as np -from sqlalchemy import create_engine, text -from policyengine_us import Microsimulation from policyengine_us_data.storage import STORAGE_FOLDER -from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( - SparseMatrixBuilder, -) -from policyengine_us_data.datasets.cps.local_area_calibration.matrix_tracer import ( - MatrixTracer, -) -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_calculated_variables, -) - -# Variables to test for state-level value matching (CI uses subset for speed) -# Format: (variable_name, rtol) -# variable_name as per the targets in policy_data.db -# rtol is relative tolerance for comparison -# -# NOTE: Count targets (person_count, tax_unit_count) are excluded because -# they have constraints (e.g., age>=5|age<18) that make the X_sparse values -# different from raw sim.calculate() values. Count targets are tested -# separately in test_count_targets.py with controlled mock data. 
-VARIABLES_TO_TEST = [ - ("snap", 1e-2), - ("income_tax", 1e-2), - ("eitc", 1e-2), -] - -# CI filter config - minimal subset for fast CI runs -# Tests 3 representative variables covering benefits, taxes, and credits -COMBINED_FILTER_CONFIG = { - "domain_variables": [ - "snap", - ], - "variables": [ - "snap", - "income_tax", - "eitc", - ], -} - -# Maximum allowed mismatch rate for state-level value comparison -MAX_MISMATCH_RATE = 0.02 - -# Number of samples for cell-level verification tests -N_VERIFICATION_SAMPLES = 500 @pytest.fixture(scope="module") @@ -63,92 +14,3 @@ def db_uri(): @pytest.fixture(scope="module") def dataset_path(): return str(STORAGE_FOLDER / "stratified_extended_cps_2024.h5") - - -@pytest.fixture(scope="module") -def test_cds(db_uri): - """CDs from NC, HI, MT, AK (manageable size for CI, multiple same-state CDs).""" - engine = create_engine(db_uri) - query = """ - SELECT DISTINCT sc.value as cd_geoid - FROM stratum_constraints sc - WHERE sc.constraint_variable = 'congressional_district_geoid' - AND ( - sc.value LIKE '37__' - OR sc.value LIKE '150_' - OR sc.value LIKE '300_' - OR sc.value = '200' OR sc.value = '201' - ) - ORDER BY sc.value - """ - with engine.connect() as conn: - result = conn.execute(text(query)).fetchall() - return [row[0] for row in result] - - -@pytest.fixture(scope="module") -def sim(dataset_path): - return Microsimulation(dataset=dataset_path) - - -@pytest.fixture(scope="module") -def matrix_data(db_uri, dataset_path, test_cds, sim): - """Build sparse matrix with all configured variables.""" - builder = SparseMatrixBuilder( - db_uri, - time_period=2023, - cds_to_calibrate=test_cds, - dataset_path=dataset_path, - ) - targets_df, X_sparse, household_id_mapping = builder.build_matrix( - sim, target_filter=COMBINED_FILTER_CONFIG - ) - return targets_df, X_sparse, household_id_mapping - - -@pytest.fixture(scope="module") -def targets_df(matrix_data): - return matrix_data[0] - - -@pytest.fixture(scope="module") -def 
X_sparse(matrix_data): - return matrix_data[1] - - -@pytest.fixture(scope="module") -def household_id_mapping(matrix_data): - return matrix_data[2] - - -@pytest.fixture(scope="module") -def tracer(targets_df, X_sparse, household_id_mapping, test_cds, sim): - return MatrixTracer( - targets_df, X_sparse, household_id_mapping, test_cds, sim - ) - - -@pytest.fixture(scope="module") -def n_households(tracer): - return tracer.n_households - - -@pytest.fixture(scope="module") -def household_ids(tracer): - return tracer.original_household_ids - - -@pytest.fixture(scope="module") -def household_states(sim): - return sim.calculate("state_fips", map_to="household").values - - -def create_state_simulation(dataset_path, n_households, state): - """Create simulation with all households assigned to a specific state.""" - s = Microsimulation(dataset=dataset_path) - s.set_input( - "state_fips", 2023, np.full(n_households, state, dtype=np.int32) - ) - for var in get_calculated_variables(s): - s.delete_arrays(var) - return s diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_column_indexing.py b/policyengine_us_data/tests/test_local_area_calibration/test_column_indexing.py deleted file mode 100644 index 2e23763bc..000000000 --- a/policyengine_us_data/tests/test_local_area_calibration/test_column_indexing.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Test column indexing in sparse matrix.""" - -import pytest - - -def test_column_indexing_roundtrip(X_sparse, tracer, test_cds): - """ - Verify column index = cd_idx * n_households + household_index. - - This is pure math - if this fails, everything else is unreliable. 
- """ - n_hh = tracer.n_households - hh_ids = tracer.original_household_ids - errors = [] - - test_cases = [] - for cd_idx in [0, len(test_cds) // 2, len(test_cds) - 1]: - for hh_idx in [0, 100, n_hh - 1]: - test_cases.append((cd_idx, hh_idx)) - - for cd_idx, hh_idx in test_cases: - cd = test_cds[cd_idx] - hh_id = hh_ids[hh_idx] - expected_col = cd_idx * n_hh + hh_idx - col_info = tracer.get_column_info(expected_col) - positions = tracer.get_household_column_positions(hh_id) - pos_col = positions[cd] - - if col_info["cd_geoid"] != cd: - errors.append(f"CD mismatch at col {expected_col}") - if col_info["household_index"] != hh_idx: - errors.append(f"HH index mismatch at col {expected_col}") - if col_info["household_id"] != hh_id: - errors.append(f"HH ID mismatch at col {expected_col}") - if pos_col != expected_col: - errors.append(f"Position mismatch for hh {hh_id}, cd {cd}") - - assert not errors, f"Column indexing errors: {errors}" - - -def test_matrix_dimensions(X_sparse, tracer, test_cds): - """Verify matrix width matches expected CD x household count.""" - n_hh = tracer.n_households - expected_cols = len(test_cds) * n_hh - assert ( - X_sparse.shape[1] == expected_cols - ), f"Matrix width mismatch: expected {expected_cols}, got {X_sparse.shape[1]}" diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_count_targets.py b/policyengine_us_data/tests/test_local_area_calibration/test_count_targets.py deleted file mode 100644 index 46eae4ebb..000000000 --- a/policyengine_us_data/tests/test_local_area_calibration/test_count_targets.py +++ /dev/null @@ -1,415 +0,0 @@ -""" -Tests for count target handling in SparseMatrixBuilder. - -These tests verify that count targets (e.g., person_count, tax_unit_count) -are correctly handled by counting entities that satisfy constraints, rather -than summing values. 
-""" - -import pytest -import numpy as np -from dataclasses import dataclass - -from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( - SparseMatrixBuilder, -) - - -@dataclass -class MockEntity: - """Mock entity with a key attribute.""" - - key: str - - -@dataclass -class MockVariable: - """Mock variable with entity information.""" - - entity: MockEntity - - @classmethod - def create(cls, entity_key: str) -> "MockVariable": - return cls(entity=MockEntity(key=entity_key)) - - -class MockTaxBenefitSystem: - """Mock tax benefit system with variable definitions.""" - - def __init__(self): - self.variables = { - "person_count": MockVariable.create("person"), - "tax_unit_count": MockVariable.create("tax_unit"), - "household_count": MockVariable.create("household"), - "spm_unit_count": MockVariable.create("spm_unit"), - "snap": MockVariable.create("spm_unit"), - } - - -@dataclass -class MockCalculationResult: - """Mock result from simulation.calculate().""" - - values: np.ndarray - - -class MockSimulation: - """Mock simulation for testing count target calculations.""" - - def __init__(self, entity_data: dict, variable_values: dict): - """ - Args: - entity_data: Dict with person_id, household_id, tax_unit_id, - spm_unit_id arrays (all at person level) - variable_values: Dict mapping variable names to their values - at the appropriate entity level - """ - self.entity_data = entity_data - self.variable_values = variable_values - self.tax_benefit_system = MockTaxBenefitSystem() - - def calculate(self, variable: str, map_to: str = None): - """Return mock calculation result.""" - if variable in self.entity_data: - # Entity ID variables - if map_to == "person": - values = np.array(self.entity_data[variable]) - elif map_to == "household": - # Return unique household IDs - values = np.array( - sorted(set(self.entity_data["household_id"])) - ) - else: - values = np.array(self.entity_data[variable]) - elif variable in self.variable_values: - # 
Regular variables - return at requested level - val_data = self.variable_values[variable] - if map_to == "person": - values = np.array(val_data["person"]) - elif map_to == "household": - values = np.array(val_data["household"]) - else: - values = np.array(val_data.get("default", [])) - else: - values = np.array([]) - - return MockCalculationResult(values=values) - - -@pytest.fixture -def basic_entity_data(): - """ - Create mock entity relationships with known household compositions. - - Household 1 (id=100): 3 people (ages 5, 12, 40) -> 2 aged 5-17 - Household 2 (id=200): 2 people (ages 3, 25) -> 0 aged 5-17 - Household 3 (id=300): 4 people (ages 6, 8, 10, 45) -> 3 aged 5-17 - """ - return { - "person_id": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "household_id": [100, 100, 100, 200, 200, 300, 300, 300, 300], - "tax_unit_id": [10, 10, 10, 20, 20, 30, 30, 30, 30], - "spm_unit_id": [ - 1000, - 1000, - 1000, - 2000, - 2000, - 3000, - 3000, - 3000, - 3000, - ], - } - - -@pytest.fixture -def basic_variable_values(): - """Variable values for basic household composition tests.""" - return { - "age": { - "person": [5, 12, 40, 3, 25, 6, 8, 10, 45], - "household": [40, 25, 45], # Not used for age constraints - }, - "person_count": { - "person": [1, 1, 1, 1, 1, 1, 1, 1, 1], - "household": [3, 2, 4], # Sum per household - }, - "snap": { - "person": [100, 100, 100, 0, 0, 200, 200, 200, 200], - "household": [300, 0, 800], - }, - } - - -@pytest.fixture -def basic_sim(basic_entity_data, basic_variable_values): - """Mock simulation with basic household compositions.""" - return MockSimulation(basic_entity_data, basic_variable_values) - - -@pytest.fixture -def builder(): - """Create a minimal SparseMatrixBuilder (won't use DB for unit tests).""" - return SparseMatrixBuilder( - db_uri="sqlite:///:memory:", - time_period=2023, - cds_to_calibrate=["101"], - ) - - -# Tests for basic count target calculation -class TestCountTargetCalculation: - """Test _calculate_target_values_entity_aware for 
count targets.""" - - def test_person_count_with_age_constraints(self, builder, basic_sim): - """Test person_count correctly counts persons in age range per HH.""" - # Constraints: age >= 5 AND age < 18 - constraints = [ - {"variable": "age", "operation": ">=", "value": 5}, - {"variable": "age", "operation": "<", "value": 18}, - ] - - geo_mask = np.array([True, True, True]) # All households included - n_households = 3 - - result = builder._calculate_target_values_entity_aware( - basic_sim, - "person_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: HH1 has 2 people (ages 5, 12), HH2 has 0, HH3 has 3 (6,8,10) - expected = np.array([2, 0, 3], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_person_count_no_constraints(self, builder, basic_sim): - """Test person_count without constraints returns all persons per HH.""" - constraints = [] - geo_mask = np.array([True, True, True]) - n_households = 3 - - result = builder._calculate_target_values_entity_aware( - basic_sim, - "person_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: HH1 has 3 people, HH2 has 2, HH3 has 4 - expected = np.array([3, 2, 4], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_person_count_with_geo_mask(self, builder, basic_sim): - """Test person_count respects geographic mask.""" - constraints = [ - {"variable": "age", "operation": ">=", "value": 5}, - {"variable": "age", "operation": "<", "value": 18}, - ] - - # Only include households 1 and 3 - geo_mask = np.array([True, False, True]) - n_households = 3 - - result = builder._calculate_target_values_entity_aware( - basic_sim, - "person_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: HH1=2, HH2=0 (masked out), HH3=3 - expected = np.array([2, 0, 3], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_value_target_uses_sum(self, builder, basic_sim): - """Test that non-count targets sum values 
(existing behavior).""" - # SNAP is a value target, not a count target - constraints = [] - geo_mask = np.array([True, True, True]) - n_households = 3 - - result = builder._calculate_target_values_entity_aware( - basic_sim, - "snap", - constraints, - geo_mask, - n_households, - ) - - # Expected: Sum of snap values per household - expected = np.array([300, 0, 800], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_household_count_no_constraints(self, builder, basic_sim): - """Test household_count returns 1 for each qualifying household.""" - constraints = [] - geo_mask = np.array([True, True, True]) - n_households = 3 - - result = builder._calculate_target_values_entity_aware( - basic_sim, - "household_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: 1 for each household in geo_mask - expected = np.array([1, 1, 1], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_household_count_with_geo_mask(self, builder, basic_sim): - """Test household_count respects geographic mask.""" - constraints = [] - geo_mask = np.array([True, False, True]) - n_households = 3 - - result = builder._calculate_target_values_entity_aware( - basic_sim, - "household_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: 1 for HH1, 0 for HH2 (masked), 1 for HH3 - expected = np.array([1, 0, 1], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - -# Fixtures for complex entity relationship tests -@pytest.fixture -def complex_entity_data(): - """ - Create entity data with multiple tax units per household. 
- - Household 1 (id=100): 4 people in 2 tax units - Tax unit 10: person 1 (age 30, filer), person 2 (age 28) - Tax unit 11: person 3 (age 65, filer), person 4 (age 62) - Household 2 (id=200): 2 people in 1 tax unit - Tax unit 20: person 5 (age 45, filer), person 6 (age 16) - """ - return { - "person_id": [1, 2, 3, 4, 5, 6], - "household_id": [100, 100, 100, 100, 200, 200], - "tax_unit_id": [10, 10, 11, 11, 20, 20], - "spm_unit_id": [1000, 1000, 1000, 1000, 2000, 2000], - } - - -@pytest.fixture -def complex_variable_values(): - """Variable values for complex entity relationship tests.""" - return { - "age": { - "person": [30, 28, 65, 62, 45, 16], - "household": [65, 45], - }, - "is_tax_unit_head": { - "person": [True, False, True, False, True, False], - "household": [2, 1], # count of heads per HH - }, - "tax_unit_count": { - "person": [1, 1, 1, 1, 1, 1], - "household": [2, 1], - }, - "person_count": { - "person": [1, 1, 1, 1, 1, 1], - "household": [4, 2], - }, - } - - -@pytest.fixture -def complex_sim(complex_entity_data, complex_variable_values): - """Mock simulation with complex entity relationships.""" - return MockSimulation(complex_entity_data, complex_variable_values) - - -# Tests for complex entity relationships -class TestCountTargetWithRealEntities: - """Test count targets with more complex entity relationships.""" - - def test_tax_unit_count_no_constraints(self, builder, complex_sim): - """Test tax_unit_count counts all tax units per household.""" - constraints = [] - geo_mask = np.array([True, True]) - n_households = 2 - - result = builder._calculate_target_values_entity_aware( - complex_sim, - "tax_unit_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: HH1 has 2 tax units, HH2 has 1 - expected = np.array([2, 1], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_tax_unit_count_with_age_constraint(self, builder, complex_sim): - """Test tax_unit_count with age constraint on members.""" - # Count tax units 
that have at least one person aged >= 65 - constraints = [ - {"variable": "age", "operation": ">=", "value": 65}, - ] - geo_mask = np.array([True, True]) - n_households = 2 - - result = builder._calculate_target_values_entity_aware( - complex_sim, - "tax_unit_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: HH1 has 1 tax unit (TU 11) with person >=65, HH2 has 0 - expected = np.array([1, 0], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_person_count_seniors(self, builder, complex_sim): - """Test person_count for seniors (age >= 65).""" - constraints = [ - {"variable": "age", "operation": ">=", "value": 65}, - ] - geo_mask = np.array([True, True]) - n_households = 2 - - result = builder._calculate_target_values_entity_aware( - complex_sim, - "person_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: HH1 has 1 senior (age 65), HH2 has 0 - expected = np.array([1, 0], dtype=np.float32) - np.testing.assert_array_equal(result, expected) - - def test_person_count_children(self, builder, complex_sim): - """Test person_count for children (age < 18).""" - constraints = [ - {"variable": "age", "operation": "<", "value": 18}, - ] - geo_mask = np.array([True, True]) - n_households = 2 - - result = builder._calculate_target_values_entity_aware( - complex_sim, - "person_count", - constraints, - geo_mask, - n_households, - ) - - # Expected: HH1 has 0 children, HH2 has 1 (age 16) - expected = np.array([0, 1], dtype=np.float32) - np.testing.assert_array_equal(result, expected) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py deleted file mode 100644 index 2f44428c5..000000000 --- a/policyengine_us_data/tests/test_local_area_calibration/test_cross_state.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Test cross-state values match state-swapped simulations.""" - -import pytest -import numpy as np -from 
collections import defaultdict - -from policyengine_us import Microsimulation -from policyengine_us_data.datasets.cps.local_area_calibration.calibration_utils import ( - get_calculated_variables, -) - -from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES - - -@pytest.mark.skip( - reason="Sparse matrix builder not used in production; test needs rework after time_period fix" -) -def test_cross_state_matches_swapped_sim( - X_sparse, - targets_df, - test_cds, - dataset_path, - n_households, - household_ids, - household_states, -): - """ - Cross-state non-zero cells must match state-swapped simulation. - - When household moves to different state, X_sparse should contain the - value calculated from a fresh simulation with state_fips set to - destination state. - - Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST - are covered with approximately equal samples per variable. - """ - seed = 42 - rng = np.random.default_rng(seed) - n_hh = n_households - hh_ids = household_ids - hh_states = household_states - - state_sims = {} - - def get_state_sim(state): - if state not in state_sims: - s = Microsimulation(dataset=dataset_path) - s.set_input( - "state_fips", 2023, np.full(n_hh, state, dtype=np.int32) - ) - for var in get_calculated_variables(s): - s.delete_arrays(var) - state_sims[state] = s - return state_sims[state] - - nonzero_rows, nonzero_cols = X_sparse.nonzero() - - # Group cross-state cells by variable for stratified sampling - variable_to_indices = defaultdict(list) - variables_to_test = {v[0] for v in VARIABLES_TO_TEST} - - for i in range(len(nonzero_rows)): - row_idx = nonzero_rows[i] - col_idx = nonzero_cols[i] - cd_idx = col_idx // n_hh - hh_idx = col_idx % n_hh - cd = test_cds[cd_idx] - dest_state = int(cd) // 100 - orig_state = int(hh_states[hh_idx]) - - # Only include cross-state cells - if dest_state == orig_state: - continue - - # Get variable for this row - variable = targets_df.iloc[row_idx]["variable"] - if variable in 
variables_to_test: - variable_to_indices[variable].append(i) - - if not variable_to_indices: - pytest.skip("No cross-state non-zero cells found for test variables") - - # Stratified sampling: sample proportionally from each variable - samples_per_var = max( - 1, N_VERIFICATION_SAMPLES // len(variable_to_indices) - ) - sample_indices = [] - - for variable, indices in variable_to_indices.items(): - n_to_sample = min(samples_per_var, len(indices)) - sampled = rng.choice(indices, n_to_sample, replace=False) - sample_indices.extend(sampled) - - errors = [] - variables_tested = set() - - for idx in sample_indices: - row_idx = nonzero_rows[idx] - col_idx = nonzero_cols[idx] - cd_idx = col_idx // n_hh - hh_idx = col_idx % n_hh - cd = test_cds[cd_idx] - dest_state = int(cd) // 100 - variable = targets_df.iloc[row_idx]["variable"] - actual = float(X_sparse[row_idx, col_idx]) - state_sim = get_state_sim(dest_state) - expected = float( - state_sim.calculate(variable, map_to="household").values[hh_idx] - ) - - variables_tested.add(variable) - - if not np.isclose(actual, expected, atol=0.5): - errors.append( - { - "hh_id": hh_ids[hh_idx], - "orig_state": int(hh_states[hh_idx]), - "dest_state": dest_state, - "variable": variable, - "actual": actual, - "expected": expected, - } - ) - - # Report which variables were tested - missing_vars = variables_to_test - variables_tested - if missing_vars: - print(f"Warning: No cross-state cells found for: {missing_vars}") - - assert not errors, ( - f"Cross-state verification failed: {len(errors)}/{len(sample_indices)} " - f"mismatches across {len(variables_tested)} variables. 
" - f"First 5: {errors[:5]}" - ) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_geo_masking.py b/policyengine_us_data/tests/test_local_area_calibration/test_geo_masking.py deleted file mode 100644 index 9f0033733..000000000 --- a/policyengine_us_data/tests/test_local_area_calibration/test_geo_masking.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Test geographic masking behavior in sparse matrix.""" - -import pytest -import numpy as np - - -def test_state_level_zero_masking( - X_sparse, targets_df, tracer, test_cds, n_households -): - """ - State-level targets have zeros for wrong-state CD columns. - - For a target with geographic_id=37 (NC), columns for CDs in other states - (HI, MT, AK) should all be zero. - """ - seed = 42 - rng = np.random.default_rng(seed) - n_hh = n_households - - state_targets = [] - for row_idx in range(len(targets_df)): - geo_id = targets_df.iloc[row_idx].get("geographic_id", "US") - if geo_id != "US": - try: - val = int(geo_id) - if val < 100: - state_targets.append((row_idx, val)) - except (ValueError, TypeError): - pass - - if not state_targets: - pytest.skip("No state-level targets found") - - errors = [] - checked = 0 - sample_targets = rng.choice( - len(state_targets), min(20, len(state_targets)), replace=False - ) - - for idx in sample_targets: - row_idx, target_state = state_targets[idx] - other_state_cds = [ - (i, cd) - for i, cd in enumerate(test_cds) - if int(cd) // 100 != target_state - ] - if not other_state_cds: - continue - - sample_cds = rng.choice( - len(other_state_cds), min(5, len(other_state_cds)), replace=False - ) - for cd_sample_idx in sample_cds: - cd_idx, cd = other_state_cds[cd_sample_idx] - sample_hh = rng.choice(n_hh, min(5, n_hh), replace=False) - for hh_idx in sample_hh: - col_idx = cd_idx * n_hh + hh_idx - actual = X_sparse[row_idx, col_idx] - checked += 1 - if actual != 0: - errors.append( - {"row": row_idx, "cd": cd, "value": float(actual)} - ) - - assert ( - not errors - ), f"State-level 
masking failed: {len(errors)}/{checked} should be zero" - - -def test_cd_level_zero_masking( - X_sparse, targets_df, tracer, test_cds, n_households -): - """ - CD-level targets have zeros for other CDs, even same-state. - - For a target with geographic_id=3707, columns for CDs 3701-3706, 3708-3714 - should all be zero, even though they're all in NC (state 37). - """ - seed = 42 - rng = np.random.default_rng(seed) - n_hh = n_households - - cd_targets_with_same_state = [] - for row_idx in range(len(targets_df)): - geo_id = targets_df.iloc[row_idx].get("geographic_id", "US") - if geo_id != "US": - try: - val = int(geo_id) - if val >= 100: - target_state = val // 100 - same_state_other_cds = [ - cd - for cd in test_cds - if int(cd) // 100 == target_state and cd != geo_id - ] - if same_state_other_cds: - cd_targets_with_same_state.append( - (row_idx, geo_id, same_state_other_cds) - ) - except (ValueError, TypeError): - pass - - if not cd_targets_with_same_state: - pytest.skip( - "No CD-level targets with same-state other CDs in test_cds" - ) - - errors = [] - same_state_checks = 0 - - for row_idx, target_cd, other_cds in cd_targets_with_same_state[:10]: - for cd in other_cds: - cd_idx = test_cds.index(cd) - for hh_idx in rng.choice(n_hh, 3, replace=False): - col_idx = cd_idx * n_hh + hh_idx - actual = X_sparse[row_idx, col_idx] - same_state_checks += 1 - if actual != 0: - errors.append( - { - "target_cd": target_cd, - "other_cd": cd, - "value": float(actual), - } - ) - - assert not errors, ( - f"CD-level masking failed: {len(errors)} same-state-different-CD " - f"non-zero values. First 5: {errors[:5]}" - ) - - -@pytest.mark.skip( - reason="Sparse matrix builder not used in production; test needs rework after time_period fix" -) -def test_national_no_geo_masking( - X_sparse, targets_df, tracer, sim, test_cds, dataset_path, n_households -): - """ - National targets have no geographic masking. - - National targets (geographic_id='US') can have non-zero values for ANY CD. 
- Values differ by destination state because benefits are recalculated - under each state's rules. - """ - seed = 42 - rng = np.random.default_rng(seed) - n_hh = n_households - hh_ids = tracer.original_household_ids - - national_rows = [ - i - for i in range(len(targets_df)) - if targets_df.iloc[i].get("geographic_id", "US") == "US" - ] - - if not national_rows: - pytest.skip("No national targets found") - - states_in_test = sorted(set(int(cd) // 100 for cd in test_cds)) - cds_by_state = { - state: [cd for cd in test_cds if int(cd) // 100 == state] - for state in states_in_test - } - - for row_idx in national_rows: - variable = targets_df.iloc[row_idx]["variable"] - - row_data = X_sparse.getrow(row_idx) - nonzero_cols = row_data.nonzero()[1] - - assert ( - len(nonzero_cols) > 0 - ), f"National target row {row_idx} ({variable}) has no non-zero values" - - sample_cols = rng.choice( - nonzero_cols, min(5, len(nonzero_cols)), replace=False - ) - - households_checked = 0 - households_with_multi_state_values = 0 - - for col_idx in sample_cols: - hh_idx = col_idx % n_hh - - values_by_state = {} - for state, cds in cds_by_state.items(): - cd = cds[0] - cd_idx = test_cds.index(cd) - state_col = cd_idx * n_hh + hh_idx - val = float(X_sparse[row_idx, state_col]) - if val != 0: - values_by_state[state] = val - - households_checked += 1 - if len(values_by_state) > 1: - households_with_multi_state_values += 1 - - assert households_with_multi_state_values > 0, ( - f"National target {variable}: no households have values in " - f"multiple states" - ) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py b/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py deleted file mode 100644 index 53760834c..000000000 --- a/policyengine_us_data/tests/test_local_area_calibration/test_matrix_national_variation.py +++ /dev/null @@ -1,488 +0,0 @@ -""" -Tests for correctness in the sparse matrix builder, 
particularly for national level contributions. - -These tests verify that: -1. Matrix shape and structure are correct -2. Variable aggregation (person to household) preserves totals -3. National-level targets receive contributions from all states (no geographic - bias) -4. Cross-state recalculation applies state-specific rules -""" - -import pytest -import numpy as np -import pandas as pd -from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( - SparseMatrixBuilder, -) - -from .conftest import ( - VARIABLES_TO_TEST, - COMBINED_FILTER_CONFIG, -) - -# Variables with state-specific variation (e.g., SNAP eligibility) -VARIABLES_WITH_STATE_VARIATION = [ - "snap", -] - - -@pytest.fixture(scope="module") -def builder(db_uri, dataset_path, test_cds): - """SparseMatrixBuilder configured with test CDs.""" - return SparseMatrixBuilder( - db_uri=db_uri, - time_period=2023, - cds_to_calibrate=test_cds, - dataset_path=dataset_path, - ) - - -def _get_geo_level(geo_id) -> str: - """Determine geographic level from geographic_id.""" - if geo_id == "US": - return "national" - try: - val = int(geo_id) - if 1 <= val <= 56: - return "state" - else: - return "district" - except (ValueError, TypeError): - return "unknown" - - -def test_person_level_aggregation_preserves_totals(sim): - """Health insurance premiums (person-level) sum correctly to household.""" - var = "health_insurance_premiums_without_medicare_part_b" - person_total = sim.calculate(var, 2023, map_to="person").values.sum() - household_total = sim.calculate(var, 2023, map_to="household").values.sum() - assert np.isclose(person_total, household_total, rtol=1e-6) - - -def test_matrix_shape(sim, builder): - """Matrix should have (n_targets, n_households * n_cds) shape.""" - targets_df, X_sparse, _ = builder.build_matrix( - sim, - target_filter={ - "variables": ["health_insurance_premiums_without_medicare_part_b"] - }, - ) - n_households = len( - sim.calculate("household_id", 
map_to="household").values - ) - n_cds = len(builder.cds_to_calibrate) - assert X_sparse.shape[1] == n_households * n_cds - - -def test_combined_variables_in_matrix(sim, builder): - """Matrix should include all configured variables.""" - targets_df, X_sparse, _ = builder.build_matrix( - sim, - target_filter=COMBINED_FILTER_CONFIG, - ) - variables = targets_df["variable"].unique() - - for var_name, _ in VARIABLES_TO_TEST: - assert var_name in variables, f"Missing variable: {var_name}" - - -class TestNationalLevelContributions: - """ - Tests verifying that national-level targets receive contributions from - households across all states, not just a geographic subset. - - The key insight: for a national target, when we look at a single CD's - column block, households from ALL original states should potentially - contribute (subject to meeting eligibility constraints). There should - be no systematic geographic bias where only households from certain - states contribute to the national total. - """ - - def test_national_targets_receive_multistate_contributions( - self, targets_df, X_sparse, household_states, n_households, test_cds - ): - """ - Verify that national-level targets have contributions from households - originally from multiple states. - - For each national target: - 1. Look at the matrix row - 2. For EACH CD's column block, identify which original states have - non-zero contributions - 3. 
Verify contributions come from multiple states (not geographically - biased) - """ - state_fips = household_states - cds = test_cds - - # Find national-level targets - national_targets = targets_df[ - targets_df["geographic_id"].apply( - lambda x: _get_geo_level(x) == "national" - ) - ] - - if len(national_targets) == 0: - pytest.skip("No national-level targets found") - - results = [] - - for _, target in national_targets.iterrows(): - row_idx = target.name - variable = target["variable"] - row = X_sparse[row_idx, :].toarray().flatten() - - # For each CD block, check which original states contribute - cd_contribution_stats = [] - - for cd_idx, cd in enumerate(cds): - col_start = cd_idx * n_households - col_end = col_start + n_households - cd_values = row[col_start:col_end] - - # Find households with non-zero values in this CD block - nonzero_mask = cd_values != 0 - nonzero_indices = np.where(nonzero_mask)[0] - - if len(nonzero_indices) == 0: - continue - - # Get original states of contributing households - contributing_states = set(state_fips[nonzero_indices]) - - cd_contribution_stats.append( - { - "cd": cd, - "cd_state": int(cd) // 100, - "n_contributing": len(nonzero_indices), - "n_states": len(contributing_states), - "contributing_states": contributing_states, - } - ) - - if not cd_contribution_stats: - results.append( - { - "variable": variable, - "status": "NO_CONTRIBUTIONS", - "details": "No non-zero values in any CD block", - } - ) - continue - - # Aggregate stats - stats_df = pd.DataFrame(cd_contribution_stats) - avg_states = stats_df["n_states"].mean() - min_states = stats_df["n_states"].min() - - # Check: on average, contributions should come from multiple states - # (at least 2, since we have CDs from 4 different states) - passed = avg_states >= 2 and min_states >= 1 - - results.append( - { - "variable": variable, - "status": "PASSED" if passed else "FAILED", - "avg_contributing_states": avg_states, - "min_contributing_states": min_states, - 
"n_cd_blocks_with_data": len(stats_df), - } - ) - - # Assert no geographic bias - failed = [r for r in results if r["status"] == "FAILED"] - assert len(failed) == 0, ( - f"Geographic bias detected in national targets: " - f"{[r['variable'] for r in failed]}" - ) - - def test_state_distribution_in_national_targets( - self, targets_df, X_sparse, household_states, n_households, test_cds - ): - """ - Verify the distribution of contributing states in national targets - roughly matches the original data distribution. - - This catches cases where one state dominates the contributions - disproportionately. - """ - state_fips = household_states - cds = test_cds - - # Get original state distribution (count of households per state) - unique_states, original_counts = np.unique( - state_fips, return_counts=True - ) - original_dist = dict(zip(unique_states, original_counts)) - total_hh = len(state_fips) - - # Find national-level targets - national_targets = targets_df[ - targets_df["geographic_id"].apply( - lambda x: _get_geo_level(x) == "national" - ) - ] - - if len(national_targets) == 0: - pytest.skip("No national-level targets found") - - for _, target in national_targets.iterrows(): - row_idx = target.name - variable = target["variable"] - row = X_sparse[row_idx, :].toarray().flatten() - - # Count contributions by original state across ALL CD blocks - state_contribution_counts = {} - - for cd_idx, cd in enumerate(cds): - col_start = cd_idx * n_households - col_end = col_start + n_households - cd_values = row[col_start:col_end] - - nonzero_mask = cd_values != 0 - nonzero_indices = np.where(nonzero_mask)[0] - - for hh_idx in nonzero_indices: - orig_state = state_fips[hh_idx] - state_contribution_counts[orig_state] = ( - state_contribution_counts.get(orig_state, 0) + 1 - ) - - if not state_contribution_counts: - continue - - # Check that no single state dominates excessively - total_contributions = sum(state_contribution_counts.values()) - max_contribution = 
max(state_contribution_counts.values()) - max_state = max( - state_contribution_counts, key=state_contribution_counts.get - ) - max_share = max_contribution / total_contributions - - # The max share should not exceed 70% (unless that state has 70%+ - # of households in the original data) - original_max_share = original_dist.get(max_state, 0) / total_hh - - # Allow 20% margin above original share - threshold = min(0.7, original_max_share + 0.2) - - assert max_share <= threshold, ( - f"State {max_state} dominates national {variable} target with " - f"{max_share:.1%} of contributions " - f"(original share: {original_max_share:.1%})" - ) - - -@pytest.mark.skip( - reason="Sparse matrix builder not used in production; test needs rework after time_period fix" -) -class TestCrossStateRecalculation: - """ - Tests verifying that household values change when borrowed to different - states, confirming state-specific rules are being applied. - - The key insight: for national-level targets (no state constraint), each - household appears in every CD block. The value in each CD block represents - what the variable would be if that household lived in that CD's state. - For state-dependent variables (like SNAP), values should differ across - states for at least some households. - - NOTE: This complements test_cross_state.py which verifies exact values. - These tests verify that variation exists (state rules are applied). - """ - - def test_values_change_across_states_for_national_targets( - self, targets_df, X_sparse, n_households, test_cds - ): - """ - Verify that for national targets, household values vary across CD - blocks from different states. - - This confirms the matrix builder is correctly recalculating variables - with state-specific rules when households are "borrowed" to different - geographic areas. - - The test checks: - 1. For each national target, examine households with non-zero values - 2. Compare each household's value across CD blocks from different states - 3. 
At least some households should have different values in different - states (confirming recalculation with different state rules) - """ - cds = test_cds - - # Group CDs by state - cds_by_state = {} - for cd_idx, cd in enumerate(cds): - state = int(cd) // 100 - if state not in cds_by_state: - cds_by_state[state] = [] - cds_by_state[state].append((cd_idx, cd)) - - states = list(cds_by_state.keys()) - if len(states) < 2: - pytest.skip("Need at least 2 states to test cross-state variation") - - # Find national-level targets - national_targets = targets_df[ - targets_df["geographic_id"].apply( - lambda x: _get_geo_level(x) == "national" - ) - ] - - if len(national_targets) == 0: - pytest.skip("No national-level targets found") - - results = [] - - for _, target in national_targets.iterrows(): - if target["variable"] not in VARIABLES_WITH_STATE_VARIATION: - continue - row_idx = target.name - variable = target["variable"] - row = X_sparse[row_idx, :].toarray().flatten() - - # For each household, collect values from different states - households_with_variation = 0 - households_checked = 0 - - # Sample households (check every 10th to keep test fast) - for hh_idx in range(0, n_households, 10): - # Get this household's value in each state (use first CD of - # each state) - state_values = {} - for state, cd_list in cds_by_state.items(): - cd_idx, _ = cd_list[0] # First CD in this state - col_idx = cd_idx * n_households + hh_idx - state_values[state] = row[col_idx] - - # Skip if all values are zero (household doesn't qualify for - # this variable) - nonzero_values = [v for v in state_values.values() if v != 0] - if len(nonzero_values) < 2: - continue - - households_checked += 1 - - # Check if values differ across states - unique_values = set(nonzero_values) - if len(unique_values) > 1: - households_with_variation += 1 - - variation_rate = ( - households_with_variation / households_checked - if households_checked > 0 - else 0 - ) - - results.append( - { - "variable": variable, - 
"households_checked": households_checked, - "households_with_variation": households_with_variation, - "variation_rate": variation_rate, - } - ) - - # For state-dependent variables, we expect SOME variation - # (not all households will vary - some may have $0 or max benefits - # regardless of state) - # The key is that variation exists, confirming recalculation occurs - for r in results: - if r["households_checked"] > 0: - # At least 10% of households should show variation for - # state-dependent variables - assert ( - r["variation_rate"] > 0.1 or r["households_checked"] < 10 - ), ( - f"No cross-state variation found for {r['variable']}. " - f"This suggests state-specific rules may not be applied " - f"when households are borrowed to different states." - ) - - def test_same_household_different_states_shows_rule_changes( - self, targets_df, X_sparse, household_states, n_households, test_cds - ): - """ - Deep dive test: pick specific households and verify their values - differ across states in a way consistent with state-specific rules. - - For SNAP specifically, different states have different: - - Standard deductions - - Shelter deduction caps - - Vehicle allowances - - Categorical eligibility rules - - This test finds households where we can verify the recalculation - is applying different state rules. 
- """ - state_fips_orig = household_states - cds = test_cds - - # Group CDs by state - cds_by_state = {} - for cd_idx, cd in enumerate(cds): - state = int(cd) // 100 - if state not in cds_by_state: - cds_by_state[state] = [] - cds_by_state[state].append((cd_idx, cd)) - - states = sorted(cds_by_state.keys()) - if len(states) < 2: - pytest.skip("Need at least 2 states") - - # Find national SNAP target (most state-dependent) - snap_national = targets_df[ - (targets_df["variable"] == "snap") - & ( - targets_df["geographic_id"].apply( - lambda x: _get_geo_level(x) == "national" - ) - ) - ] - - if len(snap_national) == 0: - pytest.skip("No national SNAP target found") - - row_idx = snap_national.iloc[0].name - row = X_sparse[row_idx, :].toarray().flatten() - - # Find households with interesting variation patterns - example_households = [] - - for hh_idx in range(n_households): - state_values = {} - for state, cd_list in cds_by_state.items(): - cd_idx, _ = cd_list[0] - col_idx = cd_idx * n_households + hh_idx - state_values[state] = row[col_idx] - - # Look for households where: - # 1. At least 2 states have non-zero SNAP - # 2. 
The values differ significantly (>10% relative difference) - nonzero_states = {s: v for s, v in state_values.items() if v > 0} - - if len(nonzero_states) >= 2: - values = list(nonzero_states.values()) - max_val = max(values) - min_val = min(values) - if min_val > 0 and (max_val - min_val) / min_val > 0.1: - example_households.append( - { - "hh_idx": hh_idx, - "original_state": state_fips_orig[hh_idx], - "state_values": nonzero_states, - "max_val": max_val, - "min_val": min_val, - "variation": (max_val - min_val) / min_val, - } - ) - - if len(example_households) >= 5: - break - - # Assert we found at least one household with variation - assert len(example_households) > 0, ( - "Expected to find households with >10% SNAP variation across " - "states, confirming state-specific rules are applied" - ) diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_period_selection_and_uprating.py b/policyengine_us_data/tests/test_local_area_calibration/test_period_selection_and_uprating.py deleted file mode 100644 index b6523f91b..000000000 --- a/policyengine_us_data/tests/test_local_area_calibration/test_period_selection_and_uprating.py +++ /dev/null @@ -1,246 +0,0 @@ -""" -Tests for best-period selection and uprating in SparseMatrixBuilder. 
-""" - -import unittest -import tempfile -import os -import pandas as pd -from sqlalchemy import create_engine, text - -from policyengine_us_data.datasets.cps.local_area_calibration.sparse_matrix_builder import ( - SparseMatrixBuilder, -) -from policyengine_us_data.db.create_database_tables import ( - TARGET_OVERVIEW_VIEW, -) - - -class TestPeriodSelectionAndUprating(unittest.TestCase): - """Test best-period SQL CTE and uprating logic.""" - - @classmethod - def setUpClass(cls): - cls.temp_db = tempfile.NamedTemporaryFile(suffix=".db", delete=False) - cls.db_path = cls.temp_db.name - cls.temp_db.close() - - cls.db_uri = f"sqlite:///{cls.db_path}" - engine = create_engine(cls.db_uri) - - with engine.connect() as conn: - conn.execute( - text("CREATE TABLE strata (" "stratum_id INTEGER PRIMARY KEY)") - ) - conn.execute( - text( - "CREATE TABLE stratum_constraints (" - "constraint_id INTEGER PRIMARY KEY, " - "stratum_id INTEGER, " - "constraint_variable TEXT, " - "operation TEXT, " - "value TEXT)" - ) - ) - conn.execute( - text( - "CREATE TABLE targets (" - "target_id INTEGER PRIMARY KEY, " - "stratum_id INTEGER, " - "variable TEXT, " - "value REAL, " - "period INTEGER, " - "active INTEGER DEFAULT 1)" - ) - ) - - conn.execute(text(TARGET_OVERVIEW_VIEW)) - conn.commit() - - @classmethod - def tearDownClass(cls): - os.unlink(cls.db_path) - - def setUp(self): - engine = create_engine(self.db_uri) - with engine.connect() as conn: - conn.execute(text("DELETE FROM targets")) - conn.execute(text("DELETE FROM stratum_constraints")) - conn.execute(text("DELETE FROM strata")) - conn.commit() - - def _insert_test_data(self, strata, constraints, targets): - engine = create_engine(self.db_uri) - with engine.connect() as conn: - for stratum_id, group_id in strata: - conn.execute( - text("INSERT INTO strata VALUES (:sid)"), - {"sid": stratum_id}, - ) - for i, (stratum_id, var, op, val) in enumerate(constraints): - conn.execute( - text( - "INSERT INTO stratum_constraints " - "VALUES 
(:cid, :sid, :var, :op, :val)" - ), - { - "cid": i + 1, - "sid": stratum_id, - "var": var, - "op": op, - "val": val, - }, - ) - for i, ( - stratum_id, - variable, - value, - period, - ) in enumerate(targets): - conn.execute( - text( - "INSERT INTO targets " - "(target_id, stratum_id, variable, " - "value, period) " - "VALUES (:tid, :sid, :var, :val, :period)" - ), - { - "tid": i + 1, - "sid": stratum_id, - "var": variable, - "val": value, - "period": period, - }, - ) - conn.commit() - - def _make_builder(self, time_period=2024): - return SparseMatrixBuilder( - db_uri=self.db_uri, - time_period=time_period, - cds_to_calibrate=["601"], - ) - - # ---- Period selection tests ---- - - def test_best_period_prefers_past(self): - """Targets at 2022 and 2026 -> picks 2022 for time_period=2024.""" - self._insert_test_data( - strata=[(1, 1)], - constraints=[ - (1, "congressional_district_geoid", "=", "601"), - ], - targets=[ - (1, "snap", 1000, 2022), - (1, "snap", 2000, 2026), - ], - ) - builder = self._make_builder(time_period=2024) - df = builder._query_targets({"stratum_ids": [1]}) - self.assertEqual(len(df), 1) - self.assertEqual(df.iloc[0]["period"], 2022) - self.assertEqual(df.iloc[0]["value"], 1000) - - def test_best_period_uses_future_when_no_past(self): - """Target only at 2026 -> picks 2026 for time_period=2024.""" - self._insert_test_data( - strata=[(1, 1)], - constraints=[ - (1, "congressional_district_geoid", "=", "601"), - ], - targets=[ - (1, "snap", 5000, 2026), - ], - ) - builder = self._make_builder(time_period=2024) - df = builder._query_targets({"stratum_ids": [1]}) - self.assertEqual(len(df), 1) - self.assertEqual(df.iloc[0]["period"], 2026) - - def test_best_period_exact_match(self): - """Targets at 2022, 2024, 2026 -> picks 2024 exactly.""" - self._insert_test_data( - strata=[(1, 1)], - constraints=[ - (1, "congressional_district_geoid", "=", "601"), - ], - targets=[ - (1, "snap", 1000, 2022), - (1, "snap", 1500, 2024), - (1, "snap", 2000, 2026), - ], 
- ) - builder = self._make_builder(time_period=2024) - df = builder._query_targets({"stratum_ids": [1]}) - self.assertEqual(len(df), 1) - self.assertEqual(df.iloc[0]["period"], 2024) - self.assertEqual(df.iloc[0]["value"], 1500) - - def test_independent_per_stratum_and_variable(self): - """Different strata/variables select independently.""" - self._insert_test_data( - strata=[(1, 1), (2, 1)], - constraints=[ - (1, "congressional_district_geoid", "=", "601"), - (2, "congressional_district_geoid", "=", "601"), - ], - targets=[ - (1, "snap", 1000, 2024), - (1, "snap", 800, 2022), - (2, "person_count", 500, 2022), - (2, "person_count", 600, 2026), - ], - ) - builder = self._make_builder(time_period=2024) - df = builder._query_targets({"stratum_ids": [1, 2]}) - self.assertEqual(len(df), 2) - snap_row = df[df["variable"] == "snap"].iloc[0] - self.assertEqual(snap_row["period"], 2024) - count_row = df[df["variable"] == "person_count"].iloc[0] - self.assertEqual(count_row["period"], 2022) - - # ---- Uprating info tests ---- - - def test_cpi_uprating_for_dollar_vars(self): - builder = self._make_builder(time_period=2024) - factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.01} - factor, type_ = builder._get_uprating_info("snap", 2022, factors) - self.assertAlmostEqual(factor, 1.06) - self.assertEqual(type_, "cpi") - - def test_pop_uprating_for_count_vars(self): - builder = self._make_builder(time_period=2024) - factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.01} - factor, type_ = builder._get_uprating_info( - "person_count", 2022, factors - ) - self.assertAlmostEqual(factor, 1.01) - self.assertEqual(type_, "pop") - - def test_no_uprating_for_current_period(self): - builder = self._make_builder(time_period=2024) - factors = {(2024, "cpi"): 1.0, (2024, "pop"): 1.0} - factor, type_ = builder._get_uprating_info("snap", 2024, factors) - self.assertAlmostEqual(factor, 1.0) - self.assertEqual(type_, "none") - - def test_pop_uprating_households_variable(self): - builder = 
self._make_builder(time_period=2024) - factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.02} - factor, type_ = builder._get_uprating_info("households", 2022, factors) - self.assertAlmostEqual(factor, 1.02) - self.assertEqual(type_, "pop") - - def test_pop_uprating_tax_units_variable(self): - builder = self._make_builder(time_period=2024) - factors = {(2022, "cpi"): 1.06, (2022, "pop"): 1.02} - factor, type_ = builder._get_uprating_info("tax_units", 2022, factors) - self.assertAlmostEqual(factor, 1.02) - self.assertEqual(type_, "pop") - - def test_missing_factor_defaults_to_1(self): - builder = self._make_builder(time_period=2024) - factors = {} - factor, type_ = builder._get_uprating_info("snap", 2020, factors) - self.assertAlmostEqual(factor, 1.0) - self.assertEqual(type_, "cpi") diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py b/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py deleted file mode 100644 index 065b99201..000000000 --- a/policyengine_us_data/tests/test_local_area_calibration/test_same_state.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Test same-state values match original simulation values.""" - -import pytest -import numpy as np -from collections import defaultdict - -from .conftest import VARIABLES_TO_TEST, N_VERIFICATION_SAMPLES - - -@pytest.mark.skip( - reason="Sparse matrix builder not used in production; test needs rework after time_period fix" -) -def test_same_state_matches_original( - sim, - X_sparse, - targets_df, - test_cds, - n_households, - household_ids, - household_states, -): - """ - Same-state non-zero cells must match ORIGINAL simulation values. - - When household stays in same state, X_sparse should contain the value - from the original simulation (ground truth from H5 dataset). - - Uses stratified sampling to ensure all variables in VARIABLES_TO_TEST - are covered with approximately equal samples per variable. 
- """ - seed = 42 - rng = np.random.default_rng(seed) - n_hh = n_households - hh_ids = household_ids - hh_states = household_states - - nonzero_rows, nonzero_cols = X_sparse.nonzero() - - # Group same-state cells by variable for stratified sampling - variable_to_indices = defaultdict(list) - variables_to_test = {v[0] for v in VARIABLES_TO_TEST} - - for i in range(len(nonzero_rows)): - row_idx = nonzero_rows[i] - col_idx = nonzero_cols[i] - cd_idx = col_idx // n_hh - hh_idx = col_idx % n_hh - cd = test_cds[cd_idx] - dest_state = int(cd) // 100 - orig_state = int(hh_states[hh_idx]) - - # Only include same-state cells - if dest_state != orig_state: - continue - - variable = targets_df.iloc[row_idx]["variable"] - if variable in variables_to_test: - variable_to_indices[variable].append(i) - - if not variable_to_indices: - pytest.skip("No same-state non-zero cells found for test variables") - - # Stratified sampling: sample proportionally from each variable - samples_per_var = max( - 1, N_VERIFICATION_SAMPLES // len(variable_to_indices) - ) - sample_indices = [] - - for variable, indices in variable_to_indices.items(): - n_to_sample = min(samples_per_var, len(indices)) - sampled = rng.choice(indices, n_to_sample, replace=False) - sample_indices.extend(sampled) - - # Cache original values per variable to avoid repeated calculations - original_values_cache = {} - - def get_original_values(variable): - if variable not in original_values_cache: - original_values_cache[variable] = sim.calculate( - variable, map_to="household" - ).values - return original_values_cache[variable] - - errors = [] - variables_tested = set() - - for idx in sample_indices: - row_idx = nonzero_rows[idx] - col_idx = nonzero_cols[idx] - cd_idx = col_idx // n_hh - hh_idx = col_idx % n_hh - variable = targets_df.iloc[row_idx]["variable"] - actual = float(X_sparse[row_idx, col_idx]) - - # Compare to ORIGINAL simulation values (ground truth) - original_values = get_original_values(variable) - expected = 
float(original_values[hh_idx]) - - variables_tested.add(variable) - - if not np.isclose(actual, expected, atol=0.5): - errors.append( - { - "hh_id": hh_ids[hh_idx], - "hh_idx": hh_idx, - "variable": variable, - "actual": actual, - "expected": expected, - "diff": actual - expected, - "rel_diff": ( - (actual - expected) / expected - if expected != 0 - else np.inf - ), - } - ) - - missing_vars = variables_to_test - variables_tested - if missing_vars: - print(f"Warning: No same-state cells found for: {missing_vars}") - - assert not errors, ( - f"Same-state verification failed: {len(errors)}/{len(sample_indices)} " - f"mismatches across {len(variables_tested)} variables. " - f"First 5: {errors[:5]}" - )