diff --git a/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md
new file mode 100644
index 00000000..2bfa8149
--- /dev/null
+++ b/changelog.d/fix-ss-subcomponent-reconciliation.fixed.md
@@ -0,0 +1 @@
+Reconcile SS sub-components after PUF imputation so they sum to social_security.
diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py
index 81df6209..d5c73785 100644
--- a/policyengine_us_data/calibration/puf_impute.py
+++ b/policyengine_us_data/calibration/puf_impute.py
@@ -102,6 +102,13 @@
     "self_employment_income_would_be_qualified",
 ]
 
+SS_SUBCOMPONENTS = [
+    "social_security_retirement",
+    "social_security_disability",
+    "social_security_survivors",
+    "social_security_dependents",
+]
+
 OVERRIDDEN_IMPUTED_VARIABLES = [
     "partnership_s_corp_income",
     "interest_deduction",
@@ -155,6 +162,210 @@
 ]
 
 
+MINIMUM_RETIREMENT_AGE = 62
+
+SS_SPLIT_PREDICTORS = [
+    "age",
+    "is_male",
+    "tax_unit_is_joint",
+    "is_tax_unit_head",
+    "is_tax_unit_dependent",
+]
+
+MIN_QRF_TRAINING_RECORDS = 100
+
+
+def _qrf_ss_shares(
+    data: Dict[str, Dict[int, np.ndarray]],
+    n_cps: int,
+    time_period: int,
+    puf_has_ss: np.ndarray,
+) -> Optional[Dict[str, np.ndarray]]:
+    """Predict SS sub-component shares via QRF.
+
+    Trains on CPS records that have SS > 0 (where the reason-code
+    split is known), then predicts shares for all PUF records with
+    positive SS. The CPS-PUF link is statistical (not identity-based),
+    so the QRF gives a better expected prediction than using the
+    paired CPS record's split.
+
+    Args:
+        data: Dataset dict.
+        n_cps: Records in CPS half.
+        time_period: Tax year.
+        puf_has_ss: Boolean mask (length n_cps) — True where the
+            PUF half has positive social_security.
+
+    Returns:
+        Dict mapping sub-component name to predicted share arrays
+        (length = puf_has_ss.sum()), or None if training data is
+        insufficient.
+    """
+    from microimpute.models.qrf import QRF
+
+    cps_ss = data["social_security"][time_period][:n_cps]
+    has_ss = cps_ss > 0
+
+    if has_ss.sum() < MIN_QRF_TRAINING_RECORDS:
+        return None
+
+    # Build training features from available predictors.
+    predictors = []
+    train_cols = {}
+    test_cols = {}
+    for pred in SS_SPLIT_PREDICTORS:
+        if pred not in data:
+            continue
+        vals = data[pred][time_period][:n_cps]
+        train_cols[pred] = vals[has_ss].astype(np.float32)
+        test_cols[pred] = vals[puf_has_ss].astype(np.float32)
+        predictors.append(pred)
+
+    if not predictors:
+        return None
+
+    X_train = pd.DataFrame(train_cols)
+    X_test = pd.DataFrame(test_cols)
+
+    # Training targets: share going to each sub-component (0 or 1).
+    share_vars = []
+    with np.errstate(divide="ignore", invalid="ignore"):
+        for sub in SS_SUBCOMPONENTS:
+            if sub not in data:
+                continue
+            sub_vals = data[sub][time_period][:n_cps][has_ss]
+            share_name = sub + "_share"
+            X_train[share_name] = np.where(
+                cps_ss[has_ss] > 0,
+                sub_vals / cps_ss[has_ss],
+                0.0,
+            )
+            share_vars.append(share_name)
+
+    if not share_vars:
+        return None
+
+    qrf = QRF(log_level="WARNING", memory_efficient=True)
+    try:
+        fitted = qrf.fit(
+            X_train=X_train[predictors + share_vars],
+            predictors=predictors,
+            imputed_variables=share_vars,
+            n_jobs=1,
+        )
+        predictions = fitted.predict(X_test=X_test)
+    except Exception:
+        logger.warning("QRF SS split failed, falling back to heuristic")
+        return None
+
+    # Clip to [0, 1] and normalise so shares sum to 1.
+    shares = {}
+    total = np.zeros(len(X_test))
+    for sub in SS_SUBCOMPONENTS:
+        key = sub + "_share"
+        if key in predictions.columns:
+            s = np.clip(predictions[key].values, 0, 1)
+            shares[sub] = s
+            total += s
+
+    for sub in shares:
+        shares[sub] = np.where(total > 0, shares[sub] / total, 0.0)
+
+    del fitted, predictions
+    gc.collect()
+
+    logger.info(
+        "QRF SS split: predicted shares for %d PUF records",
+        puf_has_ss.sum(),
+    )
+    return shares
+
+
+def _age_heuristic_ss_shares(
+    data: Dict[str, Dict[int, np.ndarray]],
+    n_cps: int,
+    time_period: int,
+    puf_has_ss: np.ndarray,
+) -> Dict[str, np.ndarray]:
+    """Fallback: assign SS type by age threshold.
+
+    Age >= 62 -> retirement, < 62 -> disability.
+    If age is unavailable, all go to retirement.
+    """
+    n_pred = puf_has_ss.sum()
+    shares = {sub: np.zeros(n_pred) for sub in SS_SUBCOMPONENTS}
+
+    age = None
+    if "age" in data:
+        age = data["age"][time_period][:n_cps][puf_has_ss]
+
+    if age is not None:
+        is_old = age >= MINIMUM_RETIREMENT_AGE
+        if "social_security_retirement" in shares:
+            shares["social_security_retirement"] = is_old.astype(np.float64)
+        if "social_security_disability" in shares:
+            shares["social_security_disability"] = (~is_old).astype(np.float64)
+    else:
+        if "social_security_retirement" in shares:
+            shares["social_security_retirement"] = np.ones(n_pred)
+
+    return shares
+
+
+def reconcile_ss_subcomponents(
+    data: Dict[str, Dict[int, np.ndarray]],
+    n_cps: int,
+    time_period: int,
+) -> None:
+    """Predict SS sub-components for PUF half from demographics.
+
+    The CPS-PUF link is statistical (not identity-based), so the
+    paired CPS record's sub-component split is just one noisy draw.
+    A QRF trained on all CPS SS recipients gives a better expected
+    prediction by pooling across the full training set.
+
+    For all PUF records with positive social_security, this function
+    predicts shares via QRF (falling back to an age heuristic) and
+    scales them to match the imputed total. PUF records with zero
+    SS get all sub-components cleared to zero.
+
+    Modifies ``data`` in place. Only the PUF half (indices
+    n_cps .. 2*n_cps) is changed.
+
+    Args:
+        data: Dataset dict {variable: {time_period: array}}.
+        n_cps: Number of records in the CPS half.
+        time_period: Tax year key into data dicts.
+    """
+    if "social_security" not in data:
+        return
+
+    puf_ss = data["social_security"][time_period][n_cps:]
+    puf_has_ss = puf_ss > 0
+
+    # Predict shares for all PUF records with SS > 0.
+    shares = None
+    if puf_has_ss.any():
+        shares = _qrf_ss_shares(data, n_cps, time_period, puf_has_ss)
+        if shares is None:
+            shares = _age_heuristic_ss_shares(
+                data, n_cps, time_period, puf_has_ss
+            )
+
+    for sub in SS_SUBCOMPONENTS:
+        if sub not in data:
+            continue
+        arr = data[sub][time_period]
+
+        new_puf = np.zeros(n_cps)
+        if puf_has_ss.any() and shares is not None:
+            share = shares.get(sub, np.zeros(puf_has_ss.sum()))
+            new_puf[puf_has_ss] = puf_ss[puf_has_ss] * share
+
+        arr[n_cps:] = new_puf.astype(arr.dtype)
+        data[sub][time_period] = arr
+
+
 def puf_clone_dataset(
     data: Dict[str, Dict[int, np.ndarray]],
     state_fips: np.ndarray,
@@ -271,6 +482,9 @@ def _map_to_entity(pred_values, variable_name):
     if cps_sim is not None:
         del cps_sim
 
+    # Ensure SS sub-components match the (possibly imputed) total.
+    reconcile_ss_subcomponents(new_data, person_count, time_period)
+
     logger.info(
         "PUF clone complete: %d -> %d households",
         n_households,
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 4b427a39..ac8b6e45 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -50,8 +50,42 @@ def generate(self):
             dataset_path=str(self.cps.file_path),
         )
 
+        new_data = self._drop_formula_variables(new_data)
         self.save_dataset(new_data)
 
+    # Variables with formulas that must still be stored (e.g. IDs
+    # needed by the dataset loader before formulas can run).
+    _KEEP_FORMULA_VARS = {"person_id"}
+
+    @classmethod
+    def _drop_formula_variables(cls, data):
+        """Remove variables that are computed by policyengine-us.
+
+        Variables with formulas, ``adds``, or ``subtracts`` are
+        recomputed by the simulation engine, so storing them wastes
+        space and can mislead validation.
+        """
+        from policyengine_us import CountryTaxBenefitSystem
+
+        tbs = CountryTaxBenefitSystem()
+        formula_vars = {
+            name
+            for name, var in tbs.variables.items()
+            if (hasattr(var, "formulas") and len(var.formulas) > 0)
+            or getattr(var, "adds", None)
+            or getattr(var, "subtracts", None)
+        } - cls._KEEP_FORMULA_VARS
+        dropped = sorted(set(data.keys()) & formula_vars)
+        if dropped:
+            logger.info(
+                "Dropping %d formula variables: %s",
+                len(dropped),
+                dropped,
+            )
+            for var in dropped:
+                del data[var]
+        return data
+
 
 class ExtendedCPS_2024(ExtendedCPS):
     cps = CPS_2024_Full
diff --git a/policyengine_us_data/tests/test_puf_impute.py b/policyengine_us_data/tests/test_puf_impute.py
new file mode 100644
index 00000000..fcdcf763
--- /dev/null
+++ b/policyengine_us_data/tests/test_puf_impute.py
@@ -0,0 +1,366 @@
+"""Tests for PUF imputation, specifically SS sub-component reconciliation.
+
+When PUF imputation replaces social_security values, the sub-components
+(retirement, disability, survivors, dependents) must be predicted from
+demographics (not copied from the statistically-linked CPS record).
+See: https://github.com/PolicyEngine/policyengine-us-data/issues/551
+"""
+
+import numpy as np
+import pytest
+
+from policyengine_us_data.calibration.puf_impute import (
+    MINIMUM_RETIREMENT_AGE,
+    _age_heuristic_ss_shares,
+    _qrf_ss_shares,
+    reconcile_ss_subcomponents,
+)
+
+SS_SUBS = [
+    "social_security_retirement",
+    "social_security_disability",
+    "social_security_survivors",
+    "social_security_dependents",
+]
+
+
+def _make_data(
+    orig_ss,
+    orig_ret,
+    orig_dis,
+    orig_surv,
+    orig_dep,
+    imputed_ss,
+    age=None,
+    is_male=None,
+):
+    """Build a doubled data dict (CPS half + PUF half)."""
+    n = len(orig_ss)
+    tp = 2024
+    data = {
+        "social_security": {
+            tp: np.concatenate([orig_ss, imputed_ss]).astype(np.float32)
+        },
+        "social_security_retirement": {
+            tp: np.concatenate([orig_ret, orig_ret]).astype(np.float32)
+        },
+        "social_security_disability": {
+            tp: np.concatenate([orig_dis, orig_dis]).astype(np.float32)
+        },
+        "social_security_survivors": {
+            tp: np.concatenate([orig_surv, orig_surv]).astype(np.float32)
+        },
+        "social_security_dependents": {
+            tp: np.concatenate([orig_dep, orig_dep]).astype(np.float32)
+        },
+    }
+    if age is not None:
+        data["age"] = {tp: np.concatenate([age, age]).astype(np.float32)}
+    if is_male is not None:
+        data["is_male"] = {
+            tp: np.concatenate([is_male, is_male]).astype(np.float32)
+        }
+    return data, n, tp
+
+
+class TestReconcileSsSubcomponents:
+    """Sub-components must sum to social_security after reconciliation."""
+
+    def test_subs_sum_to_total(self):
+        """PUF subs must sum to imputed SS total."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([20000, 15000]),
+            orig_ret=np.array([14000, 0]),
+            orig_dis=np.array([6000, 15000]),
+            orig_surv=np.array([0, 0]),
+            orig_dep=np.array([0, 0]),
+            imputed_ss=np.array([30000, 10000]),
+            age=np.array([70, 45]),
+        )
+        reconcile_ss_subcomponents(data, n, tp)
+
+        puf = slice(n, 2 * n)
+        ss = data["social_security"][tp][puf]
+        total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS)
+        np.testing.assert_allclose(total_subs, ss, rtol=1e-5)
+
+    def test_age_determines_split(self):
+        """Age heuristic: >= 62 -> retirement, < 62 -> disability."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([20000, 15000]),
+            orig_ret=np.array([14000, 0]),
+            orig_dis=np.array([6000, 15000]),
+            orig_surv=np.array([0, 0]),
+            orig_dep=np.array([0, 0]),
+            imputed_ss=np.array([30000, 10000]),
+            age=np.array([70, 45]),
+        )
+        reconcile_ss_subcomponents(data, n, tp)
+
+        puf = slice(n, 2 * n)
+        ret = data["social_security_retirement"][tp][puf]
+        dis = data["social_security_disability"][tp][puf]
+        # Person 0 is 70 -> retirement
+        assert ret[0] == pytest.approx(30000)
+        assert dis[0] == pytest.approx(0)
+        # Person 1 is 45 -> disability
+        assert ret[1] == pytest.approx(0)
+        assert dis[1] == pytest.approx(10000)
+
+    def test_imputed_zero_clears_subs(self):
+        """When PUF imputes zero SS, all subs should be zero."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([20000]),
+            orig_ret=np.array([20000]),
+            orig_dis=np.array([0]),
+            orig_surv=np.array([0]),
+            orig_dep=np.array([0]),
+            imputed_ss=np.array([0]),
+        )
+        reconcile_ss_subcomponents(data, n, tp)
+
+        puf = slice(n, 2 * n)
+        for sub in SS_SUBS:
+            assert data[sub][tp][puf][0] == pytest.approx(0)
+
+    def test_cps_half_unchanged(self):
+        """CPS half should not be modified."""
+        orig_ret = np.array([14000, 0])
+        data, n, tp = _make_data(
+            orig_ss=np.array([20000, 15000]),
+            orig_ret=orig_ret,
+            orig_dis=np.array([6000, 15000]),
+            orig_surv=np.array([0, 0]),
+            orig_dep=np.array([0, 0]),
+            imputed_ss=np.array([30000, 10000]),
+        )
+        reconcile_ss_subcomponents(data, n, tp)
+
+        cps = slice(0, n)
+        np.testing.assert_array_equal(
+            data["social_security_retirement"][tp][cps],
+            orig_ret,
+        )
+
+    def test_missing_subcomponent_is_skipped(self):
+        """If a sub-component is absent from data, no error."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([10000]),
+            orig_ret=np.array([10000]),
+            orig_dis=np.array([0]),
+            orig_surv=np.array([0]),
+            orig_dep=np.array([0]),
+            imputed_ss=np.array([15000]),
+        )
+        del data["social_security_survivors"]
+        del data["social_security_dependents"]
+        reconcile_ss_subcomponents(data, n, tp)
+
+    def test_no_social_security_is_noop(self):
+        """If social_security is absent, function is a no-op."""
+        data = {"some_var": {2024: np.array([1, 2])}}
+        reconcile_ss_subcomponents(data, 1, 2024)
+
+    def test_subs_sum_to_total_mixed_cases(self):
+        """Comprehensive: subs sum to total across mixed cases."""
+        data, n, tp = _make_data(
+            # Person 0: has CPS SS, has PUF SS (old)
+            # Person 1: no CPS SS, has PUF SS (old)
+            # Person 2: no CPS SS, has PUF SS (young)
+            # Person 3: has CPS SS, PUF zeroed out
+            orig_ss=np.array([20000, 0, 0, 15000]),
+            orig_ret=np.array([20000, 0, 0, 15000]),
+            orig_dis=np.array([0, 0, 0, 0]),
+            orig_surv=np.array([0, 0, 0, 0]),
+            orig_dep=np.array([0, 0, 0, 0]),
+            imputed_ss=np.array([25000, 18000, 12000, 0]),
+            age=np.array([70, 68, 40, 75]),
+        )
+        reconcile_ss_subcomponents(data, n, tp)
+
+        puf = slice(n, 2 * n)
+        ss = data["social_security"][tp][puf]
+        total_subs = sum(data[sub][tp][puf] for sub in SS_SUBS)
+        np.testing.assert_allclose(total_subs, ss, rtol=1e-5)
+
+
+class TestAgeHeuristicSsShares:
+    """Age-based fallback for SS type classification."""
+
+    def test_elderly_gets_retirement(self):
+        """Age >= 62 -> retirement."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([0]),
+            orig_ret=np.array([0]),
+            orig_dis=np.array([0]),
+            orig_surv=np.array([0]),
+            orig_dep=np.array([0]),
+            imputed_ss=np.array([25000]),
+            age=np.array([70]),
+        )
+        puf_has_ss = np.array([True])
+        shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss)
+
+        assert shares["social_security_retirement"][0] == pytest.approx(1.0)
+        assert shares["social_security_disability"][0] == pytest.approx(0.0)
+
+    def test_young_gets_disability(self):
+        """Age < 62 -> disability."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([0]),
+            orig_ret=np.array([0]),
+            orig_dis=np.array([0]),
+            orig_surv=np.array([0]),
+            orig_dep=np.array([0]),
+            imputed_ss=np.array([18000]),
+            age=np.array([45]),
+        )
+        puf_has_ss = np.array([True])
+        shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss)
+
+        assert shares["social_security_retirement"][0] == pytest.approx(0.0)
+        assert shares["social_security_disability"][0] == pytest.approx(1.0)
+
+    def test_no_age_defaults_to_retirement(self):
+        """Without age data, default to retirement."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([0]),
+            orig_ret=np.array([0]),
+            orig_dis=np.array([0]),
+            orig_surv=np.array([0]),
+            orig_dep=np.array([0]),
+            imputed_ss=np.array([25000]),
+        )
+        puf_has_ss = np.array([True])
+        shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss)
+
+        assert shares["social_security_retirement"][0] == pytest.approx(1.0)
+
+    def test_mixed_ages(self):
+        """Multiple people split correctly by age threshold."""
+        data, n, tp = _make_data(
+            orig_ss=np.array([0, 0, 0]),
+            orig_ret=np.array([0, 0, 0]),
+            orig_dis=np.array([0, 0, 0]),
+            orig_surv=np.array([0, 0, 0]),
+            orig_dep=np.array([0, 0, 0]),
+            imputed_ss=np.array([1, 1, 1]),
+            age=np.array([30, 62, 80]),
+        )
+        puf_has_ss = np.array([True, True, True])
+        shares = _age_heuristic_ss_shares(data, n, tp, puf_has_ss)
+
+        # age 30 -> disability
+        assert shares["social_security_retirement"][0] == 0
+        assert shares["social_security_disability"][0] == 1
+        # age 62 -> retirement (>= threshold)
+        assert shares["social_security_retirement"][1] == 1
+        assert shares["social_security_disability"][1] == 0
+        # age 80 -> retirement
+        assert shares["social_security_retirement"][2] == 1
+
+
+class TestQrfSsShares:
+    """QRF-based SS sub-component share prediction."""
+
+    def test_returns_none_with_few_training_records(self):
+        """Returns None when < MIN_QRF_TRAINING_RECORDS have SS."""
+        # Only 2 training records with SS > 0.
+        data, n, tp = _make_data(
+            orig_ss=np.array([10000, 5000, 0]),
+            orig_ret=np.array([10000, 0, 0]),
+            orig_dis=np.array([0, 5000, 0]),
+            orig_surv=np.array([0, 0, 0]),
+            orig_dep=np.array([0, 0, 0]),
+            imputed_ss=np.array([0, 0, 20000]),
+            age=np.array([70, 45, 55]),
+        )
+        puf_has_ss = np.array([False, False, True])
+        result = _qrf_ss_shares(data, n, tp, puf_has_ss)
+        assert result is None
+
+    def test_shares_sum_to_one(self):
+        """QRF-predicted shares should sum to ~1 after normalisation."""
+        pytest.importorskip("microimpute")
+        rng = np.random.default_rng(42)
+        n = 500
+
+        # Synthetic training: age > 62 mostly retirement,
+        # age < 62 mostly disability.
+        ages = rng.integers(20, 90, size=n).astype(np.float32)
+        is_male = rng.integers(0, 2, size=n).astype(np.float32)
+        ss_vals = rng.uniform(5000, 40000, size=n).astype(np.float32)
+        ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32)
+        dis = np.where(ages < 62, ss_vals, 0).astype(np.float32)
+        surv = np.zeros(n, dtype=np.float32)
+        dep = np.zeros(n, dtype=np.float32)
+
+        # PUF SS for all records.
+        puf_ss = rng.uniform(5000, 40000, size=n).astype(np.float32)
+
+        data, nn, tp = _make_data(
+            orig_ss=ss_vals,
+            orig_ret=ret,
+            orig_dis=dis,
+            orig_surv=surv,
+            orig_dep=dep,
+            imputed_ss=puf_ss,
+            age=ages,
+            is_male=is_male,
+        )
+
+        puf_has_ss = puf_ss > 0
+        shares = _qrf_ss_shares(data, nn, tp, puf_has_ss)
+
+        assert shares is not None
+        total = sum(shares.get(sub, 0) for sub in SS_SUBS)
+        np.testing.assert_allclose(total, 1.0, atol=0.01)
+
+    def test_elderly_predicted_as_retirement(self):
+        """QRF should mostly predict retirement for age >= 62."""
+        pytest.importorskip("microimpute")
+        rng = np.random.default_rng(123)
+        n_train = 500
+
+        ages = rng.integers(20, 90, size=n_train).astype(np.float32)
+        ss_vals = rng.uniform(5000, 40000, size=n_train).astype(np.float32)
+        ret = np.where(ages >= 62, ss_vals, 0).astype(np.float32)
+        dis = np.where(ages < 62, ss_vals, 0).astype(np.float32)
+        surv = np.zeros(n_train, dtype=np.float32)
+        dep = np.zeros(n_train, dtype=np.float32)
+
+        # Only elderly records get PUF SS > 0; training records
+        # get PUF SS = 0 so puf_has_ss selects just the elderly.
+        n_pred = 10
+        pred_ages = np.full(n_pred, 70, dtype=np.float32)
+
+        all_ages = np.concatenate([ages, pred_ages])
+        all_ss = np.concatenate([ss_vals, np.zeros(n_pred)])
+        all_ret = np.concatenate([ret, np.zeros(n_pred)])
+        all_dis = np.concatenate([dis, np.zeros(n_pred)])
+        all_surv = np.concatenate([surv, np.zeros(n_pred)])
+        all_dep = np.concatenate([dep, np.zeros(n_pred)])
+        puf_ss = np.concatenate(
+            [
+                np.zeros(n_train, dtype=np.float32),
+                np.full(n_pred, 20000, dtype=np.float32),
+            ]
+        )
+
+        data, nn, tp = _make_data(
+            orig_ss=all_ss,
+            orig_ret=all_ret,
+            orig_dis=all_dis,
+            orig_surv=all_surv,
+            orig_dep=all_dep,
+            imputed_ss=puf_ss,
+            age=all_ages,
+        )
+
+        puf_has_ss = data["social_security"][tp][nn:] > 0
+        shares = _qrf_ss_shares(data, nn, tp, puf_has_ss)
+
+        assert shares is not None
+        ret_share = shares["social_security_retirement"]
+        # All elderly -> retirement share should dominate.
+        assert np.mean(ret_share) > 0.7
diff --git a/tests/test_no_formula_variables_stored.py b/tests/test_no_formula_variables_stored.py
new file mode 100644
index 00000000..9334a5c7
--- /dev/null
+++ b/tests/test_no_formula_variables_stored.py
@@ -0,0 +1,183 @@
+"""Test dataset variable integrity.
+
+1. Variables with formulas in policyengine-us will be recomputed by the
+   simulation engine, ignoring any stored value. Storing them wastes
+   space and can mislead during validation.
+
+2. Stored input values should match what the simulation actually uses,
+   to catch issues like set_input_divide_by_period silently altering
+   values, or sub-components not summing to their parent total.
+"""
+
+import h5py
+import numpy as np
+import pytest
+from policyengine_us_data.datasets.cps.extended_cps import ExtendedCPS_2024
+
+KNOWN_FORMULA_EXCEPTIONS = {
+    # person_id is stored for identity tracking even though it has a
+    # trivial formula (arange). Safe to keep.
+    "person_id",
+}
+
+
+@pytest.fixture(scope="module")
+def tax_benefit_system():
+    from policyengine_us import CountryTaxBenefitSystem
+
+    return CountryTaxBenefitSystem()
+
+
+@pytest.fixture(scope="module")
+def formula_variables(tax_benefit_system):
+    """Return set of variable names computed by policyengine-us.
+
+    Includes variables with explicit formulas as well as those using
+    ``adds`` or ``subtracts`` (which the engine auto-sums at runtime).
+    """
+    return {
+        name
+        for name, var in tax_benefit_system.variables.items()
+        if (hasattr(var, "formulas") and len(var.formulas) > 0)
+        or getattr(var, "adds", None)
+        or getattr(var, "subtracts", None)
+    }
+
+
+@pytest.fixture(scope="module")
+def dataset_path():
+    path = ExtendedCPS_2024.file_path
+    if not path.exists():
+        pytest.skip("Extended CPS 2024 not built locally")
+    return path
+
+
+@pytest.fixture(scope="module")
+def stored_variables(dataset_path):
+    """Return set of variable names stored in the extended CPS."""
+    with h5py.File(dataset_path, "r") as f:
+        return set(f.keys())
+
+
+@pytest.fixture(scope="module")
+def sim():
+    from policyengine_us import Microsimulation
+
+    return Microsimulation(dataset=ExtendedCPS_2024)
+
+
+def test_no_formula_variables_stored(formula_variables, stored_variables):
+    """Computed variables should not be stored in the dataset."""
+    overlap = (stored_variables & formula_variables) - KNOWN_FORMULA_EXCEPTIONS
+    if overlap:
+        msg = (
+            f"These {len(overlap)} variables are computed by "
+            f"policyengine-us (formulas/adds/subtracts) but are "
+            f"stored in the dataset "
+            f"(stored values will be IGNORED by the simulation):\n"
+        )
+        for v in sorted(overlap):
+            msg += f"  - {v}\n"
+        pytest.fail(msg)
+
+
+def test_stored_values_match_computed(
+    sim, stored_variables, formula_variables, dataset_path
+):
+    """For input-only variables, stored values should match what the
+    simulation returns (catches set_input issues, rounding, etc.)."""
+    input_vars = stored_variables - formula_variables
+    mismatches = []
+
+    with h5py.File(dataset_path, "r") as f:
+        for var_name in sorted(input_vars):
+            if var_name not in f or "2024" not in f[var_name]:
+                continue
+            stored = f[var_name]["2024"][...]
+            if stored.dtype.kind not in ("f", "i", "u"):
+                continue
+
+            stored_f = stored.astype(float)
+            stored_total = np.sum(stored_f)
+            if abs(stored_total) < 1:
+                continue
+
+            try:
+                computed = np.array(sim.calculate(var_name, 2024))
+            except Exception:
+                continue
+
+            computed_total = np.sum(computed.astype(float))
+            if abs(stored_total) > 0:
+                pct_diff = (
+                    abs(stored_total - computed_total)
+                    / abs(stored_total)
+                    * 100
+                )
+            else:
+                pct_diff = 0
+
+            if pct_diff > 1:
+                mismatches.append(
+                    f"  {var_name}: stored=${stored_total:,.0f}, "
+                    f"computed=${computed_total:,.0f}, "
+                    f"diff={pct_diff:.1f}%"
+                )
+
+    if mismatches:
+        msg = (
+            f"These {len(mismatches)} input variables have >1% "
+            f"difference between stored and computed values:\n"
+        )
+        msg += "\n".join(mismatches)
+        pytest.fail(msg)
+
+
+def test_ss_subcomponents_sum_to_computed_total(sim, dataset_path):
+    """Social Security sub-components should sum to the computed total.
+
+    ``social_security`` is computed via ``adds`` in policyengine-us and
+    is NOT stored in the dataset.  We verify that the sub-components
+    stored in the dataset sum to the simulation's computed total.
+    """
+    with h5py.File(dataset_path, "r") as f:
+        ss_retirement = f["social_security_retirement"]["2024"][...].astype(
+            float
+        )
+        ss_disability = f["social_security_disability"]["2024"][...].astype(
+            float
+        )
+        ss_survivors = f["social_security_survivors"]["2024"][...].astype(
+            float
+        )
+        ss_dependents = f["social_security_dependents"]["2024"][...].astype(
+            float
+        )
+
+    sub_sum = ss_retirement + ss_disability + ss_survivors + ss_dependents
+    computed_total = np.array(sim.calculate("social_security", 2024)).astype(
+        float
+    )
+
+    # Only check records that have any SS income
+    has_ss = computed_total > 0
+    if not np.any(has_ss):
+        pytest.skip("No SS recipients in dataset")
+
+    total_computed = np.sum(computed_total[has_ss])
+    total_sub = np.sum(sub_sum[has_ss])
+    pct_diff = abs(total_computed - total_sub) / total_computed * 100
+
+    assert pct_diff < 1, (
+        f"SS sub-components don't sum to computed total: "
+        f"computed=${total_computed:,.0f}, sub_sum=${total_sub:,.0f}, "
+        f"diff={pct_diff:.1f}%"
+    )
+
+    # Per-person check: no person's sub-components should exceed total
+    excess = sub_sum[has_ss] - computed_total[has_ss]
+    n_excess = np.sum(excess > 1)
+    assert n_excess == 0, (
+        f"{n_excess} people have SS sub-components exceeding "
+        f"their computed total SS income"
+    )
diff --git a/uv.lock b/uv.lock
index fc61e96d..09a004ac 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1842,7 +1842,7 @@ wheels = [
 
 [[package]]
 name = "policyengine-us"
-version = "1.570.7"
+version = "1.584.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "microdf-python" },
@@ -1850,9 +1850,9 @@ dependencies = [
     { name = "policyengine-core" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/6a/eb/291b3085aa0fa97fcce4987d54991118f21aead49647b3f475998459f46b/policyengine_us-1.570.7.tar.gz", hash = "sha256:a2967af86a61468a0bdb6b2dc7af2fd0bb0f0064203fa557b6fee8023058360a", size = 8668680, upload-time = "2026-02-19T07:17:11.264Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ef/29/a4517eac6dedc1d6c1a17b0d105b9aa41ca41b877e6ecef39362ecd0438c/policyengine_us-1.584.0.tar.gz", hash = "sha256:9bb830765ece943b11939822da94d64a26c802c941b8f929a354354394a2ef7e", size = 8701324, upload-time = "2026-02-24T15:05:04.474Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/38/36/0213955310076e4dec2781baeabf96b6d6937f99cd19c373b363bfbd7152/policyengine_us-1.570.7-py3-none-any.whl", hash = "sha256:374fd5357d6cb3734b900bd08dfdb61760dfc5b913ed686f57a40239565b0edd", size = 7825404, upload-time = "2026-02-19T07:17:08.877Z" },
+    { url = "https://files.pythonhosted.org/packages/17/f4/b8224384de676662858a079bc3138141a072ad886266e1e2d28a3108ad9d/policyengine_us-1.584.0-py3-none-any.whl", hash = "sha256:55c97c9e63c8ba18b895b74076d893de720cd44089398a62f0d02cc7afc8d53c", size = 7933593, upload-time = "2026-02-24T15:05:01.846Z" },
 ]
 
 [[package]]