From 2dff473130bf4465bdefef5dda9f005e512e4b66 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 14 Mar 2026 07:38:02 -0700 Subject: [PATCH 1/2] fixup! QRF-impute CPS-only variables for PUF clone half (#589) --- .claude/worktrees/agent-a7222b9b | 1 + .../datasets/cps/extended_cps.py | 17 ++++++++-- .../tests/test_extended_cps.py | 33 +++++++++++++++++++ 3 files changed, 49 insertions(+), 2 deletions(-) create mode 160000 .claude/worktrees/agent-a7222b9b diff --git a/.claude/worktrees/agent-a7222b9b b/.claude/worktrees/agent-a7222b9b new file mode 160000 index 00000000..b783d980 --- /dev/null +++ b/.claude/worktrees/agent-a7222b9b @@ -0,0 +1 @@ +Subproject commit b783d980d093a037b3c1cb92634b70fba6fcbb9b diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index f38d5746..be058ae3 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -279,9 +279,12 @@ def reconcile_ss_subcomponents(predictions, total_ss): nonzero_rows = row_sums > 0 both = positive_mask & nonzero_rows shares[both] = values[both] / row_sums[both, np.newaxis] - # If row_sum == 0 but total_ss > 0, distribute equally. + # If row_sum == 0 but total_ss > 0, use SSA aggregate shares. equal_rows = positive_mask & ~nonzero_rows - shares[equal_rows] = 1.0 / values.shape[1] + ssa_totals = np.array( + [_SSA_DEFAULT_SHARES[c] for c in predictions.columns] + ) + shares[equal_rows] = ssa_totals / ssa_totals.sum() out = np.where( positive_mask[:, np.newaxis], @@ -306,6 +309,16 @@ def reconcile_ss_subcomponents(predictions, total_ss): "social_security_survivors", } +# SSA Fact Sheet aggregate totals (billions $). Used as fallback +# shares when QRF predicts all zeros for a record that has positive +# total social_security. Source: HARD_CODED_TOTALS in utils/loss.py. +_SSA_DEFAULT_SHARES = { + "social_security_retirement": 1_060e9, + "social_security_disability": 148e9, + "social_security_survivors": 160e9, + "social_security_dependents": 84e9, +} + def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" diff --git a/policyengine_us_data/tests/test_extended_cps.py b/policyengine_us_data/tests/test_extended_cps.py index 5ddf4692..636a45cd 100644 --- a/policyengine_us_data/tests/test_extended_cps.py +++ b/policyengine_us_data/tests/test_extended_cps.py @@ -18,6 +18,7 @@ from policyengine_us_data.datasets.cps.extended_cps import ( CPS_ONLY_IMPUTED_VARIABLES, CPS_STAGE2_INCOME_PREDICTORS, + _SSA_DEFAULT_SHARES, apply_retirement_constraints, reconcile_ss_subcomponents, ) @@ -250,6 +251,38 @@ def test_single_component_gets_full_total(self): 25000, abs=0.01 ) + def test_zero_predictions_use_ssa_shares(self): + """When QRF predicts all zeros, fallback should use SSA + aggregate proportions (not equal 1/4 shares).""" + cols = [ + "social_security_retirement", + "social_security_disability", + "social_security_dependents", + "social_security_survivors", + ] + predictions = pd.DataFrame( + {c: [0.0] for c in cols} + ) + total_ss = np.array([10000.0]) + result = reconcile_ss_subcomponents(predictions, total_ss) + + # Compute expected shares from the SSA totals dict. + ssa_totals = np.array([_SSA_DEFAULT_SHARES[c] for c in cols]) + expected_shares = ssa_totals / ssa_totals.sum() + + for i, col in enumerate(cols): + assert result[col].values[0] == pytest.approx( + 10000.0 * expected_shares[i], rel=1e-6 + ), f"{col} share mismatch" + + # Retirement should dominate (~73%), not be 25%. + ret_share = ( + result["social_security_retirement"].values[0] / 10000.0 + ) + assert ret_share > 0.70, ( + f"Retirement share {ret_share:.3f} should be > 0.70" + ) + class TestSequentialQRF: """Verify that sequential QRF produces correlated outputs.""" From 462aefd5bd0ec1974dfbbc3d1dce4676055fafb6 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 14 Mar 2026 07:38:23 -0700 Subject: [PATCH 2/2] Use SSA proportions for SS sub-component fallback shares Replace equal 1/4 fallback with SSA Fact Sheet aggregate proportions (retirement ~73%, disability ~10%, survivors ~11%, dependents ~6%) when QRF predicts all zeros for a record with positive total social_security. Co-Authored-By: Claude Opus 4.6 --- .../changed/improve-ss-subcomponent-shares.md | 1 + policyengine_us_data/datasets/cps/extended_cps.py | 4 +--- policyengine_us_data/tests/test_extended_cps.py | 12 +++--------- 3 files changed, 5 insertions(+), 12 deletions(-) create mode 100644 changelog.d/changed/improve-ss-subcomponent-shares.md diff --git a/changelog.d/changed/improve-ss-subcomponent-shares.md b/changelog.d/changed/improve-ss-subcomponent-shares.md new file mode 100644 index 00000000..1851394e --- /dev/null +++ b/changelog.d/changed/improve-ss-subcomponent-shares.md @@ -0,0 +1 @@ +Replace equal-share (1/4 each) fallback in SS sub-component reconciliation with SSA Fact Sheet proportions (~73% retirement, ~10% disability, ~11% survivors, ~6% dependents) for records where QRF predicts all zeros. diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index be058ae3..6213c336 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -281,9 +281,7 @@ def reconcile_ss_subcomponents(predictions, total_ss): shares[both] = values[both] / row_sums[both, np.newaxis] # If row_sum == 0 but total_ss > 0, use SSA aggregate shares. equal_rows = positive_mask & ~nonzero_rows - ssa_totals = np.array( - [_SSA_DEFAULT_SHARES[c] for c in predictions.columns] - ) + ssa_totals = np.array([_SSA_DEFAULT_SHARES[c] for c in predictions.columns]) shares[equal_rows] = ssa_totals / ssa_totals.sum() out = np.where( diff --git a/policyengine_us_data/tests/test_extended_cps.py b/policyengine_us_data/tests/test_extended_cps.py index 636a45cd..fcd45fb6 100644 --- a/policyengine_us_data/tests/test_extended_cps.py +++ b/policyengine_us_data/tests/test_extended_cps.py @@ -260,9 +260,7 @@ def test_zero_predictions_use_ssa_shares(self): "social_security_dependents", "social_security_survivors", ] - predictions = pd.DataFrame( - {c: [0.0] for c in cols} - ) + predictions = pd.DataFrame({c: [0.0] for c in cols}) total_ss = np.array([10000.0]) result = reconcile_ss_subcomponents(predictions, total_ss) @@ -276,12 +274,8 @@ def test_zero_predictions_use_ssa_shares(self): ), f"{col} share mismatch" # Retirement should dominate (~73%), not be 25%. - ret_share = ( - result["social_security_retirement"].values[0] / 10000.0 - ) - assert ret_share > 0.70, ( - f"Retirement share {ret_share:.3f} should be > 0.70" - ) + ret_share = result["social_security_retirement"].values[0] / 10000.0 + assert ret_share > 0.70, f"Retirement share {ret_share:.3f} should be > 0.70" class TestSequentialQRF: