From 89621f65febff850d3a826494b2ab96ee31ac827 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 9 Mar 2026 12:15:22 -0400 Subject: [PATCH 1/5] Replace batched QRF with sequential fit_predict() from microimpute The old _batch_qrf() split 85+ PUF income variables into batches of 10, each with a fresh QRF model. This destroyed covariance between variables in different batches (e.g. employment_income and long_term_capital_gains). Now uses microimpute's fit_predict() which runs a single sequential QRF: each variable is conditioned on all previously imputed variables, preserving the full joint distribution. Also replaces manual sampling and gc cleanup in weeks_unemployed and retirement imputation with max_train_samples parameter. Requires microimpute>=1.15.1 (fit_predict + max_train_samples). Co-Authored-By: Claude Opus 4.6 --- .../microimpute-fit-predict.changed.md | 1 + .../calibration/puf_impute.py | 171 +++++++++--------- .../datasets/cps/extended_cps.py | 18 +- pyproject.toml | 2 +- 4 files changed, 98 insertions(+), 94 deletions(-) create mode 100644 changelog.d/microimpute-fit-predict.changed.md diff --git a/changelog.d/microimpute-fit-predict.changed.md b/changelog.d/microimpute-fit-predict.changed.md new file mode 100644 index 00000000..2dbf75bc --- /dev/null +++ b/changelog.d/microimpute-fit-predict.changed.md @@ -0,0 +1 @@ +Replaced batched QRF imputation with single sequential QRF via microimpute's fit_predict() API, preserving full covariance across all 85+ PUF income variables. diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index bf835583..dfdada5f 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -194,7 +194,9 @@ "social_security", ] -RETIREMENT_PREDICTORS = RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS +RETIREMENT_PREDICTORS = ( + RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS +) def _get_retirement_limits(year: int) -> dict: @@ -409,7 +411,9 @@ def reconcile_ss_subcomponents( if puf_has_ss.any(): shares = _qrf_ss_shares(data, n_cps, time_period, puf_has_ss) if shares is None: - shares = _age_heuristic_ss_shares(data, n_cps, time_period, puf_has_ss) + shares = _age_heuristic_ss_shares( + data, n_cps, time_period, puf_has_ss + ) for sub in SS_SUBCOMPONENTS: if sub not in data: @@ -488,13 +492,17 @@ def _map_to_entity(pred_values, variable_name): return pred_values entity = var_meta.entity.key if entity != "person": - return cps_sim.populations[entity].value_from_first_person(pred_values) + return cps_sim.populations[entity].value_from_first_person( + pred_values + ) return pred_values # Impute weeks_unemployed for PUF half puf_weeks = None if y_full is not None and dataset_path is not None: - puf_weeks = _impute_weeks_unemployed(data, y_full, time_period, dataset_path) + puf_weeks = _impute_weeks_unemployed( + data, y_full, time_period, dataset_path + ) # Impute retirement contributions for PUF half puf_retirement = None @@ -518,14 +526,24 @@ def _map_to_entity(pred_values, variable_name): time_period: np.concatenate([values, values + values.max()]) } elif "_weight" in variable: - new_data[variable] = {time_period: np.concatenate([values, values * 0])} + new_data[variable] = { + time_period: np.concatenate([values, values * 0]) + } elif variable == "weeks_unemployed" and puf_weeks is not None: - new_data[variable] = {time_period: np.concatenate([values, puf_weeks])} - elif variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None: + new_data[variable] = { + time_period: np.concatenate([values, puf_weeks]) + } + elif ( + variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None + ): puf_vals = puf_retirement[variable] - new_data[variable] = {time_period: np.concatenate([values, puf_vals])} + new_data[variable] = { + time_period: np.concatenate([values, puf_vals]) + } else: - new_data[variable] = {time_period: np.concatenate([values, values])} + new_data[variable] = { + time_period: np.concatenate([values, values]) + } new_data["state_fips"] = { time_period: np.concatenate([state_fips, state_fips]).astype(np.int32) @@ -613,24 +631,18 @@ def _impute_weeks_unemployed( del cps_sim - qrf = QRF(log_level="INFO", memory_efficient=True) - # Subsample to 5000 for QRF training speed: CPS has ~200K person - # records; QRF fitting is O(n log n) per tree, so 5K keeps - # training under ~30s while retaining adequate distributional - # coverage. Empirical testing showed diminishing accuracy gains - # beyond ~5K–10K records for these predictors. - if len(X_train) > 5000: - X_train_sampled = X_train.sample(n=5000, random_state=42) - else: - X_train_sampled = X_train - - fitted = qrf.fit( - X_train=X_train_sampled, + qrf = QRF( + log_level="INFO", + memory_efficient=True, + max_train_samples=5000, + ) + predictions = qrf.fit_predict( + X_train=X_train, + X_test=X_test, predictors=WEEKS_PREDICTORS, imputed_variables=["weeks_unemployed"], n_jobs=1, ) - predictions = fitted.predict(X_test=X_test) imputed_weeks = predictions["weeks_unemployed"].values imputed_weeks = np.clip(imputed_weeks, 0, 52) @@ -644,11 +656,13 @@ def _impute_weeks_unemployed( logger.info( "Imputed weeks_unemployed for PUF: %d with weeks > 0, mean = %.1f", (imputed_weeks > 0).sum(), - (imputed_weeks[imputed_weeks > 0].mean() if (imputed_weeks > 0).any() else 0), + ( + imputed_weeks[imputed_weeks > 0].mean() + if (imputed_weeks > 0).any() + else 0 + ), ) - del fitted, predictions - gc.collect() return imputed_weeks @@ -706,23 +720,19 @@ def _impute_retirement_contributions( del cps_sim - # Subsample to 5000 for speed (see comment in - # _impute_weeks_unemployed for rationale). - if len(X_train) > 5000: - X_train_sampled = X_train.sample(n=5000, random_state=42) - else: - X_train_sampled = X_train - - # Train QRF - qrf = QRF(log_level="INFO", memory_efficient=True) + qrf = QRF( + log_level="INFO", + memory_efficient=True, + max_train_samples=5000, + ) try: - fitted = qrf.fit( - X_train=X_train_sampled, + predictions = qrf.fit_predict( + X_train=X_train, + X_test=X_test, predictors=RETIREMENT_PREDICTORS, imputed_variables=CPS_RETIREMENT_VARIABLES, n_jobs=1, ) - predictions = fitted.predict(X_test=X_test) except Exception: logger.warning( "QRF retirement imputation failed, returning zeros", @@ -779,8 +789,6 @@ def _impute_retirement_contributions( result["self_employed_pension_contributions"].mean(), ) - del fitted, predictions - gc.collect() return result @@ -814,7 +822,9 @@ def _run_qrf_imputation( puf_sim = Microsimulation(dataset=puf_dataset) - puf_agi = puf_sim.calculate("adjusted_gross_income", map_to="person").values + puf_agi = puf_sim.calculate( + "adjusted_gross_income", map_to="person" + ).values X_train_full = puf_sim.calculate_dataframe( DEMOGRAPHIC_PREDICTORS + IMPUTED_VARIABLES @@ -849,13 +859,15 @@ def _run_qrf_imputation( X_test[pred] = data[pred][time_period].astype(np.float32) logger.info("Imputing %d PUF variables (full)", len(IMPUTED_VARIABLES)) - y_full = _batch_qrf(X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES) + y_full = _sequential_qrf( + X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES + ) logger.info( "Imputing %d PUF variables (override)", len(OVERRIDDEN_IMPUTED_VARIABLES), ) - y_override = _batch_qrf( + y_override = _sequential_qrf( X_train_override, X_test, DEMOGRAPHIC_PREDICTORS, @@ -889,77 +901,56 @@ def _stratified_subsample_index( if remaining_quota >= len(bottom_idx): selected_bottom = bottom_idx else: - selected_bottom = rng.choice(bottom_idx, size=remaining_quota, replace=False) + selected_bottom = rng.choice( + bottom_idx, size=remaining_quota, replace=False + ) selected = np.concatenate([top_idx, selected_bottom]) selected.sort() return selected -def _batch_qrf( +def _sequential_qrf( X_train: pd.DataFrame, X_test: pd.DataFrame, predictors: List[str], output_vars: List[str], - batch_size: int = 10, ) -> Dict[str, np.ndarray]: - """Run QRF in batches to control memory. + """Run a single sequential QRF preserving covariance. + + Uses microimpute's fit_predict() which handles missing variable + detection, gc cleanup, and zero-fill internally. Each variable + is conditioned on all previously imputed variables, preserving + the full joint distribution. Args: X_train: Training data with predictors + output vars. X_test: Test data with predictors only. predictors: Predictor column names. output_vars: Output variable names to impute. - batch_size: Variables per batch. Returns: Dict mapping variable name to imputed values. """ from microimpute.models.qrf import QRF - available = [c for c in output_vars if c in X_train.columns] - missing = [c for c in output_vars if c not in X_train.columns] + qrf = QRF( + log_level="INFO", + memory_efficient=True, + ) + predictions = qrf.fit_predict( + X_train=X_train, + X_test=X_test, + predictors=predictors, + imputed_variables=output_vars, + n_jobs=1, + ) + result = {var: predictions[var].values for var in predictions.columns} + missing = set(output_vars) - set(result) if missing: - logger.warning( - "%d variables missing from training: %s", - len(missing), - missing[:5], - ) - - result = {} - - for batch_start in range(0, len(available), batch_size): - batch_vars = available[batch_start : batch_start + batch_size] - - gc.collect() - - qrf = QRF( - log_level="INFO", - memory_efficient=True, - batch_size=10, - cleanup_interval=5, + raise ValueError( + f"{len(missing)} variables requested but not returned " + f"by fit_predict(): {sorted(missing)[:10]}" ) - - batch_X_train = X_train[predictors + batch_vars].copy() - - fitted = qrf.fit( - X_train=batch_X_train, - predictors=predictors, - imputed_variables=batch_vars, - n_jobs=1, - ) - - predictions = fitted.predict(X_test=X_test) - - for var in batch_vars: - result[var] = predictions[var].values - - del fitted, predictions, batch_X_train - gc.collect() - - n_test = len(X_test) - for var in missing: - result[var] = np.zeros(n_test) - return result diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 4d24810f..0fb80c64 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -53,9 +53,21 @@ def generate(self): new_data = self._drop_formula_variables(new_data) self.save_dataset(new_data) - # Variables with formulas that must still be stored (e.g. IDs - # needed by the dataset loader before formulas can run). - _KEEP_FORMULA_VARS = {"person_id"} + # Variables with formulas that must still be stored. + # Includes person_id (needed by dataset loader) and PUF-imputed + # variables whose sub-components aren't separately stored — + # dropping these would lose the QRF imputation with no way to + # recompute them. + _KEEP_FORMULA_VARS = { + "person_id", + # PUF-imputed adds vars without stored sub-components: + "taxable_pension_income", + "interest_deduction", + "tax_exempt_pension_income", + "pre_tax_contributions", + "self_employed_pension_contribution_ald", + "self_employed_health_insurance_ald", + } @classmethod def _drop_formula_variables(cls, data): diff --git a/pyproject.toml b/pyproject.toml index 2085d93c..628315f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "tqdm>=4.60.0", "microdf_python>=1.2.1", "setuptools>=60", - "microimpute>=1.1.4", + "microimpute>=1.15.1", "pip-system-certs>=3.0", "google-cloud-storage>=2.0.0", "google-auth>=2.0.0", From 7f5458f9f1124f523b69f176f3606e62cf155c0d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 9 Mar 2026 13:34:42 -0400 Subject: [PATCH 2/5] Rename imputed formula vars to leaf inputs before drop Instead of whitelisting formula vars in _KEEP_FORMULA_VARS, rename them to their leaf input equivalents so _drop_formula_variables works correctly and the engine can recompute aggregates from formulas: - taxable_pension_income -> taxable_private_pension_income - tax_exempt_pension_income -> tax_exempt_private_pension_income - interest_deduction -> deductible_mortgage_interest - self_employed_pension_contribution_ald -> _person - self_employed_health_insurance_ald -> _person Co-Authored-By: Claude Opus 4.6 --- .../datasets/cps/extended_cps.py | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 0fb80c64..5844c91b 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -50,23 +50,46 @@ def generate(self): dataset_path=str(self.cps.file_path), ) + new_data = self._rename_imputed_to_inputs(new_data) new_data = self._drop_formula_variables(new_data) self.save_dataset(new_data) - # Variables with formulas that must still be stored. - # Includes person_id (needed by dataset loader) and PUF-imputed - # variables whose sub-components aren't separately stored — - # dropping these would lose the QRF imputation with no way to - # recompute them. - _KEEP_FORMULA_VARS = { - "person_id", - # PUF-imputed adds vars without stored sub-components: - "taxable_pension_income", - "interest_deduction", - "tax_exempt_pension_income", - "pre_tax_contributions", - "self_employed_pension_contribution_ald", - "self_employed_health_insurance_ald", + @classmethod + def _rename_imputed_to_inputs(cls, data): + """Rename QRF-imputed formula vars to their leaf inputs. + + The QRF imputes formula-level aggregates (e.g. + taxable_pension_income) but the engine needs leaf inputs + (e.g. taxable_private_pension_income) so formulas work. + """ + for formula_var, input_var in cls._IMPUTED_TO_INPUT.items(): + if formula_var in data and input_var not in data: + logger.info( + "Renaming %s -> %s (leaf input)", + formula_var, + input_var, + ) + data[input_var] = data.pop(formula_var) + return data + + # Variables with formulas that must still be stored (e.g. IDs + # needed by the dataset loader before formulas can run). + _KEEP_FORMULA_VARS = {"person_id"} + + # QRF imputes formula-level variables (e.g. taxable_pension_income) + # but we must store them under leaf input names so + # _drop_formula_variables doesn't discard them. The engine then + # recomputes the formula var from its adds. + _IMPUTED_TO_INPUT = { + "taxable_pension_income": "taxable_private_pension_income", + "tax_exempt_pension_income": "tax_exempt_private_pension_income", + "interest_deduction": "deductible_mortgage_interest", + "self_employed_pension_contribution_ald": ( + "self_employed_pension_contribution_ald_person" + ), + "self_employed_health_insurance_ald": ( + "self_employed_health_insurance_ald_person" + ), } @classmethod From 6da97ac35e4de8939881172588d008983e56ecc2 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 9 Mar 2026 10:36:24 -0400 Subject: [PATCH 3/5] Fix weekly hours worked deflation bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HRSWK and A_HRS1 are already weekly measures from the CPS — they don't need to be multiplied by WKSWORK/52. The old code deflated part-year workers' weekly hours (e.g. someone working 40 hrs/wk for 26 weeks showed as 20 hrs/wk). Fixes part of #561 Co-Authored-By: Claude Opus 4.6 --- policyengine_us_data/datasets/cps/cps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 6775fa16..418d7396 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -501,8 +501,8 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): # Assign CPS variables. cps["employment_income"] = person.WSAL_VAL - cps["weekly_hours_worked"] = person.HRSWK * person.WKSWORK / 52 - cps["hours_worked_last_week"] = person.A_HRS1 * person.WKSWORK / 52 + cps["weekly_hours_worked"] = person.HRSWK + cps["hours_worked_last_week"] = person.A_HRS1 cps["taxable_interest_income"] = person.INT_VAL * (p["taxable_interest_fraction"]) cps["tax_exempt_interest_income"] = person.INT_VAL * ( From b687cd27605787760dd63bc0e0dcb676f8131be7 Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 9 Mar 2026 11:40:22 -0400 Subject: [PATCH 4/5] Fix local area publish workflow and disable auto-trigger The modal run command failed because local_area.py now has two local entrypoints (main and main_promote), requiring explicit ::main disambiguation. Also temporarily disables push and repository_dispatch triggers to prevent blocking other workflows. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/local_area_publish.yaml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/local_area_publish.yaml b/.github/workflows/local_area_publish.yaml index 44675e63..a7fcae7f 100644 --- a/.github/workflows/local_area_publish.yaml +++ b/.github/workflows/local_area_publish.yaml @@ -1,14 +1,15 @@ name: Publish Local Area H5 Files on: - push: - branches: [main] - paths: - - 'policyengine_us_data/datasets/cps/local_area_calibration/**' - - '.github/workflows/local_area_publish.yaml' - - 'modal_app/**' - repository_dispatch: - types: [calibration-updated] + # TEMPORARILY DISABLED - re-enable push/repository_dispatch triggers when ready + # push: + # branches: [main] + # paths: + # - 'policyengine_us_data/datasets/cps/local_area_calibration/**' + # - '.github/workflows/local_area_publish.yaml' + # - 'modal_app/**' + # repository_dispatch: + # types: [calibration-updated] workflow_dispatch: inputs: num_workers: @@ -55,7 +56,7 @@ jobs: SKIP_UPLOAD="${{ github.event.inputs.skip_upload || 'false' }}" BRANCH="${{ github.head_ref || github.ref_name }}" - CMD="modal run modal_app/local_area.py --branch=${BRANCH} --num-workers=${NUM_WORKERS}" + CMD="modal run modal_app/local_area.py::main --branch=${BRANCH} --num-workers=${NUM_WORKERS}" if [ "$SKIP_UPLOAD" = "true" ]; then CMD="${CMD} --skip-upload" From 979c3820c417b4fe734b33159611b1ca0fa2981f Mon Sep 17 00:00:00 2001 From: "baogorek@gmail.com" Date: Mon, 9 Mar 2026 13:59:00 -0400 Subject: [PATCH 5/5] Update uv.lock and run ruff format Co-Authored-By: Claude Opus 4.6 --- .../calibration/puf_impute.py | 50 +++++-------------- .../datasets/cps/extended_cps.py | 24 +++++---- uv.lock | 10 ++-- 3 files changed, 31 insertions(+), 53 deletions(-) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index dfdada5f..445bd758 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -194,9 +194,7 @@ "social_security", ] -RETIREMENT_PREDICTORS = ( - RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS -) +RETIREMENT_PREDICTORS = RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS def _get_retirement_limits(year: int) -> dict: @@ -411,9 +409,7 @@ def reconcile_ss_subcomponents( if puf_has_ss.any(): shares = _qrf_ss_shares(data, n_cps, time_period, puf_has_ss) if shares is None: - shares = _age_heuristic_ss_shares( - data, n_cps, time_period, puf_has_ss - ) + shares = _age_heuristic_ss_shares(data, n_cps, time_period, puf_has_ss) for sub in SS_SUBCOMPONENTS: if sub not in data: @@ -492,17 +488,13 @@ def _map_to_entity(pred_values, variable_name): return pred_values entity = var_meta.entity.key if entity != "person": - return cps_sim.populations[entity].value_from_first_person( - pred_values - ) + return cps_sim.populations[entity].value_from_first_person(pred_values) return pred_values # Impute weeks_unemployed for PUF half puf_weeks = None if y_full is not None and dataset_path is not None: - puf_weeks = _impute_weeks_unemployed( - data, y_full, time_period, dataset_path - ) + puf_weeks = _impute_weeks_unemployed(data, y_full, time_period, dataset_path) # Impute retirement contributions for PUF half puf_retirement = None @@ -526,24 +518,14 @@ def _map_to_entity(pred_values, variable_name): time_period: np.concatenate([values, values + values.max()]) } elif "_weight" in variable: - new_data[variable] = { - time_period: np.concatenate([values, values * 0]) - } + new_data[variable] = {time_period: np.concatenate([values, values * 0])} elif variable == "weeks_unemployed" and puf_weeks is not None: - new_data[variable] = { - time_period: np.concatenate([values, puf_weeks]) - } - elif ( - variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None - ): + new_data[variable] = {time_period: np.concatenate([values, puf_weeks])} + elif variable in CPS_RETIREMENT_VARIABLES and puf_retirement is not None: puf_vals = puf_retirement[variable] - new_data[variable] = { - time_period: np.concatenate([values, puf_vals]) - } + new_data[variable] = {time_period: np.concatenate([values, puf_vals])} else: - new_data[variable] = { - time_period: np.concatenate([values, values]) - } + new_data[variable] = {time_period: np.concatenate([values, values])} new_data["state_fips"] = { time_period: np.concatenate([state_fips, state_fips]).astype(np.int32) @@ -656,11 +638,7 @@ def _impute_weeks_unemployed( logger.info( "Imputed weeks_unemployed for PUF: %d with weeks > 0, mean = %.1f", (imputed_weeks > 0).sum(), - ( - imputed_weeks[imputed_weeks > 0].mean() - if (imputed_weeks > 0).any() - else 0 - ), + (imputed_weeks[imputed_weeks > 0].mean() if (imputed_weeks > 0).any() else 0), ) return imputed_weeks @@ -822,9 +800,7 @@ def _run_qrf_imputation( puf_sim = Microsimulation(dataset=puf_dataset) - puf_agi = puf_sim.calculate( - "adjusted_gross_income", map_to="person" - ).values + puf_agi = puf_sim.calculate("adjusted_gross_income", map_to="person").values X_train_full = puf_sim.calculate_dataframe( DEMOGRAPHIC_PREDICTORS + IMPUTED_VARIABLES @@ -901,9 +877,7 @@ def _stratified_subsample_index( if remaining_quota >= len(bottom_idx): selected_bottom = bottom_idx else: - selected_bottom = rng.choice( - bottom_idx, size=remaining_quota, replace=False - ) + selected_bottom = rng.choice(bottom_idx, size=remaining_quota, replace=False) selected = np.concatenate([top_idx, selected_bottom]) selected.sort() diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 5844c91b..35147ff2 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -63,7 +63,7 @@ def _rename_imputed_to_inputs(cls, data): (e.g. taxable_private_pension_income) so formulas work. """ for formula_var, input_var in cls._IMPUTED_TO_INPUT.items(): - if formula_var in data and input_var not in data: + if formula_var in data: logger.info( "Renaming %s -> %s (leaf input)", formula_var, @@ -72,24 +72,26 @@ def _rename_imputed_to_inputs(cls, data): data[input_var] = data.pop(formula_var) return data - # Variables with formulas that must still be stored (e.g. IDs - # needed by the dataset loader before formulas can run). - _KEEP_FORMULA_VARS = {"person_id"} + # Variables with formulas/adds that must still be stored. + # Includes IDs needed before formulas run and tax-unit-level + # QRF-imputed vars that can't be renamed to person-level leaves + # due to entity shape mismatch. + _KEEP_FORMULA_VARS = { + "person_id", + "interest_deduction", + "self_employed_pension_contribution_ald", + "self_employed_health_insurance_ald", + } # QRF imputes formula-level variables (e.g. taxable_pension_income) # but we must store them under leaf input names so # _drop_formula_variables doesn't discard them. The engine then # recomputes the formula var from its adds. + # NOTE: only same-entity renames here; cross-entity vars + # (tax_unit -> person) go in _KEEP_FORMULA_VARS instead. _IMPUTED_TO_INPUT = { "taxable_pension_income": "taxable_private_pension_income", "tax_exempt_pension_income": "tax_exempt_private_pension_income", - "interest_deduction": "deductible_mortgage_interest", - "self_employed_pension_contribution_ald": ( - "self_employed_pension_contribution_ald_person" - ), - "self_employed_health_insurance_ald": ( - "self_employed_health_insurance_ald_person" - ), } @classmethod diff --git a/uv.lock b/uv.lock index 0290432b..741ec1e9 100644 --- a/uv.lock +++ b/uv.lock @@ -610,6 +610,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" }, { url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" }, { url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" }, + { url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" }, { url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" }, { url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" }, { url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" }, @@ -617,6 +618,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/2f/28592176381b9ab2cafa12829ba7b472d177f3acc35d8fbcf3673d966fff/greenlet-3.3.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:a1e41a81c7e2825822f4e068c48cb2196002362619e2d70b148f20a831c00739", size = 275140, upload-time = "2025-12-04T14:23:01.282Z" }, { url = "https://files.pythonhosted.org/packages/2c/80/fbe937bf81e9fca98c981fe499e59a3f45df2a04da0baa5c2be0dca0d329/greenlet-3.3.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f515a47d02da4d30caaa85b69474cec77b7929b2e936ff7fb853d42f4bf8808", size = 599219, upload-time = "2025-12-04T14:50:08.309Z" }, { url = "https://files.pythonhosted.org/packages/c2/ff/7c985128f0514271b8268476af89aee6866df5eec04ac17dcfbc676213df/greenlet-3.3.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d2d9fd66bfadf230b385fdc90426fcd6eb64db54b40c495b72ac0feb5766c54", size = 610211, upload-time = "2025-12-04T14:57:43.968Z" }, + { url = "https://files.pythonhosted.org/packages/79/07/c47a82d881319ec18a4510bb30463ed6891f2ad2c1901ed5ec23d3de351f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:30a6e28487a790417d036088b3bcb3f3ac7d8babaa7d0139edbaddebf3af9492", size = 624311, upload-time = "2025-12-04T15:07:14.697Z" }, { url = "https://files.pythonhosted.org/packages/fd/8e/424b8c6e78bd9837d14ff7df01a9829fc883ba2ab4ea787d4f848435f23f/greenlet-3.3.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:087ea5e004437321508a8d6f20efc4cfec5e3c30118e1417ea96ed1d93950527", size = 612833, upload-time = "2025-12-04T14:26:03.669Z" }, { url = "https://files.pythonhosted.org/packages/b5/ba/56699ff9b7c76ca12f1cdc27a886d0f81f2189c3455ff9f65246780f713d/greenlet-3.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ab97cf74045343f6c60a39913fa59710e4bd26a536ce7ab2397adf8b27e67c39", size = 1567256, upload-time = "2025-12-04T15:04:25.276Z" }, { url = "https://files.pythonhosted.org/packages/1e/37/f31136132967982d698c71a281a8901daf1a8fbab935dce7c0cf15f942cc/greenlet-3.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5375d2e23184629112ca1ea89a53389dddbffcf417dad40125713d88eb5f96e8", size = 1636483, upload-time = "2025-12-04T14:27:30.804Z" }, @@ -1159,7 +1161,7 @@ wheels = [ [[package]] name = "microimpute" -version = "1.12.0" +version = "1.15.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "joblib" }, @@ -1176,9 +1178,9 @@ dependencies = [ { name = "statsmodels" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8e/f1/b3e407ddadea69198b36f87b855416684d99631a1f62fb952ceb820f755c/microimpute-1.12.0.tar.gz", hash = "sha256:f8554b2f40d0d11b079860e7b32af04acb7910a8632dc5a6a8c469990c4aa225", size = 125271, upload-time = "2025-12-11T14:05:13.249Z" } +sdist = { url = "https://files.pythonhosted.org/packages/97/17/d621d4ed40e0afac6f1a2c4dea423783576613820d1460ae30d65c48309e/microimpute-1.15.1.tar.gz", hash = "sha256:af409525d475efeb8c8526e9630834c4f16563e15cd42665117d2a1397fcf404", size = 128669, upload-time = "2026-03-09T15:59:33.885Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/5f/6fb8a1058c6e06670f6cea56b49300cf169e685e254ff2455a97afc3f64b/microimpute-1.12.0-py3-none-any.whl", hash = "sha256:76433c4927a2140ab217e1da503b1e5c2fff03c4b6dfd940d8d7d5ccfc2df9fd", size = 108702, upload-time = "2025-12-11T14:05:12.005Z" }, + { url = "https://files.pythonhosted.org/packages/42/f1/1d80dbb8cc9e85962524a4233cfe42ac1a78e6f2cc0ca479ed1817f6d8ae/microimpute-1.15.1-py3-none-any.whl", hash = "sha256:f5f2de91eeedea28ddae42d42757b558d6eb85c1a1fd6a9097b53e309f19369c", size = 111313, upload-time = "2026-03-09T15:59:32.553Z" }, ] [[package]] @@ -1869,7 +1871,7 @@ requires-dist = [ { name = "google-cloud-storage", specifier = ">=2.0.0" }, { name = "l0-python", marker = "extra == 'l0'" }, { name = "microdf-python", specifier = ">=1.2.1" }, - { name = "microimpute", specifier = ">=1.1.4" }, + { name = "microimpute", specifier = ">=1.15.1" }, { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" },