From 630a8a03580e72eb497bda66e4c3275c021a984e Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Sun, 8 Mar 2026 21:45:32 +0000 Subject: [PATCH 1/3] Fix NEED calibration to use gross income for income band assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NEED 2023 income bands use Experian modelled gross household income, not net income. The previous code used hbai_household_net_income which misallocated households across bands (especially at extremes — too few in £100k+ bands). This switches to household_gross_income (LCFS P344p) for both the LCFS training calibration and the FRS 4D raking step. Co-Authored-By: Claude --- changelog.d/need_gross_income.fixed.md | 1 + .../datasets/imputations/consumption.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 changelog.d/need_gross_income.fixed.md diff --git a/changelog.d/need_gross_income.fixed.md b/changelog.d/need_gross_income.fixed.md new file mode 100644 index 00000000..e44c3b92 --- /dev/null +++ b/changelog.d/need_gross_income.fixed.md @@ -0,0 +1 @@ +Use gross household income (LCFS P344p / FRS household_gross_income) instead of HBAI net income when assigning households to NEED 2023 income bands for energy consumption calibration. NEED uses Experian modelled gross income, so the previous use of net income misallocated households across bands. diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index bd77973e..070892a2 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -13,6 +13,9 @@ and demographics, matching the strong drivers in NEED admin data. - Imputed totals are calibrated to NEED 2023 mean kWh targets by income band, converted to spend using Ofgem Q4 2023 unit rates (Oct 2023 price cap). + NEED income bands use Experian modelled gross household income, so calibration + matches against gross income (LCFS P344p / FRS household_gross_income) rather + than HBAI net income. """ import pandas as pd @@ -75,6 +78,7 @@ "G019": "is_child", "Gorx": "region", "P389p": "hbai_household_net_income", + "P344p": "household_gross_income", "weighta": "household_weight", } PERSON_LCF_RENAMES = { @@ -146,6 +150,7 @@ OFGEM_Q4_2023_GAS_RATE = 6.89 / 100 # £/kWh (Oct 2023 price cap) # NEED 2023 mean kWh by income band (Table 11b gas, Table 12b electricity) +# Income bands are gross household income (Experian modelled data) NEED_INCOME_BANDS = [ (0, 15_000, "under_15k", 7_755, 2_412), # gas kWh, elec kWh (15_000, 20_000, "15k_20k", 9_196, 2_700), @@ -336,11 +341,14 @@ def _derive_energy_from_lcfs(household: pd.DataFrame) -> pd.DataFrame: def _calibrate_energy_to_need( - household: pd.DataFrame, income_col: str = "hbai_household_net_income" + household: pd.DataFrame, income_col: str = "household_gross_income" ) -> pd.DataFrame: """ Rescale imputed electricity and gas spend to match NEED 2023 income-band means. + NEED 2023 income bands use Experian modelled gross household income, so we + match against gross income rather than HBAI net income. + For each NEED income band, computes the ratio of the NEED-implied mean spend to the LCFS-derived mean spend and applies it multiplicatively. This preserves within-band distributional shape while anchoring the level to admin data. @@ -471,6 +479,7 @@ def generate_lcfs_table(lcfs_person: pd.DataFrame, lcfs_household: pd.DataFrame) # Annualise weekly LCFS values (× 52) annualise = list(CONSUMPTION_VARIABLE_RENAMES.values()) + [ "hbai_household_net_income", + "household_gross_income", "electricity_consumption", "gas_consumption", ] @@ -516,6 +525,7 @@ def uprate_lcfs_table(household: pd.DataFrame, time_period: str) -> pd.DataFrame # Uprate income predictor so training distribution matches FRS target year for col in [ "hbai_household_net_income", + "household_gross_income", "employment_income", "self_employment_income", "private_pension_income", @@ -584,7 +594,9 @@ def impute_consumption(dataset: UKSingleYearDataset) -> UKSingleYearDataset: # This is a 4-dimensional raking (vs the 1D income-band calibration on LCFS # training data in _calibrate_energy_to_need) because the FRS has the full # set of housing/demographic variables needed for multi-margin calibration. - income = input_df["hbai_household_net_income"].values + # NEED income bands use Experian modelled gross income, so we use + # household_gross_income rather than hbai_household_net_income. + income = sim.calculate("household_gross_income", map_to="household").values tenure = sim.calculate("tenure_type", map_to="household").values accomm = sim.calculate("accommodation_type", map_to="household").values region = sim.calculate("region", map_to="household").values From 2bf1ffbb38576b49dcbecdef7265a1f20de432ec Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Sun, 8 Mar 2026 21:58:39 +0000 Subject: [PATCH 2/3] Fix LCFS column name: p344p is lowercase Co-Authored-By: Claude --- policyengine_uk_data/datasets/imputations/consumption.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_uk_data/datasets/imputations/consumption.py b/policyengine_uk_data/datasets/imputations/consumption.py index 070892a2..d662f110 100644 --- a/policyengine_uk_data/datasets/imputations/consumption.py +++ b/policyengine_uk_data/datasets/imputations/consumption.py @@ -78,7 +78,7 @@ "G019": "is_child", "Gorx": "region", "P389p": "hbai_household_net_income", - "P344p": "household_gross_income", + "p344p": "household_gross_income", "weighta": "household_weight", } PERSON_LCF_RENAMES = { From bec0bfead56429189021b9438c156d476c02615c Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 9 Mar 2026 08:31:00 +0000 Subject: [PATCH 3/3] Use gross income in calibration test to match raking The test was evaluating NEED band fit using hbai_household_net_income while the raking now targets household_gross_income. Co-Authored-By: Claude --- policyengine_uk_data/tests/test_energy_calibration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_uk_data/tests/test_energy_calibration.py b/policyengine_uk_data/tests/test_energy_calibration.py index 0664f25e..5d254a22 100644 --- a/policyengine_uk_data/tests/test_energy_calibration.py +++ b/policyengine_uk_data/tests/test_energy_calibration.py @@ -50,7 +50,7 @@ def arrays(imputed): sim = Microsimulation(dataset=imputed) return dict( income=sim.calculate( - "hbai_household_net_income", map_to="household", period=2023 + "household_gross_income", map_to="household", period=2023 ).values, tenure=sim.calculate("tenure_type", map_to="household", period=2023).values, accomm=sim.calculate(