From 265f7f3fbad998a53f8034c12ec0d483fa5f1f45 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 12 Mar 2026 13:23:48 +0000 Subject: [PATCH 1/3] Add tiny dataset generation (n=1000 households) to pipeline Produces frs_2023_24_tiny.h5 and enhanced_frs_2023_24_tiny.h5 by subsampling with probability proportional to weight and rescaling weights to preserve population totals. --- changelog.d/tiny-datasets.added | 1 + .../datasets/create_datasets.py | 21 ++++++++++ policyengine_uk_data/utils/subsample.py | 38 +++++++++++++------ 3 files changed, 49 insertions(+), 11 deletions(-) create mode 100644 changelog.d/tiny-datasets.added diff --git a/changelog.d/tiny-datasets.added b/changelog.d/tiny-datasets.added new file mode 100644 index 00000000..e2a34058 --- /dev/null +++ b/changelog.d/tiny-datasets.added @@ -0,0 +1 @@ +Added tiny (n=1000 household) versions of frs_2023_24 and enhanced_frs_2023_24 datasets for faster testing and development. diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 2396946e..e763eeb9 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -1,8 +1,10 @@ +from policyengine_uk.data import UKSingleYearDataset from policyengine_uk_data.datasets.frs import create_frs from policyengine_uk_data.storage import STORAGE_FOLDER import logging import os from policyengine_uk_data.utils.uprating import uprate_dataset +from policyengine_uk_data.utils.subsample import subsample_dataset from policyengine_uk_data.utils.progress import ( ProcessingProgress, display_success_panel, @@ -37,6 +39,7 @@ def main(): "Calibrate local authority weights", "Downrate to 2023", "Save final dataset", + "Create tiny datasets", ] with progress_tracker.track_dataset_creation(steps) as ( @@ -172,12 +175,30 @@ def main(): frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5") update_dataset("Save final dataset", "completed") + # Create tiny (n=1000 households) versions for testing + update_dataset("Create tiny datasets", "processing") + TINY_SIZE = 1_000 + + frs_base = UKSingleYearDataset( + file_path=str(STORAGE_FOLDER / "frs_2023_24.h5") + ) + tiny_frs = subsample_dataset(frs_base, TINY_SIZE) + tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5") + + tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE) + tiny_enhanced.save( + STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5" + ) + update_dataset("Create tiny datasets", "completed") + # Display success message display_success_panel( "Dataset creation completed successfully", details={ "base_dataset": "frs_2023_24.h5", "enhanced_dataset": "enhanced_frs_2023_24.h5", + "tiny_base_dataset": "frs_2023_24_tiny.h5", + "tiny_enhanced_dataset": "enhanced_frs_2023_24_tiny.h5", "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan", "calibration": "national, LA and constituency targets", }, diff --git a/policyengine_uk_data/utils/subsample.py b/policyengine_uk_data/utils/subsample.py index 5adc467a..f8f7590e 100644 --- a/policyengine_uk_data/utils/subsample.py +++ b/policyengine_uk_data/utils/subsample.py @@ -15,33 +15,49 @@ def subsample_dataset( seed: int = 42, ): """ - Subsample a UKSingleYearDataset to a specified sample size. + Subsample a UKSingleYearDataset to a specified number of households. + + Households are sampled with probability proportional to their weight, + and weights are rescaled so the subsampled dataset preserves population + totals. Parameters: dataset (UKSingleYearDataset): The dataset to subsample. - sample_size (int): The number of samples to retain. + sample_size (int): The number of households to retain. seed (int): Random seed for reproducibility. Returns: UKSingleYearDataset: A new dataset with the specified sample size. """ - np.random.seed(seed) - household_ids = np.random.choice( - dataset.household.household_id.values, + rng = np.random.default_rng(seed) + household_df = dataset.household + weights = household_df.household_weight.values.astype(float) + total_weight = weights.sum() + + # Sample proportional to weight for a more representative subsample + probs = weights / total_weight + indices = rng.choice( + len(household_df), size=sample_size, replace=False, + p=probs, ) + household_ids = household_df.household_id.values[indices] + person_filter = dataset.person.person_household_id.isin(household_ids) - benunit_ids = dataset.person.person_benunit_id[ - dataset.person.person_household_id.isin(household_ids) - ] + benunit_ids = dataset.person.person_benunit_id[person_filter] benunit_filter = dataset.benunit.benunit_id.isin(benunit_ids) household_filter = dataset.household.household_id.isin(household_ids) + # Rescale weights so the subsample preserves the original population total + sub_household = dataset.household[household_filter].copy() + scale = total_weight / sub_household.household_weight.sum() + sub_household["household_weight"] = sub_household.household_weight * scale + subsampled_dataset = UKSingleYearDataset( - person=dataset.person[person_filter], - benunit=dataset.benunit[benunit_filter], - household=dataset.household[household_filter], + person=dataset.person[person_filter].reset_index(drop=True), + benunit=dataset.benunit[benunit_filter].reset_index(drop=True), + household=sub_household.reset_index(drop=True), fiscal_year=dataset.time_period, ) From fa4905fd7584321c060e4c91a4072b108e55ac2d Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 12 Mar 2026 13:39:03 +0000 Subject: [PATCH 2/3] Fix subsample NaN crash and formatting Handle zero/NaN weights by falling back to uniform sampling, fixing the crash in impute_income which passes a zero-weight dataset. --- .../datasets/create_datasets.py | 4 +--- policyengine_uk_data/utils/subsample.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index e763eeb9..846a7390 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -186,9 +186,7 @@ def main(): tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5") tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE) - tiny_enhanced.save( - STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5" - ) + tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5") update_dataset("Create tiny datasets", "completed") # Display success message diff --git a/policyengine_uk_data/utils/subsample.py b/policyengine_uk_data/utils/subsample.py index f8f7590e..351e8cd2 100644 --- a/policyengine_uk_data/utils/subsample.py +++ b/policyengine_uk_data/utils/subsample.py @@ -32,10 +32,15 @@ def subsample_dataset( rng = np.random.default_rng(seed) household_df = dataset.household weights = household_df.household_weight.values.astype(float) - total_weight = weights.sum() + total_weight = np.nansum(weights) + + # Sample proportional to weight when weights are available, + # otherwise fall back to uniform sampling + if total_weight > 0 and not np.any(np.isnan(weights)): + probs = weights / total_weight + else: + probs = None - # Sample proportional to weight for a more representative subsample - probs = weights / total_weight indices = rng.choice( len(household_df), size=sample_size, @@ -51,8 +56,10 @@ def subsample_dataset( # Rescale weights so the subsample preserves the original population total sub_household = dataset.household[household_filter].copy() - scale = total_weight / sub_household.household_weight.sum() - sub_household["household_weight"] = sub_household.household_weight * scale + sub_weight_sum = sub_household.household_weight.sum() + if total_weight > 0 and sub_weight_sum > 0: + scale = total_weight / sub_weight_sum + sub_household["household_weight"] = sub_household.household_weight * scale subsampled_dataset = UKSingleYearDataset( person=dataset.person[person_filter].reset_index(drop=True), From d02d0b39600449706a1baf146db7503efb74d46a Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 12 Mar 2026 13:57:28 +0000 Subject: [PATCH 3/3] Fix weighted SPI sampling with replace=True Pandas rejects weighted sampling without replacement when weights are large. Using replace=True since the sample is used for training data. --- policyengine_uk_data/datasets/imputations/income.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index 5f0639a7..f9ea4577 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -81,7 +81,7 @@ def generate_spi_table(spi: pd.DataFrame): spi = pd.concat( [ - spi.sample(100_000, weights=spi.person_weight), + spi.sample(100_000, weights=spi.person_weight, replace=True), ] )