diff --git a/changelog.d/tiny-datasets.added b/changelog.d/tiny-datasets.added new file mode 100644 index 00000000..e2a34058 --- /dev/null +++ b/changelog.d/tiny-datasets.added @@ -0,0 +1 @@ +Added tiny (n=1000 household) versions of frs_2023_24 and enhanced_frs_2023_24 datasets for faster testing and development. diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 2396946e..846a7390 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -1,8 +1,10 @@ +from policyengine_uk.data import UKSingleYearDataset from policyengine_uk_data.datasets.frs import create_frs from policyengine_uk_data.storage import STORAGE_FOLDER import logging import os from policyengine_uk_data.utils.uprating import uprate_dataset +from policyengine_uk_data.utils.subsample import subsample_dataset from policyengine_uk_data.utils.progress import ( ProcessingProgress, display_success_panel, @@ -37,6 +39,7 @@ def main(): "Calibrate local authority weights", "Downrate to 2023", "Save final dataset", + "Create tiny datasets", ] with progress_tracker.track_dataset_creation(steps) as ( @@ -172,12 +175,28 @@ def main(): frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5") update_dataset("Save final dataset", "completed") + # Create tiny (n=1000 households) versions for testing + update_dataset("Create tiny datasets", "processing") + TINY_SIZE = 1_000 + + frs_base = UKSingleYearDataset( + file_path=str(STORAGE_FOLDER / "frs_2023_24.h5") + ) + tiny_frs = subsample_dataset(frs_base, TINY_SIZE) + tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5") + + tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE) + tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5") + update_dataset("Create tiny datasets", "completed") + # Display success message display_success_panel( "Dataset creation completed successfully", details={ "base_dataset": "frs_2023_24.h5", "enhanced_dataset": "enhanced_frs_2023_24.h5", + "tiny_base_dataset": "frs_2023_24_tiny.h5", + "tiny_enhanced_dataset": "enhanced_frs_2023_24_tiny.h5", "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan", "calibration": "national, LA and constituency targets", }, diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index 5f0639a7..f9ea4577 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -81,7 +81,7 @@ def generate_spi_table(spi: pd.DataFrame): spi = pd.concat( [ - spi.sample(100_000, weights=spi.person_weight), + spi.sample(100_000, weights=spi.person_weight, replace=True), ] ) diff --git a/policyengine_uk_data/utils/subsample.py b/policyengine_uk_data/utils/subsample.py index 5adc467a..351e8cd2 100644 --- a/policyengine_uk_data/utils/subsample.py +++ b/policyengine_uk_data/utils/subsample.py @@ -15,33 +15,56 @@ def subsample_dataset( seed: int = 42, ): """ - Subsample a UKSingleYearDataset to a specified sample size. + Subsample a UKSingleYearDataset to a specified number of households. + + Households are sampled with probability proportional to their weight, + and weights are rescaled so the subsampled dataset preserves population + totals. Parameters: dataset (UKSingleYearDataset): The dataset to subsample. - sample_size (int): The number of samples to retain. + sample_size (int): The number of households to retain. seed (int): Random seed for reproducibility. Returns: UKSingleYearDataset: A new dataset with the specified sample size. """ - np.random.seed(seed) - household_ids = np.random.choice( - dataset.household.household_id.values, + rng = np.random.default_rng(seed) + household_df = dataset.household + weights = household_df.household_weight.values.astype(float) + total_weight = np.nansum(weights) + + # Sample proportional to weight when weights are available, + # otherwise fall back to uniform sampling + if total_weight > 0 and not np.any(np.isnan(weights)): + probs = weights / total_weight + else: + probs = None + + indices = rng.choice( + len(household_df), size=sample_size, replace=False, + p=probs, ) + household_ids = household_df.household_id.values[indices] + person_filter = dataset.person.person_household_id.isin(household_ids) - benunit_ids = dataset.person.person_benunit_id[ - dataset.person.person_household_id.isin(household_ids) - ] + benunit_ids = dataset.person.person_benunit_id[person_filter] benunit_filter = dataset.benunit.benunit_id.isin(benunit_ids) household_filter = dataset.household.household_id.isin(household_ids) + # Rescale weights so the subsample preserves the original population total + sub_household = dataset.household[household_filter].copy() + sub_weight_sum = sub_household.household_weight.sum() + if total_weight > 0 and sub_weight_sum > 0: + scale = total_weight / sub_weight_sum + sub_household["household_weight"] = sub_household.household_weight * scale + subsampled_dataset = UKSingleYearDataset( - person=dataset.person[person_filter], - benunit=dataset.benunit[benunit_filter], - household=dataset.household[household_filter], + person=dataset.person[person_filter].reset_index(drop=True), + benunit=dataset.benunit[benunit_filter].reset_index(drop=True), + household=sub_household.reset_index(drop=True), fiscal_year=dataset.time_period, )