Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/tiny-datasets.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added tiny (n=1,000 households) versions of the frs_2023_24 and enhanced_frs_2023_24 datasets for faster testing and development.
19 changes: 19 additions & 0 deletions policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.datasets.frs import create_frs
from policyengine_uk_data.storage import STORAGE_FOLDER
import logging
import os
from policyengine_uk_data.utils.uprating import uprate_dataset
from policyengine_uk_data.utils.subsample import subsample_dataset
from policyengine_uk_data.utils.progress import (
ProcessingProgress,
display_success_panel,
Expand Down Expand Up @@ -37,6 +39,7 @@ def main():
"Calibrate local authority weights",
"Downrate to 2023",
"Save final dataset",
"Create tiny datasets",
]

with progress_tracker.track_dataset_creation(steps) as (
Expand Down Expand Up @@ -172,12 +175,28 @@ def main():
frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5")
update_dataset("Save final dataset", "completed")

# Create tiny (n=1000 households) versions for testing
update_dataset("Create tiny datasets", "processing")
TINY_SIZE = 1_000

frs_base = UKSingleYearDataset(
file_path=str(STORAGE_FOLDER / "frs_2023_24.h5")
)
tiny_frs = subsample_dataset(frs_base, TINY_SIZE)
tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5")

tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE)
tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5")
update_dataset("Create tiny datasets", "completed")

# Display success message
display_success_panel(
"Dataset creation completed successfully",
details={
"base_dataset": "frs_2023_24.h5",
"enhanced_dataset": "enhanced_frs_2023_24.h5",
"tiny_base_dataset": "frs_2023_24_tiny.h5",
"tiny_enhanced_dataset": "enhanced_frs_2023_24_tiny.h5",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
"calibration": "national, LA and constituency targets",
},
Expand Down
2 changes: 1 addition & 1 deletion policyengine_uk_data/datasets/imputations/income.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def generate_spi_table(spi: pd.DataFrame):

spi = pd.concat(
[
spi.sample(100_000, weights=spi.person_weight),
spi.sample(100_000, weights=spi.person_weight, replace=True),
]
)

Expand Down
45 changes: 34 additions & 11 deletions policyengine_uk_data/utils/subsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,56 @@ def subsample_dataset(
seed: int = 42,
):
"""
Subsample a UKSingleYearDataset to a specified sample size.
Subsample a UKSingleYearDataset to a specified number of households.

Households are sampled with probability proportional to their weight,
and weights are rescaled so the subsampled dataset preserves population
totals.

Parameters:
dataset (UKSingleYearDataset): The dataset to subsample.
sample_size (int): The number of samples to retain.
sample_size (int): The number of households to retain.
seed (int): Random seed for reproducibility.

Returns:
UKSingleYearDataset: A new dataset with the specified sample size.
"""
np.random.seed(seed)
household_ids = np.random.choice(
dataset.household.household_id.values,
rng = np.random.default_rng(seed)
household_df = dataset.household
weights = household_df.household_weight.values.astype(float)
total_weight = np.nansum(weights)

# Sample proportional to weight when weights are available,
# otherwise fall back to uniform sampling
if total_weight > 0 and not np.any(np.isnan(weights)):
probs = weights / total_weight
else:
probs = None

indices = rng.choice(
len(household_df),
size=sample_size,
replace=False,
p=probs,
)
household_ids = household_df.household_id.values[indices]

person_filter = dataset.person.person_household_id.isin(household_ids)
benunit_ids = dataset.person.person_benunit_id[
dataset.person.person_household_id.isin(household_ids)
]
benunit_ids = dataset.person.person_benunit_id[person_filter]
benunit_filter = dataset.benunit.benunit_id.isin(benunit_ids)
household_filter = dataset.household.household_id.isin(household_ids)

# Rescale weights so the subsample preserves the original population total
sub_household = dataset.household[household_filter].copy()
sub_weight_sum = sub_household.household_weight.sum()
if total_weight > 0 and sub_weight_sum > 0:
scale = total_weight / sub_weight_sum
sub_household["household_weight"] = sub_household.household_weight * scale

subsampled_dataset = UKSingleYearDataset(
person=dataset.person[person_filter],
benunit=dataset.benunit[benunit_filter],
household=dataset.household[household_filter],
person=dataset.person[person_filter].reset_index(drop=True),
benunit=dataset.benunit[benunit_filter].reset_index(drop=True),
household=sub_household.reset_index(drop=True),
fiscal_year=dataset.time_period,
)

Expand Down
Loading