Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/tiny-datasets.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added tiny (n=1,000 households) versions of the frs_2023_24 and enhanced_frs_2023_24 datasets for faster testing and development.
19 changes: 19 additions & 0 deletions policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.datasets.frs import create_frs
from policyengine_uk_data.storage import STORAGE_FOLDER
import logging
import os
from policyengine_uk_data.utils.uprating import uprate_dataset
from policyengine_uk_data.utils.subsample import subsample_dataset
from policyengine_uk_data.utils.progress import (
ProcessingProgress,
display_success_panel,
Expand Down Expand Up @@ -37,6 +39,7 @@ def main():
"Calibrate local authority weights",
"Downrate to 2023",
"Save final dataset",
"Create tiny datasets",
]

with progress_tracker.track_dataset_creation(steps) as (
Expand Down Expand Up @@ -172,12 +175,28 @@ def main():
frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5")
update_dataset("Save final dataset", "completed")

# Create tiny (n=1000 households) versions for testing
update_dataset("Create tiny datasets", "processing")
TINY_SIZE = 1_000

frs_base = UKSingleYearDataset(
file_path=str(STORAGE_FOLDER / "frs_2023_24.h5")
)
tiny_frs = subsample_dataset(frs_base, TINY_SIZE)
tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5")

tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE)
tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5")
update_dataset("Create tiny datasets", "completed")

# Display success message
display_success_panel(
"Dataset creation completed successfully",
details={
"base_dataset": "frs_2023_24.h5",
"enhanced_dataset": "enhanced_frs_2023_24.h5",
"tiny_base_dataset": "frs_2023_24_tiny.h5",
"tiny_enhanced_dataset": "enhanced_frs_2023_24_tiny.h5",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
"calibration": "national, LA and constituency targets",
},
Expand Down
2 changes: 1 addition & 1 deletion policyengine_uk_data/datasets/imputations/income.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def generate_spi_table(spi: pd.DataFrame):

spi = pd.concat(
[
spi.sample(100_000, weights=spi.person_weight),
spi.sample(100_000, weights=spi.person_weight, replace=True),
]
)

Expand Down
45 changes: 34 additions & 11 deletions policyengine_uk_data/utils/subsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,56 @@ def subsample_dataset(
seed: int = 42,
):
"""
Subsample a UKSingleYearDataset to a specified sample size.
Subsample a UKSingleYearDataset to a specified number of households.

Households are sampled with probability proportional to their weight,
and weights are rescaled so the subsampled dataset preserves population
totals.

Parameters:
dataset (UKSingleYearDataset): The dataset to subsample.
sample_size (int): The number of samples to retain.
sample_size (int): The number of households to retain.
seed (int): Random seed for reproducibility.

Returns:
UKSingleYearDataset: A new dataset with the specified sample size.
"""
np.random.seed(seed)
household_ids = np.random.choice(
dataset.household.household_id.values,
rng = np.random.default_rng(seed)
household_df = dataset.household
weights = household_df.household_weight.values.astype(float)
total_weight = np.nansum(weights)

# Sample proportional to weight when weights are available,
# otherwise fall back to uniform sampling
if total_weight > 0 and not np.any(np.isnan(weights)):
probs = weights / total_weight
else:
probs = None

indices = rng.choice(
len(household_df),
size=sample_size,
replace=False,
p=probs,
)
household_ids = household_df.household_id.values[indices]

person_filter = dataset.person.person_household_id.isin(household_ids)
benunit_ids = dataset.person.person_benunit_id[
dataset.person.person_household_id.isin(household_ids)
]
benunit_ids = dataset.person.person_benunit_id[person_filter]
benunit_filter = dataset.benunit.benunit_id.isin(benunit_ids)
household_filter = dataset.household.household_id.isin(household_ids)

# Rescale weights so the subsample preserves the original population total
sub_household = dataset.household[household_filter].copy()
sub_weight_sum = sub_household.household_weight.sum()
if total_weight > 0 and sub_weight_sum > 0:
scale = total_weight / sub_weight_sum
sub_household["household_weight"] = sub_household.household_weight * scale

subsampled_dataset = UKSingleYearDataset(
person=dataset.person[person_filter],
benunit=dataset.benunit[benunit_filter],
household=dataset.household[household_filter],
person=dataset.person[person_filter].reset_index(drop=True),
benunit=dataset.benunit[benunit_filter].reset_index(drop=True),
household=sub_household.reset_index(drop=True),
fiscal_year=dataset.time_period,
)

Expand Down
Loading