From 265f7f3fbad998a53f8034c12ec0d483fa5f1f45 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Thu, 12 Mar 2026 13:23:48 +0000
Subject: [PATCH 1/3] Add tiny dataset generation (n=1000 households) to
 pipeline

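Produces frs_2023_24_tiny.h5 and enhanced_frs_2023_24_tiny.h5 by
subsampling households without replacement, with selection probability
proportional to household weight, and rescaling the retained household
weights so that their sum matches the original total household weight.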
---
 changelog.d/tiny-datasets.added               |  1 +
 .../datasets/create_datasets.py               | 21 ++++++++++
 policyengine_uk_data/utils/subsample.py       | 38 +++++++++++++------
 3 files changed, 49 insertions(+), 11 deletions(-)
 create mode 100644 changelog.d/tiny-datasets.added

diff --git a/changelog.d/tiny-datasets.added b/changelog.d/tiny-datasets.added
new file mode 100644
index 00000000..e2a34058
--- /dev/null
+++ b/changelog.d/tiny-datasets.added
@@ -0,0 +1 @@
+Added tiny (n=1000 household) versions of frs_2023_24 and enhanced_frs_2023_24 datasets for faster testing and development.
diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py
index 2396946e..e763eeb9 100644
--- a/policyengine_uk_data/datasets/create_datasets.py
+++ b/policyengine_uk_data/datasets/create_datasets.py
@@ -1,8 +1,10 @@
+from policyengine_uk.data import UKSingleYearDataset
 from policyengine_uk_data.datasets.frs import create_frs
 from policyengine_uk_data.storage import STORAGE_FOLDER
 import logging
 import os
 from policyengine_uk_data.utils.uprating import uprate_dataset
+from policyengine_uk_data.utils.subsample import subsample_dataset
 from policyengine_uk_data.utils.progress import (
     ProcessingProgress,
     display_success_panel,
@@ -37,6 +39,7 @@ def main():
             "Calibrate local authority weights",
             "Downrate to 2023",
             "Save final dataset",
+            "Create tiny datasets",
         ]
 
         with progress_tracker.track_dataset_creation(steps) as (
@@ -172,12 +175,30 @@ def main():
             frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5")
             update_dataset("Save final dataset", "completed")
 
+            # Create tiny (n=1000 households) versions for testing
+            update_dataset("Create tiny datasets", "processing")
+            TINY_SIZE = 1_000
+
+            frs_base = UKSingleYearDataset(
+                file_path=str(STORAGE_FOLDER / "frs_2023_24.h5")
+            )
+            tiny_frs = subsample_dataset(frs_base, TINY_SIZE)
+            tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5")
+
+            tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE)
+            tiny_enhanced.save(
+                STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5"
+            )
+            update_dataset("Create tiny datasets", "completed")
+
         # Display success message
         display_success_panel(
             "Dataset creation completed successfully",
             details={
                 "base_dataset": "frs_2023_24.h5",
                 "enhanced_dataset": "enhanced_frs_2023_24.h5",
+                "tiny_base_dataset": "frs_2023_24_tiny.h5",
+                "tiny_enhanced_dataset": "enhanced_frs_2023_24_tiny.h5",
                 "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
                 "calibration": "national, LA and  constituency targets",
             },
diff --git a/policyengine_uk_data/utils/subsample.py b/policyengine_uk_data/utils/subsample.py
index 5adc467a..f8f7590e 100644
--- a/policyengine_uk_data/utils/subsample.py
+++ b/policyengine_uk_data/utils/subsample.py
@@ -15,33 +15,49 @@ def subsample_dataset(
     seed: int = 42,
 ):
     """
-    Subsample a UKSingleYearDataset to a specified sample size.
+    Subsample a UKSingleYearDataset to a specified number of households.
+
+    Households are sampled with probability proportional to their weight,
+    and weights are rescaled so the subsampled dataset preserves population
+    totals.
 
     Parameters:
         dataset (UKSingleYearDataset): The dataset to subsample.
-        sample_size (int): The number of samples to retain.
+        sample_size (int): The number of households to retain.
         seed (int): Random seed for reproducibility.
 
     Returns:
         UKSingleYearDataset: A new dataset with the specified sample size.
     """
-    np.random.seed(seed)
-    household_ids = np.random.choice(
-        dataset.household.household_id.values,
+    rng = np.random.default_rng(seed)
+    household_df = dataset.household
+    weights = household_df.household_weight.values.astype(float)
+    total_weight = weights.sum()
+
+    # Sample proportional to weight for a more representative subsample
+    probs = weights / total_weight
+    indices = rng.choice(
+        len(household_df),
         size=sample_size,
         replace=False,
+        p=probs,
     )
+    household_ids = household_df.household_id.values[indices]
+
     person_filter = dataset.person.person_household_id.isin(household_ids)
-    benunit_ids = dataset.person.person_benunit_id[
-        dataset.person.person_household_id.isin(household_ids)
-    ]
+    benunit_ids = dataset.person.person_benunit_id[person_filter]
     benunit_filter = dataset.benunit.benunit_id.isin(benunit_ids)
     household_filter = dataset.household.household_id.isin(household_ids)
 
+    # Rescale weights so the subsample preserves the original population total
+    sub_household = dataset.household[household_filter].copy()
+    scale = total_weight / sub_household.household_weight.sum()
+    sub_household["household_weight"] = sub_household.household_weight * scale
+
     subsampled_dataset = UKSingleYearDataset(
-        person=dataset.person[person_filter],
-        benunit=dataset.benunit[benunit_filter],
-        household=dataset.household[household_filter],
+        person=dataset.person[person_filter].reset_index(drop=True),
+        benunit=dataset.benunit[benunit_filter].reset_index(drop=True),
+        household=sub_household.reset_index(drop=True),
         fiscal_year=dataset.time_period,
     )
 

From fa4905fd7584321c060e4c91a4072b108e55ac2d Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Thu, 12 Mar 2026 13:39:03 +0000
Subject: [PATCH 2/3] Fix subsample NaN crash and formatting

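Handle zero/NaN weights by falling back to uniform sampling, and skip
the weight rescaling when the original or subsampled weight total is not
positive. This fixes the crash in impute_income, which passes a
zero-weight dataset. Also collapse the tiny_enhanced.save call onto a
single line.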
---
 .../datasets/create_datasets.py                 |  4 +---
 policyengine_uk_data/utils/subsample.py         | 17 ++++++++++++-----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py
index e763eeb9..846a7390 100644
--- a/policyengine_uk_data/datasets/create_datasets.py
+++ b/policyengine_uk_data/datasets/create_datasets.py
@@ -186,9 +186,7 @@ def main():
             tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5")
 
             tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE)
-            tiny_enhanced.save(
-                STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5"
-            )
+            tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5")
             update_dataset("Create tiny datasets", "completed")
 
         # Display success message
diff --git a/policyengine_uk_data/utils/subsample.py b/policyengine_uk_data/utils/subsample.py
index f8f7590e..351e8cd2 100644
--- a/policyengine_uk_data/utils/subsample.py
+++ b/policyengine_uk_data/utils/subsample.py
@@ -32,10 +32,15 @@ def subsample_dataset(
     rng = np.random.default_rng(seed)
     household_df = dataset.household
     weights = household_df.household_weight.values.astype(float)
-    total_weight = weights.sum()
+    total_weight = np.nansum(weights)
+
+    # Sample proportional to weight when weights are available,
+    # otherwise fall back to uniform sampling
+    if total_weight > 0 and not np.any(np.isnan(weights)):
+        probs = weights / total_weight
+    else:
+        probs = None
 
-    # Sample proportional to weight for a more representative subsample
-    probs = weights / total_weight
     indices = rng.choice(
         len(household_df),
         size=sample_size,
@@ -51,8 +56,10 @@ def subsample_dataset(
 
     # Rescale weights so the subsample preserves the original population total
     sub_household = dataset.household[household_filter].copy()
-    scale = total_weight / sub_household.household_weight.sum()
-    sub_household["household_weight"] = sub_household.household_weight * scale
+    sub_weight_sum = sub_household.household_weight.sum()
+    if total_weight > 0 and sub_weight_sum > 0:
+        scale = total_weight / sub_weight_sum
+        sub_household["household_weight"] = sub_household.household_weight * scale
 
     subsampled_dataset = UKSingleYearDataset(
         person=dataset.person[person_filter].reset_index(drop=True),

From d02d0b39600449706a1baf146db7503efb74d46a Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Thu, 12 Mar 2026 13:57:28 +0000
Subject: [PATCH 3/3] Fix weighted SPI sampling with replace=True

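Pandas raises an error for weighted sampling without replacement when
the largest weights are too big, relative to the weight total, for the
requested sample size. Use replace=True instead; rows may then be drawn
more than once, which is acceptable since the sample is used for
training data.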
---
 policyengine_uk_data/datasets/imputations/income.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
index 5f0639a7..f9ea4577 100644
--- a/policyengine_uk_data/datasets/imputations/income.py
+++ b/policyengine_uk_data/datasets/imputations/income.py
@@ -81,7 +81,7 @@ def generate_spi_table(spi: pd.DataFrame):
 
     spi = pd.concat(
         [
-            spi.sample(100_000, weights=spi.person_weight),
+            spi.sample(100_000, weights=spi.person_weight, replace=True),
         ]
     )