diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 25c7975a..38d4883f 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -72,6 +72,9 @@ SIPP_ASSETS_PREDICTORS = [ "employment_income", + "interest_income", + "dividend_income", + "rental_income", "age", "is_female", "is_married", @@ -450,6 +453,10 @@ def _impute_sipp( "TVAL_BANK", "TVAL_STMF", "TVAL_BOND", + "TINC_BANK", + "TINC_STMF", + "TINC_BOND", + "TINC_RENT", ] asset_df = pd.read_csv( STORAGE_FOLDER / "pu2023.csv", @@ -465,6 +472,11 @@ def _impute_sipp( asset_df["is_female"] = asset_df.ESEX == 2 asset_df["is_married"] = asset_df.EMS == 1 asset_df["employment_income"] = asset_df.TPTOTINC * 12 + asset_df["interest_income"] = ( + asset_df["TINC_BANK"].fillna(0) + asset_df["TINC_BOND"].fillna(0) + ) * 12 + asset_df["dividend_income"] = asset_df["TINC_STMF"].fillna(0) * 12 + asset_df["rental_income"] = asset_df["TINC_RENT"].fillna(0) * 12 asset_df["household_weight"] = asset_df.WPFINWGT asset_df["is_under_18"] = asset_df.TAGE < 18 asset_df["count_under_18"] = ( @@ -476,6 +488,9 @@ def _impute_sipp( asset_train_cols = [ "employment_income", + "interest_income", + "dividend_income", + "rental_income", "bank_account_assets", "stock_assets", "bond_assets", @@ -499,7 +514,14 @@ def _impute_sipp( data, time_period, dataset_path, - ["employment_income", "age", "is_male"], + [ + "employment_income", + "interest_income", + "dividend_income", + "rental_income", + "age", + "is_male", + ], ) if "is_male" in cps_asset_df.columns: cps_asset_df["is_female"] = (~cps_asset_df["is_male"].astype(bool)).astype( @@ -518,6 +540,18 @@ def _impute_sipp( if "count_under_18" in cps_tip_df.columns else 0.0 ) + for cap_var in [ + "interest_income", + "dividend_income", + "rental_income", + ]: + if cap_var not in cps_asset_df.columns: + if cap_var in data: + cps_asset_df[cap_var] = data[cap_var][time_period].astype( + np.float32 + ) + else: + cps_asset_df[cap_var] = 0.0 asset_vars = [ "bank_account_assets", diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 418d7396..8bc674ce 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -1759,6 +1759,9 @@ def add_tips(self, cps: h5py.File): "person_id", "household_id", "employment_income", + "interest_income", + "dividend_income", + "rental_income", "age", "household_weight", "is_female", diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index d7708266..4588685d 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -155,6 +155,11 @@ def get_tip_model() -> QRF: "TVAL_BANK", # Checking, savings, money market "TVAL_STMF", # Stocks and mutual funds "TVAL_BOND", # Bonds and government securities + # Income from assets (monthly, person-level) + "TINC_BANK", # Interest from bank accounts + "TINC_STMF", # Dividends from stocks/mutual funds + "TINC_BOND", # Interest from bonds + "TINC_RENT", # Rental income # SSI receipt (for validation) "RSSI_YRYN", # Received SSI in at least one month ] @@ -199,6 +204,12 @@ def train_asset_model(): df["household_weight"] = df.WPFINWGT df["household_id"] = df.SSUID + # Capital income predictors (annualized from monthly SIPP) + # Maps to CPS: interest_income, dividend_income, rental_income + df["interest_income"] = (df["TINC_BANK"].fillna(0) + df["TINC_BOND"].fillna(0)) * 12 + df["dividend_income"] = df["TINC_STMF"].fillna(0) * 12 + df["rental_income"] = df["TINC_RENT"].fillna(0) * 12 + # Calculate household-level counts df["is_under_18"] = df.TAGE < 18 df["count_under_18"] = ( @@ -209,6 +220,9 @@ def train_asset_model(): [ "household_id", "employment_income", + "interest_income", + "dividend_income", + "rental_income", "bank_account_assets", "stock_assets", "bond_assets", @@ -238,6 +252,9 @@ def train_asset_model(): X_train=sipp, predictors=[ "employment_income", + "interest_income", + "dividend_income", + "rental_income", "age", "is_female", "is_married",