From f32b8056a5de433c848e2fa7652e918a6dc2f8d9 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 10:33:09 +0100 Subject: [PATCH 01/27] Add RawACS --- openfisca_us/__init__.py | 1 + openfisca_us/data/__init__.py | 2 +- openfisca_us/data/datasets/__init__.py | 5 +- openfisca_us/data/datasets/acs/__init__.py | 2 + openfisca_us/data/datasets/acs/acs.py | 82 +++++++++++++++++++ openfisca_us/data/datasets/acs/raw_acs.py | 92 ++++++++++++++++++++++ 6 files changed, 181 insertions(+), 3 deletions(-) create mode 100644 openfisca_us/data/datasets/acs/__init__.py create mode 100644 openfisca_us/data/datasets/acs/acs.py create mode 100644 openfisca_us/data/datasets/acs/raw_acs.py diff --git a/openfisca_us/__init__.py b/openfisca_us/__init__.py index c50e1db6b1c..7c638e14845 100644 --- a/openfisca_us/__init__.py +++ b/openfisca_us/__init__.py @@ -9,4 +9,5 @@ """ from openfisca_us.system import CountryTaxBenefitSystem from openfisca_us.api import Microsimulation, IndividualSim +from openfisca_us.data import ACS, CPS from openfisca_us import reforms diff --git a/openfisca_us/data/__init__.py b/openfisca_us/data/__init__.py index 34604780c73..44252f109ee 100644 --- a/openfisca_us/data/__init__.py +++ b/openfisca_us/data/__init__.py @@ -1 +1 @@ -from openfisca_us.data.datasets import CPS, RawCPS +from openfisca_us.data.datasets import CPS, RawCPS, ACS, RawACS diff --git a/openfisca_us/data/datasets/__init__.py b/openfisca_us/data/datasets/__init__.py index cfef16c0268..030227a241e 100644 --- a/openfisca_us/data/datasets/__init__.py +++ b/openfisca_us/data/datasets/__init__.py @@ -1,3 +1,4 @@ -from openfisca_us.data.datasets.cps import CPS, RawCPS +from .cps import CPS, RawCPS +from .acs import ACS, RawACS -DATASETS = [CPS, RawCPS] +DATASETS = [CPS, RawCPS, ACS, RawACS] diff --git a/openfisca_us/data/datasets/acs/__init__.py b/openfisca_us/data/datasets/acs/__init__.py new file mode 100644 index 00000000000..6ed1f62bcf5 --- /dev/null +++ b/openfisca_us/data/datasets/acs/__init__.py @@ -0,0 +1,2 @@ +from openfisca_us_data.datasets.acs.raw_acs import RawACS +from openfisca_us_data.datasets.acs.acs import ACS diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py new file mode 100644 index 00000000000..169ab3eeea2 --- /dev/null +++ b/openfisca_us/data/datasets/acs/acs.py @@ -0,0 +1,82 @@ +import logging +from openfisca_tools.data import PublicDataset +import h5py +from openfisca_us.data.datasets.acs.raw_acs import RawACS +from openfisca_us.data.datasets.cps.raw_cps import RawCPS +from openfisca_us.data.storage import OPENFISCA_US_MICRODATA_FOLDER +from pandas import DataFrame, Series +import numpy as np + + +class ACS(PublicDataset): + name = "acs" + is_openfisca_compatible = True + + def generate(self, year: int) -> None: + """Generates the ACS dataset. + + Args: + year (int): The year of the raw ACS to use. + """ + + # Prepare raw ACS tables + year = int(year) + if year in self.years: + self.remove(year) + if year not in RawACS.years: + RawACS.generate(year) + + raw_data = RawACS.load(year) + acs = h5py.File(ACS.file(year), mode="w") + + person, spm_unit, household = [ + raw_data[entity] for entity in ("person", "spm_unit", "household") + ] + + add_ID_variables(acs, person, spm_unit, household) + add_SPM_variables(acs, spm_unit) + + raw_data.close() + acs.close() + +ACS = ACS() + + +def add_ID_variables( + acs: h5py.File, + person: DataFrame, + spm_unit: DataFrame, + household: DataFrame, +): + """Add basic ID and weight variables. + + Args: + acs (h5py.File): The ACS dataset file. + person (DataFrame): The person table of the ACS. + spm_unit (DataFrame): The SPM unit table created from the person table + of the ACS. + household (DataFrame): The household table of the ACS. + """ + # Add primary and foreign keys + acs["person_id"] = person.SERIALNO * 1e2 + person.SPORDER + acs["person_spm_unit_id"] = person.SPM_ID + acs["spm_unit_id"] = spm_unit.SPM_ID + # ACS doesn't have tax units. + acs["tax_unit_id"] = spm_unit.SPM_ID + # Until we add a family table, we'll use the person table. + acs["family_id"] = spm_unit.SPM_ID + acs["person_household_id"] = person.SERIALNO + acs["person_tax_unit_id"] = person.SPM_ID + acs["person_family_id"] = person.SPM_ID + acs["household_id"] = household.SERIALNO + + # Add weights + acs["person_weight"] = person.WT + +def add_person_variables(acs: h5py.File, person: DataFrame): + pass + + +def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): + acs["spm_unit_net_income"] = spm_unit.SPM_RESOURCES + acs["poverty_threshold"] = spm_unit.SPM_POVTHRESHOLD diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py new file mode 100644 index 00000000000..ed9457649f1 --- /dev/null +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -0,0 +1,92 @@ +from io import BytesIO +import logging +from zipfile import ZipFile +import pandas as pd +from openfisca_tools.data import PublicDataset +import h5py +import requests +from openfisca_us.data.datasets.cps.raw_cps import RawCPS +from openfisca_us.data.storage import OPENFISCA_US_MICRODATA_FOLDER +from pandas import DataFrame, Series +import numpy as np + + +class RawACS(PublicDataset): + name = "raw_acs" + label = "Raw ACS" + + def generate(self, year: int) -> None: + year = int(year) + if year in self.years: + self.remove(year) + + + spm_url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta" + person_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip" + household_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip" + try: + with pd.HDFStore(RawACS.file(year)) as storage: + # Person file + logging.info(f"Downloading person file") + storage["person"] = concat_zipped_csvs(person_url, "psam_pus") + # Household file + logging.info(f"Downloading household file") + storage["household"] = concat_zipped_csvs(household_url, "psam_hus") + # SPM unit file + logging.info(f"Downloading SPM unit file") + spm_person = pd.read_stata(spm_url).fillna(0) + spm_person.columns = spm_person.columns.str.upper() + storage["spm_unit"] = create_spm_unit_table(spm_person) + except Exception as e: + RawACS.remove(year) + raise ValueError( + f"Attempted to extract and save the CSV files, but encountered an error: {e}" + ) + +RawACS = RawACS() + +def concat_zipped_csvs(url: str, prefix: str) -> pd.DataFrame: + # Creates a DataFrame with the two csvs inside a zip file from a URL. + zf = ZipFile(BytesIO(requests.get(url))) + a = pd.read_csv(zf.open(prefix + "a.csv")) + b = pd.read_csv(zf.open(prefix + "b.csv")) + res = pd.concat([a, b]).fillna(0) + res.columns = res.columns.str.upper() + return res + + +def create_spm_unit_table(person: pd.DataFrame) -> pd.DataFrame: + SPM_UNIT_COLUMNS = [ + "CAPHOUSESUB", + "CAPWKCCXPNS", + "CHILDCAREXPNS", + "EITC", + "ENGVAL", + "EQUIVSCALE", + "FEDTAX", + "FEDTAXBC", + "FICA", + "GEOADJ", + "MEDXPNS", + "NUMADULTS", + "NUMKIDS", + "NUMPER", + "POOR", + "POVTHRESHOLD", + "RESOURCES", + "SCHLUNCH", + "SNAPSUB", + "STTAX", + "TENMORTSTATUS", + "TOTVAL", + "WCOHABIT", + "WICVAL", + "WKXPNS", + "WUI_LT15", + "ID", + ] + return ( + person[["SPM_" + column for column in SPM_UNIT_COLUMNS]] + .groupby(person.SPM_ID) + .first() + ) From 3ae098f8bd168c2c89e4a2b13c838ed1e71060f6 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 11:09:51 +0100 Subject: [PATCH 02/27] Add person, household files to RawACS --- openfisca_us/data/datasets/acs/__init__.py | 4 ++-- openfisca_us/data/datasets/acs/acs.py | 8 ++++--- openfisca_us/data/datasets/acs/raw_acs.py | 25 ++++++++++++++++------ 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/openfisca_us/data/datasets/acs/__init__.py b/openfisca_us/data/datasets/acs/__init__.py index 6ed1f62bcf5..881fc59af8a 100644 --- a/openfisca_us/data/datasets/acs/__init__.py +++ b/openfisca_us/data/datasets/acs/__init__.py @@ -1,2 +1,2 @@ -from openfisca_us_data.datasets.acs.raw_acs import RawACS -from openfisca_us_data.datasets.acs.acs import ACS +from openfisca_us.data.datasets.acs.raw_acs import RawACS +from openfisca_us.data.datasets.acs.acs import ACS diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 169ab3eeea2..57a0b0cd040 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -11,6 +11,8 @@ class ACS(PublicDataset): name = "acs" is_openfisca_compatible = True + label = "ACS" + folder_path = OPENFISCA_US_MICRODATA_FOLDER def generate(self, year: int) -> None: """Generates the ACS dataset. @@ -33,8 +35,8 @@ def generate(self, year: int) -> None: raw_data[entity] for entity in ("person", "spm_unit", "household") ] - add_ID_variables(acs, person, spm_unit, household) - add_SPM_variables(acs, spm_unit) + add_id_variables(acs, person, spm_unit, household) + add_spm_variables(acs, spm_unit) raw_data.close() acs.close() @@ -42,7 +44,7 @@ def generate(self, year: int) -> None: ACS = ACS() -def add_ID_variables( +def add_id_variables( acs: h5py.File, person: DataFrame, spm_unit: DataFrame, diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index ed9457649f1..93ceb6edc7e 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -5,15 +5,20 @@ from openfisca_tools.data import PublicDataset import h5py import requests +from tqdm import tqdm from openfisca_us.data.datasets.cps.raw_cps import RawCPS from openfisca_us.data.storage import OPENFISCA_US_MICRODATA_FOLDER from pandas import DataFrame, Series import numpy as np +logging.getLogger().setLevel(logging.INFO) + class RawACS(PublicDataset): name = "raw_acs" label = "Raw ACS" + is_openfisca_compatible = False + folder_path = OPENFISCA_US_MICRODATA_FOLDER def generate(self, year: int) -> None: year = int(year) @@ -26,12 +31,12 @@ def generate(self, year: int) -> None: household_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip" try: with pd.HDFStore(RawACS.file(year)) as storage: - # Person file - logging.info(f"Downloading person file") - storage["person"] = concat_zipped_csvs(person_url, "psam_pus") # Household file logging.info(f"Downloading household file") storage["household"] = concat_zipped_csvs(household_url, "psam_hus") + # Person file + logging.info(f"Downloading person file") + storage["person"] = concat_zipped_csvs(person_url, "psam_pus") # SPM unit file logging.info(f"Downloading SPM unit file") spm_person = pd.read_stata(spm_url).fillna(0) @@ -47,9 +52,17 @@ def generate(self, year: int) -> None: def concat_zipped_csvs(url: str, prefix: str) -> pd.DataFrame: # Creates a DataFrame with the two csvs inside a zip file from a URL. - zf = ZipFile(BytesIO(requests.get(url))) - a = pd.read_csv(zf.open(prefix + "a.csv")) - b = pd.read_csv(zf.open(prefix + "b.csv")) + req = requests.get(url, stream=True) + with BytesIO() as f: + pbar = tqdm() + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update (len(chunk)) + f.write(chunk) + f.seek(0) + zf = ZipFile(f) + a = pd.read_csv(zf.open(prefix + "a.csv")) + b = pd.read_csv(zf.open(prefix + "b.csv")) res = pd.concat([a, b]).fillna(0) res.columns = res.columns.str.upper() return res From 5c79d694836ea3914ea689bc218aaf030deb98a1 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 11:39:41 +0100 Subject: [PATCH 03/27] Add columns --- openfisca_us/data/datasets/acs/raw_acs.py | 70 ++++++++++++++++++++--- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index 93ceb6edc7e..99f78f214e8 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -1,5 +1,6 @@ from io import BytesIO import logging +from typing import List from zipfile import ZipFile import pandas as pd from openfisca_tools.data import PublicDataset @@ -13,6 +14,48 @@ logging.getLogger().setLevel(logging.INFO) +PERSON_COLUMNS = [ + "SERIALNO", # Household ID + "SPORDER", # Person number within household + "PWGTP", # Person weight + "AGEP", # Age + "CIT", # Citizenship + "MAR", # Marital status + "WAGP", # Wage/salary + "SSP", # Social security income + "SSIP", # Supplemental security income + "SEX", # Sex + "SEMP", # Self-employment income + "SCHL", # Educational attainment + "RETP", # Retirement income + "PAP", # Public assistance income + "OIP", # Other income + "PERNP", # Total earnings + "PINCP", # Total income + "POVPIP", # Income-to-poverty line percentage + "RAC1P", # Race +] + +HOUSEHOLD_COLUMNS = [ + "SERIALNO", # Household ID + "PUMA", # PUMA area code + "ST", # State code + "ADJHSG", # Adjustment factor for housing dollar amounts + "ADJINC", # Adjustment factor for income + "WGTP", # Household weight + "NP", # Number of persons in household + "BDSP", # Number of bedrooms + "ELEP", # Electricity monthly cost + "FULP", # Fuel monthly cost + "GASP", # Gas monthly cost + "RMSP", # Number of rooms + "RNTP", # Monthly rent + "TEN", # Tenure + "VEH", # Number of vehicles + "FINCP", # Total income + "GRNTP", # Gross rent +] + class RawACS(PublicDataset): name = "raw_acs" @@ -25,18 +68,19 @@ def generate(self, year: int) -> None: if year in self.years: self.remove(year) - spm_url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta" person_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip" household_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip" + + # The data dictionary for 2019 can be found here: https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2019.pdf try: with pd.HDFStore(RawACS.file(year)) as storage: # Household file logging.info(f"Downloading household file") - storage["household"] = concat_zipped_csvs(household_url, "psam_hus") + storage["household"] = concat_zipped_csvs(household_url, "psam_hus", HOUSEHOLD_COLUMNS) # Person file logging.info(f"Downloading person file") - storage["person"] = concat_zipped_csvs(person_url, "psam_pus") + storage["person"] = concat_zipped_csvs(person_url, "psam_pus", PERSON_COLUMNS) # SPM unit file logging.info(f"Downloading SPM unit file") spm_person = pd.read_stata(spm_url).fillna(0) @@ -50,8 +94,17 @@ def generate(self, year: int) -> None: RawACS = RawACS() -def concat_zipped_csvs(url: str, prefix: str) -> pd.DataFrame: - # Creates a DataFrame with the two csvs inside a zip file from a URL. +def concat_zipped_csvs(url: str, prefix: str, columns: List[str]) -> pd.DataFrame: + """Downloads the ACS microdata, which is a zip file containing two halves in CSV format. + + Args: + url (str): The URL of the data server. + prefix (str): The prefix of the filenames, before a/b.csv. + columns (List[str]): The columns to filter (avoids hitting memory limits). + + Returns: + pd.DataFrame: The concatenated DataFrame. + """ req = requests.get(url, stream=True) with BytesIO() as f: pbar = tqdm() @@ -61,8 +114,11 @@ def concat_zipped_csvs(url: str, prefix: str) -> pd.DataFrame: f.write(chunk) f.seek(0) zf = ZipFile(f) - a = pd.read_csv(zf.open(prefix + "a.csv")) - b = pd.read_csv(zf.open(prefix + "b.csv")) + logging.info(f"Loading the first half of the dataset") + a = pd.read_csv(zf.open(prefix + "a.csv"))[columns] + logging.info(f"Loading the second half of the dataset") + b = pd.read_csv(zf.open(prefix + "b.csv"))[columns] + logging.info(f"Concatenating datasets") res = pd.concat([a, b]).fillna(0) res.columns = res.columns.str.upper() return res From 8d7292a0093cd7c8448702c1cca62399288474b9 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 11:55:04 +0100 Subject: [PATCH 04/27] Fix pandas mis-use --- openfisca_us/data/datasets/acs/raw_acs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index 99f78f214e8..a26e1f3e96d 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -73,6 +73,7 @@ def generate(self, year: int) -> None: household_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip" # The data dictionary for 2019 can be found here: https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2019.pdf + try: with pd.HDFStore(RawACS.file(year)) as storage: # Household file @@ -115,9 +116,9 @@ def concat_zipped_csvs(url: str, prefix: str, columns: List[str]) -> pd.DataFram f.seek(0) zf = ZipFile(f) logging.info(f"Loading the first half of the dataset") - a = pd.read_csv(zf.open(prefix + "a.csv"))[columns] + a = pd.read_csv(zf.open(prefix + "a.csv"), usecols=columns) logging.info(f"Loading the second half of the dataset") - b = pd.read_csv(zf.open(prefix + "b.csv"))[columns] + b = pd.read_csv(zf.open(prefix + "b.csv"), usecols=columns) logging.info(f"Concatenating datasets") res = pd.concat([a, b]).fillna(0) res.columns = res.columns.str.upper() From cf048e7036f679a5df4628ec87ce9db540ca50b1 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 14:53:23 +0100 Subject: [PATCH 05/27] Add vehicle formula --- openfisca_us/data/datasets/acs/acs.py | 22 +++++++++------ openfisca_us/data/datasets/acs/raw_acs.py | 12 ++++++--- .../demographic/household/vehicles_owned.py | 11 ++++++++ .../variables/demographic/person/is_adult.py | 12 +++++++++ .../demographic/person/vehicles_owned.py | 27 +++++++++++++++++++ 5 files changed, 73 insertions(+), 11 deletions(-) create mode 100644 openfisca_us/variables/demographic/household/vehicles_owned.py create mode 100644 openfisca_us/variables/demographic/person/is_adult.py create mode 100644 openfisca_us/variables/demographic/person/vehicles_owned.py diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 57a0b0cd040..ad4a8edb589 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -34,9 +34,15 @@ def generate(self, year: int) -> None: person, spm_unit, household = [ raw_data[entity] for entity in ("person", "spm_unit", "household") ] + # Add primary and foreign keys + household["household_id"] = household.index + person["household_id"] = household[["household_id", "SERIALNO"]].set_index("SERIALNO").loc[person.SERIALNO.values].values + household = household[household.household_id.isin(person.household_id)] + person = person[person.household_id.isin(household.household_id)] add_id_variables(acs, person, spm_unit, household) add_spm_variables(acs, spm_unit) + add_household_variables(acs, household) raw_data.close() acs.close() @@ -59,26 +65,26 @@ def add_id_variables( of the ACS. household (DataFrame): The household table of the ACS. """ - # Add primary and foreign keys - acs["person_id"] = person.SERIALNO * 1e2 + person.SPORDER + acs["person_id"] = person.household_id * 1e2 + person.SPORDER acs["person_spm_unit_id"] = person.SPM_ID acs["spm_unit_id"] = spm_unit.SPM_ID # ACS doesn't have tax units. acs["tax_unit_id"] = spm_unit.SPM_ID # Until we add a family table, we'll use the person table. acs["family_id"] = spm_unit.SPM_ID - acs["person_household_id"] = person.SERIALNO + acs["person_household_id"] = person.household_id acs["person_tax_unit_id"] = person.SPM_ID acs["person_family_id"] = person.SPM_ID - acs["household_id"] = household.SERIALNO + acs["household_id"] = household.household_id # Add weights - acs["person_weight"] = person.WT - -def add_person_variables(acs: h5py.File, person: DataFrame): - pass + acs["person_weight"] = person.PWGTP + acs["household_weight"] = household.WGTP def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): acs["spm_unit_net_income"] = spm_unit.SPM_RESOURCES acs["poverty_threshold"] = spm_unit.SPM_POVTHRESHOLD + +def add_household_variables(acs: h5py.File, household: DataFrame): + acs["household_vehicles_owned"] = household.VEH diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index a26e1f3e96d..d9c8acb5437 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -86,7 +86,7 @@ def generate(self, year: int) -> None: logging.info(f"Downloading SPM unit file") spm_person = pd.read_stata(spm_url).fillna(0) spm_person.columns = spm_person.columns.str.upper() - storage["spm_unit"] = create_spm_unit_table(spm_person) + create_spm_unit_table(storage, spm_person) except Exception as e: RawACS.remove(year) raise ValueError( @@ -125,7 +125,7 @@ def concat_zipped_csvs(url: str, prefix: str, columns: List[str]) -> pd.DataFram return res -def create_spm_unit_table(person: pd.DataFrame) -> pd.DataFrame: +def create_spm_unit_table(storage: pd.HDFStore, person: pd.DataFrame) -> pd.DataFrame: SPM_UNIT_COLUMNS = [ "CAPHOUSESUB", "CAPWKCCXPNS", @@ -155,8 +155,14 @@ def create_spm_unit_table(person: pd.DataFrame) -> pd.DataFrame: "WUI_LT15", "ID", ] - return ( + spm_table = ( person[["SPM_" + column for column in SPM_UNIT_COLUMNS]] .groupby(person.SPM_ID) .first() ) + + person_table = storage["person"] + person_table["SPM_ID"] = person.SPM_ID + + storage["person"] = person_table + storage["spm_unit"] = spm_table diff --git a/openfisca_us/variables/demographic/household/vehicles_owned.py b/openfisca_us/variables/demographic/household/vehicles_owned.py new file mode 100644 index 00000000000..712d2e0cdc5 --- /dev/null +++ b/openfisca_us/variables/demographic/household/vehicles_owned.py @@ -0,0 +1,11 @@ +from openfisca_us.model_api import * + + +class household_vehicles_owned(Variable): + value_type = float + entity = Household + label = "Vehicles owned" + unit = USD + documentation = "Number of vehicles owned by the household" + definition_period = YEAR + diff --git a/openfisca_us/variables/demographic/person/is_adult.py b/openfisca_us/variables/demographic/person/is_adult.py new file mode 100644 index 00000000000..993e3b22ec0 --- /dev/null +++ b/openfisca_us/variables/demographic/person/is_adult.py @@ -0,0 +1,12 @@ +from openfisca_us.model_api import * + + +class is_adult(Variable): + value_type = bool + entity = Person + label = "Is an adult" + documentation = "Whether this person is over 18" + definition_period = YEAR + + def formula(person, period, parameters): + return person("age", period) >= 18 diff --git a/openfisca_us/variables/demographic/person/vehicles_owned.py b/openfisca_us/variables/demographic/person/vehicles_owned.py new file mode 100644 index 00000000000..d67057a34bd --- /dev/null +++ b/openfisca_us/variables/demographic/person/vehicles_owned.py @@ -0,0 +1,27 @@ +from numpy import maximum +from openfisca_us.model_api import * +from random import randint +from openfisca_core.populations import Population + +class vehicles_owned(Variable): + value_type = float + entity = Person + label = "Vehicles owned" + unit = USD + documentation = "Number of vehicles owned by this person" + definition_period = YEAR + + def formula(person, period, parameters): + household = person.household + household_vehicles = household("household_vehicles_owned", period) + is_adult = person("is_adult", period) + max_vehicles = household_vehicles.max() + adult_rank = where(is_adult, household.members_position, 100) + vehicles = np.zeros_like(is_adult) + for i in range(int(max_vehicles)): + # Pick a random adult in each household + selected_adult = randint(0, adult_rank[is_adult].max()) + maximum_reached = household.sum(vehicles) >= household_vehicles + vehicles += where(maximum_reached.project(), adult_rank == selected_adult, 0) + return vehicles + From ed4bed58365d32941af0262e0053b995817c7941 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 15:28:23 +0100 Subject: [PATCH 06/27] Fix formula --- openfisca_us/data/datasets/acs/acs.py | 16 +++++++++------- .../demographic/person/vehicles_owned.py | 7 ++++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index ad4a8edb589..0f690f35bf5 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -35,10 +35,12 @@ def generate(self, year: int) -> None: raw_data[entity] for entity in ("person", "spm_unit", "household") ] # Add primary and foreign keys - household["household_id"] = household.index - person["household_id"] = household[["household_id", "SERIALNO"]].set_index("SERIALNO").loc[person.SERIALNO.values].values - household = household[household.household_id.isin(person.household_id)] - person = person[person.household_id.isin(household.household_id)] + make_numeric = lambda x: int(x.replace("2019GQ", "0").replace("2019HU", "1")) + household.SERIALNO = household.SERIALNO.apply(make_numeric) + person.SERIALNO = person.SERIALNO.apply(make_numeric) + + person = person[person.SERIALNO.isin(household.SERIALNO)] + household = household[household.SERIALNO.isin(person.SERIALNO)] add_id_variables(acs, person, spm_unit, household) add_spm_variables(acs, spm_unit) @@ -65,17 +67,17 @@ def add_id_variables( of the ACS. household (DataFrame): The household table of the ACS. """ - acs["person_id"] = person.household_id * 1e2 + person.SPORDER + acs["person_id"] = person.SERIALNO * 1e2 + person.SPORDER acs["person_spm_unit_id"] = person.SPM_ID acs["spm_unit_id"] = spm_unit.SPM_ID # ACS doesn't have tax units. acs["tax_unit_id"] = spm_unit.SPM_ID # Until we add a family table, we'll use the person table. acs["family_id"] = spm_unit.SPM_ID - acs["person_household_id"] = person.household_id + acs["person_household_id"] = person.SERIALNO acs["person_tax_unit_id"] = person.SPM_ID acs["person_family_id"] = person.SPM_ID - acs["household_id"] = household.household_id + acs["household_id"] = household.SERIALNO # Add weights acs["person_weight"] = person.PWGTP diff --git a/openfisca_us/variables/demographic/person/vehicles_owned.py b/openfisca_us/variables/demographic/person/vehicles_owned.py index d67057a34bd..66aab782cb5 100644 --- a/openfisca_us/variables/demographic/person/vehicles_owned.py +++ b/openfisca_us/variables/demographic/person/vehicles_owned.py @@ -17,11 +17,12 @@ def formula(person, period, parameters): is_adult = person("is_adult", period) max_vehicles = household_vehicles.max() adult_rank = where(is_adult, household.members_position, 100) - vehicles = np.zeros_like(is_adult) - for i in range(int(max_vehicles)): + vehicles = is_adult * 0 + for _ in range(int(max_vehicles)): # Pick a random adult in each household selected_adult = randint(0, adult_rank[is_adult].max()) maximum_reached = household.sum(vehicles) >= household_vehicles - vehicles += where(maximum_reached.project(), adult_rank == selected_adult, 0) + should_add_vehicle = ~maximum_reached & (adult_rank == selected_adult) + vehicles += where(should_add_vehicle, 1, 0) return vehicles From d04e66497a47c68b9dcdc2b2b9171d700d4529b4 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 15:32:28 +0100 Subject: [PATCH 07/27] Fix formula (working!) --- openfisca_us/variables/demographic/person/vehicles_owned.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openfisca_us/variables/demographic/person/vehicles_owned.py b/openfisca_us/variables/demographic/person/vehicles_owned.py index 66aab782cb5..8fdc69addbb 100644 --- a/openfisca_us/variables/demographic/person/vehicles_owned.py +++ b/openfisca_us/variables/demographic/person/vehicles_owned.py @@ -15,12 +15,13 @@ def formula(person, period, parameters): household = person.household household_vehicles = household("household_vehicles_owned", period) is_adult = person("is_adult", period) + num_adults_in_household = household.sum(is_adult) max_vehicles = household_vehicles.max() adult_rank = where(is_adult, household.members_position, 100) vehicles = is_adult * 0 for _ in range(int(max_vehicles)): # Pick a random adult in each household - selected_adult = randint(0, adult_rank[is_adult].max()) + selected_adult = randint(0, adult_rank[is_adult].max()) % num_adults_in_household maximum_reached = household.sum(vehicles) >= household_vehicles should_add_vehicle = ~maximum_reached & (adult_rank == selected_adult) vehicles += where(should_add_vehicle, 1, 0) From b6bb14e6476ef4f8758bff2291d0e3e95b17fc12 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Sat, 16 Apr 2022 17:13:39 +0100 Subject: [PATCH 08/27] Push recent changes --- openfisca_us/data/datasets/acs/acs.py | 4 +++- openfisca_us/data/datasets/acs/raw_acs.py | 6 +++--- .../variables/demographic/person/vehicles_owned.py | 1 + .../variables/income/spm_unit/spm_unit_fips.py | 11 +++++++++++ openfisca_us/variables/irs/income/sources.py | 2 +- 5 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 openfisca_us/variables/income/spm_unit/spm_unit_fips.py diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 0f690f35bf5..3bfc1640d9d 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -41,6 +41,7 @@ def generate(self, year: int) -> None: person = person[person.SERIALNO.isin(household.SERIALNO)] household = household[household.SERIALNO.isin(person.SERIALNO)] + spm_unit = spm_unit[spm_unit.SPM_ID.isin(person.SPM_ID)] add_id_variables(acs, person, spm_unit, household) add_spm_variables(acs, spm_unit) @@ -86,7 +87,8 @@ def add_id_variables( def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): acs["spm_unit_net_income"] = spm_unit.SPM_RESOURCES - acs["poverty_threshold"] = spm_unit.SPM_POVTHRESHOLD + acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD def add_household_variables(acs: h5py.File, household: DataFrame): acs["household_vehicles_owned"] = household.VEH + acs["fips"] = household.ST diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index d9c8acb5437..8e0219599fb 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -161,8 +161,8 @@ def create_spm_unit_table(storage: pd.HDFStore, person: pd.DataFrame) -> pd.Data .first() ) - person_table = storage["person"] - person_table["SPM_ID"] = person.SPM_ID + original_person_table = storage["person"] + combined_person_table = pd.merge(original_person_table, person, on=["SERIALNO", "SPORDER"]) - storage["person"] = person_table + storage["person"] = combined_person_table storage["spm_unit"] = spm_table diff --git a/openfisca_us/variables/demographic/person/vehicles_owned.py b/openfisca_us/variables/demographic/person/vehicles_owned.py index 8fdc69addbb..c7d0a826e4c 100644 --- a/openfisca_us/variables/demographic/person/vehicles_owned.py +++ b/openfisca_us/variables/demographic/person/vehicles_owned.py @@ -12,6 +12,7 @@ class vehicles_owned(Variable): definition_period = YEAR def formula(person, period, parameters): + # We randomly split the household's vehicles between its adults household = person.household household_vehicles = household("household_vehicles_owned", period) is_adult = person("is_adult", period) diff --git a/openfisca_us/variables/income/spm_unit/spm_unit_fips.py b/openfisca_us/variables/income/spm_unit/spm_unit_fips.py new file mode 100644 index 00000000000..26e9a99b35a --- /dev/null +++ b/openfisca_us/variables/income/spm_unit/spm_unit_fips.py @@ -0,0 +1,11 @@ +from openfisca_us.model_api import * + + +class spm_unit_fips(Variable): + value_type = float + entity = SPMUnit + label = "SPM unit FIPS code" + definition_period = YEAR + + def formula(spm_unit, period, parameters): + return spm_unit.household("fips", period) \ No newline at end of file diff --git a/openfisca_us/variables/irs/income/sources.py b/openfisca_us/variables/irs/income/sources.py index 3c2f093194e..55f14b4fcde 100644 --- a/openfisca_us/variables/irs/income/sources.py +++ b/openfisca_us/variables/irs/income/sources.py @@ -1009,7 +1009,7 @@ class ffpos(Variable): class fips(Variable): value_type = int - entity = TaxUnit + entity = Household definition_period = YEAR documentation = "FIPS state code (not used in tax-calculation logic)" From a2f826347143fe1ae8350b79c1dc573e62558313 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 16 Apr 2022 13:06:26 -0700 Subject: [PATCH 09/27] Remove unused imports --- openfisca_us/data/datasets/acs/acs.py | 13 +-- openfisca_us/data/datasets/acs/raw_acs.py | 105 ++++++++++++---------- 2 files changed, 64 insertions(+), 54 deletions(-) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 3bfc1640d9d..15c976313d2 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -1,11 +1,8 @@ -import logging from openfisca_tools.data import PublicDataset import h5py from openfisca_us.data.datasets.acs.raw_acs import RawACS -from openfisca_us.data.datasets.cps.raw_cps import RawCPS from openfisca_us.data.storage import OPENFISCA_US_MICRODATA_FOLDER -from pandas import DataFrame, Series -import numpy as np +from pandas import DataFrame class ACS(PublicDataset): @@ -35,10 +32,12 @@ def generate(self, year: int) -> None: raw_data[entity] for entity in ("person", "spm_unit", "household") ] # Add primary and foreign keys - make_numeric = lambda x: int(x.replace("2019GQ", "0").replace("2019HU", "1")) + make_numeric = lambda x: int( + x.replace("2019GQ", "0").replace("2019HU", "1") + ) household.SERIALNO = household.SERIALNO.apply(make_numeric) person.SERIALNO = person.SERIALNO.apply(make_numeric) - + person = person[person.SERIALNO.isin(household.SERIALNO)] household = household[household.SERIALNO.isin(person.SERIALNO)] spm_unit = spm_unit[spm_unit.SPM_ID.isin(person.SPM_ID)] @@ -50,6 +49,7 @@ def generate(self, year: int) -> None: raw_data.close() acs.close() + ACS = ACS() @@ -89,6 +89,7 @@ def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): acs["spm_unit_net_income"] = spm_unit.SPM_RESOURCES acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD + def add_household_variables(acs: h5py.File, household: DataFrame): acs["household_vehicles_owned"] = household.VEH acs["fips"] = household.ST diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index 8e0219599fb..87ce467b987 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -4,56 +4,53 @@ from zipfile import ZipFile import pandas as pd from openfisca_tools.data import PublicDataset -import h5py import requests from tqdm import tqdm -from openfisca_us.data.datasets.cps.raw_cps import RawCPS from openfisca_us.data.storage import OPENFISCA_US_MICRODATA_FOLDER -from pandas import DataFrame, Series -import numpy as np + logging.getLogger().setLevel(logging.INFO) PERSON_COLUMNS = [ - "SERIALNO", # Household ID - "SPORDER", # Person number within household - "PWGTP", # Person weight - "AGEP", # Age - "CIT", # Citizenship - "MAR", # Marital status - "WAGP", # Wage/salary - "SSP", # Social security income - "SSIP", # Supplemental security income - "SEX", # Sex - "SEMP", # Self-employment income - "SCHL", # Educational attainment - "RETP", # Retirement income - "PAP", # Public assistance income - "OIP", # Other income - "PERNP", # Total earnings - "PINCP", # Total income - "POVPIP", # Income-to-poverty line percentage - "RAC1P", # Race + "SERIALNO", # Household ID + "SPORDER", # Person number within household + "PWGTP", # Person weight + "AGEP", # Age + "CIT", # Citizenship + "MAR", # Marital status + "WAGP", # Wage/salary + "SSP", # Social security income + "SSIP", # Supplemental security income + "SEX", # Sex + "SEMP", # Self-employment income + "SCHL", # Educational attainment + "RETP", # Retirement income + "PAP", # Public assistance income + "OIP", # Other income + "PERNP", # Total earnings + "PINCP", # Total income + "POVPIP", # Income-to-poverty line percentage + "RAC1P", # Race ] HOUSEHOLD_COLUMNS = [ - "SERIALNO", # Household ID - "PUMA", # PUMA area code - "ST", # State code - "ADJHSG", # Adjustment factor for housing dollar amounts - "ADJINC", # Adjustment factor for income - "WGTP", # Household weight - "NP", # Number of persons in household - "BDSP", # Number of bedrooms - "ELEP", # Electricity monthly cost - "FULP", # Fuel monthly cost - "GASP", # Gas monthly cost - "RMSP", # Number of rooms - "RNTP", # Monthly rent - "TEN", # Tenure - "VEH", # Number of vehicles - "FINCP", # Total income - "GRNTP", # Gross rent + "SERIALNO", # Household ID + "PUMA", # PUMA area code + "ST", # State code + "ADJHSG", # Adjustment factor for housing dollar amounts + "ADJINC", # Adjustment factor for income + "WGTP", # Household weight + "NP", # Number of persons in household + "BDSP", # Number of bedrooms + "ELEP", # Electricity monthly cost + "FULP", # Fuel monthly cost + "GASP", # Gas monthly cost + "RMSP", # Number of rooms + "RNTP", # Monthly rent + "TEN", # Tenure + "VEH", # Number of vehicles + "FINCP", # Total income + "GRNTP", # Gross rent ] @@ -78,10 +75,14 @@ def generate(self, year: int) -> None: with pd.HDFStore(RawACS.file(year)) as storage: # Household file logging.info(f"Downloading household file") - storage["household"] = concat_zipped_csvs(household_url, "psam_hus", HOUSEHOLD_COLUMNS) + storage["household"] = concat_zipped_csvs( + household_url, "psam_hus", HOUSEHOLD_COLUMNS + ) # Person file logging.info(f"Downloading person file") - storage["person"] = concat_zipped_csvs(person_url, "psam_pus", PERSON_COLUMNS) + storage["person"] = concat_zipped_csvs( + person_url, "psam_pus", PERSON_COLUMNS + ) # SPM unit file logging.info(f"Downloading SPM unit file") spm_person = pd.read_stata(spm_url).fillna(0) @@ -93,9 +94,13 @@ def generate(self, year: int) -> None: f"Attempted to extract and save the CSV files, but encountered an error: {e}" ) + RawACS = RawACS() -def concat_zipped_csvs(url: str, prefix: str, columns: List[str]) -> pd.DataFrame: + +def concat_zipped_csvs( + url: str, prefix: str, columns: List[str] +) -> pd.DataFrame: """Downloads the ACS microdata, which is a zip file containing two halves in CSV format. Args: @@ -109,9 +114,9 @@ def concat_zipped_csvs(url: str, prefix: str, columns: List[str]) -> pd.DataFram req = requests.get(url, stream=True) with BytesIO() as f: pbar = tqdm() - for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - pbar.update (len(chunk)) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + pbar.update(len(chunk)) f.write(chunk) f.seek(0) zf = ZipFile(f) @@ -125,7 +130,9 @@ def concat_zipped_csvs(url: str, prefix: str, columns: List[str]) -> pd.DataFram return res -def create_spm_unit_table(storage: pd.HDFStore, person: pd.DataFrame) -> pd.DataFrame: +def create_spm_unit_table( + storage: pd.HDFStore, person: pd.DataFrame +) -> pd.DataFrame: SPM_UNIT_COLUMNS = [ "CAPHOUSESUB", "CAPWKCCXPNS", @@ -162,7 +169,9 @@ def create_spm_unit_table(storage: pd.HDFStore, person: pd.DataFrame) -> pd.Data ) original_person_table = storage["person"] - combined_person_table = pd.merge(original_person_table, person, on=["SERIALNO", "SPORDER"]) + combined_person_table = pd.merge( + original_person_table, person, on=["SERIALNO", "SPORDER"] + ) storage["person"] = combined_person_table storage["spm_unit"] = spm_table From b1199b29146d4b3ea799dc9848605bb59fb19c63 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 16 Apr 2022 13:11:15 -0700 Subject: [PATCH 10/27] Match key types --- openfisca_us/data/datasets/acs/raw_acs.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index 87ce467b987..5c145037ed9 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -169,8 +169,17 @@ def create_spm_unit_table( ) original_person_table = storage["person"] + # Ensure that join keys are the same type. + JOIN_COLUMNS = ["SERIALNO", "SPORDER"] + original_person_table[JOIN_COLUMNS] = original_person_table[ + JOIN_COLUMNS + ].astype(int) + person[JOIN_COLUMNS] = person[JOIN_COLUMNS].astype(int) + # Add SPM_ID from the SPM person table to the original person table. combined_person_table = pd.merge( - original_person_table, person, on=["SERIALNO", "SPORDER"] + original_person_table, + person[JOIN_COLUMNS + ["SPM_ID"]], + on=JOIN_COLUMNS, ) storage["person"] = combined_person_table From 1769ebe877ed0b221a23e2deb8442ce37f288b11 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 16 Apr 2022 13:12:41 -0700 Subject: [PATCH 11/27] Bump changelog --- changelog.yaml | 320 +++++++++++++++++++++++++------------------------ 1 file changed, 162 insertions(+), 158 deletions(-) diff --git a/changelog.yaml b/changelog.yaml index cf10abacb76..2159c7fc3b4 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -1,524 +1,528 @@ - changes: added: - - First prototype version with a standard deduction variable. + - First prototype version with a standard deduction variable. date: 2021-06-28 00:00:00 version: 0.0.1 - bump: minor changes: added: - - Prototype with some tax implementations. + - Prototype with some tax implementations. date: 2021-12-25 00:00:00 - bump: minor changes: added: - - Tax variables, some benefit variables. + - Tax variables, some benefit variables. date: 2021-12-25 00:00:01 - bump: minor changes: added: - - Lifeline benefit. + - Lifeline benefit. date: 2021-12-25 00:00:02 - bump: patch changes: added: - - Automated tests. + - Automated tests. date: 2021-12-25 00:00:03 - bump: minor changes: added: - - TANF eligibility, broken down into demographic and financial variables, with - financial separated by current enrollment in program. - - Demographic TANF eligibility per IL rules. + - TANF eligibility, broken down into demographic and financial variables, with + financial separated by current enrollment in program. + - Demographic TANF eligibility per IL rules. date: 2021-12-26 00:00:00 - bump: minor changes: added: - - Medicaid income thresholds for California. + - Medicaid income thresholds for California. date: 2021-12-27 00:00:00 - bump: minor changes: added: - - Alternative Minimum Tax (AMT) income and liability logic. - - Development tools for auto-generating unit tests for Tax-Calculator functions. + - Alternative Minimum Tax (AMT) income and liability logic. + - Development tools for auto-generating unit tests for Tax-Calculator functions. date: 2021-12-28 00:00:00 - bump: minor changes: added: - - Gains Tax (capital gains treatment) logic and parameters. + - Gains Tax (capital gains treatment) logic and parameters. date: 2021-12-28 00:00:01 - bump: minor changes: added: - - Minimum benefit logic for SNAP. + - Minimum benefit logic for SNAP. date: 2021-12-28 00:00:02 - bump: minor changes: added: - - Social Security taxation logic. + - Social Security taxation logic. date: 2021-12-28 00:00:03 - bump: minor changes: added: - - Income-to-SMI (state median income) ratio. + - Income-to-SMI (state median income) ratio. date: 2021-12-28 00:00:04 - bump: minor changes: added: - - American Opportunity (tax) Credit. - - Lifetime Learning (tax) Credit. + - American Opportunity (tax) Credit. + - Lifetime Learning (tax) Credit. date: 2021-12-30 00:00:00 - bump: minor changes: added: - - Elderly and Disabled (tax) Credit. + - Elderly and Disabled (tax) Credit. date: 2021-12-30 00:00:01 - bump: minor changes: added: - - Formula for Medicaid person type, based on age and dependents. - - Variable for whether a person meets their Medicaid income eligibility requirement. + - Formula for Medicaid person type, based on age and dependents. + - Variable for whether a person meets their Medicaid income eligibility requirement. date: 2021-12-31 00:00:00 - bump: minor changes: added: - - SNAP eligibility based on federal net and gross income limits. - - Unit and integration tests for SNAP variables. + - SNAP eligibility based on federal net and gross income limits. + - Unit and integration tests for SNAP variables. date: 2022-01-03 00:00:00 - bump: minor changes: added: - - Federal SNAP asset tests logic + - Federal SNAP asset tests logic date: 2022-01-03 00:00:01 - bump: minor changes: added: - - CCDF subsidy top-level logic + - CCDF subsidy top-level logic date: 2022-01-03 00:00:02 - bump: minor changes: added: - - Categorical eligibility for SNAP, including broad-based categorical eligibility - via low-cost TANF programs that effectively extend SNAP's asset and income limits. + - Categorical eligibility for SNAP, including broad-based categorical eligibility + via low-cost TANF programs that effectively extend SNAP's asset and income limits. changed: - - Refactored SNAP code. + - Refactored SNAP code. date: 2022-01-04 00:00:00 - bump: patch changes: changed: - - Use USDA elderly and disabled definitions in SNAP calculations. + - Use USDA elderly and disabled definitions in SNAP calculations. date: 2022-01-06 00:00:00 - bump: minor changes: added: - - Total child care market rate. + - Total child care market rate. date: 2022-01-06 00:00:01 - bump: minor changes: added: - - Update child care market rate to annual. + - Update child care market rate to annual. date: 2022-01-06 00:00:02 - bump: patch changes: added: - - Formulas for `childcare_hours_per_week` and `spm_unit_size`. - - Unit tests and units for some variables. + - Formulas for `childcare_hours_per_week` and `spm_unit_size`. + - Unit tests and units for some variables. changed: - - Reorganized variables. + - Reorganized variables. date: 2022-01-07 00:00:00 - bump: patch changes: changed: - - Removes the `u` prefix from all variable label strings. + - Removes the `u` prefix from all variable label strings. date: 2022-01-08 00:00:00 - bump: patch changes: added: - - Units to all tax variables. + - Units to all tax variables. changed: - - Adds one line between tests in yaml files. - - Use consistent imports in variable Python files. + - Adds one line between tests in yaml files. + - Use consistent imports in variable Python files. removed: - - C-TAM benefit variables in tax Python files. - - Erroneous formula for `eic` variable. + - C-TAM benefit variables in tax Python files. + - Erroneous formula for `eic` variable. date: 2022-01-08 00:00:01 - bump: minor changes: added: - - Formula for initial TANF eligibility. - - 'Two new variables: `tanf_gross_earned_income` and `tanf_gross_unearned_income`.' - - Variable & parameter for `initial_employment_deduction`. - - Integration tests for TANF cash aid from TANF IL website. + - Formula for initial TANF eligibility. + - "Two new variables: `tanf_gross_earned_income` and `tanf_gross_unearned_income`." + - Variable & parameter for `initial_employment_deduction`. + - Integration tests for TANF cash aid from TANF IL website. changed: - - '`tanf_countable_income` now includes unearned income and earned income deduction.' + - "`tanf_countable_income` now includes unearned income and earned income deduction." date: 2022-01-09 00:00:00 - bump: patch changes: fixed: - - Test runner failed to test string values. + - Test runner failed to test string values. date: 2022-01-12 00:00:00 - bump: patch changes: added: - - Metadata for SNAP eligibility parameters. + - Metadata for SNAP eligibility parameters. fixed: - - Parameter misname in SNAP formula. + - Parameter misname in SNAP formula. date: 2022-01-14 00:00:00 - bump: minor changes: added: - - Add CCDF copay formula. + - Add CCDF copay formula. date: 2022-01-14 00:00:01 - bump: minor changes: added: - - Formula for SSI based on eligibility and amount if eligible. + - Formula for SSI based on eligibility and amount if eligible. date: 2022-01-14 00:00:02 - bump: minor changes: fixed: - - Update CCDF subsidy formula. + - Update CCDF subsidy formula. date: 2022-01-15 00:00:00 - bump: patch changes: fixed: - - Added links to version tag diffs in changelog. + - Added links to version tag diffs in changelog. date: 2022-01-15 00:00:01 - bump: minor changes: added: - - Logic for SNAP excess medical deduction and dependent care deduction. - - Limit SNAP earned income deduction to earned income. - - Jupyter Book documentation on SNAP. - - Updated SNAP parameters. - - 'Empty variables for calculating SNAP: `employment_income`, `self_employment_income`, - `dividend_income`, `interest_income`, `childcare_expenses`, and `medical_out_of_pocket_expenses`.' + - Logic for SNAP excess medical deduction and dependent care deduction. + - Limit SNAP earned income deduction to earned income. + - Jupyter Book documentation on SNAP. + - Updated SNAP parameters. + - "Empty variables for calculating SNAP: `employment_income`, `self_employment_income`, + `dividend_income`, `interest_income`, `childcare_expenses`, and `medical_out_of_pocket_expenses`." changed: - - Significant refactoring of SNAP code. - - Use openfisca-tools for `add` and `aggr` functions, and pass lists of variables - to these function. - - Rename min/max SNAP benefit parameters and variables to use `allotment`. + - Significant refactoring of SNAP code. + - Use openfisca-tools for `add` and `aggr` functions, and pass lists of variables + to these function. + - Rename min/max SNAP benefit parameters and variables to use `allotment`. date: 2022-01-17 00:00:00 - bump: patch changes: changed: - - Add metadata for variables and parameters used in SNAP calculations. - - Renames two parameters involved in SNAP deductions from `threshold` to `disregard`. + - Add metadata for variables and parameters used in SNAP calculations. + - Renames two parameters involved in SNAP deductions from `threshold` to `disregard`. date: 2022-01-17 00:00:01 - bump: minor changes: added: - - Child Tax Credit (including adult dependents) parameters, logic and tests. + - Child Tax Credit (including adult dependents) parameters, logic and tests. date: 2022-01-17 00:00:02 - bump: minor changes: added: - - Categorical eligibility to school meal subsidies. - - Documentation notebook on school meal subsidies. - - Parameterized income sources for school meal subsidies. + - Categorical eligibility to school meal subsidies. + - Documentation notebook on school meal subsidies. + - Parameterized income sources for school meal subsidies. changed: - - Count school meal subsidies by school enrollment rather than age. - - Remove `spm_unit_` prefix from school meal variables. + - Count school meal subsidies by school enrollment rather than age. + - Remove `spm_unit_` prefix from school meal variables. date: 2022-01-25 00:00:00 - bump: minor changes: added: - - Child Tax Credit (and historical policy). - - Non-refundable and refundable credit handling in tax logic. - - Metadata for education credits and the EITC. + - Child Tax Credit (and historical policy). + - Non-refundable and refundable credit handling in tax logic. + - Metadata for education credits and the EITC. fixed: - - Bugs in head/spouse detection and nonrefundable credits. + - Bugs in head/spouse detection and nonrefundable credits. date: 2022-01-28 00:00:00 - bump: patch changes: added: - - Metadata and variable aliases for key tax variables. - - Employment, self-employment, interest and dividend income as inputs to tax logic. + - Metadata and variable aliases for key tax variables. + - Employment, self-employment, interest and dividend income as inputs to tax logic. date: 2022-02-02 00:00:00 - bump: patch changes: added: - - Added formula for TANF variable `continuous_tanf_eligibility` - - Added integration test for continuous TANF eligibility to `integration.yaml` + - Added formula for TANF variable `continuous_tanf_eligibility` + - Added integration test for continuous TANF eligibility to `integration.yaml` date: 2022-02-06 00:00:00 - bump: minor changes: added: - - SNAP emergency allotments for California. - - SNAP unearned income example in JupyterBook docs. + - SNAP emergency allotments for California. + - SNAP unearned income example in JupyterBook docs. date: 2022-02-06 00:00:01 - bump: minor changes: added: - - California Clean Vehicle Rebate Project. + - California Clean Vehicle Rebate Project. date: 2022-02-07 00:00:00 - bump: minor changes: added: - - Guaranteed income / cash assistance pilot income variable. This counts as unearned - income for SNAP, uncounted for taxes and other benefits. + - Guaranteed income / cash assistance pilot income variable. This counts as unearned + income for SNAP, uncounted for taxes and other benefits. date: 2022-02-07 00:00:01 - bump: patch changes: fixed: - - EITC logic and parameters for non-3-child tax units. + - EITC logic and parameters for non-3-child tax units. date: 2022-02-08 00:00:00 - bump: patch changes: added: - - PolicyEngine metadata and notebook for Lifeline program. - - Formula for `irs_gross_income`, which Lifeline uses to calculate income-based - eligibility. + - PolicyEngine metadata and notebook for Lifeline program. + - Formula for `irs_gross_income`, which Lifeline uses to calculate income-based + eligibility. date: 2022-02-08 00:00:01 - bump: patch changes: fixed: - - Add Lifeline notebook to table of contents. + - Add Lifeline notebook to table of contents. date: 2022-02-08 00:00:02 - bump: minor changes: added: - - Income limits for 5 Maryland Medicaid coverage groups. + - Income limits for 5 Maryland Medicaid coverage groups. date: 2022-02-09 00:00:00 - bump: minor changes: added: - - WIC program. + - WIC program. fixed: - - Include guaranteed income / cash assistance in market income. + - Include guaranteed income / cash assistance in market income. date: 2022-02-09 00:00:01 - bump: patch changes: fixed: - - Change WIC display name from `WIC benefit value` to `WIC`. + - Change WIC display name from `WIC benefit value` to `WIC`. date: 2022-02-09 00:00:02 - bump: patch changes: fixed: - - Specify WIC's unit as USD. + - Specify WIC's unit as USD. date: 2022-02-09 00:00:03 - bump: patch changes: fixed: - - Remove guaranteed income / cash assistance from benefits. + - Remove guaranteed income / cash assistance from benefits. date: 2022-02-09 00:00:04 - bump: patch changes: added: - - Categorical breakdown metadata infrastructure from OpenFisca-Tools. + - Categorical breakdown metadata infrastructure from OpenFisca-Tools. date: 2022-02-10 00:00:00 - bump: patch changes: added: - - Chained CPI-U (monthly and August-only) parameters. - - Metadata for SNAP max allotment. + - Chained CPI-U (monthly and August-only) parameters. + - Metadata for SNAP max allotment. date: 2022-02-13 00:00:00 - bump: patch changes: changed: - - OpenFisca-Tools constraint widened to the current major version. + - OpenFisca-Tools constraint widened to the current major version. date: 2022-02-16 00:00:00 - bump: minor changes: added: - - Uprated tax parameters for federal income tax. + - Uprated tax parameters for federal income tax. date: 2022-02-21 00:00:00 - bump: minor changes: added: - - Affordable Connectivity Program. + - Affordable Connectivity Program. changed: - - Split school meal subsidies into free and reduced-price. + - Split school meal subsidies into free and reduced-price. date: 2022-02-21 00:00:01 - bump: minor changes: added: - - Rural Tribal supplement for Lifeline. + - Rural Tribal supplement for Lifeline. changed: - - Restructure ACP and EBB Tribal amounts to work with PolicyEngine. + - Restructure ACP and EBB Tribal amounts to work with PolicyEngine. date: 2022-02-21 00:00:02 - bump: patch changes: changed: - - Edited labels for ACP and SNAP normal allotment. + - Edited labels for ACP and SNAP normal allotment. date: 2022-02-21 00:00:03 - bump: patch changes: fixed: - - Subtract Lifeline from broadband cost before calculating ACP and EBB. + - Subtract Lifeline from broadband cost before calculating ACP and EBB. date: 2022-02-27 00:00:00 - bump: patch changes: added: - - Code coverage badge to README.md. - - Reminder for pull requests to run `make format && make documentation`. - - CPI-uprated values for WIC average payments. + - Code coverage badge to README.md. + - Reminder for pull requests to run `make format && make documentation`. + - CPI-uprated values for WIC average payments. changed: - - Child Tax Credit names renamed to `ctc`. - - Child and Dependent Care Credit names renamed to `cdcc`. + - Child Tax Credit names renamed to `ctc`. + - Child and Dependent Care Credit names renamed to `cdcc`. fixed: - - EITC maximum age in 2021 changed from 125 to infinity. + - EITC maximum age in 2021 changed from 125 to infinity. date: 2022-02-28 00:00:00 - bump: minor changes: added: - - Supplemental Security Income for individuals. - - Social Security input variables, counted as unearned income for several programs. + - Supplemental Security Income for individuals. + - Social Security input variables, counted as unearned income for several programs. date: 2022-03-04 00:00:00 - bump: patch changes: changed: - - Adjust variable labels for consistency. + - Adjust variable labels for consistency. date: 2022-03-04 00:00:01 - bump: minor changes: added: - - SNAP aggregate benefits and participation. + - SNAP aggregate benefits and participation. date: 2022-03-05 00:00:00 - bump: patch changes: changed: - - Point `e02400` to `social_security` (for PolicyEngine). + - Point `e02400` to `social_security` (for PolicyEngine). date: 2022-03-07 00:00:00 - bump: patch changes: added: - - '`spm_unit_weight` variable.' + - "`spm_unit_weight` variable." fixed: - - SNAP now uses the additional amounts where main rates are not available. + - SNAP now uses the additional amounts where main rates are not available. date: 2022-03-07 00:00:01 - bump: patch changes: changed: - - '`is_married` moved from person-level to family-level, with a formula added.' + - "`is_married` moved from person-level to family-level, with a formula added." date: 2022-03-08 00:00:00 - bump: patch changes: changed: - - IRS-published uprated income tax parameters for 2019-22. + - IRS-published uprated income tax parameters for 2019-22. date: 2022-03-09 00:00:00 - bump: patch changes: added: - - February 2022 chained CPI-U. + - February 2022 chained CPI-U. changed: - - Simplified WIC uprating. + - Simplified WIC uprating. date: 2022-03-11 00:00:00 - bump: patch changes: fixed: - - EITC uses the correct phase-in rate. + - EITC uses the correct phase-in rate. date: 2022-03-13 00:00:00 - bump: patch changes: changed: - - Tax folder re-organised to improve modularity. + - Tax folder re-organised to improve modularity. fixed: - - A bug in AMT calculations. + - A bug in AMT calculations. date: 2022-03-16 21:22:44 - bump: patch changes: fixed: - - Push action on GitHub correctly publishes. + - Push action on GitHub correctly publishes. date: 2022-03-16 20:29:58 - bump: patch changes: fixed: - - Push action on GitHub correctly publishes. + - Push action on GitHub correctly publishes. date: 2022-03-16 21:22:44 - bump: minor changes: changed: - - Added multiple parameters for California's TANF system. - - Refactored the TANF structure for easier implementation of other state TANF - programs. + - Added multiple parameters for California's TANF system. + - Refactored the TANF structure for easier implementation of other state TANF + programs. date: 2022-03-27 18:49:02 - bump: patch changes: added: - - Page on TANF to documentation. + - Page on TANF to documentation. date: 2022-03-28 10:40:42 - bump: patch changes: fixed: - - Versioning action didn't update `setup.py`. + - Versioning action didn't update `setup.py`. date: 2022-03-28 10:55:27 - bump: minor changes: changed: - - Added `is_eitc_qualifying_child` variable to improve EITC child logic. - - Split `is_in_school` into `is_in_k12_school` and `is_full_time_student`. + - Added `is_eitc_qualifying_child` variable to improve EITC child logic. + - Split `is_in_school` into `is_in_k12_school` and `is_full_time_student`. date: 2022-03-28 11:34:53 - bump: minor changes: added: - - Net income limits for SNAP BBCE (TANF) program. - - Legislative references for SNAP income limits. + - Net income limits for SNAP BBCE (TANF) program. + - Legislative references for SNAP income limits. removed: - - 165% SNAP gross income limit for separate elderly and disabled households (unused). + - 165% SNAP gross income limit for separate elderly and disabled households (unused). date: 2022-03-30 01:17:38 - bump: minor changes: added: - - CDCC parameters for eligibility and metadata. + - CDCC parameters for eligibility and metadata. fixed: - - A bug where the CDCC would phase down too quickly. + - A bug where the CDCC would phase down too quickly. date: 2022-03-30 11:46:11 - bump: patch changes: added: - - Parameter metadata for tax credits and payroll taxes. + - Parameter metadata for tax credits and payroll taxes. date: 2022-03-30 13:12:44 - bump: patch changes: added: - - Added full-time college student variable. + - Added full-time college student variable. date: 2022-03-30 18:53:00 - bump: minor changes: added: - - HUD adjusted income and dependent variables and logic. + - HUD adjusted income and dependent variables and logic. date: 2022-04-05 19:04:10 - bump: patch changes: fixed: - - Point TANF parameter to state instead of region. + - Point TANF parameter to state instead of region. date: 2022-04-06 10:35:14 - bump: minor changes: added: - - More recent Social Security payroll tax cap parameter values. - - Separate parameters for employer payroll taxes and self-employment taxes. - - Parameter for self-employment net earnings disregard. - - Unit tests and legislative references for payroll and self-employment tax variables. + - More recent Social Security payroll tax cap parameter values. + - Separate parameters for employer payroll taxes and self-employment taxes. + - Parameter for self-employment net earnings disregard. + - Unit tests and legislative references for payroll and self-employment tax variables. changed: - - Reorganized payroll and self-employment tax parameters and variables. - - Replaced large parameters with infinity and made number formatting consistent. + - Reorganized payroll and self-employment tax parameters and variables. + - Replaced large parameters with infinity and made number formatting consistent. removed: - - Reform-only `social_security.add_taxable_earnings` parameter. - - Unused `exact` variable. - - Variable for `social_security_taxes` (moved logic to `refundable_child_tax_credit`). + - Reform-only `social_security.add_taxable_earnings` parameter. + - Unused `exact` variable. + - Variable for `social_security_taxes` (moved logic to `refundable_child_tax_credit`). date: 2022-04-07 06:08:18 - bump: patch changes: fixed: - - Refundable CTC formula works properly when phase-in rate increased (comments - added). + - Refundable CTC formula works properly when phase-in rate increased (comments + added). date: 2022-04-12 18:38:49 - bump: minor changes: added: - - Capped non-refundable credits variable. - - Shortened labels for tax variables. + - Capped non-refundable credits variable. + - Shortened labels for tax variables. date: 2022-04-13 12:58:29 - bump: minor changes: added: - - Microdata now handled entirely within OpenFisca-US. + - Microdata now handled entirely within OpenFisca-US. date: 2022-04-14 08:19:40 - bump: patch changes: added: - - Legislative references for CDCC parameters. + - Legislative references for CDCC parameters. fixed: - - CDCC uses maximum dependent parameter. + - CDCC uses maximum dependent parameter. date: 2022-04-15 14:23:11 +- bump: minor + changes: + added: + - American Community Survey input. From 9f276dacb55ad4e010528cff1b87f53f4abebfb3 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 16 Apr 2022 13:28:25 -0700 Subject: [PATCH 12/27] make format and print types --- openfisca_us/data/datasets/acs/acs.py | 7 ++++--- openfisca_us/data/datasets/acs/raw_acs.py | 6 ++++-- .../variables/demographic/household/vehicles_owned.py | 1 - .../variables/demographic/person/vehicles_owned.py | 11 ++++++++--- .../variables/income/spm_unit/spm_unit_fips.py | 2 +- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 15c976313d2..76e19dc2426 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -32,9 +32,10 @@ def generate(self, year: int) -> None: raw_data[entity] for entity in ("person", "spm_unit", "household") ] # Add primary and foreign keys - make_numeric = lambda x: int( - x.replace("2019GQ", "0").replace("2019HU", "1") - ) + + def make_numeric(x): + return int(x.replace("2019GQ", "0").replace("2019HU", "1")) + household.SERIALNO = household.SERIALNO.apply(make_numeric) person.SERIALNO = person.SERIALNO.apply(make_numeric) diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index 5c145037ed9..69180f88850 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -171,10 +171,12 @@ def create_spm_unit_table( original_person_table = storage["person"] # Ensure that join keys are the same type. JOIN_COLUMNS = ["SERIALNO", "SPORDER"] + print(original_person_table[JOIN_COLUMNS].dtypes) + print(person[JOIN_COLUMNS].dtypes) original_person_table[JOIN_COLUMNS] = original_person_table[ JOIN_COLUMNS - ].astype(int) - person[JOIN_COLUMNS] = person[JOIN_COLUMNS].astype(int) + ].astype(str) + person[JOIN_COLUMNS] = person[JOIN_COLUMNS].astype(str) # Add SPM_ID from the SPM person table to the original person table. combined_person_table = pd.merge( original_person_table, diff --git a/openfisca_us/variables/demographic/household/vehicles_owned.py b/openfisca_us/variables/demographic/household/vehicles_owned.py index 712d2e0cdc5..1dd8125571e 100644 --- a/openfisca_us/variables/demographic/household/vehicles_owned.py +++ b/openfisca_us/variables/demographic/household/vehicles_owned.py @@ -8,4 +8,3 @@ class household_vehicles_owned(Variable): unit = USD documentation = "Number of vehicles owned by the household" definition_period = YEAR - diff --git a/openfisca_us/variables/demographic/person/vehicles_owned.py b/openfisca_us/variables/demographic/person/vehicles_owned.py index c7d0a826e4c..ce32ecf6e95 100644 --- a/openfisca_us/variables/demographic/person/vehicles_owned.py +++ b/openfisca_us/variables/demographic/person/vehicles_owned.py @@ -3,6 +3,7 @@ from random import randint from openfisca_core.populations import Population + class vehicles_owned(Variable): value_type = float entity = Person @@ -22,9 +23,13 @@ def formula(person, period, parameters): vehicles = is_adult * 0 for _ in range(int(max_vehicles)): # Pick a random adult in each household - selected_adult = randint(0, adult_rank[is_adult].max()) % num_adults_in_household + selected_adult = ( + randint(0, adult_rank[is_adult].max()) + % num_adults_in_household + ) maximum_reached = household.sum(vehicles) >= household_vehicles - should_add_vehicle = ~maximum_reached & (adult_rank == selected_adult) + should_add_vehicle = ~maximum_reached & ( + adult_rank == selected_adult + ) vehicles += where(should_add_vehicle, 1, 0) return vehicles - diff --git a/openfisca_us/variables/income/spm_unit/spm_unit_fips.py b/openfisca_us/variables/income/spm_unit/spm_unit_fips.py index 26e9a99b35a..5353f5014e3 100644 --- a/openfisca_us/variables/income/spm_unit/spm_unit_fips.py +++ b/openfisca_us/variables/income/spm_unit/spm_unit_fips.py @@ -8,4 +8,4 @@ class spm_unit_fips(Variable): definition_period = YEAR def formula(spm_unit, period, parameters): - return spm_unit.household("fips", period) \ No newline at end of file + return spm_unit.household("fips", period) From 2ae4c2718f21e5da4e1189d7debe34bd47aef07f Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:13:51 +0100 Subject: [PATCH 13/27] Add FIPS handling --- .../demographic/geographic/state_name.py | 57 +++++++++++++++++++ openfisca_us/variables/irs/income/sources.py | 1 + 2 files changed, 58 insertions(+) diff --git a/openfisca_us/variables/demographic/geographic/state_name.py b/openfisca_us/variables/demographic/geographic/state_name.py index 4bf51168c97..9fea58de658 100644 --- a/openfisca_us/variables/demographic/geographic/state_name.py +++ b/openfisca_us/variables/demographic/geographic/state_name.py @@ -73,3 +73,60 @@ class state_name(Variable): entity = Household label = "State" definition_period = ETERNITY + + def formula(household, period, parameters): + fips = household("fips", period) + return pd.Series(fips).map({ + 1: StateName.AL, + 2: StateName.AK, + 4: StateName.AZ, + 5: StateName.AR, + 6: StateName.CA, + 8: StateName.CO, + 9: StateName.CT, + 10: StateName.DE, + 11: StateName.DC, + 12: StateName.FL, + 13: StateName.GA, + 15: StateName.HI, + 16: StateName.ID, + 17: StateName.IL, + 18: StateName.IN, + 19: StateName.IA, + 20: StateName.KS, + 21: StateName.KY, + 22: StateName.LA, + 23: StateName.ME, + 24: StateName.MD, + 25: StateName.MA, + 26: StateName.MI, + 27: StateName.MN, + 28: StateName.MS, + 29: StateName.MO, + 30: StateName.MT, + 31: StateName.NE, + 32: StateName.NV, + 33: StateName.NH, + 34: StateName.NJ, + 35: StateName.NM, + 36: StateName.NY, + 37: StateName.NC, + 38: StateName.ND, + 39: StateName.OH, + 40: StateName.OK, + 41: StateName.OR, + 42: StateName.PA, + 44: StateName.RI, + 45: StateName.SC, + 46: StateName.SD, + 47: StateName.TN, + 48: StateName.TX, + 49: StateName.UT, + 50: StateName.VT, + 51: StateName.VA, + 53: StateName.WA, + 54: StateName.WV, + 55: StateName.WI, + 56: StateName.WY, + 72: StateName.PR, + }).values diff --git a/openfisca_us/variables/irs/income/sources.py b/openfisca_us/variables/irs/income/sources.py index 55f14b4fcde..32f392b1d11 100644 --- a/openfisca_us/variables/irs/income/sources.py +++ b/openfisca_us/variables/irs/income/sources.py @@ -1012,6 +1012,7 @@ class fips(Variable): entity = Household definition_period = YEAR documentation = "FIPS state code (not used in tax-calculation logic)" + default_value = 1 class h_seq(Variable): From 8d7768d5e96e6351f834f53f31ea0462506b8f5b Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:14:00 +0100 Subject: [PATCH 14/27] API fixes --- openfisca_us/api/microsimulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfisca_us/api/microsimulation.py b/openfisca_us/api/microsimulation.py index 3645ab3c406..32b13d8710c 100644 --- a/openfisca_us/api/microsimulation.py +++ b/openfisca_us/api/microsimulation.py @@ -9,7 +9,7 @@ class Microsimulation(GeneralMicrosimulation): entities = entities default_dataset = CPS - def __init__(self, reform=(), dataset: type = CPS, year: int = None): + def __init__(self, reform=(), dataset: type = CPS, year: int = None, **kwargs): if dataset == CPS and len(CPS.years) == 0: CPS.generate(2020) From be0cbac89586d127ff9d17842c48472b4499eaca Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:14:08 +0100 Subject: [PATCH 15/27] Add unit test for FIPS in CA --- .../policy/baseline/demographic/geographic/state_name.yaml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 openfisca_us/tests/policy/baseline/demographic/geographic/state_name.yaml diff --git a/openfisca_us/tests/policy/baseline/demographic/geographic/state_name.yaml b/openfisca_us/tests/policy/baseline/demographic/geographic/state_name.yaml new file mode 100644 index 00000000000..3a2311773ed --- /dev/null +++ b/openfisca_us/tests/policy/baseline/demographic/geographic/state_name.yaml @@ -0,0 +1,7 @@ +- name: California is decoded correctly from FIPS code. + period: 2020 + absolute_error_margin: 0 + input: + fips: 6 + output: + state_name: CA From cb36e7b8b9997799683449606d7e9e48d5c94001 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:14:15 +0100 Subject: [PATCH 16/27] Add age --- openfisca_us/data/datasets/acs/acs.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 3bfc1640d9d..77b37f5dcaf 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -36,14 +36,18 @@ def generate(self, year: int) -> None: ] # Add primary and foreign keys make_numeric = lambda x: int(x.replace("2019GQ", "0").replace("2019HU", "1")) - household.SERIALNO = household.SERIALNO.apply(make_numeric) - person.SERIALNO = person.SERIALNO.apply(make_numeric) + household.SERIALNO = household.SERIALNO.apply(make_numeric).astype(int) + person.SERIALNO = person.SERIALNO.apply(make_numeric).astype(int) + person.SPORDER = person.SPORDER.astype(int) + person.SPM_ID = person.SPM_ID.astype(int) + spm_unit.SPM_ID = spm_unit.SPM_ID.astype(int) person = person[person.SERIALNO.isin(household.SERIALNO)] household = household[household.SERIALNO.isin(person.SERIALNO)] spm_unit = spm_unit[spm_unit.SPM_ID.isin(person.SPM_ID)] add_id_variables(acs, person, spm_unit, household) + add_person_variables(acs, person) add_spm_variables(acs, spm_unit) add_household_variables(acs, household) @@ -84,6 +88,8 @@ def add_id_variables( acs["person_weight"] = person.PWGTP acs["household_weight"] = household.WGTP +def add_person_variables(acs: h5py.File, person: DataFrame): + acs["age"] = person.AGEP def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): acs["spm_unit_net_income"] = spm_unit.SPM_RESOURCES From e2db30d862eafca5fff58070a8f61f042f72f5e6 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:14:25 +0100 Subject: [PATCH 17/27] Add basic income parameters with metadata --- .../basic_income/amount_by_age.yaml | 47 +++++++++++++++++++ .../ubi_center/basic_income/basic_income.py | 15 ++++++ 2 files changed, 62 insertions(+) create mode 100644 openfisca_us/parameters/contrib/ubi_center/basic_income/amount_by_age.yaml create mode 100644 openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py diff --git a/openfisca_us/parameters/contrib/ubi_center/basic_income/amount_by_age.yaml b/openfisca_us/parameters/contrib/ubi_center/basic_income/amount_by_age.yaml new file mode 100644 index 00000000000..7f9dc4f0f08 --- /dev/null +++ b/openfisca_us/parameters/contrib/ubi_center/basic_income/amount_by_age.yaml @@ -0,0 +1,47 @@ +description: Basic income amounts by age. +brackets: + - threshold: + 2010-01-01: 0 + amount: + description: Unconditional payment to children. + values: + 2010-01-01: 0 + metadata: + label: Child basic income + unit: currency-USD + period: year + name: child_bi + - threshold: + description: Age at which individuals receive the working-age adult payment, rather than the child payment. + values: + 2010-01-01: 18 + metadata: + label: Basic income child age + unit: year + name: adult_bi_age + amount: + description: Unconditional payment to working-age adults. + values: + 2010-01-01: 0 + metadata: + label: Adult basic income + unit: currency-USD + period: year + name: adult_bi + - threshold: + description: Age at which individuals receive the senior citizen payment, rather than the working-age adult payment. + values: + 2010-01-01: 65 + metadata: + label: Senior citizen basic income + unit: year + name: senior_bi_age + amount: + description: Unconditional payment to senior citizens. + values: + 2010-01-01: 0 + metadata: + label: Senior citizen basic income + unit: currency-USD + period: year + name: senior_bi diff --git a/openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py b/openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py new file mode 100644 index 00000000000..579bbbcc40e --- /dev/null +++ b/openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py @@ -0,0 +1,15 @@ +from openfisca_us.model_api import * + + +class basic_income(Variable): + value_type = float + entity = Person + label = "Basic income" + unit = USD + documentation = "Total basic income payments for this person." + definition_period = YEAR + + def formula(person, period, parameters): + bi = parameters(period).contrib.ubi_center.basic_income + age = person("age", period) + return bi.amount_by_age.calc(age) \ No newline at end of file From f768a50c14297b81b7d8c7443cdba0857ce798da Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:14:34 +0100 Subject: [PATCH 18/27] Fix weight and net income --- .../variables/demographic/weights/person_weight.py | 3 +++ .../demographic/weights/spm_unit_weight.py | 4 ++++ .../spm_unit/spm_unit_is_in_deep_spm_poverty.py | 13 +++++++++++++ .../income/spm_unit/spm_unit_net_income.py | 5 +++++ 4 files changed, 25 insertions(+) create mode 100644 openfisca_us/variables/income/spm_unit/spm_unit_is_in_deep_spm_poverty.py diff --git a/openfisca_us/variables/demographic/weights/person_weight.py b/openfisca_us/variables/demographic/weights/person_weight.py index a2a9b276a73..86019fea0c0 100644 --- a/openfisca_us/variables/demographic/weights/person_weight.py +++ b/openfisca_us/variables/demographic/weights/person_weight.py @@ -6,3 +6,6 @@ class person_weight(Variable): entity = Person label = "Person weight" definition_period = YEAR + + def formula(person, period, parameters): + return person.household("household_weight", period) \ No newline at end of file diff --git a/openfisca_us/variables/demographic/weights/spm_unit_weight.py b/openfisca_us/variables/demographic/weights/spm_unit_weight.py index 44c1918fdc8..7f35cb95c60 100644 --- a/openfisca_us/variables/demographic/weights/spm_unit_weight.py +++ b/openfisca_us/variables/demographic/weights/spm_unit_weight.py @@ -6,3 +6,7 @@ class spm_unit_weight(Variable): entity = SPMUnit label = "SPM unit weight" definition_period = YEAR + + def formula(spm_unit, period, parameters): + # Use household weights if not provided + return spm_unit.household("household_weight", period) diff --git a/openfisca_us/variables/income/spm_unit/spm_unit_is_in_deep_spm_poverty.py b/openfisca_us/variables/income/spm_unit/spm_unit_is_in_deep_spm_poverty.py new file mode 100644 index 00000000000..f3477012ac9 --- /dev/null +++ b/openfisca_us/variables/income/spm_unit/spm_unit_is_in_deep_spm_poverty.py @@ -0,0 +1,13 @@ +from openfisca_us.model_api import * + + +class spm_unit_is_in_deep_spm_poverty(Variable): + value_type = bool + entity = SPMUnit + label = "SPM unit in deep SPM poverty" + definition_period = YEAR + + def formula(spm_unit, period, parameters): + income = spm_unit("spm_unit_net_income", period) + poverty_threshold = spm_unit("spm_unit_spm_threshold", period) / 2 + return income < poverty_threshold diff --git a/openfisca_us/variables/income/spm_unit/spm_unit_net_income.py b/openfisca_us/variables/income/spm_unit/spm_unit_net_income.py index f2802ff5137..992b321fdc5 100644 --- a/openfisca_us/variables/income/spm_unit/spm_unit_net_income.py +++ b/openfisca_us/variables/income/spm_unit/spm_unit_net_income.py @@ -9,6 +9,11 @@ class spm_unit_net_income(Variable): unit = USD def formula(spm_unit, period, parameters): + reported_net_income = spm_unit("spm_unit_net_income_reported", period) + if reported_net_income.sum() > 0: + # If we have reported net income, use that instead for now. This + # is only until the full microsimulation can be run. + return reported_net_income market_income = spm_unit("spm_unit_market_income", period) benefits = spm_unit("spm_unit_benefits", period) taxes = spm_unit("spm_unit_taxes", period) From 3e360ed282f4c016599552e24a5278826f9637a3 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Mon, 18 Apr 2022 21:13:46 +0100 Subject: [PATCH 19/27] Default FIPS -> 6 --- openfisca_us/variables/irs/income/sources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfisca_us/variables/irs/income/sources.py b/openfisca_us/variables/irs/income/sources.py index 32f392b1d11..80bf0fedd5a 100644 --- a/openfisca_us/variables/irs/income/sources.py +++ b/openfisca_us/variables/irs/income/sources.py @@ -1012,7 +1012,7 @@ class fips(Variable): entity = Household definition_period = YEAR documentation = "FIPS state code (not used in tax-calculation logic)" - default_value = 1 + default_value = 6 class h_seq(Variable): From 97610eb3bfd9e980bfb867ca4bdb16bd93dd1c4b Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 09:33:08 +0100 Subject: [PATCH 20/27] Add state --- openfisca_us/data/datasets/acs/acs.py | 2 +- openfisca_us/data/datasets/cps/cps.py | 3 +++ .../variables/demographic/spm_unit/spm_unit_id.py | 14 ++++++++++++++ setup.py | 1 + 4 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 openfisca_us/variables/demographic/spm_unit/spm_unit_id.py diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 50f6cd36772..6502af325c4 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -99,4 +99,4 @@ def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): def add_household_variables(acs: h5py.File, household: DataFrame): acs["household_vehicles_owned"] = household.VEH - acs["fips"] = household.ST + acs["fips"] = acs["household_fips"] = household.ST diff --git a/openfisca_us/data/datasets/cps/cps.py b/openfisca_us/data/datasets/cps/cps.py index 509c82163c8..0994026ca3e 100644 --- a/openfisca_us/data/datasets/cps/cps.py +++ b/openfisca_us/data/datasets/cps/cps.py @@ -46,6 +46,7 @@ def generate(self, year: int): add_personal_variables(cps, person) add_personal_income_variables(cps, person) add_spm_variables(cps, spm_unit) + add_household_variables(cps, household) raw_data.close() cps.close() @@ -166,5 +167,7 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame): cps["reduced_price_school_meals"] = cps["free_school_meals"][...] * 0 +def add_household_variables(cps: h5py.File, household: DataFrame): + cps["fips"] = household.GESTFIPS CPS = CPS() diff --git a/openfisca_us/variables/demographic/spm_unit/spm_unit_id.py b/openfisca_us/variables/demographic/spm_unit/spm_unit_id.py new file mode 100644 index 00000000000..10f201f14a4 --- /dev/null +++ b/openfisca_us/variables/demographic/spm_unit/spm_unit_id.py @@ -0,0 +1,14 @@ +from openfisca_us.model_api import * + +class spm_unit_id(Variable): + value_type = int + entity = SPMUnit + label = "SPM unit ID" + definition_period = YEAR + +class person_spm_unit_id(Variable): + value_type = int + entity = Person + label = "SPM unit ID" + definition_period = YEAR + diff --git a/setup.py b/setup.py index eaba6c1c931..4370dda64e3 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ "coverage", "plotly", "yaml-changelog>=0.1.6", + "python-us", ], }, python_requires=">=3.7,<3.8", From 069f3667de8c53349336b3afbf997366be8557f1 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 09:38:25 +0100 Subject: [PATCH 21/27] Add ACS tests --- .../tests/microsimulation/data/acs/test_acs.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 openfisca_us/tests/microsimulation/data/acs/test_acs.py diff --git a/openfisca_us/tests/microsimulation/data/acs/test_acs.py b/openfisca_us/tests/microsimulation/data/acs/test_acs.py new file mode 100644 index 00000000000..fc120aa18dd --- /dev/null +++ b/openfisca_us/tests/microsimulation/data/acs/test_acs.py @@ -0,0 +1,17 @@ +from openfisca_us.data import ACS +import pytest +from openfisca_us import Microsimulation + +ACS_YEARS = [2019] + + +@pytest.mark.dependency(name="acs") +@pytest.mark.parametrize("year", ACS_YEARS) +def test_cps_dataset_generates(year): + ACS.generate(year) + + +@pytest.mark.dependency(depends=["acs"]) +@pytest.mark.parametrize("year", ACS_YEARS) +def test_cps_openfisca_us_compatible(year): + Microsimulation(dataset=ACS, year=year).calc("tax") From aa051af332dd64fb5587193fba4edac1aa461827 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 09:38:48 +0100 Subject: [PATCH 22/27] Format --- openfisca_us/api/microsimulation.py | 4 +- openfisca_us/data/datasets/acs/acs.py | 2 + openfisca_us/data/datasets/cps/cps.py | 2 + .../ubi_center/basic_income/basic_income.py | 2 +- .../demographic/geographic/state_name.py | 114 +++++++++--------- .../demographic/spm_unit/spm_unit_id.py | 3 +- .../demographic/weights/person_weight.py | 2 +- 7 files changed, 71 insertions(+), 58 deletions(-) diff --git a/openfisca_us/api/microsimulation.py b/openfisca_us/api/microsimulation.py index 32b13d8710c..a821787206b 100644 --- a/openfisca_us/api/microsimulation.py +++ b/openfisca_us/api/microsimulation.py @@ -9,7 +9,9 @@ class Microsimulation(GeneralMicrosimulation): entities = entities default_dataset = CPS - def __init__(self, reform=(), dataset: type = CPS, year: int = None, **kwargs): + def __init__( + self, reform=(), dataset: type = CPS, year: int = None, **kwargs + ): if dataset == CPS and len(CPS.years) == 0: CPS.generate(2020) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 6502af325c4..52fac6bb861 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -89,9 +89,11 @@ def add_id_variables( acs["person_weight"] = person.PWGTP acs["household_weight"] = household.WGTP + def add_person_variables(acs: h5py.File, person: DataFrame): acs["age"] = person.AGEP + def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): acs["spm_unit_net_income"] = spm_unit.SPM_RESOURCES acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD diff --git a/openfisca_us/data/datasets/cps/cps.py b/openfisca_us/data/datasets/cps/cps.py index 0994026ca3e..81fcdd724e9 100644 --- a/openfisca_us/data/datasets/cps/cps.py +++ b/openfisca_us/data/datasets/cps/cps.py @@ -167,7 +167,9 @@ def add_spm_variables(cps: h5py.File, spm_unit: DataFrame): cps["reduced_price_school_meals"] = cps["free_school_meals"][...] * 0 + def add_household_variables(cps: h5py.File, household: DataFrame): cps["fips"] = household.GESTFIPS + CPS = CPS() diff --git a/openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py b/openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py index 579bbbcc40e..e429154c7cc 100644 --- a/openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py +++ b/openfisca_us/variables/contrib/ubi_center/basic_income/basic_income.py @@ -12,4 +12,4 @@ class basic_income(Variable): def formula(person, period, parameters): bi = parameters(period).contrib.ubi_center.basic_income age = person("age", period) - return bi.amount_by_age.calc(age) \ No newline at end of file + return bi.amount_by_age.calc(age) diff --git a/openfisca_us/variables/demographic/geographic/state_name.py b/openfisca_us/variables/demographic/geographic/state_name.py index 9fea58de658..516b37305ab 100644 --- a/openfisca_us/variables/demographic/geographic/state_name.py +++ b/openfisca_us/variables/demographic/geographic/state_name.py @@ -76,57 +76,63 @@ class state_name(Variable): def formula(household, period, parameters): fips = household("fips", period) - return pd.Series(fips).map({ - 1: StateName.AL, - 2: StateName.AK, - 4: StateName.AZ, - 5: StateName.AR, - 6: StateName.CA, - 8: StateName.CO, - 9: StateName.CT, - 10: StateName.DE, - 11: StateName.DC, - 12: StateName.FL, - 13: StateName.GA, - 15: StateName.HI, - 16: StateName.ID, - 17: StateName.IL, - 18: StateName.IN, - 19: StateName.IA, - 20: StateName.KS, - 21: StateName.KY, - 22: StateName.LA, - 23: StateName.ME, - 24: StateName.MD, - 25: StateName.MA, - 26: StateName.MI, - 27: StateName.MN, - 28: StateName.MS, - 29: StateName.MO, - 30: StateName.MT, - 31: StateName.NE, - 32: StateName.NV, - 33: StateName.NH, - 34: StateName.NJ, - 35: StateName.NM, - 36: StateName.NY, - 37: StateName.NC, - 38: StateName.ND, - 39: StateName.OH, - 40: StateName.OK, - 41: StateName.OR, - 42: StateName.PA, - 44: StateName.RI, - 45: StateName.SC, - 46: StateName.SD, - 47: StateName.TN, - 48: StateName.TX, - 49: StateName.UT, - 50: StateName.VT, - 51: StateName.VA, - 53: StateName.WA, - 54: StateName.WV, - 55: StateName.WI, - 56: StateName.WY, - 72: StateName.PR, - }).values + return ( + pd.Series(fips) + .map( + { + 1: StateName.AL, + 2: StateName.AK, + 4: StateName.AZ, + 5: StateName.AR, + 6: StateName.CA, + 8: StateName.CO, + 9: StateName.CT, + 10: StateName.DE, + 11: StateName.DC, + 12: StateName.FL, + 13: StateName.GA, + 15: StateName.HI, + 16: StateName.ID, + 17: StateName.IL, + 18: StateName.IN, + 19: StateName.IA, + 20: StateName.KS, + 21: StateName.KY, + 22: StateName.LA, + 23: StateName.ME, + 24: StateName.MD, + 25: StateName.MA, + 26: StateName.MI, + 27: StateName.MN, + 28: StateName.MS, + 29: StateName.MO, + 30: StateName.MT, + 31: StateName.NE, + 32: StateName.NV, + 33: StateName.NH, + 34: StateName.NJ, + 35: StateName.NM, + 36: StateName.NY, + 37: StateName.NC, + 38: StateName.ND, + 39: StateName.OH, + 40: StateName.OK, + 41: StateName.OR, + 42: StateName.PA, + 44: StateName.RI, + 45: StateName.SC, + 46: StateName.SD, + 47: StateName.TN, + 48: StateName.TX, + 49: StateName.UT, + 50: StateName.VT, + 51: StateName.VA, + 53: StateName.WA, + 54: StateName.WV, + 55: StateName.WI, + 56: StateName.WY, + 72: StateName.PR, + } + ) + .values + ) diff --git a/openfisca_us/variables/demographic/spm_unit/spm_unit_id.py b/openfisca_us/variables/demographic/spm_unit/spm_unit_id.py index 10f201f14a4..c516a6d643e 100644 --- a/openfisca_us/variables/demographic/spm_unit/spm_unit_id.py +++ b/openfisca_us/variables/demographic/spm_unit/spm_unit_id.py @@ -1,14 +1,15 @@ from openfisca_us.model_api import * + class spm_unit_id(Variable): value_type = int entity = SPMUnit label = "SPM unit ID" definition_period = YEAR + class person_spm_unit_id(Variable): value_type = int entity = Person label = "SPM unit ID" definition_period = YEAR - diff --git a/openfisca_us/variables/demographic/weights/person_weight.py b/openfisca_us/variables/demographic/weights/person_weight.py index 86019fea0c0..9ca2ae46d18 100644 --- a/openfisca_us/variables/demographic/weights/person_weight.py +++ b/openfisca_us/variables/demographic/weights/person_weight.py @@ -8,4 +8,4 @@ class person_weight(Variable): definition_period = YEAR def formula(person, period, parameters): - return person.household("household_weight", period) \ No newline at end of file + return person.household("household_weight", period) From 9f2f99f18fc90a940ee451beca56413601744d17 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 09:49:27 +0100 Subject: [PATCH 23/27] Fix dep --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4370dda64e3..b83e13ae9e5 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "coverage", "plotly", "yaml-changelog>=0.1.6", - "python-us", + "us", ], }, python_requires=">=3.7,<3.8", From 26b4ed2c506510cb4912c1a06c60c254aeebe6ae Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 09:52:23 +0100 Subject: [PATCH 24/27] Add employment income --- openfisca_us/data/datasets/acs/acs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 52fac6bb861..d3ff13a23fd 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -92,6 +92,8 @@ def add_id_variables( def add_person_variables(acs: h5py.File, person: DataFrame): acs["age"] = person.AGEP + acs["employment_income"] = person.WAGP + acs["self_employment_income"] = person.SEMP def add_spm_variables(acs: h5py.File, spm_unit: DataFrame): From 1291e033386d5affc4c084c68d82e16902861ad5 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 10:09:54 +0100 Subject: [PATCH 25/27] Add type --- openfisca_us/data/datasets/acs/acs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index d3ff13a23fd..7177c3f701e 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -36,8 +36,8 @@ def generate(self, year: int) -> None: def make_numeric(x): return int(x.replace("2019GQ", "0").replace("2019HU", "1")) - household.SERIALNO = household.SERIALNO.apply(make_numeric) - person.SERIALNO = person.SERIALNO.apply(make_numeric) + household.SERIALNO = household.SERIALNO.apply(make_numeric).astype(int) + person.SERIALNO = person.SERIALNO.apply(make_numeric).astype(int) person.SPORDER = person.SPORDER.astype(int) person.SPM_ID = person.SPM_ID.astype(int) spm_unit.SPM_ID = spm_unit.SPM_ID.astype(int) From 168d7e672b7595e1bb486edc312272ed8e871a3f Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 11:03:19 +0100 Subject: [PATCH 26/27] Fix linking bugs --- openfisca_us/data/datasets/acs/acs.py | 17 ++++++++++----- openfisca_us/data/datasets/acs/raw_acs.py | 26 +++++++++++++++++------ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/openfisca_us/data/datasets/acs/acs.py b/openfisca_us/data/datasets/acs/acs.py index 7177c3f701e..b0a0a80e4c5 100644 --- a/openfisca_us/data/datasets/acs/acs.py +++ b/openfisca_us/data/datasets/acs/acs.py @@ -1,3 +1,4 @@ +import logging from openfisca_tools.data import PublicDataset import h5py from openfisca_us.data.datasets.acs.raw_acs import RawACS @@ -33,17 +34,23 @@ def generate(self, year: int) -> None: ] # Add primary and foreign keys - def make_numeric(x): - return int(x.replace("2019GQ", "0").replace("2019HU", "1")) - - household.SERIALNO = household.SERIALNO.apply(make_numeric).astype(int) - person.SERIALNO = person.SERIALNO.apply(make_numeric).astype(int) + household.SERIALNO = household.SERIALNO.astype(int) + person.SERIALNO = person.SERIALNO.astype(int) person.SPORDER = person.SPORDER.astype(int) person.SPM_ID = person.SPM_ID.astype(int) spm_unit.SPM_ID = spm_unit.SPM_ID.astype(int) + logging.info( + f"Persons with a linked household {person.SERIALNO.isin(household.SERIALNO).mean():.1%}" + ) person = person[person.SERIALNO.isin(household.SERIALNO)] + logging.info( + f"Households with a linked person {household.SERIALNO.isin(person.SERIALNO).mean():.1%}" + ) household = household[household.SERIALNO.isin(person.SERIALNO)] + logging.info( + f"SPM units with a linked person {spm_unit.SPM_ID.isin(person.SPM_ID).mean():.1%}" + ) spm_unit = spm_unit[spm_unit.SPM_ID.isin(person.SPM_ID)] add_id_variables(acs, person, spm_unit, household) diff --git a/openfisca_us/data/datasets/acs/raw_acs.py b/openfisca_us/data/datasets/acs/raw_acs.py index 69180f88850..0d14319715a 100644 --- a/openfisca_us/data/datasets/acs/raw_acs.py +++ b/openfisca_us/data/datasets/acs/raw_acs.py @@ -75,14 +75,27 @@ def generate(self, year: int) -> None: with pd.HDFStore(RawACS.file(year)) as storage: # Household file logging.info(f"Downloading household file") - storage["household"] = concat_zipped_csvs( + household = concat_zipped_csvs( household_url, "psam_hus", HOUSEHOLD_COLUMNS ) + # Remove group quarters (zero weight) + household = household[ + ~household.SERIALNO.str.contains("2019GQ") + ] + household["SERIALNO"] = household["SERIALNO"].apply( + lambda x: int(x.replace("2019HU", "")) + ) + storage["household"] = household # Person file logging.info(f"Downloading person file") - storage["person"] = concat_zipped_csvs( + person = concat_zipped_csvs( person_url, "psam_pus", PERSON_COLUMNS ) + person = person[~person.SERIALNO.str.contains("2019GQ")] + person["SERIALNO"] = person["SERIALNO"].apply( + lambda x: int(x.replace("2019HU", "")) + ) + storage["person"] = person # SPM unit file logging.info(f"Downloading SPM unit file") spm_person = pd.read_stata(spm_url).fillna(0) @@ -90,9 +103,10 @@ def generate(self, year: int) -> None: create_spm_unit_table(storage, spm_person) except Exception as e: RawACS.remove(year) - raise ValueError( + logging.error( f"Attempted to extract and save the CSV files, but encountered an error: {e}" ) + raise e RawACS = RawACS() @@ -171,12 +185,10 @@ def create_spm_unit_table( original_person_table = storage["person"] # Ensure that join keys are the same type. JOIN_COLUMNS = ["SERIALNO", "SPORDER"] - print(original_person_table[JOIN_COLUMNS].dtypes) - print(person[JOIN_COLUMNS].dtypes) original_person_table[JOIN_COLUMNS] = original_person_table[ JOIN_COLUMNS - ].astype(str) - person[JOIN_COLUMNS] = person[JOIN_COLUMNS].astype(str) + ].astype(int) + person[JOIN_COLUMNS] = person[JOIN_COLUMNS].astype(int) # Add SPM_ID from the SPM person table to the original person table. combined_person_table = pd.merge( original_person_table, From 3a24bbb225ca3a4d0caff8a6861dc4295abb488d Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff <35577657+nikhilwoodruff@users.noreply.github.com> Date: Tue, 19 Apr 2022 11:12:28 +0100 Subject: [PATCH 27/27] Remove datasets before building --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 9ecfbcc6111..de644bf86b1 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ test: documentation: jb build docs/book build: + rm openfisca_us/data/storage/*.h5 python setup.py sdist bdist_wheel changelog: