From 6359e077194bc2efa515de0d67af365189a5c0e7 Mon Sep 17 00:00:00 2001 From: Kklu78 <76934184+Kklu78@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:37:44 -0700 Subject: [PATCH 1/5] update --- openfisca_us_data/datasets/acs/raw_fullacs.py | 107 ++++++++++++++++++ .../acs/{raw_acs.py => raw_spm_acs.py} | 0 .../datasets/fullacs/__init__.py | 2 + openfisca_us_data/datasets/fullacs/fullacs.py | 73 ++++++++++++ 4 files changed, 182 insertions(+) create mode 100644 openfisca_us_data/datasets/acs/raw_fullacs.py rename openfisca_us_data/datasets/acs/{raw_acs.py => raw_spm_acs.py} (100%) create mode 100644 openfisca_us_data/datasets/fullacs/__init__.py create mode 100644 openfisca_us_data/datasets/fullacs/fullacs.py diff --git a/openfisca_us_data/datasets/acs/raw_fullacs.py b/openfisca_us_data/datasets/acs/raw_fullacs.py new file mode 100644 index 0000000..779e7d4 --- /dev/null +++ b/openfisca_us_data/datasets/acs/raw_fullacs.py @@ -0,0 +1,107 @@ +from openfisca_us_data.utils import * +import requests +from io import BytesIO +import pandas as pd +from zipfile import ZipFile + +@dataset +class RawfullACS: + name = "raw_fullacs" + + def generate(year: int) -> None: + url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip" + request = requests.get(url) + file = ZipFile(BytesIO(request.content)) + file.extractall(f'{year}_pus') + + url2 = 'https://www2.census.gov/programs-surveys/acs/data/pums/2019/1-Year/csv_hus.zip' + request = requests.get(url2) + file = ZipFile(BytesIO(request.content)) + file.extractall(f'{year}_hus') + + try: + with pd.HDFStore(RawfullACS.file(year)) as storage: + persona = pd.read_csv(f'{year}_pus/psam_pusa.csv') + personb = pd.read_csv(f'{year}_pus/psam_pusb.csv') + person_df = pd.concat(persona, personb).fillna(0) + person_df.columns = person_df.columns.str.upper() + + householda = pd.read_csv(f'{year}_hus/psam_husa.csv') + householdb = pd.read_csv(f'{year}_hus/psam_husa.csv') + household_df = pd.concat(householda, householdb).fillna(0) + household_df.columns = household_df.columns.str.upper() + + + + except Exception as e: + RawfullACS.remove(year) + raise ValueError( + f"Attempted to extract and save the CSV files, but encountered an error: {e}" + ) + + + + + + + + +@dataset +class RawACS: + name = "raw_acs" + + def generate(year: int) -> None: + url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta" + try: + with pd.HDFStore(RawACS.file(year)) as storage: + person = pd.read_stata(url).fillna(0) + person.columns = person.columns.str.upper() + storage["person"] = person + storage["spm_unit"] = create_SPM_unit_table(person) + storage["household"] = create_household_table(person) + except Exception as e: + RawACS.remove(year) + raise ValueError( + f"Attempted to extract and save the CSV files, but encountered an error: {e}" + ) + + +def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame: + SPM_UNIT_COLUMNS = [ + "CAPHOUSESUB", + "CAPWKCCXPNS", + "CHILDCAREXPNS", + "EITC", + "ENGVAL", + "EQUIVSCALE", + "FEDTAX", + "FEDTAXBC", + "FICA", + "GEOADJ", + "MEDXPNS", + "NUMADULTS", + "NUMKIDS", + "NUMPER", + "POOR", + "POVTHRESHOLD", + "RESOURCES", + "SCHLUNCH", + "SNAPSUB", + "STTAX", + "TENMORTSTATUS", + "TOTVAL", + "WCOHABIT", + "WICVAL", + "WKXPNS", + "WUI_LT15", + "ID", + ] + return ( + person[["SPM_" + column for column in SPM_UNIT_COLUMNS]] + .groupby(person.SPM_ID) + .first() + ) + + +def create_household_table(person: pd.DataFrame) -> pd.DataFrame: + return person[["SERIALNO", "ST", "PUMA"]].groupby(person.SERIALNO).first() diff --git a/openfisca_us_data/datasets/acs/raw_acs.py b/openfisca_us_data/datasets/acs/raw_spm_acs.py similarity index 100% rename from openfisca_us_data/datasets/acs/raw_acs.py rename to openfisca_us_data/datasets/acs/raw_spm_acs.py diff --git a/openfisca_us_data/datasets/fullacs/__init__.py b/openfisca_us_data/datasets/fullacs/__init__.py new file mode 100644 index 0000000..57ad83c --- /dev/null +++ b/openfisca_us_data/datasets/fullacs/__init__.py @@ -0,0 +1,2 @@ +from openfisca_us_data.datasets.acs.raw_fullacs import RawACS +from openfisca_us_data.datasets.acs.fullacs import ACS diff --git a/openfisca_us_data/datasets/fullacs/fullacs.py b/openfisca_us_data/datasets/fullacs/fullacs.py new file mode 100644 index 0000000..3aa2d5a --- /dev/null +++ b/openfisca_us_data/datasets/fullacs/fullacs.py @@ -0,0 +1,73 @@ +from openfisca_us_data.utils import US, dataset +from openfisca_us_data.datasets.fullacs.raw_fullacs import RawACS +from pandas import DataFrame +import h5py + + +@dataset +class ACS: + name = "acs" + model = US + + # Note: no self because it uses a decorator. + def generate(year: int) -> None: + """Generates the ACS dataset. + + Args: + year (int): The year of the raw ACS to use. + """ + + # Prepare raw ACS tables + year = int(year) + if year not in RawACS.years: + RawACS.generate(year) + + raw_data = RawACS.load(year) + acs = h5py.File(ACS.file(year), mode="w") + + person, spm_unit, household = [ + raw_data[entity] for entity in ("person", "spm_unit", "household") + ] + + add_ID_variables(acs, person, spm_unit, household) + add_SPM_variables(acs, spm_unit) + + raw_data.close() + acs.close() + + +def add_ID_variables( + acs: h5py.File, + person: DataFrame, + spm_unit: DataFrame, + household: DataFrame, +): + """Add basic ID and weight variables. + + Args: + acs (h5py.File): The ACS dataset file. + person (DataFrame): The person table of the ACS. + spm_unit (DataFrame): The SPM unit table created from the person table + of the ACS. + household (DataFrame): The household table of the ACS. + """ + # Add primary and foreign keys + acs["person_id"] = person.SERIALNO * 1e2 + person.SPORDER + acs["person_spm_unit_id"] = person.SPM_ID + acs["spm_unit_id"] = spm_unit.SPM_ID + # ACS doesn't have tax units. + acs["tax_unit_id"] = spm_unit.SPM_ID + # Until we add a family table, we'll use the person table. + acs["family_id"] = spm_unit.SPM_ID + acs["person_household_id"] = person.SERIALNO + acs["person_tax_unit_id"] = person.SPM_ID + acs["person_family_id"] = person.SPM_ID + acs["household_id"] = household.SERIALNO + + # Add weights + acs["person_weight"] = person.WT + + +def add_SPM_variables(acs: h5py.File, spm_unit: DataFrame): + acs["SPM_unit_net_income"] = spm_unit.SPM_RESOURCES + acs["poverty_threshold"] = spm_unit.SPM_POVTHRESHOLD From 945520d3df82860093c02e3ff00228e81a35b3e2 Mon Sep 17 00:00:00 2001 From: Kklu78 <76934184+Kklu78@users.noreply.github.com> Date: Mon, 1 Nov 2021 17:45:11 -0700 Subject: [PATCH 2/5] Moved raw_fullacs.py to ACS --- openfisca_us_data/datasets/acs/raw_fullacs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/openfisca_us_data/datasets/acs/raw_fullacs.py b/openfisca_us_data/datasets/acs/raw_fullacs.py index 779e7d4..3a38be0 100644 --- a/openfisca_us_data/datasets/acs/raw_fullacs.py +++ b/openfisca_us_data/datasets/acs/raw_fullacs.py @@ -31,8 +31,6 @@ def generate(year: int) -> None: household_df = pd.concat(householda, householdb).fillna(0) household_df.columns = household_df.columns.str.upper() - - except Exception as e: RawfullACS.remove(year) raise ValueError( From bbaa6de0ffdb0846115aae57dea123f96ee20c40 Mon Sep 17 00:00:00 2001 From: Kklu78 <76934184+Kklu78@users.noreply.github.com> Date: Mon, 1 Nov 2021 18:00:28 -0700 Subject: [PATCH 3/5] Upt raw_fullacs to raw_acs, raw_acs to raw_spm_acs --- README.md | 7 +- openfisca_us_data/__init__.py | 2 +- openfisca_us_data/datasets/acs/__init__.py | 1 + openfisca_us_data/datasets/acs/acs.py | 8 +- openfisca_us_data/datasets/acs/raw_acs.py | 38 +++++++ openfisca_us_data/datasets/acs/raw_fullacs.py | 105 ------------------ openfisca_us_data/datasets/acs/raw_spm_acs.py | 8 +- .../datasets/fullacs/__init__.py | 2 - openfisca_us_data/datasets/fullacs/fullacs.py | 73 ------------ tests/test_imports.py | 3 + 10 files changed, 56 insertions(+), 191 deletions(-) create mode 100644 openfisca_us_data/datasets/acs/raw_acs.py delete mode 100644 openfisca_us_data/datasets/acs/raw_fullacs.py delete mode 100644 openfisca_us_data/datasets/fullacs/__init__.py delete mode 100644 openfisca_us_data/datasets/fullacs/fullacs.py diff --git a/README.md b/README.md index f2bfc14..e3fbcbd 100644 --- a/README.md +++ b/README.md @@ -45,9 +45,12 @@ class CustomDataset: ### CPS - OpenFisca-US-compatible - Contains OpenFisca-US-compatible input arrays. -### RawACS +### RawSPMACS - Not OpenFisca-US-compatible - Contains the tables from the raw [ACS SPM research file](https://www.census.gov/data/datasets/time-series/demo/supplemental-poverty-measure/acs-research-files.html) microdata. +### RawACS +- Not OpenFisca-US-compatible +- Contains the tables from the raw [ACS person and household file](https://www.census.gov/programs-surveys/acs/microdata.html) ### ACS - OpenFisca-US-compatible -- Contains OpenFisca-US-compatible input arrays. +- Contains OpenFisca-US-compatible input arrays from the spm research file. diff --git a/openfisca_us_data/__init__.py b/openfisca_us_data/__init__.py index 4d0d0de..750584c 100644 --- a/openfisca_us_data/__init__.py +++ b/openfisca_us_data/__init__.py @@ -3,4 +3,4 @@ REPO = Path(__file__).parent -DATASETS = (RawCPS, CPS, RawACS, ACS) +DATASETS = (RawCPS, CPS, RawACS, ACS, RawSPMACS) diff --git a/openfisca_us_data/datasets/acs/__init__.py b/openfisca_us_data/datasets/acs/__init__.py index 6ed1f62..832134a 100644 --- a/openfisca_us_data/datasets/acs/__init__.py +++ b/openfisca_us_data/datasets/acs/__init__.py @@ -1,2 +1,3 @@ from openfisca_us_data.datasets.acs.raw_acs import RawACS +from openfisca_us_data.datasets.acs.raw_spm_acs import RawSPMACS from openfisca_us_data.datasets.acs.acs import ACS diff --git a/openfisca_us_data/datasets/acs/acs.py b/openfisca_us_data/datasets/acs/acs.py index 35739d4..056bd8c 100644 --- a/openfisca_us_data/datasets/acs/acs.py +++ b/openfisca_us_data/datasets/acs/acs.py @@ -1,5 +1,5 @@ from openfisca_us_data.utils import US, dataset -from openfisca_us_data.datasets.acs.raw_acs import RawACS +from openfisca_us_data.datasets.acs.raw_spm_acs import RawSPMACS from pandas import DataFrame import h5py @@ -19,10 +19,10 @@ def generate(year: int) -> None: # Prepare raw ACS tables year = int(year) - if year not in RawACS.years: - RawACS.generate(year) + if year not in RawSPMACS.years: + RawSPMACS.generate(year) - raw_data = RawACS.load(year) + raw_data = RawSPMACS.load(year) acs = h5py.File(ACS.file(year), mode="w") person, spm_unit, household = [ diff --git a/openfisca_us_data/datasets/acs/raw_acs.py b/openfisca_us_data/datasets/acs/raw_acs.py new file mode 100644 index 0000000..d468eb9 --- /dev/null +++ b/openfisca_us_data/datasets/acs/raw_acs.py @@ -0,0 +1,38 @@ +from openfisca_us_data.utils import * +import requests +from io import BytesIO +import pandas as pd +from zipfile import ZipFile + +@dataset +class RawACS: + name = "raw_acs" + + def generate(year: int) -> None: + url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip" + request = requests.get(url) + file = ZipFile(BytesIO(request.content)) + file.extractall(f'{year}_pus') + + url2 = 'https://www2.census.gov/programs-surveys/acs/data/pums/2019/1-Year/csv_hus.zip' + request = requests.get(url2) + file = ZipFile(BytesIO(request.content)) + file.extractall(f'{year}_hus') + + try: + with pd.HDFStore(RawACS.file(year)) as storage: + persona = pd.read_csv(f'{year}_pus/psam_pusa.csv') + personb = pd.read_csv(f'{year}_pus/psam_pusb.csv') + person_df = pd.concat(persona, personb).fillna(0) + person_df.columns = person_df.columns.str.upper() + + householda = pd.read_csv(f'{year}_hus/psam_husa.csv') + householdb = pd.read_csv(f'{year}_hus/psam_husa.csv') + household_df = pd.concat(householda, householdb).fillna(0) + household_df.columns = household_df.columns.str.upper() + + except Exception as e: + RawACS.remove(year) + raise ValueError( + f"Attempted to extract and save the CSV files, but encountered an error: {e}" + ) \ No newline at end of file diff --git a/openfisca_us_data/datasets/acs/raw_fullacs.py b/openfisca_us_data/datasets/acs/raw_fullacs.py deleted file mode 100644 index 3a38be0..0000000 --- a/openfisca_us_data/datasets/acs/raw_fullacs.py +++ /dev/null @@ -1,105 +0,0 @@ -from openfisca_us_data.utils import * -import requests -from io import BytesIO -import pandas as pd -from zipfile import ZipFile - -@dataset -class RawfullACS: - name = "raw_fullacs" - - def generate(year: int) -> None: - url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip" - request = requests.get(url) - file = ZipFile(BytesIO(request.content)) - file.extractall(f'{year}_pus') - - url2 = 'https://www2.census.gov/programs-surveys/acs/data/pums/2019/1-Year/csv_hus.zip' - request = requests.get(url2) - file = ZipFile(BytesIO(request.content)) - file.extractall(f'{year}_hus') - - try: - with pd.HDFStore(RawfullACS.file(year)) as storage: - persona = pd.read_csv(f'{year}_pus/psam_pusa.csv') - personb = pd.read_csv(f'{year}_pus/psam_pusb.csv') - person_df = pd.concat(persona, personb).fillna(0) - person_df.columns = person_df.columns.str.upper() - - householda = pd.read_csv(f'{year}_hus/psam_husa.csv') - householdb = pd.read_csv(f'{year}_hus/psam_husa.csv') - household_df = pd.concat(householda, householdb).fillna(0) - household_df.columns = household_df.columns.str.upper() - - except Exception as e: - RawfullACS.remove(year) - raise ValueError( - f"Attempted to extract and save the CSV files, but encountered an error: {e}" - ) - - - - - - - - -@dataset -class RawACS: - name = "raw_acs" - - def generate(year: int) -> None: - url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta" - try: - with pd.HDFStore(RawACS.file(year)) as storage: - person = pd.read_stata(url).fillna(0) - person.columns = person.columns.str.upper() - storage["person"] = person - storage["spm_unit"] = create_SPM_unit_table(person) - storage["household"] = create_household_table(person) - except Exception as e: - RawACS.remove(year) - raise ValueError( - f"Attempted to extract and save the CSV files, but encountered an error: {e}" - ) - - -def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame: - SPM_UNIT_COLUMNS = [ - "CAPHOUSESUB", - "CAPWKCCXPNS", - "CHILDCAREXPNS", - "EITC", - "ENGVAL", - "EQUIVSCALE", - "FEDTAX", - "FEDTAXBC", - "FICA", - "GEOADJ", - "MEDXPNS", - "NUMADULTS", - "NUMKIDS", - "NUMPER", - "POOR", - "POVTHRESHOLD", - "RESOURCES", - "SCHLUNCH", - "SNAPSUB", - "STTAX", - "TENMORTSTATUS", - "TOTVAL", - "WCOHABIT", - "WICVAL", - "WKXPNS", - "WUI_LT15", - "ID", - ] - return ( - person[["SPM_" + column for column in SPM_UNIT_COLUMNS]] - .groupby(person.SPM_ID) - .first() - ) - - -def create_household_table(person: pd.DataFrame) -> pd.DataFrame: - return person[["SERIALNO", "ST", "PUMA"]].groupby(person.SERIALNO).first() diff --git a/openfisca_us_data/datasets/acs/raw_spm_acs.py b/openfisca_us_data/datasets/acs/raw_spm_acs.py index 361cac2..96d6cc9 100644 --- a/openfisca_us_data/datasets/acs/raw_spm_acs.py +++ b/openfisca_us_data/datasets/acs/raw_spm_acs.py @@ -5,20 +5,20 @@ @dataset -class RawACS: - name = "raw_acs" +class RawSPMACS: + name = "raw_spm_acs" def generate(year: int) -> None: url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta" try: - with pd.HDFStore(RawACS.file(year)) as storage: + with pd.HDFStore(RawSPMACS.file(year)) as storage: person = pd.read_stata(url).fillna(0) person.columns = person.columns.str.upper() storage["person"] = person storage["spm_unit"] = create_SPM_unit_table(person) storage["household"] = create_household_table(person) except Exception as e: - RawACS.remove(year) + RawSPMACS.remove(year) raise ValueError( f"Attempted to extract and save the CSV files, but encountered an error: {e}" ) diff --git a/openfisca_us_data/datasets/fullacs/__init__.py b/openfisca_us_data/datasets/fullacs/__init__.py deleted file mode 100644 index 57ad83c..0000000 --- a/openfisca_us_data/datasets/fullacs/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from openfisca_us_data.datasets.acs.raw_fullacs import RawACS -from openfisca_us_data.datasets.acs.fullacs import ACS diff --git a/openfisca_us_data/datasets/fullacs/fullacs.py b/openfisca_us_data/datasets/fullacs/fullacs.py deleted file mode 100644 index 3aa2d5a..0000000 --- a/openfisca_us_data/datasets/fullacs/fullacs.py +++ /dev/null @@ -1,73 +0,0 @@ -from openfisca_us_data.utils import US, dataset -from openfisca_us_data.datasets.fullacs.raw_fullacs import RawACS -from pandas import DataFrame -import h5py - - -@dataset -class ACS: - name = "acs" - model = US - - # Note: no self because it uses a decorator. - def generate(year: int) -> None: - """Generates the ACS dataset. - - Args: - year (int): The year of the raw ACS to use. - """ - - # Prepare raw ACS tables - year = int(year) - if year not in RawACS.years: - RawACS.generate(year) - - raw_data = RawACS.load(year) - acs = h5py.File(ACS.file(year), mode="w") - - person, spm_unit, household = [ - raw_data[entity] for entity in ("person", "spm_unit", "household") - ] - - add_ID_variables(acs, person, spm_unit, household) - add_SPM_variables(acs, spm_unit) - - raw_data.close() - acs.close() - - -def add_ID_variables( - acs: h5py.File, - person: DataFrame, - spm_unit: DataFrame, - household: DataFrame, -): - """Add basic ID and weight variables. - - Args: - acs (h5py.File): The ACS dataset file. - person (DataFrame): The person table of the ACS. - spm_unit (DataFrame): The SPM unit table created from the person table - of the ACS. - household (DataFrame): The household table of the ACS. - """ - # Add primary and foreign keys - acs["person_id"] = person.SERIALNO * 1e2 + person.SPORDER - acs["person_spm_unit_id"] = person.SPM_ID - acs["spm_unit_id"] = spm_unit.SPM_ID - # ACS doesn't have tax units. - acs["tax_unit_id"] = spm_unit.SPM_ID - # Until we add a family table, we'll use the person table. - acs["family_id"] = spm_unit.SPM_ID - acs["person_household_id"] = person.SERIALNO - acs["person_tax_unit_id"] = person.SPM_ID - acs["person_family_id"] = person.SPM_ID - acs["household_id"] = household.SERIALNO - - # Add weights - acs["person_weight"] = person.WT - - -def add_SPM_variables(acs: h5py.File, spm_unit: DataFrame): - acs["SPM_unit_net_income"] = spm_unit.SPM_RESOURCES - acs["poverty_threshold"] = spm_unit.SPM_POVTHRESHOLD diff --git a/tests/test_imports.py b/tests/test_imports.py index 5e482da..1346153 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -13,6 +13,9 @@ def test_CPS_import(): def test_RawACS_import(): from openfisca_us_data import RawACS +def test_RawSPMACS_import(): + from openfisca_us_data import RawSPMACS + def test_ACS_import(): from openfisca_us_data import ACS From 575698f79484f7b2a6ad9d4bbee1c4d02329a7c8 Mon Sep 17 00:00:00 2001 From: Kklu78 <76934184+Kklu78@users.noreply.github.com> Date: Mon, 1 Nov 2021 18:01:02 -0700 Subject: [PATCH 4/5] update --- openfisca_us_data/datasets/acs/raw_acs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfisca_us_data/datasets/acs/raw_acs.py b/openfisca_us_data/datasets/acs/raw_acs.py index d468eb9..c0b9f0c 100644 --- a/openfisca_us_data/datasets/acs/raw_acs.py +++ b/openfisca_us_data/datasets/acs/raw_acs.py @@ -14,7 +14,7 @@ def generate(year: int) -> None: file = ZipFile(BytesIO(request.content)) file.extractall(f'{year}_pus') - url2 = 'https://www2.census.gov/programs-surveys/acs/data/pums/2019/1-Year/csv_hus.zip' + url2 = 'https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip' request = requests.get(url2) file = ZipFile(BytesIO(request.content)) file.extractall(f'{year}_hus') From 76cc4c85e56a856c1fd00df2f167f6348713294b Mon Sep 17 00:00:00 2001 From: Kklu78 <76934184+Kklu78@users.noreply.github.com> Date: Tue, 2 Nov 2021 11:57:38 -0700 Subject: [PATCH 5/5] fixed typo --- openfisca_us_data/datasets/acs/raw_acs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfisca_us_data/datasets/acs/raw_acs.py b/openfisca_us_data/datasets/acs/raw_acs.py index c0b9f0c..b2978c4 100644 --- a/openfisca_us_data/datasets/acs/raw_acs.py +++ b/openfisca_us_data/datasets/acs/raw_acs.py @@ -14,7 +14,7 @@ def generate(year: int) -> None: file = ZipFile(BytesIO(request.content)) file.extractall(f'{year}_pus') - url2 = 'https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip' + url2 = f'https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip' request = requests.get(url2) file = ZipFile(BytesIO(request.content)) file.extractall(f'{year}_hus')