Reviewer-corrected patch. Changes relative to the submitted diff:
  * raw_acs.py: pass a list to pd.concat, read psam_husb.csv for the second
    household half, actually write the person and household tables to the
    HDF store, close the zip archives, check the HTTP status, chain the
    original exception, and end the file with a newline.
  * raw_spm_acs.py: add docstrings and chain the original exception.
  * README.md: finish the RawACS sentence and capitalise "SPM".
NOTE(review): the submitted diff had lost its line breaks, so indentation and
blank context lines below are reconstructed, and the index hashes of edited
files are stale. Regenerate with `git diff` before applying.

diff --git a/README.md b/README.md
index f2bfc14..e3fbcbd 100644
--- a/README.md
+++ b/README.md
@@ -45,9 +45,12 @@ class CustomDataset:
 ### CPS
 - OpenFisca-US-compatible
 - Contains OpenFisca-US-compatible input arrays.
-### RawACS
+### RawSPMACS
 - Not OpenFisca-US-compatible
 - Contains the tables from the raw [ACS SPM research file](https://www.census.gov/data/datasets/time-series/demo/supplemental-poverty-measure/acs-research-files.html) microdata.
+### RawACS
+- Not OpenFisca-US-compatible
+- Contains the tables from the raw [ACS person and household files](https://www.census.gov/programs-surveys/acs/microdata.html) microdata.
 ### ACS
 - OpenFisca-US-compatible
-- Contains OpenFisca-US-compatible input arrays.
+- Contains OpenFisca-US-compatible input arrays from the SPM research file.
diff --git a/openfisca_us_data/__init__.py b/openfisca_us_data/__init__.py
index 4d0d0de..750584c 100644
--- a/openfisca_us_data/__init__.py
+++ b/openfisca_us_data/__init__.py
@@ -3,4 +3,4 @@
 
 REPO = Path(__file__).parent
 
-DATASETS = (RawCPS, CPS, RawACS, ACS)
+DATASETS = (RawCPS, CPS, RawACS, ACS, RawSPMACS)
diff --git a/openfisca_us_data/datasets/acs/__init__.py b/openfisca_us_data/datasets/acs/__init__.py
index 6ed1f62..832134a 100644
--- a/openfisca_us_data/datasets/acs/__init__.py
+++ b/openfisca_us_data/datasets/acs/__init__.py
@@ -1,2 +1,3 @@
 from openfisca_us_data.datasets.acs.raw_acs import RawACS
+from openfisca_us_data.datasets.acs.raw_spm_acs import RawSPMACS
 from openfisca_us_data.datasets.acs.acs import ACS
diff --git a/openfisca_us_data/datasets/acs/acs.py b/openfisca_us_data/datasets/acs/acs.py
index 35739d4..056bd8c 100644
--- a/openfisca_us_data/datasets/acs/acs.py
+++ b/openfisca_us_data/datasets/acs/acs.py
@@ -1,5 +1,5 @@
 from openfisca_us_data.utils import US, dataset
-from openfisca_us_data.datasets.acs.raw_acs import RawACS
+from openfisca_us_data.datasets.acs.raw_spm_acs import RawSPMACS
 from pandas import DataFrame
 import h5py
 
@@ -19,10 +19,10 @@ def generate(year: int) -> None:
 
         # Prepare raw ACS tables
         year = int(year)
-        if year not in RawACS.years:
-            RawACS.generate(year)
+        if year not in RawSPMACS.years:
+            RawSPMACS.generate(year)
 
-        raw_data = RawACS.load(year)
+        raw_data = RawSPMACS.load(year)
         acs = h5py.File(ACS.file(year), mode="w")
 
         person, spm_unit, household = [
diff --git a/openfisca_us_data/datasets/acs/raw_acs.py b/openfisca_us_data/datasets/acs/raw_acs.py
index 361cac2..b2978c4 100644
--- a/openfisca_us_data/datasets/acs/raw_acs.py
+++ b/openfisca_us_data/datasets/acs/raw_acs.py
@@ -2,64 +2,42 @@
 import requests
 from io import BytesIO
 import pandas as pd
+from zipfile import ZipFile
 
 
 @dataset
 class RawACS:
     name = "raw_acs"
 
     def generate(year: int) -> None:
-        url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta"
+        """Download the ACS 1-Year PUMS CSVs and store them as HDF tables.
+
+        Args:
+            year (int): The year of the ACS 1-Year PUMS release to use.
+
+        Raises:
+            ValueError: If reading the CSVs or saving the tables fails.
+        """
+        url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year"
+        for prefix in ("pus", "hus"):
+            request = requests.get(f"{url}/csv_{prefix}.zip")
+            request.raise_for_status()
+            with ZipFile(BytesIO(request.content)) as archive:
+                archive.extractall(f"{year}_{prefix}")
+
         try:
             with pd.HDFStore(RawACS.file(year)) as storage:
-                person = pd.read_stata(url).fillna(0)
-                person.columns = person.columns.str.upper()
-                storage["person"] = person
-                storage["spm_unit"] = create_SPM_unit_table(person)
-                storage["household"] = create_household_table(person)
+                for table, prefix in (("person", "pus"), ("household", "hus")):
+                    # Each table is read from two CSV halves ("a" and "b").
+                    halves = [
+                        pd.read_csv(f"{year}_{prefix}/psam_{prefix}{half}.csv")
+                        for half in "ab"
+                    ]
+                    table_df = pd.concat(halves, ignore_index=True).fillna(0)
+                    table_df.columns = table_df.columns.str.upper()
+                    storage[table] = table_df
         except Exception as e:
             RawACS.remove(year)
             raise ValueError(
                 f"Attempted to extract and save the CSV files, but encountered an error: {e}"
-            )
-
-
-def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame:
-    SPM_UNIT_COLUMNS = [
-        "CAPHOUSESUB",
-        "CAPWKCCXPNS",
-        "CHILDCAREXPNS",
-        "EITC",
-        "ENGVAL",
-        "EQUIVSCALE",
-        "FEDTAX",
-        "FEDTAXBC",
-        "FICA",
-        "GEOADJ",
-        "MEDXPNS",
-        "NUMADULTS",
-        "NUMKIDS",
-        "NUMPER",
-        "POOR",
-        "POVTHRESHOLD",
-        "RESOURCES",
-        "SCHLUNCH",
-        "SNAPSUB",
-        "STTAX",
-        "TENMORTSTATUS",
-        "TOTVAL",
-        "WCOHABIT",
-        "WICVAL",
-        "WKXPNS",
-        "WUI_LT15",
-        "ID",
-    ]
-    return (
-        person[["SPM_" + column for column in SPM_UNIT_COLUMNS]]
-        .groupby(person.SPM_ID)
-        .first()
-    )
-
-
-def create_household_table(person: pd.DataFrame) -> pd.DataFrame:
-    return person[["SERIALNO", "ST", "PUMA"]].groupby(person.SERIALNO).first()
+            ) from e
diff --git a/openfisca_us_data/datasets/acs/raw_spm_acs.py b/openfisca_us_data/datasets/acs/raw_spm_acs.py
new file mode 100644
index 0000000..96d6cc9
--- /dev/null
+++ b/openfisca_us_data/datasets/acs/raw_spm_acs.py
@@ -0,0 +1,75 @@
+from openfisca_us_data.utils import *
+import requests
+from io import BytesIO
+import pandas as pd
+
+
+@dataset
+class RawSPMACS:
+    name = "raw_spm_acs"
+
+    def generate(year: int) -> None:
+        """Read the ACS SPM research file and store its tables.
+
+        Args:
+            year (int): The year of the SPM research file to download.
+
+        Raises:
+            ValueError: If reading the file or saving the tables fails.
+        """
+        url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta"
+        try:
+            with pd.HDFStore(RawSPMACS.file(year)) as storage:
+                person = pd.read_stata(url).fillna(0)
+                person.columns = person.columns.str.upper()
+                storage["person"] = person
+                storage["spm_unit"] = create_SPM_unit_table(person)
+                storage["household"] = create_household_table(person)
+        except Exception as e:
+            RawSPMACS.remove(year)
+            raise ValueError(
+                f"Attempted to extract and save the CSV files, but encountered an error: {e}"
+            ) from e
+
+
+def create_SPM_unit_table(person: pd.DataFrame) -> pd.DataFrame:
+    """Build the SPM unit table: first value of each SPM_ column per SPM_ID."""
+    SPM_UNIT_COLUMNS = [
+        "CAPHOUSESUB",
+        "CAPWKCCXPNS",
+        "CHILDCAREXPNS",
+        "EITC",
+        "ENGVAL",
+        "EQUIVSCALE",
+        "FEDTAX",
+        "FEDTAXBC",
+        "FICA",
+        "GEOADJ",
+        "MEDXPNS",
+        "NUMADULTS",
+        "NUMKIDS",
+        "NUMPER",
+        "POOR",
+        "POVTHRESHOLD",
+        "RESOURCES",
+        "SCHLUNCH",
+        "SNAPSUB",
+        "STTAX",
+        "TENMORTSTATUS",
+        "TOTVAL",
+        "WCOHABIT",
+        "WICVAL",
+        "WKXPNS",
+        "WUI_LT15",
+        "ID",
+    ]
+    return (
+        person[["SPM_" + column for column in SPM_UNIT_COLUMNS]]
+        .groupby(person.SPM_ID)
+        .first()
+    )
+
+
+def create_household_table(person: pd.DataFrame) -> pd.DataFrame:
+    """Build the household table: first SERIALNO, ST and PUMA per SERIALNO."""
+    return person[["SERIALNO", "ST", "PUMA"]].groupby(person.SERIALNO).first()
diff --git a/tests/test_imports.py b/tests/test_imports.py
index 5e482da..1346153 100644
--- a/tests/test_imports.py
+++ b/tests/test_imports.py
@@ -13,6 +13,9 @@ def test_CPS_import():
 def test_RawACS_import():
     from openfisca_us_data import RawACS
 
+def test_RawSPMACS_import():
+    from openfisca_us_data import RawSPMACS
+
 def test_ACS_import():
     from openfisca_us_data import ACS
 