Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ data: install tmd_files test
format:
black . -l 79

PYLINT_DISABLE = duplicate-code,invalid-name,too-many-instance-attributes,too-many-locals,too-many-arguments,too-many-positional-arguments,too-many-statements,too-many-branches,too-many-nested-blocks,broad-exception-caught,missing-function-docstring,missing-module-docstring

PYLINT_OPTIONS = --disable=$(PYLINT_DISABLE) --score=no --jobs=4 \
--check-quote-consistency=yes

# Declare `style` as a phony target so the linters always run, even if a
# file or directory named "style" exists alongside this Makefile.
# (`.PHONY=style` would only assign a variable named `.PHONY`; the special
# target must be written as a rule with a colon.)
.PHONY: style
style:
	@pycodestyle --ignore=E731,E712,W503 .
	@pylint $(PYLINT_OPTIONS) .

.PHONY=reweighting-visualisation
reweighting-visualisation:
tensorboard --logdir=tmd/storage/output/reweighting
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
"scipy",
"jax",
"black>=26.1.0",
"pycodestyle>=2.14.0",
"pylint>=3.3.8",
"pytest",
"pytest-xdist",
"jupyter-book",
Expand Down
1 change: 0 additions & 1 deletion tests/test_area_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"""

import yaml
import pandas as pd
import taxcalc as tc
from tmd.storage import STORAGE_FOLDER
from tmd.create_taxcalc_input_variables import TAXYEAR
Expand Down
1 change: 0 additions & 1 deletion tests/test_tax_revenue.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import yaml
import numpy as np
import taxcalc as tc
from tmd.storage import STORAGE_FOLDER
from tmd.create_taxcalc_input_variables import TAXYEAR

FIRST_CYR = 2023
Expand Down
6 changes: 3 additions & 3 deletions tmd/areas/create_area_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ def target_rmse(wght, target_matrix, target_array, out, delta=None):
continue
line = (
f">={interval.left:13.6f}, <{interval.right:13.6f}:"
f" {num:6d} {cum:6d} {num/tot:7.2%} {cum/tot:7.2%}\n"
f" {num:6d} {cum:6d} {num / tot:7.2%} {cum / tot:7.2%}\n"
)
out.write(line)
if cum == tot:
Expand Down Expand Up @@ -429,7 +429,7 @@ def weight_ratio_distribution(ratio, delta, out):
continue
line = (
f">={interval.left:13.6f}, <{interval.right:13.6f}:"
f" {num:6d} {cum:6d} {num/tot:7.2%} {cum/tot:7.2%}\n"
f" {num:6d} {cum:6d} {num / tot:7.2%} {cum / tot:7.2%}\n"
)
out.write(line)
if cum == tot:
Expand Down Expand Up @@ -620,7 +620,7 @@ def create_area_weights_file(
)
else:
res_summary = (
f">>> final delta loop exectime= {(time1-time0):.1f} secs"
f">>> final delta loop exectime= {(time1 - time0):.1f} secs"
f" iterations={res.nit} success={res.success}\n"
f">>> message: {res.message}\n"
f">>> L-BFGS-B optimized objective function value: {res.fun:.9e}\n"
Expand Down
45 changes: 33 additions & 12 deletions tmd/datasets/cps.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os
import yaml
from io import BytesIO
from typing import Type
from zipfile import ZipFile
import yaml
import requests
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -152,11 +152,26 @@ def generate(self) -> pd.DataFrame:
file_year_code = str(file_year)[-2:]

CPS_URL_BY_YEAR = {
2018: "https://www2.census.gov/programs-surveys/cps/datasets/2019/march/asecpub19csv.zip",
2019: "https://www2.census.gov/programs-surveys/cps/datasets/2020/march/asecpub20csv.zip",
2020: "https://www2.census.gov/programs-surveys/cps/datasets/2021/march/asecpub21csv.zip",
2021: "https://www2.census.gov/programs-surveys/cps/datasets/2022/march/asecpub22csv.zip",
2022: "https://www2.census.gov/programs-surveys/cps/datasets/2023/march/asecpub23csv.zip",
2018: (
"https://www2.census.gov/programs-surveys/cps/datasets/"
"2019/march/asecpub19csv.zip"
),
2019: (
"https://www2.census.gov/programs-surveys/cps/datasets/"
"2020/march/asecpub20csv.zip"
),
2020: (
"https://www2.census.gov/programs-surveys/cps/datasets/"
"2021/march/asecpub21csv.zip"
),
2021: (
"https://www2.census.gov/programs-surveys/cps/datasets/"
"2022/march/asecpub22csv.zip"
),
2022: (
"https://www2.census.gov/programs-surveys/cps/datasets/"
"2023/march/asecpub23csv.zip"
),
}

if self.time_period not in CPS_URL_BY_YEAR:
Expand Down Expand Up @@ -239,7 +254,9 @@ def generate(self) -> pd.DataFrame:
)
except Exception as e:
raise ValueError(
f"Attempted to extract and save the CSV files, but encountered an error: {e} (removed the intermediate dataset)."
"Attempted to extract and save the CSV files, "
f"but encountered an error: {e} "
"(removed the intermediate dataset)."
)

@staticmethod
Expand Down Expand Up @@ -366,7 +383,9 @@ def add_id_variables(

marital_unit_id = Series(marital_unit_id).rank(
method="dense"
) # Simplify to a natural number sequence with repetitions [0, 1, 1, 2, 3, ...]
# simplifies to a natural number sequence
# with repetitions [0, 1, 1, 2, 3, ...]
)

cps["person_marital_unit_id"] = marital_unit_id.values
cps["marital_unit_id"] = marital_unit_id.drop_duplicates().values
Expand Down Expand Up @@ -563,10 +582,11 @@ def add_personal_income_variables(
# 2) If they report any wage and salary income, allocate in this order:
# a) Traditional 401(k) contributions up to the limit
# b) Roth 401(k) contributions up to the limit
# c) IRA contributions up to the limit, split according to administrative fractions
# c) IRA contributions up to the limit, split according
# to administrative fractions
# d) Other retirement contributions
# Disregard reported pension contributions from people who report neither wage and salary
# nor self-employment income.
# Disregard reported pension contributions from people
# who report neither wage and salary nor self-employment income.
# Assume no 403(b) or 457 contributions for now.
LIMIT_401K_2022 = 20_500
LIMIT_401K_CATCH_UP_2022 = 6_500
Expand Down Expand Up @@ -620,7 +640,8 @@ def add_personal_income_variables(
np.minimum(remaining_retirement_contributions, roth_ira_limit),
0,
)
# Allocate capital gains into long-term and short-term based on aggregate split.
# Allocate capital gains into long-term and short-term
# based on aggregate split.
cps["long_term_capital_gains"] = person.CAP_VAL * (
p["long_term_capgain_fraction"]
)
Expand Down
13 changes: 6 additions & 7 deletions tmd/datasets/puf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import yaml
from tqdm import tqdm
import numpy as np
import pandas as pd
from tqdm import tqdm
from microdf import MicroDataFrame
from policyengine_core.data import Dataset
from policyengine_us.system import system
from tmd.storage import STORAGE_FOLDER
from tmd.datasets.uprate_puf import uprate_puf
from tmd.utils.imputation import Imputation
Expand All @@ -10,9 +12,6 @@
IMPUTATION_BETA_RNG_SEED,
W2_WAGES_SCALE,
)
from microdf import MicroDataFrame
from policyengine_core.data import Dataset
from policyengine_us.system import system

FILER_AGE_RNG = np.random.default_rng(seed=64963751)
SPOUSE_GENDER_RNG = np.random.default_rng(seed=83746519)
Expand Down Expand Up @@ -281,15 +280,15 @@ def generate(self, puf: pd.DataFrame, demographics: pd.DataFrame):
IDVARS = ["E17500", "E18400", "E18500", "E19200", "E19800"]
wght = puf.S006 / 100.0
for var in IDVARS:
print(f"%%15:{var}= {(puf[var]*wght).sum()*1e-9:.3f}")
print(f"%%15:{var}= {(puf[var] * wght).sum() * 1e-9:.3f}")

if self.time_period > 2015:
puf = uprate_puf(puf, 2015, self.time_period)

if itmded_dump:
wght = puf.S006 / 100.0
for var in IDVARS:
print(f"%%21:{var}= {(puf[var]*wght).sum()*1e-9:.3f}")
print(f"%%21:{var}= {(puf[var] * wght).sum() * 1e-9:.3f}")

puf = puf[puf.MARS != 0]

Expand Down
6 changes: 5 additions & 1 deletion tmd/datasets/taxcalc_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,11 @@ def map_to_tax_unit(var_array):

# correct case of df variable names for Tax-Calculator
tc_variable_metadata = yaml.safe_load(
open(STORAGE_FOLDER / "input" / "tc_variable_metadata.yaml", "r")
open(
STORAGE_FOLDER / "input" / "tc_variable_metadata.yaml",
"r",
encoding="utf-8",
)
)
renames = {}
for variable in df.columns:
Expand Down
1 change: 0 additions & 1 deletion tmd/datasets/tmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from tmd.utils.trace import trace1
from tmd.utils.taxcalc_utils import add_taxcalc_outputs
from tmd.utils.reweight import reweight
from tmd.storage import STORAGE_FOLDER


def create_tmd_2021():
Expand Down
6 changes: 4 additions & 2 deletions tmd/examination/2022/bootstrap_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ def bootstrap_sampling(outfile):
else:
fdf = gdf
print(f"SS_FRAC = {SS_FRAC:.2f}")
print(f"SS:wght(#M) = {fdf['s006'].sum() * 1e-6:.3f}")
print(f"SS:itax($B) = {(fdf['s006'] * fdf['iitax']).sum() * 1e-9:.3f}")
wght = fdf["s006"]
print(f"SS:wght(#M) = {wght.sum() * 1e-6:.3f}")
itax = fdf["iitax"]
print(f"SS:itax($B) = {(wght * itax).sum() * 1e-9:.3f}")

# compute sum of wght and wght*itax for each bootstrap sample
xdf = pd.DataFrame({"wght": fdf["s006"], "itax": fdf["iitax"]})
Expand Down
2 changes: 1 addition & 1 deletion tmd/storage/output/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
from pathlib import Path
import pandas as pd

output = Path(__file__).parent

Expand Down
31 changes: 13 additions & 18 deletions tmd/utils/imputation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path
from typing import List, Dict
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
Expand Down Expand Up @@ -154,19 +155,16 @@ def save(self, path: str):
Args:
path (str): The path to save the model to.
"""

import pickle

path = Path(path)
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "wb") as f:
# Store the models only in a dictionary.
data = dict(
models=self.models,
X_columns=self.X_columns,
X_category_mappings=self.X_category_mappings,
Y_columns=self.Y_columns,
)
data = {
"models": self.models,
"X_columns": self.X_columns,
"X_category_mappings": self.X_category_mappings,
"Y_columns": self.Y_columns,
}
pickle.dump(data, f)

@staticmethod
Expand All @@ -180,9 +178,6 @@ def load(path: str) -> "Imputation":
Returns:
Imputation: The imputation model.
"""

import pickle

imputation = Imputation()
with open(path, "rb") as f:
data = pickle.load(f)
Expand Down Expand Up @@ -234,6 +229,7 @@ class ManyToOneImputation:
"""Random number generator seed used by RandomForestRegressor."""
beta_rng_seed: int = None
"""Random number generator seed used to generate Beta variates."""
encode_categories: pd.DataFrame = None

def train(
self,
Expand Down Expand Up @@ -264,7 +260,7 @@ def train(
self.is_integer_coded = (
isinstance(y[0], str) or (y - y.round()).mean() < 1e-3
)
except Exception as e:
except Exception:
pass
self.model.fit(X, y, sample_weight=sample_weight)

Expand All @@ -284,7 +280,7 @@ def predict(
pd.Series: The predicted distribution of values for each input row.
"""
if isinstance(X, pd.DataFrame) and any(
[X[column].dtype == "O" for column in X.columns]
X[column].dtype == "O" for column in X.columns
):
X = self.encode_categories(X)
X = to_array(X)
Expand Down Expand Up @@ -344,8 +340,8 @@ def loss(mean_quantile):
pred_values = self.predict(input_df, mean_quantile)
pred_aggregate = (pred_values * weights).sum()
msg = (
f"PREDICTED: {pred_aggregate/1e9:.1f} "
f"(target: {target/1e9:.1f})"
f"PREDICTED: {pred_aggregate / 1e9:.1f} "
f"(target: {target / 1e9:.1f})"
)
print(msg)
return (pred_aggregate - target) ** 2, pred_aggregate
Expand All @@ -364,8 +360,7 @@ def loss(mean_quantile):
f"(loss: {loss_value:.4f})"
)
print(msg)
if loss_value < best_loss:
best_loss = loss_value
best_loss = min(loss_value, best_loss)
if pred_agg < target:
min_quantile = mean_quantile
else:
Expand Down
11 changes: 7 additions & 4 deletions tmd/utils/is_tax_filer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from policyengine_us.system import system


def is_tax_filer(
irs_gross_income: float,
filing_status: str,
Expand All @@ -16,9 +19,11 @@ def is_tax_filer(
filing_status: Filing status of the tax unit.
earned_income: Total earned income.
total_income_tax: Total income tax liability.
aged_blind_count: Number of aged or blind individuals in the tax unit.
aged_blind_count: Number of aged or blind individuals
in the tax unit.
standard_deduction: Standard deduction for the tax unit.
aged_blind_standard_deduction: Additional standard deduction for aged or blind individuals.
aged_blind_standard_deduction: Additional standard deduction
for aged or blind individuals.
exemption_amount: Exemption amount for the tax unit.

Returns:
Expand Down Expand Up @@ -55,8 +60,6 @@ def is_tax_filer(
return required_to_file or not_required_but_likely_filer


from policyengine_us.system import system

parameters = system.parameters.gov.irs

aged_blind_standard_deduction = (
Expand Down
1 change: 0 additions & 1 deletion tmd/utils/pension_contributions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import numpy as np
from policyengine_us import Microsimulation
from tmd.datasets.cps import CPS_2021
from tmd.utils.imputation import Imputation
Expand Down
Loading