This repository was archived by the owner on Mar 11, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmake.jl
More file actions
112 lines (104 loc) · 3.58 KB
/
make.jl
File metadata and controls
112 lines (104 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Generate example datasets as compressed CSV files
# See data/README.md for the sources of the input data files
# To regenerate the .csv.gz files:
# 1) Have all input files ready in the data folder
# 2) Instantiate the package environment for data/src
# 3) Run this script and call `make()` with the root folder as working directory
using CSV, CodecBzip2, CodecZlib, DataFrames, DataValues, RData, ReadStat
"""
    _to_array(d::DataValueArray{T}) where T

Copy the contents of `d` into a plain `Array` of the same size.

The result is an `Array{T}` when every element of `d` holds a value, and an
`Array{Union{T,Missing}}` otherwise, with `missing` stored in place of each
element that has no value.
"""
function _to_array(d::DataValueArray{T}) where T
    # Decide the element type before allocating. Widening a partially filled
    # `Array{T}` afterwards would read slots that are still uninitialized,
    # which throws `UndefRefError` for non-bits element types such as `String`.
    if all(hasvalue, d)
        a = Array{T}(undef, size(d))
        @inbounds for i in eachindex(d)
            a[i] = d[i].value
        end
        return a
    end
    b = Array{Union{T,Missing}}(undef, size(d))
    @inbounds for i in eachindex(d)
        v = d[i]
        b[i] = hasvalue(v) ? v.value : missing
    end
    return b
end
"""
    _get_columns(data::ReadStatDataFrame, names::Vector{Symbol})

Return the columns of `data` whose headers are listed in `names`, in the order
given by `names`, each converted with `_to_array`.

A `KeyError` is thrown if an element of `names` is not found in `data.headers`.
"""
function _get_columns(data::ReadStatDataFrame, names::Vector{Symbol})
    # Map each header to its position within `data.data`
    position = Dict(zip(data.headers, eachindex(data.headers)))
    return AbstractVector[_to_array(data.data[position[n]]) for n in names]
end
# The steps for preparing data follow Sun and Abraham (2020)
"""
    hrs()

Build the HRS example dataset from `data/HRS_long.dta` and write it to
`data/hrs.csv.gz`.

Steps performed:
1. Read the selected columns and drop rows with a missing `wave`, `age_hosp`
   or `evt_time`.
2. Keep rows with `wave >= 7` and `age_hosp <= 59`.
3. Keep households that have exactly 5 remaining rows and whose smallest
   `evt_time` is negative.
4. Replace `wave_hosp` by its within-household minimum (ignoring missing values).
5. Recode the indicator columns so that `100` becomes `1` and anything else `0`.
6. Convert every column except `oop_spend`, `riearnsemp` and `rwthh` to `Int`.
7. Replace `hhidpn` by consecutive integers in order of first appearance.

Returns whatever the `open(...) do ... end` call that writes the file returns.
"""
function hrs()
    raw = read_dta("data/HRS_long.dta")
    names = [:hhidpn, :wave, :wave_hosp, :evt_time, :oop_spend, :riearnsemp, :rwthh,
        :male, :spouse, :white, :black, :hispanic, :age_hosp]
    cols = _get_columns(raw, names)
    df = dropmissing!(DataFrame(cols, names), [:wave, :age_hosp, :evt_time])
    df = df[(df.wave .>= 7) .& (df.age_hosp .<= 59), :]
    # Must count wave after the above selection
    transform!(groupby(df, :hhidpn), nrow => :nwave, :evt_time => minimum => :evt_time)
    df = df[(df.nwave .== 5) .& (df.evt_time .< 0), :]
    transform!(groupby(df, :hhidpn), :wave_hosp => minimum∘skipmissing => :wave_hosp)
    select!(df, Not([:nwave, :evt_time, :age_hosp]))
    for n in (:male, :spouse, :white, :black, :hispanic)
        df[!, n] .= ifelse.(df[!, n] .== 100, 1, 0)
    end
    # These columns are left in their source type; everything else becomes `Int`.
    # The weight column is `:rwthh` (as in `names` above); the previous spelling
    # `:wrthh` never matched, so the weights were coerced to `Int` as well.
    noninteger = (:oop_spend, :riearnsemp, :rwthh)
    for n in propertynames(df)
        if !(n in noninteger)
            df[!, n] .= convert(Array{Int}, df[!, n])
        end
    end
    # Replace the original hh index with enumeration
    ids = Dict{Int,Int}()
    hhidpn = df.hhidpn
    for i in eachindex(hhidpn)
        # An unseen id receives the next consecutive number; `length(ids) + 1`
        # is evaluated before the new key is inserted.
        hhidpn[i] = get!(ids, hhidpn[i], length(ids) + 1)
    end
    return open(GzipCompressorStream, "data/hrs.csv.gz", "w") do stream
        CSV.write(stream, df)
    end
end
# Produce a subset of nsw_long from the DRDID R package
"""
    nsw()

Build the NSW example dataset from `data/ec675_nsw.tab` and write it to
`data/nsw.csv.gz`.

Rows are kept when `treated` equals `0` or `sample` equals `2`. The indicator
`experimental` is `0` where `treated` is missing and `1` otherwise. The earnings
columns `re75` and `re78` are stacked into a single column `re` with an
accompanying `year` column, and the result is sorted by the generated `id`.

Returns whatever the `open(...) do ... end` call that writes the file returns.
"""
function nsw()
    raw = DataFrame(CSV.File("data/ec675_nsw.tab", delim='\t'))
    keep = (isequal.(raw.treated, 0)) .| (raw.sample .== 2)
    wide = raw[keep, Not([:dwincl, :early_ra])]
    wide.experimental = ifelse.(ismissing.(wide.treated), 0, 1)
    select!(wide, Not([:treated, :sample]))
    wide.id = 1:nrow(wide)
    # Convert the data to long format
    long = stack(wide, [:re75, :re78])
    is75 = long.variable .== "re75"
    long.year = ifelse.(is75, 1975, 1978)
    select!(long, Not(:variable))
    rename!(long, :value => :re)
    sort!(long, :id)
    return open(GzipCompressorStream, "data/nsw.csv.gz", "w") do stream
        CSV.write(stream, long)
    end
end
# Convert mpdta from the did R package to csv format
"""
    mpdta()

Read the `mpdta` object from `data/mpdta.rda`, convert its `first_treat` column
to `Vector{Int}`, drop the `treat` column and write the result to
`data/mpdta.csv.gz`.

Returns whatever the `open(...) do ... end` call that writes the file returns.
"""
function mpdta()
    objects = load("data/mpdta.rda")
    df = objects["mpdta"]
    first_treat = convert(Vector{Int}, df.first_treat)
    df.first_treat = first_treat
    select!(df, Not(:treat))
    return open(GzipCompressorStream, "data/mpdta.csv.gz", "w") do stream
        CSV.write(stream, df)
    end
end
"""
    make()

Regenerate all example datasets by running `hrs`, `nsw` and `mpdta` in that
order. The value returned by `mpdta()` is returned.
"""
function make()
    hrs()
    nsw()
    return mpdta()
end