Skip to content

Commit 8841614

Browse files
author
Christopher Doris
committed
stringify any pandas column names
1 parent a33af30 commit 8841614

File tree

1 file changed

+67
-89
lines changed

1 file changed

+67
-89
lines changed

src/pywrap/PyPandasDataFrame.jl

Lines changed: 67 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
11
"""
2-
PyPandasDataFrame(x; indexname=nothing, columntypes=())
2+
PyPandasDataFrame(x; [indexname::Union{Nothing,Symbol}], [columnnames::Function], [columntypes::Function])
33
44
Wraps the pandas DataFrame `x` as a Tables.jl-compatible table.
55
6-
`indexname` is the name of the column to contain the index. It may be `nothing` to exclude the index.
7-
8-
`columntypes` is a mapping of column names to column element types, in case automatic deduction does not work.
6+
- `indexname`: The name of the column containing the index. The default is `nothing`, meaning
7+
to exclude the index.
8+
- `columnnames`: A function mapping the Python column name (a `Py`) to the Julia one (a
9+
`Symbol`). The default is `x -> Symbol(x)`.
10+
- `columntypes`: A function taking the column name (a `Symbol`) and returning either the
11+
desired element type of the column, or `nothing` to indicate automatic inference.
912
"""
1013
struct PyPandasDataFrame <: PyTable
1114
py::Py
12-
indexname::Union{String,Nothing}
13-
columntypes::Dict{String,Type}
14-
function PyPandasDataFrame(x; indexname=nothing, columntypes=())
15-
if indexname !== nothing
16-
indexname = convert(String, indexname)
17-
end
18-
columntypes = Dict{String,Type}(columntypes)
19-
new(Py(x), indexname, columntypes)
15+
indexname::Union{Symbol,Nothing}
16+
columnnames::Function # Py -> Symbol
17+
columntypes::Function # Symbol -> Union{Type,Nothing}
18+
function PyPandasDataFrame(x; indexname::Union{Symbol,Nothing}=nothing, columnnames::Function=x->Symbol(x), columntypes::Function=x->nothing)
19+
new(Py(x), indexname, columnnames, columntypes)
2020
end
2121
end
2222
export PyPandasDataFrame
@@ -27,79 +27,6 @@ pydel!(x::PyPandasDataFrame) = pydel!(x.py)
2727

2828
pyconvert_rule_pandasdataframe(::Type{PyPandasDataFrame}, x::Py) = pyconvert_return(PyPandasDataFrame(x))
2929

30-
### Dict interface
31-
32-
function Base.keys(df::PyPandasDataFrame)
33-
ans = String[]
34-
@py for c in df.columns
35-
if isinstance(c, str)
36-
@jl push!(ans, pyconvert(String, c))
37-
@del c
38-
else
39-
@jl error("name of column '$c' is not a string")
40-
end
41-
end
42-
if df.indexname !== nothing
43-
if df.indexname in ans
44-
error("dataframe already includes a column called '$(df.indexname)'")
45-
else
46-
pushfirst!(ans, df.indexname)
47-
end
48-
end
49-
return ans
50-
end
51-
52-
Base.haskey(df::PyPandasDataFrame, k::String) = (df.indexname !== nothing && k == df.indexname) || @py k in df.columns
53-
Base.haskey(df::PyPandasDataFrame, k::Symbol) = haskey(df, string(k))
54-
Base.haskey(df::PyPandasDataFrame, k) = haskey(df, convert(String, k))
55-
56-
function Base.getindex(df::PyPandasDataFrame, k::String)
57-
# get the given column
58-
if df.indexname !== nothing && k == df.indexname
59-
c = @py df.index
60-
else
61-
c = @py df[k]
62-
end
63-
# convert to a vector
64-
if haskey(df.columntypes, k)
65-
ans = pyconvert_and_del(AbstractVector{df.columntypes[k]}, c)
66-
else
67-
ans = pyconvert_and_del(AbstractVector, c)
68-
# narrow the type
69-
ans = identity.(ans)
70-
# convert any Py to something more useful
71-
if Py <: eltype(ans)
72-
ans = [x isa Py ? pyconvert(Any, x) : x for x in ans]
73-
end
74-
# convert NaN to missing
75-
if eltype(ans) != Float64 && Float64 <: eltype(ans)
76-
ans = [x isa Float64 && isnan(x) ? missing : x for x in ans]
77-
end
78-
end
79-
return ans :: AbstractVector
80-
end
81-
Base.getindex(df::PyPandasDataFrame, k::Symbol) = getindex(df, string(k))
82-
Base.getindex(df::PyPandasDataFrame, k) = getindex(df, convert(String, k))
83-
84-
Base.get(df::PyPandasDataFrame, k, d) = haskey(df, k) ? df[k] : d
85-
86-
Base.values(df::PyPandasDataFrame) = (df[k] for k in keys(df))
87-
88-
Base.pairs(df::PyPandasDataFrame) = (Pair{String, AbstractVector}(k, df[k]) for k in keys(df))
89-
90-
Base.getproperty(df::PyPandasDataFrame, k::Symbol) = hasfield(PyPandasDataFrame, k) ? getfield(df, k) : df[k]
91-
Base.getproperty(df::PyPandasDataFrame, k::String) = getproperty(df, Symbol(k))
92-
93-
function Base.propertynames(df::PyPandasDataFrame, private::Bool=false)
94-
ans = Symbol.(keys(df))
95-
if private
96-
append!(ans, fieldnames(PyPandasDataFrame))
97-
else
98-
push!(ans, :indexname, :columntypes)
99-
end
100-
return ans
101-
end
102-
10330
### Show
10431

10532
function Base.show(io::IO, mime::MIME"text/plain", df::PyPandasDataFrame)
@@ -117,9 +44,60 @@ Base.showable(mime::MIME, df::PyPandasDataFrame) = pyshowable(mime, df)
11744
### Tables
11845

11946
Tables.istable(::Type{PyPandasDataFrame}) = true
47+
12048
Tables.columnaccess(::Type{PyPandasDataFrame}) = true
121-
function Tables.columns(df::PyPandasDataFrame)
122-
ns = Tuple(Symbol.(keys(df)))
123-
cs = values(df)
124-
return NamedTuple{ns}(cs)
49+
50+
"""
    Tables.columns(df::PyPandasDataFrame)

Tables.jl column-access entry point for the wrapper. Hands the wrapper together with
its stored `columnnames` and `columntypes` functions to `_columns` and returns the
result.
"""
function Tables.columns(df::PyPandasDataFrame)
    return _columns(df, df.columnnames, df.columntypes)
end
51+
52+
"""
    _columns(df, columnnames, columntypes)

Build a `Tables.DictColumnTable` from the pandas DataFrame wrapped by `df`.

# Arguments
- `df`: wrapper providing `df.py` (the Python DataFrame) and `df.indexname`.
- `columnnames`: function called on each Python column name; must return a `Symbol`.
- `columntypes`: function called on each (already de-duplicated) Julia column name;
  must return either an element type or `nothing` to request automatic inference.

# Details
- If `df.indexname !== nothing`, the index is emitted as the first column under that
  name.
- Repeated column names are made unique by appending `_N`. `N` starts at the
  occurrence count and is increased until the resulting name is used neither by an
  original column nor by an earlier renamed one, so the output names are always
  distinct.
- When a column has no explicit type, its element type is narrowed, remaining `Py`
  elements are converted with `pyconvert(Any, x)`, and `NaN`s are replaced by
  `missing` only if the narrowed element type is a proper supertype of `Float64`
  (i.e. the column is not purely `Float64`).
"""
function _columns(df, columnnames, columntypes)
    # collect columns
    colnames = Symbol[]
    pycolumns = Py[]
    if df.indexname !== nothing
        push!(colnames, df.indexname)
        push!(pycolumns, df.py.index)
    end
    for pycolname in df.py.columns
        colname = columnnames(pycolname)::Symbol
        pycolumn = df.py[pycolname]
        push!(colnames, colname)
        push!(pycolumns, pycolumn)
    end
    # ensure column names are unique by appending a _N suffix
    # Every original name is reserved up front so that a generated `name_N` can never
    # clash with a column that is literally called `name_N` (before or after it).
    takennames = Set{Symbol}(colnames)
    colnamecount = Dict{Symbol,Int}()
    for (i, colname) in pairs(colnames)
        n = get(colnamecount, colname, 0) + 1
        if n > 1
            newname = Symbol(colname, :_, n)
            while newname in takennames
                n += 1
                newname = Symbol(colname, :_, n)
            end
            push!(takennames, newname)
            colnames[i] = newname
        end
        colnamecount[colname] = n
    end
    # convert columns to vectors
    columns = AbstractVector[]
    coltypes = Type[]
    for (colname, pycolumn) in zip(colnames, pycolumns)
        coltype = columntypes(colname)::Union{Nothing,Type}
        if coltype !== nothing
            column = pyconvert_and_del(AbstractVector{coltype}, pycolumn)
        else
            column = pyconvert_and_del(AbstractVector, pycolumn)
            # narrow the type
            column = identity.(column)
            # convert any Py to something more useful
            if Py <: eltype(column)
                column = [x isa Py ? pyconvert(Any, x) : x for x in column]
            end
            # convert NaN to missing (only for mixed columns, not pure Float64 ones)
            if eltype(column) != Float64 && Float64 <: eltype(column)
                column = [x isa Float64 && isnan(x) ? missing : x for x in column]
            end
        end
        push!(columns, column)
        push!(coltypes, eltype(column))
    end
    # output a table
    # TODO: realising columns to vectors could be done lazily with a different table type
    schema = Tables.Schema(colnames, coltypes)
    coldict = Dict(k => v for (k, v) in zip(colnames, columns))
    return Tables.DictColumnTable(schema, coldict)
end

0 commit comments

Comments
 (0)