"""
    PyPandasDataFrame(x; [indexname::Union{Nothing,Symbol}], [columnnames::Function], [columntypes::Function])

Wraps the pandas DataFrame `x` as a Tables.jl-compatible table.

# Keywords
- `indexname`: The name of the column containing the index. The default is `nothing`, meaning
  to exclude the index.
- `columnnames`: A function mapping the Python column name (a `Py`) to the Julia one (a
  `Symbol`). The default is `x -> Symbol(x)`.
- `columntypes`: A function taking the column name (a `Symbol`) and returning either the
  desired element type of the column, or `nothing` to indicate automatic inference.

When the table is materialised, column names that occur more than once are made unique by
appending a `_N` suffix.
"""
struct PyPandasDataFrame <: PyTable
    py::Py
    indexname::Union{Symbol,Nothing}
    columnnames::Function # Py -> Symbol
    columntypes::Function # Symbol -> Union{Type,Nothing}
    function PyPandasDataFrame(
        x;
        indexname::Union{Symbol,Nothing}=nothing,
        columnnames::Function=name -> Symbol(name),
        columntypes::Function=name -> nothing,
    )
        return new(Py(x), indexname, columnnames, columntypes)
    end
end
2222export PyPandasDataFrame
@@ -27,79 +27,6 @@ pydel!(x::PyPandasDataFrame) = pydel!(x.py)
2727
# Conversion rule: wrap the Python object `x` in a `PyPandasDataFrame` (default keyword
# options) and hand the wrapper to `pyconvert_return`.
function pyconvert_rule_pandasdataframe(::Type{PyPandasDataFrame}, x::Py)
    wrapped = PyPandasDataFrame(x)
    return pyconvert_return(wrapped)
end
2929
30- # ## Dict interface
31-
32- function Base. keys (df:: PyPandasDataFrame )
33- ans = String[]
34- @py for c in df. columns
35- if isinstance (c, str)
36- @jl push! (ans, pyconvert (String, c))
37- @del c
38- else
39- @jl error (" name of column '$c ' is not a string" )
40- end
41- end
42- if df. indexname != = nothing
43- if df. indexname in ans
44- error (" dataframe already includes a column called '$(df. indexname) '" )
45- else
46- pushfirst! (ans, df. indexname)
47- end
48- end
49- return ans
50- end
51-
52- Base. haskey (df:: PyPandasDataFrame , k:: String ) = (df. indexname != = nothing && k == df. indexname) || @py k in df. columns
53- Base. haskey (df:: PyPandasDataFrame , k:: Symbol ) = haskey (df, string (k))
54- Base. haskey (df:: PyPandasDataFrame , k) = haskey (df, convert (String, k))
55-
56- function Base. getindex (df:: PyPandasDataFrame , k:: String )
57- # get the given column
58- if df. indexname != = nothing && k == df. indexname
59- c = @py df. index
60- else
61- c = @py df[k]
62- end
63- # convert to a vector
64- if haskey (df. columntypes, k)
65- ans = pyconvert_and_del (AbstractVector{df. columntypes[k]}, c)
66- else
67- ans = pyconvert_and_del (AbstractVector, c)
68- # narrow the type
69- ans = identity .(ans)
70- # convert any Py to something more useful
71- if Py <: eltype (ans)
72- ans = [x isa Py ? pyconvert (Any, x) : x for x in ans]
73- end
74- # convert NaN to missing
75- if eltype (ans) != Float64 && Float64 <: eltype (ans)
76- ans = [x isa Float64 && isnan (x) ? missing : x for x in ans]
77- end
78- end
79- return ans :: AbstractVector
80- end
81- Base. getindex (df:: PyPandasDataFrame , k:: Symbol ) = getindex (df, string (k))
82- Base. getindex (df:: PyPandasDataFrame , k) = getindex (df, convert (String, k))
83-
84- Base. get (df:: PyPandasDataFrame , k, d) = haskey (df, k) ? df[k] : d
85-
86- Base. values (df:: PyPandasDataFrame ) = (df[k] for k in keys (df))
87-
88- Base. pairs (df:: PyPandasDataFrame ) = (Pair {String, AbstractVector} (k, df[k]) for k in keys (df))
89-
90- Base. getproperty (df:: PyPandasDataFrame , k:: Symbol ) = hasfield (PyPandasDataFrame, k) ? getfield (df, k) : df[k]
91- Base. getproperty (df:: PyPandasDataFrame , k:: String ) = getproperty (df, Symbol (k))
92-
93- function Base. propertynames (df:: PyPandasDataFrame , private:: Bool = false )
94- ans = Symbol .(keys (df))
95- if private
96- append! (ans, fieldnames (PyPandasDataFrame))
97- else
98- push! (ans, :indexname , :columntypes )
99- end
100- return ans
101- end
102-
10330# ## Show
10431
10532function Base. show (io:: IO , mime:: MIME"text/plain" , df:: PyPandasDataFrame )
@@ -117,9 +44,60 @@ Base.showable(mime::MIME, df::PyPandasDataFrame) = pyshowable(mime, df)
11744# ## Tables
11845
# Tables.jl traits: a `PyPandasDataFrame` is a table ...
function Tables.istable(::Type{PyPandasDataFrame})
    return true
end

# ... and it provides column-wise access.
function Tables.columnaccess(::Type{PyPandasDataFrame})
    return true
end
# Materialise the data frame as a Tables.jl column table by delegating to `_columns`.
# The name and type mapping functions are read from the wrapper and passed as separate
# arguments (presumably so that `_columns` can specialise on their concrete types, since the
# struct fields are only typed as `Function`).
function Tables.columns(df::PyPandasDataFrame)
    namefn = df.columnnames
    typefn = df.columntypes
    return _columns(df, namefn, typefn)
end
51+
# Rename the entries of `colnames` in place so that every name is distinct, and return it.
#
# The first occurrence of a name is kept as-is; the N-th occurrence becomes `name_N`. If that
# candidate is already taken (for example the frame has columns `a`, `a` and `a_2`), the
# suffix is increased until an unused name is found, so the result never contains duplicates.
function _uniquecolnames!(colnames::Vector{Symbol})
    used = Set{Symbol}()
    counts = Dict{Symbol,Int}()
    for i in eachindex(colnames)
        base = colnames[i]
        n = get(counts, base, 0) + 1
        name = n == 1 ? base : Symbol(base, :_, n)
        while name in used
            n += 1
            name = Symbol(base, :_, n)
        end
        counts[base] = n
        push!(used, name)
        colnames[i] = name
    end
    return colnames
end

# Convert the wrapped pandas DataFrame `df` into a `Tables.DictColumnTable`.
#
# - `columnnames` maps each Python column name to a `Symbol`.
# - `columntypes` maps each (already de-duplicated) column name to the desired element type,
#   or to `nothing` to request automatic inference.
#
# If `df.indexname` is not `nothing`, the index is emitted as the first column under that name.
function _columns(df, columnnames, columntypes)
    # collect columns
    colnames = Symbol[]
    pycolumns = Py[]
    if df.indexname !== nothing
        push!(colnames, df.indexname)
        push!(pycolumns, df.py.index)
    end
    for pycolname in df.py.columns
        colname = columnnames(pycolname)::Symbol
        pycolumn = df.py[pycolname]
        push!(colnames, colname)
        push!(pycolumns, pycolumn)
    end
    # ensure column names are unique by appending a _N suffix; the names key a Dict below,
    # so a duplicate would silently drop a column
    _uniquecolnames!(colnames)
    # convert columns to vectors
    columns = AbstractVector[]
    coltypes = Type[]
    for (colname, pycolumn) in zip(colnames, pycolumns)
        coltype = columntypes(colname)::Union{Nothing,Type}
        if coltype !== nothing
            column = pyconvert_and_del(AbstractVector{coltype}, pycolumn)
        else
            column = pyconvert_and_del(AbstractVector, pycolumn)
            # narrow the type
            column = identity.(column)
            # convert any Py to something more useful
            if Py <: eltype(column)
                column = [x isa Py ? pyconvert(Any, x) : x for x in column]
            end
            # convert NaN to missing (only for mixed-type columns; pure Float64 columns
            # keep their NaNs)
            if eltype(column) != Float64 && Float64 <: eltype(column)
                column = [x isa Float64 && isnan(x) ? missing : x for x in column]
            end
        end
        push!(columns, column)
        push!(coltypes, eltype(column))
    end
    # output a table
    # TODO: realising columns to vectors could be done lazily with a different table type
    schema = Tables.Schema(colnames, coltypes)
    coldict = Dict(k => v for (k, v) in zip(colnames, columns))
    return Tables.DictColumnTable(schema, coldict)
end
0 commit comments