Skip to content

Commit 1529509

Browse files
committed
Make vcat(dfs) do container & eltype promotion
1 parent 7c574c7 commit 1529509

File tree

5 files changed

+154
-81
lines changed

5 files changed

+154
-81
lines changed

src/abstractdataframe/abstractdataframe.jl

+49-16
Original file line numberDiff line numberDiff line change
@@ -318,37 +318,70 @@ Base.vcat(df::AbstractDataFrame) = df
318318
# Varargs form of `vcat`: gather the data frames into a vector and forward
# to the vector method, which does the actual concatenation.
function Base.vcat(dfs::AbstractDataFrame...)
    return vcat(collect(dfs))
end
319319

320320
# Vertically concatenate a vector of data frames into a new `DataFrame`.
#
# The result holds one column per distinct column name reported by `_colinfo`
# and `sum(nrow, dfs)` rows. Each output column is allocated with `similar`
# from the template container and element type chosen by `_colinfo`, then
# filled block by block from every data frame that has that column. Row ranges
# belonging to a data frame that lacks the column are never written, so they
# keep whatever `similar` produced (the tests expect NA there).
#
# An empty `dfs` returns an empty `DataFrame()` instead of throwing.
function Base.vcat{T<:AbstractDataFrame}(dfs::Vector{T})
    # `_colinfo` seeds itself from `dfs[1]`, so guard the empty case here.
    isempty(dfs) && return DataFrame()

    coltyps, colnams, similars = _colinfo(dfs)

    res = DataFrame()
    Nrow = sum(nrow, dfs)
    for j in 1:length(colnams)
        colnam = colnams[j]
        col = similar(similars[j], coltyps[j], Nrow)

        # `i` is the destination offset (1-based) of the current data
        # frame's rows; it advances even when the column is absent so that
        # later data frames land on the right rows.
        i = 1
        for df in dfs
            if haskey(df, colnam)
                copy!(col, i, df[colnam])
            end
            i += size(df, 1)
        end

        res[colnam] = col
    end
    res
end
351341

342+
# Container trait consulted by `_colinfo`: reports whether a column container
# is treated as nullable. Generic arrays are not; any `AbstractDataArray` is.
function _isnullable(::AbstractArray)
    return false
end

function _isnullable(::AbstractDataArray)
    return true
end
344+
# Zero-length DataArray that `_colinfo` substitutes as the `similar` template
# for a column whose own template container is not nullable but which needs a
# nullable result; only its container type matters, since the caller passes the
# element type and length to `similar` separately.
const EMPTY_DATA = DataArray(Void, 0)
345+
346+
# Gather the per-column metadata that `vcat` needs for a vector of data frames.
#
# Returns the tuple `(coltyps, colnams, similars)`, each with one entry per
# distinct column name (the first data frame's columns first, then new names
# appended as they are encountered):
#   coltyps  - element type; widened with `promote_type` whenever a later
#              data frame's column eltype is not a subtype of the current one
#   colnams  - column names read back from the accumulated index
#   similars - container used as the template for `similar`; replaced by
#              `EMPTY_DATA` when the template is not nullable but at least one
#              data frame lacks the column or holds it in a nullable container
#
# `dfs` must be non-empty: the first data frame seeds all of the above.
function _colinfo{T<:AbstractDataFrame}(dfs::Vector{T})
    df1 = dfs[1]
    colindex = copy(index(df1))
    coltyps = eltypes(df1)
    similars = collect(columns(df1))
    # nonnull_ct[j] counts the data frames that hold column j in a
    # NON-nullable container. The seed must use the same negated sense as
    # the updates below; otherwise a column that is non-nullable everywhere
    # can never reach `length(dfs)` and is always promoted to a DataArray.
    nonnull_ct = Int[!_isnullable(c) for c in columns(df1)]

    for i in 2:length(dfs)
        df = dfs[i]
        cns = _names(df)
        for j in 1:size(df, 2)
            col = df[j]
            cn, ct = cns[j], eltype(col)
            if haskey(colindex, cn)
                idx = colindex[cn]

                oldtyp = coltyps[idx]
                if !(ct <: oldtyp)
                    coltyps[idx] = promote_type(oldtyp, ct)
                end
                nonnull_ct[idx] += !_isnullable(col)
            else # new column
                push!(colindex, cn)
                push!(coltyps, ct)
                push!(similars, col)
                push!(nonnull_ct, !_isnullable(col))
            end
        end
    end

    # A count below `length(dfs)` means some data frame either lacks the
    # column or stores it in a nullable container, so a non-nullable
    # template cannot be used for the result.
    for j in 1:length(colindex)
        if nonnull_ct[j] < length(dfs) && !_isnullable(similars[j])
            similars[j] = EMPTY_DATA
        end
    end
    colnams = _names(colindex)

    coltyps, colnams, similars
end
384+
352385
##############################################################################
353386
##
354387
## Hashing

test/cat.jl

+104
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Tests for horizontal (`hcat`) and vertical (`vcat`) concatenation of
# DataFrames, including eltype and container promotion in `vcat`.
module TestCat
using Base.Test
using DataFrames

# ---------------------------------------------------------------------------
# hcat
# ---------------------------------------------------------------------------

int_col = @data([1, 2, NA, 4])
str_col = @data(["one", "two", NA, "four"])

df2 = DataFrame(Any[int_col, str_col])
df3 = DataFrame(Any[int_col])
df4 = convert(DataFrame, [1:4 1:4])
df5 = DataFrame(Any[@data([1,2,3,4]), str_col])

# Two frames: clashing column names get a numeric suffix.
wide = hcat(df3, df4)
@test size(wide, 2) == 3
@test names(wide) == [:x1, :x1_1, :x2]
@test isequal(wide[:x1], df3[:x1])
@test isequal(wide, [df3 df4])

# Three frames: same result as concatenating pairwise.
wide3 = hcat(df3, df4, df5)
@test names(wide3) == [:x1, :x1_1, :x2, :x1_2, :x2_1]
@test isequal(wide3, hcat(wide, df5))

# ---------------------------------------------------------------------------
# vcat
# ---------------------------------------------------------------------------

empty_df = DataFrame(Int, 0, 0)
df = DataFrame(Int, 4, 3)

# Row assignment
df[1, :] = df[1, :]
df[1:2, :] = df[1:2, :]

# Row assignment with a broadcast scalar
df[1, :] = 1

# Column assignment
df[1] = zeros(4)

# Column assignment with a broadcast scalar
df[:, 1] = 1
df[1] = 3
df[:x3] = 2

# Smoke checks: these calls only need to complete without throwing.
vcat(empty_df)
vcat(empty_df, empty_df)
vcat(empty_df, df)
vcat(df, empty_df)
vcat(df, df)
vcat(df, df, df)

other_df = deepcopy(df)
vcat(df, other_df)

# Mismatched column types must not raise
df[1] = zeros(Int, nrow(df))
vcat(df, other_df)

# Mismatched column names must not raise
names!(other_df, [:A, :B, :C])
vcat(df, other_df)

stacked = vcat(df4, df4)
@test size(stacked, 1) == 8
@test names(df4) == names(stacked)
@test isequal(stacked, [df4, df4])

stacked = vcat(df2, df3)
@test size(stacked) == (8,2)
@test names(df2) == names(stacked)
@test isna(stacked[8,:x2])

# Element types are promoted across inputs
@test eltypes(vcat(DataFrame(a = [1]), DataFrame(a = [2.1]))) == [Float64]

# Containers are promoted only as far as needed
dfa = DataFrame(a = @pdata([1, 2, 2]))
dfb = DataFrame(a = @pdata([2, 3, 4]))
dfc = DataFrame(a = @data([2, 3, 4]))
dfd = DataFrame(Any[2:4], [:a])
@test vcat(dfa, dfb)[:a] == @pdata([1, 2, 2, 2, 3, 4])
@test vcat(dfa, dfc)[:a] == @pdata([1, 2, 2, 2, 3, 4])
# ^^ the container may flip if container promotion happens in Base/DataArrays
expected = vcat(dfd, dfc)
@test vcat(dfc, dfd) == expected

# DataFrames with zero rows
dfc_empty = similar(dfc, 0)
@test vcat(dfd, dfc_empty, dfc) == expected
@test eltypes(vcat(dfd, dfc_empty)) == eltypes(expected)

# Columns missing from some inputs
rename!(dfd, :a, :b)
dfda = DataFrame(b = @data([2, 3, 4, NA, NA, NA]),
                 a = @pdata([NA, NA, NA, 1, 2, 2]))
@test isequal(vcat(dfd, dfa), dfda)

# Columns are aligned by name, not by position
@test isequal(vcat(dfda, dfd, dfa), vcat(dfda, dfda))
end

test/data.jl

-22
Original file line numberDiff line numberDiff line change
@@ -41,28 +41,6 @@ module TestData
4141
@test size(head(df6,2)) == (2, 3)
4242
# lots more to do
4343

44-
#test_group("hcat")
45-
dfc = hcat(df3, df4)
46-
@test size(dfc, 2) == 3
47-
@test names(dfc) == [:x1, :x1_1, :x2]
48-
@test isequal(dfc[:x1], df3[:x1])
49-
@test isequal(dfc, [df3 df4])
50-
51-
dfc3 = hcat(df3, df4, df5)
52-
@test names(dfc3) == [:x1, :x1_1, :x2, :x1_2, :x2_1]
53-
@test isequal(dfc3, hcat(dfc, df5))
54-
55-
#test_group("vcat")
56-
dfr = vcat(df4, df4)
57-
@test size(dfr, 1) == 8
58-
@test names(df4) == names(dfr)
59-
@test isequal(dfr, [df4, df4])
60-
61-
dfr = vcat(df2, df3)
62-
@test size(dfr) == (8,2)
63-
@test names(df2) == names(dfr)
64-
@test isna(dfr[8,:x2])
65-
6644
#test_group("assign")
6745
df6[3] = @data(["un", "deux", "troix", "quatre"])
6846
@test df6[1, 3] == "un"

test/runtests.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ using Base.Test
1010
using DataFrames
1111

1212
my_tests = ["utils.jl",
13+
"cat.jl",
1314
"data.jl",
1415
"index.jl",
1516
"dataframe.jl",
@@ -22,7 +23,6 @@ my_tests = ["utils.jl",
2223
"sort.jl",
2324
"grouping.jl",
2425
"join.jl",
25-
"vcat.jl",
2626
"iteration.jl",
2727
"duplicates.jl",
2828
"show.jl"]

test/vcat.jl

-42
This file was deleted.

0 commit comments

Comments
 (0)