JuliaParallel · jpsamaroo · Oct 15, 2021 · Aug 29, 2021 · Aug 31, 2021 · Sep 1, 2021
diff --git a/docs/src/dtable.md b/docs/src/dtable.md
@@ -34,7 +34,6 @@ julia> d = DTable(table, 2)
 DTable with 3 partitions
 Tabletype: NamedTuple
 
-
 julia> fetch(d)
 (a = [1, 2, 3, 4, 5], b = [6, 7, 8, 9, 10])
 ```
@@ -51,8 +50,7 @@ julia> files = ["1.csv", "2.csv", "3.csv"];
 
 julia> d = DTable(CSV.File, files)
 DTable with 3 partitions
-Tabletype: unknown (use `tabletype(::DTable)`)
-
+Tabletype: unknown (use `tabletype!(::DTable)`)
 
 julia> tabletype(d)
 NamedTuple
@@ -73,7 +71,6 @@ julia> d = DTable(table, 2)
 DTable with 3 partitions
 Tabletype: NamedTuple
 
-
 julia> fetch(d)
 (a = [1, 2, 3, 4, 5], b = [6, 7, 8, 9, 10])
 ```
@@ -165,12 +162,143 @@ julia> fetch(r)
 (v = 5500,)
 ```
 
+# Dagger.groupby interface
+
+A `DTable` can be grouped, which results in the creation of a `GDTable`.
+A distinct set of values contained in one or more columns can be used as grouping keys.
+If a transformation of a row needs to be performed in order to obtain the grouping key, there's
+also an option to provide a custom function returning a key, which is applied per row.
+
+The set of keys the `GDTable` is grouped by can be obtained using
+the `keys(gd::GDTable)` function. To get a fragment of the `GDTable` containing
+records belonging to a single key, the `getindex(gd::GDTable, key)` function can be used.
+
+```julia
+julia> d = DTable((a=shuffle(repeat('a':'d', inner=4, outer=4)),b=repeat(1:4, 16)), 4)
+DTable with 16 partitions
+Tabletype: NamedTuple
+
+julia> Dagger.groupby(d, :a)
+GDTable with 4 partitions and 4 keys
+Tabletype: NamedTuple
+Grouped by: [:a]
+
+julia> Dagger.groupby(d, [:a, :b])
+GDTable with 16 partitions and 16 keys
+Tabletype: NamedTuple
+Grouped by: [:a, :b]
+
+julia> Dagger.groupby(d, row -> row.a + row.b)
+GDTable with 7 partitions and 7 keys
+Tabletype: NamedTuple
+Grouped by: custom function
+
+julia> g = Dagger.groupby(d, :a); keys(g)
+KeySet for a Dict{Char, Vector{UInt64}} with 4 entries. Keys:
+  'c'
+  'd'
+  'a'
+  'b'
+
+julia> g['c']
+DTable with 1 partitions
+Tabletype: NamedTuple
+```
+
+## GDTable operations
+
+Operations such as `map`, `filter` and `reduce` can be performed on a `GDTable`.
+
+```julia
+julia> g = Dagger.groupby(d, [:a, :b])
+GDTable with 16 partitions and 16 keys
+Tabletype: NamedTuple
+Grouped by: [:a, :b]
+
+julia> f = filter(x -> x.a != 'd', g)
+GDTable with 16 partitions and 16 keys
+Tabletype: NamedTuple
+Grouped by: [:a, :b]
+
+julia> trim!(f)
+GDTable with 12 partitions and 12 keys
+Tabletype: NamedTuple
+Grouped by: [:a, :b]
+
+julia> m = map(r -> (a = r.a, b = r.b, c = r.b .- 3), f)
+GDTable with 12 partitions and 12 keys
+Tabletype: NamedTuple
+Grouped by: [:a, :b]
+
+julia> r = reduce(*, m)
+EagerThunk (running)
+
+julia> DataFrame(fetch(r))
+12×5 DataFrame
+ Row │ a     b      result_a  result_b  result_c 
+     │ Char  Int64  String    Int64     Int64    
+─────┼───────────────────────────────────────────
+   1 │ a         1  aaaa             1        16
+   2 │ c         3  ccc             27         0
+   3 │ a         3  aa               9         0
+   4 │ b         4  bbbb           256         1
+   5 │ c         4  cccc           256         1
+   6 │ b         2  bbbb            16         1
+   7 │ b         1  bbbb             1        16
+   8 │ a         2  aaa              8        -1
+   9 │ a         4  aaaaaaa      16384         1
+  10 │ b         3  bbbb            81         0
+  11 │ c         2  ccccc           32        -1
+  12 │ c         1  cccc             1        16
+```
+
+## Iterating over a GDTable
+
+`GDTable` can be iterated over and each element returned will be a pair of key
+and a `DTable` containing all rows associated with that grouping key.
+
+```julia
+julia> d = DTable((a=repeat('a':'b', inner=2),b=1:4), 2)
+DTable with 2 partitions
+Tabletype: NamedTuple
+
+julia> g = Dagger.groupby(d, :a)
+GDTable with 2 partitions and 2 keys
+Tabletype: NamedTuple
+Grouped by: [:a]
+
+julia> for (key, dt) in g
+           println("Key: $key")
+           println(fetch(dt, DataFrame))
+       end
+Key: a
+2×2 DataFrame
+ Row │ a     b     
+     │ Char  Int64 
+─────┼─────────────
+   1 │ a         1
+   2 │ a         2
+Key: b
+2×2 DataFrame
+ Row │ a     b     
+     │ Char  Int64 
+─────┼─────────────
+   1 │ b         3
+   2 │ b         4
+```
+
 # API
 
 ```@docs
 DTable
 tabletype
+tabletype!
+trim
+trim!
 map
 filter
 reduce
+groupby
+keys
+getindex
 ```
diff --git a/src/Dagger.jl b/src/Dagger.jl
@@ -55,7 +55,9 @@ include("ui/gantt-common.jl")
 include("ui/gantt-text.jl")
 
 include("table/dtable.jl")
+include("table/gdtable.jl")
 include("table/operations.jl")
+include("table/groupby.jl")
 
 include("lib/logging-extras.jl")
 

diff --git a/src/table/gdtable.jl b/src/table/gdtable.jl
@@ -0,0 +1,148 @@
+import Base: keys, iterate, length, getindex
+
+"""
+    GDTable
+
+Structure representing a grouped `DTable`.
+It wraps a `DTable` object and provides additional information on how the `GDTable` is grouped.
+To represent the grouping a `cols` field is used, which contains the column symbols used for
+grouping, together with an `index`, which allows efficient lookup of the partitions grouped under a single key.
+"""
+mutable struct GDTable
+    # The wrapped table; its chunks are what the index entries point into.
+    dtable::DTable
+    # Column symbols used for grouping; `nothing` is reported by `grouped_cols` as `[:KEYS]`.
+    cols::Union{Vector{Symbol}, Nothing}
+    # Maps each grouping key to the indices of the `dtable` chunks stored under that key.
+    index::Dict
+
+    # The index is deep-copied, so the new object does not share it with the caller.
+    GDTable(dtable, cols, index) = new(dtable, cols, deepcopy(index))
+end
+
+# Fetching a `GDTable` delegates to the wrapped `DTable`, with or without a sink.
+function fetch(gd::GDTable)
+    return fetch(gd.dtable)
+end
+
+function fetch(gd::GDTable, sink)
+    return fetch(gd.dtable, sink)
+end
+
+"""
+    grouped_cols(gd::GDTable) -> Vector{Symbol}
+
+Returns the symbols of columns used in the grouping.
+In case grouping on a function was performed a `:KEYS` symbol will be returned.
+"""
+function grouped_cols(gd::GDTable)
+    # Without grouping columns the placeholder `[:KEYS]` is reported instead.
+    if gd.cols === nothing
+        return [:KEYS]
+    end
+    return gd.cols
+end
+
+"""
+    keys(gd::GDTable)
+
+Returns the keys by which `gd` is grouped.
+"""
+function keys(gd::GDTable)
+    # The grouping keys are exactly the keys of the index dictionary.
+    return keys(gd.index)
+end
+
+# Build a `DTable` from the chunks of `gd` stored under the grouping `key`.
+function partition(gd::GDTable, key)
+    return partition(gd, gd.index[key])
+end
+
+# Build a `DTable` from the chunks of `gd` found at the given chunk `indices`,
+# reusing the tabletype of the wrapped `DTable`.
+function partition(gd::GDTable, indices::Vector{UInt})
+    selected = getindex.(Ref(gd.dtable.chunks), indices)
+    return DTable(selected, gd.dtable.tabletype)
+end
+
+# Number of groups, i.e. the number of keys held by the index.
+function length(gd::GDTable)
+    return length(gd.index)
+end
+
+
+# Turn the result of one iteration step over `gd.index` into an iteration step over
+# `gd`: the chunk indices of the entry are replaced by the `DTable` built from them.
+# `nothing` (end of iteration) is passed through unchanged.
+function _group_iteration_step(gd::GDTable, step)
+    step === nothing && return nothing
+    (entry, next_state) = step
+    (key, chunk_indices) = entry
+    return key => partition(gd, chunk_indices), next_state
+end
+
+
+# Iterating a `GDTable` yields `key => DTable` pairs, one per grouping key.
+iterate(gd::GDTable) = _group_iteration_step(gd, iterate(gd.index))
+
+iterate(gd::GDTable, state) = _group_iteration_step(gd, iterate(gd.index, state))
+
+
+"""
+    trim!(gd::GDTable) -> GDTable
+
+Removes empty chunks from `gd` and empty keys from its index.
+"""
+function trim!(gd::GDTable)
+    d = gd.dtable
+    # Spawn one `isnonempty` check per chunk, then wait for all of the results.
+    check_result = [Dagger.@spawn isnonempty(c) for c in d.chunks]
+    results = fetch.(check_result)
+
+    # Filtering a range keeps its order, so the kept indices need no extra sort.
+    ok_indices = filter(x -> results[x], 1:length(results))
+    d.chunks = getindex.(Ref(d.chunks), ok_indices)
+
+    # offsets[i] is the number of dropped chunks among the first i chunks, i.e. how far
+    # the chunk originally at position i moves towards the front.
+    offsets = zeros(UInt, length(results))
+    counter = zero(UInt)
+    for (i, r) in enumerate(results)
+        counter = r ? counter : counter + 1
+        offsets[i] = counter
+    end
+
+    # Walk a snapshot of the keys: the loop deletes from and assigns into `gd.index`,
+    # so the dictionary itself is never modified while it is being iterated.
+    for key in collect(keys(gd.index))
+        ind = gd.index[key]
+        # Keep only the indices of chunks that survived the check.
+        filter!(x -> results[x], ind)
+
+        if isempty(ind)
+            delete!(gd.index, key)
+        else
+            # Shift the surviving indices to their positions in the trimmed chunk list.
+            gd.index[key] = ind .- getindex.(Ref(offsets), ind)
+        end
+    end
+    return gd
+end
+
+
+"""
+    trim(gd::GDTable) -> GDTable
+
+Returns `gd` with empty chunks and keys removed.
+"""
+function trim(gd::GDTable)
+    # Trim a fresh `GDTable` built around a new `DTable` wrapper; the `GDTable`
+    # constructor deep-copies the index it is given.
+    source = gd.dtable
+    dtable_copy = DTable(source.chunks, source.tabletype)
+    return trim!(GDTable(dtable_copy, gd.cols, gd.index))
+end
+
+
+"""
+    tabletype!(gd::GDTable)
+
+Provides the type of the underlying table partition and caches it in `gd`.
+
+In case the tabletype cannot be obtained the default return value is `NamedTuple`.
+"""
+function tabletype!(gd::GDTable)
+    # Resolve the tabletype, cache it on the wrapped `DTable` and hand it back.
+    resolved = resolve_tabletype(gd.dtable)
+    gd.dtable.tabletype = resolved
+    return resolved
+end
+
+
+"""
+    tabletype(gd::GDTable)
+
+Provides the type of the underlying table partition.
+Uses the cached tabletype if available.
+
+In case the tabletype cannot be obtained the default return value is `NamedTuple`.
+"""
+function tabletype(gd::GDTable)
+    cached = gd.dtable.tabletype
+    # Prefer the cached value; resolve (without caching) only when none is stored.
+    if cached === nothing
+        return resolve_tabletype(gd.dtable)
+    end
+    return cached
+end
+
+
+# The plain `show` reuses the `text/plain` representation.
+function show(io::IO, gd::GDTable)
+    return show(io, MIME"text/plain"(), gd)
+end
+
+
+# Print a three-line summary: partition and key counts, tabletype, grouping.
+function show(io::IO, ::MIME"text/plain", gd::GDTable)
+    cached = gd.dtable.tabletype
+    tabletype_label = cached === nothing ? "unknown (use `tabletype!(::GDTable)`)" : cached
+    grouping_label = gd.cols === nothing ? "custom function" : grouped_cols(gd)
+    partition_count = length(gd.dtable.chunks)
+    key_count = length(keys(gd.index))
+    println(io, "GDTable with $partition_count partitions and $key_count keys")
+    println(io, "Tabletype: $tabletype_label")
+    # No trailing newline after the last line.
+    print(io, "Grouped by: $grouping_label")
+    return nothing
+end
+
+"""
+    getindex(gdt::GDTable, key) -> DTable
+
+Retrieves a `DTable` from `gdt` with rows belonging to the provided grouping key.
+"""
+function getindex(gdt::GDTable, key)
+    # Throws a `KeyError` when `key` is not a grouping key of `gdt`. `KeyError` carries
+    # the offending key itself; its `showerror` builds the "key ... not found" message.
+    haskey(gdt.index, key) || throw(KeyError(key))
+    # TODO: try to resolve more forms of key even if it doesn't exactly match the key in the dict
+    return partition(gdt, key)
+end