groupreduction and subgroupreduction #421

Closed · wants to merge 18 commits · Changes from 12 commits
31 changes: 30 additions & 1 deletion src/KernelAbstractions.jl
@@ -139,10 +139,31 @@ function unsafe_free! end
# - @groupsize
# - @ndrange
###

function groupsize end
function ndrange end

"""
    @subgroupsize()

Returns the GPU's subgroup size.
"""
macro subgroupsize()
    quote
        $__subgroupsize()
    end
end
Member (suggested change): remove the duplicated `@subgroupsize` definition; it appears twice in this file.


"""
    @subgroupsize()

Returns the GPU's subgroup size.
"""
macro subgroupsize()
    quote
        $__subgroupsize()
    end
end

"""
@groupsize()

@@ -657,6 +678,10 @@ function __synchronize()
    error("@synchronize used outside kernel or not captured")
end

function __subgroupsize()
    error("@subgroupsize used outside kernel or not captured")
end

@generated function __print(items...)
    str = ""
    args = []
@@ -700,6 +725,7 @@ end
    @inbounds A[I] = B[I]
end


# CPU backend

include("cpu.jl")
@@ -726,4 +752,7 @@ end
end
end

# group- and subgroupreduce
include("reduce.jl")

end #module
98 changes: 98 additions & 0 deletions src/reduce.jl
@@ -0,0 +1,98 @@
export @groupreduce, @subgroupreduce

"""
    @subgroupreduce(op, val)

Reduce values across a subgroup. This operation is only supported if the backend supports subgroups.
"""
macro subgroupreduce(op, val)
    quote
        $__subgroupreduce($(esc(op)), $(esc(val)))
    end
end

function __subgroupreduce(op, val)
    error("@subgroupreduce used outside kernel, not captured, or not supported")
end
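For intuition, a backend typically implements the subgroup reduction with shuffle-down operations across the lanes of a subgroup. The following plain-Julia sketch (the helper `simulate_subgroupreduce` is hypothetical and not part of this PR) models what such a reduction computes, using an array in place of per-lane registers:

```julia
# CPU simulation of a shuffle-down style subgroup reduction.
# `vals[i]` plays the role of lane i's private value; a real GPU
# backend would use subgroup shuffles instead of an array.
function simulate_subgroupreduce(op, vals::Vector{T}) where {T}
    vals = copy(vals)
    n = length(vals)        # subgroup size, assumed to be a power of two
    d = n ÷ 2
    while d >= 1
        # each lane combines its value with the value d lanes above it
        for lane in 1:(n - d)
            vals[lane] = op(vals[lane], vals[lane + d])
        end
        d ÷= 2
    end
    return vals[1]          # lane 1 ends up holding the full reduction
end
```

Because lanes are processed in ascending order, each lane reads its partner's value before that partner is updated in the same pass, matching the simultaneous-shuffle semantics of the hardware primitive.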

"""
    @groupreduce(op, val, neutral, use_subgroups)

Reduce values across a group.
- `op`: the operator of the reduction
- `val`: the value that each thread contributes to the reduction
- `neutral`: the neutral element of the operator, so that `op(neutral, x) = x`
- `use_subgroups`: whether the group reduction should use subgroup reductions internally
"""

Member: I see the value of having a common implementation.

So I would define:

    @reduce(op, val, neutral)
    __reduce(op, val, neutral)

And then maybe:

    __subgroupreduce and __subgroupsize (no @ version)
    __can_subgroup_reduce(T) = false

And then we could define:

    __reduce(__ctx__, op, val, neutral, ::Val{true})
    __reduce(__ctx__, op, val, neutral, ::Val{false})

as you have here.

    function __reduce(op, val::T, neutral::T) where T
        __reduce(op, val, neutral, Val(__can_subgroup_reduce(T)))
    end

macro groupreduce(op, val, neutral, use_subgroups)
    quote
        $__groupreduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), typeof($(esc(val))), Val($(esc(use_subgroups))))
    end
end

@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{true}) where {T}
    idx_in_group = @index(Local)
    groupsize = @groupsize()[1]
    subgroupsize = @subgroupsize()

    localmem = @localmem(T, subgroupsize)

    idx_subgroup, idx_in_subgroup = fldmod1(idx_in_group, subgroupsize)

    # first subgroup reduction
    val = @subgroupreduce(op, val)

    # the first lane of each subgroup stores its partial result in local memory
    if idx_in_subgroup == 1
        @inbounds localmem[idx_subgroup] = val
    end

    @synchronize()

    # lanes of the first subgroup load the partial results, padding with `neutral`
    val = if idx_in_subgroup <= fld1(groupsize, subgroupsize)
        @inbounds localmem[idx_in_subgroup]
    else
        neutral
    end

    # second subgroup reduction, performed by the first subgroup, combines the partial results
    if idx_subgroup == 1
        val = @subgroupreduce(op, val)
    end

    return val
end
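The `Val{true}` path above can be modeled sequentially: reduce each subgroup-sized chunk of the input, then reduce the per-subgroup partials, padding with `neutral`. A sketch (the helper `simulate_two_stage_reduce` is illustrative only, not PR code):

```julia
# Sequential model of the Val{true} path: stage 1 reduces each
# subgroup-sized chunk, stage 2 reduces the partial results.
function simulate_two_stage_reduce(op, vals, neutral, subgroupsize)
    partials = [reduce(op, chunk; init = neutral)
                for chunk in Iterators.partition(vals, subgroupsize)]
    return reduce(op, partials; init = neutral)
end
```

Note that this model assumes one value per thread and at most `subgroupsize` partials, i.e. `groupsize <= subgroupsize^2`, which the kernel above also implicitly requires since the second stage is a single subgroup reduction.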

@inline function __groupreduce(__ctx__, op, val, neutral, ::Type{T}, ::Val{false}) where {T}
    idx_in_group = @index(Local)
    groupsize = @groupsize()[1]

    localmem = @localmem(T, groupsize)

Member: I see: if we can do a subgroupreduce, the memory we need here is much reduced.

    @inbounds localmem[idx_in_group] = val

    # perform the reduction
    d = 1
    while d < groupsize
        @synchronize()

Member: Workaround?

        index = 2 * d * (idx_in_group - 1) + 1
        @inbounds if index <= groupsize
            other_val = if index + d <= groupsize
                localmem[index + d]
            else
                neutral
            end
            localmem[index] = op(localmem[index], other_val)
        end
        d *= 2
    end

    # load the final value on the first thread
    if idx_in_group == 1
        val = @inbounds localmem[idx_in_group]
    end

    return val
end
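The while-loop implements an interleaved tree reduction in local memory: each pass doubles the stride and halves the number of active entries, so the result accumulates at index 1 after `ceil(log2(groupsize))` passes. A sequential plain-Julia model (the helper `simulate_tree_reduce` is hypothetical, mirroring the indexing of the loop above):

```julia
# Sequential model of the Val{false} path: the same strided indexing
# as the kernel loop, executed for every "thread" index in turn.
function simulate_tree_reduce(op, vals::Vector{T}, neutral::T) where {T}
    mem = copy(vals)
    n = length(mem)
    d = 1
    while d < n
        # index = 2d * (idx_in_group - 1) + 1 for idx_in_group = 1, 2, ...
        for index in 1:(2d):n
            # out-of-range partners contribute the neutral element
            other = index + d <= n ? mem[index + d] : neutral
            mem[index] = op(mem[index], other)
        end
        d *= 2
    end
    return mem[1]
end
```

Unlike the subgroup-based path, this version works for any group size, at the cost of `groupsize` elements of local memory and one barrier per pass.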
39 changes: 39 additions & 0 deletions test/reduce.jl
@@ -0,0 +1,39 @@
using KernelAbstractions, Test

@kernel function reduce_kernel(a, b, op, neutral)
    idx_in_group = @index(Local)

    val = a[idx_in_group]

    val = @groupreduce(op, val, neutral, false)

    # only the first thread holds the final value
    if idx_in_group == 1
        b[1] = val
    end
end

# name assumed: the original snippet left this function anonymous
function groupreduce_testsuite(backend, ArrayT)
    @testset "groupreduce one group" begin
        @testset for (op, neutral) in ((+, zero), (*, one), (max, typemin), (min, typemax))
            @testset for type in (Int32, Float32, Float64)
                @test test_1group_groupreduce(backend, ArrayT, op, type, neutral(type))
            end
        end
    end
end

function test_1group_groupreduce(backend, ArrayT, op, type, neutral)
    a = rand(type, 32)
    b = ArrayT(a)

    c = similar(b, 1)
    reduce_kernel(backend, 32)(b, c, op, neutral; ndrange = 32)
    KernelAbstractions.synchronize(backend)

    expected = mapreduce(identity, op, a)
    actual = Array(c)[1]
    return expected ≈ actual
end