
Commit 6374613

leios authored
adding atomic support with atomix (#299)

Co-authored-by: Takafumi Arakaki <[email protected]>
Co-authored-by: Valentin Churavy <[email protected]>
1 parent 5ef9c19 commit 6374613

File tree

5 files changed: +123 −0 lines changed
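Taken together, the change wires Atomix's atomic-operation macros into KernelAbstractions so kernels can perform race-free updates to shared arrays. A minimal sketch of the resulting user-facing capability (the kernel name and launch parameters are illustrative, not part of the commit; CPU atomics require Julia ≥ 1.7):

using KernelAbstractions
using KernelAbstractions: @atomic

# Every work-item atomically increments one shared counter.
@kernel function count_kernel!(counter)
    i = @index(Global, Linear)  # each work-item contributes one increment
    @atomic counter[1] += 1
end

counter = zeros(Int, 1)
kernel! = count_kernel!(CPU(), 4)      # CPU backend, workgroup size 4
wait(kernel!(counter, ndrange = 64))   # a launch returns an Event to wait on
@assert counter[1] == 64               # no lost updates despite contention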

Project.toml (+1)

@@ -5,6 +5,7 @@ version = "0.8.1"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
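Atomix.jl is the package that defines the @atomic, @atomicswap, and @atomicreplace macros on array-indexing expressions; the entry above adds it to [deps] by UUID. A hedged sketch of the three macros on a plain host array (requires Julia ≥ 1.7; the variable names are illustrative):

using Atomix: @atomic, @atomicswap, @atomicreplace

x = zeros(Int, 4)
@atomic x[1] += 1                  # atomic read-modify-write on x[1]
old = @atomicswap x[2] = 5         # atomically store 5, returning the old value
cas = @atomicreplace x[3] 0 => 7   # compare-and-swap: set to 7 if currently 0
@assert x[1] == 1 && old == 0 && cas.success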

examples/histogram.jl (+119)

@@ -0,0 +1,119 @@
+using KernelAbstractions, Test
+using KernelAbstractions: @atomic, @atomicswap, @atomicreplace
+include(joinpath(@__DIR__, "utils.jl")) # Load backend
+
+
+# Function to use as a baseline for CPU metrics
+function create_histogram(input)
+    histogram_output = zeros(Int, maximum(input))
+    for i = 1:length(input)
+        histogram_output[input[i]] += 1
+    end
+    return histogram_output
+end
+
+# This is a 1D histogram kernel where the histogramming happens on shmem
+@kernel function histogram_kernel!(histogram_output, input)
+    tid = @index(Global, Linear)
+    lid = @index(Local, Linear)
+
+    @uniform warpsize = Int(32)
+
+    @uniform gs = @groupsize()[1]
+    @uniform N = length(histogram_output)
+
+    shared_histogram = @localmem Int (gs)
+
+    # This will go through all input elements and assign them to a location in
+    # shmem. Note that if there is not enough shmem, we create different shmem
+    # blocks to write to. For example, if shmem is of size 256, but it's
+    # possible to get a value of 312, then we will have 2 separate shmem blocks,
+    # one from 1->256, and another from 256->512
+    @uniform max_element = 1
+    for min_element = 1:gs:N
+
+        # Setting shared_histogram to 0
+        @inbounds shared_histogram[lid] = 0
+        @synchronize()
+
+        max_element = min_element + gs
+        if max_element > N
+            max_element = N+1
+        end
+
+        # Defining bin on shared memory and writing to it if possible
+        bin = input[tid]
+        if bin >= min_element && bin < max_element
+            bin -= min_element-1
+            GC.@preserve shared_histogram begin
+                @atomic shared_histogram[bin] += 1
+            end
+        end
+
+        @synchronize()
+
+        if ((lid+min_element-1) <= N)
+            @atomic histogram_output[lid+min_element-1] += shared_histogram[lid]
+        end
+
+    end
+
+end
+
+function histogram!(histogram_output, input;
+                    numcores = 4, numthreads = 256)
+
+    if isa(input, Array)
+        kernel! = histogram_kernel!(CPU(), numcores)
+    else
+        kernel! = histogram_kernel!(CUDADevice(), numthreads)
+    end
+
+    kernel!(histogram_output, input, ndrange=size(input))
+end
+
+@testset "histogram tests" begin
+
+    rand_input = [rand(1:128) for i = 1:1000]
+    linear_input = [i for i = 1:1024]
+    all_2 = [2 for i = 1:512]
+
+    histogram_rand_baseline = create_histogram(rand_input)
+    histogram_linear_baseline = create_histogram(linear_input)
+    histogram_2_baseline = create_histogram(all_2)
+
+    if Base.VERSION >= v"1.7.0"
+        CPU_rand_histogram = zeros(Int, 128)
+        CPU_linear_histogram = zeros(Int, 1024)
+        CPU_2_histogram = zeros(Int, 2)
+
+        wait(histogram!(CPU_rand_histogram, rand_input))
+        wait(histogram!(CPU_linear_histogram, linear_input))
+        wait(histogram!(CPU_2_histogram, all_2))
+
+        @test isapprox(CPU_rand_histogram, histogram_rand_baseline)
+        @test isapprox(CPU_linear_histogram, histogram_linear_baseline)
+        @test isapprox(CPU_2_histogram, histogram_2_baseline)
+    end
+
+    if has_cuda_gpu()
+        CUDA.allowscalar(false)
+
+        GPU_rand_input = CuArray(rand_input)
+        GPU_linear_input = CuArray(linear_input)
+        GPU_2_input = CuArray(all_2)
+
+        GPU_rand_histogram = CuArray(zeros(Int, 128))
+        GPU_linear_histogram = CuArray(zeros(Int, 1024))
+        GPU_2_histogram = CuArray(zeros(Int, 2))
+
+        wait(histogram!(GPU_rand_histogram, GPU_rand_input))
+        wait(histogram!(GPU_linear_histogram, GPU_linear_input))
+        wait(histogram!(GPU_2_histogram, GPU_2_input))
+
+        @test isapprox(Array(GPU_rand_histogram), histogram_rand_baseline)
+        @test isapprox(Array(GPU_linear_histogram), histogram_linear_baseline)
+        @test isapprox(Array(GPU_2_histogram), histogram_2_baseline)
+    end
+end
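The shared-memory staging in histogram_kernel! exists purely to cut contention on global memory: each workgroup accumulates into fast local storage and flushes one block at a time. For contrast, a direct variant (a sketch, not part of the commit; naive_histogram_kernel! is an illustrative name) applies @atomic to the output array from every work-item:

# Naive variant: every work-item updates the global histogram directly,
# so all atomic contention lands on global memory.
@kernel function naive_histogram_kernel!(histogram_output, input)
    tid = @index(Global, Linear)
    @inbounds bin = input[tid]
    @atomic histogram_output[bin] += 1
end

# Launched the same way as histogram! above, e.g. on the CPU:
#   kernel! = naive_histogram_kernel!(CPU(), 4)
#   wait(kernel!(output, input, ndrange = size(input)))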

lib/CUDAKernels/Project.toml (+1)

@@ -8,6 +8,7 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
 
 [compat]
 Adapt = "3.0"

lib/CUDAKernels/src/CUDAKernels.jl (+1)

@@ -5,6 +5,7 @@ import StaticArrays
 import StaticArrays: MArray
 import Adapt
 import KernelAbstractions
+import UnsafeAtomicsLLVM
 
 export CUDADevice
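A bare import is all the GPU backend needs because UnsafeAtomicsLLVM only contributes method definitions: Atomix lowers its macros to UnsafeAtomics operations, and UnsafeAtomicsLLVM implements those operations on LLVM pointers so that, inside a CUDA kernel, they compile to native atomic instructions. A sketch of the GPU path this enables (illustrative names; assumes a CUDA-capable device):

using CUDA, CUDAKernels, KernelAbstractions
using KernelAbstractions: @atomic

@kernel function gpu_count_kernel!(counter)
    i = @index(Global, Linear)   # one atomic increment per work-item
    @atomic counter[1] += 1      # lowers to an LLVM atomic via UnsafeAtomicsLLVM
end

counter = CUDA.zeros(Int32, 1)
kernel! = gpu_count_kernel!(CUDADevice(), 256)  # workgroup size 256
wait(kernel!(counter, ndrange = 1024))
@assert Array(counter)[1] == 1024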

src/KernelAbstractions.jl (+1)

@@ -5,6 +5,7 @@ export @Const, @localmem, @private, @uniform, @synchronize, @index, @groupsize,
 export Device, GPU, CPU, Event, MultiEvent, NoneEvent
 export async_copy!
 
+import Atomix: @atomic, @atomicswap, @atomicreplace
 
 using LinearAlgebra
 using MacroTools
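Note that the macros are imported into the KernelAbstractions namespace but not added to the export lists above, so at this commit user code opts in explicitly, exactly as examples/histogram.jl does:

using KernelAbstractions: @atomic, @atomicswap, @atomicreplace

# Qualified use also works without the explicit import:
#   KernelAbstractions.@atomic a[i] += 1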

0 commit comments