|
| 1 | +using KernelAbstractions, Test |
| 2 | +using KernelAbstractions: @atomic, @atomicswap, @atomicreplace |
| 3 | +include(joinpath(@__DIR__, "utils.jl")) # Load backend |
| 4 | + |
| 5 | + |
"""
    create_histogram(input)

Serial reference histogram, used as the baseline the kernel results are compared to.

Return a `Vector{Int}` of length `maximum(input)` whose `k`-th entry counts how often
the value `k` occurs in `input`. An empty `input` yields an empty histogram.

The entries of `input` are used directly as 1-based indices into the result, so they
are expected to be positive integers.
"""
function create_histogram(input)
    # `maximum` throws on an empty collection, so handle that case explicitly.
    isempty(input) && return Int[]

    histogram_output = zeros(Int, maximum(input))
    for bin in input
        histogram_output[bin] += 1
    end
    return histogram_output
end
| 14 | + |
# This is a 1D histogram kernel: every work-group accumulates counts in a
# local-memory histogram and then adds them to `histogram_output`.
#
# `histogram_output[k]` is incremented once for every work-item whose input
# element equals `k`. The kernel only ever adds to `histogram_output`, so the
# caller decides the starting counts. Input values outside
# 1:length(histogram_output) fall into no window and are not counted.
#
# NOTE(review): the reset and the write-back below are done by work-item `lid`
# for slot `lid`. If `ndrange` is not a multiple of the group size, confirm
# that the trailing group still resets and flushes every slot it needs.
@kernel function histogram_kernel!(histogram_output, input)
    tid = @index(Global, Linear)
    lid = @index(Local, Linear)

    @uniform gs = @groupsize()[1]
    @uniform N = length(histogram_output)

    # One counter per work-item of the group.
    shared_histogram = @localmem Int (gs)

    # The local histogram only holds `gs` counters, so the bins 1:N are
    # processed in consecutive windows of at most `gs` bins each. For example,
    # with gs = 256 and N = 312 there are two windows: bins 1:256 and then
    # bins 257:312.
    @uniform max_element = 1
    for min_element = 1:gs:N

        # Reset this work-item's counter before the window is accumulated.
        @inbounds shared_histogram[lid] = 0
        @synchronize()

        # Exclusive upper bound of the current window, clamped to N + 1.
        max_element = min_element + gs
        if max_element > N
            max_element = N+1
        end

        # Count this work-item's input element if its bin lies in the window.
        bin = input[tid]
        if bin >= min_element && bin < max_element
            # Convert the bin to a 1-based slot relative to the window start.
            bin -= min_element-1
            GC.@preserve shared_histogram begin
                @atomic shared_histogram[bin] += 1
            end
        end

        @synchronize()

        # Work-item `lid` adds slot `lid` of this window to the matching
        # output bin; slots that would map past bin N are skipped.
        if ((lid+min_element-1) <= N)
            @atomic histogram_output[lid+min_element-1] += shared_histogram[lid]
        end

    end

end
| 62 | + |
"""
    histogram!(histogram_output, input; numcores = 4, numthreads = 256)

Launch `histogram_kernel!` over `input`, accumulating the counts into `histogram_output`.

When `input` is an `Array` the kernel is instantiated for `CPU()` with `numcores` as
its size argument; otherwise it is instantiated for `CUDADevice()` with `numthreads`.
The launch covers `size(input)`, and its result is returned so the caller can `wait`
on it.
"""
function histogram!(histogram_output, input;
                    numcores = 4, numthreads = 256)
    # Only the selected branch is evaluated, so `CUDADevice()` is never
    # constructed for plain `Array` inputs.
    backend, groupsize =
        input isa Array ? (CPU(), numcores) : (CUDADevice(), numthreads)

    kernel! = histogram_kernel!(backend, groupsize)
    return kernel!(histogram_output, input, ndrange=size(input))
end
| 74 | + |
@testset "histogram tests" begin

    # Three input patterns: random bins, exactly one element per bin, and
    # every element in the same bin.
    rand_input = [rand(1:128) for i in 1:1000]
    linear_input = collect(1:1024)
    all_2 = fill(2, 512)

    # Pair each input with its serial reference histogram.
    cases = [(input, create_histogram(input))
             for input in (rand_input, linear_input, all_2)]

    # The output buffers are sized from the baselines rather than hard-coded:
    # the baseline length is `maximum(input)`, which for the random input is
    # not guaranteed to be 128, and a length mismatch would fail the
    # comparison even when every count is right.

    # The CPU path is only exercised on Julia 1.7 and newer.
    if Base.VERSION >= v"1.7.0"
        for (input, baseline) in cases
            cpu_histogram = zero(baseline)
            wait(histogram!(cpu_histogram, input))
            # Counts are integers, so compare exactly.
            @test cpu_histogram == baseline
        end
    end

    if has_cuda_gpu()
        CUDA.allowscalar(false)

        for (input, baseline) in cases
            gpu_histogram = CuArray(zero(baseline))
            wait(histogram!(gpu_histogram, CuArray(input)))
            # Copy back to the host before comparing with the baseline.
            @test Array(gpu_histogram) == baseline
        end
    end

end
0 commit comments