JuliaGPU · vchuravy · Feb 17, 2025 · Jan 15, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,12 +22,8 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
-          - '1.7'
-          - '1.8'
-          - '1.9'
           - '1.10'
-          - '~1.11.0-0'
+          - '1.11'
         os:
           - ubuntu-latest
           - macOS-latest

diff --git a/Project.toml b/Project.toml
@@ -1,20 +1,32 @@
 name = "KernelAbstractions"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 authors = ["Valentin Churavy <[email protected]> and contributors"]
-version = "0.9.34"
+version = "0.10.0-dev"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
-EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
-Requires = "ae029012-a4dd-5104-9daa-d747884805df"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"
+
+[weakdeps]
+EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[extensions]
+EnzymeExt = "EnzymeCore"
+LinearAlgebraExt = "LinearAlgebra"
+SparseArraysExt = "SparseArrays"
 
 [compat]
 Adapt = "0.4, 1.0, 2.0, 3.0, 4"
@@ -24,23 +36,12 @@ InteractiveUtils = "1.6"
 LinearAlgebra = "1.6"
 MacroTools = "0.5"
 PrecompileTools = "1"
-Requires = "1.3"
 SparseArrays = "<0.0.1, 1.6"
 StaticArrays = "0.12, 1.0"
 UUIDs = "<0.0.1, 1.6"
-julia = "1.6"
-
-[extensions]
-EnzymeExt = "EnzymeCore"
-LinearAlgebraExt = "LinearAlgebra"
-SparseArraysExt = "SparseArrays"
+julia = "1.10"
 
 [extras]
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-
-[weakdeps]
-EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
diff --git a/cuda/Project.toml b/cuda/Project.toml
@@ -0,0 +1,3 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
diff --git a/examples/histogram.jl b/examples/histogram.jl
@@ -74,32 +74,28 @@ function move(backend, input)
 end
 
 @testset "histogram tests" begin
-    if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend)
-        @test_skip false
-    else
-        rand_input = [rand(1:128) for i in 1:1000]
-        linear_input = [i for i in 1:1024]
-        all_two = [2 for i in 1:512]
-
-        histogram_rand_baseline = create_histogram(rand_input)
-        histogram_linear_baseline = create_histogram(linear_input)
-        histogram_two_baseline = create_histogram(all_two)
-
-        rand_input = move(backend, rand_input)
-        linear_input = move(backend, linear_input)
-        all_two = move(backend, all_two)
-
-        rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
-        linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
-        two_histogram = KernelAbstractions.zeros(backend, Int, 2)
-
-        histogram!(rand_histogram, rand_input)
-        histogram!(linear_histogram, linear_input)
-        histogram!(two_histogram, all_two)
-        KernelAbstractions.synchronize(CPU())
-
-        @test isapprox(Array(rand_histogram), histogram_rand_baseline)
-        @test isapprox(Array(linear_histogram), histogram_linear_baseline)
-        @test isapprox(Array(two_histogram), histogram_two_baseline)
-    end
+    rand_input = [rand(1:128) for i in 1:1000]
+    linear_input = [i for i in 1:1024]
+    all_two = [2 for i in 1:512]
+
+    histogram_rand_baseline = create_histogram(rand_input)
+    histogram_linear_baseline = create_histogram(linear_input)
+    histogram_two_baseline = create_histogram(all_two)
+
+    rand_input = move(backend, rand_input)
+    linear_input = move(backend, linear_input)
+    all_two = move(backend, all_two)
+
+    rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
+    linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
+    two_histogram = KernelAbstractions.zeros(backend, Int, 2)
+
+    histogram!(rand_histogram, rand_input)
+    histogram!(linear_histogram, linear_input)
+    histogram!(two_histogram, all_two)
+    KernelAbstractions.synchronize(backend)
+
+    @test isapprox(Array(rand_histogram), histogram_rand_baseline)
+    @test isapprox(Array(linear_histogram), histogram_linear_baseline)
+    @test isapprox(Array(two_histogram), histogram_two_baseline)
 end
diff --git a/examples/naive_transpose.jl b/examples/naive_transpose.jl
@@ -15,8 +15,7 @@ function naive_transpose!(a, b)
     end
     backend = get_backend(a)
     @assert get_backend(b) == backend
-    groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024
-    kernel! = naive_transpose_kernel!(backend, groupsize)
+    kernel! = naive_transpose_kernel!(backend, 256)
     kernel!(a, b, ndrange = size(a))
     return
 end

diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
@@ -50,7 +50,7 @@ synchronize(backend)
 ```
 """
 macro kernel(expr)
-    return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indices=# false)
+    return __kernel(expr, #=force_inbounds=# false, #=unsafe_indices=# false)
 end
 
 """
@@ -66,18 +66,20 @@ This allows for two different configurations:
 
 !!! warn
     This is an experimental feature.
+
+!!! note
+    `cpu={true, false}` is deprecated for KernelAbstractions 1.0; the option is still accepted but is now ignored.
 """
 macro kernel(ex...)
     if length(ex) == 1
-        return __kernel(ex[1], true, false, false)
+        return __kernel(ex[1], false, false)
     else
-        generate_cpu = true
         unsafe_indices = false
         force_inbounds = false
         for i in 1:(length(ex) - 1)
             if ex[i] isa Expr && ex[i].head == :(=) &&
                     ex[i].args[1] == :cpu && ex[i].args[2] isa Bool
-                generate_cpu = ex[i].args[2]
+                # `cpu=...` is deprecated: still accepted for compatibility, but ignored
             elseif ex[i] isa Expr && ex[i].head == :(=) &&
                     ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
                 force_inbounds = ex[i].args[2]
@@ -94,7 +96,7 @@ macro kernel(ex...)
                 )
             end
         end
-        return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indices)
+        return __kernel(ex[end], force_inbounds, unsafe_indices)
     end
 end
 
@@ -190,6 +192,8 @@ After releasing the memory of an array, it should no longer be accessed.
 """
 function unsafe_free! end
 
+unsafe_free!(::AbstractArray) = return
+
 ###
 # Kernel language
 # - @localmem
@@ -254,6 +258,9 @@ For storage that only persists between `@synchronize` statements, an `MArray` ca
 instead.
 
 See also [`@uniform`](@ref).
+
+!!! note
+    `@private` is deprecated for KernelAbstractions 1.0
 """
 macro private(T, dims)
     if dims isa Integer
@@ -269,6 +276,9 @@ end
 
 Creates a private local of `mem` per item in the workgroup. This can be safely used
 across [`@synchronize`](@ref) statements.
+
+!!! note
+    `@private` is deprecated for KernelAbstractions 1.0
 """
 macro private(expr)
     return esc(expr)
@@ -279,6 +289,9 @@ end
 
 `expr` is evaluated outside the workitem scope. This is useful for variable declarations
 that span workitems, or are reused across `@synchronize` statements.
+
+!!! note
+    `@uniform` is deprecated for KernelAbstractions 1.0
 """
 macro uniform(value)
     return esc(value)
@@ -330,6 +343,8 @@ Access the hidden context object used by KernelAbstractions.
 !!! warn
     Only valid to be used from a kernel with `cpu=false`.
 
+!!! note
+    `@context` will be supported on all backends in KernelAbstractions 1.0
 ```
 function f(@context, a)
     I = @index(Global, Linear)
@@ -478,31 +493,11 @@ Abstract type for all GPU based KernelAbstractions backends.
 
 !!! note
     New backend implementations **must** sub-type this abstract type.
-"""
-abstract type GPU <: Backend end
-
-"""
-    CPU(; static=false)
-
-Instantiate a CPU (multi-threaded) backend.
-
-## Options:
- - `static`: Uses a static thread assignment, this can be beneficial for NUMA aware code.
-   Defaults to false.
-"""
-struct CPU <: Backend
-    static::Bool
-    CPU(; static::Bool = false) = new(static)
-end
-
-"""
-    isgpu(::Backend)::Bool
 
-Returns true for all [`GPU`](@ref) backends.
+!!! note
+    `GPU` will be removed in KernelAbstractions v1.0
 """
-isgpu(::GPU) = true
-isgpu(::CPU) = false
-
+abstract type GPU <: Backend end
 
 """
     get_backend(A::AbstractArray)::Backend
@@ -518,12 +513,9 @@ function get_backend end
 # Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
 get_backend(A::AbstractArray) = get_backend(parent(A))
 
-get_backend(::Array) = CPU()
-
 # Define:
 #   adapt_storage(::Backend, a::Array) = adapt(BackendArray, a)
 #   adapt_storage(::Backend, a::BackendArray) = a
-Adapt.adapt_storage(::CPU, a::Array) = a
 
 """
     allocate(::Backend, Type, dims...)::AbstractArray
@@ -743,7 +735,7 @@ Partition a kernel for the given ndrange and workgroupsize.
     return iterspace, dynamic
 end
 
-function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName}
+function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: GPU, S <: _Size, NDRange <: _Size, XPUName}
     return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name)
 end
 
@@ -760,6 +752,10 @@ include("compiler.jl")
 function __workitems_iterspace end
 function __validindex end
 
+# for reflection
+function mkcontext end
+function launch_config end
+
 include("macros.jl")
 
 ###
@@ -829,8 +825,11 @@ end
 end
 
 # CPU backend
+include("pocl/pocl.jl")
+using .POCL
+export POCLBackend
 
-include("cpu.jl")
+const CPU = POCLBackend
 
 # precompile
 PrecompileTools.@compile_workload begin
@@ -844,19 +843,4 @@ PrecompileTools.@compile_workload begin
     end
 end
 
-if !isdefined(Base, :get_extension)
-    using Requires
-end
-
-@static if !isdefined(Base, :get_extension)
-    function __init__()
-        @require EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" include("../ext/EnzymeExt.jl")
-    end
-end
-
-if !isdefined(Base, :get_extension)
-    include("../ext/LinearAlgebraExt.jl")
-    include("../ext/SparseArraysExt.jl")
-end
-
 end #module
diff --git a/src/cpu.jl b/src/cpu.jl
@@ -1,4 +1,3 @@
-unsafe_free!(::AbstractArray) = return
 synchronize(::CPU) = nothing
 
 allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)