Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement a CPU backend using POCL #556

Merged
merged 1 commit into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,8 @@ jobs:
fail-fast: false
matrix:
version:
- '1.6'
- '1.7'
- '1.8'
- '1.9'
- '1.10'
- '~1.11.0-0'
- '1.11'
os:
- ubuntu-latest
- macOS-latest
Expand Down
35 changes: 18 additions & 17 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,32 @@
name = "KernelAbstractions"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
authors = ["Valentin Churavy <[email protected]> and contributors"]
version = "0.9.34"
version = "0.10.0-dev"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"

[weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[extensions]
EnzymeExt = "EnzymeCore"
LinearAlgebraExt = "LinearAlgebra"
SparseArraysExt = "SparseArrays"

[compat]
Adapt = "0.4, 1.0, 2.0, 3.0, 4"
Expand All @@ -24,23 +36,12 @@ InteractiveUtils = "1.6"
LinearAlgebra = "1.6"
MacroTools = "0.5"
PrecompileTools = "1"
Requires = "1.3"
SparseArrays = "<0.0.1, 1.6"
StaticArrays = "0.12, 1.0"
UUIDs = "<0.0.1, 1.6"
julia = "1.6"

[extensions]
EnzymeExt = "EnzymeCore"
LinearAlgebraExt = "LinearAlgebra"
SparseArraysExt = "SparseArrays"
julia = "1.10"

[extras]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
3 changes: 3 additions & 0 deletions cuda/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
52 changes: 24 additions & 28 deletions examples/histogram.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,32 +74,28 @@ function move(backend, input)
end

@testset "histogram tests" begin
if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend)
@test_skip false
else
rand_input = [rand(1:128) for i in 1:1000]
linear_input = [i for i in 1:1024]
all_two = [2 for i in 1:512]

histogram_rand_baseline = create_histogram(rand_input)
histogram_linear_baseline = create_histogram(linear_input)
histogram_two_baseline = create_histogram(all_two)

rand_input = move(backend, rand_input)
linear_input = move(backend, linear_input)
all_two = move(backend, all_two)

rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
two_histogram = KernelAbstractions.zeros(backend, Int, 2)

histogram!(rand_histogram, rand_input)
histogram!(linear_histogram, linear_input)
histogram!(two_histogram, all_two)
KernelAbstractions.synchronize(CPU())

@test isapprox(Array(rand_histogram), histogram_rand_baseline)
@test isapprox(Array(linear_histogram), histogram_linear_baseline)
@test isapprox(Array(two_histogram), histogram_two_baseline)
end
rand_input = [rand(1:128) for i in 1:1000]
linear_input = [i for i in 1:1024]
all_two = [2 for i in 1:512]

histogram_rand_baseline = create_histogram(rand_input)
histogram_linear_baseline = create_histogram(linear_input)
histogram_two_baseline = create_histogram(all_two)

rand_input = move(backend, rand_input)
linear_input = move(backend, linear_input)
all_two = move(backend, all_two)

rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
two_histogram = KernelAbstractions.zeros(backend, Int, 2)

histogram!(rand_histogram, rand_input)
histogram!(linear_histogram, linear_input)
histogram!(two_histogram, all_two)
KernelAbstractions.synchronize(CPU())

@test isapprox(Array(rand_histogram), histogram_rand_baseline)
@test isapprox(Array(linear_histogram), histogram_linear_baseline)
@test isapprox(Array(two_histogram), histogram_two_baseline)
end
3 changes: 1 addition & 2 deletions examples/naive_transpose.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ function naive_transpose!(a, b)
end
backend = get_backend(a)
@assert get_backend(b) == backend
groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024
kernel! = naive_transpose_kernel!(backend, groupsize)
kernel! = naive_transpose_kernel!(backend, 256)
kernel!(a, b, ndrange = size(a))
return
end
Expand Down
80 changes: 32 additions & 48 deletions src/KernelAbstractions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ synchronize(backend)
```
"""
macro kernel(expr)
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indices=# false)
return __kernel(expr, #=force_inbounds=# false, #=unsafe_indices=# false)
end

"""
Expand All @@ -66,18 +66,20 @@ This allows for two different configurations:

!!! warning
This is an experimental feature.

!!! note
The `cpu={true, false}` option is deprecated ahead of KernelAbstractions 1.0; it is still parsed but no longer has any effect.
"""
macro kernel(ex...)
if length(ex) == 1
return __kernel(ex[1], true, false, false)
return __kernel(ex[1], false, false)
else
generate_cpu = true
unsafe_indices = false
force_inbounds = false
for i in 1:(length(ex) - 1)
if ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :cpu && ex[i].args[2] isa Bool
generate_cpu = ex[i].args[2]
#deprecated
elseif ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
force_inbounds = ex[i].args[2]
Expand All @@ -94,7 +96,7 @@ macro kernel(ex...)
)
end
end
return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indices)
return __kernel(ex[end], force_inbounds, unsafe_indices)
end
end

Expand Down Expand Up @@ -190,6 +192,8 @@ After releasing the memory of an array, it should no longer be accessed.
"""
function unsafe_free! end

unsafe_free!(::AbstractArray) = return

###
# Kernel language
# - @localmem
Expand Down Expand Up @@ -254,6 +258,9 @@ For storage that only persists between `@synchronize` statements, an `MArray` ca
instead.

See also [`@uniform`](@ref).

!!! note
`@private` is deprecated for KernelAbstractions 1.0
"""
macro private(T, dims)
if dims isa Integer
Expand All @@ -269,6 +276,9 @@ end

Creates a private local of `mem` per item in the workgroup. This can be safely used
across [`@synchronize`](@ref) statements.

!!! note
`@private` is deprecated for KernelAbstractions 1.0
"""
macro private(expr)
return esc(expr)
Expand All @@ -279,6 +289,9 @@ end

`expr` is evaluated outside the workitem scope. This is useful for variable declarations
that span workitems, or are reused across `@synchronize` statements.

!!! note
`@uniform` is deprecated for KernelAbstractions 1.0
"""
macro uniform(value)
return esc(value)
Expand Down Expand Up @@ -330,6 +343,8 @@ Access the hidden context object used by KernelAbstractions.
!!! warning
Only valid when used from a kernel with `cpu=false`.

!!! note
`@context` will be supported on all backends in KernelAbstractions 1.0.

```
function f(@context, a)
I = @index(Global, Linear)
Expand Down Expand Up @@ -478,31 +493,11 @@ Abstract type for all GPU based KernelAbstractions backends.

!!! note
New backend implementations **must** sub-type this abstract type.
"""
abstract type GPU <: Backend end

"""
CPU(; static=false)

Instantiate a CPU (multi-threaded) backend.

## Options:
- `static`: Uses a static thread assignment, this can be beneficial for NUMA aware code.
Defaults to false.
"""
struct CPU <: Backend
static::Bool
CPU(; static::Bool = false) = new(static)
end

"""
isgpu(::Backend)::Bool

Returns true for all [`GPU`](@ref) backends.
!!! note
`GPU` will be removed in KernelAbstractions v1.0
"""
isgpu(::GPU) = true
isgpu(::CPU) = false

abstract type GPU <: Backend end

"""
get_backend(A::AbstractArray)::Backend
Expand All @@ -518,12 +513,9 @@ function get_backend end
# Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
get_backend(A::AbstractArray) = get_backend(parent(A))

get_backend(::Array) = CPU()

# Define:
# adapt_storage(::Backend, a::Array) = adapt(BackendArray, a)
# adapt_storage(::Backend, a::BackendArray) = a
Adapt.adapt_storage(::CPU, a::Array) = a

"""
allocate(::Backend, Type, dims...)::AbstractArray
Expand Down Expand Up @@ -743,7 +735,7 @@ Partition a kernel for the given ndrange and workgroupsize.
return iterspace, dynamic
end

function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName}
function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: GPU, S <: _Size, NDRange <: _Size, XPUName}
return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name)
end

Expand All @@ -760,6 +752,10 @@ include("compiler.jl")
function __workitems_iterspace end
function __validindex end

# for reflection
function mkcontext end
function launch_config end

include("macros.jl")

###
Expand Down Expand Up @@ -829,8 +825,11 @@ end
end

# CPU backend
include("pocl/pocl.jl")
using .POCL
export POCLBackend

include("cpu.jl")
const CPU = POCLBackend

# precompile
PrecompileTools.@compile_workload begin
Expand All @@ -844,19 +843,4 @@ PrecompileTools.@compile_workload begin
end
end

if !isdefined(Base, :get_extension)
using Requires
end

@static if !isdefined(Base, :get_extension)
function __init__()
@require EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" include("../ext/EnzymeExt.jl")
end
end

if !isdefined(Base, :get_extension)
include("../ext/LinearAlgebraExt.jl")
include("../ext/SparseArraysExt.jl")
end

end #module
1 change: 0 additions & 1 deletion src/cpu.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
unsafe_free!(::AbstractArray) = return
synchronize(::CPU) = nothing

allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)
Expand Down
Loading
Loading