Skip to content

Commit ed45e9b

Browse files
committed
Allow opt-out of implicit bounds-checking
KernelAbstractions currently creates kernels that look like: ``` if __validindex(ctx) # Body end ``` This is problematic due to the convergence requirement on `@synchronize`.
1 parent 31d5b44 commit ed45e9b

File tree

4 files changed

+116
-8
lines changed

4 files changed

+116
-8
lines changed

docs/src/index.md

+83
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,89 @@ Major refactor of KernelAbstractions. In particular:
3838
- Removal of the event system. Kernels are now implicitly ordered.
3939
- Removal of backend packages, backends are now directly provided by CUDA.jl and similar
4040

41+
#### 0.9.33
42+
Restricts the semantics of `@synchronize` to require convergent execution.
43+
The OpenCL backend had several miscompilations due to divergent execution of `@synchronize`.
44+
The `CPU` backend always had this limitation and upon investigation the CUDA backend similarly requires convergent execution,
45+
but allows for a wider set of valid kernels.
46+
47+
This highlighted a design flaw in KernelAbstractions. Most GPU implementations execute KernelAbstractions workgroups on static blocks.
48+
This means a kernel with `ndrange=(32, 30)` might be executed on a static block of `(32,32)`. In order to block these extra indices,
49+
KernelAbstractions would insert a dynamic boundscheck.
50+
51+
Prior to v0.9.33 a kernel like
52+
53+
```julia
54+
@kernel function localmem(A)
55+
N = @uniform prod(@groupsize())
56+
I = @index(Global, Linear)
57+
i = @index(Local, Linear)
58+
lmem = @localmem Int (N,) # Ok iff groupsize is static
59+
lmem[i] = i
60+
@synchronize
61+
A[I] = lmem[N - i + 1]
62+
end
63+
```
64+
65+
was lowered to GPU backends like this:
66+
67+
```julia
68+
function localmem_gpu(A)
69+
if __validindex(__ctx__)
70+
N = @uniform prod(@groupsize())
71+
I = @index(Global, Linear)
72+
i = @index(Local, Linear)
73+
lmem = @localmem Int (N,) # Ok iff groupsize is static
74+
lmem[i] = i
75+
@synchronize
76+
A[I] = lmem[N - i + 1]
77+
end
78+
end
79+
```
80+
81+
This would cause an implicit divergent execution of `@synchronize`.
82+
83+
With this release the lowering has been changed to:
84+
85+
```julia
86+
function localmem_gpu(A)
87+
    __valid_lane__ = __validindex(__ctx__)
88+
N = @uniform prod(@groupsize())
89+
lmem = @localmem Int (N,) # Ok iff groupsize is static
90+
if __valid_lane__
91+
I = @index(Global, Linear)
92+
i = @index(Local, Linear)
93+
lmem[i] = i
94+
end
95+
@synchronize
96+
if __valid_lane__
97+
A[I] = lmem[N - i + 1]
98+
end
99+
end
100+
```
101+
102+
Note that this follows the CPU lowering with respect to `@uniform`, `@private`, `@localmem` and `@synchronize`.
103+
104+
Since this transformation can be disruptive, users can now opt out of the implicit bounds-check,
105+
but users must avoid the use of `@index(Global)` and instead use their own derivation based on `@index(Group)` and `@index(Local)`.
106+
107+
```julia
108+
@kernel unsafe_indicies=false function localmem(A)
109+
N = @uniform prod(@groupsize())
110+
gI = @index(Group, Linear)
111+
i = @index(Local, Linear)
112+
lmem = @localmem Int (N,) # Ok iff groupsize is static
113+
if i <= N
114+
lmem[i] = i
115+
end
116+
I = (gI - 1) * N + i
117+
@synchronize
118+
if i <= N && I <= length(A)
119+
A[I] = lmem[N - i + 1]
120+
end
121+
end
122+
```
123+
41124
## Semantic differences
42125

43126
### To CUDA.jl/AMDGPU.jl

src/KernelAbstractions.jl

+9-3
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ synchronize(backend)
5050
```
5151
"""
5252
macro kernel(expr)
53-
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
53+
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indicies=# false)
5454
end
5555

5656
"""
@@ -60,6 +60,7 @@ This allows for two different configurations:
6060
6161
1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions.
6262
2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in the case the user is using too many `@inbounds` already in their kernel. Note that this can lead to incorrect results, crashes, etc and is fundamentally unsafe. Be careful!
63+
3. `unsafe_indicies={false, true}`: Disables the implicit validation of indices; users must avoid `@index(Global)`.
6364
6465
- [`@context`](@ref)
6566
@@ -68,9 +69,10 @@ This allows for two different configurations:
6869
"""
6970
macro kernel(ex...)
7071
if length(ex) == 1
71-
return __kernel(ex[1], true, false)
72+
return __kernel(ex[1], true, false, false)
7273
else
7374
generate_cpu = true
75+
unsafe_indicies = false
7476
force_inbounds = false
7577
for i in 1:(length(ex) - 1)
7678
if ex[i] isa Expr && ex[i].head == :(=) &&
@@ -79,16 +81,20 @@ macro kernel(ex...)
7981
elseif ex[i] isa Expr && ex[i].head == :(=) &&
8082
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
8183
force_inbounds = ex[i].args[2]
84+
elseif ex[i] isa Expr && ex[i].head == :(=) &&
85+
ex[i].args[1] == :unsafe_indicies && ex[i].args[2] isa Bool
86+
unsafe_indicies = ex[i].args[2]
8287
else
8388
error(
8489
"Configuration should be of form:\n" *
8590
"* `cpu=true`\n" *
8691
"* `inbounds=false`\n" *
92+
"* `unsafe_indicies=false`\n" *
8793
"got `", ex[i], "`",
8894
)
8995
end
9096
end
91-
return __kernel(ex[end], generate_cpu, force_inbounds)
97+
return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indicies)
9298
end
9399
end
94100

src/macros.jl

+8-4
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ function find_return(stmt)
1010
end
1111

1212
# XXX: Proper errors
13-
function __kernel(expr, generate_cpu = true, force_inbounds = false)
13+
function __kernel(expr, generate_cpu = true, force_inbounds = false, unsafe_indicies = true)
1414
def = splitdef(expr)
1515
name = def[:name]
1616
args = def[:args]
@@ -46,7 +46,7 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false)
4646

4747
def_gpu = deepcopy(def)
4848
def_gpu[:name] = gpu_name = Symbol(:gpu_, name)
49-
transform_gpu!(def_gpu, constargs, force_inbounds)
49+
transform_gpu!(def_gpu, constargs, force_inbounds, unsafe_indicies)
5050
gpu_function = combinedef(def_gpu)
5151

5252
# create constructor functions
@@ -78,7 +78,7 @@ end
7878

7979
# The easy case, transform the function for GPU execution
8080
# - mark constant arguments by applying `constify`.
81-
function transform_gpu!(def, constargs, force_inbounds)
81+
function transform_gpu!(def, constargs, force_inbounds, unsafe_indicies)
8282
let_constargs = Expr[]
8383
for (i, arg) in enumerate(def[:args])
8484
if constargs[i]
@@ -94,7 +94,11 @@ function transform_gpu!(def, constargs, force_inbounds)
9494
if force_inbounds
9595
push!(new_stmts, Expr(:inbounds, true))
9696
end
97-
append!(new_stmts, split(emit_gpu, body.args))
97+
if unsafe_indicies
98+
append!(new_stmts, split(emit_gpu, body.args))
99+
else
100+
push!(new_stmts, body)
101+
end
98102
if force_inbounds
99103
push!(new_stmts, Expr(:inbounds, :pop))
100104
end

test/localmem.jl

+16-1
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,24 @@ end
3434
end
3535
end
3636

37+
@kernel unsafe_indicies = false function localmem_unsafe_indicies(A)
38+
N = @uniform prod(@groupsize())
39+
gI = @index(Group, Linear)
40+
i = @index(Local, Linear)
41+
lmem = @localmem Int (N,) # Ok iff groupsize is static
42+
if i <= N
43+
lmem[i] = i
44+
end
45+
I = (gI - 1) * N + i
46+
@synchronize
47+
if i <= N && I <= length(A)
48+
A[I] = lmem[N - i + 1]
49+
end
50+
end
51+
3752
function localmem_testsuite(backend, ArrayT)
3853
@testset "kernels" begin
39-
@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16))
54+
@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indicies(backend(), 16))
4055
A = ArrayT{Int}(undef, 64)
4156
kernel!(A, ndrange = size(A))
4257
synchronize(backend())

0 commit comments

Comments
 (0)