From 0d1964b9178df6ef34725062b2f656616fccc31e Mon Sep 17 00:00:00 2001 From: James Schloss Date: Sat, 14 Dec 2024 18:16:05 +0100 Subject: [PATCH] fastmath demo --- src/KernelAbstractions.jl | 10 ++++++++-- src/macros.jl | 16 +++++++++++----- test/runtests.jl | 12 +++++++++++- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 55d5d465..eb5181bd 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -50,7 +50,7 @@ synchronize(backend) ``` """ macro kernel(expr) - __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false) + __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=force_fastmath=# false) end """ @@ -60,6 +60,7 @@ This allows for two different configurations: 1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions. 2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in the case the user is using too many `@inbounds` already in their kernel. Note that this can lead to incorrect results, crashes, etc and is fundamentally unsafe. Be careful! +3. `fastmath={false, true}`: Enables a forced `@fastmath` macro around the function definition. This relaxes strict IEEE floating-point semantics (for example, less precise square roots and flushed denormals) and can change numerical results. Be careful! - [`@context`](@ref) @@ -72,6 +73,7 @@ macro kernel(ex...) else generate_cpu = true force_inbounds = false + force_fastmath = false for i in 1:(length(ex) - 1) if ex[i] isa Expr && ex[i].head == :(=) && ex[i].args[1] == :cpu && ex[i].args[2] isa Bool @@ -79,16 +81,20 @@ macro kernel(ex...) 
elseif ex[i] isa Expr && ex[i].head == :(=) && ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool force_inbounds = ex[i].args[2] + elseif ex[i] isa Expr && ex[i].head == :(=) && + ex[i].args[1] == :fastmath && ex[i].args[2] isa Bool + force_fastmath = ex[i].args[2] else error( "Configuration should be of form:\n" * "* `cpu=true`\n" * "* `inbounds=false`\n" * + "* `fastmath=false`\n" * "got `", ex[i], "`", ) end end - __kernel(ex[end], generate_cpu, force_inbounds) + __kernel(ex[end], generate_cpu, force_inbounds, force_fastmath) end end diff --git a/src/macros.jl b/src/macros.jl index a511758d..15894579 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -10,7 +10,7 @@ function find_return(stmt) end # XXX: Proper errors -function __kernel(expr, generate_cpu = true, force_inbounds = false) +function __kernel(expr, generate_cpu = true, force_inbounds = false, force_fastmath = false) def = splitdef(expr) name = def[:name] args = def[:args] @@ -40,13 +40,13 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) if generate_cpu def_cpu = deepcopy(def) def_cpu[:name] = cpu_name - transform_cpu!(def_cpu, constargs, force_inbounds) + transform_cpu!(def_cpu, constargs, force_inbounds, force_fastmath) cpu_function = combinedef(def_cpu) end def_gpu = deepcopy(def) def_gpu[:name] = gpu_name = Symbol(:gpu_, name) - transform_gpu!(def_gpu, constargs, force_inbounds) + transform_gpu!(def_gpu, constargs, force_inbounds, force_fastmath) gpu_function = combinedef(def_gpu) # create constructor functions @@ -78,7 +78,7 @@ end # The easy case, transform the function for GPU execution # - mark constant arguments by applying `constify`. 
-function transform_gpu!(def, constargs, force_inbounds) +function transform_gpu!(def, constargs, force_inbounds, force_fastmath) let_constargs = Expr[] for (i, arg) in enumerate(def[:args]) if constargs[i] @@ -92,6 +92,11 @@ function transform_gpu!(def, constargs, force_inbounds) @inbounds $(body) end end + if force_fastmath + body = quote + @fastmath $(body) + end + end body = quote if $__validindex(__ctx__) $(body) @@ -112,7 +117,7 @@ end # - handle indicies # - hoist workgroup definitions # - hoist uniform variables -function transform_cpu!(def, constargs, force_inbounds) +function transform_cpu!(def, constargs, force_inbounds, force_fastmath) let_constargs = Expr[] for (i, arg) in enumerate(def[:args]) if constargs[i] @@ -130,6 +135,7 @@ function transform_cpu!(def, constargs, force_inbounds) if force_inbounds push!(new_stmts, Expr(:inbounds, :pop)) end + push!(new_stmts, Expr(:popaliasscope)) push!(new_stmts, :(return nothing)) def[:body] = Expr( diff --git a/test/runtests.jl b/test/runtests.jl index f992afad..8cf3fe45 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -22,7 +22,7 @@ end @test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel(CPU()) # testing multiple configurations at the same time -@kernel cpu = false inbounds = false function my_no_cpu_kernel2(a) +@kernel cpu = false inbounds = false fastmath = false function my_no_cpu_kernel2(a) end @test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel2(CPU()) @@ -43,6 +43,16 @@ if Base.JLOptions().check_bounds == 0 || Base.JLOptions().check_bounds == 2 @test nothing == my_inbounds_kernel(CPU())(Int[], ndrange = 1) end +if Base.JLOptions().fast_math == 0 + @kernel fastmath = true function my_fastmath_kernel(a) + idx = @index(Global, Linear) + a[idx] = sqrt(10) + end + A = [0.0] + my_fastmath_kernel(CPU())(A, ndrange = 1) + @test A[1] == @fastmath sqrt(10) +end + struct NewBackend <: KernelAbstractions.GPU end @testset "Default 
host implementation" begin backend = NewBackend()