Skip to content

Commit bdfebce

Browse files
authored
Merge pull request #18 from JuliaGPU/newrand
Change to native `rand`
2 parents 7244289 + 601589d commit bdfebce

File tree

5 files changed

+293
-138
lines changed

5 files changed

+293
-138
lines changed

.buildkite/pipeline.yml

+19-8
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,36 @@
11
steps:
22
# Julia versions
33

4-
- label: "Julia 1.6, CUDA 11.2"
4+
- label: "Julia 1.6"
55
plugins:
66
- JuliaCI/julia#v1:
7-
version: 1.6-nightly
7+
version: 1.6
88
- JuliaCI/julia-test#v1:
9-
test_args: "--thorough"
9+
test_args: "--quickfail"
1010
- JuliaCI/julia-coverage#v1:
1111
codecov: true
1212
dirs:
1313
- src
1414
agents:
1515
queue: "juliagpu"
16-
cuda: "11.2"
17-
cap: "recent"
18-
env:
19-
JULIA_CUDA_VERSION: '11.2'
20-
JULIA_CUDA_USE_BINARYBUILDER: 'true'
16+
cuda: "*"
2117
if: build.message !~ /\[skip tests\]/
2218
timeout_in_minutes: 120
19+
20+
- label: "Julia 1.7"
21+
plugins:
22+
- JuliaCI/julia#v1:
23+
version: 1.7
24+
- JuliaCI/julia-test#v1: ~
25+
- JuliaCI/julia-coverage#v1:
26+
codecov: true
27+
dirs:
28+
- src
29+
agents:
30+
queue: "juliagpu"
31+
cuda: "*"
32+
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
33+
timeout_in_minutes: 120
2334

2435
env:
2536
JULIA_PKG_SERVER: "" # it often struggles with our large artifacts

Project.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
99
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1010

1111
[compat]
12-
BenchmarkTools = "0.6, 0.7, 1.0"
13-
CUDA = "2, 3.0"
12+
BenchmarkTools = "1"
13+
CUDA = "3"
1414
julia = "1.6"
1515

1616
[extras]

src/BinomialGPU.jl

+3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ module BinomialGPU
33
using CUDA
44
using Random
55

6+
using CUDA: cuda_rng, i32
7+
8+
69
# user-level API
710
include("rand_binomial.jl")
811
export rand_binomial!

src/kernels.jl

+201-74
Original file line numberDiff line numberDiff line change
@@ -31,116 +31,243 @@ function stirling_approx_tail(k)::Float32
3131
end
3232

3333

34-
# BTRS algorithm, adapted from the tensorflow library
35-
# (github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
36-
function kernel_BTRS!(
37-
A, count, prob,
38-
R1, R2, Rp, Ra,
39-
count_dim_larger_than_prob_dim,
40-
seed::UInt32
41-
)
42-
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
34+
# BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
4335

44-
@inbounds Random.seed!(seed)
36+
## Kernel for scalar parameters
37+
function kernel_BTRS_scalar!(A, n, p, seed::UInt32, counter::UInt32)
38+
device_rng = Random.default_rng()
4539

46-
@inbounds if i <= length(A)
47-
I = Ra[i]
48-
Ip = Rp[I[1]]
49-
I1 = R1[Ip[1]]
50-
I2 = R2[Ip[2]]
40+
# initialize the state
41+
@inbounds Random.seed!(device_rng, seed, counter)
5142

52-
if count_dim_larger_than_prob_dim
53-
n = count[CartesianIndex(I1, I2)]
54-
p = prob[I1]
55-
else
56-
n = count[I1]
57-
p = prob[CartesianIndex(I1, I2)]
58-
end
43+
# grid-stride loop
44+
tid = threadIdx().x
45+
window = (blockDim().x - 1i32) * gridDim().x
46+
offset = (blockIdx().x - 1i32) * blockDim().x
47+
48+
k = 0
49+
while offset < length(A)
50+
i = tid + offset
5951

6052
# edge cases
6153
if p <= 0 || n <= 0
62-
A[i] = 0
63-
return
54+
k = 0
6455
elseif p >= 1
65-
A[i] = n
66-
return
67-
end
68-
56+
k = n
6957
# Use naive algorithm for n <= 17
70-
if n <= 17
58+
elseif n <= 17
7159
k = 0
7260
ctr = 1
7361
while ctr <= n
7462
rand(Float32) < p && (k += 1)
7563
ctr += 1
7664
end
77-
A[i] = k
78-
return
79-
end
80-
8165
# Use inversion algorithm for n*p < 10
82-
if n * p < 10f0
83-
logp = log(1f0-p)
66+
elseif n * p < 10f0
67+
logp = CUDA.log(1f0-p)
8468
geom_sum = 0f0
85-
num_geom = 0
69+
k = 0
8670
while true
87-
geom = ceil(log(rand(Float32)) / logp)
71+
geom = ceil(CUDA.log(rand(Float32)) / logp)
8872
geom_sum += geom
8973
geom_sum > n && break
90-
num_geom += 1
74+
k += 1
75+
end
76+
# BTRS algorithm
77+
else
78+
# BTRS approximations work well for p <= 0.5
79+
# invert p and set `invert` flag
80+
(invert = p > 0.5f0) && (p = 1f0 - p)
81+
82+
r = p/(1f0-p)
83+
s = p*(1f0-p)
84+
85+
stddev = sqrt(n * s)
86+
b = 1.15f0 + 2.53f0 * stddev
87+
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
88+
c = n * p + 0.5f0
89+
v_r = 0.92f0 - 4.2f0 / b
90+
91+
alpha = (2.83f0 + 5.1f0 / b) * stddev;
92+
m = floor((n + 1) * p)
93+
94+
ks = 0f0
95+
96+
while true
97+
usample = rand(Float32) - 0.5f0
98+
vsample = rand(Float32)
99+
100+
us = 0.5f0 - abs(usample)
101+
ks = floor((2 * a / us + b) * usample + c)
102+
103+
if us >= 0.07f0 && vsample <= v_r
104+
break
105+
end
106+
107+
if ks < 0 || ks > n
108+
continue
109+
end
110+
111+
v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
112+
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
113+
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
114+
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
115+
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
116+
if v2 <= ub
117+
break
118+
end
91119
end
92-
A[i] = num_geom
93-
return
120+
invert && (ks = n - ks)
121+
k = Int(ks)
94122
end
95123

96-
# BTRS algorithm
97-
# BTRS approximations work well for p <= 0.5
98-
(invert = p > 0.5f0) && (p = 1f0 - p)
124+
if i <= length(A)
125+
@inbounds A[i] = k
126+
end
127+
offset += window
128+
end
129+
return nothing
130+
end
99131

100-
r = p/(1f0-p)
101-
s = p*(1f0-p)
102132

103-
stddev = sqrt(n * s)
104-
b = 1.15f0 + 2.53f0 * stddev
105-
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
106-
c = n * p + 0.5f0
107-
v_r = 0.92f0 - 4.2f0 / b
133+
## Kernel for arrays of parameters
134+
function kernel_BTRS!(
135+
A,
136+
count, prob,
137+
seed::UInt32, counter::UInt32,
138+
R1, R2, Rp, Ra,
139+
count_dim_larger_than_prob_dim
140+
)
141+
device_rng = Random.default_rng()
142+
143+
# initialize the state
144+
@inbounds Random.seed!(device_rng, seed, counter)
108145

109-
alpha = (2.83f0 + 5.1f0 / b) * stddev;
110-
m = floor((n + 1) * p)
146+
# grid-stride loop
147+
tid = threadIdx().x
148+
window = (blockDim().x - 1i32) * gridDim().x
149+
offset = (blockIdx().x - 1i32) * blockDim().x
111150

112-
while true
113-
usample = rand(Float32) - 0.5f0
114-
vsample = rand(Float32)
151+
while offset < length(A)
152+
i = tid + offset
115153

116-
us = 0.5f0 - abs(usample)
117-
ks = floor((2 * a / us + b) * usample + c)
154+
# PARAMETER LOOKUP
155+
if i <= length(A)
156+
@inbounds I = Ra[i]
157+
@inbounds Ip = Rp[I[1]]
158+
@inbounds I1 = R1[Ip[1]]
159+
@inbounds I2 = R2[Ip[2]]
118160

119-
if us >= 0.07f0 && vsample <= v_r
120-
break
161+
if count_dim_larger_than_prob_dim
162+
@inbounds n = count[CartesianIndex(I1, I2)]
163+
@inbounds p = prob[I1]
164+
else
165+
@inbounds n = count[I1]
166+
@inbounds p = prob[CartesianIndex(I1, I2)]
121167
end
168+
else
169+
n = 0
170+
p = 0
171+
end
122172

123-
if ks < 0 || ks > n
124-
continue
173+
# SAMPLER
174+
# edge cases
175+
if p <= 0 || n <= 0
176+
k = 0
177+
elseif p >= 1
178+
k = n
179+
# Use naive algorithm for n <= 17
180+
elseif n <= 17
181+
k = 0
182+
ctr = 1
183+
while ctr <= n
184+
rand(Float32) < p && (k += 1)
185+
ctr += 1
125186
end
187+
# Use inversion algorithm for n*p < 10
188+
elseif n * p < 10f0
189+
logp = CUDA.log(1f0-p)
190+
geom_sum = 0f0
191+
k = 0
192+
while true
193+
geom = ceil(CUDA.log(rand(Float32)) / logp)
194+
geom_sum += geom
195+
geom_sum > n && break
196+
k += 1
197+
end
198+
# BTRS algorithm
199+
else
200+
# BTRS approximations work well for p <= 0.5
201+
# invert p and set `invert` flag
202+
(invert = p > 0.5f0) && (p = 1f0 - p)
203+
204+
r = p/(1f0-p)
205+
s = p*(1f0-p)
206+
207+
stddev = sqrt(n * s)
208+
b = 1.15f0 + 2.53f0 * stddev
209+
a = -0.0873f0 + 0.0248f0 * b + 0.01f0 * p
210+
c = n * p + 0.5f0
211+
v_r = 0.92f0 - 4.2f0 / b
212+
213+
alpha = (2.83f0 + 5.1f0 / b) * stddev;
214+
m = floor((n + 1) * p)
126215

127-
v2 = log(vsample * alpha / (a / (us * us) + b))
128-
ub = (m + 0.5f0) * log((m + 1) / (r * (n - m + 1))) +
129-
(n + 1) * log((n - m + 1) / (n - ks + 1)) +
130-
(ks + 0.5f0) * log(r * (n - ks + 1) / (ks + 1)) +
131-
stirling_approx_tail(m) + stirling_approx_tail(n - m) -
132-
stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
133-
if v2 <= ub
134-
break
216+
ks = 0f0
217+
218+
while true
219+
usample = rand(Float32) - 0.5f0
220+
vsample = rand(Float32)
221+
222+
us = 0.5f0 - abs(usample)
223+
ks = floor((2 * a / us + b) * usample + c)
224+
225+
if us >= 0.07f0 && vsample <= v_r
226+
break
227+
end
228+
229+
if ks < 0 || ks > n
230+
continue
231+
end
232+
233+
v2 = CUDA.log(vsample * alpha / (a / (us * us) + b))
234+
ub = (m + 0.5f0) * CUDA.log((m + 1) / (r * (n - m + 1))) +
235+
(n + 1) * CUDA.log((n - m + 1) / (n - ks + 1)) +
236+
(ks + 0.5f0) * CUDA.log(r * (n - ks + 1) / (ks + 1)) +
237+
stirling_approx_tail(m) + stirling_approx_tail(n - m) - stirling_approx_tail(ks) - stirling_approx_tail(n - ks)
238+
if v2 <= ub
239+
break
240+
end
135241
end
242+
invert && (ks = n - ks)
243+
k = Int(ks)
136244
end
137245

138-
# if p = 1 - p[i] was used, undo inversion
139-
invert && (ks = n - ks)
140-
A[i] = Int(ks);
141-
nothing
142-
end#if
246+
if i <= length(A)
247+
@inbounds A[i] = k
248+
end
249+
offset += window
250+
end
251+
return nothing
252+
end
253+
254+
255+
## old, unused kernels (for reference)
256+
257+
#naive algorithm, full
258+
function kernel_naive_full!(A, count, prob, randstates)
259+
index1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x
260+
stride1 = blockDim().x * gridDim().x
261+
262+
@inbounds for i in index1:stride1:length(A)
263+
A[i] = 0
264+
for m in 1:count[i]
265+
@inbounds A[i] += GPUArrays.gpu_rand(Float32, CUDA.CuKernelContext(), randstates) < prob[i]
266+
end
267+
end
143268
return
144269
end
145270

271+
272+
146273
## COV_EXCL_STOP

0 commit comments

Comments
 (0)