@@ -31,116 +31,243 @@ function stirling_approx_tail(k)::Float32
31
31
end
32
32
33
33
34
- # BTRS algorithm, adapted from the tensorflow library
35
- # (github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
36
- function kernel_BTRS! (
37
- A, count, prob,
38
- R1, R2, Rp, Ra,
39
- count_dim_larger_than_prob_dim,
40
- seed:: UInt32
41
- )
42
- i = (blockIdx (). x - 1 ) * blockDim (). x + threadIdx (). x
34
+ # BTRS algorithm, adapted from the tensorflow library (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/random_binomial_op.cc)
43
35
44
- @inbounds Random. seed! (seed)
36
+ # # Kernel for scalar parameters
37
+ function kernel_BTRS_scalar! (A, n, p, seed:: UInt32 , counter:: UInt32 )
38
+ device_rng = Random. default_rng ()
45
39
46
- @inbounds if i <= length (A)
47
- I = Ra[i]
48
- Ip = Rp[I[1 ]]
49
- I1 = R1[Ip[1 ]]
50
- I2 = R2[Ip[2 ]]
40
+ # initialize the state
41
+ @inbounds Random. seed! (device_rng, seed, counter)
51
42
52
- if count_dim_larger_than_prob_dim
53
- n = count[CartesianIndex (I1, I2)]
54
- p = prob[I1]
55
- else
56
- n = count[I1]
57
- p = prob[CartesianIndex (I1, I2)]
58
- end
43
+ # grid-stride loop
44
+ tid = threadIdx (). x
45
+ window = (blockDim (). x - 1 i32) * gridDim (). x
46
+ offset = (blockIdx (). x - 1 i32) * blockDim (). x
47
+
48
+ k = 0
49
+ while offset < length (A)
50
+ i = tid + offset
59
51
60
52
# edge cases
61
53
if p <= 0 || n <= 0
62
- A[i] = 0
63
- return
54
+ k = 0
64
55
elseif p >= 1
65
- A[i] = n
66
- return
67
- end
68
-
56
+ k = n
69
57
# Use naive algorithm for n <= 17
70
- if n <= 17
58
+ elseif n <= 17
71
59
k = 0
72
60
ctr = 1
73
61
while ctr <= n
74
62
rand (Float32) < p && (k += 1 )
75
63
ctr += 1
76
64
end
77
- A[i] = k
78
- return
79
- end
80
-
81
65
# Use inversion algorithm for n*p < 10
82
- if n * p < 10f0
83
- logp = log (1f0 - p)
66
+ elseif n * p < 10f0
67
+ logp = CUDA . log (1f0 - p)
84
68
geom_sum = 0f0
85
- num_geom = 0
69
+ k = 0
86
70
while true
87
- geom = ceil (log (rand (Float32)) / logp)
71
+ geom = ceil (CUDA . log (rand (Float32)) / logp)
88
72
geom_sum += geom
89
73
geom_sum > n && break
90
- num_geom += 1
74
+ k += 1
75
+ end
76
+ # BTRS algorithm
77
+ else
78
+ # BTRS approximations work well for p <= 0.5
79
+ # invert p and set `invert` flag
80
+ (invert = p > 0.5f0 ) && (p = 1f0 - p)
81
+
82
+ r = p/ (1f0 - p)
83
+ s = p* (1f0 - p)
84
+
85
+ stddev = sqrt (n * s)
86
+ b = 1.15f0 + 2.53f0 * stddev
87
+ a = - 0.0873f0 + 0.0248f0 * b + 0.01f0 * p
88
+ c = n * p + 0.5f0
89
+ v_r = 0.92f0 - 4.2f0 / b
90
+
91
+ alpha = (2.83f0 + 5.1f0 / b) * stddev;
92
+ m = floor ((n + 1 ) * p)
93
+
94
+ ks = 0f0
95
+
96
+ while true
97
+ usample = rand (Float32) - 0.5f0
98
+ vsample = rand (Float32)
99
+
100
+ us = 0.5f0 - abs (usample)
101
+ ks = floor ((2 * a / us + b) * usample + c)
102
+
103
+ if us >= 0.07f0 && vsample <= v_r
104
+ break
105
+ end
106
+
107
+ if ks < 0 || ks > n
108
+ continue
109
+ end
110
+
111
+ v2 = CUDA. log (vsample * alpha / (a / (us * us) + b))
112
+ ub = (m + 0.5f0 ) * CUDA. log ((m + 1 ) / (r * (n - m + 1 ))) +
113
+ (n + 1 ) * CUDA. log ((n - m + 1 ) / (n - ks + 1 )) +
114
+ (ks + 0.5f0 ) * CUDA. log (r * (n - ks + 1 ) / (ks + 1 )) +
115
+ stirling_approx_tail (m) + stirling_approx_tail (n - m) - stirling_approx_tail (ks) - stirling_approx_tail (n - ks)
116
+ if v2 <= ub
117
+ break
118
+ end
91
119
end
92
- A[i] = num_geom
93
- return
120
+ invert && (ks = n - ks)
121
+ k = Int (ks)
94
122
end
95
123
96
- # BTRS algorithm
97
- # BTRS approximations work well for p <= 0.5
98
- (invert = p > 0.5f0 ) && (p = 1f0 - p)
124
+ if i <= length (A)
125
+ @inbounds A[i] = k
126
+ end
127
+ offset += window
128
+ end
129
+ return nothing
130
+ end
99
131
100
- r = p/ (1f0 - p)
101
- s = p* (1f0 - p)
102
132
103
- stddev = sqrt (n * s)
104
- b = 1.15f0 + 2.53f0 * stddev
105
- a = - 0.0873f0 + 0.0248f0 * b + 0.01f0 * p
106
- c = n * p + 0.5f0
107
- v_r = 0.92f0 - 4.2f0 / b
133
+ # # Kernel for arrays of parameters
134
+ function kernel_BTRS! (
135
+ A,
136
+ count, prob,
137
+ seed:: UInt32 , counter:: UInt32 ,
138
+ R1, R2, Rp, Ra,
139
+ count_dim_larger_than_prob_dim
140
+ )
141
+ device_rng = Random. default_rng ()
142
+
143
+ # initialize the state
144
+ @inbounds Random. seed! (device_rng, seed, counter)
108
145
109
- alpha = (2.83f0 + 5.1f0 / b) * stddev;
110
- m = floor ((n + 1 ) * p)
146
+ # grid-stride loop
147
+ tid = threadIdx (). x
148
+ window = (blockDim (). x - 1 i32) * gridDim (). x
149
+ offset = (blockIdx (). x - 1 i32) * blockDim (). x
111
150
112
- while true
113
- usample = rand (Float32) - 0.5f0
114
- vsample = rand (Float32)
151
+ while offset < length (A)
152
+ i = tid + offset
115
153
116
- us = 0.5f0 - abs (usample)
117
- ks = floor ((2 * a / us + b) * usample + c)
154
+ # PARAMETER LOOKUP
155
+ if i <= length (A)
156
+ @inbounds I = Ra[i]
157
+ @inbounds Ip = Rp[I[1 ]]
158
+ @inbounds I1 = R1[Ip[1 ]]
159
+ @inbounds I2 = R2[Ip[2 ]]
118
160
119
- if us >= 0.07f0 && vsample <= v_r
120
- break
161
+ if count_dim_larger_than_prob_dim
162
+ @inbounds n = count[CartesianIndex (I1, I2)]
163
+ @inbounds p = prob[I1]
164
+ else
165
+ @inbounds n = count[I1]
166
+ @inbounds p = prob[CartesianIndex (I1, I2)]
121
167
end
168
+ else
169
+ n = 0
170
+ p = 0
171
+ end
122
172
123
- if ks < 0 || ks > n
124
- continue
173
+ # SAMPLER
174
+ # edge cases
175
+ if p <= 0 || n <= 0
176
+ k = 0
177
+ elseif p >= 1
178
+ k = n
179
+ # Use naive algorithm for n <= 17
180
+ elseif n <= 17
181
+ k = 0
182
+ ctr = 1
183
+ while ctr <= n
184
+ rand (Float32) < p && (k += 1 )
185
+ ctr += 1
125
186
end
187
+ # Use inversion algorithm for n*p < 10
188
+ elseif n * p < 10f0
189
+ logp = CUDA. log (1f0 - p)
190
+ geom_sum = 0f0
191
+ k = 0
192
+ while true
193
+ geom = ceil (CUDA. log (rand (Float32)) / logp)
194
+ geom_sum += geom
195
+ geom_sum > n && break
196
+ k += 1
197
+ end
198
+ # BTRS algorithm
199
+ else
200
+ # BTRS approximations work well for p <= 0.5
201
+ # invert p and set `invert` flag
202
+ (invert = p > 0.5f0 ) && (p = 1f0 - p)
203
+
204
+ r = p/ (1f0 - p)
205
+ s = p* (1f0 - p)
206
+
207
+ stddev = sqrt (n * s)
208
+ b = 1.15f0 + 2.53f0 * stddev
209
+ a = - 0.0873f0 + 0.0248f0 * b + 0.01f0 * p
210
+ c = n * p + 0.5f0
211
+ v_r = 0.92f0 - 4.2f0 / b
212
+
213
+ alpha = (2.83f0 + 5.1f0 / b) * stddev;
214
+ m = floor ((n + 1 ) * p)
126
215
127
- v2 = log (vsample * alpha / (a / (us * us) + b))
128
- ub = (m + 0.5f0 ) * log ((m + 1 ) / (r * (n - m + 1 ))) +
129
- (n + 1 ) * log ((n - m + 1 ) / (n - ks + 1 )) +
130
- (ks + 0.5f0 ) * log (r * (n - ks + 1 ) / (ks + 1 )) +
131
- stirling_approx_tail (m) + stirling_approx_tail (n - m) -
132
- stirling_approx_tail (ks) - stirling_approx_tail (n - ks)
133
- if v2 <= ub
134
- break
216
+ ks = 0f0
217
+
218
+ while true
219
+ usample = rand (Float32) - 0.5f0
220
+ vsample = rand (Float32)
221
+
222
+ us = 0.5f0 - abs (usample)
223
+ ks = floor ((2 * a / us + b) * usample + c)
224
+
225
+ if us >= 0.07f0 && vsample <= v_r
226
+ break
227
+ end
228
+
229
+ if ks < 0 || ks > n
230
+ continue
231
+ end
232
+
233
+ v2 = CUDA. log (vsample * alpha / (a / (us * us) + b))
234
+ ub = (m + 0.5f0 ) * CUDA. log ((m + 1 ) / (r * (n - m + 1 ))) +
235
+ (n + 1 ) * CUDA. log ((n - m + 1 ) / (n - ks + 1 )) +
236
+ (ks + 0.5f0 ) * CUDA. log (r * (n - ks + 1 ) / (ks + 1 )) +
237
+ stirling_approx_tail (m) + stirling_approx_tail (n - m) - stirling_approx_tail (ks) - stirling_approx_tail (n - ks)
238
+ if v2 <= ub
239
+ break
240
+ end
135
241
end
242
+ invert && (ks = n - ks)
243
+ k = Int (ks)
136
244
end
137
245
138
- # if p = 1 - p[i] was used, undo inversion
139
- invert && (ks = n - ks)
140
- A[i] = Int (ks);
141
- nothing
142
- end # if
246
+ if i <= length (A)
247
+ @inbounds A[i] = k
248
+ end
249
+ offset += window
250
+ end
251
+ return nothing
252
+ end
253
+
254
+
255
+ # # old, unused kernels (for reference)
256
+
257
+ # naive algorithm, full
258
+ function kernel_naive_full! (A, count, prob, randstates)
259
+ index1 = (blockIdx (). x - 1 ) * blockDim (). x + threadIdx (). x
260
+ stride1 = blockDim (). x * gridDim (). x
261
+
262
+ @inbounds for i in index1: stride1: length (A)
263
+ A[i] = 0
264
+ for m in 1 : count[i]
265
+ @inbounds A[i] += GPUArrays. gpu_rand (Float32, CUDA. CuKernelContext (), randstates) < prob[i]
266
+ end
267
+ end
143
268
return
144
269
end
145
270
271
+
272
+
146
273
# # COV_EXCL_STOP
0 commit comments