Skip to content

Commit 99a7d7a

Browse files
authored
(#313) AINT module fixes.
Fixed AINT bug due to race conditions in GPU; this successfully addresses the issue with GeForce 10xx series.
1 parent d25da49 commit 99a7d7a

File tree

4 files changed

+21
-25
lines changed

4 files changed

+21
-25
lines changed

g2g/analytic_integral/cuda/kernels/coulomb_energy.h

+4-5
Original file line numberDiff line numberDiff line change
@@ -144,24 +144,23 @@ __global__ void gpu_coulomb_fock(
144144

145145
prefactor_mo = (double)(cc * PI52 * ovlap) / zeta;
146146
}
147-
__shared__ uint term_start[3];
147+
uint term_start[3];
148148
term_start[0] = 0;
149149
term_start[1] = p_offset;
150150
term_start[2] = d_offset;
151-
__shared__ uint term_end[3];
151+
uint term_end[3];
152152
term_end[0] = s_end;
153153
term_end[1] = p_end;
154154
term_end[2] = d_end;
155-
__shared__ uint inner_stop[3];
155+
uint inner_stop[3];
156156
inner_stop[0] = QMMM_BLOCK_SIZE;
157157
inner_stop[1] = 126;
158158
inner_stop[2] = 126;
159-
__shared__ uint inner_step[3];
159+
uint inner_step[3];
160160
inner_step[0] = 1;
161161
inner_step[1] = 3;
162162
inner_step[2] = 6;
163163

164-
#pragma unroll 3
165164
for (int func_type = 0; func_type < 3; func_type++) {
166165
//
167166
// Outer loop: read in block of MM atom information into shared memory

g2g/analytic_integral/cuda/kernels/coulomb_fit.h

+4-5
Original file line numberDiff line numberDiff line change
@@ -187,25 +187,24 @@ __global__ void gpu_coulomb_fit1(uint num_terms,
187187

188188
prefactor_mo = (double)(cc * PI52 * ovlap) / zeta;
189189
}
190-
__shared__ uint term_start[3];
190+
uint term_start[3];
191191
term_start[0] = 0;
192192
term_start[1] = p_offset;
193193
term_start[2] = d_offset;
194-
__shared__ uint term_end[3];
194+
uint term_end[3];
195195
term_end[0] = s_end;
196196
term_end[1] = p_end;
197197
term_end[2] = d_end;
198-
__shared__ uint inner_stop[3];
198+
uint inner_stop[3];
199199
inner_stop[0] = QMMM_BLOCK_SIZE;
200200
inner_stop[1] = 126;
201201
inner_stop[2] = 126;
202-
__shared__ uint inner_step[3];
202+
uint inner_step[3];
203203
inner_step[0] = 1;
204204
inner_step[1] = 3;
205205
inner_step[2] = 6;
206206

207207
uint rc_ind = 0;
208-
#pragma unroll 3
209208
for (int func_type = 0; func_type < 3; func_type++) {
210209
//
211210
// Outer loop: read in block of MM atom information into shared memory

g2g/analytic_integral/cuda/kernels/coulomb_forces.h

+4-5
Original file line numberDiff line numberDiff line change
@@ -210,24 +210,23 @@ __global__ void gpu_coulomb_forces(
210210

211211
prefactor_mo = (double)(cc * PI52 * ovlap) / zeta;
212212
}
213-
__shared__ uint term_start[3];
213+
uint term_start[3];
214214
term_start[0] = 0;
215215
term_start[1] = p_offset;
216216
term_start[2] = d_offset;
217-
__shared__ uint term_end[3];
217+
uint term_end[3];
218218
term_end[0] = s_end;
219219
term_end[1] = p_end;
220220
term_end[2] = d_end;
221-
__shared__ uint inner_stop[3];
221+
uint inner_stop[3];
222222
inner_stop[0] = QMMM_BLOCK_SIZE;
223223
inner_stop[1] = 126;
224224
inner_stop[2] = 126;
225-
__shared__ uint inner_step[3];
225+
uint inner_step[3];
226226
inner_step[0] = 1;
227227
inner_step[1] = 3;
228228
inner_step[2] = 6;
229229

230-
#pragma unroll 3
231230
for (int func_type = 0; func_type < 3; func_type++) {
232231
//
233232
// Outer loop: read in block of MM atom information into shared memory

g2g/analytic_integral/os_cutoff.cpp

+9-10
Original file line numberDiff line numberDiff line change
@@ -187,22 +187,21 @@ void OSIntegral<scalar_type>::new_cutoff(void) {
187187
}
188188
}
189189
// Pad the input arrays so the next term type has an aligned offset
190-
for (j = 0; j < QMMM_BLOCK_SIZE -
191-
(term_type_counts[current_term_type] % QMMM_BLOCK_SIZE);
192-
j++) {
193-
this->func_code.push_back(
194-
func_code[term_type_offsets[current_term_type]]); // Use the first
195-
// code from this
196-
// term type
197-
this->local_dens.push_back(
198-
local_dens[term_type_offsets[current_term_type]]);
199-
}
200190
if (term_type_counts[current_term_type] > 0) {
191+
for (j = 0; j < QMMM_BLOCK_SIZE - (term_type_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
192+
this->func_code.push_back(func_code[term_type_offsets[current_term_type]]);
193+
// Use the first code from this term type
194+
this->local_dens.push_back(local_dens[term_type_offsets[current_term_type]]);
195+
}
201196
for (j = 0; j < QMMM_BLOCK_SIZE - (dens_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
202197
this->dens_values.push_back(dens_values[dens_offsets[current_term_type]]);
203198
this->local2globaldens.push_back(local2globaldens[dens_offsets[current_term_type]]);
204199
}
205200
} else {
201+
for (j = 0; j < QMMM_BLOCK_SIZE - (dens_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
202+
this->func_code.push_back(0);
203+
this->local_dens.push_back(0);
204+
}
206205
for (j = 0; j < QMMM_BLOCK_SIZE - (dens_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
207206
this->dens_values.push_back(0.0f);
208207
this->local2globaldens.push_back(0);

0 commit comments

Comments
 (0)