(#313) AINT module fixes.

fedepedron · web-flow · commit 99a7d7a3b9ff · 2019-09-19T16:49:15.000-03:00
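Fixed an AINT bug caused by race conditions on the GPU: the term_start, term_end, inner_stop and inner_step arrays in the Coulomb kernels are no longer declared __shared__. This resolves the issue seen on GeForce 10xx series cards.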
diff --git a/g2g/analytic_integral/cuda/kernels/coulomb_energy.h b/g2g/analytic_integral/cuda/kernels/coulomb_energy.h
@@ -144,24 +144,23 @@ __global__ void gpu_coulomb_fock(
 
       prefactor_mo = (double)(cc * PI52 * ovlap) / zeta;
     }
-    __shared__ uint term_start[3];
+    uint term_start[3];
     term_start[0] = 0;
     term_start[1] = p_offset;
     term_start[2] = d_offset;
-    __shared__ uint term_end[3];
+    uint term_end[3];
     term_end[0] = s_end;
     term_end[1] = p_end;
     term_end[2] = d_end;
-    __shared__ uint inner_stop[3];
+    uint inner_stop[3];
     inner_stop[0] = QMMM_BLOCK_SIZE;
     inner_stop[1] = 126;
     inner_stop[2] = 126;
-    __shared__ uint inner_step[3];
+    uint inner_step[3];
     inner_step[0] = 1;
     inner_step[1] = 3;
     inner_step[2] = 6;
 
-#pragma unroll 3
     for (int func_type = 0; func_type < 3; func_type++) {
       //
       // Outer loop: read in block of MM atom information into shared memory
diff --git a/g2g/analytic_integral/cuda/kernels/coulomb_fit.h b/g2g/analytic_integral/cuda/kernels/coulomb_fit.h
@@ -187,25 +187,24 @@ __global__ void gpu_coulomb_fit1(uint num_terms,
 
       prefactor_mo = (double)(cc * PI52 * ovlap) / zeta;
     }
-    __shared__ uint term_start[3];
+    uint term_start[3];
     term_start[0] = 0;
     term_start[1] = p_offset;
     term_start[2] = d_offset;
-    __shared__ uint term_end[3];
+    uint term_end[3];
     term_end[0] = s_end;
     term_end[1] = p_end;
     term_end[2] = d_end;
-    __shared__ uint inner_stop[3];
+    uint inner_stop[3];
     inner_stop[0] = QMMM_BLOCK_SIZE;
     inner_stop[1] = 126;
     inner_stop[2] = 126;
-    __shared__ uint inner_step[3];
+    uint inner_step[3];
     inner_step[0] = 1;
     inner_step[1] = 3;
     inner_step[2] = 6;
 
     uint rc_ind = 0;
-#pragma unroll 3
     for (int func_type = 0; func_type < 3; func_type++) {
       //
       // Outer loop: read in block of MM atom information into shared memory
diff --git a/g2g/analytic_integral/cuda/kernels/coulomb_forces.h b/g2g/analytic_integral/cuda/kernels/coulomb_forces.h
@@ -210,24 +210,23 @@ __global__ void gpu_coulomb_forces(
 
       prefactor_mo = (double)(cc * PI52 * ovlap) / zeta;
     }
-    __shared__ uint term_start[3];
+    uint term_start[3];
     term_start[0] = 0;
     term_start[1] = p_offset;
     term_start[2] = d_offset;
-    __shared__ uint term_end[3];
+    uint term_end[3];
     term_end[0] = s_end;
     term_end[1] = p_end;
     term_end[2] = d_end;
-    __shared__ uint inner_stop[3];
+    uint inner_stop[3];
     inner_stop[0] = QMMM_BLOCK_SIZE;
     inner_stop[1] = 126;
     inner_stop[2] = 126;
-    __shared__ uint inner_step[3];
+    uint inner_step[3];
     inner_step[0] = 1;
     inner_step[1] = 3;
     inner_step[2] = 6;
 
-#pragma unroll 3
     for (int func_type = 0; func_type < 3; func_type++) {
       //
       // Outer loop: read in block of MM atom information into shared memory
diff --git a/g2g/analytic_integral/os_cutoff.cpp b/g2g/analytic_integral/os_cutoff.cpp
@@ -187,22 +187,21 @@ void OSIntegral<scalar_type>::new_cutoff(void) {
       }
     }
     // Pad the input arrays so the next term type has an aligned offset
-    for (j = 0; j < QMMM_BLOCK_SIZE -
-                        (term_type_counts[current_term_type] % QMMM_BLOCK_SIZE);
-         j++) {
-      this->func_code.push_back(
-          func_code[term_type_offsets[current_term_type]]);  // Use the first
-                                                             // code from this
-                                                             // term type
-      this->local_dens.push_back(
-          local_dens[term_type_offsets[current_term_type]]);
-    }
     if (term_type_counts[current_term_type] > 0) {
+      for (j = 0; j < QMMM_BLOCK_SIZE - (term_type_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
+        this->func_code.push_back(func_code[term_type_offsets[current_term_type]]);  
+        // Use the first code from this term type
+        this->local_dens.push_back(local_dens[term_type_offsets[current_term_type]]);
+      }
       for (j = 0; j < QMMM_BLOCK_SIZE - (dens_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
         this->dens_values.push_back(dens_values[dens_offsets[current_term_type]]);
         this->local2globaldens.push_back(local2globaldens[dens_offsets[current_term_type]]);
       }
     } else {
+      for (j = 0; j < QMMM_BLOCK_SIZE - (dens_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
+        this->func_code.push_back(0);
+        this->local_dens.push_back(0);
+      }
       for (j = 0; j < QMMM_BLOCK_SIZE - (dens_counts[current_term_type] % QMMM_BLOCK_SIZE); j++) {
         this->dens_values.push_back(0.0f);
         this->local2globaldens.push_back(0);