Add missing synchronization calls in CUDA version of LINCS

author Artem Zhmurov <zhmurov@gmail.com>

Tue, 5 Oct 2021 15:24:29 +0000 (18:24 +0300)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 6 Oct 2021 09:58:16 +0000 (09:58 +0000)
author Artem Zhmurov <zhmurov@gmail.com>
Tue, 5 Oct 2021 15:24:29 +0000 (18:24 +0300)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 6 Oct 2021 09:58:16 +0000 (09:58 +0000)
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu

index ccd9e0a6a7c20041e860be1757cfb74b6c9003c7..487aa8c01ee490bc6ee9c52e83c5c2db71412b18 100644 (file)
--- a/src/gromacs/mdlib/lincs_gpu_internal.cu
+++ b/src/gromacs/mdlib/lincs_gpu_internal.cu
@@ -200,6 +200,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
       *  Inverse matrix using a set of expansionOrder matrix multiplications
       */
  
       *  Inverse matrix using a set of expansionOrder matrix multiplications
       */
  
+    // Make sure that we don't overwrite the sm_r[..] array.
+    __syncthreads();
+
      // This will use the same memory space as sm_r, which is no longer needed.
      extern __shared__ float sm_rhs[];
      // Save current right-hand-side vector in the shared memory
      // This will use the same memory space as sm_r, which is no longer needed.
      extern __shared__ float sm_rhs[];
      // Save current right-hand-side vector in the shared memory
@@ -333,7 +336,8 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
          // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
          // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
          // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
          // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
          // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
          // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
-        // two are no longer in use.
+        // two are no longer in use, which we make sure by waiting for all threads in block.
+        __syncthreads();
          extern __shared__ float sm_threadVirial[];
          float                   mult                  = targetLength * lagrangeScaled;
          sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
          extern __shared__ float sm_threadVirial[];
          float                   mult                  = targetLength * lagrangeScaled;
          sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
author	Artem Zhmurov <zhmurov@gmail.com>
	Tue, 5 Oct 2021 15:24:29 +0000 (18:24 +0300)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 6 Oct 2021 09:58:16 +0000 (09:58 +0000)