From 7ca97ff90154335724729778337fceb772c62a9f Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Tue, 5 Oct 2021 18:24:29 +0300
Subject: [PATCH] Add missing synchronization calls in CUDA version of LINCS

In CUDA, the same shared memory buffer is used to store different
intermediate values. To avoid overwriting these values prematurely,
extra blocking synchronizations are needed in the GPU kernel.

This bug was not exposed neither by regression tests nor by unit tests
because it was affecting values rarely and the deviation were within
tolerances.
---
 src/gromacs/mdlib/lincs_gpu_internal.cu | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu
index ccd9e0a6a7..487aa8c01e 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.cu
+++ b/src/gromacs/mdlib/lincs_gpu_internal.cu
@@ -200,6 +200,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
      *  Inverse matrix using a set of expansionOrder matrix multiplications
      */
 
+    // Make sure that we don't overwrite the sm_r[..] array.
+    __syncthreads();
+
     // This will use the same memory space as sm_r, which is no longer needed.
     extern __shared__ float sm_rhs[];
     // Save current right-hand-side vector in the shared memory
@@ -333,7 +336,8 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
         // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
         // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
-        // two are no longer in use.
+        // two are no longer in use, which we make sure by waiting for all threads in block.
+        __syncthreads();
         extern __shared__ float sm_threadVirial[];
         float                   mult                  = targetLength * lagrangeScaled;
         sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
-- 
2.22.0