Fix CUDA fallback shmem gather reduction
author Szilárd Páll <pall.szilard@gmail.com>
Fri, 15 Feb 2019 17:34:52 +0000 (18:34 +0100)
committer Berk Hess <hess@kth.se>
Wed, 24 Apr 2019 15:04:28 +0000 (17:04 +0200)
Fixed some shmem races and an intra-warp WAR hazard.

Change-Id: Ic02874f147e1c3a0dad1b67a6464ede453f77c47

src/gromacs/ewald/pme_gather.cu

index e9775a5dd0b379f502ade3dadfda43e034b53c60..6c29ebcf2156670a5ef779da86e697220a7c079e 100644 (file)
@@ -139,7 +139,7 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
         // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
         // which are stored separately (first 2 dimensions only)
         const int         smemPerDim   = warp_size;
-        const int         smemReserved = (DIM - 1) * smemPerDim;
+        const int         smemReserved = (DIM) *smemPerDim;
         __shared__ float  sm_forceReduction[smemReserved + blockSize];
         __shared__ float *sm_forceTemp[DIM];
 
@@ -152,6 +152,8 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
             int elementIndex = smemReserved + lineIndex;
             // Store input force contributions
             sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
+            // sync here because two warps write data that the first one consumes below
+            __syncthreads();
             // Reduce to fit into smemPerDim (warp size)
 #pragma unroll
             for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
@@ -170,10 +172,9 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
                 const int packedIndex = atomIndexLocal * redStride + splineIndex;
                 sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
             }
+            __syncthreads();
         }
 
-        __syncthreads();
-
         assert ((blockSize / warp_size) >= DIM);
         //assert (atomsPerBlock <= warp_size);
 
@@ -193,9 +194,11 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
                 }
             }
 
-            const float n = read_grid_size(realGridSizeFP, dimIndex);
+            gmx_syncwarp();
 
+            const float n         = read_grid_size(realGridSizeFP, dimIndex);
             const int   atomIndex = sourceIndex / minStride;
+
             if (sourceIndex == minStride * atomIndex)
             {
                 *((float *)(&sm_forces[atomIndex]) + dimIndex) = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;