Compute reference indices in GPU LINCS upon data preparation
authorArtem Zhmurov <zhmurov@gmail.com>
Tue, 8 Jun 2021 11:48:22 +0000 (14:48 +0300)
committerPaul Bauer <paul.bauer.q@gmail.com>
Wed, 9 Jun 2021 11:20:29 +0000 (11:20 +0000)
The reference indices are used to map into shared memory location.
The start of the block is then subtracted from the index in map
to get the location in shared memory that should be used. This can
be done when the structure is populated to reduce the amount of
computations in the GPU kernel.

Refs #3350

src/gromacs/mdlib/lincs_gpu.cpp
src/gromacs/mdlib/lincs_gpu_internal.cu

index 86211b17cda06e9abe7d9d3b5578c9aec73850aa..a5b656bc26b7978ce9d241753bcac42dacdefc99 100644 (file)
@@ -475,8 +475,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                 int index = kernelParams_.numConstraintsThreads
                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
                             + splitMap.at(c1);
+                int threadBlockStarts = splitMap.at(c1) - splitMap.at(c1) % c_threadsPerBlock;
 
-                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
+                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2) - threadBlockStarts;
 
                 int center = c1a1;
 
@@ -502,8 +503,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                 int index = kernelParams_.numConstraintsThreads
                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
                             + splitMap.at(c1);
+                int threadBlockStarts = splitMap.at(c1) - splitMap.at(c1) % c_threadsPerBlock;
 
-                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
+                coupledConstraintsIndicesHost.at(index) = splitMap.at(c2) - threadBlockStarts;
 
                 int center = c1a2;
 
index 15e3a288df980b151bd806bc8fa478a8d1904c1d..c19b1831f105f0bc29f5ddb3eccd51e2d38c87cd 100644 (file)
@@ -101,11 +101,11 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     const AtomPair* __restrict__ gm_constraints           = kernelParams.d_constraints;
     const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
     const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
-    const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices;
-    const float* __restrict__ gm_massFactors           = kernelParams.d_massFactors;
-    float* __restrict__ gm_matrixA                     = kernelParams.d_matrixA;
-    const float* __restrict__ gm_inverseMasses         = kernelParams.d_inverseMasses;
-    float* __restrict__ gm_virialScaled                = kernelParams.d_virialScaled;
+    const int* __restrict__ gm_coupledConstraintsIndices = kernelParams.d_coupledConstraintsIndices;
+    const float* __restrict__ gm_massFactors             = kernelParams.d_massFactors;
+    float* __restrict__ gm_matrixA                       = kernelParams.d_matrixA;
+    const float* __restrict__ gm_inverseMasses           = kernelParams.d_inverseMasses;
+    float* __restrict__ gm_virialScaled                  = kernelParams.d_virialScaled;
 
     const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -179,9 +179,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     for (int n = 0; n < coupledConstraintsCount; n++)
     {
         int index = n * numConstraintsThreads + threadIndex;
-        int c1    = gm_coupledConstraintsIdxes[index];
+        int c1    = gm_coupledConstraintsIndices[index];
 
-        float3 rc1        = sm_r[c1 - blockIdx.x * blockDim.x];
+        float3 rc1        = sm_r[c1];
         gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
     }
 
@@ -214,10 +214,10 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         for (int n = 0; n < coupledConstraintsCount; n++)
         {
             int index = n * numConstraintsThreads + threadIndex;
-            int c1    = gm_coupledConstraintsIdxes[index];
+            int c1    = gm_coupledConstraintsIndices[index];
             // Convolute current right-hand-side with A
             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
-            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
+            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
         }
         // 'Switch' rhs vectors, save current result
         // These values will be accessed in the loop above during the next iteration.
@@ -285,9 +285,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
             for (int n = 0; n < coupledConstraintsCount; n++)
             {
                 int index = n * numConstraintsThreads + threadIndex;
-                int c1    = gm_coupledConstraintsIdxes[index];
+                int c1    = gm_coupledConstraintsIndices[index];
 
-                mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
+                mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
             }
             sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
             sol                                                = sol + mvb;