From: Artem Zhmurov
Date: Tue, 8 Jun 2021 11:48:22 +0000 (+0300)
Subject: Compute reference indices in GPU LINCS upon data preparation
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=0adce6e0acc8b75b9c84dde5f3672b3bda3c09bf;p=alexxy%2Fgromacs.git

Compute reference indices in GPU LINCS upon data preparation

The reference indices are used to map coupled constraints into shared-memory
locations: the start of the thread block is subtracted from the mapped index
to get the location in shared memory that should be used. This subtraction
can be done once, when the data structure is populated, which reduces the
amount of computation in the GPU kernel.

Refs #3350
---
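Not part of the patch: a minimal host-side sketch of the index preparation
described above, assuming a block size of 256. The helper name
blockRelativeIndex and the std::vector type of splitMap are hypothetical;
only the c_threadsPerBlock constant and the arithmetic mirror the change below.

    #include <vector>

    constexpr int c_threadsPerBlock = 256; // assumed block size

    // Store the coupled-constraint index relative to the start of c1's
    // thread block, so the kernel can use it as a shared-memory offset
    // without subtracting blockIdx.x * blockDim.x on every access.
    int blockRelativeIndex(const std::vector<int>& splitMap, int c1, int c2)
    {
        int mappedC1         = splitMap.at(c1);                         // c1's position in the thread grid
        int threadBlockStart = mappedC1 - mappedC1 % c_threadsPerBlock; // first thread of c1's block
        return splitMap.at(c2) - threadBlockStart;                      // block-relative index of c2
    }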
diff --git a/src/gromacs/mdlib/lincs_gpu.cpp b/src/gromacs/mdlib/lincs_gpu.cpp
index 86211b17cd..a5b656bc26 100644
--- a/src/gromacs/mdlib/lincs_gpu.cpp
+++ b/src/gromacs/mdlib/lincs_gpu.cpp
@@ -475,8 +475,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int index = kernelParams_.numConstraintsThreads * coupledConstraintsCountsHost.at(splitMap.at(c1))
                                 + splitMap.at(c1);
+                    int threadBlockStarts = splitMap.at(c1) - splitMap.at(c1) % c_threadsPerBlock;
 
-                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
+                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2) - threadBlockStarts;
 
                     int center = c1a1;
@@ -502,8 +503,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int index = kernelParams_.numConstraintsThreads * coupledConstraintsCountsHost.at(splitMap.at(c1))
                                 + splitMap.at(c1);
+                    int threadBlockStarts = splitMap.at(c1) - splitMap.at(c1) % c_threadsPerBlock;
 
-                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
+                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2) - threadBlockStarts;
 
                     int center = c1a2;
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu
index 15e3a288df..c19b1831f1 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.cu
+++ b/src/gromacs/mdlib/lincs_gpu_internal.cu
@@ -101,11 +101,11 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     const AtomPair* __restrict__ gm_constraints           = kernelParams.d_constraints;
     const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
     const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
-    const int* __restrict__ gm_coupledConstraintsIdxes    = kernelParams.d_coupledConstraintsIndices;
-    const float* __restrict__ gm_massFactors              = kernelParams.d_massFactors;
-    float* __restrict__ gm_matrixA                        = kernelParams.d_matrixA;
-    const float* __restrict__ gm_inverseMasses            = kernelParams.d_inverseMasses;
-    float* __restrict__ gm_virialScaled                   = kernelParams.d_virialScaled;
+    const int* __restrict__ gm_coupledConstraintsIndices  = kernelParams.d_coupledConstraintsIndices;
+    const float* __restrict__ gm_massFactors              = kernelParams.d_massFactors;
+    float* __restrict__ gm_matrixA                        = kernelParams.d_matrixA;
+    const float* __restrict__ gm_inverseMasses            = kernelParams.d_inverseMasses;
+    float* __restrict__ gm_virialScaled                   = kernelParams.d_virialScaled;
 
     const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -179,9 +179,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         for (int n = 0; n < coupledConstraintsCount; n++)
         {
             int index = n * numConstraintsThreads + threadIndex;
-            int c1    = gm_coupledConstraintsIdxes[index];
+            int c1    = gm_coupledConstraintsIndices[index];
 
-            float3 rc1 = sm_r[c1 - blockIdx.x * blockDim.x];
+            float3 rc1 = sm_r[c1];
             gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
         }
 
@@ -214,10 +214,10 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         for (int n = 0; n < coupledConstraintsCount; n++)
        {
             int index = n * numConstraintsThreads + threadIndex;
-            int c1    = gm_coupledConstraintsIdxes[index];
+            int c1    = gm_coupledConstraintsIndices[index];
             // Convolute current right-hand-side with A
             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
-            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
+            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
         }
         // 'Switch' rhs vectors, save current result
         // These values will be accessed in the loop above during the next iteration.
@@ -285,9 +285,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         for (int n = 0; n < coupledConstraintsCount; n++)
         {
             int index = n * numConstraintsThreads + threadIndex;
-            int c1    = gm_coupledConstraintsIdxes[index];
+            int c1    = gm_coupledConstraintsIndices[index];
 
-            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
+            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
         }
         sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
         sol = sol + mvb;
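For context, a self-contained device-side sketch of the access pattern this
change enables. The kernel below is a hypothetical stand-in (gatherCoupled,
gm_coupledIndices, gm_coupledCounts, gm_values, and gm_out are invented
names), not the GROMACS LINCS kernel; it assumes, as the LINCS splitting
guarantees by construction, that coupled entries land in the same thread
block, and that it is launched with blocks of c_threadsPerBlock threads.

    #include <cuda_runtime.h>

    constexpr int c_threadsPerBlock = 256; // must match the launch configuration

    // gm_coupledIndices holds block-relative indices prepared on the host,
    // so shared memory is addressed directly in the inner loop, without the
    // per-iteration 'blockIdx.x * blockDim.x' subtraction the old code did.
    __global__ void gatherCoupled(const int* gm_coupledIndices,
                                  const int* gm_coupledCounts,
                                  const float* gm_values,
                                  float* gm_out,
                                  int numThreads)
    {
        __shared__ float sm_values[c_threadsPerBlock];

        int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
        // Stage this block's values into shared memory.
        sm_values[threadIdx.x] = (threadIndex < numThreads) ? gm_values[threadIndex] : 0.0F;
        __syncthreads();

        if (threadIndex < numThreads)
        {
            float sum = 0.0F;
            for (int n = 0; n < gm_coupledCounts[threadIndex]; n++)
            {
                int c1 = gm_coupledIndices[n * numThreads + threadIndex]; // already block-relative
                sum += sm_values[c1]; // no subtraction needed here
            }
            gm_out[threadIndex] = sum;
        }
    }

A launch of the form gatherCoupled<<<(numThreads + c_threadsPerBlock - 1) /
c_threadsPerBlock, c_threadsPerBlock>>>(...) keeps the precomputed
block-relative indices valid.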