From: Artem Zhmurov
Date: Tue, 8 Jun 2021 11:48:22 +0000 (+0300)
Subject: Compute reference indices in GPU LINCS upon data preparation
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=0adce6e0acc8b75b9c84dde5f3672b3bda3c09bf;p=alexxy%2Fgromacs.git

Compute reference indices in GPU LINCS upon data preparation

The reference indices are used to map coupled constraints into shared-memory
locations: the start of the thread block is subtracted from the mapped index
to get the location in shared memory that should be used. This subtraction
can be done once, when the data structure is populated, which reduces the
amount of computation in the GPU kernel.

Refs #3350
---
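Not part of the patch: a minimal host-side sketch of the index preparation
described above, assuming a block size of 256. The helper name
blockRelativeIndex and the std::vector type of splitMap are hypothetical;
only the c_threadsPerBlock constant and the arithmetic mirror the change below.

    #include <vector>

    constexpr int c_threadsPerBlock = 256; // assumed block size

    // Store the coupled-constraint index relative to the start of c1's
    // thread block, so the kernel can use it as a shared-memory offset
    // without subtracting blockIdx.x * blockDim.x on every access.
    int blockRelativeIndex(const std::vector<int>& splitMap, int c1, int c2)
    {
        int mappedC1         = splitMap.at(c1);                         // c1's position in the thread grid
        int threadBlockStart = mappedC1 - mappedC1 % c_threadsPerBlock; // first thread of c1's block
        return splitMap.at(c2) - threadBlockStart;                      // block-relative index of c2
    }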
diff --git a/src/gromacs/mdlib/lincs_gpu.cpp b/src/gromacs/mdlib/lincs_gpu.cpp
index 86211b17cd..a5b656bc26 100644
--- a/src/gromacs/mdlib/lincs_gpu.cpp
+++ b/src/gromacs/mdlib/lincs_gpu.cpp
@@ -475,8 +475,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int index = kernelParams_.numConstraintsThreads * coupledConstraintsCountsHost.at(splitMap.at(c1))
                                 + splitMap.at(c1);
+                    int threadBlockStarts = splitMap.at(c1) - splitMap.at(c1) % c_threadsPerBlock;
 
-                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
+                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2) - threadBlockStarts;
 
                     int center = c1a1;
@@ -502,8 +503,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int index = kernelParams_.numConstraintsThreads * coupledConstraintsCountsHost.at(splitMap.at(c1))
                                 + splitMap.at(c1);
+                    int threadBlockStarts = splitMap.at(c1) - splitMap.at(c1) % c_threadsPerBlock;
 
-                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
+                    coupledConstraintsIndicesHost.at(index) = splitMap.at(c2) - threadBlockStarts;
 
                     int center = c1a2;
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu
index 15e3a288df..c19b1831f1 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.cu
+++ b/src/gromacs/mdlib/lincs_gpu_internal.cu
@@ -101,11 +101,11 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     const AtomPair* __restrict__ gm_constraints           = kernelParams.d_constraints;
     const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
     const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
-    const int* __restrict__ gm_coupledConstraintsIdxes    = kernelParams.d_coupledConstraintsIndices;
-    const float* __restrict__ gm_massFactors              = kernelParams.d_massFactors;
-    float* __restrict__ gm_matrixA                        = kernelParams.d_matrixA;
-    const float* __restrict__ gm_inverseMasses            = kernelParams.d_inverseMasses;
-    float* __restrict__ gm_virialScaled                   = kernelParams.d_virialScaled;
+    const int* __restrict__ gm_coupledConstraintsIndices  = kernelParams.d_coupledConstraintsIndices;
+    const float* __restrict__ gm_massFactors              = kernelParams.d_massFactors;
+    float* __restrict__ gm_matrixA                        = kernelParams.d_matrixA;
+    const float* __restrict__ gm_inverseMasses            = kernelParams.d_inverseMasses;
+    float* __restrict__ gm_virialScaled                   = kernelParams.d_virialScaled;
 
     const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -179,9 +179,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         for (int n = 0; n < coupledConstraintsCount; n++)
         {
             int index = n * numConstraintsThreads + threadIndex;
-            int c1    = gm_coupledConstraintsIdxes[index];
+            int c1    = gm_coupledConstraintsIndices[index];
 
-            float3 rc1 = sm_r[c1 - blockIdx.x * blockDim.x];
+            float3 rc1 = sm_r[c1];
             gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
         }
 
@@ -214,10 +214,10 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         for (int n = 0; n < coupledConstraintsCount; n++)
        {
             int index = n * numConstraintsThreads + threadIndex;
-            int c1    = gm_coupledConstraintsIdxes[index];
+            int c1    = gm_coupledConstraintsIndices[index];
             // Convolute current right-hand-side with A
             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
-            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
+            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
         }
         // 'Switch' rhs vectors, save current result
         // These values will be accessed in the loop above during the next iteration.
@@ -285,9 +285,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         for (int n = 0; n < coupledConstraintsCount; n++)
         {
             int index = n * numConstraintsThreads + threadIndex;
-            int c1    = gm_coupledConstraintsIdxes[index];
+            int c1    = gm_coupledConstraintsIndices[index];
 
-            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
+            mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
         }
         sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
         sol = sol + mvb;
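For context, a self-contained device-side sketch of the access pattern this
change enables. The kernel below is a hypothetical stand-in (gatherCoupled,
gm_coupledIndices, gm_coupledCounts, gm_values, and gm_out are invented
names), not the GROMACS LINCS kernel; it assumes, as the LINCS splitting
guarantees by construction, that coupled entries land in the same thread
block, and that it is launched with blocks of c_threadsPerBlock threads.

    #include <cuda_runtime.h>

    constexpr int c_threadsPerBlock = 256; // must match the launch configuration

    // gm_coupledIndices holds block-relative indices prepared on the host,
    // so shared memory is addressed directly in the inner loop, without the
    // per-iteration 'blockIdx.x * blockDim.x' subtraction the old code did.
    __global__ void gatherCoupled(const int* gm_coupledIndices,
                                  const int* gm_coupledCounts,
                                  const float* gm_values,
                                  float* gm_out,
                                  int numThreads)
    {
        __shared__ float sm_values[c_threadsPerBlock];

        int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
        // Stage this block's values into shared memory.
        sm_values[threadIdx.x] = (threadIndex < numThreads) ? gm_values[threadIndex] : 0.0F;
        __syncthreads();

        if (threadIndex < numThreads)
        {
            float sum = 0.0F;
            for (int n = 0; n < gm_coupledCounts[threadIndex]; n++)
            {
                int c1 = gm_coupledIndices[n * numThreads + threadIndex]; // already block-relative
                sum += sm_values[c1]; // no subtraction needed here
            }
            gm_out[threadIndex] = sum;
        }
    }

A launch of the form gatherCoupled<<<(numThreads + c_threadsPerBlock - 1) /
c_threadsPerBlock, c_threadsPerBlock>>>(...) keeps the precomputed
block-relative indices valid.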