Merge commit d30f2cb6 from release-2020 into master
[alexxy/gromacs.git] / src / gromacs / nbnxm / cuda / nbnxm_buffer_ops_kernels.cuh
index a535842e7ef5f28240308329cb5937acad8305cd..efb713c093b8f1c414b70867594fe11d20e90401 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
  *  - rename kernel so naming matches with the other NBNXM kernels;
  *  - enable separate compilation unit
 
- * \param[in]     numColumns          extent of cell-level parallelism
- * \param[out]    gm_coordinatesNbnxm coordinates buffer in nbnxm layout
- * \param[in]     setFillerCoords     tells whether to set the coordinates of the filler particles
- * \param[in]     gm_coordinatesRvec  coordinates buffer in rvec format
- * \param[in]     gm_atomIndex        atom index mapping
- * \param[in]     gm_numAtoms         array of number of atoms
- * \param[in]     gm_cellIndex        array of cell indices
- * \param[in]     cellOffset          first cell
- * \param[in]     numAtomsPerCell     number of atoms per cell
+ * \param[in]     numColumns          Extent of cell-level parallelism.
+ * \param[out]    gm_xq               Coordinates buffer in nbnxm layout.
+ * \param[in]     setFillerCoords     Whether to set the coordinates of the filler particles.
+ * \param[in]     gm_x                Coordinates buffer.
+ * \param[in]     gm_atomIndex        Atom index mapping.
+ * \param[in]     gm_numAtoms         Array of number of atoms.
+ * \param[in]     gm_cellIndex        Array of cell indices.
+ * \param[in]     cellOffset          First cell.
+ * \param[in]     numAtomsPerCell     Number of atoms per cell.
  */
-__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,
-                                             float* __restrict__ gm_coordinatesNbnxm,
-                                             bool setFillerCoords,
-                                             const rvec* __restrict__ gm_coordinatesRvec,
-                                             const int* __restrict__ gm_atomIndex,
-                                             const int* __restrict__ gm_numAtoms,
-                                             const int* __restrict__ gm_cellIndex,
-                                             int cellOffset,
-                                             int numAtomsPerCell);
-
-
-__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,
-                                             float* __restrict__ gm_coordinatesNbnxm,
-                                             bool setFillerCoords,
-                                             const rvec* __restrict__ gm_coordinatesRvec,
-                                             const int* __restrict__ gm_atomIndex,
-                                             const int* __restrict__ gm_numAtoms,
-                                             const int* __restrict__ gm_cellIndex,
-                                             int cellOffset,
-                                             int numAtomsPerCell)
+static __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,
+                                                    float4* __restrict__ gm_xq,
+                                                    bool setFillerCoords,
+                                                    const float3* __restrict__ gm_x,
+                                                    const int* __restrict__ gm_atomIndex,
+                                                    const int* __restrict__ gm_numAtoms,
+                                                    const int* __restrict__ gm_cellIndex,
+                                                    int cellOffset,
+                                                    int numAtomsPerCell)
 {
 
 
     const float farAway = -1000000.0f;
 
-    /* map cell-level parallelism to y component of CUDA block index */
+    // Map cell-level parallelism to y component of CUDA block index.
     int cxy = blockIdx.y;
 
     if (cxy < numColumns)
     {
 
-        int na = gm_numAtoms[cxy];
-        int a0 = (cellOffset + gm_cellIndex[cxy]) * numAtomsPerCell;
-        int na_round;
+        const int numAtoms = gm_numAtoms[cxy];
+        const int offset   = (cellOffset + gm_cellIndex[cxy]) * numAtomsPerCell;
+        int       numAtomsRounded;
         if (setFillerCoords)
         {
             // TODO: This can be done more efficiently
-            na_round = (gm_cellIndex[cxy + 1] - gm_cellIndex[cxy]) * numAtomsPerCell;
+            numAtomsRounded = (gm_cellIndex[cxy + 1] - gm_cellIndex[cxy]) * numAtomsPerCell;
         }
         else
         {
-            /* We fill only the real particle locations.
-             * We assume the filling entries at the end have been
-             * properly set before during pair-list generation.
-             */
-            na_round = na;
+            // We fill only the real particle locations.
+            // We assume the filler entries at the end have already been
+            // set properly during pair-list generation.
+            numAtomsRounded = numAtoms;
         }
 
-        /* map parallelism within a cell to x component of CUDA block index linearized
-         * with threads within a block */
-        int i, j0;
-        i = blockIdx.x * blockDim.x + threadIdx.x;
+        const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 
-        j0 = a0 * STRIDE_XYZQ;
+        // Destination address where x should be stored in nbnxm layout. We use this cast here to
+        // save only x, y and z components, not touching the w (q) component, which is pre-defined.
+        float3* gm_xqDest = (float3*)&gm_xq[threadIndex + offset];
 
-        // destination address where x shoud be stored in nbnxm layout
-        float3* gm_coordinatesDest = (float3*)&gm_coordinatesNbnxm[j0 + 4 * i];
-
-        /* perform conversion of each element */
-        if (i < na_round)
+        // Perform layout conversion of each element.
+        if (threadIndex < numAtomsRounded)
         {
-            if (i < na)
+            if (threadIndex < numAtoms)
             {
-                *gm_coordinatesDest = *((float3*)gm_coordinatesRvec[gm_atomIndex[a0 + i]]);
+                *gm_xqDest = gm_x[gm_atomIndex[threadIndex + offset]];
             }
             else
             {
-                *gm_coordinatesDest = make_float3(farAway);
+                *gm_xqDest = make_float3(farAway);
             }
         }
     }
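
(Aside, not part of the diff.) A minimal launch sketch for the coordinate-conversion kernel above, assuming a hypothetical helper launchXToNbatXKernel, an illustrative 128-thread block, and a caller-supplied maxAtomsPerColumn bound; the y grid dimension indexes the columns and the x dimension covers atoms within one column, matching how the kernel reads blockIdx:

static void launchXToNbatXKernel(int           numColumns,
                                 int           maxAtomsPerColumn, // illustrative per-column upper bound
                                 float4*       gm_xq,
                                 const float3* gm_x,
                                 const int*    gm_atomIndex,
                                 const int*    gm_numAtoms,
                                 const int*    gm_cellIndex,
                                 int           cellOffset,
                                 int           numAtomsPerCell,
                                 cudaStream_t  stream)
{
    const int threadsPerBlock = 128; // assumed block size, not taken from the diff
    dim3      blockSize(threadsPerBlock, 1, 1);
    // blockIdx.x covers atoms within a column, blockIdx.y covers the columns.
    dim3 gridSize((maxAtomsPerColumn + threadsPerBlock - 1) / threadsPerBlock, numColumns, 1);
    nbnxn_gpu_x_to_nbat_x_kernel<<<gridSize, blockSize, 0, stream>>>(
            numColumns, gm_xq, /* setFillerCoords */ false, gm_x, gm_atomIndex, gm_numAtoms,
            gm_cellIndex, cellOffset, numAtomsPerCell);
}
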
@@ -140,54 +124,47 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,
  * \tparam        accumulateForce  If the initial forces in \p gm_fTotal should be saved.
  * \tparam        addPmeForce      Whether the PME force should be added to the total.
  *
- * \param[in]     gm_forcesNbnxm   Non-bonded forces in nbnxm format.
- * \param[in]     gm_forcesPme     PME forces.
- * \param[in,out] gm_forcesTotal   Force buffer to be reduced into.
- * \param[in]     cell             Cell index mapping.
- * \param[in]     atomStart        Start atom index.
- * \param[in]     numAtoms         Number of atoms.
+ * \param[in]     gm_fNB     Non-bonded forces in nbnxm format.
+ * \param[in]     gm_fPme    PME forces.
+ * \param[in,out] gm_fTotal  Force buffer to be reduced into.
+ * \param[in]     cell       Cell index mapping.
+ * \param[in]     atomStart  Start atom index.
+ * \param[in]     numAtoms   Number of atoms.
  */
 template<bool accumulateForce, bool addPmeForce>
-__global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3* __restrict__ gb_forcesNbnxm,
-                                                 const float3* __restrict__ gm_forcesPme,
-                                                 float3* gm_forcesTotal,
-                                                 const int* __restrict__ gm_cell,
-                                                 const int atomStart,
-                                                 const int numAtoms);
-template<bool accumulateForce, bool addPmeForce>
-__global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3* __restrict__ gb_forcesNbnxm,
-                                                 const float3* __restrict__ gm_forcesPme,
-                                                 float3* gm_forcesTotal,
-                                                 const int* __restrict__ gm_cell,
-                                                 const int atomStart,
-                                                 const int numAtoms)
+static __global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3* __restrict__ gm_fNB,
+                                                        const float3* __restrict__ gm_fPme,
+                                                        float3* gm_fTotal,
+                                                        const int* __restrict__ gm_cell,
+                                                        const int atomStart,
+                                                        const int numAtoms)
 {
 
     /* map particle-level parallelism to 1D CUDA thread and block index */
-    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 
     /* perform addition for each particle*/
     if (threadIndex < numAtoms)
     {
 
-        int     i             = gm_cell[atomStart + threadIndex];
-        float3* gm_forcesDest = (float3*)&gm_forcesTotal[atomStart + threadIndex];
-        float3  temp;
+        const int i        = gm_cell[atomStart + threadIndex];
+        float3*   gm_fDest = &gm_fTotal[atomStart + threadIndex];
+        float3    temp;
 
         if (accumulateForce)
         {
-            temp = *gm_forcesDest;
-            temp += gb_forcesNbnxm[i];
+            temp = *gm_fDest;
+            temp += gm_fNB[i];
         }
         else
         {
-            temp = gb_forcesNbnxm[i];
+            temp = gm_fNB[i];
         }
         if (addPmeForce)
         {
-            temp += gm_forcesPme[atomStart + threadIndex];
+            temp += gm_fPme[atomStart + threadIndex];
         }
-        *gm_forcesDest = temp;
+        *gm_fDest = temp;
     }
     return;
 }
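
(Aside, not part of the diff.) A minimal dispatch sketch for the reduction kernel, assuming a hypothetical helper launchAddNbatFToFKernel and an illustrative 128-thread block; the two runtime flags are mapped onto the compile-time template parameters so each instantiation compiles out the branches it does not need:

static void launchAddNbatFToFKernel(const float3* gm_fNB,
                                    const float3* gm_fPme,
                                    float3*       gm_fTotal,
                                    const int*    gm_cell,
                                    int           atomStart,
                                    int           numAtoms,
                                    bool          accumulateForce,
                                    bool          addPmeForce,
                                    cudaStream_t  stream)
{
    const int threadsPerBlock = 128; // assumed block size, not taken from the diff
    dim3      blockSize(threadsPerBlock, 1, 1);
    dim3      gridSize((numAtoms + threadsPerBlock - 1) / threadsPerBlock, 1, 1);

    // Map the runtime flags onto the compile-time template parameters.
    if (accumulateForce)
    {
        if (addPmeForce)
        {
            nbnxn_gpu_add_nbat_f_to_f_kernel<true, true><<<gridSize, blockSize, 0, stream>>>(
                    gm_fNB, gm_fPme, gm_fTotal, gm_cell, atomStart, numAtoms);
        }
        else
        {
            nbnxn_gpu_add_nbat_f_to_f_kernel<true, false><<<gridSize, blockSize, 0, stream>>>(
                    gm_fNB, gm_fPme, gm_fTotal, gm_cell, atomStart, numAtoms);
        }
    }
    else
    {
        if (addPmeForce)
        {
            nbnxn_gpu_add_nbat_f_to_f_kernel<false, true><<<gridSize, blockSize, 0, stream>>>(
                    gm_fNB, gm_fPme, gm_fTotal, gm_cell, atomStart, numAtoms);
        }
        else
        {
            nbnxn_gpu_add_nbat_f_to_f_kernel<false, false><<<gridSize, blockSize, 0, stream>>>(
                    gm_fNB, gm_fPme, gm_fTotal, gm_cell, atomStart, numAtoms);
        }
    }
}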