Simplify GPU nbnxm x conversion
authorBerk Hess <hess@kth.se>
Fri, 7 Jun 2019 07:37:25 +0000 (09:37 +0200)
committerSzilárd Páll <pall.szilard@gmail.com>
Mon, 17 Jun 2019 16:39:58 +0000 (18:39 +0200)
The GPU nbnxm x conversion change 42b343b9 lead to data being transferred
multiple times and introduced several ugly hacks. This change simplifies
the code and transfers each coordinate only once.

Change-Id: I430496251ae40ee4c604e3920b789b3ae2b297dd

src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/grid.cpp
src/gromacs/nbnxm/grid.h
src/gromacs/nbnxm/nbnxm_gpu.h

index fc507739b8d232d7c3de559c3d84d6e38e69d284..55084763f3805b9c4067d3785adb6d50b4d060f2 100644 (file)
@@ -1034,69 +1034,60 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
         nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
     }
 
-    const int nth = useGpu ? 1 : gmx_omp_nthreads_get(emntPairsearch);
-
-#pragma omp parallel for num_threads(nth) schedule(static)
-    for (int th = 0; th < nth; th++)
+    if (useGpu)
     {
-        try
+        for (int g = gridBegin; g < gridEnd; g++)
         {
-            for (int g = gridBegin; g < gridEnd; g++)
+            nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
+                                  FillLocal && g == 0,
+                                  gpu_nbv,
+                                  xPmeDevicePtr,
+                                  locality,
+                                  x);
+        }
+    }
+    else
+    {
+        const int nth = gmx_omp_nthreads_get(emntPairsearch);
+#pragma omp parallel for num_threads(nth) schedule(static)
+        for (int th = 0; th < nth; th++)
+        {
+            try
             {
-                const Nbnxm::Grid  &grid       = gridSet.grids()[g];
-
-                int                 maxAtomsInColumn = 0;
-
-                const int           numCellsXY = grid.numColumns();
-
-                const int           cxy0 = (numCellsXY* th      + nth - 1)/nth;
-                const int           cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth;
-
-                for (int cxy = cxy0; cxy < cxy1; cxy++)
+                for (int g = gridBegin; g < gridEnd; g++)
                 {
-                    const int na  = grid.numAtomsInColumn(cxy);
-                    const int ash = grid.firstAtomInColumn(cxy);
+                    const Nbnxm::Grid  &grid       = gridSet.grids()[g];
+                    const int           numCellsXY = grid.numColumns();
 
-                    int       na_fill;
-                    if (g == 0 && FillLocal)
-                    {
-                        na_fill = grid.paddedNumAtomsInColumn(cxy);
-                    }
-                    else
-                    {
-                        /* We fill only the real particle locations.
-                         * We assume the filling entries at the end have been
-                         * properly set before during pair-list generation.
-                         */
-                        na_fill = na;
-                    }
-                    if (useGpu)
-                    {
-                        // All columns will be processed in a single GPU kernel (below).
-                        // We need to determine the maximum number of atoms in a column
-                        maxAtomsInColumn = std::max(maxAtomsInColumn, na);
-                    }
-                    else
+                    const int           cxy0 = (numCellsXY* th      + nth - 1)/nth;
+                    const int           cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth;
+
+                    for (int cxy = cxy0; cxy < cxy1; cxy++)
                     {
+                        const int na  = grid.numAtomsInColumn(cxy);
+                        const int ash = grid.firstAtomInColumn(cxy);
+
+                        int       na_fill;
+                        if (g == 0 && FillLocal)
+                        {
+                            na_fill = grid.paddedNumAtomsInColumn(cxy);
+                        }
+                        else
+                        {
+                            /* We fill only the real particle locations.
+                             * We assume the filling entries at the end have been
+                             * properly set before during pair-list generation.
+                             */
+                            na_fill = na;
+                        }
                         copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash,
                                                na, na_fill, x,
                                                nbat->XFormat, nbat->x().data(), ash);
                     }
                 }
-                if (useGpu)
-                {
-                    nbnxn_gpu_x_to_nbat_x(gridSet,
-                                          g,
-                                          FillLocal,
-                                          gpu_nbv,
-                                          xPmeDevicePtr,
-                                          maxAtomsInColumn,
-                                          locality,
-                                          x);
-                }
             }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 }
 
index 40ebbdb0ec1dba39f77ed2cd0011e65035c5ae6a..172d2221488d13cb3013a54c5095484dd93be00a 100644 (file)
@@ -55,8 +55,7 @@
 
  * \param[in]     numColumns        extent of cell-level parallelism
  * \param[out]    xnb               position buffer in nbnxm layout
- * \param[in]     gridIndex         grid index
- * \param[in]     FillLocal         boolean to specify if Fill Local is true
+ * \param[in]     setFillerCoords   tells whether to set the coordinates of the filler particles
  * \param[in]     x                 position buffer
  * \param[in]     a                 atom index mapping stride between atoms in memory
  * \param[in]     cxy_na            array of extents
@@ -66,8 +65,7 @@
  */
 __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColumns,
                                              float *  __restrict__       xnb,
-                                             int                         gridIndex,
-                                             bool                        FillLocal,
+                                             bool                        setFillerCoords,
                                              const rvec *  __restrict__  x,
                                              const int *  __restrict__   a,
                                              const int *  __restrict__   cxy_na,
@@ -78,8 +76,7 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColu
 
 __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColumns,
                                              float *  __restrict__       xnb,
-                                             int                         gridIndex,
-                                             bool                        FillLocal,
+                                             bool                        setFillerCoords,
                                              const rvec *  __restrict__  x,
                                              const int *  __restrict__   a,
                                              const int *  __restrict__   cxy_na,
@@ -100,7 +97,7 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColu
         int na = cxy_na[cxy];
         int a0 = (cellOffset + cxy_ind[cxy])*numAtomsPerCell;
         int na_round;
-        if (gridIndex == 0 && FillLocal)
+        if (setFillerCoords)
         {
             // TODO: This can be done more efficiently
             na_round =
index 3846fecfc10f61e2046729d18e9b5095cebd2bbb..7162c9f593944e68682e9c0e85ef14ba67379a7a 100644 (file)
@@ -59,7 +59,7 @@
 #include "gromacs/nbnxm/gpu_common.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/gridset.h"
+#include "gromacs/nbnxm/grid.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlist.h"
 #include "gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh"
@@ -738,32 +738,27 @@ void cuda_set_cacheconfig()
 }
 
 /* X buffer operations on GPU: performs conversion from rvec to nb format. */
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
-                           int                              gridIndex,
-                           bool                             FillLocal,
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
+                           bool                             setFillerCoords,
                            gmx_nbnxn_gpu_t                 *nb,
                            void                            *xPmeDevicePtr,
-                           int                              maxAtomsInColumn,
                            const Nbnxm::AtomLocality        locality,
                            const rvec                      *x)
 {
     cu_atomdata_t             *adat    = nb->atdat;
     bool                       bDoTime = nb->bDoTime;
 
-    const Nbnxm::Grid         &grid       = gridSet.grids()[gridIndex];
-
     const int                  numColumns                = grid.numColumns();
     const int                  cellOffset                = grid.cellOffset();
     const int                  numAtomsPerCell           = grid.numAtomsPerCell();
+    // TODO: Document this, one can not infer the interaction locality from the atom locality
     Nbnxm::InteractionLocality interactionLoc            = Nbnxm::InteractionLocality::Local;
-    int nCopyAtoms                                       = gridSet.numRealAtomsLocal();
-    int copyAtomStart                                    = 0;
+    int nCopyAtoms                                       = grid.srcAtomEnd() - grid.srcAtomBegin();
+    int copyAtomStart                                    = grid.srcAtomBegin();
 
     if (locality == Nbnxm::AtomLocality::NonLocal)
     {
         interactionLoc          = Nbnxm::InteractionLocality::NonLocal;
-        nCopyAtoms              = gridSet.numRealAtomsTotal()-gridSet.numRealAtomsLocal();
-        copyAtomStart           = gridSet.numRealAtomsLocal();
     }
 
     cudaStream_t   stream  = nb->stream[interactionLoc];
@@ -818,9 +813,10 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
     config.blockSize[0]     = threadsPerBlock;
     config.blockSize[1]     = 1;
     config.blockSize[2]     = 1;
-    config.gridSize[0]      = ((maxAtomsInColumn+1)+threadsPerBlock-1)/threadsPerBlock;
+    config.gridSize[0]      = (grid.numCellsColumnMax()*numAtomsPerCell + threadsPerBlock - 1)/threadsPerBlock;
     config.gridSize[1]      = numColumns;
     config.gridSize[2]      = 1;
+    GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
     config.sharedMemorySize = 0;
     config.stream           = stream;
 
@@ -832,8 +828,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
     const auto kernelArgs         = prepareGpuKernelArguments(kernelFn, config,
                                                               &numColumns,
                                                               &xqPtr,
-                                                              &gridIndex,
-                                                              &FillLocal,
+                                                              &setFillerCoords,
                                                               &d_x,
                                                               &d_atomIndices,
                                                               &d_cxy_na,
index 2cd41950f2db46770d5f97c6e49ec4c9ae290f2a..dce3330832ec71dc9c5a0a725aa24769924a8279 100644 (file)
@@ -1307,7 +1307,10 @@ void Grid::setCellIndices(int                             ddZone,
                           const int                       numAtomsMoved,
                           nbnxn_atomdata_t               *nbat)
 {
-    cellOffset_ = cellOffset;
+    cellOffset_   = cellOffset;
+
+    srcAtomBegin_ = atomStart;
+    srcAtomEnd_   = atomEnd;
 
     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
 
@@ -1342,7 +1345,8 @@ void Grid::setCellIndices(int                             ddZone,
         /* Clear cxy_na_, so we can reuse the array below */
         cxy_na_[i] = 0;
     }
-    numCellsTotal_ = cxy_ind_[numColumns()] - cxy_ind_[0];
+    numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
+    numCellsColumnMax_ = ncz_max;
 
     /* Resize grid and atom data which depend on the number of cells */
     resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
index d44c476c22e7d21c644a83562edafa72aeffb8dd..645048c47fb1e3a01454923bd11ab40f1ae562db 100644 (file)
@@ -332,6 +332,24 @@ class Grid
             return cellOffset_;
         }
 
+        //! Returns the maximum number of grid cells in a column
+        int numCellsColumnMax() const
+        {
+            return numCellsColumnMax_;
+        }
+
+        //! Returns the start of the source atom range mapped to this grid
+        int srcAtomBegin() const
+        {
+            return srcAtomBegin_;
+        }
+
+        //! Returns the end of the source atom range mapped to this grid
+        int srcAtomEnd() const
+        {
+            return srcAtomEnd_;
+        }
+
         //! Returns the first cell index in the grid, starting at 0 in this grid
         int firstCellInColumn(int columnIndex) const
         {
@@ -538,8 +556,15 @@ class Grid
 
         //! The total number of cells in this grid
         int        numCellsTotal_;
-        //! Index in nbs->cell corresponding to cell 0  */
+        //! Index in nbs->cell corresponding to cell 0
         int        cellOffset_;
+        //! The maximum number of cells in a column
+        int        numCellsColumnMax_;
+
+        //! The start of the source atom range mapped to this grid
+        int        srcAtomBegin_;
+        //! The end of the source atom range mapped to this grid
+        int        srcAtomEnd_;
 
         /* Grid data */
         /*! \brief The number of, non-filler, atoms for each grid column.
index 30d5f33725960cc669b45c844c21e2e1c91eb8c9..49dc1bfce2fc6a4a045b07d37e56e4c1853155f1 100644 (file)
@@ -58,7 +58,7 @@ enum class GpuTaskCompletion;
 namespace Nbnxm
 {
 
-class GridSet;
+class Grid;
 
 /*! \brief
  * Launch asynchronously the xq buffer host to device copy.
@@ -226,12 +226,10 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
 /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
  */
 CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
-                           int                gmx_unused  g,
-                           bool               gmx_unused  FillLocal,
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid  gmx_unused &grid,
+                           bool               gmx_unused  setFillerCoords,
                            gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv,
                            void               gmx_unused *xPmeDevicePtr,
-                           int                gmx_unused  na_round_max,
                            Nbnxm::AtomLocality gmx_unused locality,
                            const rvec         gmx_unused *x) CUDA_FUNC_TERM