Simplify GPU nbnxm x conversion

author Berk Hess <hess@kth.se>

Fri, 7 Jun 2019 07:37:25 +0000 (09:37 +0200)

committer Szilárd Páll <pall.szilard@gmail.com>

Mon, 17 Jun 2019 16:39:58 +0000 (18:39 +0200)
author Berk Hess <hess@kth.se>
Fri, 7 Jun 2019 07:37:25 +0000 (09:37 +0200)
committer Szilárd Páll <pall.szilard@gmail.com>
Mon, 17 Jun 2019 16:39:58 +0000 (18:39 +0200)
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp

index fc507739b8d232d7c3de559c3d84d6e38e69d284..55084763f3805b9c4067d3785adb6d50b4d060f2 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1034,69 +1034,60 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
          nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
      }
  
-    const int nth = useGpu ? 1 : gmx_omp_nthreads_get(emntPairsearch);
-
-#pragma omp parallel for num_threads(nth) schedule(static)
-    for (int th = 0; th < nth; th++)
+    if (useGpu)
      {
-        try
+        for (int g = gridBegin; g < gridEnd; g++)
          {
-            for (int g = gridBegin; g < gridEnd; g++)
+            nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
+                                  FillLocal && g == 0,
+                                  gpu_nbv,
+                                  xPmeDevicePtr,
+                                  locality,
+                                  x);
+        }
+    }
+    else
+    {
+        const int nth = gmx_omp_nthreads_get(emntPairsearch);
+#pragma omp parallel for num_threads(nth) schedule(static)
+        for (int th = 0; th < nth; th++)
+        {
+            try
              {
-                const Nbnxm::Grid  &grid       = gridSet.grids()[g];
-
-                int                 maxAtomsInColumn = 0;
-
-                const int           numCellsXY = grid.numColumns();
-
-                const int           cxy0 = (numCellsXY* th      + nth - 1)/nth;
-                const int           cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth;
-
-                for (int cxy = cxy0; cxy < cxy1; cxy++)
+                for (int g = gridBegin; g < gridEnd; g++)
                  {
-                    const int na  = grid.numAtomsInColumn(cxy);
-                    const int ash = grid.firstAtomInColumn(cxy);
+                    const Nbnxm::Grid  &grid       = gridSet.grids()[g];
+                    const int           numCellsXY = grid.numColumns();
  
-                    int       na_fill;
-                    if (g == 0 && FillLocal)
-                    {
-                        na_fill = grid.paddedNumAtomsInColumn(cxy);
-                    }
-                    else
-                    {
-                        /* We fill only the real particle locations.
-                         * We assume the filling entries at the end have been
-                         * properly set before during pair-list generation.
-                         */
-                        na_fill = na;
-                    }
-                    if (useGpu)
-                    {
-                        // All columns will be processed in a single GPU kernel (below).
-                        // We need to determine the maximum number of atoms in a column
-                        maxAtomsInColumn = std::max(maxAtomsInColumn, na);
-                    }
-                    else
+                    const int           cxy0 = (numCellsXY* th      + nth - 1)/nth;
+                    const int           cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth;
+
+                    for (int cxy = cxy0; cxy < cxy1; cxy++)
                      {
+                        const int na  = grid.numAtomsInColumn(cxy);
+                        const int ash = grid.firstAtomInColumn(cxy);
+
+                        int       na_fill;
+                        if (g == 0 && FillLocal)
+                        {
+                            na_fill = grid.paddedNumAtomsInColumn(cxy);
+                        }
+                        else
+                        {
+                            /* We fill only the real particle locations.
+                             * We assume the filling entries at the end have been
+                             * properly set before during pair-list generation.
+                             */
+                            na_fill = na;
+                        }
                          copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash,
                                                 na, na_fill, x,
                                                 nbat->XFormat, nbat->x().data(), ash);
                      }
                  }
-                if (useGpu)
-                {
-                    nbnxn_gpu_x_to_nbat_x(gridSet,
-                                          g,
-                                          FillLocal,
-                                          gpu_nbv,
-                                          xPmeDevicePtr,
-                                          maxAtomsInColumn,
-                                          locality,
-                                          x);
-                }
              }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
          }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
      }
  }
  
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh

index 40ebbdb0ec1dba39f77ed2cd0011e65035c5ae6a..172d2221488d13cb3013a54c5095484dd93be00a 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
@@ -55,8 +55,7 @@
  
   * \param[in]     numColumns        extent of cell-level parallelism
   * \param[out]    xnb               position buffer in nbnxm layout
- * \param[in]     gridIndex         grid index
- * \param[in]     FillLocal         boolean to specify if Fill Local is true
+ * \param[in]     setFillerCoords   tells whether to set the coordinates of the filler particles
   * \param[in]     x                 position buffer
   * \param[in]     a                 atom index mapping stride between atoms in memory
   * \param[in]     cxy_na            array of extents
@@ -66,8 +65,7 @@
   */
  __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColumns,
                                               float *  __restrict__       xnb,
-                                             int                         gridIndex,
-                                             bool                        FillLocal,
+                                             bool                        setFillerCoords,
                                               const rvec *  __restrict__  x,
                                               const int *  __restrict__   a,
                                               const int *  __restrict__   cxy_na,
@@ -78,8 +76,7 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColu
  
  __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColumns,
                                               float *  __restrict__       xnb,
-                                             int                         gridIndex,
-                                             bool                        FillLocal,
+                                             bool                        setFillerCoords,
                                               const rvec *  __restrict__  x,
                                               const int *  __restrict__   a,
                                               const int *  __restrict__   cxy_na,
@@ -100,7 +97,7 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColu
          int na = cxy_na[cxy];
          int a0 = (cellOffset + cxy_ind[cxy])*numAtomsPerCell;
          int na_round;
-        if (gridIndex == 0 && FillLocal)
+        if (setFillerCoords)
          {
              // TODO: This can be done more efficiently
              na_round =
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 3846fecfc10f61e2046729d18e9b5095cebd2bbb..7162c9f593944e68682e9c0e85ef14ba67379a7a 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -59,7 +59,7 @@
  #include "gromacs/nbnxm/gpu_common.h"
  #include "gromacs/nbnxm/gpu_common_utils.h"
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
-#include "gromacs/nbnxm/gridset.h"
+#include "gromacs/nbnxm/grid.h"
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/nbnxm/pairlist.h"
  #include "gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh"
@@ -738,32 +738,27 @@ void cuda_set_cacheconfig()
  }
  
  /* X buffer operations on GPU: performs conversion from rvec to nb format. */
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
-                           int                              gridIndex,
-                           bool                             FillLocal,
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
+                           bool                             setFillerCoords,
                             gmx_nbnxn_gpu_t                 *nb,
                             void                            *xPmeDevicePtr,
-                           int                              maxAtomsInColumn,
                             const Nbnxm::AtomLocality        locality,
                             const rvec                      *x)
  {
      cu_atomdata_t             *adat    = nb->atdat;
      bool                       bDoTime = nb->bDoTime;
  
-    const Nbnxm::Grid         &grid       = gridSet.grids()[gridIndex];
-
      const int                  numColumns                = grid.numColumns();
      const int                  cellOffset                = grid.cellOffset();
      const int                  numAtomsPerCell           = grid.numAtomsPerCell();
+    // TODO: Document this, one can not infer the interaction locality from the atom locality
      Nbnxm::InteractionLocality interactionLoc            = Nbnxm::InteractionLocality::Local;
-    int nCopyAtoms                                       = gridSet.numRealAtomsLocal();
-    int copyAtomStart                                    = 0;
+    int nCopyAtoms                                       = grid.srcAtomEnd() - grid.srcAtomBegin();
+    int copyAtomStart                                    = grid.srcAtomBegin();
  
      if (locality == Nbnxm::AtomLocality::NonLocal)
      {
          interactionLoc          = Nbnxm::InteractionLocality::NonLocal;
-        nCopyAtoms              = gridSet.numRealAtomsTotal()-gridSet.numRealAtomsLocal();
-        copyAtomStart           = gridSet.numRealAtomsLocal();
      }
  
      cudaStream_t   stream  = nb->stream[interactionLoc];
@@ -818,9 +813,10 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
      config.blockSize[0]     = threadsPerBlock;
      config.blockSize[1]     = 1;
      config.blockSize[2]     = 1;
-    config.gridSize[0]      = ((maxAtomsInColumn+1)+threadsPerBlock-1)/threadsPerBlock;
+    config.gridSize[0]      = (grid.numCellsColumnMax()*numAtomsPerCell + threadsPerBlock - 1)/threadsPerBlock;
      config.gridSize[1]      = numColumns;
      config.gridSize[2]      = 1;
+    GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
      config.sharedMemorySize = 0;
      config.stream           = stream;
  
@@ -832,8 +828,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
      const auto kernelArgs         = prepareGpuKernelArguments(kernelFn, config,
                                                                &numColumns,
                                                                &xqPtr,
-                                                              &gridIndex,
-                                                              &FillLocal,
+                                                              &setFillerCoords,
                                                                &d_x,
                                                                &d_atomIndices,
                                                                &d_cxy_na,
diff --git a/src/gromacs/nbnxm/grid.cpp b/src/gromacs/nbnxm/grid.cpp

index 2cd41950f2db46770d5f97c6e49ec4c9ae290f2a..dce3330832ec71dc9c5a0a725aa24769924a8279 100644 (file)
--- a/src/gromacs/nbnxm/grid.cpp
+++ b/src/gromacs/nbnxm/grid.cpp
@@ -1307,7 +1307,10 @@ void Grid::setCellIndices(int                             ddZone,
                            const int                       numAtomsMoved,
                            nbnxn_atomdata_t               *nbat)
  {
-    cellOffset_ = cellOffset;
+    cellOffset_   = cellOffset;
+
+    srcAtomBegin_ = atomStart;
+    srcAtomEnd_   = atomEnd;
  
      const int nthread = gmx_omp_nthreads_get(emntPairsearch);
  
@@ -1342,7 +1345,8 @@ void Grid::setCellIndices(int                             ddZone,
          /* Clear cxy_na_, so we can reuse the array below */
          cxy_na_[i] = 0;
      }
-    numCellsTotal_ = cxy_ind_[numColumns()] - cxy_ind_[0];
+    numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
+    numCellsColumnMax_ = ncz_max;
  
      /* Resize grid and atom data which depend on the number of cells */
      resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
diff --git a/src/gromacs/nbnxm/grid.h b/src/gromacs/nbnxm/grid.h

index d44c476c22e7d21c644a83562edafa72aeffb8dd..645048c47fb1e3a01454923bd11ab40f1ae562db 100644 (file)
--- a/src/gromacs/nbnxm/grid.h
+++ b/src/gromacs/nbnxm/grid.h
@@ -332,6 +332,24 @@ class Grid
              return cellOffset_;
          }
  
+        //! Returns the maximum number of grid cells in a column
+        int numCellsColumnMax() const
+        {
+            return numCellsColumnMax_;
+        }
+
+        //! Returns the start of the source atom range mapped to this grid
+        int srcAtomBegin() const
+        {
+            return srcAtomBegin_;
+        }
+
+        //! Returns the end of the source atom range mapped to this grid
+        int srcAtomEnd() const
+        {
+            return srcAtomEnd_;
+        }
+
          //! Returns the first cell index in the grid, starting at 0 in this grid
          int firstCellInColumn(int columnIndex) const
          {
@@ -538,8 +556,15 @@ class Grid
  
          //! The total number of cells in this grid
          int        numCellsTotal_;
-        //! Index in nbs->cell corresponding to cell 0  */
+        //! Index in nbs->cell corresponding to cell 0
          int        cellOffset_;
+        //! The maximum number of cells in a column
+        int        numCellsColumnMax_;
+
+        //! The start of the source atom range mapped to this grid
+        int        srcAtomBegin_;
+        //! The end of the source atom range mapped to this grid
+        int        srcAtomEnd_;
  
          /* Grid data */
          /*! \brief The number of, non-filler, atoms for each grid column.
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index 30d5f33725960cc669b45c844c21e2e1c91eb8c9..49dc1bfce2fc6a4a045b07d37e56e4c1853155f1 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -58,7 +58,7 @@ enum class GpuTaskCompletion;
  namespace Nbnxm
  {
  
-class GridSet;
+class Grid;
  
  /*! \brief
   * Launch asynchronously the xq buffer host to device copy.
@@ -226,12 +226,10 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
  /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
   */
  CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
-                           int                gmx_unused  g,
-                           bool               gmx_unused  FillLocal,
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid  gmx_unused &grid,
+                           bool               gmx_unused  setFillerCoords,
                             gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv,
                             void               gmx_unused *xPmeDevicePtr,
-                           int                gmx_unused  na_round_max,
                             Nbnxm::AtomLocality gmx_unused locality,
                             const rvec         gmx_unused *x) CUDA_FUNC_TERM
author	Berk Hess <hess@kth.se>
	Fri, 7 Jun 2019 07:37:25 +0000 (09:37 +0200)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 17 Jun 2019 16:39:58 +0000 (18:39 +0200)
src/gromacs/nbnxm/atomdata.cpp		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/grid.cpp		patch \| blob \| history
src/gromacs/nbnxm/grid.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history