Position buffer ops in CUDA
authorAlan Gray <alang@nvidia.com>
Thu, 24 Jan 2019 15:31:21 +0000 (07:31 -0800)
committerSzilárd Páll <pall.szilard@gmail.com>
Fri, 10 May 2019 23:40:03 +0000 (01:40 +0200)
On all but search steps, the buffer-ops coordinate transform can now be
done on a CUDA GPU. If PME runs on the same GPU, the coordinates it has
already uploaded are used as input.

Activate with the GMX_USE_GPU_BUFFER_OPS environment variable.

Note:
-  waits for the X copy on the PME stream to finish; a sync point between
   the PME and NB streams still needs to be implemented (in a follow-up),
   as sketched below.
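
A minimal sketch of that follow-up event-based sync, under assumed names
(pmeStream, nbLocalStream and syncNbWithPmeCoordinates are illustrative and
not part of this change); plain CUDA runtime API:

    #include <cuda_runtime.h>

    // Hypothetical helper: record coordinate readiness on the PME stream and
    // make the NB local stream wait on it before the buffer-ops kernel.
    static void syncNbWithPmeCoordinates(cudaStream_t pmeStream, cudaStream_t nbLocalStream)
    {
        static cudaEvent_t xReadyOnDevice = nullptr;
        if (xReadyOnDevice == nullptr)
        {
            cudaEventCreateWithFlags(&xReadyOnDevice, cudaEventDisableTiming);
        }
        // record on the PME stream right after the coordinate H2D copy ...
        cudaEventRecord(xReadyOnDevice, pmeStream);
        // ... and insert a wait on the NB local stream
        cudaStreamWaitEvent(nbLocalStream, xReadyOnDevice, 0);
    }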

Implements part of #2817

Change-Id: Ib87dabd74a02727898681249691ac9786b8ac65c

16 files changed:
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/atomdata.h
src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh [new file with mode: 0644]
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/gpu_types.h
src/gromacs/nbnxm/grid.h
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index f4dffeed6ac6031d82b6efa94b4fb38bedd95ed8..33d497d7c2b4644deab9dc09a0de3f38b48cf28c 100644 (file)
@@ -441,4 +441,7 @@ pme_gpu_wait_and_reduce(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
 GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
                                                    gmx_wallcycle   *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
 
+
+/*! \brief Get pointer to device copy of coordinate data. */
+GPU_FUNC_QUALIFIER void *pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr)
 #endif
index efe4a60235e557cac1b0c84ffaddf09bbd74f509..07d15cdf1a55130b6eb9e53ec6992874503ab443 100644 (file)
@@ -391,3 +391,12 @@ void pme_gpu_reinit_computation(const gmx_pme_t *pme,
     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
+
+void *pme_gpu_get_device_x(const gmx_pme_t *pme)
+{
+    if (!pme || !pme_gpu_active(pme))
+    {
+        return nullptr;
+    }
+    return pme_gpu_get_kernelparam_coordinates(pme->gpu);
+}
index 27da475ac8c3e0eec0e428d4afbaee39252d87ab..a8560a7a020485d3b77de284615f2ca15a774436 100644 (file)
@@ -236,6 +236,10 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordina
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, h_coordinatesFloat,
                        0, pmeGpu->kernelParams->atoms.nAtoms * DIM,
                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+    // FIXME: sync required since the copied data will be used by the PP stream when a single GPU is used for both
+    //        Remove after adding the required event-based sync between the above H2D copy and the transform kernel
+    pme_gpu_synchronize(pmeGpu);
+
 #endif
 }
 
@@ -1267,3 +1271,16 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
 
     pme_gpu_copy_output_forces(pmeGpu);
 }
+
+void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu)
+{
+    if (pmeGpu && pmeGpu->kernelParams)
+    {
+        return pmeGpu->kernelParams->atoms.d_coordinates;
+    }
+    else
+    {
+        return nullptr;
+    }
+
+}
index afe3bf65abcfce0c6e979b663796a106d9fa0631..377f032fa2d1646cc1ea1ca9e80417ccd319cb9a 100644 (file)
@@ -443,6 +443,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *GPU_FUNC_ARGUMENT(
                                        const float           *GPU_FUNC_ARGUMENT(h_grid)
                                        ) GPU_FUNC_TERM
 
+/*! \brief Return pointer to device copy of coordinate data. */
+GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr)
 
 /* The inlined convenience PME GPU status getters */
 
index a27d8c58ec3a7e4e5e2146d779e3eaed13dd7be1..d53d02d6945ac5ed76aae10e1bc09bfcba50e9e0 100644 (file)
 // PME-first ordering would suffice).
 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
 
+// Environment variable to enable GPU buffer ops, to allow incremental and optional
+// introduction of this functionality.
+// TODO eventually tie this in with other existing GPU flags.
+static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+
+
 
 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 {
@@ -868,6 +874,8 @@ void do_force(FILE                                     *fplog,
         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
 
+    const bool useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA));
+
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
      * decomposition (and not during the previous step as usual).
@@ -1036,11 +1044,17 @@ void do_force(FILE                                     *fplog,
                                &top->excls, step, nrnb);
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
         wallcycle_stop(wcycle, ewcNS);
+
+        if (useGpuXBufOps)
+        {
+            nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::Local);
+        }
+
     }
     else
     {
         nbv->setCoordinates(Nbnxm::AtomLocality::Local, false,
-                            x.unpaddedArrayRef(), wcycle);
+                            x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
     }
 
     if (bUseGPU)
@@ -1051,10 +1065,14 @@ void do_force(FILE                                     *fplog,
 
         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
-        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                  Nbnxm::AtomLocality::Local,
-                                  ppForceWorkload->haveGpuBondedWork);
+        if (bNS || !useGpuXBufOps)
+        {
+            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
+                                      Nbnxm::AtomLocality::Local,
+                                      ppForceWorkload->haveGpuBondedWork);
+        }
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        // with X buffer ops offloaded to the GPU, the xq H2D copy above is only needed on search steps
 
         // bonded work not split into separate local and non-local, so with DD
         // we can only launch the kernel after non-local coordinates have been received.
@@ -1096,25 +1114,34 @@ void do_force(FILE                                     *fplog,
                                    &top->excls, step, nrnb);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
+
+            if (useGpuXBufOps)
+            {
+
+                nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::NonLocal);
+            }
         }
         else
         {
             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
 
             nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false,
-                                x.unpaddedArrayRef(), wcycle);
+                                x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
+
         }
 
         if (bUseGPU)
         {
             wallcycle_start(wcycle, ewcLAUNCH_GPU);
 
-            /* launch non-local nonbonded tasks on GPU */
-            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                      Nbnxm::AtomLocality::NonLocal,
-                                      ppForceWorkload->haveGpuBondedWork);
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            if (bNS || !useGpuXBufOps)
+            {
+                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+                Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
+                                          Nbnxm::AtomLocality::NonLocal,
+                                          ppForceWorkload->haveGpuBondedWork);
+                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            }
 
             if (ppForceWorkload->haveGpuBondedWork)
             {
@@ -1123,6 +1150,7 @@ void do_force(FILE                                     *fplog,
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
             }
 
+            /* launch non-local nonbonded tasks on GPU */
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                          step, nrnb, wcycle);
index fda635cf8fae2f46580845ac15facce91938d7bd..1cb4550003f598d074452a5db8509cd31a6e8b54 100644 (file)
@@ -1003,7 +1003,10 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
                                      const Nbnxm::AtomLocality locality,
                                      gmx_bool                  FillLocal,
                                      const rvec               *x,
-                                     nbnxn_atomdata_t         *nbat)
+                                     nbnxn_atomdata_t         *nbat,
+                                     bool                      useGpu,
+                                     gmx_nbnxn_gpu_t          *gpu_nbv,
+                                     void                     *xPmeDevicePtr)
 {
     int gridBegin = 0;
     int gridEnd   = 0;
@@ -1031,7 +1034,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
         nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
     }
 
-    const int nth = gmx_omp_nthreads_get(emntPairsearch);
+    const int nth = useGpu ? 1 : gmx_omp_nthreads_get(emntPairsearch);
 
 #pragma omp parallel for num_threads(nth) schedule(static)
     for (int th = 0; th < nth; th++)
@@ -1041,6 +1044,9 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
             for (int g = gridBegin; g < gridEnd; g++)
             {
                 const Nbnxm::Grid  &grid       = gridSet.grids()[g];
+
+                int                 maxAtomsInColumn = 0;
+
                 const int           numCellsXY = grid.numColumns();
 
                 const int           cxy0 = (numCellsXY* th      + nth - 1)/nth;
@@ -1064,9 +1070,29 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
                          */
                         na_fill = na;
                     }
-                    copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash,
-                                           na, na_fill, x,
-                                           nbat->XFormat, nbat->x().data(), ash);
+                    if (useGpu)
+                    {
+                        // All columns will be processed in a single GPU kernel (below).
+                        // We need to determine the maximum number of atoms in a column
+                        maxAtomsInColumn = std::max(maxAtomsInColumn, na);
+                    }
+                    else
+                    {
+                        copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash,
+                                               na, na_fill, x,
+                                               nbat->XFormat, nbat->x().data(), ash);
+                    }
+                }
+                if (useGpu)
+                {
+                    nbnxn_gpu_x_to_nbat_x(gridSet,
+                                          g,
+                                          FillLocal,
+                                          gpu_nbv,
+                                          xPmeDevicePtr,
+                                          maxAtomsInColumn,
+                                          locality,
+                                          x);
                 }
             }
         }
index e91094e37ecce5c2a3eeffc602174d525ea15876..7538566d27c133d1d2a85019401108212793dc53 100644 (file)
@@ -44,6 +44,7 @@
 #include "gromacs/utility/bitmask.h"
 #include "gromacs/utility/real.h"
 
+#include "gpu_types.h"
 #include "locality.h"
 
 namespace gmx
@@ -311,7 +312,10 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
                                      Nbnxm::AtomLocality   locality,
                                      gmx_bool              FillLocal,
                                      const rvec           *x,
-                                     nbnxn_atomdata_t     *nbat);
+                                     nbnxn_atomdata_t     *nbat,
+                                     bool                  useGpu,
+                                     gmx_nbnxn_gpu_t      *gpu_nbv,
+                                     void                 *xPmeDevicePtr);
 
 //! Add the computed forces to \p f, an internal reduction might be performed as well
 void reduceForces(nbnxn_atomdata_t     *nbat,
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
new file mode 100644 (file)
index 0000000..bc2e91d
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief
+ * CUDA kernel for GPU version of copy_rvec_to_nbat_real.
+ * Converts coordinate data from rvec to nb format.
+ *
+ *  \author Alan Gray <alang@nvidia.com>
+ *  \author Jon Vincent <jvincent@nvidia.com>
+ */
+
+#include "gromacs/nbnxm/nbnxm.h"
+
+/*! \brief CUDA kernel for transforming position coordinates from rvec to nbnxm layout.
+ *
+ * \param[in]     ncxy      number of grid columns (extent of column-level parallelism)
+ * \param[out]    xnb       position buffer in nbnxm (xyzq) layout
+ * \param[in]     g         grid index
+ * \param[in]     FillLocal whether to also set the filler (padding) entries of the local grid
+ * \param[in]     x         position buffer in rvec layout
+ * \param[in]     a         atom index mapping
+ * \param[in]     na_all    number of non-filler atoms for each grid column
+ * \param[in]     cxy_ind   grid-local cell index for each grid column
+ * \param[in]     cell0     cell offset of the grid
+ * \param[in]     na_sc     number of atoms per cell
+ */
+//TODO improve/simplify/document use of na_all and na_round
+__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int         ncxy,
+                                             float      *xnb,
+                                             int         g,
+                                             bool        FillLocal,
+                                             const rvec *x,
+                                             const int  *a,
+                                             const int  *na_all,
+                                             const int  *cxy_ind,
+                                             int         cell0,
+                                             int         na_sc);
+
+__global__ void nbnxn_gpu_x_to_nbat_x_kernel(int         ncxy,
+                                             float      *xnb,
+                                             int         g,
+                                             bool        FillLocal,
+                                             const rvec *x,
+                                             const int  *a,
+                                             const int  *na_all,
+                                             const int  *cxy_ind,
+                                             int         cell0,
+                                             int         na_sc)
+{
+
+
+    const float farAway = -1000000;
+
+    /* map cell-level parallelism to y component of CUDA block index */
+    int cxy = blockIdx.y;
+
+    if (cxy < ncxy)
+    {
+
+        int na = na_all[cxy];
+        int a0 = (cell0 + cxy_ind[cxy])*na_sc;
+        int na_round;
+        if (g == 0 && FillLocal)
+        {
+            na_round =
+                (cxy_ind[cxy+1] - cxy_ind[cxy])*na_sc;
+        }
+        else
+        {
+            /* We fill only the real particle locations.
+             * We assume the filling entries at the end have been
+             * properly set before during pair-list generation.
+             */
+            na_round = na;
+        }
+
+        /* map parallelism within a cell to x component of CUDA block index linearized
+         * with threads within a block */
+        int i, j0;
+        i = blockIdx.x*blockDim.x+threadIdx.x;
+
+        j0 = a0*STRIDE_XYZQ;
+
+        /* perform conversion of each element */
+        if (i < na_round)
+        {
+            if (i < na)
+            {
+
+                xnb[j0+4*i]   = x[a[a0+i]][XX];
+                xnb[j0+4*i+1] = x[a[a0+i]][YY];
+                xnb[j0+4*i+2] = x[a[a0+i]][ZZ];
+            }
+            else
+            {
+                xnb[j0+4*i]   = farAway;
+                xnb[j0+4*i+1] = farAway;
+                xnb[j0+4*i+2] = farAway;
+            }
+        }
+    }
+
+}
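
For clarity, a small self-contained sketch of the addressing used by the kernel
above, assuming the xyzq layout with 4 floats per atom (STRIDE_XYZQ == 4); the
numeric values below are illustrative only:

    #include <cstdio>

    int main()
    {
        const int strideXYZQ = 4;    // floats per atom in the xyzq layout
        const int a0         = 96;   // first atom index of a column, i.e. (cell0 + cxy_ind[cxy])*na_sc, assumed
        const int i          = 3;    // atom index within that column
        const int j0         = a0 * strideXYZQ;
        // the kernel writes x, y, z at these offsets; the q component is not written by it
        std::printf("x at %d, y at %d, z at %d\n", j0 + 4*i, j0 + 4*i + 1, j0 + 4*i + 2);
        return 0;
    }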
index 250323b280a21abae6df836d96ef3aee17a294ef..19718a45d20f9771ca1ebbdae0acc816f049ade4 100644 (file)
 #include "gromacs/nbnxm/gpu_common.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/gmxassert.h"
@@ -271,6 +273,39 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g
     return shmem;
 }
 
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ *
+ *  As the point where the local stream tasks can be considered complete happens
+ *  at the same call point where the nonlocal stream should be synced with the
+ *  local stream, this function records the event if called with the local stream
+ *  as argument and inserts into the nonlocal stream a wait on that event.
+ */
+static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
+                                        const InteractionLocality interactionLocality)
+{
+    cudaStream_t stream  = nb->stream[interactionLocality];
+
+    /* When we get here all misc operations issued in the local stream as well as
+       the local xq H2D are done,
+       so we record that in the local stream and wait for it in the nonlocal one.
+       This wait needs to precede any PP tasks, bonded or nonbonded, that may
+       compute on interactions between local and nonlocal atoms.
+     */
+    if (nb->bUseTwoStreams)
+    {
+        if (interactionLocality == InteractionLocality::Local)
+        {
+            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
+            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+        }
+        else
+        {
+            cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
+            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+        }
+    }
+}
+
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
                         const nbnxn_atomdata_t *nbatom,
@@ -319,15 +354,14 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
         adat_len    = adat->natoms - adat->natoms_local;
     }
 
+    /* HtoD x, q */
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
         t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
     }
 
-    /* HtoD x, q */
-    cu_copy_H2D_async(adat->xq + adat_begin,
-                      static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
+    cu_copy_H2D_async(adat->xq + adat_begin, static_cast<const void *>(nbatom->x().data() + adat_begin * 4),
                       adat_len * sizeof(*adat->xq), stream);
 
     if (bDoTime)
@@ -341,19 +375,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
        This wait needs to precede any PP tasks, bonded or nonbonded, that may
        compute on interactions between local and nonlocal atoms.
      */
-    if (nb->bUseTwoStreams)
-    {
-        if (iloc == InteractionLocality::Local)
-        {
-            cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
-            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
-        }
-        else
-        {
-            cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
-            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
-        }
-    }
+    insertNonlocalGpuDependency(nb, iloc);
 }
 
 /*! As we execute nonbonded workload in separate streams, before launching
@@ -715,4 +737,122 @@ void cuda_set_cacheconfig()
     }
 }
 
+/* X buffer operations on GPU: performs conversion from rvec to nb format. */
+//TODO improve variable naming for g
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
+                           int                              g,
+                           bool                             FillLocal,
+                           gmx_nbnxn_gpu_t                 *nb,
+                           void                            *xPmeDevicePtr,
+                           int                              maxAtomsInColumn,
+                           const Nbnxm::AtomLocality        locality,
+                           const rvec                      *x)
+{
+    cu_atomdata_t             *adat    = nb->atdat;
+    bool                       bDoTime = nb->bDoTime;
+
+    const Nbnxm::Grid         &grid       = gridSet.grids()[g];
+
+    //TODO improve naming here. Either use the getters straight or
+    //using variables with about the same name as the getters (perhaps
+    //nColumns, cellOffset, nNatomsPerCell)
+    const int                  ncxy            = grid.numColumns();
+    const int                  cell0           = grid.cellOffset();
+    const int                  na_sc           = grid.numAtomsPerCell();
+    Nbnxm::InteractionLocality interactionLoc  = Nbnxm::InteractionLocality::Local;
+    int nCopyAtoms                             = gridSet.numRealAtomsLocal();
+    int copyAtomStart                          = 0;
+
+    if (locality == Nbnxm::AtomLocality::NonLocal)
+    {
+        interactionLoc          = Nbnxm::InteractionLocality::NonLocal;
+        nCopyAtoms              = gridSet.numRealAtomsTotal()-gridSet.numRealAtomsLocal();
+        copyAtomStart           = gridSet.numRealAtomsLocal();
+    }
+
+    cudaStream_t   stream  = nb->stream[interactionLoc];
+
+    // FIXME: need to either let the local stream get to the
+    // insertNonlocalGpuDependency call or call it separately here
+    if (nCopyAtoms == 0) // empty domain
+    {
+        if (interactionLoc == Nbnxm::InteractionLocality::Local)
+        {
+            insertNonlocalGpuDependency(nb, interactionLoc);
+        }
+        return;
+    }
+
+    const rvec *d_x;
+
+    // a copy of the coordinates will be required if a null pointer
+    // has been passed to the function
+    // TODO improve this mechanism
+    bool        copyCoord = (xPmeDevicePtr == nullptr);
+
+    // copy X-coordinate data to device
+    if (copyCoord)
+    {
+        if (bDoTime)
+        {
+            nb->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+        }
+
+        // FIXME: use the copyToDeviceBuffer wrapper
+        // There are still issues with the host buffer not being pinned
+        // and another problem with the wrong size being picked up by the API
+        // auto devicePtr = &nb->xrvec[copyAtomStart][0];
+        // copyToDeviceBuffer(&devicePtr, &x[copyAtomStart][0], 0, nCopyAtoms,
+        //                    stream, GpuApiCallBehavior::Async, nullptr);
+        cudaError_t stat = cudaMemcpyAsync(&nb->xrvec[copyAtomStart][0], &x[copyAtomStart][0],
+                                           nCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice, stream);
+        CU_RET_ERR(stat, "cudaMemcpy failed on nb->xrvec");
+
+
+        if (bDoTime)
+        {
+            nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+        }
+
+        d_x = nb->xrvec;
+    }
+    else // coordinates have already been copied by the PME stream
+    {
+        d_x = (rvec*) xPmeDevicePtr;
+    }
+
+    /* launch kernel on GPU */
+    const int          threadsPerBlock = 128;
+
+    KernelLaunchConfig config;
+    config.blockSize[0]     = threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = ((maxAtomsInColumn+1)+threadsPerBlock-1)/threadsPerBlock;
+    config.gridSize[1]      = ncxy;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = stream;
+
+    auto       kernelFn     = nbnxn_gpu_x_to_nbat_x_kernel;
+    float     *xqPtr        = &(adat->xq->x);
+    const int *abufops      = nb->abufops;
+    const int *nabufopsPtr  = nb->nabufops[locality];
+    const int *cxybufopsPtr = nb->cxybufops[locality];
+    const auto kernelArgs   = prepareGpuKernelArguments(kernelFn, config,
+                                                        &ncxy,
+                                                        &xqPtr,
+                                                        &g,
+                                                        &FillLocal,
+                                                        &d_x,
+                                                        &abufops,
+                                                        &nabufopsPtr,
+                                                        &cxybufopsPtr,
+                                                        &cell0,
+                                                        &na_sc);
+    launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
+
+    insertNonlocalGpuDependency(nb, interactionLoc);
+}
+
 } // namespace Nbnxm
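
For orientation only: in the launch configuration above, grid columns map to
blockIdx.y and atoms within a column to blockIdx.x/threadIdx.x. A tiny sketch
of the x-dimension sizing with assumed values (maxAtomsInColumn = 200,
threadsPerBlock = 128), mirroring config.gridSize[0]:

    #include <cstdio>

    int main()
    {
        const int maxAtomsInColumn = 200;  // assumed, not from any particular run
        const int threadsPerBlock  = 128;
        const int blocksPerColumn  = ((maxAtomsInColumn + 1) + threadsPerBlock - 1) / threadsPerBlock;
        std::printf("blocks along x per column: %d\n", blocksPerColumn);  // prints 2
        return 0;
    }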
index 46e8c771e3d19d44a1a1b32506d0628ce1ffb967..ca739e57694b6e43f29df3a49054040bba2bf00c 100644 (file)
@@ -59,6 +59,7 @@
 #include "gromacs/mdtypes/md_enums.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlistsets.h"
 #include "gromacs/pbcutil/ishift.h"
@@ -857,4 +858,153 @@ rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
     return reinterpret_cast<rvec *>(nb->atdat->fshift);
 }
 
+/* Initialization for X buffer operations on GPU. */
+/* TODO  Remove explicit pinning of host arrays from here and manage it in a more natural way */
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
+                                gmx_nbnxn_gpu_t                 *gpu_nbv,
+                                const Nbnxm::AtomLocality        locality)
+{
+    cudaError_t                      stat;
+    const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
+                                             InteractionLocality::Local : InteractionLocality::NonLocal);
+    cudaStream_t                     stream    = gpu_nbv->stream[iloc];
+    bool                             bDoTime   = gpu_nbv->bDoTime;
+    int                              gridBegin = 0, gridEnd = 0;
+
+
+    switch (locality)
+    {
+        case Nbnxm::AtomLocality::All:
+            gridBegin = 0;
+            gridEnd   = gridSet.grids().size();
+            break;
+        case Nbnxm::AtomLocality::Local:
+            gridBegin = 0;
+            gridEnd   = 1;
+            break;
+        case Nbnxm::AtomLocality::NonLocal:
+            gridBegin = 1;
+            gridEnd   = gridSet.grids().size();
+            break;
+        case Nbnxm::AtomLocality::Count:
+            GMX_ASSERT(false, "Count is invalid locality specifier");
+            break;
+    }
+
+
+
+    for (int g = gridBegin; g < gridEnd; g++)
+    {
+
+        const Nbnxm::Grid  &grid       = gridSet.grids()[g];
+
+        //TODO improve naming here. Either use the getters straight or
+        //using variables with about the same name as the getters
+        const int           ncxy            = grid.numColumns();
+        const int          *a               = gridSet.atomIndices().data();
+        const int           a_nalloc        = gridSet.atomIndices().size();
+        const int          *na_all          = grid.cxy_na().data();
+        const int          *cxy_ind         = grid.cxy_ind().data();
+        const int           natoms_nonlocal = gridSet.numRealAtomsTotal();
+
+        if (iloc == Nbnxm::InteractionLocality::Local)
+        {
+
+            if (gpu_nbv->xrvec)
+            {
+                freeDeviceBuffer(&gpu_nbv->xrvec);
+            }
+            //TODO replace with reallocateDeviceBuffer
+            allocateDeviceBuffer(&gpu_nbv->xrvec, natoms_nonlocal, nullptr);
+
+            if (gpu_nbv->abufops)
+            {
+                freeDeviceBuffer(&gpu_nbv->abufops);
+            }
+            //TODO replace with reallocateDeviceBuffer
+            allocateDeviceBuffer(&gpu_nbv->abufops, a_nalloc, nullptr);
+
+            if (a_nalloc > 0)
+            {
+                // source data must be pinned to satisfy the H2D copy assertion. This should be moved to where the data is (re-)allocated.
+                stat = cudaHostRegister((void*) a, a_nalloc*sizeof(int), cudaHostRegisterDefault);
+                CU_RET_ERR(stat, "cudaHostRegister failed on a");
+
+                if (bDoTime)
+                {
+                    gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+                }
+
+                copyToDeviceBuffer(&gpu_nbv->abufops, a, 0, a_nalloc, stream, GpuApiCallBehavior::Async, nullptr);
+
+                if (bDoTime)
+                {
+                    gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+                }
+
+                stat = cudaHostUnregister((void*) a);
+                CU_RET_ERR(stat, "cudaHostUnRegister failed on a");
+            }
+        }
+
+
+        if (gpu_nbv->nabufops[locality])
+        {
+            freeDeviceBuffer(&gpu_nbv->nabufops[locality]);
+
+        }
+        //TODO replace with reallocateDeviceBuffer
+        allocateDeviceBuffer(&gpu_nbv->nabufops[locality], ncxy, nullptr);
+
+        if (gpu_nbv->cxybufops[locality])
+        {
+            freeDeviceBuffer(&gpu_nbv->cxybufops[locality]);
+        }
+        //TODO replace with reallocateDeviceBuffer
+        allocateDeviceBuffer(&gpu_nbv->cxybufops[locality], ncxy, nullptr);
+
+        if (ncxy > 0)
+        {
+            // source data must be pinned to satisfy the H2D copy assertion. This should be moved to where the data is (re-)allocated.
+            stat = cudaHostRegister((void*) na_all, ncxy*sizeof(int), cudaHostRegisterDefault);
+            CU_RET_ERR(stat, "cudaHostRegister failed on na_all");
+
+            if (bDoTime)
+            {
+                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+            }
+
+            copyToDeviceBuffer(&gpu_nbv->nabufops[locality], na_all, 0, ncxy, stream, GpuApiCallBehavior::Async, nullptr);
+
+            if (bDoTime)
+            {
+                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+            }
+
+            stat = cudaHostUnregister((void*) na_all);
+            CU_RET_ERR(stat, "cudaHostUnRegister failed on na_all");
+
+            // source data must be pinned to satisfy the H2D copy assertion. This should be moved to where the data is (re-)allocated.
+            stat = cudaHostRegister((void*) cxy_ind, ncxy*sizeof(int), cudaHostRegisterDefault);
+            CU_RET_ERR(stat, "cudaHostRegister failed on cxy_ind");
+
+            if (bDoTime)
+            {
+                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+            }
+
+            copyToDeviceBuffer(&gpu_nbv->cxybufops[locality], cxy_ind, 0, ncxy, stream, GpuApiCallBehavior::Async, nullptr);
+
+            if (bDoTime)
+            {
+                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+            }
+
+            stat = cudaHostUnregister((void*) cxy_ind);
+            CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
+        }
+    }
+    return;
+}
+
 } // namespace Nbnxm
index 5a8380f5b5b7eb09c952e0941ca24f4ebd4a1c18..70a7f90945c43d79e360822df7f670a42d930b60 100644 (file)
@@ -214,6 +214,10 @@ struct gmx_nbnxn_cuda_t
     const gmx_device_info_t                                        *dev_info;       /**< CUDA device information                              */
     bool                                                            bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
     cu_atomdata_t                                                  *atdat;          /**< atom data                                            */
+    rvec                                                           *xrvec;          /**< coordinates in rvec format                           */
+    int                                                            *abufops;        /**< x buf ops input buffer index mapping                 */
+    gmx::EnumerationArray<Nbnxm::AtomLocality, int *>               nabufops;       /**< x buf ops num of atoms (local and non-local)         */
+    gmx::EnumerationArray<Nbnxm::AtomLocality, int *>               cxybufops;      /**< x buf ops cell index mapping (local and non-local)   */
     cu_nbparam_t                                                   *nbparam;        /**< parameters required for the non-bonded calc.         */
     gmx::EnumerationArray<Nbnxm::InteractionLocality, cu_plist_t *> plist;          /**< pair-list data structures (local and non-local)      */
     nb_staging_t                                                    nbst;           /**< staging area where fshift/energies get downloaded    */
index f02e456ad975bcd54aab2a94b743f46359cc900c..d875d58a157a230bf223070a9e0051200d8b588c 100644 (file)
@@ -56,7 +56,7 @@ using gmx_nbnxn_gpu_t = gmx_nbnxn_cuda_t;
 #endif
 
 #if GMX_GPU == GMX_GPU_NONE
-typedef int gmx_nbnxn_gpu_t;
+using gmx_nbnxn_gpu_t = int;
 #endif
 
 #endif // !DOXYGEN
index c16f6830e1a60aedc9b8c5bff5dc09bf2f1ac2a3..d44c476c22e7d21c644a83562edafa72aeffb8dd 100644 (file)
@@ -356,6 +356,27 @@ class Grid
             return cxy_na_[columnIndex];
         }
 
+        /*! \brief Returns a view of the number of non-filler atoms for each grid column
+         *
+         * \todo Needs a useful name. */
+        gmx::ArrayRef<const int> cxy_na() const
+        {
+            return cxy_na_;
+        }
+        /*! \brief Returns a view of the grid-local cell index for each grid column
+         *
+         * \todo Needs a useful name. */
+        gmx::ArrayRef<const int> cxy_ind() const
+        {
+            return cxy_ind_;
+        }
+
+        //! Returns the number of atoms per cell
+        int numAtomsPerCell() const
+        {
+            return geometry_.numAtomsPerCell;
+        }
+
         //! Returns the number of atoms in the column including padding
         int paddedNumAtomsInColumn(int columnIndex) const
         {
@@ -521,9 +542,13 @@ class Grid
         int        cellOffset_;
 
         /* Grid data */
-        //! The number of, non-filler, atoms for each grid column
+        /*! \brief The number of non-filler atoms for each grid column.
+         *
+         * \todo Needs a useful name. */
         std::vector<int> cxy_na_;
-        //! The grid-local cell index for each grid column
+        /*! \brief The grid-local cell index for each grid column
+         *
+         * \todo Needs a useful name. */
         std::vector<int> cxy_ind_;
 
         //! The number of cluster for each cell
index 688bcacac962e5499947b2df31dc2b3ab757535d..355517864e3a758b7e500d15585ad7e38bfe3b6f 100644 (file)
@@ -135,15 +135,15 @@ void nonbonded_verlet_t::setAtomProperties(const t_mdatoms          &mdatoms,
 void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality       locality,
                                         const bool                      fillLocal,
                                         gmx::ArrayRef<const gmx::RVec>  x,
+                                        bool                            useGpu,
+                                        void                           *xPmeDevicePtr,
                                         gmx_wallcycle                  *wcycle)
 {
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
-
     nbnxn_atomdata_copy_x_to_nbat_x(pairSearch_->gridSet(), locality, fillLocal,
                                     as_rvec_array(x.data()),
-                                    nbat.get());
-
+                                    nbat.get(), useGpu, gpu_nbv, xPmeDevicePtr);
     wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
     wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 }
@@ -183,4 +183,14 @@ void nonbonded_verlet_t::changePairlistRadii(real rlistOuter,
     pairlistSets_->changePairlistRadii(rlistOuter, rlistInner);
 }
 
+void
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality        locality)
+{
+
+    nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
+                               gpu_nbv,
+                               locality);
+
+
+}
 /*! \endcond */
index abf3cf32052e8c9b68f8880bd6f38512e223d7ab..ef5bda9b8789cca668784e3d7cea62bb42c5c7b5 100644 (file)
@@ -244,8 +244,14 @@ struct nonbonded_verlet_t
         void setCoordinates(Nbnxm::AtomLocality             locality,
                             bool                            fillLocal,
                             gmx::ArrayRef<const gmx::RVec>  x,
+                            bool                            useGpu,
+                            void                           *xPmeDevicePtr,
                             gmx_wallcycle                  *wcycle);
 
+        //! Initialize the GPU version of the coordinate setup in Nbnxm, for the given locality
+        void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality        locality);
+
+
         //! Returns a reference to the pairlist sets
         const PairlistSets &pairlistSets() const
         {
index 8e5f77a9af48d89a82f570e47e1da4bd4fd9b01b..30d5f33725960cc669b45c844c21e2e1c91eb8c9 100644 (file)
@@ -45,6 +45,7 @@
 
 #include "gromacs/gpu_utils/gpu_macros.h"
 #include "gromacs/math/vectypes.h"
+#include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/real.h"
 
@@ -57,6 +58,8 @@ enum class GpuTaskCompletion;
 namespace Nbnxm
 {
 
+class GridSet;
+
 /*! \brief
  * Launch asynchronously the xq buffer host to device copy.
  *
@@ -213,6 +216,25 @@ void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
 GPU_FUNC_QUALIFIER
 int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
 
+/*! \brief Initialization for X buffer operations on GPU.
+ * Called on the NS step and performs (re-)allocations and memory copies. */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
+                                gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv,
+                                Nbnxm::AtomLocality gmx_unused locality) CUDA_FUNC_TERM
+
+/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
+                           int                gmx_unused  g,
+                           bool               gmx_unused  FillLocal,
+                           gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv,
+                           void               gmx_unused *xPmeDevicePtr,
+                           int                gmx_unused  na_round_max,
+                           Nbnxm::AtomLocality gmx_unused locality,
+                           const rvec         gmx_unused *x) CUDA_FUNC_TERM
+
 } // namespace Nbnxm
 
 #endif