Decouple GPU force buffer management from buffer ops in NBNXM
authorArtem Zhmurov <zhmurov@gmail.com>
Wed, 11 Sep 2019 12:19:31 +0000 (14:19 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 11 Sep 2019 20:53:52 +0000 (22:53 +0200)
When GPU-side buffer operations are used, the total forces on the
device are accumulated by the NBNXM module in its local GPU buffer.
By decoupling the CPU and GPU buffer operations and making the
force buffer an argument of the reduction function, this commit
allows the responsibility for GPU force management to be moved
from the NBNXM module to an external object.

This commit refactors the code in preparation for the
introduction of the GPU-side PropagatorStateData object.
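
The resulting caller-side pattern is sketched below in simplified
form. This is an illustration only, not part of the patch: the
locality variable stands in for the Local/NonLocal argument, and
flag setup and wallcycle accounting are omitted (see the
src/gromacs/mdlib/sim_util.cpp hunks below for the actual code):

    if (useGpuFBufOps == BufferOpsUseGpu::True)
    {
        // GPU path: the device force buffer is now obtained from NBNXM
        // explicitly and passed into the reduction, rather than being
        // managed inside the reduction call.
        nbv->atomdata_add_nbat_f_to_f_gpu(locality,
                                          nbv->getDeviceForces(),
                                          pme_gpu_get_device_f(fr->pmedata),
                                          pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                          useGpuFPmeReduction, accumulateForce);
        nbv->launch_copy_f_from_gpu(f, locality);
    }
    else
    {
        // CPU path: a plain host-side reduction into the rvec force buffer.
        nbv->atomdata_add_nbat_f_to_f(locality, force);
    }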

TODO: Use DeviceBuffer when passing the PME GPU forces buffer.

Refs. #2816

Change-Id: I2a1f9d12fad3fb5b2ce37ca3ed3d0cb91777c468

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/atomdata.h
src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index 853f97b021063e196bd11bdf89422f78ee511f37..768ca4577733cfc69b43a9bc73dca0946a8b411a 100644 (file)
@@ -1455,14 +1455,22 @@ void do_force(FILE                                     *fplog,
             // flag to specify if forces should be accumulated in force buffer
             // ops. For non-local part, this just depends on whether CPU forces are present.
             bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces;
-            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
-                                          forceWithShiftForces.force(), pme_gpu_get_device_f(fr->pmedata),
-                                          pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                          useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+
             if (useGpuFBufOps == BufferOpsUseGpu::True)
             {
+                nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
+                                                  nbv->getDeviceForces(),
+                                                  pme_gpu_get_device_f(fr->pmedata),
+                                                  pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                                  useGpuFPmeReduction, accumulateForce);
                 nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal);
             }
+            else
+            {
+                nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
+                                              forceWithShiftForces.force());
+            }
+
 
             if (fr->nbv->emulateGpu() && forceFlags.computeVirial)
             {
@@ -1574,15 +1582,22 @@ void do_force(FILE                                     *fplog,
         // non-local part).
         bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
             (haveCpuForces || DOMAINDECOMP(cr));
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                      force, pme_gpu_get_device_f(fr->pmedata),
-                                      pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                      useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+
         if (useGpuFBufOps == BufferOpsUseGpu::True)
         {
+            nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
+                                              nbv->getDeviceForces(),
+                                              pme_gpu_get_device_f(fr->pmedata),
+                                              pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                              useGpuFPmeReduction, accumulateForce);
             nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
         }
+        else
+        {
+            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, force);
+        }
+
     }
 
     launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd,
index 6424b1558e4d958aa566b88db8872d970965cba1..056146263322059b642cb8d336296e669629f96b 100644 (file)
@@ -1472,16 +1472,10 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat,
 
 
 /* Add the force array(s) from nbnxn_atomdata_t to f */
-template <bool  useGpu>
 void reduceForces(nbnxn_atomdata_t                *nbat,
                   const Nbnxm::AtomLocality        locality,
                   const Nbnxm::GridSet            &gridSet,
-                  rvec                            *f,
-                  void                            *pmeFDeviceBuffer,
-                  GpuEventSynchronizer            *pmeForcesReady,
-                  gmx_nbnxn_gpu_t                 *gpu_nbv,
-                  bool                             useGpuFPmeReduction,
-                  bool                             accumulateForce)
+                  rvec                            *f)
 {
     int a0 = 0;
     int na = 0;
@@ -1494,75 +1488,77 @@ void reduceForces(nbnxn_atomdata_t                *nbat,
         return;
     }
 
-    if (useGpu)
-    {
-        Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
-                                         gpu_nbv,
-                                         pmeFDeviceBuffer,
-                                         pmeForcesReady,
-                                         a0, na,
-                                         useGpuFPmeReduction,
-                                         accumulateForce);
-    }
-    else
-    {
-        int nth = gmx_omp_nthreads_get(emntNonbonded);
+    int nth = gmx_omp_nthreads_get(emntNonbonded);
 
-        if (nbat->out.size() > 1)
+    if (nbat->out.size() > 1)
+    {
+        if (locality != Nbnxm::AtomLocality::All)
         {
-            if (locality != Nbnxm::AtomLocality::All)
-            {
-                gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
-            }
+            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+        }
 
-            /* Reduce the force thread output buffers into buffer 0, before adding
-             * them to the, differently ordered, "real" force buffer.
-             */
-            if (nbat->bUseTreeReduce)
-            {
-                nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
-            }
-            else
-            {
-                nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
-            }
+        /* Reduce the force thread output buffers into buffer 0, before adding
+         * them to the, differently ordered, "real" force buffer.
+         */
+        if (nbat->bUseTreeReduce)
+        {
+            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
         }
+        else
+        {
+            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+        }
+    }
 #pragma omp parallel for num_threads(nth) schedule(static)
-        for (int th = 0; th < nth; th++)
+    for (int th = 0; th < nth; th++)
+    {
+        try
         {
-            try
-            {
-                nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
-                                                    nbat->out[0],
-                                                    a0 + ((th + 0)*na)/nth,
-                                                    a0 + ((th + 1)*na)/nth,
-                                                    f);
-            }
-            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+            nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
+                                                nbat->out[0],
+                                                a0 + ((th + 0)*na)/nth,
+                                                a0 + ((th + 1)*na)/nth,
+                                                f);
         }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+    }
+}
+
+/* Add the force array(s) from nbnxn_atomdata_t to the total force buffer on the GPU */
+void reduceForcesGpu(const Nbnxm::AtomLocality        locality,
+                     DeviceBuffer<float>              totalForcesDevice,
+                     const Nbnxm::GridSet            &gridSet,
+                     void                            *pmeForcesDevice,
+                     GpuEventSynchronizer            *pmeForcesReady,
+                     gmx_nbnxn_gpu_t                 *gpu_nbv,
+                     bool                             useGpuFPmeReduction,
+                     bool                             accumulateForce)
+{
+    int atomsStart = 0;
+    int numAtoms   = 0;
+
+    nbnxn_get_atom_range(locality, gridSet, &atomsStart, &numAtoms);
+
+    if (numAtoms == 0)
+    {
+        /* There are no atoms for this reduction; avoid some overhead */
+        return;
     }
+
+    Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
+                                     totalForcesDevice,
+                                     gpu_nbv,
+                                     pmeForcesDevice,
+                                     pmeForcesReady,
+                                     atomsStart, numAtoms,
+                                     useGpuFPmeReduction,
+                                     accumulateForce);
+}
+
+DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return Nbnxm::nbnxn_gpu_get_f_gpu(gpu_nbv);
 }
-template
-void reduceForces<true>(nbnxn_atomdata_t             *nbat,
-                        const Nbnxm::AtomLocality     locality,
-                        const Nbnxm::GridSet         &gridSet,
-                        rvec                         *f,
-                        void                         *fpme,
-                        GpuEventSynchronizer         *pmeForcesReady,
-                        gmx_nbnxn_gpu_t              *gpu_nbv,
-                        bool                          useGpuFPmeReduction,
-                        bool                          accumulateForce);
-
-template
-void reduceForces<false>(nbnxn_atomdata_t             *nbat,
-                         const Nbnxm::AtomLocality     locality,
-                         const Nbnxm::GridSet         &gridSet,
-                         rvec                         *f,
-                         void                         *fpme,
-                         GpuEventSynchronizer         *pmeForcesReady,
-                         gmx_nbnxn_gpu_t              *gpu_nbv,
-                         bool                          useGpuFPmeReduction,
-                         bool                          accumulateForce);
 
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
                                               gmx::ArrayRef<gmx::RVec>  fshift)
index 488290bb5b2e3e5257ecfa3e119ec927f2b95bc5..ba644bd436e093cd022a611e044dc486fddf0f08 100644 (file)
@@ -367,40 +367,47 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
                                     gmx_nbnxn_gpu_t          *gpu_nbv,
                                     DeviceBuffer<float>       coordinatesDevice);
 
-//! Add the computed forces to \p f, an internal reduction might be performed as well
-template <bool  useGpu>
+/*! \brief Add the computed forces to \p totalForce; an internal reduction may be performed as well
+ *
+ * \param[in]  nbat        Atom data in NBNXM format.
+ * \param[in]  locality    Whether the reduction should be performed on local or non-local atoms.
+ * \param[in]  gridSet     The grid set data.
+ * \param[out] totalForce  Buffer to accumulate the resulting forces into.
+ */
 void reduceForces(nbnxn_atomdata_t                   *nbat,
                   Nbnxm::AtomLocality                 locality,
                   const Nbnxm::GridSet               &gridSet,
-                  rvec                               *f,
-                  void                               *pmeFDeviceBuffer,
-                  GpuEventSynchronizer               *pmeForcesReady,
-                  gmx_nbnxn_gpu_t                    *gpu_nbv,
-                  bool                                useGpuFPmeReduction,
-                  bool                                accumulateForce);
-
-
-extern template
-void reduceForces<true>(nbnxn_atomdata_t             *nbat,
-                        const Nbnxm::AtomLocality     locality,
-                        const Nbnxm::GridSet         &gridSet,
-                        rvec                         *f,
-                        void                         *pmeFDeviceBuffer,
-                        GpuEventSynchronizer         *pmeForcesReady,
-                        gmx_nbnxn_gpu_t              *gpu_nbv,
-                        bool                          useGpuFPmeReduction,
-                        bool                          accumulateForce);
-
-extern template
-void reduceForces<false>(nbnxn_atomdata_t             *nbat,
-                         const Nbnxm::AtomLocality     locality,
-                         const Nbnxm::GridSet         &gridSet,
-                         rvec                         *f,
-                         void                         *pmeFDeviceBuffer,
-                         GpuEventSynchronizer         *pmeForcesReady,
-                         gmx_nbnxn_gpu_t              *gpu_nbv,
-                         bool                          useGpuFPmeReduction,
-                         bool                          accumulateForce);
+                  rvec                               *totalForce);
+
+/*! \brief Reduce forces on the GPU
+ *
+ * \param[in]  locality             Whether the reduction should be performed on local or non-local atoms.
+ * \param[out] totalForcesDevice    Device buffer to accumulate the resulting forces into.
+ * \param[in]  gridSet              The grid set data.
+ * \param[in]  pmeForcesDevice      Device buffer with PME forces.
+ * \param[in]  pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]  gpu_nbv              The NBNXM GPU data structure.
+ * \param[in]  useGpuFPmeReduction  Whether PME forces should be added.
+ * \param[in]  accumulateForce      Whether the total force buffer already contains data to accumulate into.
+ */
+void reduceForcesGpu(Nbnxm::AtomLocality                 locality,
+                     DeviceBuffer<float>                 totalForcesDevice,
+                     const Nbnxm::GridSet               &gridSet,
+                     void                               *pmeForcesDevice,
+                     GpuEventSynchronizer               *pmeForcesReady,
+                     gmx_nbnxn_gpu_t                    *gpu_nbv,
+                     bool                                useGpuFPmeReduction,
+                     bool                                accumulateForce);
+
+/*! \brief Getter for the GPU forces buffer.
+ *
+ * \todo Will be removed when the buffer management is lifted out of the NBNXM module.
+ *
+ * \param[in] gpu_nbv  The NBNXM GPU data structure.
+ *
+ * \returns Device forces buffer.
+ */
+DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv);
 
 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
index bc0d2c2c5e81e1dda9f0ec8bbb66b4bc4ea9296f..3c8f7b1cb3877568852c527bd99d386ded9f9700 100644 (file)
@@ -137,56 +137,59 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int                         numColu
 
 }
 
-/*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
+/*! \brief CUDA kernel to sum up the force components
  *
- * \param[in]     fnb              Force in nbat format
- * \param[in]     fPmeDeviceBuffer PME force
- * \param[in,out] f                Force buffer to be reduced into
- * \param[in]     cell             Cell index mapping
- * \param[in]     atomStart        Start atom index
- * \param[in]     nAtoms           Number of Atoms
+ * \tparam        accumulateForce  Whether the initial content of \p d_fTotal should be kept and accumulated into (rather than overwritten).
+ * \tparam        addPmeForce      Whether the PME force should be added to the total.
+ *
+ * \param[in]     d_fNB            Non-bonded forces in nbat format.
+ * \param[in]     d_fPme           PME forces.
+ * \param[in,out] d_fTotal         Force buffer to be reduced into.
+ * \param[in]     d_cell           Cell index mapping.
+ * \param[in]     atomStart        Start atom index.
+ * \param[in]     numAtoms         Number of atoms.
  */
-template <bool accumulateForce, bool addPmeF>
+template <bool accumulateForce, bool addPmeForce>
 __global__ void
-nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
-                                 const float3 *__restrict__ fPmeDeviceBuffer,
-                                 float3                   * f,
-                                 const int *__restrict__    cell,
-                                 const int                  atomStart,
-                                 const int                  nAtoms);
-template <bool accumulateForce, bool addPmeF>
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  d_fNB,
+                                 const float3 *__restrict__  d_fPme,
+                                 float3                     *d_fTotal,
+                                 const int *__restrict__     d_cell,
+                                 const int                   atomStart,
+                                 const int                   numAtoms);
+template <bool accumulateForce, bool addPmeForce>
 __global__ void
-nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
-                                 const float3 *__restrict__ fPmeDeviceBuffer,
-                                 float3                   * f,
-                                 const int *__restrict__    cell,
-                                 const int                  atomStart,
-                                 const int                  nAtoms)
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  d_fNB,
+                                 const float3 *__restrict__  d_fPme,
+                                 float3                     *d_fTotal,
+                                 const int *__restrict__     d_cell,
+                                 const int                   atomStart,
+                                 const int                   numAtoms)
 {
 
     /* map particle-level parallelism to 1D CUDA thread and block index */
     int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
 
     /* perform addition for each particle*/
-    if (threadIndex < nAtoms)
+    if (threadIndex < numAtoms)
     {
 
-        int     i        = cell[atomStart+threadIndex];
-        float3 *fDest    = (float3 *)&f[atomStart+threadIndex];
+        int     i        = d_cell[atomStart+threadIndex];
+        float3 *fDest    = (float3 *)&d_fTotal[atomStart+threadIndex];
         float3  temp;
 
         if (accumulateForce)
         {
             temp  = *fDest;
-            temp += fnb[i];
+            temp += d_fNB[i];
         }
         else
         {
-            temp = fnb[i];
+            temp = d_fNB[i];
         }
-        if (addPmeF)
+        if (addPmeForce)
         {
-            temp += fPmeDeviceBuffer[atomStart+threadIndex];
+            temp += d_fPme[atomStart+threadIndex];
         }
         *fDest = temp;
 
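
For reference, the per-atom semantics of the kernel above can be written
as a scalar C++ sketch. This is an illustration only, not part of the
patch; Float3 and the add helper are stand-ins for CUDA's float3 and its
operator overloads:

    #include <vector>

    struct Float3 { float x, y, z; };  // stand-in for CUDA's float3

    static Float3 add(Float3 a, Float3 b)
    {
        return { a.x + b.x, a.y + b.y, a.z + b.z };
    }

    // What each thread of nbnxn_gpu_add_nbat_f_to_f_kernel computes,
    // expressed as a serial loop over the atom range.
    void reduceForcesReference(const std::vector<Float3> &fNB,    // non-bonded forces, nbat ordering
                               const std::vector<Float3> &fPme,   // PME forces, rvec ordering
                               std::vector<Float3>       &fTotal, // total force buffer, rvec ordering
                               const std::vector<int>    &cell,   // rvec index -> nbat index
                               int atomStart, int numAtoms,
                               bool accumulateForce, bool addPmeForce)
    {
        for (int a = atomStart; a < atomStart + numAtoms; a++)
        {
            Float3 temp = { 0.0F, 0.0F, 0.0F };
            if (accumulateForce)
            {
                temp = fTotal[a];              // keep the existing content
            }
            temp = add(temp, fNB[cell[a]]);    // convert from nbat to rvec ordering
            if (addPmeForce)
            {
                temp = add(temp, fPme[a]);     // fold in the PME contribution
            }
            fTotal[a] = temp;
        }
    }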
index 63051b7c4314f77632157b6c011d0549d9cd03ae..6a17ee6a9b14386238cae8a4b4dfb4acead899ee 100644 (file)
@@ -859,22 +859,22 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
 
 /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
+                               DeviceBuffer<float>              totalForcesDevice,
                                gmx_nbnxn_gpu_t                 *nb,
-                               void                            *fPmeDevicePtr,
+                               void                            *pmeForcesDevice,
                                GpuEventSynchronizer            *pmeForcesReady,
                                int                              atomStart,
-                               int                              nAtoms,
+                               int                              numAtoms,
                                bool                             useGpuFPmeReduction,
                                bool                             accumulateForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-    cu_atomdata_t            *adat      = nb->atdat;
-    bool                      addPmeF   = useGpuFPmeReduction;
+    const InteractionLocality iLocality     = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream        = nb->stream[iLocality];
+    cu_atomdata_t            *adat          = nb->atdat;
 
-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         //Stream must wait for PME force completion
         pmeForcesReady->enqueueWaitEvent(stream);
@@ -886,7 +886,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
     config.blockSize[0]     = c_bufOpsThreadsPerBlock;
     config.blockSize[1]     = 1;
     config.blockSize[2]     = 1;
-    config.gridSize[0]      = ((nAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
+    config.gridSize[0]      = ((numAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
     config.gridSize[1]      = 1;
     config.gridSize[2]      = 1;
     config.sharedMemorySize = 0;
@@ -896,30 +896,35 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
         nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> :
         nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
 
-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         kernelFn = accumulateForce ?
             nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
             nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
     }
 
-    const float3     *d_f     = adat->f;
-    float3           *d_fNB   = (float3*) nb->frvec;
-    const float3     *d_fPme  = (float3*) fPmeDevicePtr;
-    const int        *d_cell  = nb->cell;
+    const float3     *d_fNB    = adat->f;
+    const float3     *d_fPme   = (float3*) pmeForcesDevice;
+    float3           *d_fTotal = (float3*) totalForcesDevice;
+    const int        *d_cell   = nb->cell;
 
     const auto        kernelArgs   = prepareGpuKernelArguments(kernelFn, config,
-                                                               &d_f,
-                                                               &d_fPme,
                                                                &d_fNB,
+                                                               &d_fPme,
+                                                               &d_fTotal,
                                                                &d_cell,
                                                                &atomStart,
-                                                               &nAtoms);
+                                                               &numAtoms);
 
     launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
 
 }
 
+DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb)
+{
+    return reinterpret_cast< DeviceBuffer<float> >(nb->frvec);
+}
+
 void nbnxn_launch_copy_f_to_gpu(const AtomLocality               atomLocality,
                                 const Nbnxm::GridSet            &gridSet,
                                 gmx_nbnxn_gpu_t                 *nb,
index 794b184f36747dd496a155626868c76eff2f6e74..68eac205a431f5def1534b8db975dbeb39e98ebb 100644 (file)
@@ -203,26 +203,22 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
 
-    reduceForces<false>(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), nullptr, nullptr, gpu_nbv, false, false);
+    reduceForces(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()));
 
     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }
 
 void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality           locality,
-                                             gmx::ArrayRef<gmx::RVec>            force,
-                                             void                               *fPmeDeviceBuffer,
-                                             GpuEventSynchronizer               *pmeForcesReady,
-                                             BufferOpsUseGpu                     useGpu,
-                                             bool                                useGpuFPmeReduction,
-                                             bool                                accumulateForce)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality           locality,
+                                                 DeviceBuffer<float>                 totalForcesDevice,
+                                                 void                               *forcesPmeDevice,
+                                                 GpuEventSynchronizer               *pmeForcesReady,
+                                                 bool                                useGpuFPmeReduction,
+                                                 bool                                accumulateForce)
 {
 
-    GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && accumulateForce),
-               "Accumulatation of force is only valid when GPU buffer ops are active");
-
-    GMX_ASSERT((useGpuFPmeReduction == (fPmeDeviceBuffer != nullptr)),
+    GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)),
                "GPU PME force reduction is only valid when a non-null GPU PME force pointer is available");
 
     /* Skip the reduction if there was no short-range GPU work to do
@@ -235,8 +231,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
 
-    auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
-    fn(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
+    reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
@@ -259,6 +254,11 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }
 
+DeviceBuffer<float> nonbonded_verlet_t::getDeviceForces()
+{
+    return nbnxn_atomdata_get_f_gpu(gpu_nbv);
+}
+
 real nonbonded_verlet_t::pairlistInnerRadius() const
 {
     return pairlistSets_->params().rlistInner;
index 50ead8c6551f405489acffe0deb617f9028d1a70..db550859dfffa7522a18482a09d1c28e0f1b8b8a 100644 (file)
@@ -354,22 +354,30 @@ struct nonbonded_verlet_t
         void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality                 locality,
                                       gmx::ArrayRef<gmx::RVec>            force);
 
-        /*! \brief Add the forces stored in nbat to f, allowing for possibility that GPU buffer ops are active
-         * \param [in] locality         Local or non-local
-         * \param [inout] force         Force to be added to
-         * \param [in] fPme             Force from PME calculation
-         * \param [in] pmeForcesReady   Event triggered when PME force calculation has completed
-         * \param [in] useGpu           Whether GPU buffer ops are active
-         * \param [in] useGpuFPmeReduction   Whether PME force reduction is on GPU
-         * \param [in] accumulateForce  Whether force should be accumulated or stored
+        /*! \brief Add the forces stored in nbat to the total force using GPU buffer ops
+         *
+         * \param [in]     locality             Local or non-local
+         * \param [in,out] totalForcesDevice    Device force buffer to be added to
+         * \param [in]     forcesPmeDevice      Device buffer with PME forces
+         * \param [in]     pmeForcesReady       Event triggered when PME force calculation has completed
+         * \param [in]     useGpuFPmeReduction  Whether PME forces should be added
+         * \param [in]     accumulateForce      Whether the total force buffer already contains data to accumulate into
          */
-        void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality                 locality,
-                                      gmx::ArrayRef<gmx::RVec>            force,
-                                      void                               *fPme,
-                                      GpuEventSynchronizer               *pmeForcesReady,
-                                      BufferOpsUseGpu                     useGpu,
-                                      bool                                useGpuFPmeReduction,
-                                      bool                                accumulateForce);
+        void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality                 locality,
+                                          DeviceBuffer<float>                 totalForcesDevice,
+                                          void                               *forcesPmeDevice,
+                                          GpuEventSynchronizer               *pmeForcesReady,
+                                          bool                                useGpuFPmeReduction,
+                                          bool                                accumulateForce);
+
+        /*! \brief Getter for the GPU force buffer.
+         *
+         * \todo This function will be removed in future patches as the management of the device buffers
+         *       is moved to a separate object.
+         *
+         * \returns The force buffer in plain rvec format.
+         */
+        DeviceBuffer<float> getDeviceForces();
 
         /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
         void atomdata_init_add_nbat_f_to_f_gpu();
index 4dfcfa15d476f9737c912824d4047d42dfea7652..8721e40da6eefec07b8f00b8eba4c612bf1cf8f9 100644 (file)
@@ -324,17 +324,44 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
                                     gmx_nbnxn_gpu_t         gmx_unused *gpu_nbv,
                                     int                     gmx_unused  natoms_total) CUDA_FUNC_TERM;
 
-/*! \brief F buffer operations on GPU: adds nb format force to rvec format. */
+/*! \brief Force buffer operations on GPU.
+ *
+ * Transforms non-bonded forces into plain rvec format and adds all the force
+ * components to the total force buffer.
+ *
+ * \param[in]   atomLocality         Whether the reduction should be performed on local or non-local atoms.
+ * \param[in]   totalForcesDevice    Device buffer to accumulate the resulting forces into.
+ * \param[in]   gpu_nbv              The NBNXM GPU data structure.
+ * \param[in]   pmeForcesDevice      Device buffer with PME forces.
+ * \param[in]   pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]   atomStart            Index of the first atom to reduce forces for.
+ * \param[in]   numAtoms             Number of atoms to reduce forces for.
+ * \param[in]   useGpuFPmeReduction  Whether PME forces should be added.
+ * \param[in]   accumulateForce      Whether the total force buffer already contains data to accumulate into.
+ */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                 gmx_unused  atomLocality,
+                               DeviceBuffer<float>          gmx_unused  totalForcesDevice,
                                gmx_nbnxn_gpu_t              gmx_unused *gpu_nbv,
-                               void                         gmx_unused *fPmeDevicePtr,
+                               void                         gmx_unused *pmeForcesDevice,
                                GpuEventSynchronizer         gmx_unused *pmeForcesReady,
                                int                          gmx_unused  atomStart,
-                               int                          gmx_unused  nAtoms,
+                               int                          gmx_unused  numAtoms,
                                bool                         gmx_unused  useGpuFPmeReduction,
                                bool                         gmx_unused  accumulateForce) CUDA_FUNC_TERM;
 
+/*! \brief Getter for the device forces buffer.
+ *
+ * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+ *
+ * \param[in]  gpu_nbv  The nonbonded data GPU structure.
+ *
+ * \returns Device forces buffer in plain rvec format.
+ */
+CUDA_FUNC_QUALIFIER
+DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
+
 /*! \brief Copy force buffer from CPU to GPU */
 CUDA_FUNC_QUALIFIER
 void nbnxn_launch_copy_f_to_gpu(AtomLocality            gmx_unused  atomLocality,