Pass list of EventSynchronizers to GPU reduction

author Szilárd Páll <pall.szilard@gmail.com>

Tue, 8 Oct 2019 01:47:46 +0000 (03:47 +0200)

committer Artem Zhmurov <zhmurov@gmail.com>

Thu, 10 Oct 2019 18:24:02 +0000 (20:24 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Tue, 8 Oct 2019 01:47:46 +0000 (03:47 +0200)
committer Artem Zhmurov <zhmurov@gmail.com>
Thu, 10 Oct 2019 18:24:02 +0000 (20:24 +0200)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 0ea11d91465992adca3f0c305d077399554b1b11..331c48a64078500a654a5d8cbde65f0c1a1cad29 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1493,6 +1493,14 @@ void do_force(FILE                                     *fplog,
  
              if (useGpuFBufOps == BufferOpsUseGpu::True)
              {
+                std::vector<GpuEventSynchronizer*> dependencyList;
+                dependencyList.reserve(2);
+
+                if (useGpuPmeFReduction)
+                {
+                    dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
+                }
+
                  // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
                  // The bonded and free energy CPU tasks can have non-local force contributions
                  // which are a dependency for the GPU force reduction.
@@ -1501,11 +1509,16 @@ void do_force(FILE                                     *fplog,
                  if (haveNonLocalForceContribInCpuBuffer)
                  {
                      stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
+                    dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal));
                  }
+
+                //
+                // FIXME: are we adding a PME->nonlocal dep here?
+                //
                  nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
                                                    stateGpu->getForces(),
                                                    pme_gpu_get_device_f(fr->pmedata),
-                                                  pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                                  dependencyList,
                                                    useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer);
                  stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
              }
@@ -1626,6 +1639,14 @@ void do_force(FILE                                     *fplog,
       * on the non-alternating path. */
      if (bUseOrEmulGPU && !alternateGpuWait)
      {
+        std::vector<GpuEventSynchronizer*> dependencyList;
+        dependencyList.reserve(2);
+
+        if (useGpuPmeFReduction)
+        {
+            dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
+        }
+
          gmx::ArrayRef<gmx::RVec>  forceWithShift = forceOut.forceWithShiftForces().force();
  
          if (useGpuFBufOps == BufferOpsUseGpu::True)
@@ -1646,6 +1667,7 @@ void do_force(FILE                                     *fplog,
              if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
              {
                  stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
+                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local));
              }
              if (useGpuForcesHaloExchange)
              {
@@ -1653,13 +1675,15 @@ void do_force(FILE                                     *fplog,
                  // for the local buffer ops on the result of GPU halo
                  // exchange, which operates in the non-local stream and
                  // writes to to local parf og the force buffer.
+                //
                  // TODO improve this through use of an event - see Redmine #3093
+                //      push the event into the dependencyList
                  nbv->stream_local_wait_for_nonlocal();
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
                                                stateGpu->getForces(),
                                                pme_gpu_get_device_f(fr->pmedata),
-                                              pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                              dependencyList,
                                                useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
              // This function call synchronizes the local stream
              nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp

index a4e7b0285d86874cf628c1bc64b61facc0c76214..4a22cebe83e32e21dbde7514f7341881c3e0f8bf 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1492,14 +1492,14 @@ void reduceForces(nbnxn_atomdata_t                *nbat,
  }
  
  /* Add the force array(s) from nbnxn_atomdata_t to f */
-void reduceForcesGpu(const Nbnxm::AtomLocality        locality,
-                     DeviceBuffer<float>              totalForcesDevice,
-                     const Nbnxm::GridSet            &gridSet,
-                     void                            *pmeForcesDevice,
-                     GpuEventSynchronizer            *pmeForcesReady,
-                     gmx_nbnxn_gpu_t                 *gpu_nbv,
-                     bool                             useGpuFPmeReduction,
-                     bool                             accumulateForce)
+void reduceForcesGpu(const Nbnxm::AtomLocality                   locality,
+                     DeviceBuffer<float>                         totalForcesDevice,
+                     const Nbnxm::GridSet                       &gridSet,
+                     void                                       *pmeForcesDevice,
+                     gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                     gmx_nbnxn_gpu_t                            *gpu_nbv,
+                     bool                                        useGpuFPmeReduction,
+                     bool                                        accumulateForce)
  {
      int atomsStart = 0;
      int numAtoms   = 0;
@@ -1516,7 +1516,7 @@ void reduceForcesGpu(const Nbnxm::AtomLocality        locality,
                                       totalForcesDevice,
                                       gpu_nbv,
                                       pmeForcesDevice,
-                                     pmeForcesReady,
+                                     dependencyList,
                                       atomsStart, numAtoms,
                                       useGpuFPmeReduction,
                                       accumulateForce);
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h

index 412d328bcea5be22cea2621dcf84599294c451e2..35d711beffc3902949d18d1c086528abe57e8d35 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -359,19 +359,19 @@ void reduceForces(nbnxn_atomdata_t                   *nbat,
   * \param[out] totalForcesDevice    Device buffer to accumulate resulting force.
   * \param[in]  gridSet              The grids data.
   * \param[in]  pmeForcesDevice      Device buffer with PME forces.
- * \param[in]  pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]  dependencyList       List of synchronizers that represent the dependencies the reduction task needs to sync on.
   * \param[in]  gpu_nbv              The NBNXM GPU data structure.
   * \param[in]  useGpuFPmeReduction  Whether PME forces should be added.
   * \param[in]  accumulateForce      Whether there are usefull data already in the total force buffer.
   */
-void reduceForcesGpu(Nbnxm::AtomLocality                 locality,
-                     DeviceBuffer<float>                 totalForcesDevice,
-                     const Nbnxm::GridSet               &gridSet,
-                     void                               *pmeForcesDevice,
-                     GpuEventSynchronizer               *pmeForcesReady,
-                     gmx_nbnxn_gpu_t                    *gpu_nbv,
-                     bool                                useGpuFPmeReduction,
-                     bool                                accumulateForce);
+void reduceForcesGpu(Nbnxm::AtomLocality                         locality,
+                     DeviceBuffer<float>                         totalForcesDevice,
+                     const Nbnxm::GridSet                       &gridSet,
+                     void                                       *pmeForcesDevice,
+                     gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                     gmx_nbnxn_gpu_t                            *gpu_nbv,
+                     bool                                        useGpuFPmeReduction,
+                     bool                                        accumulateForce);
  
  /* Add the fshift force stored in nbat to fshift */
  void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index ca88b5e94f8accde6f1d1e4944848cfb14d21daa..23502b3fbe8bac6d88c26a5c7910c65d5790e2b2 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -806,15 +806,15 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
  }
  
  /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
-void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
-                               DeviceBuffer<float>              totalForcesDevice,
-                               gmx_nbnxn_gpu_t                 *nb,
-                               void                            *pmeForcesDevice,
-                               GpuEventSynchronizer            *pmeForcesReady,
-                               int                              atomStart,
-                               int                              numAtoms,
-                               bool                             useGpuFPmeReduction,
-                               bool                             accumulateForce)
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                          atomLocality,
+                               DeviceBuffer<float>                         totalForcesDevice,
+                               gmx_nbnxn_gpu_t                            *nb,
+                               void                                       *pmeForcesDevice,
+                               gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                               int                                         atomStart,
+                               int                                         numAtoms,
+                               bool                                        useGpuFPmeReduction,
+                               bool                                        accumulateForce)
  {
      GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
      GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
@@ -824,10 +824,15 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
      cudaStream_t              stream        = nb->stream[iLocality];
      cu_atomdata_t            *adat          = nb->atdat;
  
-    if (useGpuFPmeReduction)
+    size_t gmx_used_in_debug  numDependency =
+        static_cast<size_t>((useGpuFPmeReduction == true)) +
+        static_cast<size_t>((accumulateForce == true));
+    GMX_ASSERT(numDependency >= dependencyList.size(), "Mismatching number of dependencies and call signature");
+
+    // Enqueue wait on all dependencies passed
+    for (auto const synchronizer : dependencyList)
      {
-        //Stream must wait for PME force completion
-        pmeForcesReady->enqueueWaitEvent(stream);
+        synchronizer->enqueueWaitEvent(stream);
      }
  
      /* launch kernel */
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 88e6169b71791aa19c2ee75e2a1b0a1041d1605d..0ec882f3b7f50a9652d1b8e9d39a72af215a54ed 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -189,12 +189,12 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
  }
  
  void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality           locality,
-                                                 DeviceBuffer<float>                 totalForcesDevice,
-                                                 void                               *forcesPmeDevice,
-                                                 GpuEventSynchronizer               *pmeForcesReady,
-                                                 bool                                useGpuFPmeReduction,
-                                                 bool                                accumulateForce)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality                   locality,
+                                                 DeviceBuffer<float>                         totalForcesDevice,
+                                                 void                                       *forcesPmeDevice,
+                                                 gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                                                 bool                                        useGpuFPmeReduction,
+                                                 bool                                        accumulateForce)
  {
  
      GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)),
@@ -210,7 +210,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality
      wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
      wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
  
-    reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
+    reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce);
  
      wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
      wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index 3be931f7d98db9a76b4bf8da0f945234b15403c5..2f25f701cb5bd72145a3c5d5d599253509dc576e 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -346,16 +346,16 @@ struct nonbonded_verlet_t
           * \param [in]     locality             Local or non-local
           * \param [in,out] totalForcesDevice    Force to be added to
           * \param [in]     forcesPmeDevice      Device buffer with PME forces
-         * \param [in]     pmeForcesReady       Event triggered when PME force calculation has completed
+         * \param[in]      dependencyList       List of synchronizers that represent the dependencies the reduction task needs to sync on.
           * \param [in]     useGpuFPmeReduction  Whether PME forces should be added
           * \param [in]     accumulateForce      If the total force buffer already contains data
           */
-        void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality                 locality,
-                                          DeviceBuffer<float>                 totalForcesDevice,
-                                          void                               *forcesPmeDevice,
-                                          GpuEventSynchronizer               *pmeForcesReady,
-                                          bool                                useGpuFPmeReduction,
-                                          bool                                accumulateForce);
+        void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality                         locality,
+                                          DeviceBuffer<float>                         totalForcesDevice,
+                                          void                                       *forcesPmeDevice,
+                                          gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                                          bool                                        useGpuFPmeReduction,
+                                          bool                                        accumulateForce);
  
          /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
          void atomdata_init_add_nbat_f_to_f_gpu();
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index ae396e90897c74a55903b44766db85749ca756bf..ccb2aa5bdea7b8f4b3f64910d6b0582dcbc8bec1 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -301,7 +301,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
   * \param[in]   totalForcesDevice    Device buffer to accumulate resulting force.
   * \param[in]   gpu_nbv              The NBNXM GPU data structure.
   * \param[in]   pmeForcesDevice      Device buffer with PME forces.
- * \param[in]   pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]   dependencyList       List of synchronizers that represent the dependencies the reduction task needs to sync on.
   * \param[in]   atomStart            Index of the first atom to reduce forces for.
   * \param[in]   numAtoms             Number of atoms to reduce forces for.
   * \param[in]   useGpuFPmeReduction  Whether PME forces should be added.
@@ -309,15 +309,15 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
   *
   */
  CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                 gmx_unused  atomLocality,
-                               DeviceBuffer<float>          gmx_unused  totalForcesDevice,
-                               gmx_nbnxn_gpu_t              gmx_unused *gpu_nbv,
-                               void                         gmx_unused *pmeForcesDevice,
-                               GpuEventSynchronizer         gmx_unused *pmeForcesReady,
-                               int                          gmx_unused  atomStart,
-                               int                          gmx_unused  numAtoms,
-                               bool                         gmx_unused  useGpuFPmeReduction,
-                               bool                         gmx_unused  accumulateForce) CUDA_FUNC_TERM;
+void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                               gmx_unused  atomLocality,
+                               DeviceBuffer<float>                        gmx_unused  totalForcesDevice,
+                               gmx_nbnxn_gpu_t                            gmx_unused *gpu_nbv,
+                               void                                       gmx_unused *pmeForcesDevice,
+                               gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused  dependencyList,
+                               int                                        gmx_unused  atomStart,
+                               int                                        gmx_unused  numAtoms,
+                               bool                                       gmx_unused  useGpuFPmeReduction,
+                               bool                                       gmx_unused  accumulateForce) CUDA_FUNC_TERM;
  
  /*! \brief Wait for GPU stream to complete */
  CUDA_FUNC_QUALIFIER
author	Szilárd Páll <pall.szilard@gmail.com>
	Tue, 8 Oct 2019 01:47:46 +0000 (03:47 +0200)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Thu, 10 Oct 2019 18:24:02 +0000 (20:24 +0200)
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/nbnxm/atomdata.cpp		patch \| blob \| history
src/gromacs/nbnxm/atomdata.h		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history