#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/ewald/pme_pp.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
 *
 * \param[in] step     The step number, used for checking and printing
 * \param[in] enerd    The energy data; the non-bonded group energies need to be added to
 *                     \c enerd.term[F_EPOT] before calling this routine
 * \param[in] inputrec The input record
 */
static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
{
gmx_wallcycle* wcycle)
{
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
- pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+ pme_gpu_launch_spread(
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
}
/*! \brief Launch the FFT and gather stages of PME GPU
}
}
-/*! \brief Setup for the local and non-local GPU force reductions:
+/*! \brief Setup for the local GPU force reduction:
* reinitialization plus the registration of forces and dependencies.
*
- * \param [in] runScheduleWork Schedule workload flag structure
- * \param [in] cr Communication record object
- * \param [in] fr Force record object
+ * \param [in] runScheduleWork Schedule workload flag structure
+ * \param [in] nbv Non-bonded Verlet object
+ * \param [in] stateGpu GPU state propagator object
+ * \param [in] gpuForceReduction GPU force reduction object
+ * \param [in] pmePpCommGpu PME-PP GPU communication object
+ * \param [in] pmedata PME data object
+ * \param [in] dd Domain decomposition object
*/
-static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
- const t_commrec* cr,
- t_forcerec* fr)
+static void setupLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
+ const nonbonded_verlet_t* nbv,
+ gmx::StatePropagatorDataGpu* stateGpu,
+ gmx::GpuForceReduction* gpuForceReduction,
+ gmx::PmePpCommGpu* pmePpCommGpu,
+ const gmx_pme_t* pmedata,
+ const gmx_domdec_t* dd)
{
-
- nonbonded_verlet_t* nbv = fr->nbv.get();
- gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
+ GMX_ASSERT(!runScheduleWork->simulationWork.useMts,
+ "GPU force reduction is not compatible with MTS");
// (re-)initialize local GPU force reduction
const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
|| runScheduleWork->simulationWork.havePpDomainDecomposition;
const int atomStart = 0;
- fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
- stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::Local),
- nbv->getGridIndices(),
- atomStart,
- accumulate,
- stateGpu->fReducedOnDevice(AtomLocality::Local));
+ gpuForceReduction->reinit(stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::Local),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
// register forces and add dependencies
- fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
- if (runScheduleWork->simulationWork.useGpuPme
- && (!runScheduleWork->simulationWork.haveSeparatePmeRank
- || runScheduleWork->simulationWork.useGpuPmePpCommunication))
- {
- DeviceBuffer<gmx::RVec> forcePtr =
- runScheduleWork->simulationWork.haveSeparatePmeRank
- ? fr->pmePpCommGpu->getGpuForceStagingPtr() // buffer received from other GPU
- : pme_gpu_get_device_f(fr->pmedata); // PME force buffer on same GPU
- fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
+ DeviceBuffer<gmx::RVec> pmeForcePtr;
+ GpuEventSynchronizer* pmeSynchronizer = nullptr;
+ bool havePmeContribution = false;
- if (runScheduleWork->simulationWork.haveSeparatePmeRank)
+ if (runScheduleWork->simulationWork.useGpuPme && !runScheduleWork->simulationWork.haveSeparatePmeRank)
+ {
+ pmeForcePtr = pme_gpu_get_device_f(pmedata);
+ pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(pmedata);
+ havePmeContribution = true;
+ }
+ else if (runScheduleWork->simulationWork.useGpuPmePpCommunication)
+ {
+ pmeForcePtr = pmePpCommGpu->getGpuForceStagingPtr();
+ if (GMX_THREAD_MPI)
{
- // PME force buffer on remote GPU -
- // event synchronizer received from other GPU only in case of thread-mpi
- if (GMX_THREAD_MPI)
- {
- GpuEventSynchronizer* const pmeSynchronizer =
- fr->pmePpCommGpu->getForcesReadySynchronizer();
- GMX_ASSERT(pmeSynchronizer != nullptr,
- "PME force ready cuda event should not be NULL");
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
- }
+ pmeSynchronizer = pmePpCommGpu->getForcesReadySynchronizer();
}
- else
+ havePmeContribution = true;
+ }
+
+ if (havePmeContribution)
+ {
+ gpuForceReduction->registerRvecForce(pmeForcePtr);
+ if (!runScheduleWork->simulationWork.useGpuPmePpCommunication || GMX_THREAD_MPI)
{
- // PME force buffer on same GPU - add dependency on PME force computation
- GpuEventSynchronizer* const pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(fr->pmedata);
GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+ gpuForceReduction->addDependency(pmeSynchronizer);
}
}
|| (runScheduleWork->simulationWork.havePpDomainDecomposition
&& !runScheduleWork->simulationWork.useGpuHaloExchange))
{
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
- stateGpu->fReadyOnDevice(AtomLocality::Local));
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::Local));
}
if (runScheduleWork->simulationWork.useGpuHaloExchange)
{
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
- cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
+ gpuForceReduction->addDependency(dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
}
+}
- if (runScheduleWork->simulationWork.havePpDomainDecomposition)
- {
- // (re-)initialize non-local GPU force reduction
- const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
- || runScheduleWork->domainWork.haveFreeEnergyWork;
- const int atomStart = dd_numHomeAtoms(*cr->dd);
- fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
- stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::NonLocal),
- nbv->getGridIndices(),
- atomStart,
- accumulate,
- stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+/*! \brief Setup for the non-local GPU force reduction:
+ * reinitialization plus the registration of forces and dependencies.
+ *
+ * \param [in] runScheduleWork Schedule workload flag structure
+ * \param [in] nbv Non-bonded Verlet object
+ * \param [in] stateGpu GPU state propagator object
+ * \param [in] gpuForceReduction GPU force reduction object
+ * \param [in] dd Domain decomposition object
+ */
+static void setupNonLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
+ const nonbonded_verlet_t* nbv,
+ gmx::StatePropagatorDataGpu* stateGpu,
+ gmx::GpuForceReduction* gpuForceReduction,
+ const gmx_domdec_t* dd)
+{
+ // (re-)initialize non-local GPU force reduction
+ const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
+ || runScheduleWork->domainWork.haveFreeEnergyWork;
+ const int atomStart = dd_numHomeAtoms(*dd);
+ gpuForceReduction->reinit(stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::NonLocal),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
- // register forces and add dependencies
- fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(
- Nbnxm::gpu_get_f(nbv->gpu_nbv));
+ // register forces and add dependencies
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
- if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
- {
- fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
- stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
- }
+ if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
+ {
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
}
}
{
GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+ if (stepWork.doNeighborSearch)
+ {
+ /* On NS steps, we skip X buffer ops. So, unless we use PME or direct GPU
+ * communications, we don't wait for the coordinates on the device,
+ * and we must consume the event here.
+ * Issue #3988. */
+ const bool eventWillBeConsumedByGpuPme = stepWork.haveGpuPmeOnThisRank;
+ const bool eventWillBeConsumedByGpuPmePPComm =
+ (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
+ && pmeSendCoordinatesFromGpu;
+ if (!eventWillBeConsumedByGpuPme && !eventWillBeConsumedByGpuPmePPComm)
+ {
+ stateGpu->consumeCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
+ }
+ }
}
}
if (simulationWork.useGpuBufferOps)
{
- setupGpuForceReductions(runScheduleWork, cr, fr);
+ setupLocalGpuForceReduction(runScheduleWork,
+ fr->nbv.get(),
+ stateGpu,
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
+ fr->pmePpCommGpu.get(),
+ fr->pmedata,
+ cr->dd);
+ if (runScheduleWork->simulationWork.havePpDomainDecomposition)
+ {
+ setupNonLocalGpuForceReduction(runScheduleWork,
+ fr->nbv.get(),
+ stateGpu,
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal].get(),
+ cr->dd);
+ }
}
}
else if (!EI_TPI(inputrec.eI) && stepWork.computeNonbondedForces)
if (!stepWork.useGpuFHalo)
{
+ /* We don't explicitly wait for the forces to be reduced on device,
+ * but wait for them to finish copying to CPU instead.
+ * So, we manually consume the event, see Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
// copy from GPU input for dd_move_f()
stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
AtomLocality::NonLocal);
|| (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
|| vsite)
{
+ if (stepWork.computeNonbondedForces)
+ {
+ /* We have previously issued force reduction on the GPU, but we will
+ * not use this event, instead relying on the stream being in-order.
+ * Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+ }
stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
}