From f8a8252cd2a4b7760167cbbce3dc3474ae1534e1 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Thu, 5 Dec 2019 14:49:25 +0100
Subject: [PATCH] Allow using GPU update with DD and update groups

The GPU update is now can be enabled for the supported DD cases
with the GMX_FORCE_UPDATE_DEFAULT_GPU environment variable. Also
added the checks on whether the SHAKE algorithm was requested,
since SHAKE is not supported by the GPU update.

Refs. #3226, #3163.

Change-Id: I57e3ad3b8a571ec244989e888afd5cfcbaf9b75e
---
 admin/builds/gpuupdate-matrix.txt             | 21 +++-----
 src/gromacs/domdec/domdec.cpp                 |  5 ++
 src/gromacs/domdec/domdec.h                   |  3 ++
 src/gromacs/mdlib/sim_util.cpp                | 53 +++++++++++--------
 src/gromacs/mdrun/md.cpp                      |  9 +++-
 src/gromacs/mdrun/runner.cpp                  | 42 ++++++++++-----
 src/gromacs/taskassignment/decidegpuusage.cpp | 22 ++++++--
 src/gromacs/taskassignment/decidegpuusage.h   | 28 +++++-----
 8 files changed, 117 insertions(+), 66 deletions(-)

diff --git a/admin/builds/gpuupdate-matrix.txt b/admin/builds/gpuupdate-matrix.txt
index 5fdbc2ed0f..cd0c31b09e 100644
--- a/admin/builds/gpuupdate-matrix.txt
+++ b/admin/builds/gpuupdate-matrix.txt
@@ -1,26 +1,19 @@
 # This matrix is intended to permit Jenkins on-demand testing
-# of code hidden behind the GMX_USE_BUFFER_OPS feature flag
-# during development. When the feature flag is removed, the
-# normal test matrices will be adapted to cover this code path.
+# of GPU update code path during development.
 #
 # Comment line(s) preceding each configuration document the main
 # intent behind that configuration, so that we can correctly judge
 # whether to preserve that during maintenance decisions.
-#
-# Both configurations currently target bs_nix1204, for better load
-# balance with pre-submit matrix, which makes heavier use of
-# bs_nix1310 agent.
 
-# Test newest gcc supported by newest CUDA at time of release
-# Test thread-MPI with CUDA
-# Test GPU update-constraints features in the above combination
+# Test GPU update-constraints features on a single PP+PME rank
 gcc-8 gpuhw=nvidia nranks=1 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpuupdate
 
-# Test CUDA build on a agent with no CUDA devices
-# Test without TNG support
-# Test GPU update-constraints features in the above combination
+# Test GPU update-constraints features in a CUDA build without CUDA devices
 gcc-7 gpuhw=none cuda-10.0 openmp no-tng release-with-assert gpuupdate
 
-# Test OpenCL build with gpudev features
 # Test GPU update-constraints on the OpenCL path where it is unsupported
 clang-8 openmp gpuhw=amd opencl-1.2 clFFT-2.14 simd=None gpuupdate
+
+# Test GPU update-constraints features with multiple PP ranks and one PME rank
+# Note: this should fall back correctly to the CPU codepath
+gcc-5 gpuhw=nvidia cuda-9.0 cmake-3.9.6 thread-mpi npme=1 nranks=3 release-with-assert gpuupdate
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 2f931240e4..de3fc3c4b3 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -1032,6 +1032,11 @@ bool ddHaveSplitConstraints(const gmx_domdec_t& dd)
     return dd.comm->systemInfo.haveSplitConstraints;
 }
 
+bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
+{
+    return dd.comm->systemInfo.useUpdateGroups;
+}
+
 void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl)
 {
     /* Note that the cycles value can be incorrect, either 0 or some
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index 61691d63e3..9fb46bc060 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -147,6 +147,9 @@ int dd_pme_maxshift_y(const gmx_domdec_t* dd);
 /*! \brief Return whether constraints, not including settles, cross domain boundaries */
 bool ddHaveSplitConstraints(const gmx_domdec_t& dd);
 
+/*! \brief Return whether update groups are used */
+bool ddUsesUpdateGroups(const gmx_domdec_t& dd);
+
 /*! \brief Return whether the DD has a single dimension with a single pulse
  *
  * The GPU halo exchange code requires a 1D single-pulse DD, and its
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index c6adc7b73c..832ff21620 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1032,11 +1032,31 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on
-    // the CPU, or for the computation of virial. At search steps the current coordinates are
-    // already on the host, hence copy is not needed.
+    // TODO Update this comment when introducing SimulationWorkload
+    //
+    // The conditions for gpuHaloExchange e.g. using GPU buffer
+    // operations were checked before construction, so here we can
+    // just use it and assert upon any conditions.
+    gmx::GpuHaloExchange* gpuHaloExchange =
+            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
+    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
+               "Must use coordinate buffer ops with GPU halo exchange");
+    const bool useGpuForcesHaloExchange =
+            ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+
+    // Copy coordinate from the GPU if update is on the GPU and there
+    // are forces to be computed on the CPU, or for the computation of
+    // virial, or if host-side data will be transferred from this task
+    // to a remote task for halo exchange or PME-PP communication. At
+    // search steps the current coordinates are already on the host,
+    // hence copy is not needed.
+    const bool haveHostPmePpComms =
+            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
+    const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
+            || haveHostPmePpComms || haveHostHaloExchangeComms))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
@@ -1223,19 +1243,6 @@ void do_force(FILE*                               fplog,
         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
-    // TODO Update this comment when introducing SimulationWorkload
-    //
-    // The conditions for gpuHaloExchange e.g. using GPU buffer
-    // operations were checked before construction, so here we can
-    // just use it and assert upon any conditions.
-    gmx::GpuHaloExchange* gpuHaloExchange =
-            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
-    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
-               "Must use coordinate buffer ops with GPU halo exchange");
-    const bool useGpuForcesHaloExchange =
-            ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
-
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
     if (havePPDomainDecomposition(cr))
@@ -1638,9 +1645,10 @@ void do_force(FILE*                               fplog,
         wallcycle_stop(wcycle, ewcFORCE);
     }
 
-    // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
-    // TODO refoactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication)
+    // If on GPU PME-PP comms or GPU update path, receive forces from PME before GPU buffer ops
+    // TODO refactor this and unify with below default-path call to the same function
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME)
+        && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate))
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -1757,8 +1765,9 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // TODO refoactor this and unify with above PME-PP GPU communication path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication)
+    // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
+        && !simulationWork.useGpuUpdate)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index a5cdccce96..ff3d0f88b8 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -334,8 +334,13 @@ void gmx::LegacySimulator::do_md()
 
     if (useGpuForUpdate)
     {
-        GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr),
-                           "Domain decomposition is not supported with the GPU update.\n");
+        GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
+                                   || constr->numConstraintsTotal() == 0,
+                           "Constraints in domain decomposition are only supported with update "
+                           "groups if using GPU update.\n");
+        GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
+                                   || constr->numConstraintsTotal() == 0,
+                           "SHAKE is not supported with GPU update.");
         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
                            "Either PME or short-ranged non-bonded interaction tasks must run on "
                            "the GPU to use GPU update.\n");
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 9b30f4d60c..2a1a0be8b0 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -99,6 +99,7 @@
 #include "gromacs/mdlib/qmmm.h"
 #include "gromacs/mdlib/sighandler.h"
 #include "gromacs/mdlib/stophandler.h"
+#include "gromacs/mdlib/updategroups.h"
 #include "gromacs/mdrun/mdmodules.h"
 #include "gromacs/mdrun/simulationcontext.h"
 #include "gromacs/mdrunutility/handlerestart.h"
@@ -175,6 +176,8 @@ struct DevelopmentFeatureFlags
     //! True if the Buffer ops development feature is enabled
     // TODO: when the trigger of the buffer ops offload is fully automated this should go away
     bool enableGpuBufferOps = false;
+    //! If true, forces 'mdrun -update auto' default to 'gpu' when running with DD
+    bool forceGpuUpdateDefaultWithDD = false;
     //! True if the GPU halo exchange development feature is enabled
     bool enableGpuHaloExchange = false;
     //! True if the PME PP direct communication GPU development feature is enabled
@@ -209,6 +212,7 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
 #pragma GCC diagnostic ignored "-Wunused-result"
     devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
                                   && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
+    devFlags.forceGpuUpdateDefaultWithDD = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr);
     devFlags.enableGpuHaloExchange =
             (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
     devFlags.enableGpuPmePPComm =
@@ -224,6 +228,15 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
                         "GMX_USE_GPU_BUFFER_OPS environment variable.");
     }
 
+    if (devFlags.forceGpuUpdateDefaultWithDD)
+    {
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendTextFormatted(
+                        "NOTE: This run will default to '-update gpu' as requested by the "
+                        "GMX_FORCE_UPDATE_DEFAULT_GPU environment variable.");
+    }
+
     if (devFlags.enableGpuHaloExchange)
     {
         if (useGpuForNonbonded)
@@ -890,19 +903,6 @@ int Mdrunner::mdrunner()
     const DevelopmentFeatureFlags devFlags =
             manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
 
-    // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
-    //       decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
-    // NOTE: When the simulationWork is constructed, the useGpuForUpdate overrides the devFlags.enableGpuBufferOps.
-    try
-    {
-        useGpuForUpdate = decideWhetherToUseGpuForUpdate(
-                useDomainDecomposition, useGpuForPme, useGpuForNonbonded, updateTarget,
-                gpusWereDetected, *inputrec, mtop, doEssentialDynamics,
-                gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, replExParams.exchangeInterval > 0, doRerun);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
-
     // Build restraints.
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
@@ -1178,6 +1178,22 @@ int Mdrunner::mdrunner()
         // Note that local state still does not exist yet.
     }
 
+    // The GPU update is decided here because we need to know whether the constraints or
+    // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
+    // defined). This is only known after DD is initialized, hence decision on using GPU
+    // update is done so late.
+    try
+    {
+        const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
+
+        useGpuForUpdate = decideWhetherToUseGpuForUpdate(
+                devFlags.forceGpuUpdateDefaultWithDD, useDomainDecomposition, useUpdateGroups,
+                useGpuForPme, useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop,
+                doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+                replExParams.exchangeInterval > 0, doRerun);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+
     if (PAR(cr))
     {
         /* After possible communicator splitting in make_dd_communicators.
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index b7eff1d3bf..fc1f4faf60 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -489,7 +489,9 @@ bool decideWhetherToUseGpusForBonded(const bool       useGpuForNonbonded,
     return gpusWereDetected && usingOurCpuForPmeOrEwald;
 }
 
-bool decideWhetherToUseGpuForUpdate(const bool        isDomainDecomposition,
+bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefaultWithDD,
+                                    const bool        isDomainDecomposition,
+                                    const bool        useUpdateGroups,
                                     const bool        useGpuForPme,
                                     const bool        useGpuForNonbonded,
                                     const TaskTarget  updateTarget,
@@ -507,11 +509,20 @@ bool decideWhetherToUseGpuForUpdate(const bool        isDomainDecomposition,
         return false;
     }
 
+    const bool hasAnyConstraints = gmx_mtop_interaction_count(mtop, IF_CONSTRAINT) > 0;
+
     std::string errorMessage;
 
-    if (isDomainDecomposition)
+    if (isDomainDecomposition && hasAnyConstraints && !useUpdateGroups)
+    {
+        errorMessage +=
+                "Domain decomposition is only supported with constraints when update groups are "
+                "used. This means constraining all bonds is not supported, except for small "
+                "molecules, and box sizes close to half the pair-list cutoff are not supported.\n ";
+    }
+    if (inputrec.eConstrAlg == econtSHAKE && hasAnyConstraints && gmx_mtop_ftype_count(mtop, F_CONSTR) > 0)
     {
-        errorMessage += "Domain decomposition is not supported.\n";
+        errorMessage += "SHAKE constraints are not supported.\n";
     }
     // Using the GPU-version of update if:
     // 1. PME is on the GPU (there should be a copy of coordinates on GPU for PME spread), or
@@ -606,6 +617,11 @@ bool decideWhetherToUseGpuForUpdate(const bool        isDomainDecomposition,
         return false;
     }
 
+    if (isDomainDecomposition)
+    {
+        return forceGpuUpdateDefaultWithDD;
+    }
+
     return true;
 }
 
diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h
index ace5f39cc2..b01aa97c42 100644
--- a/src/gromacs/taskassignment/decidegpuusage.h
+++ b/src/gromacs/taskassignment/decidegpuusage.h
@@ -231,23 +231,27 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
 
 /*! \brief Decide whether to use GPU for update.
  *
- * \param[in]  isDomainDecomposition     Whether there more than one domain.
- * \param[in]  useGpuForPme              Whether GPUs will be used for PME interactions.
- * \param[in]  useGpuForNonbonded        Whether GPUs will be used for nonbonded interactions.
- * \param[in]  updateTarget              User choice for running simulation on GPU.
- * \param[in]  gpusWereDetected          Whether compatible GPUs were detected on any node.
- * \param[in]  inputrec                  The user input.
- * \param[in]  mtop                      The global topology.
- * \param[in]  useEssentialDynamics      If essential dynamics is active.
- * \param[in]  doOrientationRestraints   If orientation restraints are enabled.
- * \param[in]  useReplicaExchange        If this is a REMD simulation.
- * \param[in]  doRerun                   It this is a rerun.
+ * \param[in]  forceGpuUpdateDefaultWithDD  If update should run on GPU with DD by default.
+ * \param[in]  isDomainDecomposition        Whether there more than one domain.
+ * \param[in]  useUpdateGroups              If the constraints can be split across domains.
+ * \param[in]  useGpuForPme                 Whether GPUs will be used for PME interactions.
+ * \param[in]  useGpuForNonbonded           Whether GPUs will be used for nonbonded interactions.
+ * \param[in]  updateTarget                 User choice for running simulation on GPU.
+ * \param[in]  gpusWereDetected             Whether compatible GPUs were detected on any node.
+ * \param[in]  inputrec                     The user input.
+ * \param[in]  mtop                         The global topology.
+ * \param[in]  useEssentialDynamics         If essential dynamics is active.
+ * \param[in]  doOrientationRestraints      If orientation restraints are enabled.
+ * \param[in]  useReplicaExchange           If this is a REMD simulation.
+ * \param[in]  doRerun                      It this is a rerun.
  *
  * \returns    Whether complete simulation can be run on GPU.
  * \throws     std::bad_alloc            If out of memory
  *             InconsistentInputError    If the user requirements are inconsistent.
  */
-bool decideWhetherToUseGpuForUpdate(bool              isDomainDecomposition,
+bool decideWhetherToUseGpuForUpdate(bool              forceGpuUpdateDefaultWithDD,
+                                    bool              isDomainDecomposition,
+                                    bool              useUpdateGroups,
                                     bool              useGpuForPme,
                                     bool              useGpuForNonbonded,
                                     TaskTarget        updateTarget,
-- 
2.22.0