From 3b46d7d65c19a7a156f66f9eb5d0b85bf10c134b Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Wed, 4 Nov 2020 18:56:03 +0000
Subject: [PATCH] Allow GPU update without GPU DD

---
 admin/gitlab-ci/gromacs.gitlab-ci.yml         | 23 ++++++
 docs/release-notes/2021/major/performance.rst |  7 ++
 src/gromacs/mdlib/sim_util.cpp                | 74 +++++++++----------
 src/gromacs/mdrun/md.cpp                      | 11 ++-
 src/gromacs/taskassignment/decidegpuusage.cpp | 32 +++-----
 5 files changed, 85 insertions(+), 62 deletions(-)

diff --git a/admin/gitlab-ci/gromacs.gitlab-ci.yml b/admin/gitlab-ci/gromacs.gitlab-ci.yml
index b38de61e53..a4deaa01e7 100644
--- a/admin/gitlab-ci/gromacs.gitlab-ci.yml
+++ b/admin/gitlab-ci/gromacs.gitlab-ci.yml
@@ -1015,6 +1015,29 @@ gromacs:gcc-8-cuda-11.0:regressiontest-gpucommupd-tMPI:
       when: always
       expire_in: 1 week
 
+gromacs:gcc-8-cuda-11.0:regressiontest-upd-tMPI:
+  extends:
+    - .gromacs:base:regressiontest
+    - .rules:post-merge-acceptance
+  image: gromacs/cmake-3.15.7-gcc-8-cuda-11.0-nvidiaopencl-clfft-openmpi-master
+  variables:
+    KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu"
+    KUBERNETES_EXTENDED_RESOURCE_LIMIT: 2
+    REGRESSIONTEST_PME_RANK_NUMBER: 0
+    REGRESSIONTEST_TOTAL_RANK_NUMBER: 4
+    REGRESSIONTEST_OMP_RANK_NUMBER: 1
+    GMX_FORCE_UPDATE_DEFAULT_GPU: 1
+  tags:
+    - k8s-scilifelab
+  needs:
+    - job: gromacs:gcc-8-cuda-11.0:build
+    - job: regressiontests:prepare
+  artifacts:
+    paths:
+      - regressiontests
+    when: always
+    expire_in: 1 week
+
 gromacs:gcc-8-cuda-11.0:regressiontest-gpucommupd-MPI:
   extends:
     - .gromacs:base:regressiontest
diff --git a/docs/release-notes/2021/major/performance.rst b/docs/release-notes/2021/major/performance.rst
index b94c016d48..85b08653c8 100644
--- a/docs/release-notes/2021/major/performance.rst
+++ b/docs/release-notes/2021/major/performance.rst
@@ -32,3 +32,10 @@ CPU SIMD accelerated implementation of harmonic bonds
 SIMD acceleration for bonds slightly improves performance for systems with
 H-bonds only constrained or no constraints. This gives a significant improvement
 with multiple time stepping.
+
+Allow offloading GPU update and constraints without direct GPU communication
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Domain-decomposition and separate-PME-rank parallel runs can now offload
+update and constraints to a GPU with CUDA without also requiring the
+(experimental) direct GPU communication features to be enabled.
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index e788c6b66b..18606f120f 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1213,6 +1213,30 @@ void do_force(FILE* fplog,
                       AtomLocality::Local, simulationWork, stepWork)
                     : nullptr;
 
+    // Copy coordinates from the GPU if update is on the GPU and there
+    // are forces to be computed on the CPU, or for the computation of
+    // virial, or if host-side data will be transferred from this task
+    // to a remote task for halo exchange or PME-PP communication. At
+    // search steps the current coordinates are already on the host,
+    // hence the copy is not needed.
+    const bool haveHostPmePpComms =
+            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
+
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+    const bool haveHostHaloExchangeComms =
+            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
+    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
+            || haveHostPmePpComms || haveHostHaloExchangeComms))
+    {
+        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+        haveCopiedXFromGpu = true;
+    }
+
     // If coordinates are to be sent to PME task from CPU memory, perform that send here.
     // Otherwise the send will occur after H2D coordinate transfer.
     if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu && stepWork.computeSlowForces)
@@ -1220,11 +1244,7 @@ void do_force(FILE* fplog,
         /* Send particle coordinates to the pme nodes */
         if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
         {
-            GMX_RELEASE_ASSERT(false,
-                               "GPU update and separate PME ranks are only supported with GPU "
-                               "direct communication!");
-            // TODO: when this code-path becomes supported add:
-            // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
         }
 
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
@@ -1260,31 +1280,6 @@ void do_force(FILE* fplog,
         }
     }
 
-    // Copy coordinate from the GPU if update is on the GPU and there
-    // are forces to be computed on the CPU, or for the computation of
-    // virial, or if host-side data will be transferred from this task
-    // to a remote task for halo exchange or PME-PP communication. At
-    // search steps the current coordinates are already on the host,
-    // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
-    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms))
-    {
-        GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
-        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        haveCopiedXFromGpu = true;
-    }
-
     // If coordinates are to be sent to PME task from GPU memory, perform that send here.
     // Otherwise the send will occur before the H2D coordinate transfer.
     if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
@@ -1495,10 +1490,12 @@ void do_force(FILE* fplog,
         }
         else
         {
-            // Note: GPU update + DD without direct communication is not supported,
-            // a waitCoordinatesReadyOnHost() should be issued if it will be.
-            GMX_ASSERT(!simulationWork.useGpuUpdate,
-                       "GPU update is not supported with CPU halo exchange");
+            if (simulationWork.useGpuUpdate)
+            {
+                GMX_ASSERT(haveCopiedXFromGpu,
+                           "a wait should only be triggered if copy has been scheduled");
+                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            }
             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
         }
@@ -1978,10 +1975,10 @@ void do_force(FILE* fplog,
         wallcycle_stop(wcycle, ewcFORCE);
     }
 
-    // If on GPU PME-PP comms or GPU update path, receive forces from PME before GPU buffer ops
+    // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate))
+        && simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -2039,7 +2036,8 @@ void do_force(FILE* fplog,
             // copy call done in sim_utils(...) for the output.
            // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
            // they should not be copied in do_md(...) for the output.
-            if (!simulationWork.useGpuUpdate || vsite)
+            if (!simulationWork.useGpuUpdate
+                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
             {
                 stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                 stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
@@ -2076,7 +2074,7 @@ void do_force(FILE* fplog,
 
     // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
-        && !simulationWork.useGpuUpdate && stepWork.computeSlowForces)
+        && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index c20574e37f..f5c0fd393e 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -1267,9 +1267,16 @@ void gmx::LegacySimulator::do_md()
                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                 }
 
-                // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
-                if (!runScheduleWork->stepWork.useGpuFBufferOps)
+                if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
+                    && !thisRankHasDuty(cr, DUTY_PME))
                 {
+                    // The PME forces were received on the host, so they have to be copied
+                    stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
+                }
+                else if (!runScheduleWork->stepWork.useGpuFBufferOps)
+                {
+                    // The buffer ops were not offloaded this step, so the forces are on the
+                    // host and have to be copied
                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
                 }
 
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index 91f0eb4720..e22b51a29c 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -561,26 +561,19 @@ bool decideWhetherToUseGpuForUpdate(const bool isDomainDecom
 
     if (isDomainDecomposition)
     {
-        if (!devFlags.enableGpuHaloExchange)
+        if (hasAnyConstraints && !useUpdateGroups)
         {
-            errorMessage += "Domain decomposition without GPU halo exchange is not supported.\n ";
+            errorMessage +=
+                    "Domain decomposition is only supported with constraints when update "
+                    "groups "
+                    "are used. This means constraining all bonds is not supported, except for "
+                    "small molecules, and box sizes close to half the pair-list cutoff are not "
+                    "supported.\n ";
         }
-        else
-        {
-            if (hasAnyConstraints && !useUpdateGroups)
-            {
-                errorMessage +=
-                        "Domain decomposition is only supported with constraints when update "
-                        "groups "
-                        "are used. This means constraining all bonds is not supported, except for "
-                        "small molecules, and box sizes close to half the pair-list cutoff are not "
-                        "supported.\n ";
-            }
 
-            if (pmeUsesCpu)
-            {
-                errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
-            }
+        if (pmeUsesCpu)
+        {
+            errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
         }
     }
 
@@ -590,11 +583,6 @@ bool decideWhetherToUseGpuForUpdate(const bool isDomainDecom
         {
             errorMessage += "With separate PME rank(s), PME must run fully on the GPU.\n";
         }
-
-        if (!devFlags.enableGpuPmePPComm)
-        {
-            errorMessage += "With separate PME rank(s), PME must use direct communication.\n";
-        }
     }
 
     if (inputrec.useMts)
-- 
2.22.0
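
The sim_util.cpp hunks above are the heart of the change: with GPU update enabled, the
local coordinates are now copied back to the host whenever any host-side consumer needs
them, instead of the DD and separate-PME-rank paths being rejected outright. Below is a
minimal standalone sketch of that decision logic; plain bools stand in for the GROMACS
simulationWork/stepWork/domainWork flags, and none of the names here are the real
GROMACS API.

#include <cstdio>

// Simplified stand-ins for the flags consulted in do_force().
struct Flags
{
    bool useGpuUpdate;              // update/constraints run on the GPU
    bool doNeighborSearch;          // search step: coordinates already on the host
    bool haveCpuLocalForceWork;     // some local force work still runs on the CPU
    bool computeVirial;             // virial computation needs host coordinates
    bool haveHostPmePpComms;        // PME-PP transfer staged through host memory
    bool haveHostHaloExchangeComms; // halo exchange staged through host memory
};

// Mirrors the condition guarding stateGpu->copyCoordinatesFromGpu() in the patch.
static bool needCoordinatesOnHost(const Flags& f)
{
    return f.useGpuUpdate && !f.doNeighborSearch
           && (f.haveCpuLocalForceWork || f.computeVirial || f.haveHostPmePpComms
               || f.haveHostHaloExchangeComms);
}

int main()
{
    // DD run with CPU halo exchange and GPU update: previously asserted out,
    // now simply triggers the device-to-host copy.
    const Flags ddCpuHalo = { true, false, false, false, false, true };
    // Fully GPU-resident step: no host-side consumer, so no copy is issued.
    const Flags gpuResident = { true, false, false, false, false, false };

    std::printf("DD + CPU halo exchange: copy = %d\n", needCoordinatesOnHost(ddCpuHalo));
    std::printf("GPU-resident step:      copy = %d\n", needCoordinatesOnHost(gpuResident));
    return 0;
}

The same flags then gate the matching waitCoordinatesReadyOnHost() calls before
dd_move_x() and gmx_pme_send_coordinates(), so the D2H copy is scheduled once and
waited on only by the consumers that actually read host coordinates.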