Allow GPU update without GPU DD
author: Alan Gray <alangray3@gmail.com>
Wed, 4 Nov 2020 18:56:03 +0000 (18:56 +0000)
committer: Szilárd Páll <pall.szilard@gmail.com>
Wed, 4 Nov 2020 18:56:03 +0000 (18:56 +0000)
admin/gitlab-ci/gromacs.gitlab-ci.yml
docs/release-notes/2021/major/performance.rst
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/taskassignment/decidegpuusage.cpp

index b38de61e53715eb85f2fa1ca84798122cbde3c60..a4deaa01e7824371a03246d4d203951992b9756a 100644 (file)
@@ -1015,6 +1015,29 @@ gromacs:gcc-8-cuda-11.0:regressiontest-gpucommupd-tMPI:
     when: always
     expire_in: 1 week
 
+gromacs:gcc-8-cuda-11.0:regressiontest-upd-tMPI:
+  extends:
+    - .gromacs:base:regressiontest
+    - .rules:post-merge-acceptance
+  image: gromacs/cmake-3.15.7-gcc-8-cuda-11.0-nvidiaopencl-clfft-openmpi-master
+  variables:
+    KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu"
+    KUBERNETES_EXTENDED_RESOURCE_LIMIT: 2
+    REGRESSIONTEST_PME_RANK_NUMBER: 0
+    REGRESSIONTEST_TOTAL_RANK_NUMBER: 4
+    REGRESSIONTEST_OMP_RANK_NUMBER: 1
+    GMX_FORCE_UPDATE_DEFAULT_GPU: 1
+  tags:
+    - k8s-scilifelab
+  needs:
+    - job: gromacs:gcc-8-cuda-11.0:build
+    - job: regressiontests:prepare
+  artifacts:
+    paths:
+      - regressiontests
+    when: always
+    expire_in: 1 week
+
 gromacs:gcc-8-cuda-11.0:regressiontest-gpucommupd-MPI:
   extends:
     - .gromacs:base:regressiontest
index b94c016d48585e286b367c96babdcd8ea25fb55b..85b08653c852b526852e9827ca3248e437134c25 100644 (file)
@@ -32,3 +32,10 @@ CPU SIMD accelerated implementation of harmonic bonds
 SIMD acceleration for bonds slightly improves performance for systems
 with H-bonds only constrained or no constraints. This gives a significant
 improvement with multiple time stepping.
+
+Allow offloading GPU update and constraints without direct GPU communication
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Allow domain-decomposition and separate PME rank parallel runs to offload update and
+constraints to a GPU with CUDA without requiring the (experimental) direct GPU
+communication features to also be enabled.
index e788c6b66b2d2d5237d0f005f4aad8795402a016..18606f120fb1f1320d85542e64a27dabcef9f8f7 100644 (file)
@@ -1213,6 +1213,30 @@ void do_force(FILE*                               fplog,
                                                        AtomLocality::Local, simulationWork, stepWork)
                                              : nullptr;
 
+    // Copy coordinates from the GPU if update is on the GPU and there
+    // are forces to be computed on the CPU, or for the computation of
+    // virial, or if host-side data will be transferred from this task
+    // to a remote task for halo exchange or PME-PP communication. At
+    // search steps the current coordinates are already on the host,
+    // hence copy is not needed.
+    const bool haveHostPmePpComms =
+            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
+
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+    const bool haveHostHaloExchangeComms =
+            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
+    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
+            || haveHostPmePpComms || haveHostHaloExchangeComms))
+    {
+        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+        haveCopiedXFromGpu = true;
+    }
+
     // If coordinates are to be sent to PME task from CPU memory, perform that send here.
     // Otherwise the send will occur after H2D coordinate transfer.
     if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu && stepWork.computeSlowForces)
@@ -1220,11 +1244,7 @@ void do_force(FILE*                               fplog,
         /* Send particle coordinates to the pme nodes */
         if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
         {
-            GMX_RELEASE_ASSERT(false,
-                               "GPU update and separate PME ranks are only supported with GPU "
-                               "direct communication!");
-            // TODO: when this code-path becomes supported add:
-            // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
         }
 
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
@@ -1260,31 +1280,6 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // Copy coordinate from the GPU if update is on the GPU and there
-    // are forces to be computed on the CPU, or for the computation of
-    // virial, or if host-side data will be transferred from this task
-    // to a remote task for halo exchange or PME-PP communication. At
-    // search steps the current coordinates are already on the host,
-    // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
-    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms))
-    {
-        GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
-        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        haveCopiedXFromGpu = true;
-    }
-
     // If coordinates are to be sent to PME task from GPU memory, perform that send here.
     // Otherwise the send will occur before the H2D coordinate transfer.
     if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
@@ -1495,10 +1490,12 @@ void do_force(FILE*                               fplog,
             }
             else
             {
-                // Note: GPU update + DD without direct communication is not supported,
-                // a waitCoordinatesReadyOnHost() should be issued if it will be.
-                GMX_ASSERT(!simulationWork.useGpuUpdate,
-                           "GPU update is not supported with CPU halo exchange");
+                if (simulationWork.useGpuUpdate)
+                {
+                    GMX_ASSERT(haveCopiedXFromGpu,
+                               "a wait should only be triggered if copy has been scheduled");
+                    stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+                }
                 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
             }
 
@@ -1978,10 +1975,10 @@ void do_force(FILE*                               fplog,
         wallcycle_stop(wcycle, ewcFORCE);
     }
 
-    // If on GPU PME-PP comms or GPU update path, receive forces from PME before GPU buffer ops
+    // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate))
+        && simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -2039,7 +2036,8 @@ void do_force(FILE*                               fplog,
             //       copy call done in sim_utils(...) for the output.
             // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
             //       they should not be copied in do_md(...) for the output.
-            if (!simulationWork.useGpuUpdate || vsite)
+            if (!simulationWork.useGpuUpdate
+                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
             {
                 stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                 stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
@@ -2076,7 +2074,7 @@ void do_force(FILE*                               fplog,
 
     // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
-        && !simulationWork.useGpuUpdate && stepWork.computeSlowForces)
+        && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
index c20574e37fb4494c93e330ae67e924711c0d5be1..f5c0fd393e527b48ecbb59402f34748c0afcf9a7 100644 (file)
@@ -1267,9 +1267,16 @@ void gmx::LegacySimulator::do_md()
                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
             }
 
-            // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
-            if (!runScheduleWork->stepWork.useGpuFBufferOps)
+            if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
+                && !thisRankHasDuty(cr, DUTY_PME))
             {
+                // The PME forces were received on the host, so they have to be copied
+                stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
+            }
+            else if (!runScheduleWork->stepWork.useGpuFBufferOps)
+            {
+                // The buffer ops were not offloaded this step, so the forces are on the
+                // host and have to be copied
                 stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
             }
 
index 91f0eb4720fe8aba7f75c3152aa714ab65cadc42..e22b51a29c5a4205b2a2693de140aa6235295134 100644 (file)
@@ -561,26 +561,19 @@ bool decideWhetherToUseGpuForUpdate(const bool                     isDomainDecom
 
     if (isDomainDecomposition)
     {
-        if (!devFlags.enableGpuHaloExchange)
+        if (hasAnyConstraints && !useUpdateGroups)
         {
-            errorMessage += "Domain decomposition without GPU halo exchange is not supported.\n ";
+            errorMessage +=
+                    "Domain decomposition is only supported with constraints when update "
+                    "groups "
+                    "are used. This means constraining all bonds is not supported, except for "
+                    "small molecules, and box sizes close to half the pair-list cutoff are not "
+                    "supported.\n ";
         }
-        else
-        {
-            if (hasAnyConstraints && !useUpdateGroups)
-            {
-                errorMessage +=
-                        "Domain decomposition is only supported with constraints when update "
-                        "groups "
-                        "are used. This means constraining all bonds is not supported, except for "
-                        "small molecules, and box sizes close to half the pair-list cutoff are not "
-                        "supported.\n ";
-            }
 
-            if (pmeUsesCpu)
-            {
-                errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
-            }
+        if (pmeUsesCpu)
+        {
+            errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
         }
     }
 
@@ -590,11 +583,6 @@ bool decideWhetherToUseGpuForUpdate(const bool                     isDomainDecom
         {
             errorMessage += "With separate PME rank(s), PME must run fully on the GPU.\n";
         }
-
-        if (!devFlags.enableGpuPmePPComm)
-        {
-            errorMessage += "With separate PME rank(s), PME must use direct communication.\n";
-        }
     }
 
     if (inputrec.useMts)