Remove excessive H2D and D2H coordinates copies when update is offloaded

author Artem Zhmurov <zhmurov@gmail.com>

Wed, 16 Oct 2019 14:33:31 +0000 (16:33 +0200)

committer Artem Zhmurov <zhmurov@gmail.com>

Fri, 18 Oct 2019 14:45:32 +0000 (16:45 +0200)
author Artem Zhmurov <zhmurov@gmail.com>
Wed, 16 Oct 2019 14:33:31 +0000 (16:33 +0200)
committer Artem Zhmurov <zhmurov@gmail.com>
Fri, 18 Oct 2019 14:45:32 +0000 (16:45 +0200)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index adf6da0e5b5b119a6318e6d4ea75443997677e21..eb2185a6fa431c5185e2564aca53347a0146d9af 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1037,7 +1037,13 @@ void do_force(FILE                                     *fplog,
                  pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
              }
          }
                  pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
              }
          }
-        stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+        // We need to copy coordinates when:
+        // 1. Update is not offloaded
+        // 2. The buffers were reinitialized on search step
+        if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
+        {
+            stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+        }
      }
  
      const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
      }
  
      const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
@@ -1155,7 +1161,7 @@ void do_force(FILE                                     *fplog,
          // NS step is also a virial step (on which f buf ops are deactivated).
          if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
          {
          // NS step is also a virial step (on which f buf ops are deactivated).
          if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
          {
-            GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+            GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
              nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
          }
      }
              nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
          }
      }
@@ -1163,6 +1169,7 @@ void do_force(FILE                                     *fplog,
      {
          if (useGpuXBufOps == BufferOpsUseGpu::True)
          {
      {
          if (useGpuXBufOps == BufferOpsUseGpu::True)
          {
+            GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
              nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
                                         stateGpu->getCoordinates(),
                                         localXReadyOnDevice);
              nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
                                         stateGpu->getCoordinates(),
                                         localXReadyOnDevice);
diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h

index edad9c6c0a9739b2419a215f021aaa6301d2b56d..07add034566e7114caf927fdca67c825013de59e 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda.h
+++ b/src/gromacs/mdlib/update_constrain_cuda.h
@@ -140,13 +140,6 @@ class UpdateConstrainCuda
           */
          void setPbc(const t_pbc *pbc);
  
           */
          void setPbc(const t_pbc *pbc);
  
-        /*! \brief Blocking wait on the update of coordinates being ready.
-         *
-         * \todo Remove when the "stitching" is done.
-         */
-        void waitCoordinatesReadyOnDevice();
-
-
          /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
           */
          GpuEventSynchronizer* getCoordinatesReadySync();
          /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
           */
          GpuEventSynchronizer* getCoordinatesReadySync();
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp

index adc765c8754ea4e6eddcc6c892d7cce45fbe5df1..80dadb153c103342b573670f95708be6c0bb045c 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
@@ -95,11 +95,6 @@ void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc)
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
  }
  
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
  }
  
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
-}
-
  GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
  {
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
  GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
  {
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu

index 2052bccb7c801b96b4b4916349cbcb0338cb76bd..e06aa7a5ebe23cb9a1a24ec67e577ba580448bd9 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
@@ -172,11 +172,6 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc)
      settleCuda_->setPbc(pbc);
  }
  
      settleCuda_->setPbc(pbc);
  }
  
-void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice()
-{
-    coordinatesReady_->waitForEvent();
-}
-
  GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
  {
      return coordinatesReady_;
  GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
  {
      return coordinatesReady_;
@@ -224,11 +219,6 @@ void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
      impl_->setPbc(pbc);
  }
  
      impl_->setPbc(pbc);
  }
  
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
-    impl_->waitCoordinatesReadyOnDevice();
-}
-
  GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
  {
      return impl_->getCoordinatesReadySync();
  GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
  {
      return impl_->getCoordinatesReadySync();
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h

index ab6e38431eb9671712104c6f13f8d812e9a275fc..b52cfce2c73983986935d4672394695c41465ab2 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.h
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h
@@ -142,13 +142,6 @@ class UpdateConstrainCuda::Impl
           */
          void setPbc(const t_pbc *pbc);
  
           */
          void setPbc(const t_pbc *pbc);
  
-        /*! \brief Blocking wait on the update of coordinates being ready.
-         *
-         * \todo Remove when the "stitching" is done.
-         */
-        void waitCoordinatesReadyOnDevice();
-
-
          /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
           */
          GpuEventSynchronizer* getCoordinatesReadySync();
          /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
           */
          GpuEventSynchronizer* getCoordinatesReadySync();
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index b30edb8473766ee18524e6a8d2454df817c1b289..d62fafa24942a93cbd728dfc2ba36a30af28a7f7 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -738,6 +738,13 @@ void gmx::LegacySimulator::do_md()
  
          if (bPMETune && bNStList)
          {
  
          if (bPMETune && bNStList)
          {
+            // This has to be here because PME load balancing is called so early.
+            // TODO: Move to after all booleans are defined.
+            if (useGpuForUpdate && !bFirstStep)
+            {
+                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
              /* PME grid + cut-off optimization with GPUs or PME nodes */
              pme_loadbal_do(pme_loadbal, cr,
                             (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
              /* PME grid + cut-off optimization with GPUs or PME nodes */
              pme_loadbal_do(pme_loadbal, cr,
                             (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
@@ -795,18 +802,29 @@ void gmx::LegacySimulator::do_md()
          do_verbose = mdrunOptions.verbose &&
              (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
  
          do_verbose = mdrunOptions.verbose &&
              (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
  
-        // Copy velocities from the GPU when needed:
-        // - On search steps to keep copy on host (device buffers are reinitialized).
-        // - When needed for the output.
          if (useGpuForUpdate && !bFirstStep)
          {
          if (useGpuForUpdate && !bFirstStep)
          {
+            // Copy velocities from the GPU when needed:
+            // - On search steps to keep copy on host (device buffers are reinitialized).
+            // - When needed for the output.
              if (bNS || do_per_step(step, ir->nstvout))
              {
                  stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
                  stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
              }
              if (bNS || do_per_step(step, ir->nstvout))
              {
                  stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
                  stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
              }
-        }
  
  
+            // Copy coordinate from the GPU when needed:
+            // - On search steps to keep copy on host (device buffers are reinitialized).
+            // - There are CPU bonded forces that need current coordinates
+            // - When needed for the output.
+            if (bNS ||
+                (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) ||
+                do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed))
+            {
+                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
+        }
  
          if (bNS && !(bFirstStep && ir->bContinuation))
          {
  
          if (bNS && !(bFirstStep && ir->bContinuation))
          {
@@ -1254,9 +1272,9 @@ void gmx::LegacySimulator::do_md()
  
                  // Copy data to the GPU after buffers might have being reinitialized
                  stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
  
                  // Copy data to the GPU after buffers might have being reinitialized
                  stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
              }
  
              }
  
-            stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
              stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
  
              // TODO: Use StepWorkload fields.
              stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
  
              // TODO: Use StepWorkload fields.
@@ -1270,7 +1288,6 @@ void gmx::LegacySimulator::do_md()
                                    ir->delta_t, true, bCalcVir, shake_vir,
                                    doTempCouple, ekind->tcstat,
                                    doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
                                    ir->delta_t, true, bCalcVir, shake_vir,
                                    doTempCouple, ekind->tcstat,
                                    doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
-            stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
  
              // Copy velocities D2H after update if:
              // - Globals are computed this step (includes the energy output steps).
  
              // Copy velocities D2H after update if:
              // - Globals are computed this step (includes the energy output steps).
@@ -1279,10 +1296,10 @@ void gmx::LegacySimulator::do_md()
              {
                  stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
                  stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
              {
                  stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
                  stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
              }
  
              }
  
-            // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...)
-            integrator->waitCoordinatesReadyOnDevice();
          }
          else
          {
          }
          else
          {
@@ -1423,6 +1440,13 @@ void gmx::LegacySimulator::do_md()
                  {
                      process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
                      inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
                  {
                      process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
                      inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
+
+                    // TODO: The special case of removing CM motion should be dealt more gracefully
+                    if (useGpuForUpdate)
+                    {
+                        stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                        stateGpu->waitCoordinatesCopiedToDevice(StatePropagatorDataGpu::AtomLocality::Local);
+                    }
                  }
              }
          }
                  }
              }
          }
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index 96066d0ac3f948cd0b5320de6276a10d9b10be81..b5bc2699c0bcbae12af7f45e46e17a6919de237b 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -215,6 +215,14 @@ class StatePropagatorDataGpu
                                                                 const SimulationWorkload &simulationWork,
                                                                 const StepWorkload       &stepWork);
  
                                                                 const SimulationWorkload &simulationWork,
                                                                 const StepWorkload       &stepWork);
  
+        /*! \brief Blocking wait until coordinates are copied to the device.
+         *
+         * Synchronizes the stream in which the copy was executed.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitCoordinatesCopiedToDevice(AtomLocality  atomLocality);
+
          /*! \brief Getter for the event synchronizer for the update is done on th GPU
           *
           *  \returns  The event to synchronize the stream coordinates wre updated on device.
          /*! \brief Getter for the event synchronizer for the update is done on th GPU
           *
           *  \returns  The event to synchronize the stream coordinates wre updated on device.
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp

index 417d488a803c3bd531d5ceda5978c0fd8f378b12..43d707dc3c952126a108c6242768eee497613847 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -104,6 +104,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(A
      return nullptr;
  }
  
      return nullptr;
  }
  
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
  GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
  {
      GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
  GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
  {
      GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

index 1b687c022ef6ba71bceccc2de90af99ec9084550..2b1db0d791739e7a3a91fce3027e6d5b2d46e0b5 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -200,6 +200,14 @@ class StatePropagatorDataGpu::Impl
                                                                 const SimulationWorkload &simulationWork,
                                                                 const StepWorkload       &stepWork);
  
                                                                 const SimulationWorkload &simulationWork,
                                                                 const StepWorkload       &stepWork);
  
+        /*! \brief Blocking wait until coordinates are copied to the device.
+         *
+         * Synchronizes the stream in which the copy was executed.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitCoordinatesCopiedToDevice(AtomLocality  atomLocality);
+
          /*! \brief Getter for the event synchronizer for the update is done on th GPU
           *
           *  \returns  The event to synchronize the stream coordinates wre updated on device.
          /*! \brief Getter for the event synchronizer for the update is done on th GPU
           *
           *  \returns  The event to synchronize the stream coordinates wre updated on device.
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 50294a921cd46f223530063ee4c5149335d18739..cca10920d1bebd0ff15905d5a1a904deee951af4 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -330,6 +330,12 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE
      }
  }
  
      }
  }
  
+void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality  atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    xReadyOnDevice_[atomLocality].waitForEvent();
+}
+
  GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
  {
      return &xUpdatedOnDevice_;
  GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
  {
      return &xUpdatedOnDevice_;
@@ -527,6 +533,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(A
      return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
  }
  
      return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
  }
  
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality  atomLocality)
+{
+    return impl_->waitCoordinatesCopiedToDevice(atomLocality);
+}
+
  GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
  {
      return impl_->xUpdatedOnDevice();
  GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
  {
      return impl_->xUpdatedOnDevice();
author	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 16 Oct 2019 14:33:31 +0000 (16:33 +0200)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Fri, 18 Oct 2019 14:45:32 +0000 (16:45 +0200)
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda.h		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.cpp		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.cu		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.h		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp		patch \| blob \| history