Remove excessive H2D and D2H coordinates copies when update is offloaded
authorArtem Zhmurov <zhmurov@gmail.com>
Wed, 16 Oct 2019 14:33:31 +0000 (16:33 +0200)
committerArtem Zhmurov <zhmurov@gmail.com>
Fri, 18 Oct 2019 14:45:32 +0000 (16:45 +0200)
The H2D copies are only needed:
1. When update is not ofloaded.
2. At the search steps, after device buffers were reinitialized.

The D2H copies are only needed:
1. On the search steps, since the device buffers are reinitialized.
2. If there are CPU consumers, e.g. CPU bondeds.
3. When the energy is computed.
4. When coordinates are needed for output.

There are two special cases, when coordinates are needed on host,
that dealt with separately:
1. When the PME it tuned.
2. When center of mass motion is removed.

The locality of copied atoms when update is offloaded is changed
from All to Local in preparation for multi-GPU case. The blocking sync
on H2D copy event is moved from UpdateConstraints to
StatePropagatorDataGpu.

Change-Id: I971a6273b39fa7da07600312c085ce343b5d25ee

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdlib/update_constrain_cuda.h
src/gromacs/mdlib/update_constrain_cuda_impl.cpp
src/gromacs/mdlib/update_constrain_cuda_impl.cu
src/gromacs/mdlib/update_constrain_cuda_impl.h
src/gromacs/mdrun/md.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index adf6da0e5b5b119a6318e6d4ea75443997677e21..eb2185a6fa431c5185e2564aca53347a0146d9af 100644 (file)
@@ -1037,7 +1037,13 @@ void do_force(FILE                                     *fplog,
                 pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
             }
         }
-        stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+        // We need to copy coordinates when:
+        // 1. Update is not offloaded
+        // 2. The buffers were reinitialized on search step
+        if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch)
+        {
+            stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+        }
     }
 
     const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
@@ -1155,7 +1161,7 @@ void do_force(FILE                                     *fplog,
         // NS step is also a virial step (on which f buf ops are deactivated).
         if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
         {
-            GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+            GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
             nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
         }
     }
@@ -1163,6 +1169,7 @@ void do_force(FILE                                     *fplog,
     {
         if (useGpuXBufOps == BufferOpsUseGpu::True)
         {
+            GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
             nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
                                        stateGpu->getCoordinates(),
                                        localXReadyOnDevice);
index edad9c6c0a9739b2419a215f021aaa6301d2b56d..07add034566e7114caf927fdca67c825013de59e 100644 (file)
@@ -140,13 +140,6 @@ class UpdateConstrainCuda
          */
         void setPbc(const t_pbc *pbc);
 
-        /*! \brief Blocking wait on the update of coordinates being ready.
-         *
-         * \todo Remove when the "stitching" is done.
-         */
-        void waitCoordinatesReadyOnDevice();
-
-
         /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
          */
         GpuEventSynchronizer* getCoordinatesReadySync();
index adc765c8754ea4e6eddcc6c892d7cce45fbe5df1..80dadb153c103342b573670f95708be6c0bb045c 100644 (file)
@@ -95,11 +95,6 @@ void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc)
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
-}
-
 GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
 {
     GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
index 2052bccb7c801b96b4b4916349cbcb0338cb76bd..e06aa7a5ebe23cb9a1a24ec67e577ba580448bd9 100644 (file)
@@ -172,11 +172,6 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc)
     settleCuda_->setPbc(pbc);
 }
 
-void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice()
-{
-    coordinatesReady_->waitForEvent();
-}
-
 GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
 {
     return coordinatesReady_;
@@ -224,11 +219,6 @@ void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
     impl_->setPbc(pbc);
 }
 
-void UpdateConstrainCuda::waitCoordinatesReadyOnDevice()
-{
-    impl_->waitCoordinatesReadyOnDevice();
-}
-
 GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync()
 {
     return impl_->getCoordinatesReadySync();
index ab6e38431eb9671712104c6f13f8d812e9a275fc..b52cfce2c73983986935d4672394695c41465ab2 100644 (file)
@@ -142,13 +142,6 @@ class UpdateConstrainCuda::Impl
          */
         void setPbc(const t_pbc *pbc);
 
-        /*! \brief Blocking wait on the update of coordinates being ready.
-         *
-         * \todo Remove when the "stitching" is done.
-         */
-        void waitCoordinatesReadyOnDevice();
-
-
         /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device.
          */
         GpuEventSynchronizer* getCoordinatesReadySync();
index b30edb8473766ee18524e6a8d2454df817c1b289..d62fafa24942a93cbd728dfc2ba36a30af28a7f7 100644 (file)
@@ -738,6 +738,13 @@ void gmx::LegacySimulator::do_md()
 
         if (bPMETune && bNStList)
         {
+            // This has to be here because PME load balancing is called so early.
+            // TODO: Move to after all booleans are defined.
+            if (useGpuForUpdate && !bFirstStep)
+            {
+                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
             /* PME grid + cut-off optimization with GPUs or PME nodes */
             pme_loadbal_do(pme_loadbal, cr,
                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
@@ -795,18 +802,29 @@ void gmx::LegacySimulator::do_md()
         do_verbose = mdrunOptions.verbose &&
             (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 
-        // Copy velocities from the GPU when needed:
-        // - On search steps to keep copy on host (device buffers are reinitialized).
-        // - When needed for the output.
         if (useGpuForUpdate && !bFirstStep)
         {
+            // Copy velocities from the GPU when needed:
+            // - On search steps to keep copy on host (device buffers are reinitialized).
+            // - When needed for the output.
             if (bNS || do_per_step(step, ir->nstvout))
             {
                 stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
                 stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
             }
-        }
 
+            // Copy coordinate from the GPU when needed:
+            // - On search steps to keep copy on host (device buffers are reinitialized).
+            // - There are CPU bonded forces that need current coordinates
+            // - When needed for the output.
+            if (bNS ||
+                (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) ||
+                do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed))
+            {
+                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+            }
+        }
 
         if (bNS && !(bFirstStep && ir->bContinuation))
         {
@@ -1254,9 +1272,9 @@ void gmx::LegacySimulator::do_md()
 
                 // Copy data to the GPU after buffers might have being reinitialized
                 stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
             }
 
-            stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
 
             // TODO: Use StepWorkload fields.
@@ -1270,7 +1288,6 @@ void gmx::LegacySimulator::do_md()
                                   ir->delta_t, true, bCalcVir, shake_vir,
                                   doTempCouple, ekind->tcstat,
                                   doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
-            stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
 
             // Copy velocities D2H after update if:
             // - Globals are computed this step (includes the energy output steps).
@@ -1279,10 +1296,10 @@ void gmx::LegacySimulator::do_md()
             {
                 stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local);
                 stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local);
             }
 
-            // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...)
-            integrator->waitCoordinatesReadyOnDevice();
         }
         else
         {
@@ -1423,6 +1440,13 @@ void gmx::LegacySimulator::do_md()
                 {
                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
+
+                    // TODO: The special case of removing CM motion should be dealt more gracefully
+                    if (useGpuForUpdate)
+                    {
+                        stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::Local);
+                        stateGpu->waitCoordinatesCopiedToDevice(StatePropagatorDataGpu::AtomLocality::Local);
+                    }
                 }
             }
         }
index 96066d0ac3f948cd0b5320de6276a10d9b10be81..b5bc2699c0bcbae12af7f45e46e17a6919de237b 100644 (file)
@@ -215,6 +215,14 @@ class StatePropagatorDataGpu
                                                                const SimulationWorkload &simulationWork,
                                                                const StepWorkload       &stepWork);
 
+        /*! \brief Blocking wait until coordinates are copied to the device.
+         *
+         * Synchronizes the stream in which the copy was executed.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitCoordinatesCopiedToDevice(AtomLocality  atomLocality);
+
         /*! \brief Getter for the event synchronizer for the update is done on th GPU
          *
          *  \returns  The event to synchronize the stream coordinates wre updated on device.
index 417d488a803c3bd531d5ceda5978c0fd8f378b12..43d707dc3c952126a108c6242768eee497613847 100644 (file)
@@ -104,6 +104,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(A
     return nullptr;
 }
 
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
 {
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
index 1b687c022ef6ba71bceccc2de90af99ec9084550..2b1db0d791739e7a3a91fce3027e6d5b2d46e0b5 100644 (file)
@@ -200,6 +200,14 @@ class StatePropagatorDataGpu::Impl
                                                                const SimulationWorkload &simulationWork,
                                                                const StepWorkload       &stepWork);
 
+        /*! \brief Blocking wait until coordinates are copied to the device.
+         *
+         * Synchronizes the stream in which the copy was executed.
+         *
+         *  \param[in] atomLocality  Locality of the particles to wait for.
+         */
+        void waitCoordinatesCopiedToDevice(AtomLocality  atomLocality);
+
         /*! \brief Getter for the event synchronizer for the update is done on th GPU
          *
          *  \returns  The event to synchronize the stream coordinates wre updated on device.
index 50294a921cd46f223530063ee4c5149335d18739..cca10920d1bebd0ff15905d5a1a904deee951af4 100644 (file)
@@ -330,6 +330,12 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE
     }
 }
 
+void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality  atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    xReadyOnDevice_[atomLocality].waitForEvent();
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
 {
     return &xUpdatedOnDevice_;
@@ -527,6 +533,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(A
     return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
 }
 
+void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality  atomLocality)
+{
+    return impl_->waitCoordinatesCopiedToDevice(atomLocality);
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
 {
     return impl_->xUpdatedOnDevice();