Bug fix for event passed to GPU comm features
authorAlan Gray <alangraygerrit@gmail.com>
Mon, 28 Oct 2019 09:07:46 +0000 (02:07 -0700)
committerMark Abraham <mark.j.abraham@gmail.com>
Thu, 31 Oct 2019 15:49:36 +0000 (16:49 +0100)
The GPU coordinate halo exchange and coordinate PME-PP transfer were
waiting on an event signalling that the coordinates are available on
the GPU that lives in the NB class, but the new state propagator
feature instead records a different event for this purpose, breaking
the dependency. This change fixes the bug by instead pass the state
propagator event as a dependency to these methods. It needs to be done
every step rather than stored in the class since it can change
depending on the type of step.

Change-Id: I7ea048c6f303192c1310d6b5593227c0ad9a81d0

13 files changed:
src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/domdec/gpuhaloexchange_impl.cuh
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_pp.cpp
src/gromacs/ewald/pme_pp_comm_gpu.h
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp

index f991e0512327e77f64d8bd830d1db2dbdae7d51b..d5d8ee995310b37b28b046aea34be73c6c28f902 100644 (file)
@@ -49,6 +49,7 @@
 #include "gromacs/utility/gmxmpi.h"
 
 struct gmx_domdec_t;
+class GpuEventSynchronizer;
 
 namespace gmx
 {
@@ -68,7 +69,7 @@ class GpuHaloExchange
          * the non-local non-bonded kernels). It also must be called
          * after the local coordinates buffer operations (where the
          * coordinates are copied to the device and hence the \c
-         * coordinatesOnDeviceEvent is recorded). Force Halo exchange
+         * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
          * will be performed in \c streamNonLocal (also potentally
          * with buffer clearing in \c streamLocal)and the \c
          * communicateHaloForces method must be called after the
@@ -81,13 +82,11 @@ class GpuHaloExchange
          * \param [in]    mpi_comm_mysim           communicator used for simulation
          * \param [in]    streamLocal              local NB CUDA stream.
          * \param [in]    streamNonLocal           non-local NB CUDA stream.
-         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
          */
         GpuHaloExchange(gmx_domdec_t *dd,
                         MPI_Comm      mpi_comm_mysim,
                         void         *streamLocal,
-                        void         *streamNonLocal,
-                        void         *coordinatesOnDeviceEvent);
+                        void         *streamNonLocal);
         ~GpuHaloExchange();
 
         /*! \brief
@@ -106,8 +105,9 @@ class GpuHaloExchange
          * event when the coordinate data has been copied to the
          * device).
          * \param [in] box  Coordinate box (from which shifts will be constructed)
+         * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
          */
-        void communicateHaloCoordinates(const matrix box);
+        void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer *coordinatesReadyOnDeviceEvent);
 
         /*! \brief GPU halo exchange of force buffer.
          * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
index c4c191b314fb4f6aad8dc11ffa2745ca17759357..2bb917d6b444e134a5cfcbf5159901bf991e5da3 100644 (file)
@@ -63,8 +63,7 @@ class GpuHaloExchange::Impl
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
                                  MPI_Comm       /* mpi_comm_mysim */,
                                  void         * /*streamLocal */,
-                                 void         * /*streamNonLocal */,
-                                 void         * /*coordinatesOnDeviceEvent*/)
+                                 void         * /*streamNonLocal */)
     : impl_(nullptr)
 {
     GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
@@ -80,7 +79,8 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
 }
 
 /*!\brief apply X halo exchange stub. */
-void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
+void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
+                                                 GpuEventSynchronizer * /*coordinatesOnDeviceEvent*/)
 {
     GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation.");
 }
index c195f96cb577aff3e46c7bc57f4a937b0ff43bce..1e5038d0e40513513b6c376484a2fa6a30f522df 100644 (file)
@@ -194,13 +194,12 @@ void GpuHaloExchange::Impl::reinitHalo(float3      *d_coordinatesBuffer,
     return;
 }
 
-// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_
-// event when the coordinate data has been copied to the device).
-void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
+void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box,
+                                                       GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
 {
 
     //ensure stream waits until coordinate data is available on device
-    coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_);
+    coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
 
     // launch kernel to pack send buffer
     KernelLaunchConfig config;
@@ -385,8 +384,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
 GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
                             MPI_Comm      mpi_comm_mysim,
                             void        * localStream,
-                            void        * nonLocalStream,
-                            void        * coordinatesOnDeviceEvent)
+                            void        * nonLocalStream)
     : dd_(dd),
       sendRankX_(dd->neighbor[0][1]),
       recvRankX_(dd->neighbor[0][0]),
@@ -396,8 +394,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
       haloDataTransferLaunched_(new GpuEventSynchronizer()),
       mpi_comm_mysim_(mpi_comm_mysim),
       localStream_(*static_cast<cudaStream_t*> (localStream)),
-      nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
-      coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+      nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream))
 {
 
     GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled");
@@ -430,9 +427,8 @@ GpuHaloExchange::Impl::~Impl()
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
                                  MPI_Comm      mpi_comm_mysim,
                                  void         *localStream,
-                                 void         *nonLocalStream,
-                                 void         *coordinatesOnDeviceEvent)
-    : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, coordinatesOnDeviceEvent))
+                                 void         *nonLocalStream)
+    : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream))
 {
 }
 
@@ -444,9 +440,9 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<float>  d_coordinatesBuffer,
     impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
 }
 
-void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
+void GpuHaloExchange::communicateHaloCoordinates(const matrix box, GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
 {
-    impl_->communicateHaloCoordinates(box);
+    impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
 }
 
 void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
index 3652eca85be8c854e526b5c93465363d5e5809cb..2a41dcfa951c59bdb707b682eec6e3996a27c6db 100644 (file)
@@ -71,13 +71,11 @@ class GpuHaloExchange::Impl
          * \param [in]    mpi_comm_mysim           communicator used for simulation
          * \param [in]    localStream              local NB CUDA stream
          * \param [in]    nonLocalStream           non-local NB CUDA stream
-         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
          */
         Impl(gmx_domdec_t *dd,
              MPI_Comm mpi_comm_mysim,
              void *localStream,
-             void *nonLocalStream,
-             void *coordinatesOnDeviceEvent);
+             void *nonLocalStream);
         ~Impl();
 
         /*! \brief
@@ -92,8 +90,10 @@ class GpuHaloExchange::Impl
         /*! \brief
          * GPU halo exchange of coordinates buffer
          * \param [in] box  Coordinate box (from which shifts will be constructed)
+         * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
          */
-        void communicateHaloCoordinates(const matrix box);
+        void communicateHaloCoordinates(const matrix          box,
+                                        GpuEventSynchronizer *coordinatesReadyOnDeviceEvent);
 
         /*! \brief  GPU halo exchange of force buffer
          * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
@@ -180,8 +180,6 @@ class GpuHaloExchange::Impl
         cudaStream_t                localStream_              = nullptr;
         //! CUDA stream for non-local non-bonded calculations
         cudaStream_t                nonLocalStream_           = nullptr;
-        //! Event triggered when coordinates have been copied to device
-        GpuEventSynchronizer       *coordinatesOnDeviceEvent_ = nullptr;
         //! full coordinates buffer in GPU memory
         float3                     *d_x_                      = nullptr;
         //! full forces buffer in GPU memory
index 3695f12f7222ce1d397b3d0c8f67577ed81d5c5a..507af05bc20c658f584a519925bbf35d4cc91867 100644 (file)
@@ -232,7 +232,8 @@ void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix
                               gmx_bool bEnerVir,
                               int64_t step, bool useGpuPmePpComms,
                               bool reinitGpuPmePpComms,
-                              bool sendCoordinatesFromGpu, gmx_wallcycle *wcycle);
+                              bool sendCoordinatesFromGpu,
+                              GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle);
 
 /*! \brief Tell our PME-only node to finish */
 void gmx_pme_send_finish(const t_commrec *cr);
index 18509725becc8015cff26da146f6c3d8cbe50bfb..cbf6781b10c1da61fc64c37cb35996875b2ce89e 100644 (file)
@@ -101,7 +101,8 @@ static void gmx_pme_send_coeffs_coords(t_forcerec *fr, const t_commrec *cr, unsi
                                        int maxshift_x, int maxshift_y,
                                        int64_t step, bool useGpuPmePpComms,
                                        bool reinitGpuPmePpComms,
-                                       bool sendCoordinatesFromGpu)
+                                       bool sendCoordinatesFromGpu,
+                                       GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
 {
     gmx_domdec_t         *dd;
     gmx_pme_comm_n_box_t *cnb;
@@ -220,7 +221,7 @@ static void gmx_pme_send_coeffs_coords(t_forcerec *fr, const t_commrec *cr, unsi
             {
                 void *sendPtr = sendCoordinatesFromGpu ? static_cast<void*> (fr->stateGpu->getCoordinates()) :
                     static_cast<void*> (xRealPtr);
-                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu);
+                fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
             }
             else
             {
@@ -270,7 +271,7 @@ void gmx_pme_send_parameters(const t_commrec *cr,
     gmx_pme_send_coeffs_coords(nullptr, cr, flags,
                                chargeA, chargeB,
                                sqrt_c6A, sqrt_c6B, sigmaA, sigmaB,
-                               nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false);
+                               nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false, nullptr);
 }
 
 void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix box, const rvec *x,
@@ -278,7 +279,8 @@ void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix
                               gmx_bool bEnerVir,
                               int64_t step, bool useGpuPmePpComms,
                               bool receiveCoordinateAddressFromPme,
-                              bool sendCoordinatesFromGpu, gmx_wallcycle *wcycle)
+                              bool sendCoordinatesFromGpu,
+                              GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle)
 {
     wallcycle_start(wcycle, ewcPP_PMESENDX);
 
@@ -289,7 +291,7 @@ void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix
     }
     gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
                                box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms, receiveCoordinateAddressFromPme,
-                               sendCoordinatesFromGpu);
+                               sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
 
     wallcycle_stop(wcycle, ewcPP_PMESENDX);
 }
@@ -298,7 +300,7 @@ void gmx_pme_send_finish(const t_commrec *cr)
 {
     unsigned int flags = PP_PME_FINISH;
 
-    gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false);
+    gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
 }
 
 void gmx_pme_send_switchgrid(const t_commrec *cr,
index 7461dae9428414725cef3bbc5d25d0019e98ef97..b58125513e87caa3593aa30c1e5553bad5e706ab 100644 (file)
@@ -45,6 +45,8 @@
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/gmxmpi.h"
 
+class GpuEventSynchronizer;
+
 namespace gmx
 {
 
@@ -59,9 +61,8 @@ class PmePpCommGpu
         /*! \brief Creates PME-PP GPU communication object
          * \param[in] comm            Communicator used for simulation
          * \param[in] pmeRank         Rank of PME task
-         * \param[in] coordinatesOnDeviceEvent Event recorded when coordinates are available on device
          */
-        PmePpCommGpu(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent);
+        PmePpCommGpu(MPI_Comm comm, int pmeRank);
         ~PmePpCommGpu();
 
         /*! \brief Perform steps required when buffer size changes
@@ -81,8 +82,9 @@ class PmePpCommGpu
          * \param[in] sendPtr Buffer with coordinate data
          * \param[in] sendSize Number of elements to send
          * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
+         * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
          */
-        void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu);
+        void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
         /*! \brief
          * Return pointer to buffer used for staging PME force on GPU
index 44ce2fa0e2de1032515a8a664407fff1132758be..5324343a8edb06e0a4694240766bef39e89d74f5 100755 (executable)
@@ -62,7 +62,7 @@ class PmePpCommGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank, void gmx_unused *coordinatesOnDeviceEvent)
+PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank)
     : impl_(nullptr)
 {
     GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
@@ -81,7 +81,7 @@ void PmePpCommGpu::receiveForceFromPmeCudaDirect(void gmx_unused *recvPtr, int g
     GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
 }
 
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void gmx_unused *sendPtr, int gmx_unused sendSize, bool gmx_unused sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void gmx_unused *sendPtr, int gmx_unused sendSize, bool gmx_unused sendPmeCoordinatesFromGpu, GpuEventSynchronizer gmx_unused *coordinatesOnDeviceEvent)
 {
     GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
 }
index 3b9e6e7d8b5c778a67e26889535678bc9cb4c7da..621f18893fd32f55e98b2f7f39d5e40b6eed3d43 100644 (file)
 namespace gmx
 {
 
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent)
+PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank)
     : comm_(comm),
-      pmeRank_(pmeRank),
-      coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+      pmeRank_(pmeRank)
 {
     GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
     cudaStreamCreate(&pmePpCommStream_);
@@ -115,11 +114,11 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void *recvPtr, int recvSi
     }
 }
 
-void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool gmx_unused sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool gmx_unused sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
 
     //ensure stream waits until coordinate data is available on device
-    coordinatesOnDeviceEvent_->enqueueWaitEvent(pmePpCommStream_);
+    coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
 
     cudaError_t stat        = cudaMemcpyAsync(remotePmeXBuffer_, sendPtr,
                                               sendSize*DIM*sizeof(float), cudaMemcpyDefault,
@@ -144,8 +143,8 @@ void* PmePpCommGpu::Impl::getForcesReadySynchronizer()
     return static_cast<void*> (&forcesReadySynchronizer_);
 }
 
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent)
-    : impl_(new Impl(comm,  pmeRank, coordinatesOnDeviceEvent))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank)
+    : impl_(new Impl(comm,  pmeRank))
 {
 }
 
@@ -161,9 +160,9 @@ void PmePpCommGpu::receiveForceFromPmeCudaDirect(void *recvPtr, int recvSize, bo
     impl_->receiveForceFromPmeCudaDirect(recvPtr, recvSize, receivePmeForceToGpu);
 }
 
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-    impl_->sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, sendPmeCoordinatesFromGpu);
+    impl_->sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
 }
 
 void* PmePpCommGpu::getGpuForceStagingPtr()
index 2b279a43a1e7ae3e3f24e8157c3dde4bfcec6f45..be408acbf4ac6a9d2e5922bf7c455298c83ce370 100644 (file)
@@ -59,9 +59,8 @@ class PmePpCommGpu::Impl
         /*! \brief Creates PME-PP GPU communication object.
          * \param[in] comm            Communicator used for simulation
          * \param[in] pmeRank         Rank of PME task
-         * \param[in] coordinatesOnDeviceEvent Event recorded when coordinates are available on device
          */
-        Impl(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent);
+        Impl(MPI_Comm comm, int pmeRank);
         ~Impl();
 
         /*! \brief Perform steps required when buffer size changes
@@ -98,8 +97,9 @@ class PmePpCommGpu::Impl
          * \param[in] sendPtr Buffer with coordinate data
          * \param[in] sendSize Number of elements to send
          * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
+         * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
          */
-        void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu);
+        void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
         /*! \brief
          * Return pointer to buffer used for staging PME force on GPU
@@ -132,8 +132,6 @@ class PmePpCommGpu::Impl
         GpuEventSynchronizer    forcesReadySynchronizer_;
         //! Event recorded when coordinates have been transferred to PME task
         GpuEventSynchronizer    pmeCoordinatesSynchronizer_;
-        //! Event recorded when coordinates have been copied to GPU on this PP task.
-        GpuEventSynchronizer   *coordinatesOnDeviceEvent_;
 };
 
 } // namespace gmx
index 25f0ad8aecced1355aa74bd73a8dd0384aba1fd2..4372c3544fd1e3391048ddcd8c204da34892897d 100644 (file)
@@ -1494,10 +1494,8 @@ void init_forcerec(FILE                             *fp,
 
     if (pmeOnlyRankUsesGpu && c_enableGpuPmePpComms)
     {
-        void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
         fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(cr->mpi_comm_mysim,
-                                                               cr->dd->pme_nodeid,
-                                                               coordinatesOnDeviceEvent);
+                                                               cr->dd->pme_nodeid);
     }
 }
 
index b5670683d4601fbb374c971306d80f945f1a0587..6a2cfbe34e7e6ac1fa75abc0f7a32206a40be7f9 100644 (file)
@@ -1046,6 +1046,9 @@ void do_force(FILE                                     *fplog,
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
     }
 
+    const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(AtomLocality::Local,
+                                                                                                        simulationWork, stepWork) : nullptr;
+
 #if GMX_MPI
     if (!thisRankHasDuty(cr, DUTY_PME))
     {
@@ -1060,12 +1063,10 @@ void do_force(FILE                                     *fplog,
                                  lambda[efptCOUL], lambda[efptVDW],
                                  (stepWork.computeVirial || stepWork.computeEnergy),
                                  step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
-                                 sendCoordinatesFromGpu, wcycle);
+                                 sendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
     }
 #endif /* GMX_MPI */
 
-    const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(AtomLocality::Local,
-                                                                                                        simulationWork, stepWork) : nullptr;
     if (useGpuPmeOnThisRank)
     {
         launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
@@ -1282,7 +1283,7 @@ void do_force(FILE                                     *fplog,
             {
                 // The following must be called after local setCoordinates (which records an event
                 // when the coordinate data has been copied to the device).
-                gpuHaloExchange->communicateHaloCoordinates(box);
+                gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
 
                 if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                 {
index 88fa3ea8dacccdb905ba1571324d6dceeff2afa0..1c2fb3e42d3044b315e693b900918260f66cd9ac 100644 (file)
@@ -1356,11 +1356,10 @@ int Mdrunner::mdrunner()
             GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
             void *streamLocal              = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
             void *streamNonLocal           = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-            void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                     "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
             cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
-                                                                        streamNonLocal, coordinatesOnDeviceEvent);
+                                                                        streamNonLocal);
         }
 
         /* Initialize the mdAtoms structure.