#include "gromacs/utility/gmxmpi.h"
struct gmx_domdec_t;
+class GpuEventSynchronizer;
namespace gmx
{
* the non-local non-bonded kernels). It also must be called
* after the local coordinates buffer operations (where the
* coordinates are copied to the device and hence the \c
- * coordinatesOnDeviceEvent is recorded). Force Halo exchange
+ * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
* will be performed in \c streamNonLocal (also potentially
* with buffer clearing in \c streamLocal) and the \c
* communicateHaloForces method must be called after the
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] streamLocal local NB CUDA stream.
* \param [in] streamNonLocal non-local NB CUDA stream.
- * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void *streamLocal,
- void *streamNonLocal,
- void *coordinatesOnDeviceEvent);
+ void *streamNonLocal);
~GpuHaloExchange();
/*! \brief
* event when the coordinate data has been copied to the
* device).
* \param [in] box Coordinate box (from which shifts will be constructed)
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
*/
- void communicateHaloCoordinates(const matrix box);
+ void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer *coordinatesReadyOnDeviceEvent);
/*! \brief GPU halo exchange of force buffer.
* \param[in] accumulateForces True if forces should accumulate, otherwise they are set
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
MPI_Comm /* mpi_comm_mysim */,
void * /*streamLocal */,
-void * /*streamNonLocal */,
-void * /*coordinatesOnDeviceEvent*/)
+void * /*streamNonLocal */)
: impl_(nullptr)
{
// Stub: non-CUDA builds must never construct a GpuHaloExchange.
// Fixed typo in the assertion message: "insted" -> "instead".
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
/*!\brief apply X halo exchange stub. */
-void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
+void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
+                                                 GpuEventSynchronizer * /*coordinatesReadyOnDeviceEvent*/)
{
// Stub: non-CUDA builds must never reach the coordinate halo exchange.
// Fixed assertion-message typos ("insted" -> "instead", duplicated word
// "Exchange exchange") and renamed the commented-out parameter to
// coordinatesReadyOnDeviceEvent to match the header declaration.
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
return;
}
-// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_
-// event when the coordinate data has been copied to the device).
-void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
+void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
{
//ensure stream waits until coordinate data is available on device
- coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_);
+ coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
// launch kernel to pack send buffer
KernelLaunchConfig config;
GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void * localStream,
- void * nonLocalStream,
- void * coordinatesOnDeviceEvent)
+ void * nonLocalStream)
: dd_(dd),
sendRankX_(dd->neighbor[0][1]),
recvRankX_(dd->neighbor[0][0]),
haloDataTransferLaunched_(new GpuEventSynchronizer()),
mpi_comm_mysim_(mpi_comm_mysim),
localStream_(*static_cast<cudaStream_t*> (localStream)),
- nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
- coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+ nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream))
{
GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled");
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void *localStream,
- void *nonLocalStream,
- void *coordinatesOnDeviceEvent)
- : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, coordinatesOnDeviceEvent))
+ void *nonLocalStream)
+ : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream))
{
}
impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
}
-void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
+void GpuHaloExchange::communicateHaloCoordinates(const matrix box, GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
{
- impl_->communicateHaloCoordinates(box);
+ impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
}
void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] localStream local NB CUDA stream
* \param [in] nonLocalStream non-local NB CUDA stream
- * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void *localStream,
- void *nonLocalStream,
- void *coordinatesOnDeviceEvent);
+ void *nonLocalStream);
~Impl();
/*! \brief
/*! \brief
* GPU halo exchange of coordinates buffer
* \param [in] box Coordinate box (from which shifts will be constructed)
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
*/
- void communicateHaloCoordinates(const matrix box);
+ void communicateHaloCoordinates(const matrix box,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent);
/*! \brief GPU halo exchange of force buffer
* \param[in] accumulateForces True if forces should accumulate, otherwise they are set
cudaStream_t localStream_ = nullptr;
//! CUDA stream for non-local non-bonded calculations
cudaStream_t nonLocalStream_ = nullptr;
- //! Event triggered when coordinates have been copied to device
- GpuEventSynchronizer *coordinatesOnDeviceEvent_ = nullptr;
//! full coordinates buffer in GPU memory
float3 *d_x_ = nullptr;
//! full forces buffer in GPU memory
gmx_bool bEnerVir,
int64_t step, bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
- bool sendCoordinatesFromGpu, gmx_wallcycle *wcycle);
+ bool sendCoordinatesFromGpu,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle);
/*! \brief Tell our PME-only node to finish */
void gmx_pme_send_finish(const t_commrec *cr);
int maxshift_x, int maxshift_y,
int64_t step, bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
- bool sendCoordinatesFromGpu)
+ bool sendCoordinatesFromGpu,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
{
gmx_domdec_t *dd;
gmx_pme_comm_n_box_t *cnb;
{
void *sendPtr = sendCoordinatesFromGpu ? static_cast<void*> (fr->stateGpu->getCoordinates()) :
static_cast<void*> (xRealPtr);
- fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu);
+ fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
}
else
{
gmx_pme_send_coeffs_coords(nullptr, cr, flags,
chargeA, chargeB,
sqrt_c6A, sqrt_c6B, sigmaA, sigmaB,
- nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false);
+ nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false, nullptr);
}
void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix box, const rvec *x,
gmx_bool bEnerVir,
int64_t step, bool useGpuPmePpComms,
bool receiveCoordinateAddressFromPme,
- bool sendCoordinatesFromGpu, gmx_wallcycle *wcycle)
+ bool sendCoordinatesFromGpu,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle)
{
wallcycle_start(wcycle, ewcPP_PMESENDX);
}
gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms, receiveCoordinateAddressFromPme,
- sendCoordinatesFromGpu);
+ sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
wallcycle_stop(wcycle, ewcPP_PMESENDX);
}
{
unsigned int flags = PP_PME_FINISH;
- gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false);
+ gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
}
void gmx_pme_send_switchgrid(const t_commrec *cr,
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/gmxmpi.h"
+class GpuEventSynchronizer;
+
namespace gmx
{
/*! \brief Creates PME-PP GPU communication object
* \param[in] comm Communicator used for simulation
* \param[in] pmeRank Rank of PME task
- * \param[in] coordinatesOnDeviceEvent Event recorded when coordinates are available on device
*/
- PmePpCommGpu(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent);
+ PmePpCommGpu(MPI_Comm comm, int pmeRank);
~PmePpCommGpu();
/*! \brief Perform steps required when buffer size changes
* \param[in] sendPtr Buffer with coordinate data
* \param[in] sendSize Number of elements to send
* \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
+ * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
*/
- void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu);
+ void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief
* Return pointer to buffer used for staging PME force on GPU
};
/*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank, void gmx_unused *coordinatesOnDeviceEvent)
+PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank)
: impl_(nullptr)
{
GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
}
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void gmx_unused *sendPtr, int gmx_unused sendSize, bool gmx_unused sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void gmx_unused *sendPtr, int gmx_unused sendSize, bool gmx_unused sendPmeCoordinatesFromGpu, GpuEventSynchronizer gmx_unused *coordinatesReadyOnDeviceEvent)
{
// Stub: non-CUDA builds must never attempt the PP->PME coordinate send.
// Renamed the unused parameter to coordinatesReadyOnDeviceEvent for
// consistency with the class declaration and the CUDA implementation.
GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
}
namespace gmx
{
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent)
+PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank)
: comm_(comm),
- pmeRank_(pmeRank),
- coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+ pmeRank_(pmeRank)
{
GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
cudaStreamCreate(&pmePpCommStream_);
}
}
-void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool gmx_unused sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool gmx_unused sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
//ensure stream waits until coordinate data is available on device
- coordinatesOnDeviceEvent_->enqueueWaitEvent(pmePpCommStream_);
+ coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
cudaError_t stat = cudaMemcpyAsync(remotePmeXBuffer_, sendPtr,
sendSize*DIM*sizeof(float), cudaMemcpyDefault,
return static_cast<void*> (&forcesReadySynchronizer_);
}
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent)
- : impl_(new Impl(comm, pmeRank, coordinatesOnDeviceEvent))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank)
+ : impl_(new Impl(comm, pmeRank))
{
}
impl_->receiveForceFromPmeCudaDirect(recvPtr, recvSize, receivePmeForceToGpu);
}
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
- impl_->sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, sendPmeCoordinatesFromGpu);
+ impl_->sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
}
void* PmePpCommGpu::getGpuForceStagingPtr()
/*! \brief Creates PME-PP GPU communication object.
* \param[in] comm Communicator used for simulation
* \param[in] pmeRank Rank of PME task
- * \param[in] coordinatesOnDeviceEvent Event recorded when coordinates are available on device
*/
- Impl(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent);
+ Impl(MPI_Comm comm, int pmeRank);
~Impl();
/*! \brief Perform steps required when buffer size changes
* \param[in] sendPtr Buffer with coordinate data
* \param[in] sendSize Number of elements to send
* \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
+ * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
*/
- void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu);
+ void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief
* Return pointer to buffer used for staging PME force on GPU
GpuEventSynchronizer forcesReadySynchronizer_;
//! Event recorded when coordinates have been transferred to PME task
GpuEventSynchronizer pmeCoordinatesSynchronizer_;
- //! Event recorded when coordinates have been copied to GPU on this PP task.
- GpuEventSynchronizer *coordinatesOnDeviceEvent_;
};
} // namespace gmx
if (pmeOnlyRankUsesGpu && c_enableGpuPmePpComms)
{
- void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(cr->mpi_comm_mysim,
- cr->dd->pme_nodeid,
- coordinatesOnDeviceEvent);
+ cr->dd->pme_nodeid);
}
}
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
}
+ const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(AtomLocality::Local,
+ simulationWork, stepWork) : nullptr;
+
#if GMX_MPI
if (!thisRankHasDuty(cr, DUTY_PME))
{
lambda[efptCOUL], lambda[efptVDW],
(stepWork.computeVirial || stepWork.computeEnergy),
step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
- sendCoordinatesFromGpu, wcycle);
+ sendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
}
#endif /* GMX_MPI */
- const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(AtomLocality::Local,
- simulationWork, stepWork) : nullptr;
if (useGpuPmeOnThisRank)
{
launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
{
// The following must be called after local setCoordinates (which records an event
// when the coordinate data has been copied to the device).
- gpuHaloExchange->communicateHaloCoordinates(box);
+ gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
void *streamLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
void *streamNonLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
- void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
"NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
- streamNonLocal, coordinatesOnDeviceEvent);
+ streamNonLocal);
}
/* Initialize the mdAtoms structure.