The event synchronizer that indicates the coordinates are ready in GPU
memory is now passed to the two tasks that depend on this input: PME and
X buffer ops. Both enqueue a wait on the passed event prior to kernel
launch, ensuring that the coordinates are ready before the kernels
start executing.
On separate PME ranks and in tests no synchronization is necessary,
as a single stream is used there.
With the on-device sync in place, this change also removes the
streamSynchronize call from copyCoordinatesToGpu.
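For reference, below is a minimal CUDA sketch of the dependency pattern
this change relies on (the streams, event and kernel names are
illustrative placeholders, not the GROMACS API; GpuEventSynchronizer's
markEvent/enqueueWaitEvent wrap this CUDA event machinery):

    #include <cuda_runtime.h>

    __global__ void spreadKernel(const float3 *x, int n)
    {
        // Consumes the coordinates, e.g. PME spread or X buffer ops.
    }

    void launchWithDependency(const float3 *h_x, float3 *d_x, int n,
                              cudaStream_t copyStream, cudaStream_t pmeStream)
    {
        cudaEvent_t xReadyOnDevice;
        cudaEventCreateWithFlags(&xReadyOnDevice, cudaEventDisableTiming);

        // Producer: asynchronous H2D copy of the coordinates, then mark the
        // event in the copy stream (analogous to markEvent() in
        // copyCoordinatesToGpu).
        cudaMemcpyAsync(d_x, h_x, n * sizeof(float3), cudaMemcpyHostToDevice, copyStream);
        cudaEventRecord(xReadyOnDevice, copyStream);

        // Consumer: enqueue a wait in the PME stream so the kernel starts only
        // after the copy has completed; no host-side stream synchronization
        // is needed.
        cudaStreamWaitEvent(pmeStream, xReadyOnDevice, 0);
        spreadKernel<<<(n + 127) / 128, 128, 0, pmeStream>>>(d_x, n);

        cudaEventDestroy(xReadyOnDevice);
    }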
Refs. #2816, #3126.
Change-Id: I3457f01f44ca6d6ad08e0118d8b1def2ab0b381b
14 files changed:
* Launches first stage of PME on GPU - spreading kernel.
*
* \param[in] pme The PME data structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] wcycle The wallclock counter.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
- gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
}
void pme_gpu_launch_spread(gmx_pme_t *pme,
+ GpuEventSynchronizer *xReadyOnDevice,
gmx_wallcycle *wcycle)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
const bool spreadCharges = true;
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
- pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+ pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
return std::pair<int, int>(colCount, minRowCount);
}
-void pme_gpu_spread(const PmeGpu *pmeGpu,
- int gmx_unused gridIndex,
- real *h_grid,
- bool computeSplines,
- bool spreadCharges)
+void pme_gpu_spread(const PmeGpu *pmeGpu,
+ GpuEventSynchronizer *xReadyOnDevice,
+ int gmx_unused gridIndex,
+ real *h_grid,
+ bool computeSplines,
+ bool spreadCharges)
{
GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
//(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+ // Ensure that the coordinates are ready on the device before launching spread;
+ // this is only needed with CUDA on PP+PME ranks. It is not needed on separate
+ // PME ranks, in unit tests or in OpenCL, as those cases use a single stream
+ // (hence xReadyOnDevice == nullptr).
+ // Note: Consider adding an assertion on xReadyOnDevice once separate PME ranks
+ // can be detected here.
+ if (xReadyOnDevice)
+ {
+ xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+ }
+
const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
* A GPU spline computation and charge spreading function.
*
* \param[in] pmeGpu The PME GPU structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory;
+ * can be nullptr when invoked on a separate PME rank or from PME tests.
* \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
* \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
* e.g. testing or host-side FFT)
* \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
* \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
- int GPU_FUNC_ARGUMENT(gridIndex),
- real *GPU_FUNC_ARGUMENT(h_grid),
- bool GPU_FUNC_ARGUMENT(computeSplines),
- bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ int GPU_FUNC_ARGUMENT(gridIndex),
+ real *GPU_FUNC_ARGUMENT(h_grid),
+ bool GPU_FUNC_ARGUMENT(computeSplines),
+ bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
/*! \libinternal \brief
* 3D FFT R2C/C2R routine.
// or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+ // On the separate PME rank we do not need a synchronizer as we schedule everything in a single stream
+ GpuEventSynchronizer *xReadyOnDevice = nullptr;
- pme_gpu_launch_spread(pme, wcycle);
+ pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
pme_gpu_launch_complex_transforms(pme, wcycle);
pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);
break;
case CodePath::GPU:
- pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
- break;
+ {
+ // no synchronization needed as x is transferred in the PME stream
+ GpuEventSynchronizer *xReadyOnDevice = nullptr;
+ pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+ }
+ break;
default:
GMX_THROW(InternalError("Test not implemented for this mode"));
* \param[in] box The box matrix
* \param[in] stepWork Step schedule flags
* \param[in] pmeFlags PME flags
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] wcycle The wallcycle structure
*/
-static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
- const matrix box,
- const StepWorkload &stepWork,
- int pmeFlags,
- gmx_wallcycle_t wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
+ const matrix box,
+ const StepWorkload &stepWork,
+ int pmeFlags,
+ GpuEventSynchronizer *xReadyOnDevice,
+ gmx_wallcycle_t wcycle)
{
pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
- pme_gpu_launch_spread(pmedata, wcycle);
+ pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
}
+ const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+ simulationWork, stepWork) : nullptr;
if (useGpuPmeOnThisRank)
{
- launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
+ localXReadyOnDevice, wcycle);
}
/* do gridding for pair search */
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
- stateGpu->getCoordinates());
+ stateGpu->getCoordinates(),
+ localXReadyOnDevice);
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
- stateGpu->getCoordinates());
+ stateGpu->getCoordinates(),
+ stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+ simulationWork, stepWork));
paddingSize_(paddingSize)
{
static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
- GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
// TODO: Refactor when the StreamManager is introduced.
if (GMX_GPU == GMX_GPU_OPENCL)
paddingSize_(paddingSize)
{
static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
- GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
if (GMX_GPU == GMX_GPU_OPENCL)
{
if (GMX_GPU == GMX_GPU_CUDA)
{
xReadyOnDevice_[atomLocality].markEvent(commandStream);
- // TODO: Remove When event-based synchronization is introduced
- gpuStreamSynchronize(commandStream);
//
// TODO: This should be reconsidered to support the halo exchange.
//
+ // In OpenCL no events are used as coordinate sync is not necessary
+ if (GMX_GPU == GMX_GPU_OPENCL)
+ {
+ return nullptr;
+ }
if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
{
return &xUpdatedOnDevice_;
}
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
- const Nbnxm::AtomLocality locality,
- bool fillLocal,
- gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> d_x)
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+ const Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice)
fillLocal && g == 0,
gpu_nbv,
d_x,
+ xReadyOnDevice,
locality,
g,
gridSet.numColumnsMax());
* \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed.
* \param[in,out] gpu_nbv The NBNXM GPU data structure.
* \param[in] d_x Coordinates to be copied (in plain rvec format).
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
- Nbnxm::AtomLocality locality,
- bool fillLocal,
- gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> d_x);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+ Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice);
/*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
*
bool setFillerCoords,
gmx_nbnxn_gpu_t *nb,
DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice,
const Nbnxm::AtomLocality locality,
int gridId,
int numColumnsMax)
// TODO: This will only work with CUDA
GMX_ASSERT(d_x, "Need a valid device pointer");
+ // ensure that coordinates are ready on the device before launching the kernel
+ GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
+ xReadyOnDevice->enqueueWaitEvent(stream);
+
KernelLaunchConfig config;
config.blockSize[0] = c_bufOpsThreadsPerBlock;
config.blockSize[1] = 1;
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
-void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality,
- const bool fillLocal,
- DeviceBuffer<float> d_x)
+void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality,
+ const bool fillLocal,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice)
{
wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
gpu_nbv,
- d_x);
+ d_x,
+ xReadyOnDevice);
wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
*
* The API function for the transformation of the coordinates from one layout to another in the GPU memory.
*
- * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
- * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
- * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
+ * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
+ * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
+ * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
- void convertCoordinatesGpu(Nbnxm::AtomLocality locality,
- bool fillLocal,
- DeviceBuffer<float> d_x);
+ void convertCoordinatesGpu(Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice);
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
* \param[in] setFillerCoords If the filler coordinates are used.
* \param[in,out] gpu_nbv The nonbonded data GPU structure.
* \param[in] d_x Device-side coordinates in plain rvec format.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] locality Copy coordinates for local or non-local atoms.
* \param[in] gridId Index of the grid being converted.
* \param[in] numColumnsMax Maximum number of columns in the grid.
*/
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
- bool gmx_unused setFillerCoords,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- DeviceBuffer<float> gmx_unused d_x,
- Nbnxm::AtomLocality gmx_unused locality,
- int gmx_unused gridId,
- int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
+ bool gmx_unused setFillerCoords,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ DeviceBuffer<float> gmx_unused d_x,
+ GpuEventSynchronizer gmx_unused *xReadyOnDevice,
+ Nbnxm::AtomLocality gmx_unused locality,
+ int gmx_unused gridId,
+ int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
* \param[in] nb The nonbonded data GPU structure