/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
* Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
#include <string>
#include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/fft/gpu_3dfft.h"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/pmalloc.h"
+#if GMX_GPU_SYCL
+# include "gromacs/gpu_utils/syclutils.h"
+#endif
#include "gromacs/hardware/device_information.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#if GMX_GPU_CUDA
-# include "gromacs/gpu_utils/pmalloc_cuda.h"
-
# include "pme.cuh"
-#elif GMX_GPU_OPENCL
-# include "gromacs/gpu_utils/gmxopencl.h"
-#elif GMX_GPU_SYCL
-# include "gromacs/gpu_utils/syclutils.h"
#endif
-#include "gromacs/ewald/pme.h"
-
-#include "pme_gpu_3dfft.h"
#include "pme_gpu_calculate_splines.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_program_impl.h"
void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
{
- const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
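+ // d_forces now holds gmx::RVec elements, so buffer sizes are counted in atoms rather than in float components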
+ const size_t newForcesSize = pmeGpu->nAtomsAlloc;
GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
newForcesSize,
void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
{
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
- float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
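+ // h_forces also stores gmx::RVec, so the host pointer can be passed to the copy directly, with the count in atoms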
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
- h_forcesFloat,
+ pmeGpu->staging.h_forces.data(),
0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms,
+ pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind,
nullptr);
void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
{
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
- float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
- copyFromDeviceBuffer(h_forcesFloat,
+ copyFromDeviceBuffer(pmeGpu->staging.h_forces.data(),
&pmeGpu->kernelParams->atoms.d_forces,
0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms,
+ pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind,
nullptr);
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU_CUDA
destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
- kernelParamsPtr->fractShiftsTableTexture);
+ &kernelParamsPtr->fractShiftsTableTexture);
destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
- kernelParamsPtr->gridlineIndicesTableTexture);
+ &kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
*/
#if GMX_GPU_CUDA
- pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
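+ // Spread pipelining is configured per step in pme_gpu_spread(); start with it disabled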
+ pmeGpu->kernelParams->usePipeline = false;
+ pmeGpu->kernelParams->pipelineAtomStart = 0;
+ pmeGpu->kernelParams->pipelineAtomEnd = 0;
+ pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
#else
// Use this path for any non-CUDA GPU acceleration
// TODO: is there really no global work size limit in OpenCL?
if (pme_gpu_settings(pmeGpu).performGPUFFT)
{
pmeGpu->archSpecific->fftSetup.resize(0);
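+ // This is the single-rank (non-decomposed) FFT path: the PME grids passed in below are used
+ // directly, so Gpu3dFft does not allocate its own grid and no MPI communicator is needed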
+ const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
+ const bool allocateGrid = false;
+ MPI_Comm comm = MPI_COMM_NULL;
+ std::array<int, 1> gridOffsetsInXForEachRank = { 0 };
+ std::array<int, 1> gridOffsetsInYForEachRank = { 0 };
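+ // Select the FFT backend that matches the GPU build configuration (CUDA, OpenCL, or SYCL)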
+#if GMX_GPU_CUDA
+ const gmx::FftBackend backend = gmx::FftBackend::Cufft;
+#elif GMX_GPU_OPENCL
+ const gmx::FftBackend backend = gmx::FftBackend::Ocl;
+#elif GMX_GPU_SYCL
+# if GMX_SYCL_DPCPP && GMX_FFT_MKL
+ const gmx::FftBackend backend = gmx::FftBackend::SyclMkl;
+# elif GMX_SYCL_HIPSYCL
+ const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft;
+# else
+ const gmx::FftBackend backend = gmx::FftBackend::Sycl;
+# endif
+#else
+ GMX_RELEASE_ASSERT(false, "Unknown GPU backend");
+ const gmx::FftBackend backend = gmx::FftBackend::Count;
+#endif
+
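+ // Create one Gpu3dFft setup per PME grid; ngrids is 1, or 2 with FEP perturbed charges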
+ PmeGpuGridParams& grid = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid;
for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu, gridIndex));
+ pmeGpu->archSpecific->fftSetup.push_back(
+ std::make_unique<gmx::Gpu3dFft>(backend,
+ allocateGrid,
+ comm,
+ gridOffsetsInXForEachRank,
+ gridOffsetsInYForEachRank,
+ grid.realGridSize[ZZ],
+ performOutOfPlaceFFT,
+ pmeGpu->archSpecific->deviceContext_,
+ pmeGpu->archSpecific->pmeStream_,
+ grid.realGridSize,
+ grid.realGridSizePadded,
+ grid.complexGridSizePadded,
+ &(grid.d_realGrid[gridIndex]),
+ &(grid.d_fourierGrid[gridIndex])));
}
}
}
pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
pmeGpu->common->runMode = pme->runMode;
pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
- pmeGpu->common->boxScaler = pme->boxScaler;
+ pmeGpu->common->boxScaler = pme->boxScaler.get();
}
/*! \libinternal \brief
* In CUDA result can be nullptr stub, per GpuRegionTimer implementation.
*
* \param[in] pmeGpu The PME GPU data structure.
- * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
+ * \param[in] pmeStageId The PME GPU stage index from the PmeStage enum in src/gromacs/timing/gpu_timing.h
*/
-static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
+static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, PmeStage pmeStageId)
{
CommandEvent* timingEvent = nullptr;
if (pme_gpu_timings_enabled(pmeGpu))
{
- GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
- "Wrong PME GPU timing event index");
- timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
+ GMX_ASSERT(pmeStageId < PmeStage::Count, "Wrong PME GPU timing event index");
+ timingEvent = pmeGpu->archSpecific->timingEvents[pmeStageId].fetchNextEvent();
}
return timingEvent;
}
void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, const int grid_index)
{
- int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
+ PmeStage timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? PmeStage::FftTransformR2C
+ : PmeStage::FftTransformC2R;
pme_gpu_start_timing(pmeGpu, timerId);
pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(
*
* \return Pointer to CUDA kernel
*/
-static auto selectSplineKernelPtr(const PmeGpu* pmeGpu,
- ThreadsPerAtom threadsPerAtom,
+static auto selectSplineKernelPtr(const PmeGpu* pmeGpu,
+ ThreadsPerAtom threadsPerAtom,
bool gmx_unused writeSplinesToGlobal,
const int numGrids)
{
{
kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Dual;
}
+ else
{
kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Single;
}
return kernelPtr;
}
-void pme_gpu_spread(const PmeGpu* pmeGpu,
- GpuEventSynchronizer* xReadyOnDevice,
- real** h_grids,
- bool computeSplines,
- bool spreadCharges,
- const real lambda)
+void pme_gpu_spread(const PmeGpu* pmeGpu,
+ GpuEventSynchronizer* xReadyOnDevice,
+ real** h_grids,
+ bool computeSplines,
+ bool spreadCharges,
+ const real lambda,
+ const bool useGpuDirectComm,
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(
pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
- int timingId;
+ PmeStage timingId;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+ const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines);
if (computeSplines)
{
if (spreadCharges)
{
- timingId = gtPME_SPLINEANDSPREAD;
+ timingId = PmeStage::SplineAndSpread;
kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
else
{
- timingId = gtPME_SPLINE;
+ timingId = PmeStage::Spline;
kernelPtr = selectSplineKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
}
else
{
- timingId = gtPME_SPREAD;
- kernelPtr = selectSpreadKernelPtr(pmeGpu,
- pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
- pmeGpu->common->ngrids);
+ timingId = PmeStage::Spread;
+ kernelPtr = selectSpreadKernelPtr(
+ pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids);
}
pme_gpu_start_timing(pmeGpu, timingId);
auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+
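+ // Pipelining overlaps receipt of coordinates from individual PP ranks with spreading of the
+ // corresponding atom ranges. It is only used for combined spline-and-spread kernels with GPU
+ // direct communication from more than one sender rank, and when splines need not be written
+ // to global memory.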
+ kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm
+ && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1)
+ && !writeGlobalOrSaveSplines;
+ if (kernelParamsPtr->usePipeline)
+ {
+ int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks();
+
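+ // Launch one spread kernel per sender rank on that rank's communication stream, as soon as
+ // its coordinates have been received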
+ for (int i = 0; i < numStagesInPipeline; i++)
+ {
+ int senderRank;
+ if (useGpuDirectComm)
+ {
+ senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank(
+ i, *(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ }
+ else
+ {
+ senderRank = i;
+ }
+
+ // set kernel configuration options specific to this stage of the pipeline
+ std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) =
+ pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank);
+ const int blockCount = static_cast<int>(std::ceil(
+ static_cast<float>(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart)
+ / atomsPerBlock));
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
+ config.gridSize[0] = dimGrid.first;
+ config.gridSize[1] = dimGrid.second;
+ DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank);
+
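+ // Apart from the atom range set above and the per-rank launch stream, the launch
+ // configuration matches the non-pipelined path below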
#if c_canEmbedBuffers
- const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs =
- prepareGpuKernelArguments(kernelPtr,
- config,
- kernelParamsPtr,
- &kernelParamsPtr->atoms.d_theta,
- &kernelParamsPtr->atoms.d_dtheta,
- &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
- &kernelParamsPtr->grid.d_fractShiftsTable,
- &kernelParamsPtr->grid.d_gridlineIndicesTable,
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
- &kernelParamsPtr->atoms.d_coordinates);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+ launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+ }
+ // Set dependencies for PME stream on all pipeline streams
+ for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+ {
+ GpuEventSynchronizer event;
+ event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+ }
+ }
+ else // pipelining is not in use
+ {
+ if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+ {
+ pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+ }
+
+#if c_canEmbedBuffers
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
#endif
- launchGpuKernel(
- kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
+ launchGpuKernel(kernelPtr,
+ config,
+ pmeGpu->archSpecific->pmeStream_,
+ timingEvent,
+ "PME spline/spread",
+ kernelArgs);
+ }
+
pme_gpu_stop_timing(pmeGpu, timingId);
const auto& settings = pmeGpu->settings;
/ gridLinesPerBlock;
config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
- int timingId = gtPME_SOLVE;
+ PmeStage timingId = PmeStage::Solve;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (gridOrdering == GridOrdering::YZX)
{
// TODO test different cache configs
- int timingId = gtPME_GATHER;
+ PmeStage timingId = PmeStage::Gather;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
selectGatherKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
}
}
-void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
+DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
{
if (pmeGpu && pmeGpu->kernelParams)
{
}
else
{
- return nullptr;
+ return DeviceBuffer<gmx::RVec>{};
}
}