/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 */
#include "gromacs/ewald/pme_spread.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/device_management.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
+#include "testutils/test_hardware_environment.h"
#include "testutils/testasserts.h"
-#include "testhardwarecontexts.h"
+class DeviceContext;
namespace gmx
{
bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode)
{
- bool implemented;
- gmx_mtop_t mtop;
+ bool implemented;
switch (mode)
{
case CodePath::CPU: implemented = true; break;
case CodePath::GPU:
implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr)
- && pme_gpu_supports_input(*inputRec, mtop, nullptr));
+ && pme_gpu_supports_input(*inputRec, nullptr));
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
//! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
- const CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- const real ewaldCoeff_q,
- const real ewaldCoeff_lj)
+PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
+ const CodePath mode,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const Matrix3x3& box,
+ const real ewaldCoeff_q,
+ const real ewaldCoeff_lj)
{
const MDLogger dummyLogger;
const auto runMode = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
t_commrec dummyCommrec = { 0 };
NumPmeDomains numPmeDomains = { 1, 1 };
- gmx_pme_t* pmeDataRaw =
- gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true, ewaldCoeff_q,
- ewaldCoeff_lj, 1, runMode, nullptr, deviceInfo, pmeGpuProgram, dummyLogger);
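+    // Initialize a bare PME structure; per-atom data is supplied later through gmx_pme_reinit_atoms().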
+ gmx_pme_t* pmeDataRaw = gmx_pme_init(&dummyCommrec,
+ numPmeDomains,
+ inputRec,
+ false,
+ false,
+ true,
+ ewaldCoeff_q,
+ ewaldCoeff_lj,
+ 1,
+ runMode,
+ nullptr,
+ deviceContext,
+ deviceStream,
+ pmeGpuProgram,
+ dummyLogger);
PmeSafePointer pme(pmeDataRaw); // taking ownership
// TODO get rid of this with proper matrix type
return pme;
}
-//! Simple PME initialization based on input, no atom data
-PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec,
- const CodePath mode,
- const DeviceInformation* deviceInfo,
- const PmeGpuProgram* pmeGpuProgram,
- const Matrix3x3& box,
- const real ewaldCoeff_q,
- const real ewaldCoeff_lj)
-{
- return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
- // hiding the fact that PME actually needs to know the number of atoms in advance
-}
-
PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
{
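+    // Identity box and zero Ewald coefficients serve as placeholders for a CPU-only empty setup.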
const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
- return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+ return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
}
//! Make a GPU state-propagator manager
std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t& pme,
- const DeviceContext& deviceContext)
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream)
{
// TODO: Pin the host buffer and use async memory copies
    // TODO: The special constructor for PME-only ranks / PME tests is used here. There should be
    // a mechanism to prevent the other constructor from being used here.
- return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
- GpuApiCallBehavior::Sync,
- pme_gpu_get_block_size(&pme), nullptr);
+ return std::make_unique<StatePropagatorDataGpu>(
+ deviceStream, *deviceContext, GpuApiCallBehavior::Sync, pme_gpu_get_block_size(&pme), nullptr);
}
//! PME initialization with atom data
atc = &(pme->atc[0]);
atc->x = coordinates;
atc->coefficient = charges;
- gmx_pme_reinit_atoms(pme, atomCount, charges.data());
+ gmx_pme_reinit_atoms(pme, atomCount, charges, {});
/* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
break;
atc = &(pme->atc[0]);
    // We need to set atc->n so that the atom count is available in the tests
atc->setNumAtoms(atomCount);
- gmx_pme_reinit_atoms(pme, atomCount, charges.data());
+ gmx_pme_reinit_atoms(pme, atomCount, charges, {});
stateGpu->reinit(atomCount, atomCount);
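+            // All atoms are local in these single-rank tests, so the Local locality covers the
+            // whole coordinate buffer.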
stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()),
- gmx::AtomLocality::All);
+ gmx::AtomLocality::Local);
pme_gpu_set_kernelparam_coordinates(pme->gpu, stateGpu->getCoordinates());
break;
switch (mode)
{
case CodePath::CPU:
- gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused,
- paddedGridSize);
+ gmx_parallel_3dfft_real_limits(
+ pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
break;
case CodePath::GPU:
{
const size_t gridIndex = 0;
IVec gridOffsetUnused, complexOrderUnused;
- gmx_parallel_3dfft_complex_limits(pme->pfft_setup[gridIndex], complexOrderUnused, gridSize,
- gridOffsetUnused, paddedGridSize); // TODO: what about YZX ordering?
+ gmx_parallel_3dfft_complex_limits(
+ pme->pfft_setup[gridIndex], complexOrderUnused, gridSize, gridOffsetUnused, paddedGridSize); // TODO: what about YZX ordering?
}
//! Getting the PME grid memory buffer and its sizes - template definition
IVec& /*unused*/) //NOLINT(google-runtime-references)
{
GMX_THROW(InternalError("Deleted function call"));
- // explicitly deleting general template does not compile in clang/icc, see https://llvm.org/bugs/show_bug.cgi?id=17537
+    // explicitly deleting the general template does not compile in clang, see https://llvm.org/bugs/show_bug.cgi?id=17537
}
//! Getting the PME real grid memory buffer and its sizes
PmeAtomComm* atc = &(pme->atc[0]);
const size_t gridIndex = 0;
const bool computeSplinesForZeroCharges = true;
- real* fftgrid = spreadCharges ? pme->fftgrid[gridIndex] : nullptr;
+ real** fftgrid = spreadCharges ? pme->fftgrid : nullptr;
real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
switch (mode)
{
case CodePath::CPU:
- spread_on_grid(pme, atc, &pme->pmegrid[gridIndex], computeSplines, spreadCharges,
- fftgrid, computeSplinesForZeroCharges, gridIndex);
+ spread_on_grid(pme,
+ atc,
+ &pme->pmegrid[gridIndex],
+ computeSplines,
+ spreadCharges,
+ fftgrid != nullptr ? fftgrid[gridIndex] : nullptr,
+ computeSplinesForZeroCharges,
+ gridIndex);
if (spreadCharges && !pme->bUseThreads)
{
wrap_periodic_pmegrid(pme, pmegrid);
- copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex);
+ copy_pmegrid_to_fftgrid(
+ pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex);
}
break;
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
case CodePath::GPU:
{
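+            // Free-energy perturbation is not exercised in these tests; a fixed
+            // charge-interpolation lambda is used.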
+ const real lambdaQ = 1.0;
// no synchronization needed as x is transferred in the PME stream
GpuEventSynchronizer* xReadyOnDevice = nullptr;
- pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+
+ pme_gpu_spread(pme->gpu,
+ xReadyOnDevice,
+ fftgrid,
+ computeSplines,
+ spreadCharges,
+ lambdaQ,
+ useGpuDirectComm,
+ pmeCoordinateReceiverGpu);
}
break;
+#endif
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
t_complex* h_grid = pmeGetComplexGridInternal(pme);
const bool useLorentzBerthelot = false;
const size_t threadIndex = 0;
+ const size_t gridIndex = 0;
switch (mode)
{
case CodePath::CPU:
break;
case PmeSolveAlgorithm::LennardJones:
- solve_pme_lj_yzx(pme, &h_grid, useLorentzBerthelot, cellVolume,
- computeEnergyAndVirial, pme->nthread, threadIndex);
+ solve_pme_lj_yzx(pme,
+ &h_grid,
+ useLorentzBerthelot,
+ cellVolume,
+ computeEnergyAndVirial,
+ pme->nthread,
+ threadIndex);
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
switch (method)
{
case PmeSolveAlgorithm::Coulomb:
- pme_gpu_solve(pme->gpu, h_grid, gridOrdering, computeEnergyAndVirial);
+ pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial);
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
const size_t threadIndex = 0;
const size_t gridIndex = 0;
real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
- real* fftgrid = pme->fftgrid[gridIndex];
+ real** fftgrid = pme->fftgrid;
switch (mode)
{
// something which is normally done in serial spline computation (make_thread_local_ind())
atc->spline[threadIndex].n = atomCount;
}
- copy_fftgrid_to_pmegrid(pme, fftgrid, pmegrid, gridIndex, pme->nthread, threadIndex);
+ copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex);
unwrap_periodic_pmegrid(pme, pmegrid);
gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale);
break;
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
case CodePath::GPU:
{
// Variable initialization needs a non-switch scope
const bool computeEnergyAndVirial = false;
- PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial);
+ const real lambdaQ = 1.0;
+ PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ);
GMX_ASSERT(forces.size() == output.forces_.size(),
"Size of force buffers did not match");
- pme_gpu_gather(pme->gpu, reinterpret_cast<float*>(fftgrid));
+ pme_gpu_gather(pme->gpu, fftgrid, lambdaQ);
std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
}
break;
+#endif
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
/*! \brief Return the number of atoms per warp */
static int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
{
- if (pmeGpu->settings.useOrderThreadsPerAtom)
- {
- return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
- }
- else
- {
- return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom;
- }
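+    // Example: with a warp size of 32 and PME order 4, ThreadsPerAtom::Order gives
+    // 32 / 4 = 8 atoms per warp, while ThreadsPerAtom::OrderSquared gives 32 / 16 = 2.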
+ const int order = pmeGpu->common->pme_order;
+ const int threadsPerAtom =
+ (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? order : order * order);
+ return pmeGpu->programHandle_->warpSize() / threadsPerAtom;
}
/*! \brief Rearranges the atom spline data between the GPU and host layouts.
switch (mode)
{
case CodePath::GPU:
- memcpy(pme_gpu_staging(pme->gpu).h_gridlineIndices, gridLineIndices.data(),
+ memcpy(pme_gpu_staging(pme->gpu).h_gridlineIndices,
+ gridLineIndices.data(),
atomCount * sizeof(gridLineIndices[0]));
break;
{
case CodePath::GPU: // intentional absence of break, the grid will be copied from the host buffer in testing mode
case CodePath::CPU:
- std::memset(grid, 0,
- paddedGridSize[XX] * paddedGridSize[YY] * paddedGridSize[ZZ] * sizeof(ValueType));
+ std::memset(grid, 0, paddedGridSize[XX] * paddedGridSize[YY] * paddedGridSize[ZZ] * sizeof(ValueType));
for (const auto& gridValue : gridValues)
{
for (int i = 0; i < DIM; i++)
//! Getting the reciprocal energy and virial
PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method)
{
- PmeOutput output;
+ PmeOutput output;
+ const real lambdaQ = 1.0;
switch (mode)
{
case CodePath::CPU:
case CodePath::GPU:
switch (method)
{
- case PmeSolveAlgorithm::Coulomb: pme_gpu_getEnergyAndVirial(*pme, &output); break;
+ case PmeSolveAlgorithm::Coulomb:
+ pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output);
+ break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return output;
}
+const char* codePathToString(CodePath codePath)
+{
+ switch (codePath)
+ {
+ case CodePath::CPU: return "CPU";
+ case CodePath::GPU: return "GPU";
+ default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
+ }
+}
+
+PmeTestHardwareContext::PmeTestHardwareContext() : codePath_(CodePath::CPU) {}
+
+PmeTestHardwareContext::PmeTestHardwareContext(TestDevice* testDevice) :
+ codePath_(CodePath::GPU), testDevice_(testDevice)
+{
+ setActiveDevice(testDevice_->deviceInfo());
+ pmeGpuProgram_ = buildPmeGpuProgram(testDevice_->deviceContext());
+}
+
+//! Returns a human-readable context description line
+std::string PmeTestHardwareContext::description() const
+{
+ switch (codePath_)
+ {
+ case CodePath::CPU: return "CPU";
+ case CodePath::GPU: return "GPU (" + testDevice_->description() + ")";
+ default: return "Unknown code path.";
+ }
+}
+
+void PmeTestHardwareContext::activate() const
+{
+ if (codePath_ == CodePath::GPU)
+ {
+ setActiveDevice(testDevice_->deviceInfo());
+ }
+}
+
+std::vector<std::unique_ptr<PmeTestHardwareContext>> createPmeTestHardwareContextList()
+{
+ std::vector<std::unique_ptr<PmeTestHardwareContext>> pmeTestHardwareContextList;
+ // Add CPU
+ pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>());
+ // Add GPU devices
+ const auto& testDeviceList = getTestHardwareEnvironment()->getTestDeviceList();
+ for (const auto& testDevice : testDeviceList)
+ {
+ pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>(testDevice.get()));
+ }
+ return pmeTestHardwareContextList;
+}
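+
+// Typical use: a test iterates this list, calls activate() on each context, and then runs
+// the PME stage under test on that context's code path.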
+
} // namespace test
} // namespace gmx