/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <cstring>
+#include <algorithm>
+
#include "gromacs/domdec/domdec.h"
-#include "gromacs/ewald/pme-gather.h"
-#include "gromacs/ewald/pme-gpu-internal.h"
-#include "gromacs/ewald/pme-grid.h"
-#include "gromacs/ewald/pme-internal.h"
-#include "gromacs/ewald/pme-solve.h"
-#include "gromacs/ewald/pme-spread.h"
+#include "gromacs/ewald/pme_gather.h"
+#include "gromacs/ewald/pme_gpu_calculate_splines.h"
+#include "gromacs/ewald/pme_gpu_constants.h"
+#include "gromacs/ewald/pme_gpu_internal.h"
+#include "gromacs/ewald/pme_gpu_staging.h"
+#include "gromacs/ewald/pme_grid.h"
+#include "gromacs/ewald/pme_internal.h"
+#include "gromacs/ewald/pme_redistribute.h"
+#include "gromacs/ewald/pme_solve.h"
+#include "gromacs/ewald/pme_spread.h"
#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/device_management.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
+#include "testutils/test_hardware_environment.h"
#include "testutils/testasserts.h"
+class DeviceContext;
+
namespace gmx
{
namespace test
{
-bool pmeSupportsInputForMode(const t_inputrec *inputRec, CodePath mode)
+bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode)
{
- bool implemented;
- gmx_mtop_t mtop;
+ bool implemented;
switch (mode)
{
- case CodePath::CPU:
- implemented = true;
- break;
+ case CodePath::CPU: implemented = true; break;
- case CodePath::CUDA:
- implemented = (pme_gpu_supports_build(nullptr) &&
- pme_gpu_supports_input(*inputRec, mtop, nullptr));
+ case CodePath::GPU:
+ implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr)
+ && pme_gpu_supports_input(*inputRec, nullptr));
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return implemented;
}
return 4 * (splineOrder - 2) + 2 * sineUlps * splineOrder;
}
-//! PME initialization - internal
-static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec,
- CodePath mode,
- const gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram,
- size_t atomCount,
- const Matrix3x3 &box,
- real ewaldCoeff_q = 1.0f,
- real ewaldCoeff_lj = 1.0f
- )
+//! PME initialization
+PmeSafePointer pmeInitWrapper(const t_inputrec* inputRec,
+ const CodePath mode,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream,
+ const PmeGpuProgram* pmeGpuProgram,
+ const Matrix3x3& box,
+ const real ewaldCoeff_q,
+ const real ewaldCoeff_lj)
{
const MDLogger dummyLogger;
- const auto runMode = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::GPU;
- t_commrec dummyCommrec = {0};
+ const auto runMode = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
+ t_commrec dummyCommrec = { 0 };
NumPmeDomains numPmeDomains = { 1, 1 };
- gmx_pme_t *pmeDataRaw = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, atomCount, false, false, true,
- ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, pmeGpuProgram, dummyLogger);
+ gmx_pme_t* pmeDataRaw = gmx_pme_init(&dummyCommrec,
+ numPmeDomains,
+ inputRec,
+ false,
+ false,
+ true,
+ ewaldCoeff_q,
+ ewaldCoeff_lj,
+ 1,
+ runMode,
+ nullptr,
+ deviceContext,
+ deviceStream,
+ pmeGpuProgram,
+ dummyLogger);
PmeSafePointer pme(pmeDataRaw); // taking ownership
// TODO get rid of this with proper matrix type
matrix boxTemp;
for (int i = 0; i < DIM; i++)
{
for (int j = 0; j < DIM; j++)
{
boxTemp[i][j] = box[i * DIM + j];
}
}
- const char *boxError = check_box(-1, boxTemp);
+ const char* boxError = check_box(PbcType::Unset, boxTemp);
GMX_RELEASE_ASSERT(boxError == nullptr, boxError);
switch (mode)
{
- case CodePath::CPU:
- invertBoxMatrix(boxTemp, pme->recipbox);
- break;
+ case CodePath::CPU: invertBoxMatrix(boxTemp, pme->recipbox); break;
- case CodePath::CUDA:
+ case CodePath::GPU:
pme_gpu_set_testing(pme->gpu, true);
pme_gpu_update_input_box(pme->gpu, boxTemp);
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return pme;
}
-//! Simple PME initialization based on input, no atom data
-PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
- CodePath mode,
- const gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram,
- const Matrix3x3 &box,
- real ewaldCoeff_q,
- real ewaldCoeff_lj
- )
+PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
{
- return pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
- // hiding the fact that PME actually needs to know the number of atoms in advance
+ const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
+ return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+}
+
+//! Make a GPU state-propagator manager
+std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t& pme,
+ const DeviceContext* deviceContext,
+ const DeviceStream* deviceStream)
+{
+ // TODO: Pin the host buffer and use async memory copies
+ // TODO: A special constructor for PME-only ranks / PME tests is used here. There should
+ // be a mechanism that prevents the use of the other constructor here.
+ return std::make_unique<StatePropagatorDataGpu>(
+ deviceStream, *deviceContext, GpuApiCallBehavior::Sync, pme_gpu_get_block_size(&pme), nullptr);
}
//! PME initialization with atom data
-PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec,
- CodePath mode,
- const gmx_device_info_t *gpuInfo,
- PmeGpuProgramHandle pmeGpuProgram,
- const CoordinatesVector &coordinates,
- const ChargesVector &charges,
- const Matrix3x3 &box
- )
-{
- const index atomCount = coordinates.size();
- GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
- PmeSafePointer pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, pmeGpuProgram, atomCount, box);
- pme_atomcomm_t *atc = nullptr;
+void pmeInitAtoms(gmx_pme_t* pme,
+ StatePropagatorDataGpu* stateGpu,
+ const CodePath mode,
+ const CoordinatesVector& coordinates,
+ const ChargesVector& charges)
+{
+ const index atomCount = coordinates.size();
+ GMX_RELEASE_ASSERT(atomCount == charges.ssize(), "Mismatch in atom data");
+ PmeAtomComm* atc = nullptr;
switch (mode)
{
case CodePath::CPU:
- atc = &(pmeSafe->atc[0]);
- atc->x = const_cast<rvec *>(as_rvec_array(coordinates.data()));
- atc->coefficient = const_cast<real *>(charges.data());
+ atc = &(pme->atc[0]);
+ atc->x = coordinates;
+ atc->coefficient = charges;
+ gmx_pme_reinit_atoms(pme, atomCount, charges, {});
/* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
break;
- case CodePath::CUDA:
- gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data());
- pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data()));
+ case CodePath::GPU:
+ // TODO: Avoid use of atc in the GPU code path
+ atc = &(pme->atc[0]);
+ // We need to set atc->n so the atom count is available to the tests
+ atc->setNumAtoms(atomCount);
+ gmx_pme_reinit_atoms(pme, atomCount, charges, {});
+
+ stateGpu->reinit(atomCount, atomCount);
+ stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()),
+ gmx::AtomLocality::Local);
+ pme_gpu_set_kernelparam_coordinates(pme->gpu, stateGpu->getCoordinates());
+
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
-
- return pmeSafe;
}
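+
+/* Usage sketch: typical wiring of the helpers above in a GPU-path test,
+ * assuming inputRec, box, coordinates and charges objects, and a test hardware
+ * context providing deviceContext, deviceStream and pmeGpuProgram:
+ *
+ * PmeSafePointer pme = pmeInitWrapper(&inputRec, CodePath::GPU, &deviceContext,
+ *                                     &deviceStream, pmeGpuProgram, box, 1.0F, 1.0F);
+ * auto stateGpu = makeStatePropagatorDataGpu(*pme, &deviceContext, &deviceStream);
+ * pmeInitAtoms(pme.get(), stateGpu.get(), CodePath::GPU, coordinates, charges);
+ */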
//! Getting local PME real grid pointer for test I/O
-static real *pmeGetRealGridInternal(const gmx_pme_t *pme)
+static real* pmeGetRealGridInternal(const gmx_pme_t* pme)
{
const size_t gridIndex = 0;
return pme->fftgrid[gridIndex];
}
//! Getting local PME real grid dimensions
-static void pmeGetRealGridSizesInternal(const gmx_pme_t *pme,
- CodePath mode,
- IVec &gridSize, //NOLINT(google-runtime-references)
- IVec &paddedGridSize) //NOLINT(google-runtime-references)
+static void pmeGetRealGridSizesInternal(const gmx_pme_t* pme,
+ CodePath mode,
+ IVec& gridSize, //NOLINT(google-runtime-references)
+ IVec& paddedGridSize) //NOLINT(google-runtime-references)
{
const size_t gridIndex = 0;
IVec gridOffsetUnused;
switch (mode)
{
case CodePath::CPU:
- gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
+ gmx_parallel_3dfft_real_limits(
+ pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
break;
- case CodePath::CUDA:
+ case CodePath::GPU:
pme_gpu_get_real_grid_sizes(pme->gpu, &gridSize, &paddedGridSize);
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
//! Getting local PME complex grid pointer for test I/O
-static t_complex *pmeGetComplexGridInternal(const gmx_pme_t *pme)
+static t_complex* pmeGetComplexGridInternal(const gmx_pme_t* pme)
{
const size_t gridIndex = 0;
return pme->cfftgrid[gridIndex];
}
//! Getting local PME complex grid dimensions
-static void pmeGetComplexGridSizesInternal(const gmx_pme_t *pme,
- IVec &gridSize, //NOLINT(google-runtime-references)
- IVec &paddedGridSize) //NOLINT(google-runtime-references)
+static void pmeGetComplexGridSizesInternal(const gmx_pme_t* pme,
+ IVec& gridSize, //NOLINT(google-runtime-references)
+ IVec& paddedGridSize) //NOLINT(google-runtime-references)
{
const size_t gridIndex = 0;
IVec gridOffsetUnused, complexOrderUnused;
- gmx_parallel_3dfft_complex_limits(pme->pfft_setup[gridIndex], complexOrderUnused, gridSize, gridOffsetUnused, paddedGridSize); //TODO: what about YZX ordering?
+ gmx_parallel_3dfft_complex_limits(
+ pme->pfft_setup[gridIndex], complexOrderUnused, gridSize, gridOffsetUnused, paddedGridSize); // TODO: what about YZX ordering?
}
//! Getting the PME grid memory buffer and its sizes - template definition
-template<typename ValueType> static void pmeGetGridAndSizesInternal(const gmx_pme_t * /*unused*/, CodePath /*unused*/, ValueType * & /*unused*/, IVec & /*unused*/, IVec & /*unused*/) //NOLINT(google-runtime-references)
+template<typename ValueType>
+static void pmeGetGridAndSizesInternal(const gmx_pme_t* /*unused*/,
+ CodePath /*unused*/,
+ ValueType*& /*unused*/, //NOLINT(google-runtime-references)
+ IVec& /*unused*/, //NOLINT(google-runtime-references)
+ IVec& /*unused*/) //NOLINT(google-runtime-references)
{
GMX_THROW(InternalError("Deleted function call"));
- // explicitly deleting general template does not compile in clang/icc, see https://llvm.org/bugs/show_bug.cgi?id=17537
+ // explicitly deleting general template does not compile in clang, see https://llvm.org/bugs/show_bug.cgi?id=17537
}
//! Getting the PME real grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<real>(const gmx_pme_t *pme, CodePath mode, real * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<>
+void pmeGetGridAndSizesInternal<real>(const gmx_pme_t* pme, CodePath mode, real*& grid, IVec& gridSize, IVec& paddedGridSize)
{
grid = pmeGetRealGridInternal(pme);
pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSize);
}
//! Getting the PME complex grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t *pme, CodePath /*unused*/, t_complex * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<>
+void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t* pme,
+ CodePath /*unused*/,
+ t_complex*& grid,
+ IVec& gridSize,
+ IVec& paddedGridSize)
{
grid = pmeGetComplexGridInternal(pme);
pmeGetComplexGridSizesInternal(pme, gridSize, paddedGridSize);
}
//! PME spline calculation and charge spreading
-void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qualifiers elsewhere
- bool computeSplines, bool spreadCharges)
+void pmePerformSplineAndSpread(gmx_pme_t* pme,
+ CodePath mode, // TODO const qualifiers elsewhere
+ bool computeSplines,
+ bool spreadCharges)
{
GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
- pme_atomcomm_t *atc = &(pme->atc[0]);
- const size_t gridIndex = 0;
- const bool computeSplinesForZeroCharges = true;
- real *fftgrid = spreadCharges ? pme->fftgrid[gridIndex] : nullptr;
- real *pmegrid = pme->pmegrid[gridIndex].grid.grid;
+ PmeAtomComm* atc = &(pme->atc[0]);
+ const size_t gridIndex = 0;
+ const bool computeSplinesForZeroCharges = true;
+ real** fftgrid = spreadCharges ? pme->fftgrid : nullptr;
+ real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
switch (mode)
{
case CodePath::CPU:
- spread_on_grid(pme, atc, &pme->pmegrid[gridIndex], computeSplines, spreadCharges,
- fftgrid, computeSplinesForZeroCharges, gridIndex);
+ spread_on_grid(pme,
+ atc,
+ &pme->pmegrid[gridIndex],
+ computeSplines,
+ spreadCharges,
+ fftgrid != nullptr ? fftgrid[gridIndex] : nullptr,
+ computeSplinesForZeroCharges,
+ gridIndex);
if (spreadCharges && !pme->bUseThreads)
{
wrap_periodic_pmegrid(pme, pmegrid);
- copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex);
+ copy_pmegrid_to_fftgrid(
+ pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex);
}
break;
- case CodePath::CUDA:
- pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
- break;
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
+ case CodePath::GPU:
+ {
+ const real lambdaQ = 1.0;
+ // no synchronization needed as x is transferred in the PME stream
+ GpuEventSynchronizer* xReadyOnDevice = nullptr;
+
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+
+ pme_gpu_spread(pme->gpu,
+ xReadyOnDevice,
+ fftgrid,
+ computeSplines,
+ spreadCharges,
+ lambdaQ,
+ useGpuDirectComm,
+ pmeCoordinateReceiverGpu);
+ }
+ break;
+#endif
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
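+
+/* Call sketch: the usual test flow after pmeInitAtoms() computes splines and
+ * spreads charges in a single call:
+ *
+ * pmePerformSplineAndSpread(pme.get(), mode, true, true);
+ */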
//! Getting the internal spline data buffer pointer
-static real *pmeGetSplineDataInternal(const gmx_pme_t *pme, PmeSplineDataType type, int dimIndex)
+static real* pmeGetSplineDataInternal(const gmx_pme_t* pme, PmeSplineDataType type, int dimIndex)
{
GMX_ASSERT((0 <= dimIndex) && (dimIndex < DIM), "Invalid dimension index");
- const pme_atomcomm_t *atc = &(pme->atc[0]);
- const size_t threadIndex = 0;
- real *splineBuffer = nullptr;
+ const PmeAtomComm* atc = &(pme->atc[0]);
+ const size_t threadIndex = 0;
+ real* splineBuffer = nullptr;
switch (type)
{
case PmeSplineDataType::Values:
- splineBuffer = atc->spline[threadIndex].theta[dimIndex];
+ splineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
break;
case PmeSplineDataType::Derivatives:
- splineBuffer = atc->spline[threadIndex].dtheta[dimIndex];
+ splineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
break;
- default:
- GMX_THROW(InternalError("Unknown spline data type"));
+ default: GMX_THROW(InternalError("Unknown spline data type"));
}
return splineBuffer;
}
//! PME solving
-void pmePerformSolve(const gmx_pme_t *pme, CodePath mode,
- PmeSolveAlgorithm method, real cellVolume,
- GridOrdering gridOrdering, bool computeEnergyAndVirial)
+void pmePerformSolve(const gmx_pme_t* pme,
+ CodePath mode,
+ PmeSolveAlgorithm method,
+ real cellVolume,
+ GridOrdering gridOrdering,
+ bool computeEnergyAndVirial)
{
- t_complex *h_grid = pmeGetComplexGridInternal(pme);
- const bool useLorentzBerthelot = false;
- const size_t threadIndex = 0;
+ t_complex* h_grid = pmeGetComplexGridInternal(pme);
+ const bool useLorentzBerthelot = false;
+ const size_t threadIndex = 0;
+ const size_t gridIndex = 0;
switch (mode)
{
case CodePath::CPU:
switch (method)
{
case PmeSolveAlgorithm::Coulomb:
- solve_pme_yzx(pme, h_grid, cellVolume,
- computeEnergyAndVirial, pme->nthread, threadIndex);
+ solve_pme_yzx(pme, h_grid, cellVolume, computeEnergyAndVirial, pme->nthread, threadIndex);
break;
case PmeSolveAlgorithm::LennardJones:
- solve_pme_lj_yzx(pme, &h_grid, useLorentzBerthelot,
- cellVolume, computeEnergyAndVirial, pme->nthread, threadIndex);
+ solve_pme_lj_yzx(pme,
+ &h_grid,
+ useLorentzBerthelot,
+ cellVolume,
+ computeEnergyAndVirial,
+ pme->nthread,
+ threadIndex);
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
break;
- case CodePath::CUDA:
+ case CodePath::GPU:
switch (method)
{
case PmeSolveAlgorithm::Coulomb:
- pme_gpu_solve(pme->gpu, h_grid, gridOrdering, computeEnergyAndVirial);
+ pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial);
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
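+
+/* Call sketch (argument values assumed): solving the Coulomb reciprocal part
+ * with energy and virial computation enabled:
+ *
+ * pmePerformSolve(pme.get(), CodePath::CPU, PmeSolveAlgorithm::Coulomb,
+ *                 cellVolume, GridOrdering::YZX, true);
+ */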
//! PME force gathering
-void pmePerformGather(gmx_pme_t *pme, CodePath mode,
- PmeForceOutputHandling inputTreatment, ForcesVector &forces)
-{
- pme_atomcomm_t *atc = &(pme->atc[0]);
- const index atomCount = atc->n;
- GMX_RELEASE_ASSERT(forces.size() == atomCount, "Invalid force buffer size");
- const bool forceReductionWithInput = (inputTreatment == PmeForceOutputHandling::ReduceWithInput);
- const real scale = 1.0;
- const size_t threadIndex = 0;
- const size_t gridIndex = 0;
- real *pmegrid = pme->pmegrid[gridIndex].grid.grid;
- real *fftgrid = pme->fftgrid[gridIndex];
+void pmePerformGather(gmx_pme_t* pme, CodePath mode, ForcesVector& forces)
+{
+ PmeAtomComm* atc = &(pme->atc[0]);
+ const index atomCount = atc->numAtoms();
+ GMX_RELEASE_ASSERT(forces.ssize() == atomCount, "Invalid force buffer size");
+ const real scale = 1.0;
+ const size_t threadIndex = 0;
+ const size_t gridIndex = 0;
+ real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
+ real** fftgrid = pme->fftgrid;
switch (mode)
{
case CodePath::CPU:
- atc->f = as_rvec_array(forces.begin());
+ atc->f = forces;
if (atc->nthread == 1)
{
// something which is normally done in serial spline computation (make_thread_local_ind())
atc->spline[threadIndex].n = atomCount;
}
- copy_fftgrid_to_pmegrid(pme, fftgrid, pmegrid, gridIndex, pme->nthread, threadIndex);
+ copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex);
unwrap_periodic_pmegrid(pme, pmegrid);
- gather_f_bsplines(pme, pmegrid, !forceReductionWithInput, atc, &atc->spline[threadIndex], scale);
+ gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale);
break;
- case CodePath::CUDA:
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
+ case CodePath::GPU:
{
// Variable initialization needs a non-switch scope
- auto stagingForces = pme_gpu_get_forces(pme->gpu);
- GMX_ASSERT(forces.size() == stagingForces.size(), "Size of force buffers did not match");
- if (forceReductionWithInput)
- {
- for (index i = 0; i != forces.size(); ++i)
- {
- stagingForces[i] = forces[i];
- }
- }
- pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid));
- for (index i = 0; i != forces.size(); ++i)
- {
- forces[i] = stagingForces[i];
- }
+ const bool computeEnergyAndVirial = false;
+ const real lambdaQ = 1.0;
+ PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ);
+ GMX_ASSERT(forces.size() == output.forces_.size(),
+ "Size of force buffers did not match");
+ pme_gpu_gather(pme->gpu, fftgrid, lambdaQ);
+ std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
}
break;
+#endif
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
//! PME test finalization before fetching the outputs
-void pmeFinalizeTest(const gmx_pme_t *pme, CodePath mode)
+void pmeFinalizeTest(const gmx_pme_t* pme, CodePath mode)
{
switch (mode)
{
- case CodePath::CPU:
+ case CodePath::CPU: break;
+
+ case CodePath::GPU: pme_gpu_synchronize(pme->gpu); break;
+
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
+ }
+}
+
+//! A binary enum for spline data layout transformation
+enum class PmeLayoutTransform
+{
+ GpuToHost,
+ HostToGpu
+};
+
+/*! \brief Gets a unique index to an element in a spline parameter buffer.
+ *
+ * These theta/dtheta buffers are laid out for GPU spread/gather
+ * kernels. The index is relative to the execution block, in the range
+ * [0, atomsPerBlock * order * DIM).
+ *
+ * This is a wrapper, only used in unit tests.
+ * \param[in] order PME order
+ * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
+ * \param[in] dimIndex Dimension index (from 0 to 2)
+ * \param[in]  atomIndex    Atom index relative to the block.
+ * \param[in] atomsPerWarp Number of atoms processed by a warp.
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+static int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
+{
+ if (order != c_pmeGpuOrder)
+ {
+ GMX_THROW(NotImplementedError(
+ formatString("getSplineParamFullIndex is only implemented for PME order %d", c_pmeGpuOrder)));
+ }
+ constexpr int fixedOrder = c_pmeGpuOrder;
+ GMX_UNUSED_VALUE(fixedOrder);
+
+ const int atomWarpIndex = atomIndex % atomsPerWarp;
+ const int warpIndex = atomIndex / atomsPerWarp;
+ int indexBase, result;
+ switch (atomsPerWarp)
+ {
+ case 1:
+ indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
+ result = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
+ break;
+
+ case 2:
+ indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
+ result = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
+ break;
+
+ case 4:
+ indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
+ result = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
break;
- case CodePath::CUDA:
- pme_gpu_synchronize(pme->gpu);
+ case 8:
+ indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
+ result = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
break;
default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ GMX_THROW(NotImplementedError(
+ formatString("Test function call not unrolled for atomsPerWarp = %d in "
+ "getSplineParamFullIndex",
+ atomsPerWarp)));
+ }
+ return result;
+}
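+
+/* Worked example, assuming the layout
+ * index = ((order * warpIndex + splineIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex
+ * implemented by getSplineParamIndexBase/getSplineParamIndex: for order = 4,
+ * atomsPerWarp = 2, atomIndex = 5, dimIndex = 1 and splineIndex = 2, we get
+ * warpIndex = 2 and atomWarpIndex = 1, hence
+ * index = ((4 * 2 + 2) * 3 + 1) * 2 + 1 = 63.
+ */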
+
+/*! \brief Returns the number of atoms per warp */
+static int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
+{
+ const int order = pmeGpu->common->pme_order;
+ const int threadsPerAtom =
+ (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? order : order * order);
+ return pmeGpu->programHandle_->warpSize() / threadsPerAtom;
+}
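+
+/* For example, assuming a warp size of 32 and pme_order = 4: with
+ * ThreadsPerAtom::Order this gives 32 / 4 = 8 atoms per warp, while with
+ * order * order = 16 threads per atom it gives 32 / 16 = 2 atoms per warp.
+ */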
+
+/*! \brief Rearranges the atom spline data between the GPU and host layouts.
+ * Only used for test purposes so far, likely to be horribly slow.
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[out] atc The PME CPU atom data structure (with a single-threaded layout).
+ * \param[in] type The spline data type (values or derivatives).
+ * \param[in] dimIndex Dimension index.
+ * \param[in] transform Layout transform type
+ */
+static void pme_gpu_transform_spline_atom_data(const PmeGpu* pmeGpu,
+ const PmeAtomComm* atc,
+ PmeSplineDataType type,
+ int dimIndex,
+ PmeLayoutTransform transform)
+{
+ // The GPU atom spline data is laid out in a different way currently than the CPU one.
+ // This function converts the data from GPU to CPU layout (in the host memory).
+ // It is only intended for testing purposes so far.
+ // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
+ // performance (e.g. spreading on GPU, gathering on CPU).
+ GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
+ const uintmax_t threadIndex = 0;
+ const auto atomCount = atc->numAtoms();
+ const auto atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
+ const auto pmeOrder = pmeGpu->common->pme_order;
+ GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
+
+ real* cpuSplineBuffer;
+ float* h_splineBuffer;
+ switch (type)
+ {
+ case PmeSplineDataType::Values:
+ cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
+ h_splineBuffer = pmeGpu->staging.h_theta;
+ break;
+
+ case PmeSplineDataType::Derivatives:
+ cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
+ h_splineBuffer = pmeGpu->staging.h_dtheta;
+ break;
+
+ default: GMX_THROW(InternalError("Unknown spline data type"));
+ }
+
+ for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
+ {
+ for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
+ {
+ const auto gpuValueIndex =
+ getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
+ const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
+ GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
+ "Atom spline data index out of bounds (while transforming GPU data layout "
+ "for host)");
+ switch (transform)
+ {
+ case PmeLayoutTransform::GpuToHost:
+ cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
+ break;
+
+ case PmeLayoutTransform::HostToGpu:
+ h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
+ break;
+
+ default: GMX_THROW(InternalError("Unknown layout transform"));
+ }
+ }
}
}
//! Setting atom spline values/derivatives to be used in spread/gather
-void pmeSetSplineData(const gmx_pme_t *pme, CodePath mode,
- const SplineParamsDimVector &splineValues, PmeSplineDataType type, int dimIndex)
+void pmeSetSplineData(const gmx_pme_t* pme,
+ CodePath mode,
+ const SplineParamsDimVector& splineValues,
+ PmeSplineDataType type,
+ int dimIndex)
{
- const pme_atomcomm_t *atc = &(pme->atc[0]);
- const index atomCount = atc->n;
- const index pmeOrder = pme->pme_order;
- const index dimSize = pmeOrder * atomCount;
- GMX_RELEASE_ASSERT(dimSize == splineValues.size(), "Mismatch in spline data");
- real *splineBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
+ const PmeAtomComm* atc = &(pme->atc[0]);
+ const index atomCount = atc->numAtoms();
+ const index pmeOrder = pme->pme_order;
+ const index dimSize = pmeOrder * atomCount;
+ GMX_RELEASE_ASSERT(dimSize == splineValues.ssize(), "Mismatch in spline data");
+ real* splineBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
switch (mode)
{
case CodePath::CPU:
std::copy(splineValues.begin(), splineValues.end(), splineBuffer);
break;
- case CodePath::CUDA:
+ case CodePath::GPU:
std::copy(splineValues.begin(), splineValues.end(), splineBuffer);
pme_gpu_transform_spline_atom_data(pme->gpu, atc, type, dimIndex, PmeLayoutTransform::HostToGpu);
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
//! Setting gridline indices to be used in spread/gather
-void pmeSetGridLineIndices(const gmx_pme_t *pme, CodePath mode,
- const GridLineIndicesVector &gridLineIndices)
+void pmeSetGridLineIndices(gmx_pme_t* pme, CodePath mode, const GridLineIndicesVector& gridLineIndices)
{
- const pme_atomcomm_t *atc = &(pme->atc[0]);
- const index atomCount = atc->n;
- GMX_RELEASE_ASSERT(atomCount == gridLineIndices.size(), "Mismatch in gridline indices size");
+ PmeAtomComm* atc = &(pme->atc[0]);
+ const index atomCount = atc->numAtoms();
+ GMX_RELEASE_ASSERT(atomCount == gridLineIndices.ssize(), "Mismatch in gridline indices size");
IVec paddedGridSizeUnused, gridSize(0, 0, 0);
pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);
- for (const auto &index : gridLineIndices)
+ for (const auto& index : gridLineIndices)
{
for (int i = 0; i < DIM; i++)
{
- GMX_RELEASE_ASSERT((0 <= index[i]) && (index[i] < gridSize[i]), "Invalid gridline index");
+ GMX_RELEASE_ASSERT((0 <= index[i]) && (index[i] < gridSize[i]),
+ "Invalid gridline index");
}
}
switch (mode)
{
- case CodePath::CUDA:
- memcpy(pme->gpu->staging.h_gridlineIndices, gridLineIndices.data(), atomCount * sizeof(gridLineIndices[0]));
+ case CodePath::GPU:
+ memcpy(pme_gpu_staging(pme->gpu).h_gridlineIndices,
+ gridLineIndices.data(),
+ atomCount * sizeof(gridLineIndices[0]));
break;
case CodePath::CPU:
- // incompatible IVec and ivec assignment?
- //std::copy(gridLineIndices.begin(), gridLineIndices.end(), atc->idx);
- memcpy(atc->idx, gridLineIndices.data(), atomCount * sizeof(gridLineIndices[0]));
+ atc->idx.resize(gridLineIndices.size());
+ std::copy(gridLineIndices.begin(), gridLineIndices.end(), atc->idx.begin());
break;
-
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
//! Getting plain index into the complex 3d grid
-inline size_t pmeGetGridPlainIndexInternal(const IVec &index, const IVec &paddedGridSize, GridOrdering gridOrdering)
+inline size_t pmeGetGridPlainIndexInternal(const IVec& index, const IVec& paddedGridSize, GridOrdering gridOrdering)
{
size_t result;
switch (gridOrdering)
{
case GridOrdering::YZX:
result = (index[YY] * paddedGridSize[ZZ] + index[ZZ]) * paddedGridSize[XX] + index[XX];
break;

case GridOrdering::XYZ:
result = (index[XX] * paddedGridSize[YY] + index[YY]) * paddedGridSize[ZZ] + index[ZZ];
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return result;
}
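+
+/* Worked example of the XYZ ordering above: with paddedGridSize = {4, 5, 6},
+ * the grid point index = {1, 2, 3} maps to the plain index
+ * (1 * 5 + 2) * 6 + 3 = 45.
+ */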
//! Setting real or complex grid
template<typename ValueType>
-static void pmeSetGridInternal(const gmx_pme_t *pme, CodePath mode,
- GridOrdering gridOrdering,
- const SparseGridValuesInput<ValueType> &gridValues)
+static void pmeSetGridInternal(const gmx_pme_t* pme,
+ CodePath mode,
+ GridOrdering gridOrdering,
+ const SparseGridValuesInput<ValueType>& gridValues)
{
IVec gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
- ValueType *grid;
+ ValueType* grid;
pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
switch (mode)
{
- case CodePath::CUDA: // intentional absence of break, the grid will be copied from the host buffer in testing mode
+ case CodePath::GPU: // intentional absence of break, the grid will be copied from the host buffer in testing mode
case CodePath::CPU:
std::memset(grid, 0, paddedGridSize[XX] * paddedGridSize[YY] * paddedGridSize[ZZ] * sizeof(ValueType));
- for (const auto &gridValue : gridValues)
+ for (const auto& gridValue : gridValues)
{
for (int i = 0; i < DIM; i++)
{
- GMX_RELEASE_ASSERT((0 <= gridValue.first[i]) && (gridValue.first[i] < gridSize[i]), "Invalid grid value index");
+ GMX_RELEASE_ASSERT((0 <= gridValue.first[i]) && (gridValue.first[i] < gridSize[i]),
+ "Invalid grid value index");
}
- const size_t gridValueIndex = pmeGetGridPlainIndexInternal(gridValue.first, paddedGridSize, gridOrdering);
+ const size_t gridValueIndex =
+ pmeGetGridPlainIndexInternal(gridValue.first, paddedGridSize, gridOrdering);
grid[gridValueIndex] = gridValue.second;
}
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
}
//! Setting real grid to be used in gather
-void pmeSetRealGrid(const gmx_pme_t *pme, CodePath mode,
- const SparseRealGridValuesInput &gridValues)
+void pmeSetRealGrid(const gmx_pme_t* pme, CodePath mode, const SparseRealGridValuesInput& gridValues)
{
pmeSetGridInternal<real>(pme, mode, GridOrdering::XYZ, gridValues);
}
//! Setting complex grid to be used in solve
-void pmeSetComplexGrid(const gmx_pme_t *pme, CodePath mode,
- GridOrdering gridOrdering,
- const SparseComplexGridValuesInput &gridValues)
+void pmeSetComplexGrid(const gmx_pme_t* pme,
+ CodePath mode,
+ GridOrdering gridOrdering,
+ const SparseComplexGridValuesInput& gridValues)
{
pmeSetGridInternal<t_complex>(pme, mode, gridOrdering, gridValues);
}
//! Getting the single dimension's spline values or derivatives
-SplineParamsDimVector pmeGetSplineData(const gmx_pme_t *pme, CodePath mode,
- PmeSplineDataType type, int dimIndex)
+SplineParamsDimVector pmeGetSplineData(const gmx_pme_t* pme, CodePath mode, PmeSplineDataType type, int dimIndex)
{
GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
- const pme_atomcomm_t *atc = &(pme->atc[0]);
- const size_t atomCount = atc->n;
- const size_t pmeOrder = pme->pme_order;
- const size_t dimSize = pmeOrder * atomCount;
+ const PmeAtomComm* atc = &(pme->atc[0]);
+ const size_t atomCount = atc->numAtoms();
+ const size_t pmeOrder = pme->pme_order;
+ const size_t dimSize = pmeOrder * atomCount;
- real *sourceBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
- SplineParamsDimVector result;
+ real* sourceBuffer = pmeGetSplineDataInternal(pme, type, dimIndex);
+ SplineParamsDimVector result;
switch (mode)
{
- case CodePath::CUDA:
+ case CodePath::GPU:
pme_gpu_transform_spline_atom_data(pme->gpu, atc, type, dimIndex, PmeLayoutTransform::GpuToHost);
result = arrayRefFromArray(sourceBuffer, dimSize);
break;
- case CodePath::CPU:
- result = arrayRefFromArray(sourceBuffer, dimSize);
- break;
+ case CodePath::CPU: result = arrayRefFromArray(sourceBuffer, dimSize); break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return result;
}
//! Getting the gridline indices
-GridLineIndicesVector pmeGetGridlineIndices(const gmx_pme_t *pme, CodePath mode)
+GridLineIndicesVector pmeGetGridlineIndices(const gmx_pme_t* pme, CodePath mode)
{
GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
- const pme_atomcomm_t *atc = &(pme->atc[0]);
- const size_t atomCount = atc->n;
+ const PmeAtomComm* atc = &(pme->atc[0]);
+ const size_t atomCount = atc->numAtoms();
GridLineIndicesVector gridLineIndices;
switch (mode)
{
- case CodePath::CUDA:
- gridLineIndices = arrayRefFromArray(reinterpret_cast<IVec *>(pme->gpu->staging.h_gridlineIndices), atomCount);
+ case CodePath::GPU:
+ gridLineIndices = arrayRefFromArray(
+ reinterpret_cast<IVec*>(pme_gpu_staging(pme->gpu).h_gridlineIndices), atomCount);
break;
- case CodePath::CPU:
- gridLineIndices = arrayRefFromArray(reinterpret_cast<IVec *>(atc->idx), atomCount);
- break;
+ case CodePath::CPU: gridLineIndices = atc->idx; break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return gridLineIndices;
}
//! Getting real or complex grid - only non zero values
template<typename ValueType>
-static SparseGridValuesOutput<ValueType> pmeGetGridInternal(const gmx_pme_t *pme, CodePath mode, GridOrdering gridOrdering)
+static SparseGridValuesOutput<ValueType> pmeGetGridInternal(const gmx_pme_t* pme,
+ CodePath mode,
+ GridOrdering gridOrdering)
{
IVec gridSize(0, 0, 0), paddedGridSize(0, 0, 0);
- ValueType *grid;
+ ValueType* grid;
pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
SparseGridValuesOutput<ValueType> gridValues;
switch (mode)
{
- case CodePath::CUDA: // intentional absence of break
+ case CodePath::GPU: // intentional absence of break
case CodePath::CPU:
gridValues.clear();
for (int ix = 0; ix < gridSize[XX]; ix++)
{
for (int iy = 0; iy < gridSize[YY]; iy++)
{
for (int iz = 0; iz < gridSize[ZZ]; iz++)
{
- IVec temp(ix, iy, iz);
- const size_t gridValueIndex = pmeGetGridPlainIndexInternal(temp, paddedGridSize, gridOrdering);
- const ValueType value = grid[gridValueIndex];
- if (value != ValueType {})
+ IVec temp(ix, iy, iz);
+ const size_t gridValueIndex =
+ pmeGetGridPlainIndexInternal(temp, paddedGridSize, gridOrdering);
+ const ValueType value = grid[gridValueIndex];
+ if (value != ValueType{})
{
- auto key = formatString("Cell %d %d %d", ix, iy, iz);
+ auto key = formatString("Cell %d %d %d", ix, iy, iz);
gridValues[key] = value;
}
}
}
}
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
return gridValues;
}
//! Getting the real grid (spreading output of pmePerformSplineAndSpread())
-SparseRealGridValuesOutput pmeGetRealGrid(const gmx_pme_t *pme, CodePath mode)
+SparseRealGridValuesOutput pmeGetRealGrid(const gmx_pme_t* pme, CodePath mode)
{
return pmeGetGridInternal<real>(pme, mode, GridOrdering::XYZ);
}
//! Getting the complex grid output of pmePerformSolve()
-SparseComplexGridValuesOutput pmeGetComplexGrid(const gmx_pme_t *pme, CodePath mode,
- GridOrdering gridOrdering)
+SparseComplexGridValuesOutput pmeGetComplexGrid(const gmx_pme_t* pme, CodePath mode, GridOrdering gridOrdering)
{
return pmeGetGridInternal<t_complex>(pme, mode, gridOrdering);
}
//! Getting the reciprocal energy and virial
-PmeSolveOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t *pme, CodePath mode,
- PmeSolveAlgorithm method)
+PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method)
{
- real energy = 0.0f;
- Matrix3x3 virial;
- matrix virialTemp = {{0}}; //TODO get rid of
+ PmeOutput output;
+ const real lambdaQ = 1.0;
switch (mode)
{
case CodePath::CPU:
switch (method)
{
case PmeSolveAlgorithm::Coulomb:
- get_pme_ener_vir_q(pme->solve_work, pme->nthread, &energy, virialTemp);
+ get_pme_ener_vir_q(pme->solve_work, pme->nthread, &output);
break;
case PmeSolveAlgorithm::LennardJones:
- get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &energy, virialTemp);
+ get_pme_ener_vir_lj(pme->solve_work, pme->nthread, &output);
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
break;
- case CodePath::CUDA:
+ case CodePath::GPU:
switch (method)
{
case PmeSolveAlgorithm::Coulomb:
- pme_gpu_get_energy_virial(pme->gpu, &energy, virialTemp);
+ pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output);
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
break;
- default:
- GMX_THROW(InternalError("Test not implemented for this mode"));
+ default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
- for (int i = 0; i < DIM; i++)
+ return output;
+}
+
+const char* codePathToString(CodePath codePath)
+{
+ switch (codePath)
{
- for (int j = 0; j < DIM; j++)
- {
- virial[i * DIM + j] = virialTemp[i][j];
- }
+ case CodePath::CPU: return "CPU";
+ case CodePath::GPU: return "GPU";
+ default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
+ }
+}
+
+PmeTestHardwareContext::PmeTestHardwareContext() : codePath_(CodePath::CPU) {}
+
+PmeTestHardwareContext::PmeTestHardwareContext(TestDevice* testDevice) :
+ codePath_(CodePath::GPU), testDevice_(testDevice)
+{
+ setActiveDevice(testDevice_->deviceInfo());
+ pmeGpuProgram_ = buildPmeGpuProgram(testDevice_->deviceContext());
+}
+
+//! Returns a human-readable context description line
+std::string PmeTestHardwareContext::description() const
+{
+ switch (codePath_)
+ {
+ case CodePath::CPU: return "CPU";
+ case CodePath::GPU: return "GPU (" + testDevice_->description() + ")";
+ default: return "Unknown code path.";
+ }
+}
+
+void PmeTestHardwareContext::activate() const
+{
+ if (codePath_ == CodePath::GPU)
+ {
+ setActiveDevice(testDevice_->deviceInfo());
+ }
+}
+
+std::vector<std::unique_ptr<PmeTestHardwareContext>> createPmeTestHardwareContextList()
+{
+ std::vector<std::unique_ptr<PmeTestHardwareContext>> pmeTestHardwareContextList;
+ // Add CPU
+ pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>());
+ // Add GPU devices
+ const auto& testDeviceList = getTestHardwareEnvironment()->getTestDeviceList();
+ for (const auto& testDevice : testDeviceList)
+ {
+ pmeTestHardwareContextList.emplace_back(std::make_unique<PmeTestHardwareContext>(testDevice.get()));
}
- return std::make_tuple(energy, virial);
+ return pmeTestHardwareContextList;
}
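+
+/* Usage sketch (hypothetical test loop; the hwinfo object and the codePath()
+ * accessor are assumed):
+ *
+ * for (const auto& context : createPmeTestHardwareContextList())
+ * {
+ *     context->activate();
+ *     if (pmeSupportsInputForMode(hwinfo, &inputRec, context->codePath()))
+ *     {
+ *         // ... run the PME test on this context ...
+ *     }
+ * }
+ */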
-} // namespace test
-} // namespace gmx
+} // namespace test
+} // namespace gmx