Move testing code to test directory
authorMark Abraham <mark.j.abraham@gmail.com>
Wed, 4 Mar 2020 17:44:34 +0000 (18:44 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Sat, 14 Mar 2020 08:11:27 +0000 (09:11 +0100)
This should not add bulk to libgromacs

Also decoupled some headers from unnecessarily depending on other
headers, some of which inhibited the proper separation of code and
tests. In particular, there's now a proper getter for warp size.

This needed the default implementation of pmeInitEmpty to move into a
source file, because it was requiring the definition of CodePath to be
available.

Change-Id: I3040c77404ade4d8cf0a41faf4aa4da4db42e3d3

16 files changed:
src/gromacs/ewald/pme_gather.h
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_program.cpp
src/gromacs/ewald/pme_gpu_program.h
src/gromacs/ewald/pme_gpu_program_impl.cpp
src/gromacs/ewald/pme_gpu_program_impl.cu
src/gromacs/ewald/pme_gpu_program_impl.h
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
src/gromacs/ewald/tests/CMakeLists.txt
src/gromacs/ewald/tests/pmebsplinetest.cpp
src/gromacs/ewald/tests/pmegathertest.cpp
src/gromacs/ewald/tests/pmesolvetest.cpp
src/gromacs/ewald/tests/pmesplinespreadtest.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/pmetestcommon.h

index 846794c94950d56b91da5da297d89bd13a75df8a..a088f25070d472ce43398959704ce00bcc279cd6 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 #ifndef GMX_EWALD_PME_GATHER_H
 #define GMX_EWALD_PME_GATHER_H
 
+#include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/real.h"
 
-#include "pme_internal.h"
+class PmeAtomComm;
+struct gmx_pme_t;
+struct splinedata_t;
 
 void gather_f_bsplines(const struct gmx_pme_t* pme,
                        const real*             grid,
index 54c3f546240585aa1fa13ced43ea79174db59558..b32fe80483d26e93458965aeb9fe10369016de7e 100644 (file)
@@ -113,18 +113,6 @@ int pme_gpu_get_atom_data_block_size()
     return c_pmeAtomDataBlockSize;
 }
 
-int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
-{
-    if (pmeGpu->settings.useOrderThreadsPerAtom)
-    {
-        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
-    }
-    else
-    {
-        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
-    }
-}
-
 void pme_gpu_synchronize(const PmeGpu* pmeGpu)
 {
     pmeGpu->archSpecific->pmeStream_.synchronize();
@@ -528,49 +516,6 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu)
     pmeGpu->archSpecific->fftSetup.resize(0);
 }
 
-int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
-{
-    if (order != c_pmeGpuOrder)
-    {
-        throw order;
-    }
-    constexpr int fixedOrder = c_pmeGpuOrder;
-    GMX_UNUSED_VALUE(fixedOrder);
-
-    const int atomWarpIndex = atomIndex % atomsPerWarp;
-    const int warpIndex     = atomIndex / atomsPerWarp;
-    int       indexBase, result;
-    switch (atomsPerWarp)
-    {
-        case 1:
-            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
-            break;
-
-        case 2:
-            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
-            break;
-
-        case 4:
-            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
-            break;
-
-        case 8:
-            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
-            break;
-
-        default:
-            GMX_THROW(gmx::NotImplementedError(
-                    gmx::formatString("Test function call not unrolled for atomsPerWarp = %d in "
-                                      "getSplineParamFullIndex",
-                                      atomsPerWarp)));
-    }
-    return result;
-}
-
 void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output)
 {
     const PmeGpu* pmeGpu = pme.gpu;
@@ -805,67 +750,6 @@ static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, co
     kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
 }
 
-void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
-                                        const PmeAtomComm* atc,
-                                        PmeSplineDataType  type,
-                                        int                dimIndex,
-                                        PmeLayoutTransform transform)
-{
-    // The GPU atom spline data is laid out in a different way currently than the CPU one.
-    // This function converts the data from GPU to CPU layout (in the host memory).
-    // It is only intended for testing purposes so far.
-    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
-    // performance (e.g. spreading on GPU, gathering on CPU).
-    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
-    const uintmax_t threadIndex  = 0;
-    const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
-    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
-    const auto      pmeOrder     = pmeGpu->common->pme_order;
-    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
-
-    real*  cpuSplineBuffer;
-    float* h_splineBuffer;
-    switch (type)
-    {
-        case PmeSplineDataType::Values:
-            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
-            h_splineBuffer  = pmeGpu->staging.h_theta;
-            break;
-
-        case PmeSplineDataType::Derivatives:
-            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
-            h_splineBuffer  = pmeGpu->staging.h_dtheta;
-            break;
-
-        default: GMX_THROW(gmx::InternalError("Unknown spline data type"));
-    }
-
-    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
-    {
-        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
-        {
-            const auto gpuValueIndex =
-                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
-            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
-            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
-                       "Atom spline data index out of bounds (while transforming GPU data layout "
-                       "for host)");
-            switch (transform)
-            {
-                case PmeLayoutTransform::GpuToHost:
-                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
-                    break;
-
-                case PmeLayoutTransform::HostToGpu:
-                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
-                    break;
-
-                default: GMX_THROW(gmx::InternalError("Unknown layout transform"));
-            }
-        }
-    }
-}
-
 void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx::IVec* paddedGridSize)
 {
     GMX_ASSERT(gridSize != nullptr, "");
@@ -1267,7 +1151,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
     {
         cellsPerBlock = (gridLineSize + blocksPerGridLine - 1) / blocksPerGridLine;
     }
-    const int warpSize  = pmeGpu->programHandle_->impl_->warpSize;
+    const int warpSize  = pmeGpu->programHandle_->warpSize();
     const int blockSize = (cellsPerBlock + warpSize - 1) / warpSize * warpSize;
 
     static_assert(GMX_GPU != GMX_GPU_CUDA || c_solveMaxWarpsPerBlock / 2 >= 4,
index 93ffa77416a4a3c3ad475c577512db7b4a8c4680..3d764fd468a57448424199574a50a076d5aa60da 100644 (file)
@@ -98,14 +98,6 @@ enum class GridOrdering
  */
 int pme_gpu_get_atom_data_block_size();
 
-/*! \libinternal \brief
- * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
- *
- * \param[in] pmeGpu            The PME GPU structure.
- * \returns   Number of atoms in a single GPU atom spline data chunk.
- */
-int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu);
-
 /*! \libinternal \brief
  * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
  *
@@ -492,44 +484,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu*      GPU_FUNC_ARGUMENT(
  */
 void pme_gpu_finish_computation(const PmeGpu* pmeGpu);
 
-//! A binary enum for spline data layout transformation
-enum class PmeLayoutTransform
-{
-    GpuToHost,
-    HostToGpu
-};
-
-/*! \libinternal \brief
- * Rearranges the atom spline data between the GPU and host layouts.
- * Only used for test purposes so far, likely to be horribly slow.
- *
- * \param[in]  pmeGpu     The PME GPU structure.
- * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
- * \param[in]  type       The spline data type (values or derivatives).
- * \param[in]  dimIndex   Dimension index.
- * \param[in]  transform  Layout transform type
- */
-GPU_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
-                                                           const PmeAtomComm* GPU_FUNC_ARGUMENT(atc),
-                                                           PmeSplineDataType GPU_FUNC_ARGUMENT(type),
-                                                           int GPU_FUNC_ARGUMENT(dimIndex),
-                                                           PmeLayoutTransform GPU_FUNC_ARGUMENT(transform)) GPU_FUNC_TERM;
-
-/*! \libinternal \brief
- * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
- * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
- * in range(0, atomsPerBlock * order * DIM).
- * This is a wrapper, only used in unit tests.
- * \param[in] order            PME order
- * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
- * \param[in] dimIndex         Dimension index (from 0 to 2)
- * \param[in] atomIndex        Atom index wrt the block.
- * \param[in] atomsPerWarp     Number of atoms processed by a warp.
- *
- * \returns Index into theta or dtheta array using GPU layout.
- */
-int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp);
-
 /*! \libinternal \brief
  * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
  *
index 23981a661b83bcf6c3a0eeafa189b38bfe7bd772..72711c91ae1017dfaa5364526c84870490742dd7 100644 (file)
@@ -60,6 +60,11 @@ PmeGpuProgram::PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceCo
 
 PmeGpuProgram::~PmeGpuProgram() = default;
 
+int PmeGpuProgram::warpSize() const
+{
+    return impl_->warpSize();
+}
+
 PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext)
 {
     return std::make_unique<PmeGpuProgram>(deviceInfo, deviceContext);
index d4dbdf449d9b17eba6edd7b5b5b0af8868a8efdd..c4888d97c71c6755382faf99ef57e33b35e87e57 100644 (file)
 
 /*! \libinternal \file
  * \brief
- * Declares PmeGpuProgram, which wrap arounds PmeGpuProgramImpl
- * to store permanent PME GPU context-derived data,
- * such as (compiled) kernel handles.
+ * Declares PmeGpuProgram
+ * to store data derived from the GPU context or devices for
+ * PME, such as (compiled) kernel handles and the warp sizes
+ * they work with.
  *
  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  * \ingroup module_ewald
@@ -54,13 +55,24 @@ class DeviceContext;
 struct PmeGpuProgramImpl;
 struct DeviceInformation;
 
+/*! \libinternal
+ * \brief Stores PME data derived from the GPU context or devices.
+ *
+ * This includes compiled kernel handles and the warp sizes they
+ * work with.
+ */
 class PmeGpuProgram
 {
 public:
+    //! Constructor
     explicit PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
     ~PmeGpuProgram();
 
-    // TODO: design getters for information inside, if needed for PME, and make this private?
+    //! Return the warp size for which the kernels were compiled
+    int warpSize() const;
+
+    // TODO: design more getters for information inside, if needed for PME, and make this private?
+    //! Private impl class
     std::unique_ptr<PmeGpuProgramImpl> impl_;
 };
 
index ccaffa5acdc03ddfb0a568c013e1ed661aa051d1..9056881227f2c6a907932c3d6efedc5035cb386f 100644 (file)
@@ -48,7 +48,7 @@
 PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
                                      const DeviceContext& deviceContext) :
     deviceContext_(deviceContext),
-    warpSize(0),
+    warpSize_(0),
     spreadWorkGroupSize(0),
     gatherWorkGroupSize(0),
     solveMaxWorkGroupSize(0)
index 53bf2f0d1eab0df4cd293cc82176ee700c3425b3..e1331330c8a278f2edbaa5e54d01bc36273cf06a 100644 (file)
@@ -103,7 +103,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
     deviceContext_(deviceContext)
 {
     // kernel parameters
-    warpSize              = warp_size;
+    warpSize_             = warp_size;
     spreadWorkGroupSize   = c_spreadMaxThreadsPerBlock;
     solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock;
     gatherWorkGroupSize   = c_gatherMaxThreadsPerBlock;
index cb1471abf17be0ec925c638221039466083ade2f..b9d1adc0d46c3dc8ea916b471c94bb0bdae41f89 100644 (file)
@@ -94,7 +94,7 @@ struct PmeGpuProgramImpl
      * For CUDA, this is a static value that comes from gromacs/gpu_utils/cuda_arch_utils.cuh;
      * for OpenCL, we have to query it dynamically.
      */
-    size_t warpSize;
+    size_t warpSize_;
 
     //@{
     /**
@@ -150,6 +150,9 @@ struct PmeGpuProgramImpl
     ~PmeGpuProgramImpl();
     GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
 
+    //! Return the warp size for which the kernels were compiled
+    int warpSize() const { return warpSize_; }
+
 private:
     // Compiles kernels, if supported. Called by the constructor.
     void compileKernels(const DeviceInformation& deviceInfo);
index 5e82fec6dfa1d58e08105e8b1cedc58c7307e8ae..2609763afa9383f2750d11dd96ac855fef35faac 100644 (file)
@@ -57,12 +57,12 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const
     deviceContext_(deviceContext)
 {
     // kernel parameters
-    warpSize = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceInfo.oclDeviceId);
+    warpSize_ = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceInfo.oclDeviceId);
     // TODO: for Intel ideally we'd want to set these based on the compiler warp size
     // but given that we've done no tuning for Intel iGPU, this is as good as anything.
-    spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize, deviceInfo.maxWorkGroupSize);
-    solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize, deviceInfo.maxWorkGroupSize);
-    gatherWorkGroupSize = std::min(c_gatherMaxWarpsPerBlock * warpSize, deviceInfo.maxWorkGroupSize);
+    spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize_, deviceInfo.maxWorkGroupSize);
+    solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize_, deviceInfo.maxWorkGroupSize);
+    gatherWorkGroupSize = std::min(c_gatherMaxWarpsPerBlock * warpSize_, deviceInfo.maxWorkGroupSize);
 
     compileKernels(deviceInfo);
 }
@@ -137,7 +137,7 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo)
                 "-DDIM=%d -DXX=%d -DYY=%d -DZZ=%d "
                 // decomposition parameter placeholders
                 "-DwrapX=true -DwrapY=true ",
-                warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
+                warpSize_, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
                 static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_skipNeutralAtoms),
                 c_virialAndEnergyCount, spreadWorkGroupSize, solveMaxWorkGroupSize,
                 gatherWorkGroupSize, DIM, XX, YY, ZZ);
index 790bb0628daa61e18de95912b1d685ca3a16e074..6dea2b21ecca50c3a1ffc77a158c56729fa342d2 100644 (file)
@@ -38,6 +38,7 @@ gmx_add_unit_test(EwaldUnitTests ewald-test HARDWARE_DETECTION
         pmegathertest.cpp
         pmesolvetest.cpp
         pmesplinespreadtest.cpp
-        pmetestcommon.cpp
         testhardwarecontexts.cpp
+    GPU_CPP_SOURCE_FILES
+        pmetestcommon.cpp
         )
index b6a4e9e35f81b69e24f16180b26b36bfcabcd898..f136fabe4fb0a80d467f56c36c7387cdad7d720c 100644 (file)
@@ -55,6 +55,7 @@
 #include "testutils/testasserts.h"
 
 #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
 
 namespace gmx
 {
index 59efd56faccc460f8cf0a6bea446c63a1ff5da9a..51035f035576a8a031396a88d7a8506583ef0272 100644 (file)
@@ -53,6 +53,7 @@
 #include "testutils/testasserts.h"
 
 #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
 
 namespace gmx
 {
index 0cc1eec9a8e8df2ea12ee1ef391d0f31ecdd895f..86688822ebcc8a1f70b980fab2d039e30d32730f 100644 (file)
@@ -53,6 +53,7 @@
 #include "testutils/testasserts.h"
 
 #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
 
 namespace gmx
 {
index 8f2935b9f321b50b3f99ccbc83a084a2b244b600..5c2d8663ef593f5b52c2973cc45fb8bb058389a2 100644 (file)
@@ -53,6 +53,7 @@
 #include "testutils/testasserts.h"
 
 #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
 
 namespace gmx
 {
index 80960b647ae94d76d75846eb3549bfac6c74d489..e446b8e36d2d3e0ea3d74331e840436058f92cd4 100644 (file)
@@ -49,6 +49,8 @@
 
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/ewald/pme_gather.h"
+#include "gromacs/ewald/pme_gpu_calculate_splines.h"
+#include "gromacs/ewald/pme_gpu_constants.h"
 #include "gromacs/ewald/pme_gpu_internal.h"
 #include "gromacs/ewald/pme_gpu_staging.h"
 #include "gromacs/ewald/pme_grid.h"
@@ -69,6 +71,8 @@
 
 #include "testutils/testasserts.h"
 
+#include "testhardwarecontexts.h"
+
 namespace gmx
 {
 namespace test
@@ -152,13 +156,19 @@ PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
                             const DeviceInformation* deviceInfo,
                             const PmeGpuProgram*     pmeGpuProgram,
                             const Matrix3x3&         box,
-                            real                     ewaldCoeff_q,
-                            real                     ewaldCoeff_lj)
+                            const real               ewaldCoeff_q,
+                            const real               ewaldCoeff_lj)
 {
     return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
     // hiding the fact that PME actually needs to know the number of atoms in advance
 }
 
+PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
+{
+    const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
+    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+}
+
 //! Make a GPU state-propagator manager
 std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
                                                                    const DeviceContext& deviceContext)
@@ -452,6 +462,154 @@ void pmeFinalizeTest(const gmx_pme_t* pme, CodePath mode)
     }
 }
 
+//! A binary enum for spline data layout transformation
+enum class PmeLayoutTransform
+{
+    GpuToHost,
+    HostToGpu
+};
+
+/*! \brief Gets a unique index to an element in a spline parameter buffer.
+ *
+ * These theta/dtheta buffers are laid out for GPU spread/gather
+ * kernels. The index is wrt the execution block, in range(0,
+ * atomsPerBlock * order * DIM).
+ *
+ * This is a wrapper, only used in unit tests.
+ * \param[in] order            PME order
+ * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
+ * \param[in] dimIndex         Dimension index (from 0 to 2)
+ * \param[in] atomIndex        Atom index wrt the block.
+ * \param[in] atomsPerWarp     Number of atoms processed by a warp.
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+static int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
+{
+    if (order != c_pmeGpuOrder)
+    {
+        throw order;
+    }
+    constexpr int fixedOrder = c_pmeGpuOrder;
+    GMX_UNUSED_VALUE(fixedOrder);
+
+    const int atomWarpIndex = atomIndex % atomsPerWarp;
+    const int warpIndex     = atomIndex / atomsPerWarp;
+    int       indexBase, result;
+    switch (atomsPerWarp)
+    {
+        case 1:
+            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
+            break;
+
+        case 2:
+            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
+            break;
+
+        case 4:
+            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
+            break;
+
+        case 8:
+            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
+            break;
+
+        default:
+            GMX_THROW(NotImplementedError(
+                    formatString("Test function call not unrolled for atomsPerWarp = %d in "
+                                 "getSplineParamFullIndex",
+                                 atomsPerWarp)));
+    }
+    return result;
+}
+
+/*!\brief Return the number of atoms per warp */
+static int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
+{
+    if (pmeGpu->settings.useOrderThreadsPerAtom)
+    {
+        return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
+    }
+    else
+    {
+        return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom;
+    }
+}
+
+/*! \brief Rearranges the atom spline data between the GPU and host layouts.
+ * Only used for test purposes so far, likely to be horribly slow.
+ *
+ * \param[in]  pmeGpu     The PME GPU structure.
+ * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
+ * \param[in]  type       The spline data type (values or derivatives).
+ * \param[in]  dimIndex   Dimension index.
+ * \param[in]  transform  Layout transform type
+ */
+static void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
+                                               const PmeAtomComm* atc,
+                                               PmeSplineDataType  type,
+                                               int                dimIndex,
+                                               PmeLayoutTransform transform)
+{
+    // The GPU atom spline data is laid out in a different way currently than the CPU one.
+    // This function converts the data from GPU to CPU layout (in the host memory).
+    // It is only intended for testing purposes so far.
+    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
+    // performance (e.g. spreading on GPU, gathering on CPU).
+    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
+    const uintmax_t threadIndex  = 0;
+    const auto      atomCount    = atc->numAtoms();
+    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
+    const auto      pmeOrder     = pmeGpu->common->pme_order;
+    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
+
+    real*  cpuSplineBuffer;
+    float* h_splineBuffer;
+    switch (type)
+    {
+        case PmeSplineDataType::Values:
+            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
+            h_splineBuffer  = pmeGpu->staging.h_theta;
+            break;
+
+        case PmeSplineDataType::Derivatives:
+            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
+            h_splineBuffer  = pmeGpu->staging.h_dtheta;
+            break;
+
+        default: GMX_THROW(InternalError("Unknown spline data type"));
+    }
+
+    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
+    {
+        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
+        {
+            const auto gpuValueIndex =
+                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
+            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
+            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
+                       "Atom spline data index out of bounds (while transforming GPU data layout "
+                       "for host)");
+            switch (transform)
+            {
+                case PmeLayoutTransform::GpuToHost:
+                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
+                    break;
+
+                case PmeLayoutTransform::HostToGpu:
+                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
+                    break;
+
+                default: GMX_THROW(InternalError("Unknown layout transform"));
+            }
+        }
+    }
+}
+
 //! Setting atom spline values/derivatives to be used in spread/gather
 void pmeSetSplineData(const gmx_pme_t*             pme,
                       CodePath                     mode,
index c67f78bacf265251dc9a772bfff2ec8eaa98ea60..98a2bbd4d2106092219be25eb8b8cffc6c9e22df 100644 (file)
@@ -46,8 +46,6 @@
 #include <map>
 #include <vector>
 
-#include <gtest/gtest.h>
-
 #include "gromacs/ewald/pme.h"
 #include "gromacs/ewald/pme_gpu_internal.h"
 #include "gromacs/math/gmxcomplex.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/unique_cptr.h"
 
-#include "testhardwarecontexts.h"
-
 namespace gmx
 {
 namespace test
 {
 
+// Forward declaration
+enum class CodePath;
+
 // Convenience typedefs
 //! A safe pointer type for PME.
 typedef gmx::unique_cptr<gmx_pme_t, gmx_pme_destroy> PmeSafePointer;
@@ -128,12 +127,14 @@ PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
                               real                     ewaldCoeff_lj = 1.0F);
 //! Simple PME initialization (no atom data)
 PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            CodePath                 mode          = CodePath::CPU,
-                            const DeviceInformation* deviceInfo    = nullptr,
-                            const PmeGpuProgram*     pmeGpuProgram = nullptr,
-                            const Matrix3x3& box = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } },
-                            real             ewaldCoeff_q  = 0.0F,
-                            real             ewaldCoeff_lj = 0.0F);
+                            CodePath                 mode,
+                            const DeviceInformation* deviceInfo,
+                            const PmeGpuProgram*     pmeGpuProgram,
+                            const Matrix3x3&         box,
+                            real                     ewaldCoeff_q,
+                            real                     ewaldCoeff_lj);
+//! Simple PME initialization based on inputrec only
+PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec);
 //! Make a GPU state-propagator manager
 std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
                                                                    const DeviceContext& deviceContext);
@@ -167,6 +168,7 @@ void pmeSetSplineData(const gmx_pme_t*             pme,
                       const SplineParamsDimVector& splineValues,
                       PmeSplineDataType            type,
                       int                          dimIndex);
+
 //! Setting gridline indices be used in spread/gather
 void pmeSetGridLineIndices(gmx_pme_t* pme, CodePath mode, const GridLineIndicesVector& gridLineIndices);
 //! Setting real grid to be used in gather