Move testing code to test directory

author Mark Abraham <mark.j.abraham@gmail.com>

Wed, 4 Mar 2020 17:44:34 +0000 (18:44 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Sat, 14 Mar 2020 08:11:27 +0000 (09:11 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Wed, 4 Mar 2020 17:44:34 +0000 (18:44 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Sat, 14 Mar 2020 08:11:27 +0000 (09:11 +0100)
diff --git a/src/gromacs/ewald/pme_gather.h b/src/gromacs/ewald/pme_gather.h

index 846794c94950d56b91da5da297d89bd13a75df8a..a088f25070d472ce43398959704ce00bcc279cd6 100644 (file)
--- a/src/gromacs/ewald/pme_gather.h
+++ b/src/gromacs/ewald/pme_gather.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -35,9 +35,12 @@
  #ifndef GMX_EWALD_PME_GATHER_H
  #define GMX_EWALD_PME_GATHER_H
  
+#include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/real.h"
  
-#include "pme_internal.h"
+class PmeAtomComm;
+struct gmx_pme_t;
+struct splinedata_t;
  
  void gather_f_bsplines(const struct gmx_pme_t* pme,
                         const real*             grid,
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp

index 54c3f546240585aa1fa13ced43ea79174db59558..b32fe80483d26e93458965aeb9fe10369016de7e 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -113,18 +113,6 @@ int pme_gpu_get_atom_data_block_size()
      return c_pmeAtomDataBlockSize;
  }
  
-int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
-{
-    if (pmeGpu->settings.useOrderThreadsPerAtom)
-    {
-        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
-    }
-    else
-    {
-        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
-    }
-}
-
  void pme_gpu_synchronize(const PmeGpu* pmeGpu)
  {
      pmeGpu->archSpecific->pmeStream_.synchronize();
@@ -528,49 +516,6 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu)
      pmeGpu->archSpecific->fftSetup.resize(0);
  }
  
-int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
-{
-    if (order != c_pmeGpuOrder)
-    {
-        throw order;
-    }
-    constexpr int fixedOrder = c_pmeGpuOrder;
-    GMX_UNUSED_VALUE(fixedOrder);
-
-    const int atomWarpIndex = atomIndex % atomsPerWarp;
-    const int warpIndex     = atomIndex / atomsPerWarp;
-    int       indexBase, result;
-    switch (atomsPerWarp)
-    {
-        case 1:
-            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
-            break;
-
-        case 2:
-            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
-            break;
-
-        case 4:
-            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
-            break;
-
-        case 8:
-            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
-            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
-            break;
-
-        default:
-            GMX_THROW(gmx::NotImplementedError(
-                    gmx::formatString("Test function call not unrolled for atomsPerWarp = %d in "
-                                      "getSplineParamFullIndex",
-                                      atomsPerWarp)));
-    }
-    return result;
-}
-
  void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output)
  {
      const PmeGpu* pmeGpu = pme.gpu;
@@ -805,67 +750,6 @@ static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, co
      kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
  }
  
-void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
-                                        const PmeAtomComm* atc,
-                                        PmeSplineDataType  type,
-                                        int                dimIndex,
-                                        PmeLayoutTransform transform)
-{
-    // The GPU atom spline data is laid out in a different way currently than the CPU one.
-    // This function converts the data from GPU to CPU layout (in the host memory).
-    // It is only intended for testing purposes so far.
-    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
-    // performance (e.g. spreading on GPU, gathering on CPU).
-    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
-    const uintmax_t threadIndex  = 0;
-    const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
-    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
-    const auto      pmeOrder     = pmeGpu->common->pme_order;
-    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
-
-    real*  cpuSplineBuffer;
-    float* h_splineBuffer;
-    switch (type)
-    {
-        case PmeSplineDataType::Values:
-            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
-            h_splineBuffer  = pmeGpu->staging.h_theta;
-            break;
-
-        case PmeSplineDataType::Derivatives:
-            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
-            h_splineBuffer  = pmeGpu->staging.h_dtheta;
-            break;
-
-        default: GMX_THROW(gmx::InternalError("Unknown spline data type"));
-    }
-
-    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
-    {
-        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
-        {
-            const auto gpuValueIndex =
-                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
-            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
-            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
-                       "Atom spline data index out of bounds (while transforming GPU data layout "
-                       "for host)");
-            switch (transform)
-            {
-                case PmeLayoutTransform::GpuToHost:
-                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
-                    break;
-
-                case PmeLayoutTransform::HostToGpu:
-                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
-                    break;
-
-                default: GMX_THROW(gmx::InternalError("Unknown layout transform"));
-            }
-        }
-    }
-}
-
  void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx::IVec* paddedGridSize)
  {
      GMX_ASSERT(gridSize != nullptr, "");
@@ -1267,7 +1151,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
      {
          cellsPerBlock = (gridLineSize + blocksPerGridLine - 1) / blocksPerGridLine;
      }
-    const int warpSize  = pmeGpu->programHandle_->impl_->warpSize;
+    const int warpSize  = pmeGpu->programHandle_->warpSize();
      const int blockSize = (cellsPerBlock + warpSize - 1) / warpSize * warpSize;
  
      static_assert(GMX_GPU != GMX_GPU_CUDA || c_solveMaxWarpsPerBlock / 2 >= 4,
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h

index 93ffa77416a4a3c3ad475c577512db7b4a8c4680..3d764fd468a57448424199574a50a076d5aa60da 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -98,14 +98,6 @@ enum class GridOrdering
   */
  int pme_gpu_get_atom_data_block_size();
  
-/*! \libinternal \brief
- * Returns the number of atoms per chunk in the atom spline theta/dtheta data layout.
- *
- * \param[in] pmeGpu            The PME GPU structure.
- * \returns   Number of atoms in a single GPU atom spline data chunk.
- */
-int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu);
-
  /*! \libinternal \brief
   * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
   *
@@ -492,44 +484,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu*      GPU_FUNC_ARGUMENT(
   */
  void pme_gpu_finish_computation(const PmeGpu* pmeGpu);
  
-//! A binary enum for spline data layout transformation
-enum class PmeLayoutTransform
-{
-    GpuToHost,
-    HostToGpu
-};
-
-/*! \libinternal \brief
- * Rearranges the atom spline data between the GPU and host layouts.
- * Only used for test purposes so far, likely to be horribly slow.
- *
- * \param[in]  pmeGpu     The PME GPU structure.
- * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
- * \param[in]  type       The spline data type (values or derivatives).
- * \param[in]  dimIndex   Dimension index.
- * \param[in]  transform  Layout transform type
- */
-GPU_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
-                                                           const PmeAtomComm* GPU_FUNC_ARGUMENT(atc),
-                                                           PmeSplineDataType GPU_FUNC_ARGUMENT(type),
-                                                           int GPU_FUNC_ARGUMENT(dimIndex),
-                                                           PmeLayoutTransform GPU_FUNC_ARGUMENT(transform)) GPU_FUNC_TERM;
-
-/*! \libinternal \brief
- * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
- * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
- * in range(0, atomsPerBlock * order * DIM).
- * This is a wrapper, only used in unit tests.
- * \param[in] order            PME order
- * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
- * \param[in] dimIndex         Dimension index (from 0 to 2)
- * \param[in] atomIndex        Atom index wrt the block.
- * \param[in] atomsPerWarp     Number of atoms processed by a warp.
- *
- * \returns Index into theta or dtheta array using GPU layout.
- */
-int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp);
-
  /*! \libinternal \brief
   * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
   *
diff --git a/src/gromacs/ewald/pme_gpu_program.cpp b/src/gromacs/ewald/pme_gpu_program.cpp

index 23981a661b83bcf6c3a0eeafa189b38bfe7bd772..72711c91ae1017dfaa5364526c84870490742dd7 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_program.cpp
+++ b/src/gromacs/ewald/pme_gpu_program.cpp
@@ -60,6 +60,11 @@ PmeGpuProgram::PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceCo
  
  PmeGpuProgram::~PmeGpuProgram() = default;
  
+int PmeGpuProgram::warpSize() const
+{
+    return impl_->warpSize();
+}
+
  PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext)
  {
      return std::make_unique<PmeGpuProgram>(deviceInfo, deviceContext);
diff --git a/src/gromacs/ewald/pme_gpu_program.h b/src/gromacs/ewald/pme_gpu_program.h

index d4dbdf449d9b17eba6edd7b5b5b0af8868a8efdd..c4888d97c71c6755382faf99ef57e33b35e87e57 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_program.h
+++ b/src/gromacs/ewald/pme_gpu_program.h
@@ -35,9 +35,10 @@
  
  /*! \libinternal \file
   * \brief
- * Declares PmeGpuProgram, which wrap arounds PmeGpuProgramImpl
- * to store permanent PME GPU context-derived data,
- * such as (compiled) kernel handles.
+ * Declares PmeGpuProgram
+ * to store data derived from the GPU context or devices for
+ * PME, such as (compiled) kernel handles and the warp sizes
+ * they work with.
   *
   * \author Aleksei Iupinov <a.yupinov@gmail.com>
   * \ingroup module_ewald
@@ -54,13 +55,24 @@ class DeviceContext;
  struct PmeGpuProgramImpl;
  struct DeviceInformation;
  
+/*! \libinternal
+ * \brief Stores PME data derived from the GPU context or devices.
+ *
+ * This includes compiled kernel handles and the warp sizes they
+ * work with.
+ */
  class PmeGpuProgram
  {
  public:
+    //! Constructor
      explicit PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
      ~PmeGpuProgram();
  
-    // TODO: design getters for information inside, if needed for PME, and make this private?
+    //! Return the warp size for which the kernels were compiled
+    int warpSize() const;
+
+    // TODO: design more getters for information inside, if needed for PME, and make this private?
+    //! Private impl class
      std::unique_ptr<PmeGpuProgramImpl> impl_;
  };
  
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cpp b/src/gromacs/ewald/pme_gpu_program_impl.cpp

index ccaffa5acdc03ddfb0a568c013e1ed661aa051d1..9056881227f2c6a907932c3d6efedc5035cb386f 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_program_impl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl.cpp
@@ -48,7 +48,7 @@
  PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
                                       const DeviceContext& deviceContext) :
      deviceContext_(deviceContext),
-    warpSize(0),
+    warpSize_(0),
      spreadWorkGroupSize(0),
      gatherWorkGroupSize(0),
      solveMaxWorkGroupSize(0)
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cu b/src/gromacs/ewald/pme_gpu_program_impl.cu

index 53bf2f0d1eab0df4cd293cc82176ee700c3425b3..e1331330c8a278f2edbaa5e54d01bc36273cf06a 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_program_impl.cu
+++ b/src/gromacs/ewald/pme_gpu_program_impl.cu
@@ -103,7 +103,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
      deviceContext_(deviceContext)
  {
      // kernel parameters
-    warpSize              = warp_size;
+    warpSize_             = warp_size;
      spreadWorkGroupSize   = c_spreadMaxThreadsPerBlock;
      solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock;
      gatherWorkGroupSize   = c_gatherMaxThreadsPerBlock;
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h

index cb1471abf17be0ec925c638221039466083ade2f..b9d1adc0d46c3dc8ea916b471c94bb0bdae41f89 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_program_impl.h
+++ b/src/gromacs/ewald/pme_gpu_program_impl.h
@@ -94,7 +94,7 @@ struct PmeGpuProgramImpl
       * For CUDA, this is a static value that comes from gromacs/gpu_utils/cuda_arch_utils.cuh;
       * for OpenCL, we have to query it dynamically.
       */
-    size_t warpSize;
+    size_t warpSize_;
  
      //@{
      /**
@@ -150,6 +150,9 @@ struct PmeGpuProgramImpl
      ~PmeGpuProgramImpl();
      GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
  
+    //! Return the warp size for which the kernels were compiled
+    int warpSize() const { return warpSize_; }
+
  private:
      // Compiles kernels, if supported. Called by the constructor.
      void compileKernels(const DeviceInformation& deviceInfo);
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp

index 5e82fec6dfa1d58e08105e8b1cedc58c7307e8ae..2609763afa9383f2750d11dd96ac855fef35faac 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
@@ -57,12 +57,12 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const
      deviceContext_(deviceContext)
  {
      // kernel parameters
-    warpSize = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceInfo.oclDeviceId);
+    warpSize_ = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceInfo.oclDeviceId);
      // TODO: for Intel ideally we'd want to set these based on the compiler warp size
      // but given that we've done no tuning for Intel iGPU, this is as good as anything.
-    spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize, deviceInfo.maxWorkGroupSize);
-    solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize, deviceInfo.maxWorkGroupSize);
-    gatherWorkGroupSize = std::min(c_gatherMaxWarpsPerBlock * warpSize, deviceInfo.maxWorkGroupSize);
+    spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize_, deviceInfo.maxWorkGroupSize);
+    solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize_, deviceInfo.maxWorkGroupSize);
+    gatherWorkGroupSize = std::min(c_gatherMaxWarpsPerBlock * warpSize_, deviceInfo.maxWorkGroupSize);
  
      compileKernels(deviceInfo);
  }
@@ -137,7 +137,7 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo)
                  "-DDIM=%d -DXX=%d -DYY=%d -DZZ=%d "
                  // decomposition parameter placeholders
                  "-DwrapX=true -DwrapY=true ",
-                warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
+                warpSize_, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
                  static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_skipNeutralAtoms),
                  c_virialAndEnergyCount, spreadWorkGroupSize, solveMaxWorkGroupSize,
                  gatherWorkGroupSize, DIM, XX, YY, ZZ);
diff --git a/src/gromacs/ewald/tests/CMakeLists.txt b/src/gromacs/ewald/tests/CMakeLists.txt

index 790bb0628daa61e18de95912b1d685ca3a16e074..6dea2b21ecca50c3a1ffc77a158c56729fa342d2 100644 (file)
--- a/src/gromacs/ewald/tests/CMakeLists.txt
+++ b/src/gromacs/ewald/tests/CMakeLists.txt
@@ -38,6 +38,7 @@ gmx_add_unit_test(EwaldUnitTests ewald-test HARDWARE_DETECTION
          pmegathertest.cpp
          pmesolvetest.cpp
          pmesplinespreadtest.cpp
-        pmetestcommon.cpp
          testhardwarecontexts.cpp
+    GPU_CPP_SOURCE_FILES
+        pmetestcommon.cpp
          )
diff --git a/src/gromacs/ewald/tests/pmebsplinetest.cpp b/src/gromacs/ewald/tests/pmebsplinetest.cpp

index b6a4e9e35f81b69e24f16180b26b36bfcabcd898..f136fabe4fb0a80d467f56c36c7387cdad7d720c 100644 (file)
--- a/src/gromacs/ewald/tests/pmebsplinetest.cpp
+++ b/src/gromacs/ewald/tests/pmebsplinetest.cpp
@@ -55,6 +55,7 @@
  #include "testutils/testasserts.h"
  
  #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
  
  namespace gmx
  {
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp

index 59efd56faccc460f8cf0a6bea446c63a1ff5da9a..51035f035576a8a031396a88d7a8506583ef0272 100644 (file)
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -53,6 +53,7 @@
  #include "testutils/testasserts.h"
  
  #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
  
  namespace gmx
  {
diff --git a/src/gromacs/ewald/tests/pmesolvetest.cpp b/src/gromacs/ewald/tests/pmesolvetest.cpp

index 0cc1eec9a8e8df2ea12ee1ef391d0f31ecdd895f..86688822ebcc8a1f70b980fab2d039e30d32730f 100644 (file)
--- a/src/gromacs/ewald/tests/pmesolvetest.cpp
+++ b/src/gromacs/ewald/tests/pmesolvetest.cpp
@@ -53,6 +53,7 @@
  #include "testutils/testasserts.h"
  
  #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
  
  namespace gmx
  {
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp

index 8f2935b9f321b50b3f99ccbc83a084a2b244b600..5c2d8663ef593f5b52c2973cc45fb8bb058389a2 100644 (file)
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -53,6 +53,7 @@
  #include "testutils/testasserts.h"
  
  #include "pmetestcommon.h"
+#include "testhardwarecontexts.h"
  
  namespace gmx
  {
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp

index 80960b647ae94d76d75846eb3549bfac6c74d489..e446b8e36d2d3e0ea3d74331e840436058f92cd4 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -49,6 +49,8 @@
  
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/ewald/pme_gather.h"
+#include "gromacs/ewald/pme_gpu_calculate_splines.h"
+#include "gromacs/ewald/pme_gpu_constants.h"
  #include "gromacs/ewald/pme_gpu_internal.h"
  #include "gromacs/ewald/pme_gpu_staging.h"
  #include "gromacs/ewald/pme_grid.h"
@@ -69,6 +71,8 @@
  
  #include "testutils/testasserts.h"
  
+#include "testhardwarecontexts.h"
+
  namespace gmx
  {
  namespace test
@@ -152,13 +156,19 @@ PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
                              const DeviceInformation* deviceInfo,
                              const PmeGpuProgram*     pmeGpuProgram,
                              const Matrix3x3&         box,
-                            real                     ewaldCoeff_q,
-                            real                     ewaldCoeff_lj)
+                            const real               ewaldCoeff_q,
+                            const real               ewaldCoeff_lj)
  {
      return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
      // hiding the fact that PME actually needs to know the number of atoms in advance
  }
  
+PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
+{
+    const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
+    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+}
+
  //! Make a GPU state-propagator manager
  std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
                                                                     const DeviceContext& deviceContext)
@@ -452,6 +462,154 @@ void pmeFinalizeTest(const gmx_pme_t* pme, CodePath mode)
      }
  }
  
+//! A binary enum for spline data layout transformation
+enum class PmeLayoutTransform
+{
+    GpuToHost,
+    HostToGpu
+};
+
+/*! \brief Gets a unique index to an element in a spline parameter buffer.
+ *
+ * These theta/dtheta buffers are laid out for GPU spread/gather
+ * kernels. The index is wrt the execution block, in range(0,
+ * atomsPerBlock * order * DIM).
+ *
+ * This is a wrapper, only used in unit tests.
+ * \param[in] order            PME order
+ * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
+ * \param[in] dimIndex         Dimension index (from 0 to 2)
+ * \param[in] atomIndex        Atom index wrt the block.
+ * \param[in] atomsPerWarp     Number of atoms processed by a warp.
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+static int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
+{
+    if (order != c_pmeGpuOrder)
+    {
+        throw order;
+    }
+    constexpr int fixedOrder = c_pmeGpuOrder;
+    GMX_UNUSED_VALUE(fixedOrder);
+
+    const int atomWarpIndex = atomIndex % atomsPerWarp;
+    const int warpIndex     = atomIndex / atomsPerWarp;
+    int       indexBase, result;
+    switch (atomsPerWarp)
+    {
+        case 1:
+            indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
+            break;
+
+        case 2:
+            indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
+            break;
+
+        case 4:
+            indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
+            break;
+
+        case 8:
+            indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
+            result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
+            break;
+
+        default:
+            GMX_THROW(NotImplementedError(
+                    formatString("Test function call not unrolled for atomsPerWarp = %d in "
+                                 "getSplineParamFullIndex",
+                                 atomsPerWarp)));
+    }
+    return result;
+}
+
+/*!\brief Return the number of atoms per warp */
+static int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
+{
+    if (pmeGpu->settings.useOrderThreadsPerAtom)
+    {
+        return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
+    }
+    else
+    {
+        return pmeGpu->programHandle_->warpSize() / c_pmeSpreadGatherThreadsPerAtom;
+    }
+}
+
+/*! \brief Rearranges the atom spline data between the GPU and host layouts.
+ * Only used for test purposes so far, likely to be horribly slow.
+ *
+ * \param[in]  pmeGpu     The PME GPU structure.
+ * \param[out] atc        The PME CPU atom data structure (with a single-threaded layout).
+ * \param[in]  type       The spline data type (values or derivatives).
+ * \param[in]  dimIndex   Dimension index.
+ * \param[in]  transform  Layout transform type
+ */
+static void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
+                                               const PmeAtomComm* atc,
+                                               PmeSplineDataType  type,
+                                               int                dimIndex,
+                                               PmeLayoutTransform transform)
+{
+    // The GPU atom spline data is laid out in a different way currently than the CPU one.
+    // This function converts the data from GPU to CPU layout (in the host memory).
+    // It is only intended for testing purposes so far.
+    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
+    // performance (e.g. spreading on GPU, gathering on CPU).
+    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
+    const uintmax_t threadIndex  = 0;
+    const auto      atomCount    = atc->numAtoms();
+    const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
+    const auto      pmeOrder     = pmeGpu->common->pme_order;
+    GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
+
+    real*  cpuSplineBuffer;
+    float* h_splineBuffer;
+    switch (type)
+    {
+        case PmeSplineDataType::Values:
+            cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
+            h_splineBuffer  = pmeGpu->staging.h_theta;
+            break;
+
+        case PmeSplineDataType::Derivatives:
+            cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
+            h_splineBuffer  = pmeGpu->staging.h_dtheta;
+            break;
+
+        default: GMX_THROW(InternalError("Unknown spline data type"));
+    }
+
+    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
+    {
+        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
+        {
+            const auto gpuValueIndex =
+                    getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
+            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
+            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
+                       "Atom spline data index out of bounds (while transforming GPU data layout "
+                       "for host)");
+            switch (transform)
+            {
+                case PmeLayoutTransform::GpuToHost:
+                    cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
+                    break;
+
+                case PmeLayoutTransform::HostToGpu:
+                    h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
+                    break;
+
+                default: GMX_THROW(InternalError("Unknown layout transform"));
+            }
+        }
+    }
+}
+
  //! Setting atom spline values/derivatives to be used in spread/gather
  void pmeSetSplineData(const gmx_pme_t*             pme,
                        CodePath                     mode,
diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h

index c67f78bacf265251dc9a772bfff2ec8eaa98ea60..98a2bbd4d2106092219be25eb8b8cffc6c9e22df 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.h
+++ b/src/gromacs/ewald/tests/pmetestcommon.h
@@ -46,8 +46,6 @@
  #include <map>
  #include <vector>
  
-#include <gtest/gtest.h>
-
  #include "gromacs/ewald/pme.h"
  #include "gromacs/ewald/pme_gpu_internal.h"
  #include "gromacs/math/gmxcomplex.h"
@@ -55,13 +53,14 @@
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/unique_cptr.h"
  
-#include "testhardwarecontexts.h"
-
  namespace gmx
  {
  namespace test
  {
  
+// Forward declaration
+enum class CodePath;
+
  // Convenience typedefs
  //! A safe pointer type for PME.
  typedef gmx::unique_cptr<gmx_pme_t, gmx_pme_destroy> PmeSafePointer;
@@ -128,12 +127,14 @@ PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
                                real                     ewaldCoeff_lj = 1.0F);
  //! Simple PME initialization (no atom data)
  PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            CodePath                 mode          = CodePath::CPU,
-                            const DeviceInformation* deviceInfo    = nullptr,
-                            const PmeGpuProgram*     pmeGpuProgram = nullptr,
-                            const Matrix3x3& box = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } },
-                            real             ewaldCoeff_q  = 0.0F,
-                            real             ewaldCoeff_lj = 0.0F);
+                            CodePath                 mode,
+                            const DeviceInformation* deviceInfo,
+                            const PmeGpuProgram*     pmeGpuProgram,
+                            const Matrix3x3&         box,
+                            real                     ewaldCoeff_q,
+                            real                     ewaldCoeff_lj);
+//! Simple PME initialization based on inputrec only
+PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec);
  //! Make a GPU state-propagator manager
  std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
                                                                     const DeviceContext& deviceContext);
@@ -167,6 +168,7 @@ void pmeSetSplineData(const gmx_pme_t*             pme,
                        const SplineParamsDimVector& splineValues,
                        PmeSplineDataType            type,
                        int                          dimIndex);
+
  //! Setting gridline indices be used in spread/gather
  void pmeSetGridLineIndices(gmx_pme_t* pme, CodePath mode, const GridLineIndicesVector& gridLineIndices);
  //! Setting real grid to be used in gather
author	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 4 Mar 2020 17:44:34 +0000 (18:44 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Sat, 14 Mar 2020 08:11:27 +0000 (09:11 +0100)
src/gromacs/ewald/pme_gather.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu_program.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_program.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu_program_impl.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_program_impl.cu		patch \| blob \| history
src/gromacs/ewald/pme_gpu_program_impl.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp		patch \| blob \| history
src/gromacs/ewald/tests/CMakeLists.txt		patch \| blob \| history
src/gromacs/ewald/tests/pmebsplinetest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmegathertest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmesolvetest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmesplinespreadtest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.h		patch \| blob \| history