PME spline+spread CUDA kernel and unit tests
author Aleksei Iupinov <a.yupinov@gmail.com>
Mon, 5 Dec 2016 16:36:09 +0000 (17:36 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 11 Oct 2017 11:49:12 +0000 (13:49 +0200)
The CUDA implementation of PME spline computation and charge spreading
for PME order 4 is added in pme-spread.cu.

The unit tests for the PME CPU spline/spread stages
(e8cf7c0) are also extended to work with
the PME CUDA kernel, using the same reference data.
The tests iterate over all CUDA GPUs that are compatible with GROMACS.
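
For reference, a minimal standalone sketch (not part of this change) of the
index mapping that the tests rely on when comparing the warp-interleaved GPU
spline output against the CPU-layout reference data; the order, atoms-per-warp
value and index formula follow pme_gpu_transform_spline_atom_data_for_host()
below, and the helper name is hypothetical:

    #include <vector>

    // Converts one dimension of warp-interleaved GPU spline data (theta/dtheta)
    // into the per-atom CPU layout used by the reference data.
    // Assumes PME order 4 and 2 atoms per warp (warp_size / order^2).
    std::vector<float> gpuSplineDataToCpuLayout(const std::vector<float> &gpuData,
                                                int atomCount, int dimIndex,
                                                int order = 4, int atomsPerWarp = 2)
    {
        constexpr int dim = 3; // DIM
        std::vector<float> cpuData(atomCount * order);
        for (int atomIndex = 0; atomIndex < atomCount; atomIndex++)
        {
            const int atomWarpIndex = atomIndex % atomsPerWarp;
            const int warpIndex     = atomIndex / atomsPerWarp;
            for (int orderIndex = 0; orderIndex < order; orderIndex++)
            {
                // The GPU layout interleaves the atoms of a warp for coalesced access
                const int gpuIndex = ((order * warpIndex + orderIndex) * dim + dimIndex) * atomsPerWarp
                                     + atomWarpIndex;
                cpuData[atomIndex * order + orderIndex] = gpuData[gpuIndex];
            }
        }
        return cpuData;
    }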

Refs #2054, #2092.

Change-Id: If5ec49f030b9b94395db28fa454ea25c3efb05d1

21 files changed:
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-internal.h
src/gromacs/ewald/pme-gpu-types.h
src/gromacs/ewald/pme-gpu.cpp
src/gromacs/ewald/pme-spread.cu [new file with mode: 0644]
src/gromacs/ewald/pme.cu
src/gromacs/ewald/pme.cuh
src/gromacs/ewald/pme.h
src/gromacs/ewald/tests/CMakeLists.txt
src/gromacs/ewald/tests/pmegathertest.cpp
src/gromacs/ewald/tests/pmesolvetest.cpp
src/gromacs/ewald/tests/pmesplinespreadtest.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/pmetestcommon.h
src/gromacs/ewald/tests/testhardwarecontexts.cpp [new file with mode: 0644]
src/gromacs/ewald/tests/testhardwarecontexts.h [new file with mode: 0644]
src/gromacs/gpu_utils/cuda_kernel_utils.cuh
src/gromacs/gpu_utils/cudautils.cu
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/taskassignment/hardwareassign.cpp
src/gromacs/taskassignment/hardwareassign.h

index 3349b63b79c189c1c6ac1d4a38d5466e904180a4..165482d77ad381a72c07e0fc71635c4f400a8c1f 100644 (file)
@@ -222,12 +222,14 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
         pmeGPU->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGPU->common->nk[i]);
     }
     const int cellCount = c_pmeNeighborUnitcellCount;
-    pmeGPU->common->fsh[XX].assign(pme->fshx, pme->fshx + cellCount * pme->nkx);
-    pmeGPU->common->fsh[YY].assign(pme->fshy, pme->fshy + cellCount * pme->nky);
-    pmeGPU->common->fsh[ZZ].assign(pme->fshz, pme->fshz + cellCount * pme->nkz);
-    pmeGPU->common->nn[XX].assign(pme->nnx, pme->nnx + cellCount * pme->nkx);
-    pmeGPU->common->nn[YY].assign(pme->nny, pme->nny + cellCount * pme->nky);
-    pmeGPU->common->nn[ZZ].assign(pme->nnz, pme->nnz + cellCount * pme->nkz);
+    pmeGPU->common->fsh.resize(0);
+    pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
+    pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
+    pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
+    pmeGPU->common->nn.resize(0);
+    pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
+    pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
+    pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
     pmeGPU->common->runMode = pme->runMode;
 }
 
@@ -306,6 +308,8 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::
     /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
     pmeGPU->settings.performGPUGather = true;
 
+    pme_gpu_set_testing(pmeGPU, false);
+
     // GPU initialization
     init_gpu(mdlog, cr->nodeid, gpuInfo);
     pmeGPU->deviceInfo = gpuInfo;
@@ -322,6 +326,64 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::
     kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGPU->common->epsilon_r;
 }
 
+void pme_gpu_transform_spline_atom_data_for_host(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc, PmeSplineDataType type, int dimIndex)
+{
+    // The GPU atom spline data is currently laid out differently from the CPU data.
+    // This function converts the data from the GPU layout to the CPU layout. Obviously, it is slow.
+    // It is only intended for testing purposes so far.
+    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their performance
+    // (e.g. spreading on GPU, gathering on CPU).
+    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
+    const uintmax_t threadIndex  = 0;
+    const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGPU)->atoms.nAtoms;
+    const auto      atomsPerWarp = pme_gpu_get_atom_spline_data_alignment(pmeGPU);
+    const auto      pmeOrder     = pmeGPU->common->pme_order;
+
+    real           *h_splineBuffer;
+    float          *d_splineBuffer;
+    switch (type)
+    {
+        case PmeSplineDataType::Values:
+            h_splineBuffer = atc->spline[threadIndex].theta[dimIndex];
+            d_splineBuffer = pmeGPU->staging.h_theta;
+            break;
+
+        case PmeSplineDataType::Derivatives:
+            h_splineBuffer = atc->spline[threadIndex].dtheta[dimIndex];
+            d_splineBuffer = pmeGPU->staging.h_dtheta;
+            break;
+
+        default:
+            GMX_THROW(gmx::InternalError("Unknown spline data type"));
+    }
+
+    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
+    {
+        auto atomWarpIndex = atomIndex % atomsPerWarp;
+        auto warpIndex     = atomIndex / atomsPerWarp;
+        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
+        {
+            const auto gpuValueIndex = ((pmeOrder * warpIndex + orderIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex;
+            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
+            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder, "Atom spline data index out of bounds (while transforming GPU data layout for host)");
+            h_splineBuffer[cpuValueIndex] = d_splineBuffer[gpuValueIndex];
+        }
+    }
+}
+
+void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize)
+{
+    GMX_ASSERT(gridSize != nullptr, "");
+    GMX_ASSERT(paddedGridSize != nullptr, "");
+    GMX_ASSERT(pmeGPU != nullptr, "");
+    auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU);
+    for (int i = 0; i < DIM; i++)
+    {
+        (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
+        (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
+    }
+}
+
 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr)
 {
     if (!pme_gpu_active(pme))
index 03fe8de852d8bc87c8f8b008dc508102c48aa275..9cdb5568775862843531a3515a63db6982672fe8 100644 (file)
@@ -62,6 +62,13 @@ namespace gmx
 class MDLogger;
 }
 
+//! Type of spline data
+enum class PmeSplineDataType
+{
+    Values,      // theta
+    Derivatives, // dtheta
+};               //TODO move this into new and shiny pme.h (pme-types.h?)
+
 /* Some general constants for PME GPU behaviour follow. */
 
 /*! \brief \libinternal
@@ -328,6 +335,13 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FU
 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
                                                          float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 
+/*! \libinternal \brief
+ * Copies the spread output spline data and gridline indices from the GPU to the host.
+ *
+ * \param[in] pmeGPU   The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
 /*! \libinternal \brief
  * Waits for the grid copying to the host-side buffer after spreading to finish.
  *
@@ -418,6 +432,25 @@ CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUME
 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t         *CUDA_FUNC_ARGUMENT(pmeGPU),
                                              gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
 
+/* The PME stages themselves */
+
+/*! \libinternal \brief
+ * A GPU spline computation and charge spreading function.
+ *
+ * \param[in]  pmeGpu          The PME GPU structure.
+ * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
+ * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
+ *                             e.g. testing or host-side FFT)
+ * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
+ * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
+                                        int              CUDA_FUNC_ARGUMENT(gridIndex),
+                                        real            *CUDA_FUNC_ARGUMENT(h_grid),
+                                        bool             CUDA_FUNC_ARGUMENT(computeSplines),
+                                        bool             CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
+
+
 /* The inlined convenience PME GPU status getters */
 
 /*! \libinternal \brief
@@ -475,6 +508,29 @@ gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU)
     return pmeGPU->settings.performGPUSolve;
 }
 
+/*! \libinternal \brief
+ * Enables or disables the testing mode.
+ * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \param[in] testing            Should the testing mode be enabled or disabled.
+ */
+gmx_inline void pme_gpu_set_testing(pme_gpu_t *pmeGPU, bool testing)
+{
+    pmeGPU->settings.copyAllOutputs = testing;
+}
+
+/*! \libinternal \brief
+ * Tells if PME is in the testing mode.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \returns                      true if testing mode is enabled, false otherwise.
+ */
+gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU)
+{
+    return pmeGPU->settings.copyAllOutputs;
+}
+
 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 
 /*! \libinternal \brief
@@ -516,6 +572,28 @@ void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix bo
 void pme_gpu_finish_step(const pme_gpu_t *pmeGPU,  const bool       bCalcForces,
                          const bool       bCalcEnerVir);
 
+/*! \libinternal \brief
+ * Rearranges the atom spline data, copied from the GPU to the host after a pme_gpu_sync_spline_atom_data() call,
+ * into the atom data layout of a single-threaded PME CPU run.
+ * Only used for testing purposes so far.
+ *
+ * \param[in]  pmeGPU    The PME GPU structure.
+ * \param[out] atc       The PME CPU atom data structure (with a single-threaded layout).
+ * \param[in]  type      The spline data type (values or derivatives).
+ * \param[in]  dimIndex  Dimension index.
+ */
+void pme_gpu_transform_spline_atom_data_for_host(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc,
+                                                 PmeSplineDataType type, int dimIndex);
+
+/*! \libinternal \brief
+ * Gets the normal/padded grid dimensions of the real-space PME grid on the GPU. Only used in tests.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \param[out] gridSize          Pointer to the grid dimensions to fill in.
+ * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
+ */
+void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
+
 /*! \libinternal \brief
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
index 1bf3a88c1a6e1acaff288af5d99af0363abcf288..6a79e7c8fbd93bc978864120ce45519059548955 100644 (file)
@@ -298,9 +298,9 @@ struct pme_shared_t
     /*! \brief Electrostatics parameter */
     real              epsilon_r;
     /*! \brief Gridline indices - nnx, nny, nnz */
-    std::vector<int>  nn[DIM];
+    std::vector<int>  nn;
     /*! \brief Fractional shifts - fshx, fshy, fshz */
-    std::vector<real> fsh[DIM];
+    std::vector<real> fsh;
     /*! \brief Precomputed B-spline values */
     std::vector<real> bsp_mod[DIM];
     /*! \brief The PME codepath being taken */
index e830c53d8d9fe0b13dec41e8faadadc418e222d4..e24565adbf26b8028cc09d99789d252b5e3803c0 100644 (file)
 
 #include "gmxpre.h"
 
+#include "config.h"
+
+#include <list>
+
 #include "gromacs/ewald/pme.h"
+#include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
 
 #include "pme-gpu-internal.h"
 #include "pme-grid.h"
@@ -57,6 +63,54 @@ bool pme_gpu_task_enabled(const gmx_pme_t *pme)
     return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
 }
 
+bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
+{
+    std::list<std::string> errorReasons;
+    if (!EEL_PME(ir->coulombtype))
+    {
+        errorReasons.push_back("systems that do not use PME for electrostatics");
+    }
+    if (ir->pme_order != 4)
+    {
+        errorReasons.push_back("interpolation orders other than 4");
+    }
+    if (ir->efep != efepNO)
+    {
+        errorReasons.push_back("free energy calculations (multiple grids)");
+    }
+    if (EVDW_PME(ir->vdwtype))
+    {
+        errorReasons.push_back("Lennard-Jones PME");
+    }
+#if GMX_DOUBLE
+    {
+        errorReasons.push_back("double precision");
+    }
+#endif
+#if GMX_GPU != GMX_GPU_CUDA
+    {
+        errorReasons.push_back("non-CUDA build of GROMACS");
+    }
+#endif
+    if (ir->cutoff_scheme == ecutsGROUP)
+    {
+        errorReasons.push_back("group cutoff scheme");
+    }
+    if (EI_TPI(ir->eI))
+    {
+        errorReasons.push_back("test particle insertion");
+    }
+
+    bool inputSupported = errorReasons.empty();
+    if (!inputSupported && error)
+    {
+        std::string regressionTestMarker = "PME GPU does not support";
+        // this prefix is tested for in the regression tests script gmxtest.pl
+        *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+    }
+    return inputSupported;
+}
+
 void pme_gpu_reset_timings(const gmx_pme_t *pme)
 {
     if (pme_gpu_active(pme))
diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu
new file mode 100644 (file)
index 0000000..ee92874
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013-2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Implements PME GPU spline calculation and charge spreading in CUDA.
+ *  TODO: consider always pre-sorting particles (as in DD case).
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "pme.cuh"
+#include "pme-grid.h"
+#include "pme-timings.cuh"
+
+/*
+ * This define affects the spline calculation behaviour in the kernel.
+ * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing (order) spline values and derivatives).
+ * 1: (order) threads do redundant work on this same task, each one stores only a single theta and single dtheta into global arrays.
+ * The only efficiency difference is fewer global store operations, countered by more redundant spline computation.
+ *
+ * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
+ */
+#define PME_GPU_PARALLEL_SPLINE 0
+
+
+//! Spreading max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime in most cases
+constexpr int c_spreadMaxWarpsPerBlock = 8;
+/* TODO: it has been observed that the kernel can be faster with smaller block sizes (2 or 4 warps)
+ * only on some GPUs (660Ti) with large enough grid (>= 48^3) due to load/store units being overloaded
+ * (ldst_fu_utilization metric maxed out in nvprof). Runtime block size choice might be nice to have.
+ * This has been tried on architectures up to Maxwell (GTX 750) and it would be good to revisit this.
+ */
+//! Spreading max block size in threads
+constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
+
+//! Texture references for CC 2.x
+texture<int, 1, cudaReadModeElementType>   gridlineIndicesTableTextureRef;
+texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
+
+/*! Returns the reference to the gridlineIndices texture. */
+texture<int, 1, cudaReadModeElementType> &pme_gpu_get_gridline_texref()
+{
+    return gridlineIndicesTableTextureRef;
+}
+
+/*! Returns the reference to the fractShifts texture. */
+texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref()
+{
+    return fractShiftsTableTextureRef;
+}
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam[in] T                 Data type (float/int/...)
+ * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
+ * \param[in]  kernelParams      Input PME CUDA data in constant memory.
+ * \param[out] sm_destination    Shared memory array for output.
+ * \param[in]  gm_source         Global memory array for input.
+ */
+template<typename T,
+         const int atomsPerBlock,
+         const int dataCountPerAtom>
+__device__  __forceinline__
+void pme_gpu_stage_atom_data(const pme_gpu_cuda_kernel_params_t kernelParams,
+                             T * __restrict__                   sm_destination,
+                             const T * __restrict__             gm_source)
+{
+    static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+    const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
+    const int localIndex       = threadLocalIndex;
+    const int globalIndexBase  = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+    const int globalIndex      = globalIndexBase + localIndex;
+    const int globalCheck      = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
+    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+    {
+        assert(isfinite(float(gm_source[globalIndex])));
+        sm_destination[localIndex] = gm_source[globalIndex];
+    }
+}
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
+ *                                  in the sizes of the shared memory arrays.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
+ * \param[in]  sm_coordinates       Atom coordinates in the shared memory.
+ * \param[in]  sm_coefficients      Atom charges/coefficients in the shared memory.
+ * \param[out] sm_theta             Atom spline values in the shared memory.
+ * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
+ */
+template <const int order,
+          const int atomsPerBlock>
+__device__ __forceinline__ void calculate_splines(const pme_gpu_cuda_kernel_params_t     kernelParams,
+                                                  const int                              atomIndexOffset,
+                                                  const float3 * __restrict__            sm_coordinates,
+                                                  const float * __restrict__             sm_coefficients,
+                                                  float * __restrict__                   sm_theta,
+                                                  int * __restrict__                     sm_gridlineIndices)
+{
+    /* Global memory pointers for output */
+    float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
+    float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
+    int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
+
+    /* Fractional coordinates */
+    __shared__ float sm_fractCoords[atomsPerBlock * DIM];
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+        + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / warp_size;
+    /* Thread index w.r.t. warp */
+    const int threadWarpIndex = threadLocalId % warp_size;
+    /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+    const int atomWarpIndex = threadWarpIndex % PME_SPREADGATHER_ATOMS_PER_WARP;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex;
+
+    /* Atom index w.r.t. global memory */
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+    /* Spline contribution index in one dimension */
+    const int orderIndex = threadWarpIndex / (PME_SPREADGATHER_ATOMS_PER_WARP * DIM);
+    /* Dimension index */
+    const int dimIndex = (threadWarpIndex - orderIndex * (PME_SPREADGATHER_ATOMS_PER_WARP * DIM)) / PME_SPREADGATHER_ATOMS_PER_WARP;
+
+    /* Multi-purpose index of rvec/ivec atom data */
+    const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
+    /* Spline parameters need a working buffer.
+     * With PME_GPU_PARALLEL_SPLINE == 0 it is just a local array of (order) values for some of the threads, which is fine;
+     * With PME_GPU_PARALLEL_SPLINE == 1 (order) times more threads are involved, so the shared memory is used to avoid overhead.
+     * The buffer's size, striding and indexing are adjusted accordingly.
+     * The buffer is accessed with SPLINE_DATA_PTR and SPLINE_DATA macros.
+     */
+#if PME_GPU_PARALLEL_SPLINE
+    const int        splineDataStride  = atomsPerBlock * DIM;
+    const int        splineDataIndex   = sharedMemoryIndex;
+    __shared__ float sm_splineData[splineDataStride * order];
+    float           *splineDataPtr = sm_splineData;
+#else
+    const int        splineDataStride = 1;
+    const int        splineDataIndex  = 0;
+    float            splineData[splineDataStride * order];
+    float           *splineDataPtr = splineData;
+#endif
+
+#define SPLINE_DATA_PTR(i) (splineDataPtr + ((i) * splineDataStride + splineDataIndex))
+#define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
+
+    const int localCheck  = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
+    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+
+    if (localCheck && globalCheck)
+    {
+        /* Indices interpolation */
+
+        if (orderIndex == 0)
+        {
+            int           tableIndex, tInt;
+            float         n, t;
+            const float3  x = sm_coordinates[atomIndexLocal];
+            /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
+             * puts them into local memory(!) instead of accessing the constant memory directly.
+             * That's the reason for the switch, to unroll explicitly.
+             * The commented parts correspond to the 0 components of the recipbox.
+             */
+            switch (dimIndex)
+            {
+                case XX:
+                    tableIndex  = kernelParams.grid.tablesOffsets[XX];
+                    n           = kernelParams.grid.realGridSizeFP[XX];
+                    t           = x.x * kernelParams.step.recipBox[dimIndex][XX] + x.y * kernelParams.step.recipBox[dimIndex][YY] + x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+                    break;
+
+                case YY:
+                    tableIndex  = kernelParams.grid.tablesOffsets[YY];
+                    n           = kernelParams.grid.realGridSizeFP[YY];
+                    t           = /*x.x * kernelParams.step.recipbox[dimIndex][XX] + */ x.y * kernelParams.step.recipBox[dimIndex][YY] + x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+                    break;
+
+                case ZZ:
+                    tableIndex  = kernelParams.grid.tablesOffsets[ZZ];
+                    n           = kernelParams.grid.realGridSizeFP[ZZ];
+                    t           = /*x.x * kernelParams.step.recipbox[dimIndex][XX] + x.y * kernelParams.step.recipbox[dimIndex][YY] + */ x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+                    break;
+            }
+            const float shift = c_pmeMaxUnitcellShift;
+            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+            t    = (t + shift) * n;
+            tInt = (int)t;
+            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            tableIndex                       += tInt;
+            assert(tInt >= 0);
+            assert(tInt < c_pmeNeighborUnitcellCount * n);
+
+            // TODO have shared table for both parameters to share the fetch, as index is always same?
+            // TODO compare texture/LDG performance
+            sm_fractCoords[sharedMemoryIndex] += fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable, kernelParams.fractShiftsTableTexture,
+                                                                           fractShiftsTableTextureRef, tableIndex);
+            sm_gridlineIndices[sharedMemoryIndex] = fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable, kernelParams.gridlineIndicesTableTexture,
+                                                                              gridlineIndicesTableTextureRef, tableIndex);
+            gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
+        }
+
+        /* B-spline calculation */
+
+        const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+        if (chargeCheck)
+        {
+            float       div;
+            int         o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
+
+            const float dr = sm_fractCoords[sharedMemoryIndex];
+            assert(isfinite(dr));
+
+            /* dr is relative offset from lower cell limit */
+            *SPLINE_DATA_PTR(order - 1) = 0.0f;
+            *SPLINE_DATA_PTR(1)         = dr;
+            *SPLINE_DATA_PTR(0)         = 1.0f - dr;
+
+#pragma unroll
+            for (int k = 3; k < order; k++)
+            {
+                div                     = 1.0f / (k - 1.0f);
+                *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
+#pragma unroll
+                for (int l = 1; l < (k - 1); l++)
+                {
+                    *SPLINE_DATA_PTR(k - l - 1) = div * ((dr + l) * SPLINE_DATA(k - l - 2) + (k - l - dr) * SPLINE_DATA(k - l - 1));
+                }
+                *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+            }
+
+            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+
+            /* Differentiation and storing the spline derivatives (dtheta) */
+#if !PME_GPU_PARALLEL_SPLINE
+            // With PME_GPU_PARALLEL_SPLINE == 1, o is already set to orderIndex;
+            // With PME_GPU_PARALLEL_SPLINE == 0, we loop o over range(order).
+#pragma unroll
+            for (o = 0; o < order; o++)
+#endif
+            {
+                const int   thetaIndex       = PME_SPLINE_THETA_STRIDE * (((o + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+                const int   thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+
+                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
+                assert(isfinite(dtheta));
+                gm_dtheta[thetaGlobalIndex] = dtheta;
+            }
+
+            div  = 1.0f / (order - 1.0f);
+            *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
+#pragma unroll
+            for (int k = 1; k < (order - 1); k++)
+            {
+                *SPLINE_DATA_PTR(order - k - 1) = div * ((dr + k) * SPLINE_DATA(order - k - 2) + (order - k - dr) * SPLINE_DATA(order - k - 1));
+            }
+            *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+
+            /* Storing the spline values (theta) */
+#if !PME_GPU_PARALLEL_SPLINE
+            // See comment for the loop above
+#pragma unroll
+            for (o = 0; o < order; o++)
+#endif
+            {
+                const int thetaIndex       = PME_SPLINE_THETA_STRIDE * (((o + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+                const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+
+                sm_theta[thetaIndex]       = SPLINE_DATA(o);
+                assert(isfinite(sm_theta[thetaIndex]));
+                gm_theta[thetaGlobalIndex] = SPLINE_DATA(o);
+            }
+        }
+    }
+}
+
+/*! \brief
+ * Charge spreading onto the grid.
+ * This corresponds to the CPU function spread_coefficients_bsplines_thread().
+ * Second stage of the whole kernel.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
+ * \param[in]  sm_coefficients      Atom coefficients/charges in the shared memory.
+ * \param[in]  sm_gridlineIndices   Atom gridline indices in the shared memory.
+ * \param[in]  sm_theta             Atom spline values in the shared memory.
+ */
+template <
+    const int order, const bool wrapX, const bool wrapY>
+__device__ __forceinline__ void spread_charges(const pme_gpu_cuda_kernel_params_t     kernelParams,
+                                               int                                    atomIndexOffset,
+                                               const float * __restrict__             sm_coefficients,
+                                               const int * __restrict__               sm_gridlineIndices,
+                                               const float * __restrict__             sm_theta)
+{
+    /* Global memory pointer to the output grid */
+    float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
+
+
+    const int nx  = kernelParams.grid.realGridSize[XX];
+    const int ny  = kernelParams.grid.realGridSize[YY];
+    const int nz  = kernelParams.grid.realGridSize[ZZ];
+    const int pny = kernelParams.grid.realGridSizePadded[YY];
+    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
+
+    const int offx = 0, offy = 0, offz = 0; // unused for now
+
+    const int atomIndexLocal  = threadIdx.z;
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+
+    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+    const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+    if (chargeCheck & globalCheck)
+    {
+        // Spline Y/Z coordinates
+        const int ithy   = threadIdx.y;
+        const int ithz   = threadIdx.x;
+        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
+        int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy + ithy;
+        if (wrapY & (iy >= ny))
+        {
+            iy -= ny;
+        }
+        int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
+        if (iz >= nz)
+        {
+            iz -= nz;
+        }
+
+        /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+        const int    atomWarpIndex     = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
+        /* Warp index w.r.t. block - could probably be obtained easier? */
+        const int    warpIndex         = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+        const int    dimStride         = PME_SPLINE_THETA_STRIDE * PME_SPREADGATHER_ATOMS_PER_WARP;
+        const int    orderStride       = dimStride * DIM;
+        const int    thetaOffsetBase   = orderStride * order * warpIndex + atomWarpIndex;
+
+        const float  thetaZ         = sm_theta[thetaOffsetBase + ithz * orderStride + ZZ * dimStride];
+        const float  thetaY         = sm_theta[thetaOffsetBase + ithy * orderStride + YY * dimStride];
+        const float  constVal       = thetaZ * thetaY * sm_coefficients[atomIndexLocal];
+        assert(isfinite(constVal));
+        const int    constOffset       = iy * pnz + iz;
+        const float *sm_thetaX         = sm_theta + (thetaOffsetBase + XX * dimStride);
+
+#pragma unroll
+        for (int ithx = 0; (ithx < order); ithx++)
+        {
+            int ix = ixBase + ithx;
+            if (wrapX & (ix >= nx))
+            {
+                ix -= nx;
+            }
+            const int gridIndexGlobal = ix * pny * pnz + constOffset;
+            assert(isfinite(sm_thetaX[ithx * orderStride]));
+            assert(isfinite(gm_grid[gridIndexGlobal]));
+            atomicAdd(gm_grid + gridIndexGlobal, sm_thetaX[ithx * orderStride] * constVal);
+        }
+    }
+}
+
+/*! \brief
+ * A spline computation and charge spreading kernel function.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] computeSplines       A boolean which tells if the spline parameter and
+ *                                  gridline indices' computation should be performed.
+ * \tparam[in] spreadCharges        A boolean which tells if the charge spreading should be performed.
+ * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ */
+template <
+    const int order,
+    const bool computeSplines,
+    const bool spreadCharges,
+    const bool wrapX,
+    const bool wrapY
+    >
+__launch_bounds__(c_spreadMaxThreadsPerBlock)
+__global__ void pme_spline_and_spread_kernel(const pme_gpu_cuda_kernel_params_t kernelParams)
+{
+    const int        atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+    // Gridline indices, ivec
+    __shared__ int   sm_gridlineIndices[atomsPerBlock * DIM];
+    // Charges
+    __shared__ float sm_coefficients[atomsPerBlock];
+    // Spline values
+    __shared__ float sm_theta[atomsPerBlock * DIM * order];
+
+    const int        atomIndexOffset = blockIdx.x * atomsPerBlock;
+
+    /* Staging coefficients/charges for both spline and spread */
+    pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+
+    if (computeSplines)
+    {
+        /* Staging coordinates */
+        __shared__ float sm_coordinates[DIM * atomsPerBlock];
+        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+
+        __syncthreads();
+        calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
+                                                sm_coefficients, sm_theta, sm_gridlineIndices);
+    }
+    else
+    {
+        /* Staging the data for spread
+         * (the data is assumed to be in GPU global memory with proper layout already,
+         * as in after running the spline kernel)
+         */
+        /* Spline data - only thetas (dthetas will only be needed in gather) */
+        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta, kernelParams.atoms.d_theta);
+        /* Gridline indices */
+        pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices, kernelParams.atoms.d_gridlineIndices);
+
+        __syncthreads();
+    }
+
+    /* Spreading */
+    if (spreadCharges)
+    {
+        spread_charges<order, wrapX, wrapY>(kernelParams, atomIndexOffset, sm_coefficients, sm_gridlineIndices, sm_theta);
+    }
+}
+
+void pme_gpu_spread(const pme_gpu_t *pmeGpu,
+                    int gmx_unused   gridIndex,
+                    real            *h_grid,
+                    bool             computeSplines,
+                    bool             spreadCharges)
+{
+    GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
+    cudaStream_t  stream          = pmeGpu->archSpecific->pmeStream;
+    const auto   *kernelParamsPtr = pmeGpu->kernelParams.get();
+    GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
+
+    const int order         = pmeGpu->common->pme_order;
+    const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+    // TODO: pick smaller block size in runtime if needed
+    // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
+    // If doing so, change atomsPerBlock in the kernels as well.
+    // TODO: test varying block sizes on modern arch-s as well
+    // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
+    //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
+    GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+
+    dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+    dim3 dimBlock(order, order, atomsPerBlock);
+
+    // These should later check for PME decomposition
+    const bool wrapX = true;
+    const bool wrapY = true;
+    GMX_UNUSED_VALUE(wrapX);
+    GMX_UNUSED_VALUE(wrapY);
+    switch (order)
+    {
+        case 4:
+        {
+            // TODO: cleaner unroll with some template trick?
+            if (computeSplines)
+            {
+                if (spreadCharges)
+                {
+                    pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
+                    pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                    CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+                    pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
+                }
+                else
+                {
+                    pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
+                    pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                    CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+                    pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
+                }
+            }
+            else
+            {
+                pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
+                pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+                pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
+            }
+        }
+        break;
+
+        default:
+            GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not tested!"));
+    }
+
+    const bool copyBackGrid = spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu));
+    if (copyBackGrid)
+    {
+        pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
+    }
+    const bool copyBackAtomData = computeSplines && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_gather(pmeGpu));
+    if (copyBackAtomData)
+    {
+        pme_gpu_copy_output_spread_atom_data(pmeGpu);
+    }
+}
index bd67a01ee745e7c9c78fd23a8a51434b795a7b6f..744cf28ed5c24023cea94f5855e7b49949792b2e 100644 (file)
 int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU)
 {
     const int order = pmeGPU->common->pme_order;
-    GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+    GMX_ASSERT(order > 0, "Invalid PME order");
     return PME_ATOM_DATA_ALIGNMENT;
 }
 
 int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *pmeGPU)
 {
     const int order = pmeGPU->common->pme_order;
-    GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+    GMX_ASSERT(order > 0, "Invalid PME order");
     return PME_SPREADGATHER_ATOMS_PER_WARP;
 }
 
@@ -340,7 +340,8 @@ void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU)
 
 void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
 {
-    cudaStream_t stream          = pmeGPU->archSpecific->pmeStream;
+    pme_gpu_free_fract_shifts(pmeGPU);
+
     auto        *kernelParamsPtr = pmeGPU->kernelParams.get();
 
     const int    nx                  = kernelParamsPtr->grid.realGridSize[XX];
@@ -353,38 +354,32 @@ void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
 
     const int    newFractShiftsSize  = cellCount * (nx + ny + nz);
 
-    /* Two arrays, same size */
-    int currentSizeTemp      = pmeGPU->archSpecific->fractShiftsSize;
-    int currentSizeTempAlloc = pmeGPU->archSpecific->fractShiftsSizeAlloc;
-    cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fractShiftsTable, nullptr, sizeof(float),
-                        &currentSizeTemp, &currentSizeTempAlloc,
-                        newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
-    float *fractShiftsArray = kernelParamsPtr->grid.d_fractShiftsTable;
-    cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_gridlineIndicesTable, nullptr, sizeof(int),
-                        &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc,
-                        newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
-    int *gridlineIndicesArray = kernelParamsPtr->grid.d_gridlineIndicesTable;
-
-    /* TODO: pinning of the pmeGPU->common->fsh/nn host-side arrays */
+    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
+                         kernelParamsPtr->fractShiftsTableTexture,
+                         &pme_gpu_get_fract_shifts_texref(),
+                         pmeGPU->common->fsh.data(),
+                         newFractShiftsSize,
+                         pmeGPU->deviceInfo);
 
-    for (int i = 0; i < DIM; i++)
-    {
-        kernelParamsPtr->grid.tablesOffsets[i] = gridDataOffset[i];
-        cu_copy_H2D_async(fractShiftsArray + gridDataOffset[i], pmeGPU->common->fsh[i].data(),
-                          cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(float), stream);
-        cu_copy_H2D_async(gridlineIndicesArray + gridDataOffset[i], pmeGPU->common->nn[i].data(),
-                          cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(int), stream);
-    }
-
-    pme_gpu_make_fract_shifts_textures(pmeGPU);
+    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
+                         kernelParamsPtr->gridlineIndicesTableTexture,
+                         &pme_gpu_get_gridline_texref(),
+                         pmeGPU->common->nn.data(),
+                         newFractShiftsSize,
+                         pmeGPU->deviceInfo);
 }
 
 void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
 {
-    pme_gpu_free_fract_shifts_textures(pmeGPU);
-    /* Two arrays, same size */
-    cu_free_buffered(pmeGPU->kernelParams->grid.d_fractShiftsTable);
-    cu_free_buffered(pmeGPU->kernelParams->grid.d_gridlineIndicesTable, &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc);
+    auto *kernelParamsPtr = pmeGPU->kernelParams.get();
+    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
+                            kernelParamsPtr->fractShiftsTableTexture,
+                            &pme_gpu_get_fract_shifts_texref(),
+                            pmeGPU->deviceInfo);
+    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
+                            kernelParamsPtr->gridlineIndicesTableTexture,
+                            &pme_gpu_get_gridline_texref(),
+                            pmeGPU->deviceInfo);
 }
 
 void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
@@ -412,6 +407,20 @@ void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid)
     CU_RET_ERR(stat, "PME spread grid sync event record failure");
 }
 
+void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *pmeGpu)
+{
+    const int    alignment       = pme_gpu_get_atom_spline_data_alignment(pmeGpu);
+    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
+    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
+    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
+    cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
+                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSplineAtomDataD2H, pmeGpu->archSpecific->pmeStream);
+    CU_RET_ERR(stat, "PME spread atom data sync event record failure");
+}
+
 void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
 {
     cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0);
index 16b4b9aae8ae905f433656c9e41daea96b24f156..bed5feb4923a64a4c0c1bab45a9d6ce70739a8da 100644 (file)
@@ -45,6 +45,8 @@
 #ifndef GMX_EWALD_PME_CUH
 #define GMX_EWALD_PME_CUH
 
+#include "config.h"
+
 #include <cassert>
 
 #include <array>
@@ -89,12 +91,23 @@ class GpuParallel3dFft;
  */
 #define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
 
-//! The spread/gather integer constant which depends on the templated order parameter (2 atoms per warp for order == 4)
+/*! \brief
+ * The number of atoms processed by a single warp in spread/gather.
+ * This macro depends on the templated order parameter (2 atoms per warp for order 4).
+ * It is mostly used for spline data layout tweaked for coalesced access.
+ */
 #define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
 
-//! Atom data alignment - has to be divisible both by spread and gather maximal atoms-per-block counts,
-//! which is asserted in case we use atom data padding at all.
-#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP);
+/*! \brief
+ * Atom data alignment (in terms of number of atoms).
+ * If the GPU atom data buffers are padded (c_usePadding == true),
+ * then the number of atoms which would fit in the padded GPU buffers has to be divisible by this.
+ * The literal number (16) expresses the maximum spread/gather block width in warps.
+ * Accordingly, spread and gather block widths in warps should be divisors of this
+ * (e.g. in pme-spread.cu: constexpr int c_spreadMaxThreadsPerBlock = 8 * warp_size;).
+ * There are debug asserts for this divisibility.
+ */
+#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
 
 /*! \brief \internal
  * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
@@ -193,10 +206,6 @@ struct pme_gpu_cuda_t
     int splineValuesSize;
     /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
     int splineValuesSizeAlloc;
-    /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (actual) */
-    int fractShiftsSize;
-    /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (reserved) */
-    int fractShiftsSizeAlloc;
     /*! \brief The kernelParams.grid.realGrid float element count (actual) */
     int realGridSize;
     /*! \brief The kernelParams.grid.realGrid float element count (reserved) */
@@ -222,22 +231,16 @@ struct pme_gpu_cuda_kernel_params_t : pme_gpu_kernel_params_base_t
     cudaTextureObject_t gridlineIndicesTableTexture;
 };
 
-/* CUDA texture functions which will reside in respective kernel files
+/* CUDA texture reference functions which reside in respective kernel files
  * (due to texture references having scope of a translation unit).
  */
-
-/*! \brief \internal
- * Creates/binds 2 textures used in the spline parameter computation.
- *
- * \param[in, out] pmeGPU         The PME GPU structure.
- */
-inline void pme_gpu_make_fract_shifts_textures(pme_gpu_t gmx_unused *pmeGpu){};
-
-/*! \brief \internal
- * Frees/unbinds 2 textures used in the spline parameter computation.
- *
- * \param[in] pmeGPU             The PME GPU structure.
- */
-inline void pme_gpu_free_fract_shifts_textures(const pme_gpu_t gmx_unused *pmeGpu){};
+#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
+extern texture<int, 1, cudaReadModeElementType>   gridlineIndicesTableTextureRef;
+extern texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
+#endif
+/*! Returns the reference to the gridlineIndices texture. */
+texture<int, 1, cudaReadModeElementType>   &pme_gpu_get_gridline_texref();
+/*! Returns the reference to the fractShifts texture. */
+texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref();
 
 #endif
index e9e89875311e4793fb0357391199e9b98a7f0e41..c1e5f8c5e0c2190d8d629c63027ae44d0555ac0a 100644 (file)
@@ -205,6 +205,18 @@ void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *ch
 
 /* A block of PME GPU functions */
 
+/*! \brief Checks whether the input system allows running PME on a GPU.
+ * TODO: this mostly duplicates an internal PME assert function,
+ * pme_gpu_check_restrictions(), except that one works with an already
+ * formed gmx_pme_t structure. Should that one go away/work with inputrec?
+ *
+ * \param[in]  ir     Input system.
+ * \param[out] error  The error message if the input is not supported on GPU.
+ *
+ * \returns true if PME can run on GPU with this input, false otherwise.
+ */
+bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error);
+
 /*! \brief
  * Tells if PME is enabled to run on GPU (not necessarily active at the moment).
  * For now, this decision is stored in the PME structure itself.
index 1ebfbb0155134c1ffd1ca5cbb99b291024bc0497..3155182ae7e8f4b9fc265af193e32351b2dadbbc 100644 (file)
@@ -33,5 +33,9 @@
 # the research papers on the package. Check out http://www.gromacs.org.
 
 file(GLOB EWALD_TEST_SOURCES *.cpp)
+if (GMX_USE_CUDA)
+    file(GLOB EWALD_CUDA_SOURCES ../*.cu)
+endif()
+
 gmx_add_unit_test(EwaldUnitTests ewald-test
-                  ${EWALD_TEST_SOURCES})
+                  ${EWALD_TEST_SOURCES} ${EWALD_CUDA_SOURCES})
index 7e1e74f3ee5bbf3cd8a7715c05549dd199be8858..d65b9a796179e421d6527c222ec44cfed05ab8f1 100644 (file)
@@ -403,7 +403,7 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
                                           (inputForceTreatment == PmeGatherInputHandling::ReduceWith) ? "with reduction" : "without reduction"
                                           ));
 
-                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, inputAtomData.coordinates, inputAtomData.charges, box);
+                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, mode.first, nullptr, inputAtomData.coordinates, inputAtomData.charges, box);
 
                 /* Setting some more inputs */
                 pmeSetRealGrid(pmeSafe.get(), mode.first, nonZeroGridValues);
index c5c3b8ea1d2ae5175e49b8cf8dcda05c0b6cdc54..0163781a4d3a8c7294ea442efb70036000276044 100644 (file)
@@ -128,7 +128,7 @@ class PmeSolveTest : public ::testing::TestWithParam<SolveInputParameters>
                                                   ));
 
                         /* Running the test */
-                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, box, ewaldCoeff_q, ewaldCoeff_lj);
+                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, mode.first, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj);
                         pmeSetComplexGrid(pmeSafe.get(), mode.first, gridOrdering.first, nonZeroGridValues);
                         const real     cellVolume = box[0] * box[4] * box[8];
                         //FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
index c751ef1ebfb59815f55737735b3a9e99d41dea18..e6352d0c41a177096b807fae7e83e830b05391b5 100644 (file)
@@ -103,96 +103,134 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
             inputRec.nkz         = gridSize[ZZ];
             inputRec.pme_order   = pmeOrder;
             inputRec.coulombtype = eelPME;
+            inputRec.epsilon_r   = 1.0;
 
             TestReferenceData                                      refData;
 
-            const std::map<CodePath, std::string>                  modesToTest   = {{CodePath::CPU, "CPU"}};
+            const std::map<CodePath, std::string>                  modesToTest   = {{CodePath::CPU, "CPU"},
+                                                                                    {CodePath::CUDA, "CUDA"}};
+
             const std::map<PmeSplineAndSpreadOptions, std::string> optionsToTest = {{PmeSplineAndSpreadOptions::SplineAndSpreadUnified, "spline computation and charge spreading (fused)"},
                                                                                     {PmeSplineAndSpreadOptions::SplineOnly, "spline computation"},
                                                                                     {PmeSplineAndSpreadOptions::SpreadOnly, "charge spreading"}};
+
+            // There is a subtle problem with multiple comparisons against the same reference data:
+            // at one point the subsequent (GPU) spreading runs did not actually copy the output grid into the proper buffer,
+            // but the reference data had already been marked as checked (hasBeenChecked_) by the CPU run, so nothing failed.
+            // For now we manually track that the count of the grid entries is the same on each run.
+            // This is just a hack for a single specific output, though.
+            // A better solution (TODO) would be to split the different codepaths into separate tests,
+            // while making them use the same reference files.
+            bool   gridValuesSizeAssigned = false;
+            size_t previousGridValuesSize = 0;
+
             for (const auto &mode : modesToTest)
             {
-                for (const auto &option : optionsToTest)
+                const bool supportedInput = pmeSupportsInputForMode(&inputRec, mode.first);
+                if (!supportedInput)
                 {
-                    /* Describing the test uniquely in case it fails */
-
-                    SCOPED_TRACE(formatString("Testing %s with %s for PME grid size %d %d %d"
-                                              ", order %d, %zu atoms",
-                                              option.second.c_str(), mode.second.c_str(),
-                                              gridSize[XX], gridSize[YY], gridSize[ZZ],
-                                              pmeOrder,
-                                              atomCount));
+                    /* Testing that the unsupported input causes an exception */
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, mode.first, nullptr, coordinates, charges, box), NotImplementedError);
+                    continue;
+                }
 
-                    /* Running the test */
+                const auto contextsToTest = pmeEnv->getHardwareContexts(mode.first);
+                for (const auto &context : contextsToTest)
+                {
+                    for (const auto &option : optionsToTest)
+                    {
+                        /* Describing the test uniquely in case it fails */
 
-                    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, coordinates, charges, box);
+                        SCOPED_TRACE(formatString("Testing %s with %s %sfor PME grid size %d %d %d"
+                                                  ", order %d, %zu atoms",
+                                                  option.second.c_str(), mode.second.c_str(),
+                                                  context.getDescription().c_str(),
+                                                  gridSize[XX], gridSize[YY], gridSize[ZZ],
+                                                  pmeOrder,
+                                                  atomCount));
 
-                    const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
-                    const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
+                        /* Running the test */
 
-                    if (!computeSplines)
-                    {
-                        // Here we should set up the results of the spline computation so that the spread can run.
-                        // What is lazy and works is running the separate spline so that it will set it up for us:
-                        pmePerformSplineAndSpread(pmeSafe.get(), mode.first, true, false);
-                        // We know that it is tested in another iteration.
-                        // TODO: Clean alternative: read and set the reference gridline indices, spline params
-                    }
+                        PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, mode.first, context.getDeviceInfo(), coordinates, charges, box);
 
-                    pmePerformSplineAndSpread(pmeSafe.get(), mode.first, computeSplines, spreadCharges);
+                        const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
+                        const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
 
-                    /* Outputs correctness check */
-                    /* All tolerances were picked empirically for single precision on CPU */
+                        if (!computeSplines)
+                        {
+                            // Here we should set up the results of the spline computation so that the spread can run.
+                            // A lazy but working approach is to run the spline computation separately, so that it sets this up for us:
+                            pmePerformSplineAndSpread(pmeSafe.get(), mode.first, true, false);
+                            // We know that it is tested in another iteration.
+                            // TODO: Clean alternative: read and set the reference gridline indices, spline params
+                        }
 
-                    TestReferenceChecker rootChecker(refData.rootChecker());
+                        pmePerformSplineAndSpread(pmeSafe.get(), mode.first, computeSplines, spreadCharges);
+                        pmeFinalizeTest(pmeSafe.get(), mode.first);
 
-                    const auto           maxGridSize              = std::max(std::max(gridSize[XX], gridSize[YY]), gridSize[ZZ]);
-                    const auto           ulpToleranceSplineValues = 2 * (pmeOrder - 2) * maxGridSize;
-                    /* 2 is empiric, the rest follows from the amount of operations */
+                        /* Outputs correctness check */
+                        /* All tolerances were picked empirically for single precision on CPU */
 
-                    if (computeSplines)
-                    {
-                        const char *dimString[] = { "X", "Y", "Z" };
+                        TestReferenceChecker rootChecker(refData.rootChecker());
 
-                        /* Spline values */
-                        SCOPED_TRACE(formatString("Testing spline values with tolerance of %ld", ulpToleranceSplineValues));
-                        TestReferenceChecker splineValuesChecker(rootChecker.checkCompound("Splines", "Values"));
-                        splineValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineValues));
-                        for (int i = 0; i < DIM; i++)
-                        {
-                            auto splineValuesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Values, i);
-                            splineValuesChecker.checkSequence(splineValuesDim.begin(), splineValuesDim.end(), dimString[i]);
-                        }
+                        const auto           maxGridSize              = std::max(std::max(gridSize[XX], gridSize[YY]), gridSize[ZZ]);
+                        const auto           ulpToleranceSplineValues = 2 * (pmeOrder - 2) * maxGridSize;
+                        /* 2 is empirical; the rest follows from the number of operations */
 
-                        /* Spline derivatives */
-                        const auto ulpToleranceSplineDerivatives = 4 * ulpToleranceSplineValues;
-                        /* 4 is just a wild guess since the derivatives are deltas of neighbor spline values which could differ greatly */
-                        SCOPED_TRACE(formatString("Testing spline derivatives with tolerance of %ld", ulpToleranceSplineDerivatives));
-                        TestReferenceChecker splineDerivativesChecker(rootChecker.checkCompound("Splines", "Derivatives"));
-                        splineDerivativesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineDerivatives));
-                        for (int i = 0; i < DIM; i++)
+                        if (computeSplines)
                         {
-                            auto splineDerivativesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Derivatives, i);
-                            splineDerivativesChecker.checkSequence(splineDerivativesDim.begin(), splineDerivativesDim.end(), dimString[i]);
+                            const char *dimString[] = { "X", "Y", "Z" };
+
+                            /* Spline values */
+                            SCOPED_TRACE(formatString("Testing spline values with tolerance of %d", ulpToleranceSplineValues));
+                            TestReferenceChecker splineValuesChecker(rootChecker.checkCompound("Splines", "Values"));
+                            splineValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineValues));
+                            for (int i = 0; i < DIM; i++)
+                            {
+                                auto splineValuesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Values, i);
+                                splineValuesChecker.checkSequence(splineValuesDim.begin(), splineValuesDim.end(), dimString[i]);
+                            }
+
+                            /* Spline derivatives */
+                            const auto ulpToleranceSplineDerivatives = 4 * ulpToleranceSplineValues;
+                            /* 4 is just a wild guess, since the derivatives are deltas of neighboring spline values which could differ greatly */
+                            SCOPED_TRACE(formatString("Testing spline derivatives with tolerance of %d", ulpToleranceSplineDerivatives));
+                            TestReferenceChecker splineDerivativesChecker(rootChecker.checkCompound("Splines", "Derivatives"));
+                            splineDerivativesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineDerivatives));
+                            for (int i = 0; i < DIM; i++)
+                            {
+                                auto splineDerivativesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Derivatives, i);
+                                splineDerivativesChecker.checkSequence(splineDerivativesDim.begin(), splineDerivativesDim.end(), dimString[i]);
+                            }
+
+                            /* Particle gridline indices */
+                            auto gridLineIndices = pmeGetGridlineIndices(pmeSafe.get(), mode.first);
+                            rootChecker.checkSequence(gridLineIndices.begin(), gridLineIndices.end(), "Gridline indices");
                         }
 
-                        /* Particle gridline indices */
-                        auto gridLineIndices = pmeGetGridlineIndices(pmeSafe.get(), mode.first);
-                        rootChecker.checkSequence(gridLineIndices.begin(), gridLineIndices.end(), "Gridline indices");
-                    }
-
-                    if (spreadCharges)
-                    {
-                        /* The wrapped grid */
-                        SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pmeSafe.get(), mode.first);
-                        TestReferenceChecker       gridValuesChecker(rootChecker.checkCompound("NonZeroGridValues", "RealSpaceGrid"));
-                        const auto                 ulpToleranceGrid = 2 * ulpToleranceSplineValues * (int)(ceil(sqrt(atomCount)));
-                        /* 2 is empiric; sqrt(atomCount) assumes all the input charges may spread onto the same cell */
-                        SCOPED_TRACE(formatString("Testing grid values with tolerance of %ld", ulpToleranceGrid));
-                        gridValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceGrid));
-                        for (const auto &point : nonZeroGridValues)
+                        if (spreadCharges)
                         {
-                            gridValuesChecker.checkReal(point.second, point.first.c_str());
+                            /* The wrapped grid */
+                            SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pmeSafe.get(), mode.first);
+                            TestReferenceChecker       gridValuesChecker(rootChecker.checkCompound("NonZeroGridValues", "RealSpaceGrid"));
+                            const auto                 ulpToleranceGrid = 2 * ulpToleranceSplineValues * (int)(ceil(sqrt(atomCount)));
+                            /* 2 is empirical; sqrt(atomCount) assumes all the input charges may spread onto the same cell */
+                            SCOPED_TRACE(formatString("Testing grid values with tolerance of %d", ulpToleranceGrid));
+                            if (!gridValuesSizeAssigned)
+                            {
+                                previousGridValuesSize = nonZeroGridValues.size();
+                                gridValuesSizeAssigned = true;
+                            }
+                            else
+                            {
+                                EXPECT_EQ(previousGridValuesSize, nonZeroGridValues.size());
+                            }
+
+                            gridValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceGrid));
+                            for (const auto &point : nonZeroGridValues)
+                            {
+                                gridValuesChecker.checkReal(point.second, point.first.c_str());
+                            }
                         }
                     }
                 }
@@ -201,7 +239,7 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
 };
 
 
-/*! \brief Test for PME B-spline moduli computation */
+/*! \brief Test for spline parameter computation and charge spreading. */
 TEST_P(PmeSplineAndSpreadTest, ReproducesOutputs)
 {
     EXPECT_NO_THROW(runTest());
@@ -210,7 +248,7 @@ TEST_P(PmeSplineAndSpreadTest, ReproducesOutputs)
 /* Valid input instances */
 
 //! A couple of valid inputs for boxes.
-static std::vector<Matrix3x3> const sampleBoxes
+static std::vector<Matrix3x3> const c_sampleBoxes
 {
     // normal box
     Matrix3x3 {{
@@ -227,7 +265,7 @@ static std::vector<Matrix3x3> const sampleBoxes
 };
 
 //! A couple of valid inputs for grid sizes.
-static std::vector<IVec> const sampleGridSizes
+static std::vector<IVec> const c_sampleGridSizes
 {
     IVec {
         16, 12, 14
@@ -238,19 +276,19 @@ static std::vector<IVec> const sampleGridSizes
 };
 
 //! Random charges
-static std::vector<real> const sampleChargesFull
+static std::vector<real> const c_sampleChargesFull
 {
     4.95f, 3.11f, 3.97f, 1.08f, 2.09f, 1.1f, 4.13f, 3.31f, 2.8f, 5.83f, 5.09f, 6.1f, 2.86f, 0.24f, 5.76f, 5.19f, 0.72f
 };
 //! 1 charge
-static auto const sampleCharges1 = ChargesVector::fromVector(sampleChargesFull.begin(), sampleChargesFull.begin() + 1);
+static auto const c_sampleCharges1 = ChargesVector::fromVector(c_sampleChargesFull.begin(), c_sampleChargesFull.begin() + 1);
 //! 2 charges
-static auto const sampleCharges2 = ChargesVector::fromVector(sampleChargesFull.begin() + 1, sampleChargesFull.begin() + 3);
+static auto const c_sampleCharges2 = ChargesVector::fromVector(c_sampleChargesFull.begin() + 1, c_sampleChargesFull.begin() + 3);
 //! 13 charges
-static auto const sampleCharges13 = ChargesVector::fromVector(sampleChargesFull.begin() + 3, sampleChargesFull.begin() + 16);
+static auto const c_sampleCharges13 = ChargesVector::fromVector(c_sampleChargesFull.begin() + 3, c_sampleChargesFull.begin() + 16);
 
 //! Random coordinate vectors
-static CoordinatesVector const sampleCoordinatesFull
+static CoordinatesVector const c_sampleCoordinatesFull
 {
     {
         5.59f, 1.37f, 0.95f
@@ -289,33 +327,34 @@ static CoordinatesVector const sampleCoordinatesFull
     }
 };
 //! 1 coordinate vector
-static CoordinatesVector const sampleCoordinates1(sampleCoordinatesFull.begin(), sampleCoordinatesFull.begin() + 1);
+static CoordinatesVector const c_sampleCoordinates1(c_sampleCoordinatesFull.begin(), c_sampleCoordinatesFull.begin() + 1);
 //! 2 coordinate vectors
-static CoordinatesVector const sampleCoordinates2(sampleCoordinatesFull.begin() + 1, sampleCoordinatesFull.begin() + 3);
+static CoordinatesVector const c_sampleCoordinates2(c_sampleCoordinatesFull.begin() + 1, c_sampleCoordinatesFull.begin() + 3);
 //! 13 coordinate vectors
-static CoordinatesVector const sampleCoordinates13(sampleCoordinatesFull.begin() + 3, sampleCoordinatesFull.begin() + 16);
+static CoordinatesVector const c_sampleCoordinates13(c_sampleCoordinatesFull.begin() + 3, c_sampleCoordinatesFull.begin() + 16);
 
 //! moved out from instantiations for readability
-auto inputBoxes     = ::testing::ValuesIn(sampleBoxes);
+auto c_inputBoxes     = ::testing::ValuesIn(c_sampleBoxes);
 //! moved out from instantiations for readability
-auto inputPmeOrders = ::testing::Range(3, 5 + 1);
+auto c_inputPmeOrders = ::testing::Range(3, 5 + 1);
 //! moved out from instantiations for readability
-auto inputGridSizes = ::testing::ValuesIn(sampleGridSizes);
+auto c_inputGridSizes = ::testing::ValuesIn(c_sampleGridSizes);
 
-/*! \brief Instantiation of the PME spline computation test with valid input and 1 atom */
-INSTANTIATE_TEST_CASE_P(SaneInput1, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
-                                                                                   ::testing::Values(sampleCoordinates1),
-                                                                                   ::testing::Values(sampleCharges1)
+/*! \brief Instantiation of the test with valid input and 1 atom */
+INSTANTIATE_TEST_CASE_P(SaneInput1, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+                                                                                   ::testing::Values(c_sampleCoordinates1),
+                                                                                   ::testing::Values(c_sampleCharges1)
                                                                                ));
-/*! \brief Instantiation of the PME spline computation test with valid input and 2 atoms */
-INSTANTIATE_TEST_CASE_P(SaneInput2, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
-                                                                                   ::testing::Values(sampleCoordinates2),
-                                                                                   ::testing::Values(sampleCharges2)
+
+/*! \brief Instantiation of the test with valid input and 2 atoms */
+INSTANTIATE_TEST_CASE_P(SaneInput2, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+                                                                                   ::testing::Values(c_sampleCoordinates2),
+                                                                                   ::testing::Values(c_sampleCharges2)
                                                                                ));
-/*! \brief Instantiation of the PME spline computation test with valid input and 13 atoms */
-INSTANTIATE_TEST_CASE_P(SaneInput13, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
-                                                                                    ::testing::Values(sampleCoordinates13),
-                                                                                    ::testing::Values(sampleCharges13)
+/*! \brief Instantiation of the test with valid input and 13 atoms */
+INSTANTIATE_TEST_CASE_P(SaneInput13, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+                                                                                    ::testing::Values(c_sampleCoordinates13),
+                                                                                    ::testing::Values(c_sampleCharges13)
                                                                                 ));
 }
 }
index d1cafc59d46d7a00f8fc9fa9e11760249fe08fa2..c44599542e47e8badf85ccfa0b074c90392f0277 100644 (file)
@@ -45,8 +45,8 @@
 
 #include <cstring>
 
-#include "gromacs/ewald/pme.h"
 #include "gromacs/ewald/pme-gather.h"
+#include "gromacs/ewald/pme-gpu-internal.h"
 #include "gromacs/ewald/pme-grid.h"
 #include "gromacs/ewald/pme-internal.h"
 #include "gromacs/ewald/pme-solve.h"
@@ -65,8 +65,29 @@ namespace gmx
 namespace test
 {
 
+bool pmeSupportsInputForMode(const t_inputrec *inputRec, CodePath mode)
+{
+    bool implemented;
+    switch (mode)
+    {
+        case CodePath::CPU:
+            implemented = true;
+            break;
+
+        case CodePath::CUDA:
+            implemented = pme_gpu_supports_input(inputRec, nullptr);
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
+    return implemented;
+}
+
 //! PME initialization - internal
 static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
+                                      CodePath                  mode,
+                                      gmx_device_info_t        *gpuInfo,
                                       size_t                    atomCount,
                                       const Matrix3x3          &box,
                                       real                      ewaldCoeff_q = 1.0f,
@@ -75,10 +96,10 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
 {
     gmx_pme_t     *pmeDataRaw = nullptr;
     const MDLogger dummyLogger;
-    const auto     runMode       = PmeRunMode::CPU;
+    const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::GPU;
     t_commrec      dummyCommrec  = {0};
     gmx_pme_init(&pmeDataRaw, &dummyCommrec, 1, 1, inputRec, atomCount, false, false, true,
-                 ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, nullptr, dummyLogger);
+                 ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, dummyLogger);
     PmeSafePointer pme(pmeDataRaw); // taking ownership
 
     // TODO get rid of this with proper matrix type
@@ -92,24 +113,40 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
     }
     const char *boxError = check_box(-1, boxTemp);
     GMX_RELEASE_ASSERT(boxError == nullptr, boxError);
-    invertBoxMatrix(boxTemp, pme->recipbox);
+
+    switch (mode)
+    {
+        case CodePath::CPU:
+            invertBoxMatrix(boxTemp, pme->recipbox);
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_update_input_box(pme->gpu, boxTemp);
+            break;
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
 
     return pme;
 }
 
 //! Simple PME initialization based on input, no atom data
 PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
+                            CodePath                  mode,
+                            gmx_device_info_t        *gpuInfo,
                             const Matrix3x3          &box,
                             real                      ewaldCoeff_q,
                             real                      ewaldCoeff_lj
                             )
 {
-    return pmeInitInternal(inputRec, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
+    return pmeInitInternal(inputRec, mode, gpuInfo, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
     // hiding the fact that PME actually needs to know the number of atoms in advance
 }
 
 //! PME initialization with atom data
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
+                            CodePath                  mode,
+                            gmx_device_info_t        *gpuInfo,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
@@ -117,11 +154,28 @@ PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
 {
     const size_t    atomCount = coordinates.size();
     GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
-    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, atomCount, box);
-    pme_atomcomm_t *atc     = &(pmeSafe->atc[0]);
-    atc->x           = const_cast<rvec *>(as_rvec_array(coordinates.data()));
-    atc->coefficient = const_cast<real *>(charges.data());
-    /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
+    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, atomCount, box);
+    pme_atomcomm_t *atc     = nullptr;
+
+    switch (mode)
+    {
+        case CodePath::CPU:
+            atc              = &(pmeSafe->atc[0]);
+            atc->x           = const_cast<rvec *>(as_rvec_array(coordinates.data()));
+            atc->coefficient = const_cast<real *>(charges.data());
+            /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_set_testing(pmeSafe->gpu, true);
+            gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data());
+            pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data()));
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
+
     return pmeSafe;
 }
 
@@ -134,12 +188,25 @@ static real *pmeGetRealGridInternal(const gmx_pme_t *pme)
 
 //! Getting local PME real grid dimensions
 static void pmeGetRealGridSizesInternal(const gmx_pme_t      *pme,
+                                        CodePath              mode,
                                         IVec                 &gridSize,
                                         IVec                 &paddedGridSize)
 {
     const size_t gridIndex = 0;
     IVec         gridOffsetUnused;
-    gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
+    switch (mode)
+    {
+        case CodePath::CPU:
+            gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_get_real_grid_sizes(pme->gpu, &gridSize, &paddedGridSize);
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
 }
 
 //! Getting local PME complex grid pointer for test I/O
@@ -160,28 +227,28 @@ static void pmeGetComplexGridSizesInternal(const gmx_pme_t      *pme,
 }
 
 //! Getting the PME grid memory buffer and its sizes - template definition
-template<typename ValueType> static void pmeGetGridAndSizesInternal(const gmx_pme_t *, ValueType * &, IVec &, IVec &)
+template<typename ValueType> static void pmeGetGridAndSizesInternal(const gmx_pme_t *, CodePath, ValueType * &, IVec &, IVec &)
 {
     GMX_THROW(InternalError("Deleted function call"));
     // explicitly deleting general template does not compile in clang/icc, see https://llvm.org/bugs/show_bug.cgi?id=17537
 }
 
 //! Getting the PME real grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<real>(const gmx_pme_t *pme, real * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<> void pmeGetGridAndSizesInternal<real>(const gmx_pme_t *pme, CodePath mode, real * &grid, IVec &gridSize, IVec &paddedGridSize)
 {
     grid = pmeGetRealGridInternal(pme);
-    pmeGetRealGridSizesInternal(pme, gridSize, paddedGridSize);
+    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSize);
 }
 
 //! Getting the PME complex grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t *pme, t_complex * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<> void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t *pme, CodePath, t_complex * &grid, IVec &gridSize, IVec &paddedGridSize)
 {
     grid = pmeGetComplexGridInternal(pme);
     pmeGetComplexGridSizesInternal(pme, gridSize, paddedGridSize);
 }
 
 //! PME spline calculation and charge spreading
-void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qualifiers
+void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qualifiers elsewhere
                                bool computeSplines, bool spreadCharges)
 {
     GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
@@ -189,6 +256,7 @@ void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qual
     const size_t    gridIndex                    = 0;
     const bool      computeSplinesForZeroCharges = true;
     real           *fftgrid                      = spreadCharges ? pme->fftgrid[gridIndex] : nullptr;
+    real           *pmegrid                      = pme->pmegrid[gridIndex].grid.grid;
 
     switch (mode)
     {
@@ -197,11 +265,15 @@ void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qual
                            fftgrid, computeSplinesForZeroCharges, gridIndex);
             if (spreadCharges && !pme->bUseThreads)
             {
-                wrap_periodic_pmegrid(pme, pme->pmegrid[gridIndex].grid.grid);
-                copy_pmegrid_to_fftgrid(pme, pme->pmegrid[gridIndex].grid.grid, fftgrid, gridIndex);
+                wrap_periodic_pmegrid(pme, pmegrid);
+                copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex);
             }
             break;
 
+        case CodePath::CUDA:
+            pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+            break;
+
         default:
             GMX_THROW(InternalError("Test not implemented for this mode"));
     }
@@ -298,6 +370,23 @@ void pmePerformGather(gmx_pme_t *pme, CodePath mode,
     }
 }
 
+//! PME test finalization before fetching the outputs
+void pmeFinalizeTest(const gmx_pme_t *pme, CodePath mode)
+{
+    switch (mode)
+    {
+        case CodePath::CPU:
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_synchronize(pme->gpu);
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
+}
+
 //! Setting atom spline values/derivatives to be used in spread/gather
 void pmeSetSplineData(const gmx_pme_t *pme, CodePath mode,
                       const SplineParamsDimVector &splineValues, PmeSplineDataType type, int dimIndex)
@@ -329,7 +418,7 @@ void pmeSetGridLineIndices(const gmx_pme_t *pme, CodePath mode,
     GMX_RELEASE_ASSERT(atomCount == gridLineIndices.size(), "Mismatch in gridline indices size");
 
     IVec paddedGridSizeUnused, gridSize;
-    pmeGetRealGridSizesInternal(pme, gridSize, paddedGridSizeUnused);
+    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);
 
     for (const auto &index : gridLineIndices)
     {
@@ -380,7 +469,7 @@ static void pmeSetGridInternal(const gmx_pme_t *pme, CodePath mode,
 {
     IVec       gridSize, paddedGridSize;
     ValueType *grid;
-    pmeGetGridAndSizesInternal<ValueType>(pme, grid, gridSize, paddedGridSize);
+    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
 
     switch (mode)
     {
@@ -431,6 +520,10 @@ SplineParamsDimVector pmeGetSplineData(const gmx_pme_t *pme, CodePath mode,
     SplineParamsDimVector    result;
     switch (mode)
     {
+        case CodePath::CUDA:
+            pme_gpu_transform_spline_atom_data_for_host(pme->gpu, atc, type, dimIndex);
+        // fallthrough
+
         case CodePath::CPU:
             result = SplineParamsDimVector::fromArray(sourceBuffer, dimSize);
             break;
@@ -451,6 +544,10 @@ GridLineIndicesVector pmeGetGridlineIndices(const gmx_pme_t *pme, CodePath mode)
     GridLineIndicesVector gridLineIndices;
     switch (mode)
     {
+        case CodePath::CUDA:
+            gridLineIndices = GridLineIndicesVector::fromArray(reinterpret_cast<IVec *>(pme->gpu->staging.h_gridlineIndices), atomCount);
+            break;
+
         case CodePath::CPU:
             gridLineIndices = GridLineIndicesVector::fromArray(reinterpret_cast<IVec *>(atc->idx), atomCount);
             break;
@@ -467,10 +564,11 @@ static SparseGridValuesOutput<ValueType> pmeGetGridInternal(const gmx_pme_t *pme
 {
     IVec       gridSize, paddedGridSize;
     ValueType *grid;
-    pmeGetGridAndSizesInternal<ValueType>(pme, grid, gridSize, paddedGridSize);
+    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
     SparseGridValuesOutput<ValueType> gridValues;
     switch (mode)
     {
+        case CodePath::CUDA: // intentional absence of break
         case CodePath::CPU:
             gridValues.clear();
             for (int ix = 0; ix < gridSize[XX]; ix++)
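
As an illustration of the extended test API above, a minimal CUDA-path sketch (assuming a 'context' obtained from pmeEnv->getHardwareContexts(CodePath::CUDA) and an input that passes pmeSupportsInputForMode()):

    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, CodePath::CUDA, context.getDeviceInfo(),
                                          coordinates, charges, box);
    pmePerformSplineAndSpread(pmeSafe.get(), CodePath::CUDA, true, true); // spline computation + charge spreading
    pmeFinalizeTest(pmeSafe.get(), CodePath::CUDA);                       // synchronize the GPU before fetching outputs
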
index e3bd8e810aed97273c92cae8171f02f7a3372e8b..0d89a98a1e0d817478006cb470d707d0e88db975 100644 (file)
 #include <map>
 #include <vector>
 
+#include <gtest/gtest.h>
+
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-internal.h"
 #include "gromacs/math/gmxcomplex.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/unique_cptr.h"
 
-struct t_inputrec;
+#include "testhardwarecontexts.h"
 
 namespace gmx
 {
@@ -69,12 +72,6 @@ typedef std::vector<RVec> CoordinatesVector;
 typedef ArrayRef<RVec> ForcesVector;
 //! Gridline indices
 typedef ConstArrayRef<IVec> GridLineIndicesVector;
-//! Type of spline data
-enum class PmeSplineDataType
-{
-    Values,      // theta
-    Derivatives, // dtheta
-};
 /*! \brief Spline parameters (theta or dtheta).
  * A reference to a single dimension's spline data; this means (atomCount * pmeOrder) values or derivatives.
  */
@@ -97,11 +94,6 @@ typedef SparseGridValuesOutput<real> SparseRealGridValuesOutput;
 typedef SparseGridValuesOutput<t_complex> SparseComplexGridValuesOutput;
 //! TODO: make proper C++ matrix for the whole Gromacs, get rid of this
 typedef std::array<real, DIM * DIM> Matrix3x3;
-//! PME code path being tested
-enum class CodePath
-{
-    CPU,    // serial CPU code
-};
 //! PME gathering input forces treatment
 enum class PmeGatherInputHandling
 {
@@ -123,14 +115,24 @@ enum class GridOrdering
 //! PME solver results - reciprocal energy and virial
 typedef std::tuple<real, Matrix3x3> PmeSolveOutput;
 
+// Misc.
+
+//! Tells if this generally valid PME input is supported for this mode
+bool pmeSupportsInputForMode(const t_inputrec *inputRec, CodePath mode);
+
 // PME stages
 
+// TODO: currently PME initializations do not store CodePath. They probably should (unless mixed CPU-GPU execution is ever needed?).
 //! Simple PME initialization (no atom data)
 PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
+                            CodePath mode = CodePath::CPU,
+                            gmx_device_info_t *gpuInfo = nullptr,
                             const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
                             real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
-//! PME initialization with atom data and system box; lacks Ewald coefficients
+//! PME initialization with atom data and system box
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
+                            CodePath                  mode,
+                            gmx_device_info_t        *gpuInfo,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
@@ -145,6 +147,8 @@ void pmePerformSolve(const gmx_pme_t *pme, CodePath mode,
 //! PME force gathering
 void pmePerformGather(gmx_pme_t *pme, CodePath mode,
                       PmeGatherInputHandling inputTreatment, ForcesVector &forces);
+//! PME test finalization before fetching the outputs
+void pmeFinalizeTest(const gmx_pme_t *pme, CodePath mode);
 
 // PME state setters
 
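
Since the added mode and device arguments of pmeInitEmpty() are defaulted, existing CPU-only call sites keep working unchanged, while GPU-aware callers pass them explicitly. A brief sketch of both forms (assuming an inputRec, box, Ewald coefficients and a gpuInfo pointer are in scope):

    PmeSafePointer cpuPme = pmeInitEmpty(&inputRec);   // defaults: CodePath::CPU, no device info
    PmeSafePointer gpuPme = pmeInitEmpty(&inputRec, CodePath::CUDA, gpuInfo,
                                         box, ewaldCoeff_q, ewaldCoeff_lj);
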
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
new file mode 100644 (file)
index 0000000..762e4d4
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Implements test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "testhardwarecontexts.h"
+
+#include "config.h"
+
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/taskassignment/hardwareassign.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/loggerbuilder.h"
+
+namespace gmx
+{
+namespace test
+{
+
+//! This constructs the test environment
+PmeTestEnvironment * const pmeEnv = (PmeTestEnvironment *)::testing::AddGlobalTestEnvironment(new PmeTestEnvironment);
+
+//! Simple hardware initialization
+void PmeTestEnvironment::hardwareInit()
+{
+    unique_cptr<t_commrec, done_commrec> commrec(init_commrec());
+    gmx_init_intranode_counters(commrec.get());
+    LoggerBuilder builder;
+    LoggerOwner   logOwner(builder.build());
+    MDLogger      log(logOwner.logger());
+    hardwareInfo_.reset(gmx_detect_hardware(log, commrec.get()));
+}
+
+void PmeTestEnvironment::SetUp()
+{
+    TestHardwareContext emptyContext("", nullptr);
+    hardwareContextsByMode_[CodePath::CPU].push_back(emptyContext);
+
+    hardwareInit();
+
+    // Constructing contexts for all compatible GPUs - will be empty on non-GPU builds
+    TestHardwareContexts gpuContexts;
+    const auto           compatibleGpus = getCompatibleGpus(hardwareInfo_->gpu_info);
+    for (int gpuIndex : compatibleGpus)
+    {
+        char        stmp[200] = {};
+        get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
+        std::string description = "(GPU " + std::string(stmp) + ") ";
+        auto       *gpuInfo     = reinterpret_cast<gmx_device_info_t *>(reinterpret_cast<char *>(hardwareInfo_->gpu_info.gpu_dev) + gpuIndex * sizeof_gpu_dev_info());
+        //TODO move previous line to gpu_utils and reuse in hardwareassign.cpp
+        gpuContexts.emplace_back(TestHardwareContext(description.c_str(), gpuInfo));
+    }
+#if GMX_GPU == GMX_GPU_CUDA
+    hardwareContextsByMode_[CodePath::CUDA] = gpuContexts;
+#endif
+}
+
+}
+}
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.h b/src/gromacs/ewald/tests/testhardwarecontexts.h
new file mode 100644 (file)
index 0000000..9fc2b51
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_TEST_HARDWARE_CONTEXTS_H
+#define GMX_EWALD_TEST_HARDWARE_CONTEXTS_H
+
+/*! \internal \file
+ * \brief
+ * Describes test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include <list>
+#include <map>
+
+#include <gtest/gtest.h>
+
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/gpu_hw_info.h"
+#include "gromacs/utility/unique_cptr.h"
+
+namespace gmx
+{
+namespace test
+{
+//! Hardware code path being tested
+enum class CodePath
+{
+    CPU,
+    CUDA
+};
+
+/*! \internal \brief
+ * A structure to describe a hardware context - an abstraction over
+ * gmx_device_info_t with a human-readable string.
+ * TODO: currently this does not know which CodePath it belongs to.
+ * It probably should! That would save us one loop in all the PME tests.
+ */
+struct TestHardwareContext
+{
+    //! Readable description
+    std::string        description_;
+    //! Device information pointer
+    gmx_device_info_t *deviceInfo_;
+
+    public:
+        //! Returns a human-readable context description line
+        std::string getDescription() const { return description_; }
+        //! Returns the device info pointer
+        gmx_device_info_t *getDeviceInfo() const { return deviceInfo_; }
+        //! Constructs the context
+        TestHardwareContext(const char *description, gmx_device_info_t *deviceInfo) : description_(description), deviceInfo_(deviceInfo) {}
+};
+
+//! A list of hardware contexts
+typedef std::list<TestHardwareContext> TestHardwareContexts;
+
+/*! \internal \brief
+ * This class performs one-time test initialization (enumerating the hardware)
+ */
+class PmeTestEnvironment : public ::testing::Environment
+{
+    private:
+        //! General hardware info
+        unique_cptr<gmx_hw_info_t, gmx_hardware_info_free> hardwareInfo_;
+        //! Storage of hardware contexts
+        std::map<CodePath, TestHardwareContexts>           hardwareContextsByMode_;
+        //! Simple GPU initialization, allowing for PME to work on GPU
+        void hardwareInit();
+
+    public:
+        //! Default destructor
+        ~PmeTestEnvironment() = default;
+        //! This is called by GTest framework once to query the hardware
+        void SetUp();
+        //! Get available hardware contexts for given code path
+        const TestHardwareContexts &getHardwareContexts(CodePath mode) { return hardwareContextsByMode_.at(mode); }
+};
+
+//! The test environment
+extern PmeTestEnvironment * const pmeEnv;
+}
+}
+#endif
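
A test that wants to cover every compatible device queries the environment for the contexts of a code path and iterates over them; a minimal usage sketch (error handling omitted):

    for (const auto &context : pmeEnv->getHardwareContexts(CodePath::CUDA))
    {
        SCOPED_TRACE("Testing on " + context.getDescription());
        gmx_device_info_t *deviceInfo = context.getDeviceInfo(); // non-null for GPU contexts
        // ... initialize PME with this device and run the stage under test ...
    }
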
index e96edb366f4cb06b0c7a362f4f70ab2f2b187407..2bba07dbae373cecff338a552d235dc23410bf25 100644 (file)
@@ -56,5 +56,65 @@ __device__ __forceinline__ T LDG(const T* ptr)
 #endif
 }
 
+/*! \brief Fetch the value by \p index from the texture object or reference.
+ * Fetching from the object is the preferred behaviour on CC >= 3.0.
+ *
+ * \tparam[in] T        Raw data type
+ * \param[in] texObj    Table texture object
+ * \param[in] texRef    Table texture reference
+ * \param[in] index     Non-negative element index
+ * \returns             The value from the table at \p index
+ */
+template <typename T>
+static __forceinline__ __device__
+T fetchFromTexture(const cudaTextureObject_t texObj,
+                   const struct texture<T, 1, cudaReadModeElementType> texRef,
+                   int index)
+{
+    assert(index >= 0);
+    assert(!c_disableCudaTextures);
+    T result;
+#if GMX_PTX_ARCH >= 300  // Preferring texture objects on any new arch
+    GMX_UNUSED_VALUE(texRef);
+    result = tex1Dfetch<T>(texObj, index);
+#else
+    GMX_UNUSED_VALUE(texObj);
+    result = tex1Dfetch(texRef, index);
+#endif
+    return result;
+}
+
+/*! \brief Fetch the value by \p index from the parameter lookup table.
+ *
+ *  Depending on what is supported, it fetches parameters either
+ *  using direct load, texture objects, or texture references.
+ *
+ * \tparam[in] T        Raw data type
+ * \param[in] d_ptr     Device pointer to the raw table memory
+ * \param[in] texObj    Table texture object
+ * \param[in] texRef    Table texture reference
+ * \param[in] index     Non-negative element index
+ * \returns             The value from the table at \p index
+ */
+template <typename T>
+static __forceinline__ __device__
+T fetchFromParamLookupTable(const T                  *d_ptr,
+                            const cudaTextureObject_t texObj,
+                            const struct texture<T, 1, cudaReadModeElementType> texRef,
+                            int index)
+{
+    assert(index >= 0);
+    T result;
+#if DISABLE_CUDA_TEXTURES
+    GMX_UNUSED_VALUE(texObj);
+    GMX_UNUSED_VALUE(texRef);
+    result = LDG(d_ptr + index);
+#else
+    GMX_UNUSED_VALUE(d_ptr);
+    result = fetchFromTexture<T>(texObj, texRef, index);
+#endif
+    return result;
+}
+
 
 #endif /* GMX_GPU_UTILS_CUDA_KERNEL_UTILS_CUH */
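
Inside a kernel, fetchFromParamLookupTable() is handed both the raw device pointer and the texture object/reference for the same table, so the build configuration decides which access path is used. A hedged sketch with illustrative names (not the actual PME kernel parameters):

    // File-scope texture reference, used only on the legacy (pre-CC 3.0) path
    texture<float, 1, cudaReadModeElementType> exampleTableTexRef;

    __global__ void exampleKernel(const float *d_table, cudaTextureObject_t tableTexObj, int tableSize)
    {
        const int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < tableSize)
        {
            // Resolves to a direct load, a texture object fetch, or a texture reference fetch
            const float value = fetchFromParamLookupTable(d_table, tableTexObj, exampleTableTexRef, index);
            // ... use 'value' ...
        }
    }
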
index 4f7cff946f1d58c6e7471028005dcdc3b389fe39..28316f8ba9f11b7b04c21ed484a12e37a3702642 100644 (file)
@@ -331,3 +331,5 @@ void destroyParamLookupTable(T                       *d_ptr,
  */
 template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const texture<float, 1, cudaReadModeElementType> *, const float *, int, const gmx_device_info_t *);
 template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const texture<float, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
+template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const texture<int, 1, cudaReadModeElementType> *, const int *, int, const gmx_device_info_t *);
+template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const texture<int, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
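
The added int instantiations let integer tables be set up the same way as the float ones. A hedged host-side sketch with hypothetical names, matching the instantiated signatures (indexTableTexRef is assumed to be a file-scope texture<int, 1, cudaReadModeElementType>, hostIndexTable a std::vector<int>):

    int                *d_indexTable = nullptr; // device buffer, allocated and filled by the call
    cudaTextureObject_t indexTexObj  = 0;       // texture object, initialized where supported
    initParamLookupTable<int>(d_indexTable, indexTexObj, &indexTableTexRef,
                              hostIndexTable.data(), static_cast<int>(hostIndexTable.size()), deviceInfo);
    // ... kernels read the table via fetchFromParamLookupTable() ...
    destroyParamLookupTable<int>(d_indexTable, indexTexObj, &indexTableTexRef, deviceInfo);
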
index 3402d20a4aeeb5a1893959119c0a69c7f7795325..bea1e4a0bd908df73a11cd8b899c413df7ca84e2 100644 (file)
@@ -157,7 +157,9 @@ void cu_realloc_buffered(void **d_dest, void *h_src,
                          bool bAsync);
 
 // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
-// GPU table object. We just need to add a templated __device__ table data fetching to complete it.
+// GPU table object. There is also an almost self-contained fetchFromParamLookupTable()
+// in cuda_kernel_utils.cuh. They could all live in a separate class/struct file,
+// provided that storing static texture references there does not pose problems.
 
 /*! \brief Initialize parameter lookup table.
  *
index 000f85766fea4de8eb92d8a50046819b578b12e5..ef9cb744bbbabd541c7b61a81d150e8b879a32cf 100644 (file)
@@ -204,14 +204,7 @@ static void exitUnlessUserGpuTaskAssignmentIsValid(const gmx_gpu_info_t   &gpu_i
     }
 }
 
-/*! \brief Filter the compatible GPUs
- *
- * This function filters gpu_info.gpu_dev for compatible GPUs based
- * on the previously run compatibility tests.
- *
- * \param[in]     gpu_info    Information detected about GPUs, including compatibility
- * \return                    vector of IDs of GPUs already recorded as compatible */
-static std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
+std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
 {
     // Possible minor over-allocation here, but not important for anything
     std::vector<int> compatibleGpus;
index d1793c8fe5496ac85dafa8acd03eb3d0f181458c..d8d3952bc9236766a4114461eb96c06648c6b3a0 100644 (file)
@@ -74,6 +74,15 @@ class MDLogger;
  */
 std::vector<int> parseGpuTaskAssignment(const std::string &gpuTaskAssignment);
 
+/*! \brief Filter the compatible GPUs
+ *
+ * This function filters gpu_info.gpu_dev for compatible GPUs based
+ * on the previously run compatibility tests.
+ *
+ * \param[in]     gpu_info    Information detected about GPUs, including compatibility
+ * \return                    vector of IDs of GPUs already recorded as compatible */
+std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info);
+
 /*! \brief Assign PP ranks to valid GPU IDs.
  *
  * Will return a validated mapping from PP ranks (ie tasks that can