PME spline+spread CUDA kernel and unit tests
author Aleksei Iupinov <a.yupinov@gmail.com>
Mon, 5 Dec 2016 16:36:09 +0000 (17:36 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 11 Oct 2017 11:49:12 +0000 (13:49 +0200)
The CUDA implementation of PME spline computation and charge spreading
for PME order 4 is added in pme-spread.cu.

The unit tests for the PME CPU spline/spread stages
(e8cf7c0) are also extended to work with
the PME CUDA kernel, using the same reference data.
The tests iterate over all CUDA GPUs that are compatible with GROMACS.
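
For reference, a minimal standalone sketch (not part of this change) of the
index mapping that the tests rely on when comparing the warp-interleaved GPU
spline output against the CPU-layout reference data; the order, atoms-per-warp
value and index formula follow pme_gpu_transform_spline_atom_data_for_host()
below, and the helper name is hypothetical:

    #include <vector>

    // Converts one dimension of warp-interleaved GPU spline data (theta/dtheta)
    // into the per-atom CPU layout used by the reference data.
    // Assumes PME order 4 and 2 atoms per warp (warp_size / order^2).
    std::vector<float> gpuSplineDataToCpuLayout(const std::vector<float> &gpuData,
                                                int atomCount, int dimIndex,
                                                int order = 4, int atomsPerWarp = 2)
    {
        constexpr int dim = 3; // DIM
        std::vector<float> cpuData(atomCount * order);
        for (int atomIndex = 0; atomIndex < atomCount; atomIndex++)
        {
            const int atomWarpIndex = atomIndex % atomsPerWarp;
            const int warpIndex     = atomIndex / atomsPerWarp;
            for (int orderIndex = 0; orderIndex < order; orderIndex++)
            {
                // The GPU layout interleaves the atoms of a warp for coalesced access
                const int gpuIndex = ((order * warpIndex + orderIndex) * dim + dimIndex) * atomsPerWarp
                                     + atomWarpIndex;
                cpuData[atomIndex * order + orderIndex] = gpuData[gpuIndex];
            }
        }
        return cpuData;
    }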

Refs #2054, #2092.

Change-Id: If5ec49f030b9b94395db28fa454ea25c3efb05d1

21 files changed:
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-internal.h
src/gromacs/ewald/pme-gpu-types.h
src/gromacs/ewald/pme-gpu.cpp
src/gromacs/ewald/pme-spread.cu [new file with mode: 0644]
src/gromacs/ewald/pme.cu
src/gromacs/ewald/pme.cuh
src/gromacs/ewald/pme.h
src/gromacs/ewald/tests/CMakeLists.txt
src/gromacs/ewald/tests/pmegathertest.cpp
src/gromacs/ewald/tests/pmesolvetest.cpp
src/gromacs/ewald/tests/pmesplinespreadtest.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/pmetestcommon.h
src/gromacs/ewald/tests/testhardwarecontexts.cpp [new file with mode: 0644]
src/gromacs/ewald/tests/testhardwarecontexts.h [new file with mode: 0644]
src/gromacs/gpu_utils/cuda_kernel_utils.cuh
src/gromacs/gpu_utils/cudautils.cu
src/gromacs/gpu_utils/cudautils.cuh
src/gromacs/taskassignment/hardwareassign.cpp
src/gromacs/taskassignment/hardwareassign.h

index 3349b63b79c189c1c6ac1d4a38d5466e904180a4..165482d77ad381a72c07e0fc71635c4f400a8c1f 100644 (file)
@@ -222,12 +222,14 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t *pme)
         pmeGPU->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGPU->common->nk[i]);
     }
     const int cellCount = c_pmeNeighborUnitcellCount;
-    pmeGPU->common->fsh[XX].assign(pme->fshx, pme->fshx + cellCount * pme->nkx);
-    pmeGPU->common->fsh[YY].assign(pme->fshy, pme->fshy + cellCount * pme->nky);
-    pmeGPU->common->fsh[ZZ].assign(pme->fshz, pme->fshz + cellCount * pme->nkz);
-    pmeGPU->common->nn[XX].assign(pme->nnx, pme->nnx + cellCount * pme->nkx);
-    pmeGPU->common->nn[YY].assign(pme->nny, pme->nny + cellCount * pme->nky);
-    pmeGPU->common->nn[ZZ].assign(pme->nnz, pme->nnz + cellCount * pme->nkz);
+    pmeGPU->common->fsh.resize(0);
+    pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
+    pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
+    pmeGPU->common->fsh.insert(pmeGPU->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
+    pmeGPU->common->nn.resize(0);
+    pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
+    pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
+    pmeGPU->common->nn.insert(pmeGPU->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
     pmeGPU->common->runMode = pme->runMode;
 }
 
@@ -306,6 +308,8 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::
     /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
     pmeGPU->settings.performGPUGather = true;
 
+    pme_gpu_set_testing(pmeGPU, false);
+
     // GPU initialization
     init_gpu(mdlog, cr->nodeid, gpuInfo);
     pmeGPU->deviceInfo = gpuInfo;
@@ -322,6 +326,64 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::
     kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGPU->common->epsilon_r;
 }
 
+void pme_gpu_transform_spline_atom_data_for_host(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc, PmeSplineDataType type, int dimIndex)
+{
+    // The GPU atom spline data is currently laid out differently from the CPU data.
+    // This function converts the data from the GPU layout to the CPU layout. Obviously, it is slow.
+    // It is only intended for testing purposes so far.
+    // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their performance
+    // (e.g. spreading on GPU, gathering on CPU).
+    GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
+    const uintmax_t threadIndex  = 0;
+    const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGPU)->atoms.nAtoms;
+    const auto      atomsPerWarp = pme_gpu_get_atom_spline_data_alignment(pmeGPU);
+    const auto      pmeOrder     = pmeGPU->common->pme_order;
+
+    real           *h_splineBuffer;
+    float          *d_splineBuffer;
+    switch (type)
+    {
+        case PmeSplineDataType::Values:
+            h_splineBuffer = atc->spline[threadIndex].theta[dimIndex];
+            d_splineBuffer = pmeGPU->staging.h_theta;
+            break;
+
+        case PmeSplineDataType::Derivatives:
+            h_splineBuffer = atc->spline[threadIndex].dtheta[dimIndex];
+            d_splineBuffer = pmeGPU->staging.h_dtheta;
+            break;
+
+        default:
+            GMX_THROW(gmx::InternalError("Unknown spline data type"));
+    }
+
+    for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
+    {
+        auto atomWarpIndex = atomIndex % atomsPerWarp;
+        auto warpIndex     = atomIndex / atomsPerWarp;
+        for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
+        {
+            const auto gpuValueIndex = ((pmeOrder * warpIndex + orderIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex;
+            const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
+            GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder, "Atom spline data index out of bounds (while transforming GPU data layout for host)");
+            h_splineBuffer[cpuValueIndex] = d_splineBuffer[gpuValueIndex];
+        }
+    }
+}
+
+void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize)
+{
+    GMX_ASSERT(gridSize != nullptr, "");
+    GMX_ASSERT(paddedGridSize != nullptr, "");
+    GMX_ASSERT(pmeGPU != nullptr, "");
+    auto *kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGPU);
+    for (int i = 0; i < DIM; i++)
+    {
+        (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
+        (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
+    }
+}
+
 void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr)
 {
     if (!pme_gpu_active(pme))
index 03fe8de852d8bc87c8f8b008dc508102c48aa275..9cdb5568775862843531a3515a63db6982672fe8 100644 (file)
@@ -62,6 +62,13 @@ namespace gmx
 class MDLogger;
 }
 
+//! Type of spline data
+enum class PmeSplineDataType
+{
+    Values,      // theta
+    Derivatives, // dtheta
+};               //TODO move this into new and shiny pme.h (pme-types.h?)
+
 /* Some general constants for PME GPU behaviour follow. */
 
 /*! \brief \libinternal
@@ -328,6 +335,13 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_gather_grid(const pme_gpu_t *CUDA_FU
 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_grid(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU),
                                                          float           *CUDA_FUNC_ARGUMENT(h_grid)) CUDA_FUNC_TERM
 
+/*! \libinternal \brief
+ * Copies the spread output spline data and gridline indices from the GPU to the host.
+ *
+ * \param[in] pmeGPU   The PME GPU structure.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+
 /*! \libinternal \brief
  * Waits for the grid copying to the host-side buffer after spreading to finish.
  *
@@ -418,6 +432,25 @@ CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const pme_gpu_t *CUDA_FUNC_ARGUME
 CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const pme_gpu_t         *CUDA_FUNC_ARGUMENT(pmeGPU),
                                              gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
 
+/* The PME stages themselves */
+
+/*! \libinternal \brief
+ * A GPU spline computation and charge spreading function.
+ *
+ * \param[in]  pmeGpu          The PME GPU structure.
+ * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
+ * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
+ *                             e.g. testing or host-side FFT)
+ * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
+ * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_spread(const pme_gpu_t *CUDA_FUNC_ARGUMENT(pmeGpu),
+                                        int              CUDA_FUNC_ARGUMENT(gridIndex),
+                                        real            *CUDA_FUNC_ARGUMENT(h_grid),
+                                        bool             CUDA_FUNC_ARGUMENT(computeSplines),
+                                        bool             CUDA_FUNC_ARGUMENT(spreadCharges)) CUDA_FUNC_TERM
+
+
 /* The inlined convenience PME GPU status getters */
 
 /*! \libinternal \brief
@@ -475,6 +508,29 @@ gmx_inline bool pme_gpu_performs_solve(const pme_gpu_t *pmeGPU)
     return pmeGPU->settings.performGPUSolve;
 }
 
+/*! \libinternal \brief
+ * Enables or disables the testing mode.
+ * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \param[in] testing            Should the testing mode be enabled or disabled.
+ */
+gmx_inline void pme_gpu_set_testing(pme_gpu_t *pmeGPU, bool testing)
+{
+    pmeGPU->settings.copyAllOutputs = testing;
+}
+
+/*! \libinternal \brief
+ * Tells if PME is in the testing mode.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \returns                      true if testing mode is enabled, false otherwise.
+ */
+gmx_inline bool pme_gpu_is_testing(const pme_gpu_t *pmeGPU)
+{
+    return pmeGPU->settings.copyAllOutputs;
+}
+
 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 
 /*! \libinternal \brief
@@ -516,6 +572,28 @@ void pme_gpu_start_step(pme_gpu_t *pmeGPU, bool needToUpdateBox, const matrix bo
 void pme_gpu_finish_step(const pme_gpu_t *pmeGPU,  const bool       bCalcForces,
                          const bool       bCalcEnerVir);
 
+/*! \libinternal \brief
+ * Rearranges the atom spline data, copied from the GPU to the host after a pme_gpu_sync_spline_atom_data() call,
+ * into the atom data layout of a single-threaded PME CPU run.
+ * Only used for testing purposes so far.
+ *
+ * \param[in]  pmeGPU    The PME GPU structure.
+ * \param[out] atc       The PME CPU atom data structure (with a single-threaded layout).
+ * \param[in]  type      The spline data type (values or derivatives).
+ * \param[in]  dimIndex  Dimension index.
+ */
+void pme_gpu_transform_spline_atom_data_for_host(const pme_gpu_t *pmeGPU, const pme_atomcomm_t *atc,
+                                                 PmeSplineDataType type, int dimIndex);
+
+/*! \libinternal \brief
+ * Gets the normal/padded grid dimensions of the real-space PME grid on the GPU. Only used in tests.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \param[out] gridSize          Pointer to the grid dimensions to fill in.
+ * \param[out] paddedGridSize    Pointer to the padded grid dimensions to fill in.
+ */
+void pme_gpu_get_real_grid_sizes(const pme_gpu_t *pmeGPU, gmx::IVec *gridSize, gmx::IVec *paddedGridSize);
+
 /*! \libinternal \brief
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
index 1bf3a88c1a6e1acaff288af5d99af0363abcf288..6a79e7c8fbd93bc978864120ce45519059548955 100644 (file)
@@ -298,9 +298,9 @@ struct pme_shared_t
     /*! \brief Electrostatics parameter */
     real              epsilon_r;
     /*! \brief Gridline indices - nnx, nny, nnz */
-    std::vector<int>  nn[DIM];
+    std::vector<int>  nn;
     /*! \brief Fractional shifts - fshx, fshy, fshz */
-    std::vector<real> fsh[DIM];
+    std::vector<real> fsh;
     /*! \brief Precomputed B-spline values */
     std::vector<real> bsp_mod[DIM];
     /*! \brief The PME codepath being taken */
index e830c53d8d9fe0b13dec41e8faadadc418e222d4..e24565adbf26b8028cc09d99789d252b5e3803c0 100644 (file)
 
 #include "gmxpre.h"
 
+#include "config.h"
+
+#include <list>
+
 #include "gromacs/ewald/pme.h"
+#include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
 
 #include "pme-gpu-internal.h"
 #include "pme-grid.h"
@@ -57,6 +63,54 @@ bool pme_gpu_task_enabled(const gmx_pme_t *pme)
     return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
 }
 
+bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
+{
+    std::list<std::string> errorReasons;
+    if (!EEL_PME(ir->coulombtype))
+    {
+        errorReasons.push_back("systems that do not use PME for electrostatics");
+    }
+    if (ir->pme_order != 4)
+    {
+        errorReasons.push_back("interpolation orders other than 4");
+    }
+    if (ir->efep != efepNO)
+    {
+        errorReasons.push_back("free energy calculations (multiple grids)");
+    }
+    if (EVDW_PME(ir->vdwtype))
+    {
+        errorReasons.push_back("Lennard-Jones PME");
+    }
+#if GMX_DOUBLE
+    {
+        errorReasons.push_back("double precision");
+    }
+#endif
+#if GMX_GPU != GMX_GPU_CUDA
+    {
+        errorReasons.push_back("non-CUDA build of GROMACS");
+    }
+#endif
+    if (ir->cutoff_scheme == ecutsGROUP)
+    {
+        errorReasons.push_back("group cutoff scheme");
+    }
+    if (EI_TPI(ir->eI))
+    {
+        errorReasons.push_back("test particle insertion");
+    }
+
+    bool inputSupported = errorReasons.empty();
+    if (!inputSupported && error)
+    {
+        std::string regressionTestMarker = "PME GPU does not support";
+        // this prefix is tested for in the regression tests script gmxtest.pl
+        *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
+    }
+    return inputSupported;
+}
+
 void pme_gpu_reset_timings(const gmx_pme_t *pme)
 {
     if (pme_gpu_active(pme))
diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu
new file mode 100644 (file)
index 0000000..ee92874
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013-2016,2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Implements PME GPU spline calculation and charge spreading in CUDA.
+ *  TODO: consider always pre-sorting particles (as in DD case).
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "pme.cuh"
+#include "pme-grid.h"
+#include "pme-timings.cuh"
+
+/*
+ * This define affects the spline calculation behaviour in the kernel.
+ * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing (order) spline values and derivatives).
+ * 1: (order) threads do redundant work on this same task, each one stores only a single theta and single dtheta into global arrays.
+ * The only efficiency difference is fewer global store operations, countered by more redundant spline computation.
+ *
+ * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
+ */
+#define PME_GPU_PARALLEL_SPLINE 0
+
+
+//! Spreading max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime in most cases
+constexpr int c_spreadMaxWarpsPerBlock = 8;
+/* TODO: it has been observed that the kernel can be faster with smaller block sizes (2 or 4 warps)
+ * only on some GPUs (660Ti) with large enough grid (>= 48^3) due to load/store units being overloaded
+ * (ldst_fu_utilization metric maxed out in nvprof). Runtime block size choice might be nice to have.
+ * This has been tried on architectures up to Maxwell (GTX 750) and it would be good to revisit this.
+ */
+//! Spreading max block size in threads
+constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
+
+//! Texture references for CC 2.x
+texture<int, 1, cudaReadModeElementType>   gridlineIndicesTableTextureRef;
+texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
+
+/*! Returns the reference to the gridlineIndices texture. */
+texture<int, 1, cudaReadModeElementType> &pme_gpu_get_gridline_texref()
+{
+    return gridlineIndicesTableTextureRef;
+}
+
+/*! Returns the reference to the fractShifts texture. */
+texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref()
+{
+    return fractShiftsTableTextureRef;
+}
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam[in] T                 Data type (float/int/...)
+ * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
+ * \param[in]  kernelParams      Input PME CUDA data in constant memory.
+ * \param[out] sm_destination    Shared memory array for output.
+ * \param[in]  gm_source         Global memory array for input.
+ */
+template<typename T,
+         const int atomsPerBlock,
+         const int dataCountPerAtom>
+__device__  __forceinline__
+void pme_gpu_stage_atom_data(const pme_gpu_cuda_kernel_params_t kernelParams,
+                             T * __restrict__                   sm_destination,
+                             const T * __restrict__             gm_source)
+{
+    static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+    const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
+    const int localIndex       = threadLocalIndex;
+    const int globalIndexBase  = blockIdx.x * atomsPerBlock * dataCountPerAtom;
+    const int globalIndex      = globalIndexBase + localIndex;
+    const int globalCheck      = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
+    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+    {
+        assert(isfinite(float(gm_source[globalIndex])));
+        sm_destination[localIndex] = gm_source[globalIndex];
+    }
+}
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
+ *                                  in the sizes of the shared memory arrays.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
+ * \param[in]  sm_coordinates       Atom coordinates in the shared memory.
+ * \param[in]  sm_coefficients      Atom charges/coefficients in the shared memory.
+ * \param[out] sm_theta             Atom spline values in the shared memory.
+ * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
+ */
+template <const int order,
+          const int atomsPerBlock>
+__device__ __forceinline__ void calculate_splines(const pme_gpu_cuda_kernel_params_t     kernelParams,
+                                                  const int                              atomIndexOffset,
+                                                  const float3 * __restrict__            sm_coordinates,
+                                                  const float * __restrict__             sm_coefficients,
+                                                  float * __restrict__                   sm_theta,
+                                                  int * __restrict__                     sm_gridlineIndices)
+{
+    /* Global memory pointers for output */
+    float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
+    float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
+    int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
+
+    /* Fractional coordinates */
+    __shared__ float sm_fractCoords[atomsPerBlock * DIM];
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+        + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / warp_size;
+    /* Thread index w.r.t. warp */
+    const int threadWarpIndex = threadLocalId % warp_size;
+    /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+    const int atomWarpIndex = threadWarpIndex % PME_SPREADGATHER_ATOMS_PER_WARP;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex;
+
+    /* Atom index w.r.t. global memory */
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+    /* Spline contribution index in one dimension */
+    const int orderIndex = threadWarpIndex / (PME_SPREADGATHER_ATOMS_PER_WARP * DIM);
+    /* Dimension index */
+    const int dimIndex = (threadWarpIndex - orderIndex * (PME_SPREADGATHER_ATOMS_PER_WARP * DIM)) / PME_SPREADGATHER_ATOMS_PER_WARP;
+
+    /* Multi-purpose index of rvec/ivec atom data */
+    const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
+    /* Spline parameters need a working buffer.
+     * With PME_GPU_PARALLEL_SPLINE == 0 it is just a local array of (order) values for some of the threads, which is fine;
+     * With PME_GPU_PARALLEL_SPLINE == 1 (order) times more threads are involved, so the shared memory is used to avoid overhead.
+     * The buffer's size, striding and indexing are adjusted accordingly.
+     * The buffer is accessed with SPLINE_DATA_PTR and SPLINE_DATA macros.
+     */
+#if PME_GPU_PARALLEL_SPLINE
+    const int        splineDataStride  = atomsPerBlock * DIM;
+    const int        splineDataIndex   = sharedMemoryIndex;
+    __shared__ float sm_splineData[splineDataStride * order];
+    float           *splineDataPtr = sm_splineData;
+#else
+    const int        splineDataStride = 1;
+    const int        splineDataIndex  = 0;
+    float            splineData[splineDataStride * order];
+    float           *splineDataPtr = splineData;
+#endif
+
+#define SPLINE_DATA_PTR(i) (splineDataPtr + ((i) * splineDataStride + splineDataIndex))
+#define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
+
+    const int localCheck  = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
+    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+
+    if (localCheck && globalCheck)
+    {
+        /* Indices interpolation */
+
+        if (orderIndex == 0)
+        {
+            int           tableIndex, tInt;
+            float         n, t;
+            const float3  x = sm_coordinates[atomIndexLocal];
+            /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
+             * puts them into local memory(!) instead of accessing the constant memory directly.
+             * That's the reason for the switch, to unroll explicitly.
+             * The commented parts correspond to the 0 components of the recipbox.
+             */
+            switch (dimIndex)
+            {
+                case XX:
+                    tableIndex  = kernelParams.grid.tablesOffsets[XX];
+                    n           = kernelParams.grid.realGridSizeFP[XX];
+                    t           = x.x * kernelParams.step.recipBox[dimIndex][XX] + x.y * kernelParams.step.recipBox[dimIndex][YY] + x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+                    break;
+
+                case YY:
+                    tableIndex  = kernelParams.grid.tablesOffsets[YY];
+                    n           = kernelParams.grid.realGridSizeFP[YY];
+                    t           = /*x.x * kernelParams.step.recipbox[dimIndex][XX] + */ x.y * kernelParams.step.recipBox[dimIndex][YY] + x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+                    break;
+
+                case ZZ:
+                    tableIndex  = kernelParams.grid.tablesOffsets[ZZ];
+                    n           = kernelParams.grid.realGridSizeFP[ZZ];
+                    t           = /*x.x * kernelParams.step.recipbox[dimIndex][XX] + x.y * kernelParams.step.recipbox[dimIndex][YY] + */ x.z * kernelParams.step.recipBox[dimIndex][ZZ];
+                    break;
+            }
+            const float shift = c_pmeMaxUnitcellShift;
+            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+            t    = (t + shift) * n;
+            tInt = (int)t;
+            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            tableIndex                       += tInt;
+            assert(tInt >= 0);
+            assert(tInt < c_pmeNeighborUnitcellCount * n);
+
+            // TODO have shared table for both parameters to share the fetch, as index is always same?
+            // TODO compare texture/LDG performance
+            sm_fractCoords[sharedMemoryIndex] += fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable, kernelParams.fractShiftsTableTexture,
+                                                                           fractShiftsTableTextureRef, tableIndex);
+            sm_gridlineIndices[sharedMemoryIndex] = fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable, kernelParams.gridlineIndicesTableTexture,
+                                                                              gridlineIndicesTableTextureRef, tableIndex);
+            gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
+        }
+
+        /* B-spline calculation */
+
+        const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+        if (chargeCheck)
+        {
+            float       div;
+            int         o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
+
+            const float dr = sm_fractCoords[sharedMemoryIndex];
+            assert(isfinite(dr));
+
+            /* dr is relative offset from lower cell limit */
+            *SPLINE_DATA_PTR(order - 1) = 0.0f;
+            *SPLINE_DATA_PTR(1)         = dr;
+            *SPLINE_DATA_PTR(0)         = 1.0f - dr;
+
+#pragma unroll
+            for (int k = 3; k < order; k++)
+            {
+                div                     = 1.0f / (k - 1.0f);
+                *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
+#pragma unroll
+                for (int l = 1; l < (k - 1); l++)
+                {
+                    *SPLINE_DATA_PTR(k - l - 1) = div * ((dr + l) * SPLINE_DATA(k - l - 2) + (k - l - dr) * SPLINE_DATA(k - l - 1));
+                }
+                *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+            }
+
+            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+
+            /* Differentiation and storing the spline derivatives (dtheta) */
+#if !PME_GPU_PARALLEL_SPLINE
+            // With PME_GPU_PARALLEL_SPLINE == 1, o is already set to orderIndex;
+            // With PME_GPU_PARALLEL_SPLINE == 0, we loop o over range(order).
+#pragma unroll
+            for (o = 0; o < order; o++)
+#endif
+            {
+                const int   thetaIndex       = PME_SPLINE_THETA_STRIDE * (((o + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+                const int   thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+
+                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
+                assert(isfinite(dtheta));
+                gm_dtheta[thetaGlobalIndex] = dtheta;
+            }
+
+            div  = 1.0f / (order - 1.0f);
+            *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
+#pragma unroll
+            for (int k = 1; k < (order - 1); k++)
+            {
+                *SPLINE_DATA_PTR(order - k - 1) = div * ((dr + k) * SPLINE_DATA(order - k - 2) + (order - k - dr) * SPLINE_DATA(order - k - 1));
+            }
+            *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+
+            /* Storing the spline values (theta) */
+#if !PME_GPU_PARALLEL_SPLINE
+            // See comment for the loop above
+#pragma unroll
+            for (o = 0; o < order; o++)
+#endif
+            {
+                const int thetaIndex       = PME_SPLINE_THETA_STRIDE * (((o + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+                const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+
+                sm_theta[thetaIndex]       = SPLINE_DATA(o);
+                assert(isfinite(sm_theta[thetaIndex]));
+                gm_theta[thetaGlobalIndex] = SPLINE_DATA(o);
+            }
+        }
+    }
+}
+
+/*! \brief
+ * Charge spreading onto the grid.
+ * This corresponds to the CPU function spread_coefficients_bsplines_thread().
+ * Second stage of the whole kernel.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
+ * \param[in]  sm_coefficients      Atom coefficients/charges in the shared memory.
+ * \param[in]  sm_gridlineIndices   Atom gridline indices in the shared memory.
+ * \param[in]  sm_theta             Atom spline values in the shared memory.
+ */
+template <
+    const int order, const bool wrapX, const bool wrapY>
+__device__ __forceinline__ void spread_charges(const pme_gpu_cuda_kernel_params_t     kernelParams,
+                                               int                                    atomIndexOffset,
+                                               const float * __restrict__             sm_coefficients,
+                                               const int * __restrict__               sm_gridlineIndices,
+                                               const float * __restrict__             sm_theta)
+{
+    /* Global memory pointer to the output grid */
+    float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
+
+
+    const int nx  = kernelParams.grid.realGridSize[XX];
+    const int ny  = kernelParams.grid.realGridSize[YY];
+    const int nz  = kernelParams.grid.realGridSize[ZZ];
+    const int pny = kernelParams.grid.realGridSizePadded[YY];
+    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
+
+    const int offx = 0, offy = 0, offz = 0; // unused for now
+
+    const int atomIndexLocal  = threadIdx.z;
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+
+    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+    const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+    if (chargeCheck & globalCheck)
+    {
+        // Spline Y/Z coordinates
+        const int ithy   = threadIdx.y;
+        const int ithz   = threadIdx.x;
+        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
+        int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy + ithy;
+        if (wrapY & (iy >= ny))
+        {
+            iy -= ny;
+        }
+        int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
+        if (iz >= nz)
+        {
+            iz -= nz;
+        }
+
+        /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+        const int    atomWarpIndex     = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
+        /* Warp index w.r.t. block - could probably be obtained easier? */
+        const int    warpIndex         = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+        const int    dimStride         = PME_SPLINE_THETA_STRIDE * PME_SPREADGATHER_ATOMS_PER_WARP;
+        const int    orderStride       = dimStride * DIM;
+        const int    thetaOffsetBase   = orderStride * order * warpIndex + atomWarpIndex;
+
+        const float  thetaZ         = sm_theta[thetaOffsetBase + ithz * orderStride + ZZ * dimStride];
+        const float  thetaY         = sm_theta[thetaOffsetBase + ithy * orderStride + YY * dimStride];
+        const float  constVal       = thetaZ * thetaY * sm_coefficients[atomIndexLocal];
+        assert(isfinite(constVal));
+        const int    constOffset       = iy * pnz + iz;
+        const float *sm_thetaX         = sm_theta + (thetaOffsetBase + XX * dimStride);
+
+#pragma unroll
+        for (int ithx = 0; (ithx < order); ithx++)
+        {
+            int ix = ixBase + ithx;
+            if (wrapX & (ix >= nx))
+            {
+                ix -= nx;
+            }
+            const int gridIndexGlobal = ix * pny * pnz + constOffset;
+            assert(isfinite(sm_thetaX[ithx * orderStride]));
+            assert(isfinite(gm_grid[gridIndexGlobal]));
+            atomicAdd(gm_grid + gridIndexGlobal, sm_thetaX[ithx * orderStride] * constVal);
+        }
+    }
+}
+
+/*! \brief
+ * A spline computation and charge spreading kernel function.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] computeSplines       A boolean which tells if the spline parameter and
+ *                                  gridline indices' computation should be performed.
+ * \tparam[in] spreadCharges        A boolean which tells if the charge spreading should be performed.
+ * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ */
+template <
+    const int order,
+    const bool computeSplines,
+    const bool spreadCharges,
+    const bool wrapX,
+    const bool wrapY
+    >
+__launch_bounds__(c_spreadMaxThreadsPerBlock)
+__global__ void pme_spline_and_spread_kernel(const pme_gpu_cuda_kernel_params_t kernelParams)
+{
+    const int        atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+    // Gridline indices, ivec
+    __shared__ int   sm_gridlineIndices[atomsPerBlock * DIM];
+    // Charges
+    __shared__ float sm_coefficients[atomsPerBlock];
+    // Spline values
+    __shared__ float sm_theta[atomsPerBlock * DIM * order];
+
+    const int        atomIndexOffset = blockIdx.x * atomsPerBlock;
+
+    /* Staging coefficients/charges for both spline and spread */
+    pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+
+    if (computeSplines)
+    {
+        /* Staging coordinates */
+        __shared__ float sm_coordinates[DIM * atomsPerBlock];
+        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+
+        __syncthreads();
+        calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
+                                                sm_coefficients, sm_theta, sm_gridlineIndices);
+    }
+    else
+    {
+        /* Staging the data for spread
+         * (the data is assumed to be in GPU global memory with proper layout already,
+         * as in after running the spline kernel)
+         */
+        /* Spline data - only thetas (dthetas will only be needed in gather) */
+        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta, kernelParams.atoms.d_theta);
+        /* Gridline indices */
+        pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices, kernelParams.atoms.d_gridlineIndices);
+
+        __syncthreads();
+    }
+
+    /* Spreading */
+    if (spreadCharges)
+    {
+        spread_charges<order, wrapX, wrapY>(kernelParams, atomIndexOffset, sm_coefficients, sm_gridlineIndices, sm_theta);
+    }
+}
+
+void pme_gpu_spread(const pme_gpu_t *pmeGpu,
+                    int gmx_unused   gridIndex,
+                    real            *h_grid,
+                    bool             computeSplines,
+                    bool             spreadCharges)
+{
+    GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
+    cudaStream_t  stream          = pmeGpu->archSpecific->pmeStream;
+    const auto   *kernelParamsPtr = pmeGpu->kernelParams.get();
+    GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
+
+    const int order         = pmeGpu->common->pme_order;
+    const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+    // TODO: pick smaller block size in runtime if needed
+    // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
+    // If doing so, change atomsPerBlock in the kernels as well.
+    // TODO: test varying block sizes on modern arch-s as well
+    // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
+    //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
+    GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+
+    dim3 nBlocks(pmeGpu->nAtomsPadded / atomsPerBlock);
+    dim3 dimBlock(order, order, atomsPerBlock);
+
+    // These should later check for PME decomposition
+    const bool wrapX = true;
+    const bool wrapY = true;
+    GMX_UNUSED_VALUE(wrapX);
+    GMX_UNUSED_VALUE(wrapY);
+    switch (order)
+    {
+        case 4:
+        {
+            // TODO: cleaner unroll with some template trick?
+            if (computeSplines)
+            {
+                if (spreadCharges)
+                {
+                    pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
+                    pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                    CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+                    pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
+                }
+                else
+                {
+                    pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
+                    pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                    CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+                    pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
+                }
+            }
+            else
+            {
+                pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
+                pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< nBlocks, dimBlock, 0, stream>>> (*kernelParamsPtr);
+                CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
+                pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
+            }
+        }
+        break;
+
+        default:
+            GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not tested!"));
+    }
+
+    const bool copyBackGrid = spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu));
+    if (copyBackGrid)
+    {
+        pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
+    }
+    const bool copyBackAtomData = computeSplines && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_gather(pmeGpu));
+    if (copyBackAtomData)
+    {
+        pme_gpu_copy_output_spread_atom_data(pmeGpu);
+    }
+}
index bd67a01ee745e7c9c78fd23a8a51434b795a7b6f..744cf28ed5c24023cea94f5855e7b49949792b2e 100644 (file)
 int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU)
 {
     const int order = pmeGPU->common->pme_order;
-    GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+    GMX_ASSERT(order > 0, "Invalid PME order");
     return PME_ATOM_DATA_ALIGNMENT;
 }
 
 int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *pmeGPU)
 {
     const int order = pmeGPU->common->pme_order;
-    GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
+    GMX_ASSERT(order > 0, "Invalid PME order");
     return PME_SPREADGATHER_ATOMS_PER_WARP;
 }
 
@@ -340,7 +340,8 @@ void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU)
 
 void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
 {
-    cudaStream_t stream          = pmeGPU->archSpecific->pmeStream;
+    pme_gpu_free_fract_shifts(pmeGPU);
+
     auto        *kernelParamsPtr = pmeGPU->kernelParams.get();
 
     const int    nx                  = kernelParamsPtr->grid.realGridSize[XX];
@@ -353,38 +354,32 @@ void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
 
     const int    newFractShiftsSize  = cellCount * (nx + ny + nz);
 
-    /* Two arrays, same size */
-    int currentSizeTemp      = pmeGPU->archSpecific->fractShiftsSize;
-    int currentSizeTempAlloc = pmeGPU->archSpecific->fractShiftsSizeAlloc;
-    cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fractShiftsTable, nullptr, sizeof(float),
-                        &currentSizeTemp, &currentSizeTempAlloc,
-                        newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
-    float *fractShiftsArray = kernelParamsPtr->grid.d_fractShiftsTable;
-    cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_gridlineIndicesTable, nullptr, sizeof(int),
-                        &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc,
-                        newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
-    int *gridlineIndicesArray = kernelParamsPtr->grid.d_gridlineIndicesTable;
-
-    /* TODO: pinning of the pmeGPU->common->fsh/nn host-side arrays */
+    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
+                         kernelParamsPtr->fractShiftsTableTexture,
+                         &pme_gpu_get_fract_shifts_texref(),
+                         pmeGPU->common->fsh.data(),
+                         newFractShiftsSize,
+                         pmeGPU->deviceInfo);
 
-    for (int i = 0; i < DIM; i++)
-    {
-        kernelParamsPtr->grid.tablesOffsets[i] = gridDataOffset[i];
-        cu_copy_H2D_async(fractShiftsArray + gridDataOffset[i], pmeGPU->common->fsh[i].data(),
-                          cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(float), stream);
-        cu_copy_H2D_async(gridlineIndicesArray + gridDataOffset[i], pmeGPU->common->nn[i].data(),
-                          cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(int), stream);
-    }
-
-    pme_gpu_make_fract_shifts_textures(pmeGPU);
+    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
+                         kernelParamsPtr->gridlineIndicesTableTexture,
+                         &pme_gpu_get_gridline_texref(),
+                         pmeGPU->common->nn.data(),
+                         newFractShiftsSize,
+                         pmeGPU->deviceInfo);
 }
 
 void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
 {
-    pme_gpu_free_fract_shifts_textures(pmeGPU);
-    /* Two arrays, same size */
-    cu_free_buffered(pmeGPU->kernelParams->grid.d_fractShiftsTable);
-    cu_free_buffered(pmeGPU->kernelParams->grid.d_gridlineIndicesTable, &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc);
+    auto *kernelParamsPtr = pmeGPU->kernelParams.get();
+    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
+                            kernelParamsPtr->fractShiftsTableTexture,
+                            &pme_gpu_get_fract_shifts_texref(),
+                            pmeGPU->deviceInfo);
+    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
+                            kernelParamsPtr->gridlineIndicesTableTexture,
+                            &pme_gpu_get_gridline_texref(),
+                            pmeGPU->deviceInfo);
 }
 
 void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
@@ -412,6 +407,20 @@ void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid)
     CU_RET_ERR(stat, "PME spread grid sync event record failure");
 }
 
+void pme_gpu_copy_output_spread_atom_data(const pme_gpu_t *pmeGpu)
+{
+    const int    alignment       = pme_gpu_get_atom_spline_data_alignment(pmeGpu);
+    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
+    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
+    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
+    cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
+                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSplineAtomDataD2H, pmeGpu->archSpecific->pmeStream);
+    CU_RET_ERR(stat, "PME spread atom data sync event record failure");
+}
+
 void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
 {
     cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0);
index 16b4b9aae8ae905f433656c9e41daea96b24f156..bed5feb4923a64a4c0c1bab45a9d6ce70739a8da 100644 (file)
@@ -45,6 +45,8 @@
 #ifndef GMX_EWALD_PME_CUH
 #define GMX_EWALD_PME_CUH
 
+#include "config.h"
+
 #include <cassert>
 
 #include <array>
@@ -89,12 +91,23 @@ class GpuParallel3dFft;
  */
 #define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
 
-//! The spread/gather integer constant which depends on the templated order parameter (2 atoms per warp for order == 4)
+/*! \brief
+ * The number of atoms processed by a single warp in spread/gather.
+ * This macro depends on the templated order parameter (2 atoms per warp for order 4).
+ * It is mostly used for spline data layout tweaked for coalesced access.
+ */
 #define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
 
-//! Atom data alignment - has to be divisible both by spread and gather maximal atoms-per-block counts,
-//! which is asserted in case we use atom data padding at all.
-#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP);
+/*! \brief
+ * Atom data alignment (in terms of number of atoms).
+ * If the GPU atom data buffers are padded (c_usePadding == true),
+ * then the number of atoms which would fit in the padded GPU buffers has to be divisible by this.
+ * The literal number (16) expresses the maximum spread/gather block width in warps.
+ * Accordingly, spread and gather block widths in warps should be divisors of this
+ * (e.g. in pme-spread.cu: constexpr int c_spreadMaxThreadsPerBlock = 8 * warp_size;).
+ * There are debug asserts for this divisibility.
+ */
+#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
 
 /*! \brief \internal
  * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
@@ -193,10 +206,6 @@ struct pme_gpu_cuda_t
     int splineValuesSize;
     /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
     int splineValuesSizeAlloc;
-    /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (actual) */
-    int fractShiftsSize;
-    /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (reserved) */
-    int fractShiftsSizeAlloc;
     /*! \brief The kernelParams.grid.realGrid float element count (actual) */
     int realGridSize;
     /*! \brief The kernelParams.grid.realGrid float element count (reserved) */
@@ -222,22 +231,16 @@ struct pme_gpu_cuda_kernel_params_t : pme_gpu_kernel_params_base_t
     cudaTextureObject_t gridlineIndicesTableTexture;
 };
 
-/* CUDA texture functions which will reside in respective kernel files
+/* CUDA texture reference functions which reside in respective kernel files
  * (due to texture references having scope of a translation unit).
  */
-
-/*! \brief \internal
- * Creates/binds 2 textures used in the spline parameter computation.
- *
- * \param[in, out] pmeGPU         The PME GPU structure.
- */
-inline void pme_gpu_make_fract_shifts_textures(pme_gpu_t gmx_unused *pmeGpu){};
-
-/*! \brief \internal
- * Frees/unbinds 2 textures used in the spline parameter computation.
- *
- * \param[in] pmeGPU             The PME GPU structure.
- */
-inline void pme_gpu_free_fract_shifts_textures(const pme_gpu_t gmx_unused *pmeGpu){};
+#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
+extern texture<int, 1, cudaReadModeElementType>   gridlineIndicesTableTextureRef;
+extern texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
+#endif
+/*! Returns the reference to the gridlineIndices texture. */
+texture<int, 1, cudaReadModeElementType>   &pme_gpu_get_gridline_texref();
+/*! Returns the reference to the fractShifts texture. */
+texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref();
 
 #endif
index e9e89875311e4793fb0357391199e9b98a7f0e41..c1e5f8c5e0c2190d8d629c63027ae44d0555ac0a 100644 (file)
@@ -205,6 +205,18 @@ void gmx_pme_reinit_atoms(const gmx_pme_t *pme, const int nAtoms, const real *ch
 
 /* A block of PME GPU functions */
 
+/*! \brief Checks whether the input system allows running PME on a GPU.
+ * TODO: this mostly duplicates an internal PME assert function,
+ * pme_gpu_check_restrictions(), except that one works with an already
+ * formed gmx_pme_t structure. Should that one go away/work with inputrec?
+ *
+ * \param[in]  ir     Input system.
+ * \param[out] error  The error message if the input is not supported on GPU.
+ *
+ * \returns true if PME can run on GPU with this input, false otherwise.
+ */
+bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error);
+
 /*! \brief
  * Tells if PME is enabled to run on GPU (not necessarily active at the moment).
  * For now, this decision is stored in the PME structure itself.
index 1ebfbb0155134c1ffd1ca5cbb99b291024bc0497..3155182ae7e8f4b9fc265af193e32351b2dadbbc 100644 (file)
@@ -33,5 +33,9 @@
 # the research papers on the package. Check out http://www.gromacs.org.
 
 file(GLOB EWALD_TEST_SOURCES *.cpp)
+if (GMX_USE_CUDA)
+    file(GLOB EWALD_CUDA_SOURCES ../*.cu)
+endif()
+
 gmx_add_unit_test(EwaldUnitTests ewald-test
-                  ${EWALD_TEST_SOURCES})
+                  ${EWALD_TEST_SOURCES} ${EWALD_CUDA_SOURCES})
index 7e1e74f3ee5bbf3cd8a7715c05549dd199be8858..d65b9a796179e421d6527c222ec44cfed05ab8f1 100644 (file)
@@ -403,7 +403,7 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
                                           (inputForceTreatment == PmeGatherInputHandling::ReduceWith) ? "with reduction" : "without reduction"
                                           ));
 
-                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, inputAtomData.coordinates, inputAtomData.charges, box);
+                PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, mode.first, nullptr, inputAtomData.coordinates, inputAtomData.charges, box);
 
                 /* Setting some more inputs */
                 pmeSetRealGrid(pmeSafe.get(), mode.first, nonZeroGridValues);
index c5c3b8ea1d2ae5175e49b8cf8dcda05c0b6cdc54..0163781a4d3a8c7294ea442efb70036000276044 100644 (file)
@@ -128,7 +128,7 @@ class PmeSolveTest : public ::testing::TestWithParam<SolveInputParameters>
                                                   ));
 
                         /* Running the test */
-                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, box, ewaldCoeff_q, ewaldCoeff_lj);
+                        PmeSafePointer pmeSafe = pmeInitEmpty(&inputRec, mode.first, nullptr, box, ewaldCoeff_q, ewaldCoeff_lj);
                         pmeSetComplexGrid(pmeSafe.get(), mode.first, gridOrdering.first, nonZeroGridValues);
                         const real     cellVolume = box[0] * box[4] * box[8];
                         //FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
index c751ef1ebfb59815f55737735b3a9e99d41dea18..e6352d0c41a177096b807fae7e83e830b05391b5 100644 (file)
@@ -103,96 +103,134 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
             inputRec.nkz         = gridSize[ZZ];
             inputRec.pme_order   = pmeOrder;
             inputRec.coulombtype = eelPME;
+            inputRec.epsilon_r   = 1.0;
 
             TestReferenceData                                      refData;
 
-            const std::map<CodePath, std::string>                  modesToTest   = {{CodePath::CPU, "CPU"}};
+            const std::map<CodePath, std::string>                  modesToTest   = {{CodePath::CPU, "CPU"},
+                                                                                    {CodePath::CUDA, "CUDA"}};
+
             const std::map<PmeSplineAndSpreadOptions, std::string> optionsToTest = {{PmeSplineAndSpreadOptions::SplineAndSpreadUnified, "spline computation and charge spreading (fused)"},
                                                                                     {PmeSplineAndSpreadOptions::SplineOnly, "spline computation"},
                                                                                     {PmeSplineAndSpreadOptions::SpreadOnly, "charge spreading"}};
+
+            // There is a subtle problem with multiple comparisons against the same reference data:
+            // at one point the subsequent (GPU) spreading runs did not actually copy the output grid into the proper buffer,
+            // but the reference data had already been marked as checked (hasBeenChecked_) by the CPU run, so nothing failed.
+            // For now we manually track that the count of the grid entries is the same on each run.
+            // This is just a hack for a single specific output, though.
+            // A better solution (TODO) would be to split the different codepaths into separate tests,
+            // while making them use the same reference files.
+            bool   gridValuesSizeAssigned = false;
+            size_t previousGridValuesSize = 0;
+
             for (const auto &mode : modesToTest)
             {
-                for (const auto &option : optionsToTest)
+                const bool supportedInput = pmeSupportsInputForMode(&inputRec, mode.first);
+                if (!supportedInput)
                 {
-                    /* Describing the test uniquely in case it fails */
-
-                    SCOPED_TRACE(formatString("Testing %s with %s for PME grid size %d %d %d"
-                                              ", order %d, %zu atoms",
-                                              option.second.c_str(), mode.second.c_str(),
-                                              gridSize[XX], gridSize[YY], gridSize[ZZ],
-                                              pmeOrder,
-                                              atomCount));
+                    /* Testing that the unsupported input causes an exception */
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, mode.first, nullptr, coordinates, charges, box), NotImplementedError);
+                    continue;
+                }
 
-                    /* Running the test */
+                const auto contextsToTest = pmeEnv->getHardwareContexts(mode.first);
+                for (const auto &context : contextsToTest)
+                {
+                    for (const auto &option : optionsToTest)
+                    {
+                        /* Describing the test uniquely in case it fails */
 
-                    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, coordinates, charges, box);
+                        SCOPED_TRACE(formatString("Testing %s with %s %sfor PME grid size %d %d %d"
+                                                  ", order %d, %zu atoms",
+                                                  option.second.c_str(), mode.second.c_str(),
+                                                  context.getDescription().c_str(),
+                                                  gridSize[XX], gridSize[YY], gridSize[ZZ],
+                                                  pmeOrder,
+                                                  atomCount));
 
-                    const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
-                    const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
+                        /* Running the test */
 
-                    if (!computeSplines)
-                    {
-                        // Here we should set up the results of the spline computation so that the spread can run.
-                        // What is lazy and works is running the separate spline so that it will set it up for us:
-                        pmePerformSplineAndSpread(pmeSafe.get(), mode.first, true, false);
-                        // We know that it is tested in another iteration.
-                        // TODO: Clean alternative: read and set the reference gridline indices, spline params
-                    }
+                        PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, mode.first, context.getDeviceInfo(), coordinates, charges, box);
 
-                    pmePerformSplineAndSpread(pmeSafe.get(), mode.first, computeSplines, spreadCharges);
+                        const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
+                        const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
 
-                    /* Outputs correctness check */
-                    /* All tolerances were picked empirically for single precision on CPU */
+                        if (!computeSplines)
+                        {
+                            // Here we should set up the results of the spline computation so that the spread can run.
+                            // A lazy but working approach is to run the spline computation separately, so that it sets this up for us:
+                            pmePerformSplineAndSpread(pmeSafe.get(), mode.first, true, false);
+                            // We know that it is tested in another iteration.
+                            // TODO: Clean alternative: read and set the reference gridline indices, spline params
+                        }
 
-                    TestReferenceChecker rootChecker(refData.rootChecker());
+                        pmePerformSplineAndSpread(pmeSafe.get(), mode.first, computeSplines, spreadCharges);
+                        pmeFinalizeTest(pmeSafe.get(), mode.first);
 
-                    const auto           maxGridSize              = std::max(std::max(gridSize[XX], gridSize[YY]), gridSize[ZZ]);
-                    const auto           ulpToleranceSplineValues = 2 * (pmeOrder - 2) * maxGridSize;
-                    /* 2 is empiric, the rest follows from the amount of operations */
+                        /* Outputs correctness check */
+                        /* All tolerances were picked empirically for single precision on CPU */
 
-                    if (computeSplines)
-                    {
-                        const char *dimString[] = { "X", "Y", "Z" };
+                        TestReferenceChecker rootChecker(refData.rootChecker());
 
-                        /* Spline values */
-                        SCOPED_TRACE(formatString("Testing spline values with tolerance of %ld", ulpToleranceSplineValues));
-                        TestReferenceChecker splineValuesChecker(rootChecker.checkCompound("Splines", "Values"));
-                        splineValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineValues));
-                        for (int i = 0; i < DIM; i++)
-                        {
-                            auto splineValuesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Values, i);
-                            splineValuesChecker.checkSequence(splineValuesDim.begin(), splineValuesDim.end(), dimString[i]);
-                        }
+                        const auto           maxGridSize              = std::max(std::max(gridSize[XX], gridSize[YY]), gridSize[ZZ]);
+                        const auto           ulpToleranceSplineValues = 2 * (pmeOrder - 2) * maxGridSize;
+                        /* 2 is empirical; the rest follows from the number of operations */
 
-                        /* Spline derivatives */
-                        const auto ulpToleranceSplineDerivatives = 4 * ulpToleranceSplineValues;
-                        /* 4 is just a wild guess since the derivatives are deltas of neighbor spline values which could differ greatly */
-                        SCOPED_TRACE(formatString("Testing spline derivatives with tolerance of %ld", ulpToleranceSplineDerivatives));
-                        TestReferenceChecker splineDerivativesChecker(rootChecker.checkCompound("Splines", "Derivatives"));
-                        splineDerivativesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineDerivatives));
-                        for (int i = 0; i < DIM; i++)
+                        if (computeSplines)
                         {
-                            auto splineDerivativesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Derivatives, i);
-                            splineDerivativesChecker.checkSequence(splineDerivativesDim.begin(), splineDerivativesDim.end(), dimString[i]);
+                            const char *dimString[] = { "X", "Y", "Z" };
+
+                            /* Spline values */
+                            SCOPED_TRACE(formatString("Testing spline values with tolerance of %d", ulpToleranceSplineValues));
+                            TestReferenceChecker splineValuesChecker(rootChecker.checkCompound("Splines", "Values"));
+                            splineValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineValues));
+                            for (int i = 0; i < DIM; i++)
+                            {
+                                auto splineValuesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Values, i);
+                                splineValuesChecker.checkSequence(splineValuesDim.begin(), splineValuesDim.end(), dimString[i]);
+                            }
+
+                            /* Spline derivatives */
+                            const auto ulpToleranceSplineDerivatives = 4 * ulpToleranceSplineValues;
+                            /* 4 is just a wild guess, since the derivatives are deltas of neighboring spline values which could differ greatly */
+                            SCOPED_TRACE(formatString("Testing spline derivatives with tolerance of %d", ulpToleranceSplineDerivatives));
+                            TestReferenceChecker splineDerivativesChecker(rootChecker.checkCompound("Splines", "Derivatives"));
+                            splineDerivativesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceSplineDerivatives));
+                            for (int i = 0; i < DIM; i++)
+                            {
+                                auto splineDerivativesDim = pmeGetSplineData(pmeSafe.get(), mode.first, PmeSplineDataType::Derivatives, i);
+                                splineDerivativesChecker.checkSequence(splineDerivativesDim.begin(), splineDerivativesDim.end(), dimString[i]);
+                            }
+
+                            /* Particle gridline indices */
+                            auto gridLineIndices = pmeGetGridlineIndices(pmeSafe.get(), mode.first);
+                            rootChecker.checkSequence(gridLineIndices.begin(), gridLineIndices.end(), "Gridline indices");
                         }
 
-                        /* Particle gridline indices */
-                        auto gridLineIndices = pmeGetGridlineIndices(pmeSafe.get(), mode.first);
-                        rootChecker.checkSequence(gridLineIndices.begin(), gridLineIndices.end(), "Gridline indices");
-                    }
-
-                    if (spreadCharges)
-                    {
-                        /* The wrapped grid */
-                        SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pmeSafe.get(), mode.first);
-                        TestReferenceChecker       gridValuesChecker(rootChecker.checkCompound("NonZeroGridValues", "RealSpaceGrid"));
-                        const auto                 ulpToleranceGrid = 2 * ulpToleranceSplineValues * (int)(ceil(sqrt(atomCount)));
-                        /* 2 is empiric; sqrt(atomCount) assumes all the input charges may spread onto the same cell */
-                        SCOPED_TRACE(formatString("Testing grid values with tolerance of %ld", ulpToleranceGrid));
-                        gridValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceGrid));
-                        for (const auto &point : nonZeroGridValues)
+                        if (spreadCharges)
                         {
-                            gridValuesChecker.checkReal(point.second, point.first.c_str());
+                            /* The wrapped grid */
+                            SparseRealGridValuesOutput nonZeroGridValues = pmeGetRealGrid(pmeSafe.get(), mode.first);
+                            TestReferenceChecker       gridValuesChecker(rootChecker.checkCompound("NonZeroGridValues", "RealSpaceGrid"));
+                            const auto                 ulpToleranceGrid = 2 * ulpToleranceSplineValues * (int)(ceil(sqrt(atomCount)));
+                            /* 2 is empirical; sqrt(atomCount) assumes all the input charges may spread onto the same cell */
+                            SCOPED_TRACE(formatString("Testing grid values with tolerance of %d", ulpToleranceGrid));
+                            if (!gridValuesSizeAssigned)
+                            {
+                                previousGridValuesSize = nonZeroGridValues.size();
+                                gridValuesSizeAssigned = true;
+                            }
+                            else
+                            {
+                                EXPECT_EQ(previousGridValuesSize, nonZeroGridValues.size());
+                            }
+
+                            gridValuesChecker.setDefaultTolerance(relativeToleranceAsUlp(1.0, ulpToleranceGrid));
+                            for (const auto &point : nonZeroGridValues)
+                            {
+                                gridValuesChecker.checkReal(point.second, point.first.c_str());
+                            }
                         }
                     }
                 }
@@ -201,7 +239,7 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
 };
 
 
-/*! \brief Test for PME B-spline moduli computation */
+/*! \brief Test for spline parameter computation and charge spreading. */
 TEST_P(PmeSplineAndSpreadTest, ReproducesOutputs)
 {
     EXPECT_NO_THROW(runTest());
@@ -210,7 +248,7 @@ TEST_P(PmeSplineAndSpreadTest, ReproducesOutputs)
 /* Valid input instances */
 
 //! A couple of valid inputs for boxes.
-static std::vector<Matrix3x3> const sampleBoxes
+static std::vector<Matrix3x3> const c_sampleBoxes
 {
     // normal box
     Matrix3x3 {{
@@ -227,7 +265,7 @@ static std::vector<Matrix3x3> const sampleBoxes
 };
 
 //! A couple of valid inputs for grid sizes.
-static std::vector<IVec> const sampleGridSizes
+static std::vector<IVec> const c_sampleGridSizes
 {
     IVec {
         16, 12, 14
@@ -238,19 +276,19 @@ static std::vector<IVec> const sampleGridSizes
 };
 
 //! Random charges
-static std::vector<real> const sampleChargesFull
+static std::vector<real> const c_sampleChargesFull
 {
     4.95f, 3.11f, 3.97f, 1.08f, 2.09f, 1.1f, 4.13f, 3.31f, 2.8f, 5.83f, 5.09f, 6.1f, 2.86f, 0.24f, 5.76f, 5.19f, 0.72f
 };
 //! 1 charge
-static auto const sampleCharges1 = ChargesVector::fromVector(sampleChargesFull.begin(), sampleChargesFull.begin() + 1);
+static auto const c_sampleCharges1 = ChargesVector::fromVector(c_sampleChargesFull.begin(), c_sampleChargesFull.begin() + 1);
 //! 2 charges
-static auto const sampleCharges2 = ChargesVector::fromVector(sampleChargesFull.begin() + 1, sampleChargesFull.begin() + 3);
+static auto const c_sampleCharges2 = ChargesVector::fromVector(c_sampleChargesFull.begin() + 1, c_sampleChargesFull.begin() + 3);
 //! 13 charges
-static auto const sampleCharges13 = ChargesVector::fromVector(sampleChargesFull.begin() + 3, sampleChargesFull.begin() + 16);
+static auto const c_sampleCharges13 = ChargesVector::fromVector(c_sampleChargesFull.begin() + 3, c_sampleChargesFull.begin() + 16);
 
 //! Random coordinate vectors
-static CoordinatesVector const sampleCoordinatesFull
+static CoordinatesVector const c_sampleCoordinatesFull
 {
     {
         5.59f, 1.37f, 0.95f
@@ -289,33 +327,34 @@ static CoordinatesVector const sampleCoordinatesFull
     }
 };
 //! 1 coordinate vector
-static CoordinatesVector const sampleCoordinates1(sampleCoordinatesFull.begin(), sampleCoordinatesFull.begin() + 1);
+static CoordinatesVector const c_sampleCoordinates1(c_sampleCoordinatesFull.begin(), c_sampleCoordinatesFull.begin() + 1);
 //! 2 coordinate vectors
-static CoordinatesVector const sampleCoordinates2(sampleCoordinatesFull.begin() + 1, sampleCoordinatesFull.begin() + 3);
+static CoordinatesVector const c_sampleCoordinates2(c_sampleCoordinatesFull.begin() + 1, c_sampleCoordinatesFull.begin() + 3);
 //! 13 coordinate vectors
-static CoordinatesVector const sampleCoordinates13(sampleCoordinatesFull.begin() + 3, sampleCoordinatesFull.begin() + 16);
+static CoordinatesVector const c_sampleCoordinates13(c_sampleCoordinatesFull.begin() + 3, c_sampleCoordinatesFull.begin() + 16);
 
 //! moved out from instantiations for readability
-auto inputBoxes     = ::testing::ValuesIn(sampleBoxes);
+auto c_inputBoxes     = ::testing::ValuesIn(c_sampleBoxes);
 //! moved out from instantiations for readability
-auto inputPmeOrders = ::testing::Range(3, 5 + 1);
+auto c_inputPmeOrders = ::testing::Range(3, 5 + 1);
 //! moved out from instantiations for readability
-auto inputGridSizes = ::testing::ValuesIn(sampleGridSizes);
+auto c_inputGridSizes = ::testing::ValuesIn(c_sampleGridSizes);
 
-/*! \brief Instantiation of the PME spline computation test with valid input and 1 atom */
-INSTANTIATE_TEST_CASE_P(SaneInput1, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
-                                                                                   ::testing::Values(sampleCoordinates1),
-                                                                                   ::testing::Values(sampleCharges1)
+/*! \brief Instantiation of the test with valid input and 1 atom */
+INSTANTIATE_TEST_CASE_P(SaneInput1, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+                                                                                   ::testing::Values(c_sampleCoordinates1),
+                                                                                   ::testing::Values(c_sampleCharges1)
                                                                                ));
-/*! \brief Instantiation of the PME spline computation test with valid input and 2 atoms */
-INSTANTIATE_TEST_CASE_P(SaneInput2, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
-                                                                                   ::testing::Values(sampleCoordinates2),
-                                                                                   ::testing::Values(sampleCharges2)
+
+/*! \brief Instantiation of the test with valid input and 2 atoms */
+INSTANTIATE_TEST_CASE_P(SaneInput2, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+                                                                                   ::testing::Values(c_sampleCoordinates2),
+                                                                                   ::testing::Values(c_sampleCharges2)
                                                                                ));
-/*! \brief Instantiation of the PME spline computation test with valid input and 13 atoms */
-INSTANTIATE_TEST_CASE_P(SaneInput13, PmeSplineAndSpreadTest, ::testing::Combine(inputBoxes, inputPmeOrders, inputGridSizes,
-                                                                                    ::testing::Values(sampleCoordinates13),
-                                                                                    ::testing::Values(sampleCharges13)
+/*! \brief Instantiation of the test with valid input and 13 atoms */
+INSTANTIATE_TEST_CASE_P(SaneInput13, PmeSplineAndSpreadTest, ::testing::Combine(c_inputBoxes, c_inputPmeOrders, c_inputGridSizes,
+                                                                                    ::testing::Values(c_sampleCoordinates13),
+                                                                                    ::testing::Values(c_sampleCharges13)
                                                                                 ));
 }
 }
index d1cafc59d46d7a00f8fc9fa9e11760249fe08fa2..c44599542e47e8badf85ccfa0b074c90392f0277 100644 (file)
@@ -45,8 +45,8 @@
 
 #include <cstring>
 
-#include "gromacs/ewald/pme.h"
 #include "gromacs/ewald/pme-gather.h"
+#include "gromacs/ewald/pme-gpu-internal.h"
 #include "gromacs/ewald/pme-grid.h"
 #include "gromacs/ewald/pme-internal.h"
 #include "gromacs/ewald/pme-solve.h"
@@ -65,8 +65,29 @@ namespace gmx
 namespace test
 {
 
+bool pmeSupportsInputForMode(const t_inputrec *inputRec, CodePath mode)
+{
+    bool implemented;
+    switch (mode)
+    {
+        case CodePath::CPU:
+            implemented = true;
+            break;
+
+        case CodePath::CUDA:
+            implemented = pme_gpu_supports_input(inputRec, nullptr);
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
+    return implemented;
+}
+
 //! PME initialization - internal
 static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
+                                      CodePath                  mode,
+                                      gmx_device_info_t        *gpuInfo,
                                       size_t                    atomCount,
                                       const Matrix3x3          &box,
                                       real                      ewaldCoeff_q = 1.0f,
@@ -75,10 +96,10 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
 {
     gmx_pme_t     *pmeDataRaw = nullptr;
     const MDLogger dummyLogger;
-    const auto     runMode       = PmeRunMode::CPU;
+    const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::GPU;
     t_commrec      dummyCommrec  = {0};
     gmx_pme_init(&pmeDataRaw, &dummyCommrec, 1, 1, inputRec, atomCount, false, false, true,
-                 ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, nullptr, dummyLogger);
+                 ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr, gpuInfo, dummyLogger);
     PmeSafePointer pme(pmeDataRaw); // taking ownership
 
     // TODO get rid of this with proper matrix type
@@ -92,24 +113,40 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
     }
     const char *boxError = check_box(-1, boxTemp);
     GMX_RELEASE_ASSERT(boxError == nullptr, boxError);
-    invertBoxMatrix(boxTemp, pme->recipbox);
+
+    switch (mode)
+    {
+        case CodePath::CPU:
+            invertBoxMatrix(boxTemp, pme->recipbox);
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_update_input_box(pme->gpu, boxTemp);
+            break;
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
 
     return pme;
 }
 
 //! Simple PME initialization based on input, no atom data
 PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
+                            CodePath                  mode,
+                            gmx_device_info_t        *gpuInfo,
                             const Matrix3x3          &box,
                             real                      ewaldCoeff_q,
                             real                      ewaldCoeff_lj
                             )
 {
-    return pmeInitInternal(inputRec, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
+    return pmeInitInternal(inputRec, mode, gpuInfo, 0, box, ewaldCoeff_q, ewaldCoeff_lj);
     // hiding the fact that PME actually needs to know the number of atoms in advance
 }
 
 //! PME initialization with atom data
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
+                            CodePath                  mode,
+                            gmx_device_info_t        *gpuInfo,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
@@ -117,11 +154,28 @@ PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
 {
     const size_t    atomCount = coordinates.size();
     GMX_RELEASE_ASSERT(atomCount == charges.size(), "Mismatch in atom data");
-    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, atomCount, box);
-    pme_atomcomm_t *atc     = &(pmeSafe->atc[0]);
-    atc->x           = const_cast<rvec *>(as_rvec_array(coordinates.data()));
-    atc->coefficient = const_cast<real *>(charges.data());
-    /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
+    PmeSafePointer  pmeSafe = pmeInitInternal(inputRec, mode, gpuInfo, atomCount, box);
+    pme_atomcomm_t *atc     = nullptr;
+
+    switch (mode)
+    {
+        case CodePath::CPU:
+            atc              = &(pmeSafe->atc[0]);
+            atc->x           = const_cast<rvec *>(as_rvec_array(coordinates.data()));
+            atc->coefficient = const_cast<real *>(charges.data());
+            /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_set_testing(pmeSafe->gpu, true);
+            gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data());
+            pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data()));
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
+
     return pmeSafe;
 }
 
@@ -134,12 +188,25 @@ static real *pmeGetRealGridInternal(const gmx_pme_t *pme)
 
 //! Getting local PME real grid dimensions
 static void pmeGetRealGridSizesInternal(const gmx_pme_t      *pme,
+                                        CodePath              mode,
                                         IVec                 &gridSize,
                                         IVec                 &paddedGridSize)
 {
     const size_t gridIndex = 0;
     IVec         gridOffsetUnused;
-    gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
+    switch (mode)
+    {
+        case CodePath::CPU:
+            gmx_parallel_3dfft_real_limits(pme->pfft_setup[gridIndex], gridSize, gridOffsetUnused, paddedGridSize);
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_get_real_grid_sizes(pme->gpu, &gridSize, &paddedGridSize);
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
 }
 
 //! Getting local PME complex grid pointer for test I/O
@@ -160,28 +227,28 @@ static void pmeGetComplexGridSizesInternal(const gmx_pme_t      *pme,
 }
 
 //! Getting the PME grid memory buffer and its sizes - template definition
-template<typename ValueType> static void pmeGetGridAndSizesInternal(const gmx_pme_t *, ValueType * &, IVec &, IVec &)
+template<typename ValueType> static void pmeGetGridAndSizesInternal(const gmx_pme_t *, CodePath, ValueType * &, IVec &, IVec &)
 {
     GMX_THROW(InternalError("Deleted function call"));
     // explicitly deleting general template does not compile in clang/icc, see https://llvm.org/bugs/show_bug.cgi?id=17537
 }
 
 //! Getting the PME real grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<real>(const gmx_pme_t *pme, real * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<> void pmeGetGridAndSizesInternal<real>(const gmx_pme_t *pme, CodePath mode, real * &grid, IVec &gridSize, IVec &paddedGridSize)
 {
     grid = pmeGetRealGridInternal(pme);
-    pmeGetRealGridSizesInternal(pme, gridSize, paddedGridSize);
+    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSize);
 }
 
 //! Getting the PME complex grid memory buffer and its sizes
-template<> void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t *pme, t_complex * &grid, IVec &gridSize, IVec &paddedGridSize)
+template<> void pmeGetGridAndSizesInternal<t_complex>(const gmx_pme_t *pme, CodePath, t_complex * &grid, IVec &gridSize, IVec &paddedGridSize)
 {
     grid = pmeGetComplexGridInternal(pme);
     pmeGetComplexGridSizesInternal(pme, gridSize, paddedGridSize);
 }
 
 //! PME spline calculation and charge spreading
-void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qualifiers
+void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qualifiers elsewhere
                                bool computeSplines, bool spreadCharges)
 {
     GMX_RELEASE_ASSERT(pme != nullptr, "PME data is not initialized");
@@ -189,6 +256,7 @@ void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qual
     const size_t    gridIndex                    = 0;
     const bool      computeSplinesForZeroCharges = true;
     real           *fftgrid                      = spreadCharges ? pme->fftgrid[gridIndex] : nullptr;
+    real           *pmegrid                      = pme->pmegrid[gridIndex].grid.grid;
 
     switch (mode)
     {
@@ -197,11 +265,15 @@ void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qual
                            fftgrid, computeSplinesForZeroCharges, gridIndex);
             if (spreadCharges && !pme->bUseThreads)
             {
-                wrap_periodic_pmegrid(pme, pme->pmegrid[gridIndex].grid.grid);
-                copy_pmegrid_to_fftgrid(pme, pme->pmegrid[gridIndex].grid.grid, fftgrid, gridIndex);
+                wrap_periodic_pmegrid(pme, pmegrid);
+                copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex);
             }
             break;
 
+        case CodePath::CUDA:
+            pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+            break;
+
         default:
             GMX_THROW(InternalError("Test not implemented for this mode"));
     }
@@ -298,6 +370,23 @@ void pmePerformGather(gmx_pme_t *pme, CodePath mode,
     }
 }
 
+//! PME test finalization before fetching the outputs
+void pmeFinalizeTest(const gmx_pme_t *pme, CodePath mode)
+{
+    switch (mode)
+    {
+        case CodePath::CPU:
+            break;
+
+        case CodePath::CUDA:
+            pme_gpu_synchronize(pme->gpu);
+            break;
+
+        default:
+            GMX_THROW(InternalError("Test not implemented for this mode"));
+    }
+}
+
 //! Setting atom spline values/derivatives to be used in spread/gather
 void pmeSetSplineData(const gmx_pme_t *pme, CodePath mode,
                       const SplineParamsDimVector &splineValues, PmeSplineDataType type, int dimIndex)
@@ -329,7 +418,7 @@ void pmeSetGridLineIndices(const gmx_pme_t *pme, CodePath mode,
     GMX_RELEASE_ASSERT(atomCount == gridLineIndices.size(), "Mismatch in gridline indices size");
 
     IVec paddedGridSizeUnused, gridSize;
-    pmeGetRealGridSizesInternal(pme, gridSize, paddedGridSizeUnused);
+    pmeGetRealGridSizesInternal(pme, mode, gridSize, paddedGridSizeUnused);
 
     for (const auto &index : gridLineIndices)
     {
@@ -380,7 +469,7 @@ static void pmeSetGridInternal(const gmx_pme_t *pme, CodePath mode,
 {
     IVec       gridSize, paddedGridSize;
     ValueType *grid;
-    pmeGetGridAndSizesInternal<ValueType>(pme, grid, gridSize, paddedGridSize);
+    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
 
     switch (mode)
     {
@@ -431,6 +520,10 @@ SplineParamsDimVector pmeGetSplineData(const gmx_pme_t *pme, CodePath mode,
     SplineParamsDimVector    result;
     switch (mode)
     {
+        case CodePath::CUDA:
+            pme_gpu_transform_spline_atom_data_for_host(pme->gpu, atc, type, dimIndex);
+        // fallthrough
+
         case CodePath::CPU:
             result = SplineParamsDimVector::fromArray(sourceBuffer, dimSize);
             break;
@@ -451,6 +544,10 @@ GridLineIndicesVector pmeGetGridlineIndices(const gmx_pme_t *pme, CodePath mode)
     GridLineIndicesVector gridLineIndices;
     switch (mode)
     {
+        case CodePath::CUDA:
+            gridLineIndices = GridLineIndicesVector::fromArray(reinterpret_cast<IVec *>(pme->gpu->staging.h_gridlineIndices), atomCount);
+            break;
+
         case CodePath::CPU:
             gridLineIndices = GridLineIndicesVector::fromArray(reinterpret_cast<IVec *>(atc->idx), atomCount);
             break;
@@ -467,10 +564,11 @@ static SparseGridValuesOutput<ValueType> pmeGetGridInternal(const gmx_pme_t *pme
 {
     IVec       gridSize, paddedGridSize;
     ValueType *grid;
-    pmeGetGridAndSizesInternal<ValueType>(pme, grid, gridSize, paddedGridSize);
+    pmeGetGridAndSizesInternal<ValueType>(pme, mode, grid, gridSize, paddedGridSize);
     SparseGridValuesOutput<ValueType> gridValues;
     switch (mode)
     {
+        case CodePath::CUDA: // intentional absence of break
         case CodePath::CPU:
             gridValues.clear();
             for (int ix = 0; ix < gridSize[XX]; ix++)
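
As an illustration of the extended test API above, a minimal CUDA-path sketch (assuming a 'context' obtained from pmeEnv->getHardwareContexts(CodePath::CUDA) and an input that passes pmeSupportsInputForMode()):

    PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, CodePath::CUDA, context.getDeviceInfo(),
                                          coordinates, charges, box);
    pmePerformSplineAndSpread(pmeSafe.get(), CodePath::CUDA, true, true); // spline computation + charge spreading
    pmeFinalizeTest(pmeSafe.get(), CodePath::CUDA);                       // synchronize the GPU before fetching outputs
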
index e3bd8e810aed97273c92cae8171f02f7a3372e8b..0d89a98a1e0d817478006cb470d707d0e88db975 100644 (file)
 #include <map>
 #include <vector>
 
+#include <gtest/gtest.h>
+
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme-gpu-internal.h"
 #include "gromacs/math/gmxcomplex.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/unique_cptr.h"
 
-struct t_inputrec;
+#include "testhardwarecontexts.h"
 
 namespace gmx
 {
@@ -69,12 +72,6 @@ typedef std::vector<RVec> CoordinatesVector;
 typedef ArrayRef<RVec> ForcesVector;
 //! Gridline indices
 typedef ConstArrayRef<IVec> GridLineIndicesVector;
-//! Type of spline data
-enum class PmeSplineDataType
-{
-    Values,      // theta
-    Derivatives, // dtheta
-};
 /*! \brief Spline parameters (theta or dtheta).
  * A reference to a single dimension's spline data; this means (atomCount * pmeOrder) values or derivatives.
  */
@@ -97,11 +94,6 @@ typedef SparseGridValuesOutput<real> SparseRealGridValuesOutput;
 typedef SparseGridValuesOutput<t_complex> SparseComplexGridValuesOutput;
 //! TODO: make proper C++ matrix for the whole Gromacs, get rid of this
 typedef std::array<real, DIM * DIM> Matrix3x3;
-//! PME code path being tested
-enum class CodePath
-{
-    CPU,    // serial CPU code
-};
 //! PME gathering input forces treatment
 enum class PmeGatherInputHandling
 {
@@ -123,14 +115,24 @@ enum class GridOrdering
 //! PME solver results - reciprocal energy and virial
 typedef std::tuple<real, Matrix3x3> PmeSolveOutput;
 
+// Misc.
+
+//! Tells if this generally valid PME input is supported for this mode
+bool pmeSupportsInputForMode(const t_inputrec *inputRec, CodePath mode);
+
 // PME stages
 
+// TODO: currently PME initializations do not store CodePath. They probably should (unless mixed CPU-GPU execution is ever needed?).
 //! Simple PME initialization (no atom data)
 PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
+                            CodePath mode = CodePath::CPU,
+                            gmx_device_info_t *gpuInfo = nullptr,
                             const Matrix3x3 &box = {{1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f}},
                             real ewaldCoeff_q = 0.0f, real ewaldCoeff_lj = 0.0f);
-//! PME initialization with atom data and system box; lacks Ewald coefficients
+//! PME initialization with atom data and system box
 PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
+                            CodePath                  mode,
+                            gmx_device_info_t        *gpuInfo,
                             const CoordinatesVector  &coordinates,
                             const ChargesVector      &charges,
                             const Matrix3x3          &box
@@ -145,6 +147,8 @@ void pmePerformSolve(const gmx_pme_t *pme, CodePath mode,
 //! PME force gathering
 void pmePerformGather(gmx_pme_t *pme, CodePath mode,
                       PmeGatherInputHandling inputTreatment, ForcesVector &forces);
+//! PME test finalization before fetching the outputs
+void pmeFinalizeTest(const gmx_pme_t *pme, CodePath mode);
 
 // PME state setters
 
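
Since the added mode and device arguments of pmeInitEmpty() are defaulted, existing CPU-only call sites keep working unchanged, while GPU-aware callers pass them explicitly. A brief sketch of both forms (assuming an inputRec, box, Ewald coefficients and a gpuInfo pointer are in scope):

    PmeSafePointer cpuPme = pmeInitEmpty(&inputRec);   // defaults: CodePath::CPU, no device info
    PmeSafePointer gpuPme = pmeInitEmpty(&inputRec, CodePath::CUDA, gpuInfo,
                                         box, ewaldCoeff_q, ewaldCoeff_lj);
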
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
new file mode 100644 (file)
index 0000000..762e4d4
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Implements test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "testhardwarecontexts.h"
+
+#include "config.h"
+
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/taskassignment/hardwareassign.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/loggerbuilder.h"
+
+namespace gmx
+{
+namespace test
+{
+
+//! This constructs the test environment
+PmeTestEnvironment * const pmeEnv = (PmeTestEnvironment *)::testing::AddGlobalTestEnvironment(new PmeTestEnvironment);
+
+//! Simple hardware initialization
+void PmeTestEnvironment::hardwareInit()
+{
+    unique_cptr<t_commrec, done_commrec> commrec(init_commrec());
+    gmx_init_intranode_counters(commrec.get());
+    LoggerBuilder builder;
+    LoggerOwner   logOwner(builder.build());
+    MDLogger      log(logOwner.logger());
+    hardwareInfo_.reset(gmx_detect_hardware(log, commrec.get()));
+}
+
+void PmeTestEnvironment::SetUp()
+{
+    TestHardwareContext emptyContext("", nullptr);
+    hardwareContextsByMode_[CodePath::CPU].push_back(emptyContext);
+
+    hardwareInit();
+
+    // Constructing contexts for all compatible GPUs - will be empty on non-GPU builds
+    TestHardwareContexts gpuContexts;
+    const auto           compatibleGpus = getCompatibleGpus(hardwareInfo_->gpu_info);
+    for (int gpuIndex : compatibleGpus)
+    {
+        char        stmp[200] = {};
+        get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex);
+        std::string description = "(GPU " + std::string(stmp) + ") ";
+        auto       *gpuInfo     = reinterpret_cast<gmx_device_info_t *>(reinterpret_cast<char *>(hardwareInfo_->gpu_info.gpu_dev) + gpuIndex * sizeof_gpu_dev_info());
+        //TODO move previous line to gpu_utils and reuse in hardwareassign.cpp
+        gpuContexts.emplace_back(TestHardwareContext(description.c_str(), gpuInfo));
+    }
+#if GMX_GPU == GMX_GPU_CUDA
+    hardwareContextsByMode_[CodePath::CUDA] = gpuContexts;
+#endif
+}
+
+}
+}
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.h b/src/gromacs/ewald/tests/testhardwarecontexts.h
new file mode 100644 (file)
index 0000000..9fc2b51
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_TEST_HARDWARE_CONTEXTS_H
+#define GMX_EWALD_TEST_HARDWARE_CONTEXTS_H
+
+/*! \internal \file
+ * \brief
+ * Describes test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include <list>
+#include <map>
+
+#include <gtest/gtest.h>
+
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/gpu_hw_info.h"
+#include "gromacs/utility/unique_cptr.h"
+
+namespace gmx
+{
+namespace test
+{
+//! Hardware code path being tested
+enum class CodePath
+{
+    CPU,
+    CUDA
+};
+
+/*! \internal \brief
+ * A structure to describe a hardware context - an abstraction over
+ * gmx_device_info_t with a human-readable string.
+ * TODO: currently this does not know which CodePath it belongs to.
+ * It probably should! That would save us one loop in all the PME tests.
+ */
+struct TestHardwareContext
+{
+    //! Readable description
+    std::string        description_;
+    //! Device information pointer
+    gmx_device_info_t *deviceInfo_;
+
+    public:
+        //! Returns a human-readable context description line
+        std::string getDescription() const { return description_; }
+        //! Returns the device info pointer
+        gmx_device_info_t *getDeviceInfo() const { return deviceInfo_; }
+        //! Constructs the context
+        TestHardwareContext(const char *description, gmx_device_info_t *deviceInfo) : description_(description), deviceInfo_(deviceInfo) {}
+};
+
+//! A list of hardware contexts
+typedef std::list<TestHardwareContext> TestHardwareContexts;
+
+/*! \internal \brief
+ * This class performs one-time test initialization (enumerating the hardware)
+ */
+class PmeTestEnvironment : public ::testing::Environment
+{
+    private:
+        //! General hardware info
+        unique_cptr<gmx_hw_info_t, gmx_hardware_info_free> hardwareInfo_;
+        //! Storage of hardware contexts
+        std::map<CodePath, TestHardwareContexts>           hardwareContextsByMode_;
+        //! Simple GPU initialization, allowing for PME to work on GPU
+        void hardwareInit();
+
+    public:
+        //! Default destructor
+        ~PmeTestEnvironment() = default;
+        //! This is called by GTest framework once to query the hardware
+        void SetUp();
+        //! Get available hardware contexts for given code path
+        const TestHardwareContexts &getHardwareContexts(CodePath mode) { return hardwareContextsByMode_.at(mode); }
+};
+
+//! The test environment
+extern PmeTestEnvironment * const pmeEnv;
+}
+}
+#endif
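
A test that wants to cover every compatible device queries the environment for the contexts of a code path and iterates over them; a minimal usage sketch (error handling omitted):

    for (const auto &context : pmeEnv->getHardwareContexts(CodePath::CUDA))
    {
        SCOPED_TRACE("Testing on " + context.getDescription());
        gmx_device_info_t *deviceInfo = context.getDeviceInfo(); // non-null for GPU contexts
        // ... initialize PME with this device and run the stage under test ...
    }
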
index e96edb366f4cb06b0c7a362f4f70ab2f2b187407..2bba07dbae373cecff338a552d235dc23410bf25 100644 (file)
@@ -56,5 +56,65 @@ __device__ __forceinline__ T LDG(const T* ptr)
 #endif
 }
 
+/*! \brief Fetch the value by \p index from the texture object or reference.
+ * Fetching from the object is the preferred behaviour on CC >= 3.0.
+ *
+ * \tparam[in] T        Raw data type
+ * \param[in] texObj    Table texture object
+ * \param[in] texRef    Table texture reference
+ * \param[in] index     Non-negative element index
+ * \returns             The value from the table at \p index
+ */
+template <typename T>
+static __forceinline__ __device__
+T fetchFromTexture(const cudaTextureObject_t texObj,
+                   const struct texture<T, 1, cudaReadModeElementType> texRef,
+                   int index)
+{
+    assert(index >= 0);
+    assert(!c_disableCudaTextures);
+    T result;
+#if GMX_PTX_ARCH >= 300  // Preferring texture objects on any new arch
+    GMX_UNUSED_VALUE(texRef);
+    result = tex1Dfetch<T>(texObj, index);
+#else
+    GMX_UNUSED_VALUE(texObj);
+    result = tex1Dfetch(texRef, index);
+#endif
+    return result;
+}
+
+/*! \brief Fetch the value by \p index from the parameter lookup table.
+ *
+ *  Depending on what is supported, it fetches parameters either
+ *  using direct load, texture objects, or texture references.
+ *
+ * \tparam[in] T        Raw data type
+ * \param[in] d_ptr     Device pointer to the raw table memory
+ * \param[in] texObj    Table texture object
+ * \param[in] texRef    Table texture reference
+ * \param[in] index     Non-negative element index
+ * \returns             The value from the table at \p index
+ */
+template <typename T>
+static __forceinline__ __device__
+T fetchFromParamLookupTable(const T                  *d_ptr,
+                            const cudaTextureObject_t texObj,
+                            const struct texture<T, 1, cudaReadModeElementType> texRef,
+                            int index)
+{
+    assert(index >= 0);
+    T result;
+#if DISABLE_CUDA_TEXTURES
+    GMX_UNUSED_VALUE(texObj);
+    GMX_UNUSED_VALUE(texRef);
+    result = LDG(d_ptr + index);
+#else
+    GMX_UNUSED_VALUE(d_ptr);
+    result = fetchFromTexture<T>(texObj, texRef, index);
+#endif
+    return result;
+}
+
 
 #endif /* GMX_GPU_UTILS_CUDA_KERNEL_UTILS_CUH */
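
Inside a kernel, fetchFromParamLookupTable() is handed both the raw device pointer and the texture object/reference for the same table, so the build configuration decides which access path is used. A hedged sketch with illustrative names (not the actual PME kernel parameters):

    // File-scope texture reference, used only on the legacy (pre-CC 3.0) path
    texture<float, 1, cudaReadModeElementType> exampleTableTexRef;

    __global__ void exampleKernel(const float *d_table, cudaTextureObject_t tableTexObj, int tableSize)
    {
        const int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index < tableSize)
        {
            // Resolves to a direct load, a texture object fetch, or a texture reference fetch
            const float value = fetchFromParamLookupTable(d_table, tableTexObj, exampleTableTexRef, index);
            // ... use 'value' ...
        }
    }
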
index 4f7cff946f1d58c6e7471028005dcdc3b389fe39..28316f8ba9f11b7b04c21ed484a12e37a3702642 100644 (file)
@@ -331,3 +331,5 @@ void destroyParamLookupTable(T                       *d_ptr,
  */
 template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const texture<float, 1, cudaReadModeElementType> *, const float *, int, const gmx_device_info_t *);
 template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const texture<float, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
+template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const texture<int, 1, cudaReadModeElementType> *, const int *, int, const gmx_device_info_t *);
+template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const texture<int, 1, cudaReadModeElementType> *, const gmx_device_info_t *);
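
The added int instantiations let integer tables be set up the same way as the float ones. A hedged host-side sketch with hypothetical names, matching the instantiated signatures (indexTableTexRef is assumed to be a file-scope texture<int, 1, cudaReadModeElementType>, hostIndexTable a std::vector<int>):

    int                *d_indexTable = nullptr; // device buffer, allocated and filled by the call
    cudaTextureObject_t indexTexObj  = 0;       // texture object, initialized where supported
    initParamLookupTable<int>(d_indexTable, indexTexObj, &indexTableTexRef,
                              hostIndexTable.data(), static_cast<int>(hostIndexTable.size()), deviceInfo);
    // ... kernels read the table via fetchFromParamLookupTable() ...
    destroyParamLookupTable<int>(d_indexTable, indexTexObj, &indexTableTexRef, deviceInfo);
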
index 3402d20a4aeeb5a1893959119c0a69c7f7795325..bea1e4a0bd908df73a11cd8b899c413df7ca84e2 100644 (file)
@@ -157,7 +157,9 @@ void cu_realloc_buffered(void **d_dest, void *h_src,
                          bool bAsync);
 
 // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
-// GPU table object. We just need to add a templated __device__ table data fetching to complete it.
+// GPU table object. There is also an almost self-contained fetchFromParamLookupTable()
+// in cuda_kernel_utils.cuh. They could all live in a separate class/struct file,
+// provided that storing static texture references there does not pose problems.
 
 /*! \brief Initialize parameter lookup table.
  *
index 000f85766fea4de8eb92d8a50046819b578b12e5..ef9cb744bbbabd541c7b61a81d150e8b879a32cf 100644 (file)
@@ -204,14 +204,7 @@ static void exitUnlessUserGpuTaskAssignmentIsValid(const gmx_gpu_info_t   &gpu_i
     }
 }
 
-/*! \brief Filter the compatible GPUs
- *
- * This function filters gpu_info.gpu_dev for compatible GPUs based
- * on the previously run compatibility tests.
- *
- * \param[in]     gpu_info    Information detected about GPUs, including compatibility
- * \return                    vector of IDs of GPUs already recorded as compatible */
-static std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
+std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
 {
     // Possible minor over-allocation here, but not important for anything
     std::vector<int> compatibleGpus;
index d1793c8fe5496ac85dafa8acd03eb3d0f181458c..d8d3952bc9236766a4114461eb96c06648c6b3a0 100644 (file)
@@ -74,6 +74,15 @@ class MDLogger;
  */
 std::vector<int> parseGpuTaskAssignment(const std::string &gpuTaskAssignment);
 
+/*! \brief Filter the compatible GPUs
+ *
+ * This function filters gpu_info.gpu_dev for compatible GPUs based
+ * on the previously run compatibility tests.
+ *
+ * \param[in]     gpu_info    Information detected about GPUs, including compatibility
+ * \return                    vector of IDs of GPUs already recorded as compatible */
+std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info);
+
 /*! \brief Assign PP ranks to valid GPU IDs.
  *
  * Will return a validated mapping from PP ranks (ie tasks that can