Update PME CUDA spread/gather
author    Jonathan Vincent <jvincent@nvidia.com>
Wed, 2 Jan 2019 16:59:18 +0000 (08:59 -0800)
committer Szilárd Páll <pall.szilard@gmail.com>
Wed, 30 Oct 2019 12:21:23 +0000 (13:21 +0100)
Adds additional templated kernels to the CUDA spread and
gather kernels, allowing the use of 4 threads per atom instead of
16 and allowing the spline data to be recalculated in the gather
instead of saved to global memory in the spread and reloaded.

These combinations mean we have 4 different kernels that can be called
depending on which is most appropriate for the problem size and
hardware (to be decided heuristically). By default the existing method is
used (16 threads per atom, saving and reloading of spline data).

Also adds an option to disable the prefetching of charges and
coordinates into shared memory, so that each thread instead loads
the data for the single atom it processes directly from global memory.

Removed the (currently disabled) PME_GPU_PARALLEL_SPLINE=1 code
path.

Refs #2792 #3185 #3186 #3187 #3188

Change-Id: Ia48d8eb63e38d0d23eefd755dcc228ff9b66d3e6

src/gromacs/ewald/pme_calculate_splines.cuh [new file with mode: 0644]
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_gpu_constants.h
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_program_impl.cu
src/gromacs/ewald/pme_gpu_program_impl.h
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
src/gromacs/ewald/pme_spread.cu

diff --git a/src/gromacs/ewald/pme_calculate_splines.cuh b/src/gromacs/ewald/pme_calculate_splines.cuh
new file mode 100644 (file)
index 0000000..7220e1e
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Implements helper routines for PME gather and spline routines.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+
+#include "pme.cuh"
+#include "pme_gpu_utils.h"
+#include "pme_grid.h"
+
+//! Controls if the atom data (coordinates and charges) is prefetched into shared memory or loaded per thread from global memory
+static const bool c_useAtomDataPrefetch = true;
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam[in] T                 Data type (float/int/...)
+ * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
+ * \param[in]  kernelParams      Input PME CUDA data in constant memory.
+ * \param[out] sm_destination    Shared memory array for output.
+ * \param[in]  gm_source         Global memory array for input.
+ */
+template<typename T,
+         const int atomsPerBlock,
+         const int dataCountPerAtom>
+__device__  __forceinline__
+void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams       kernelParams,
+                             T * __restrict__                   sm_destination,
+                             const T * __restrict__             gm_source)
+{
+    static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+    const int blockIndex       = blockIdx.y * gridDim.x + blockIdx.x;
+    const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
+    const int localIndex       = threadLocalIndex;
+    const int globalIndexBase  = blockIndex * atomsPerBlock * dataCountPerAtom;
+    const int globalIndex      = globalIndexBase + localIndex;
+    const int globalCheck      = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
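+    /* Each thread stages at most one value; the block cooperatively copies the atomsPerBlock * dataCountPerAtom values of its atoms. */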
+    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+    {
+        assert(isfinite(float(gm_source[globalIndex])));
+        sm_destination[localIndex] = gm_source[globalIndex];
+    }
+}
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
+ *                                  in the sizes of the shared memory arrays.
+ * \tparam[in] atomsPerWarp         Number of atoms processed by a warp
+ * \tparam[in] writeSmDtheta        Bool controlling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridline indices should be written to global memory. Enables calculation of dtheta if set.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
+ * \param[in]  atomX                Atom coordinate of atom processed by thread.
+ * \param[in]  atomCharge           Atom charge/coefficient of atom processed by thread.
+ * \param[out] sm_theta             Atom spline values in the shared memory.
+ * \param[out] sm_dtheta            Derivative of atom spline values in shared memory.
+ * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
+ */
+
+template <const int order,
+          const int atomsPerBlock,
+          const int atomsPerWarp,
+          const bool writeSmDtheta,
+          const bool writeGlobal>
+__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams            kernelParams,
+                                                  const int                               atomIndexOffset,
+                                                  const float3                            atomX,
+                                                  const float                             atomCharge,
+                                                  float * __restrict__                    sm_theta,
+                                                  float * __restrict__                    sm_dtheta,
+                                                  int * __restrict__                      sm_gridlineIndices)
+{
+    /* Global memory pointers for output */
+    float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
+    float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
+    int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
+
+    /* Fractional coordinates */
+    __shared__ float sm_fractCoords[atomsPerBlock * DIM];
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+        + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / warp_size;
+    /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+    const int atomWarpIndex = threadIdx.z % atomsPerWarp;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+
+    /* Atom index w.r.t. global memory */
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+    /* Spline contribution index in one dimension */
+    const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
+    const int orderIndex      = threadLocalIdXY / DIM;
+    /* Dimension index */
+    const int dimIndex = threadLocalIdXY % DIM;
+
+    /* Multi-purpose index of rvec/ivec atom data */
+    const int        sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
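+    /* Thread-local storage for the B-spline values of one dimension of one atom */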
+    float            splineData[order];
+
+    const int        localCheck  = (dimIndex < DIM) && (orderIndex <  1);
+    const int        globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+
+    /* we have 4 threads per atom, but can only use 3 here for the dimensions */
+    if (localCheck && globalCheck)
+    {
+        /* Indices interpolation */
+
+        if (orderIndex == 0)
+        {
+            int           tableIndex, tInt;
+            float         n, t;
+            assert(atomIndexLocal < DIM*atomsPerBlock);
+            /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
+             * puts them into local memory(!) instead of accessing the constant memory directly.
+             * That's the reason for the switch, to unroll explicitly.
+             * The commented parts correspond to the 0 components of the recipbox.
+             */
+            switch (dimIndex)
+            {
+                case XX:
+                    tableIndex  = kernelParams.grid.tablesOffsets[XX];
+                    n           = kernelParams.grid.realGridSizeFP[XX];
+                    t           = atomX.x * kernelParams.current.recipBox[dimIndex][XX] + atomX.y * kernelParams.current.recipBox[dimIndex][YY] + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+                    break;
+
+                case YY:
+                    tableIndex  = kernelParams.grid.tablesOffsets[YY];
+                    n           = kernelParams.grid.realGridSizeFP[YY];
+                    t           = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + */ atomX.y * kernelParams.current.recipBox[dimIndex][YY] + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+                    break;
+
+                case ZZ:
+                    tableIndex  = kernelParams.grid.tablesOffsets[ZZ];
+                    n           = kernelParams.grid.realGridSizeFP[ZZ];
+                    t           = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + atomX.y * kernelParams.current.recipBox[dimIndex][YY] + */ atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+                    break;
+            }
+            const float shift = c_pmeMaxUnitcellShift;
+            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+            t    = (t + shift) * n;
+            tInt = (int)t;
+            assert(sharedMemoryIndex < atomsPerBlock*DIM);
+            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            tableIndex                       += tInt;
+            assert(tInt >= 0);
+            assert(tInt < c_pmeNeighborUnitcellCount * n);
+
+            // TODO have shared table for both parameters to share the fetch, as index is always same?
+            // TODO compare texture/LDG performance
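+            /* Look up the fractional coordinate shift and the corresponding gridline index for this table entry */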
+            sm_fractCoords[sharedMemoryIndex] +=
+                fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
+                                          kernelParams.fractShiftsTableTexture,
+                                          tableIndex);
+            sm_gridlineIndices[sharedMemoryIndex] =
+                fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
+                                          kernelParams.gridlineIndicesTableTexture,
+                                          tableIndex);
+            if (writeGlobal)
+            {
+                gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
+            }
+        }
+
+        /* B-spline calculation */
+
+        const int chargeCheck = pme_gpu_check_atom_charge(atomCharge);
+        if (chargeCheck)
+        {
+            float       div;
+            int         o = orderIndex; // used below as the loop index over the spline data
+
+            const float dr = sm_fractCoords[sharedMemoryIndex];
+            assert(isfinite(dr));
+
+            /* dr is relative offset from lower cell limit */
+            splineData[order-1] = 0.0f;
+            splineData[1]       = dr;
+            splineData[0]       = 1.0f - dr;
+
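+            /* Standard B-spline order-raising recursion: starting from the order-2 values above,
+             * each iteration raises the spline order by one, up to order-1. The final step to the
+             * full order follows the dtheta computation below, which needs the order-1 values. */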
+#pragma unroll
+            for (int k = 3; k < order; k++)
+            {
+                div                     = 1.0f / (k - 1.0f);
+                splineData[k - 1]       = div * dr * splineData[k - 2];
+#pragma unroll
+                for (int l = 1; l < (k - 1); l++)
+                {
+                    splineData[k - l - 1] = div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1] );
+                }
+                splineData[0] = div * (1.0f - dr) * splineData[0];
+            }
+
+            const int thetaIndexBase        = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+            /* only calculate dtheta if we are saving it to shared or global memory */
+            if (writeSmDtheta || writeGlobal)
+            {
+                /* Differentiation and storing the spline derivatives (dtheta) */
+#pragma unroll
+                for (o = 0; o < order; o++)
+                {
+                    const int   thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+
+                    const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o];
+                    assert(isfinite(dtheta));
+                    assert(thetaIndex < order*DIM*atomsPerBlock);
+                    if (writeSmDtheta)
+                    {
+                        sm_dtheta[thetaIndex] = dtheta;
+                    }
+                    if (writeGlobal)
+                    {
+                        const int   thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+                        gm_dtheta[thetaGlobalIndex] = dtheta;
+                    }
+                }
+            }
+
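+            /* Final order-raising step: brings the B-spline values from order-1 to the full interpolation order */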
+            div                   = 1.0f / (order - 1.0f);
+            splineData[order - 1] = div * dr * splineData[order - 2];
+#pragma unroll
+            for (int k = 1; k < (order - 1); k++)
+            {
+                splineData[order - k - 1] = div * ((dr + k) * splineData[order - k - 2] + (order - k - dr) * splineData[order - k - 1]);
+            }
+            splineData[0] = div * (1.0f - dr) * splineData[0];
+
+            /* Storing the spline values (theta) */
+#pragma unroll
+            for (o = 0; o < order; o++)
+            {
+                const int thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+                assert(thetaIndex < order*DIM*atomsPerBlock);
+                sm_theta[thetaIndex]       = splineData[o];
+                assert(isfinite(sm_theta[thetaIndex]));
+                if (writeGlobal)
+                {
+                    const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+                    gm_theta[thetaGlobalIndex] = splineData[o];
+                }
+            }
+        }
+    }
+}
index be34c3c6cda751526bb88c070d2276f4cbb648bd..ead734c687ed57f99d855c724442cf1d0f4fd762 100644 (file)
 
 #include <cassert>
 
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+
 #include "pme.cuh"
+#include "pme_calculate_splines.cuh"
 #include "pme_gpu_utils.h"
+#include "pme_grid.h"
 
 /*! \brief
  * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
@@ -216,13 +220,17 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
  *                                  False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
  * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
  * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
+ * \tparam[in] readGlobal           Tells if we should read spline values from global memory
+ * \tparam[in] useOrderThreads      Tells if we should use order threads per atom (order*order used if false)
  * \param[in]  kernelParams         All the PME GPU data.
  */
 template <
     const int order,
     const bool overwriteForces,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool readGlobal,
+    const bool useOrderThreads
     >
 __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP)
 __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
@@ -230,19 +238,27 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
     /* Global memory pointers */
     const float * __restrict__  gm_coefficients     = kernelParams.atoms.d_coefficients;
     const float * __restrict__  gm_grid             = kernelParams.grid.d_realGrid;
+    float * __restrict__        gm_forces           = kernelParams.atoms.d_forces;
+
+    /* Global memory pointers for readGlobal */
     const float * __restrict__  gm_theta            = kernelParams.atoms.d_theta;
     const float * __restrict__  gm_dtheta           = kernelParams.atoms.d_dtheta;
     const int * __restrict__    gm_gridlineIndices  = kernelParams.atoms.d_gridlineIndices;
-    float * __restrict__        gm_forces           = kernelParams.atoms.d_forces;
+
+    float3 atomX;
+    float  atomCharge;
 
     /* Some sizes */
-    const int    atomsPerBlock  = (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
-    const int    atomDataSize   = c_pmeSpreadGatherThreadsPerAtom; /* Number of data components and threads for a single atom */
-    const int    atomsPerWarp   = c_pmeSpreadGatherAtomsPerWarp;
+    const int    atomsPerBlock   = useOrderThreads ? (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom) :
+        (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
+    const int    blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
+
+    /* Number of data components and threads for a single atom */
+    const int    atomDataSize   = useOrderThreads ? c_pmeSpreadGatherThreadsPerAtom4ThPerAtom : c_pmeSpreadGatherThreadsPerAtom;
+    const int    atomsPerWarp   = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom : c_pmeSpreadGatherAtomsPerWarp;
 
     const int    blockSize      = atomsPerBlock * atomDataSize;
     assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
-    const int    blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
 
     /* These are the atom indices - for the shared and global memory */
     const int         atomIndexLocal    = threadIdx.z;
@@ -256,46 +272,89 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
     {
         return;
     }
-
+    /* Shared-memory sizes for the spline parameters and gridline indices */
     const int         splineParamsSize             = atomsPerBlock * DIM * order;
     const int         gridlineIndicesSize          = atomsPerBlock * DIM;
     __shared__ int    sm_gridlineIndices[gridlineIndicesSize];
-    __shared__ float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs  as .x/.y */
+    __shared__ float  sm_theta[splineParamsSize];
+    __shared__ float  sm_dtheta[splineParamsSize];
 
-    /* Spline Y/Z coordinates */
-    const int ithy = threadIdx.y;
+    /* Spline Z coordinates */
     const int ithz = threadIdx.x;
 
     /* These are the spline contribution indices in shared memory */
-    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;                  /* Relative to the current particle , 0..15 for order 4 */
-    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
-
-    int       threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
-        + (threadIdx.y * blockDim.x)
-        + threadIdx.x;
-
-    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
-    const int localGridlineIndicesIndex  = threadLocalId;
-    const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
-    const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
-    if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+    const int       splineIndex =  threadIdx.y * blockDim.x + threadIdx.x;
+    const int       lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
+
+    const int       threadLocalId    = (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x*threadIdx.y   + threadIdx.x;
+    const int       threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
+
+    if (readGlobal)
     {
-        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
-        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+        /* Read splines */
+        const int localGridlineIndicesIndex  = threadLocalId;
+        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
+        const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
+        if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+        {
+            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
+            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+        }
+        /* The loop is needed for order threads per atom to make sure we load all data values, as each
+           thread has to load multiple values; with order*order threads per atom each thread only needs
+           to load a single value. */
+
+        const int iMin = 0;
+        const int iMax = useOrderThreads ? 3 : 1;
+
+        for (int i = iMin; i < iMax; i++)
+        {
+            int localSplineParamsIndex  = threadLocalId + i*threadLocalIdMax;   /* i will always be zero for order*order threads per atom */
+            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
+            int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
+            if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+            {
+                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
+                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
+                assert(isfinite(sm_theta[localSplineParamsIndex]));
+                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
+            }
+        }
+        __syncthreads();
     }
-    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
-    const int localSplineParamsIndex  = threadLocalId;
-    const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
-    const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
-    if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+    else
     {
-        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
-        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
-        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
-        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
-    }
-    __syncthreads();
+        /* Recalculate splines */
+        if (c_useAtomDataPrefetch)
+        {
+            //charges
+            __shared__ float sm_coefficients[atomsPerBlock];
+            // Coordinates
+            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            /* Staging coefficients/charges */
+            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+
+            /* Staging coordinates */
+            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+            __syncthreads();
+            atomX.x    = sm_coordinates[atomIndexLocal*DIM+XX];
+            atomX.y    = sm_coordinates[atomIndexLocal*DIM+YY];
+            atomX.z    = sm_coordinates[atomIndexLocal*DIM+ZZ];
+            atomCharge = sm_coefficients[atomIndexLocal];
 
+        }
+        else
+        {
+            atomCharge =  gm_coefficients[atomIndexGlobal];
+            atomX.x    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + XX];
+            atomX.y    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + YY];
+            atomX.z    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + ZZ];
+        }
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(kernelParams, atomIndexOffset,
+                                                                           atomX, atomCharge,
+                                                                           sm_theta, sm_dtheta,
+                                                                           sm_gridlineIndices);
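+        /* The spline data for an atom is written and read by threads of the same warp, so a warp-level synchronization is sufficient here. */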
+        __syncwarp();
+    }
     float           fx = 0.0f;
     float           fy = 0.0f;
     float           fz = 0.0f;
@@ -315,43 +374,52 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
         const int    warpIndex     = atomIndexLocal / atomsPerWarp;
 
         const int    splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
-        const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
-        const float2 tdy             = sm_splineParams[splineIndexY];
         const int    splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
-        const float2 tdz             = sm_splineParams[splineIndexZ];
+        const float2 tdz             =  make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ] );
 
+        int          iz             = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
         const int    ixBase         = sm_gridlineIndices[atomIndexLocal * DIM + XX];
-        int          iy             = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
-        if (wrapY & (iy >= ny))
-        {
-            iy -= ny;
-        }
-        int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+
         if (iz >= nz)
         {
             iz -= nz;
         }
-        const int constOffset    = iy * pnz + iz;
+        int       constOffset, iy;
 
-#pragma unroll
-        for (int ithx = 0; (ithx < order); ithx++)
+        const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
+        const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
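+        /* With order threads per atom each thread loops over all order values of ithy;
+           with order*order threads per atom each thread handles only the single ithy given by threadIdx.y. */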
+        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
         {
-            int ix = ixBase + ithx;
-            if (wrapX & (ix >= nx))
+            const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+            const float2 tdy             =  make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY] );
+
+            iy             = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+            if (wrapY & (iy >= ny))
             {
-                ix -= nx;
+                iy -= ny;
+            }
+            constOffset    = iy * pnz + iz;
+
+#pragma unroll
+            for (int ithx = 0; (ithx < order); ithx++)
+            {
+                int ix = ixBase + ithx;
+                if (wrapX & (ix >= nx))
+                {
+                    ix -= nx;
+                }
+                const int     gridIndexGlobal  = ix * pny * pnz + constOffset;
+                assert(gridIndexGlobal >= 0);
+                const float   gridValue    = gm_grid[gridIndexGlobal];
+                assert(isfinite(gridValue));
+                const int     splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+                const float2  tdx          = make_float2( sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
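+                /* Force contributions: the derivative (dtheta) factor is used for the dimension being differentiated, theta for the other two */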
+                const float   fxy1         = tdz.x * gridValue;
+                const float   fz1          = tdz.y * gridValue;
+                fx += tdx.y * tdy.x * fxy1;
+                fy += tdx.x * tdy.y * fxy1;
+                fz += tdx.x * tdy.x * fz1;
             }
-            const int     gridIndexGlobal  = ix * pny * pnz + constOffset;
-            assert(gridIndexGlobal >= 0);
-            const float   gridValue    = gm_grid[gridIndexGlobal];
-            assert(isfinite(gridValue));
-            const int     splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
-            const float2  tdx          = sm_splineParams[splineIndexX];
-            const float   fxy1         = tdz.x * gridValue;
-            const float   fz1          = tdz.y * gridValue;
-            fx += tdx.y * tdy.x * fxy1;
-            fy += tdx.x * tdy.y * fxy1;
-            fz += tdx.x * tdy.x * fz1;
         }
     }
 
@@ -410,5 +478,11 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
 }
 
 //! Kernel instantiations
-template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, false, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, false, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, false, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, false, false>(const PmeGpuCudaKernelParams);
index 695a9e220caa252630287dba0349d9ebdb28a61a..cbbab88c7c52d91129936eca6cf00677d5d2aac4 100644 (file)
@@ -125,14 +125,20 @@ constexpr int c_pmeGpuOrder = 4;
  * The assumption is currently that any thread processes only a single atom's contributions.
  * TODO: this assumption leads to minimum execution width of 16. See Redmine #2516
  */
-constexpr int c_pmeSpreadGatherThreadsPerAtom = (c_pmeGpuOrder * c_pmeGpuOrder);
+constexpr int c_pmeSpreadGatherThreadsPerAtom  = c_pmeGpuOrder*c_pmeGpuOrder;
+
+//! Number of threads per atom when order (i.e. 4) threads per atom are used instead of order^2
+constexpr int c_pmeSpreadGatherThreadsPerAtom4ThPerAtom = c_pmeGpuOrder;
 
 /*! \brief Minimum execution width of the PME spread and gather kernels.
  *
  * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
  * should execute without synchronization needed. See c_pmeSpreadGatherThreadsPerAtom
  */
-constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
+constexpr int c_pmeSpreadGatherMinWarpSize  = c_pmeSpreadGatherThreadsPerAtom;
+
+//! Minimum warp size if order threads per atom are used instead of order^2
+constexpr int c_pmeSpreadGatherMinWarpSize4ThPerAtom = c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
 
 /*! \brief
  * Atom data alignment (in terms of number of atoms).
@@ -142,7 +148,7 @@ constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
  * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
  * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
  */
-constexpr int c_pmeAtomDataAlignment = 32;
+constexpr int c_pmeAtomDataAlignment = 64;
 
 /*
  * The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
@@ -176,7 +182,10 @@ constexpr int c_gatherMaxWarpsPerBlock = 4;
  * This macro depends on the templated order parameter (2 atoms per warp for order 4 and warp_size of 32).
  * It is mostly used for spline data layout tweaked for coalesced access.
  */
-constexpr int c_pmeSpreadGatherAtomsPerWarp = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
+constexpr int c_pmeSpreadGatherAtomsPerWarp  = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
+
+//! Number of atoms per warp when order threads per atom are used
+constexpr int c_pmeSpreadGatherAtomsPerWarp4ThPerAtom = (warp_size / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom);
 
 //! Spreading max block size in threads
 constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
index a3a25c408b027815ff9123c7d5d6b7edc38231de..caa7911409b6bcf5c3bc72fe0368d9284da3d560 100644 (file)
 #include "pme_internal.h"
 #include "pme_solve.h"
 
+
+/*! \brief
+ *  CUDA only.
+ *  Controls if we should use order (i.e. 4) threads per atom for the GPU
+ *  or order*order (i.e. 16) threads per atom.
+ */
+constexpr bool c_useOrderThreadsPerAtom = false;
+/*! \brief
+ * CUDA only.
+ * Controls if we should recalculate the splines in the gather or
+ * save the values in the spread and reload in the gather.
+ */
+constexpr bool c_recalculateSplines = false;
+
 /*! \internal \brief
  * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
  *
@@ -117,7 +131,14 @@ int pme_gpu_get_atom_data_alignment(const PmeGpu * /*unused*/)
 
 int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
 {
-    return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
+    if (c_useOrderThreadsPerAtom)
+    {
+        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
+    }
+    else
+    {
+        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
+    }
 }
 
 void pme_gpu_synchronize(const PmeGpu *pmeGpu)
@@ -1015,6 +1036,114 @@ std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount
     return std::pair<int, int>(colCount, minRowCount);
 }
 
+/*! \brief
+ * Returns a pointer to the appropriate spline and spread kernel based on the input bool values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSplineAndSpreadKernelPtr(const PmeGpu *pmeGpu,
+                                           bool useOrderThreadsPerAtom,
+                                           bool writeSplinesToGlobal)
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+    if (writeSplinesToGlobal)
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
+        }
+    }
+    else
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+        }
+    }
+
+    return kernelPtr;
+}
+
+/*! \brief
+ * Returns a pointer to the appropriate spline kernel based on the input bool values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSplineKernelPtr(const PmeGpu *pmeGpu,
+                                  bool useOrderThreadsPerAtom,
+                                  bool gmx_unused writeSplinesToGlobal)
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+    GMX_ASSERT(writeSplinesToGlobal, "Spline data should always be written to global memory when just calculating splines");
+
+    if (useOrderThreadsPerAtom)
+    {
+        kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
+    }
+    else
+    {
+        kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
+    }
+    return kernelPtr;
+}
+
+/*! \brief
+ * Returns a pointer to the appropriate spread kernel based on the input bool values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSpreadKernelPtr(const PmeGpu *pmeGpu,
+                                  bool useOrderThreadsPerAtom,
+                                  bool writeSplinesToGlobal)
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+    if (writeSplinesToGlobal)
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
+        }
+    }
+    else
+    {
+        /* If we are not saving the spline data, we need to recalculate it
+           using the spline-and-spread kernel */
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+        }
+    }
+    return kernelPtr;
+}
+
 void pme_gpu_spread(const PmeGpu         *pmeGpu,
                     GpuEventSynchronizer *xReadyOnDevice,
                     int gmx_unused        gridIndex,
@@ -1030,13 +1159,20 @@ void pme_gpu_spread(const PmeGpu         *pmeGpu,
 
     const int    order         = pmeGpu->common->pme_order;
     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
-    const int    atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+    const bool   writeGlobal   = pmeGpu->settings.copyAllOutputs;
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(!c_useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
+    GMX_ASSERT(!c_recalculateSplines, "Recalculating splines not supported in OpenCL");
+#endif
+    const int    atomsPerBlock = c_useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+        blockSize / c_pmeSpreadGatherThreadsPerAtom;
+
     // TODO: pick smaller block size in runtime if needed
     // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
     // If doing so, change atomsPerBlock in the kernels as well.
     // TODO: test varying block sizes on modern arch-s as well
     // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
-    //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
+    //(for spline data mostly)
     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
 
     // Ensure that coordinates are ready on the device before launching spread;
@@ -1054,7 +1190,8 @@ void pme_gpu_spread(const PmeGpu         *pmeGpu,
     auto               dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
     KernelLaunchConfig config;
-    config.blockSize[0] = config.blockSize[1] = order;
+    config.blockSize[0] = order;
+    config.blockSize[1] = c_useOrderThreadsPerAtom ? 1 : order;
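+    // The block is (order) x (1 or order) x (atomsPerBlock) threads, i.e. order or order*order threads per atom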
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0]  = dimGrid.first;
     config.gridSize[1]  = dimGrid.second;
@@ -1067,20 +1204,27 @@ void pme_gpu_spread(const PmeGpu         *pmeGpu,
         if (spreadCharges)
         {
             timingId  = gtPME_SPLINEANDSPREAD;
-            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+            kernelPtr = selectSplineAndSpreadKernelPtr( pmeGpu,
+                                                        c_useOrderThreadsPerAtom,
+                                                        writeGlobal || (!c_recalculateSplines));
         }
         else
         {
             timingId  = gtPME_SPLINE;
-            kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
+            kernelPtr = selectSplineKernelPtr( pmeGpu,
+                                               c_useOrderThreadsPerAtom,
+                                               writeGlobal || (!c_recalculateSplines));
         }
     }
     else
     {
         timingId  = gtPME_SPREAD;
-        kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
+        kernelPtr = selectSpreadKernelPtr( pmeGpu,
+                                           c_useOrderThreadsPerAtom,
+                                           writeGlobal || (!c_recalculateSplines));
     }
 
+
     pme_gpu_start_timing(pmeGpu, timingId);
     auto      *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
 #if c_canEmbedBuffers
@@ -1219,6 +1363,58 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
     }
 }
 
+/*! \brief
+ * Returns a pointer to the appropriate gather kernel based on the input values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  readSplinesFromGlobal    bool controlling if we should read spline data from global memory
+ * \param[in]  forceTreatment           Controls if the forces from the gather should increment or replace the input forces.
+ *
+ * \return Pointer to CUDA kernel
+ */
+inline auto selectGatherKernelPtr(const PmeGpu *pmeGpu,
+                                  bool useOrderThreadsPerAtom,
+                                  bool readSplinesFromGlobal,
+                                  PmeForceOutputHandling forceTreatment)
+
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+
+    if (readSplinesFromGlobal)
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4 :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplinesThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernelReadSplines :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplines;
+        }
+    }
+    else
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4 :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernel :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
+        }
+    }
+    return kernelPtr;
+}
+
+
 void pme_gpu_gather(PmeGpu                *pmeGpu,
                     PmeForceOutputHandling forceTreatment,
                     const float           *h_grid)
@@ -1239,19 +1435,27 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
         pme_gpu_copy_input_gather_atom_data(pmeGpu);
     }
 
+    /* All outputs are copied (and hence splines read back from global memory) e.g. when running unit tests */
+    const bool   readGlobal    = pmeGpu->settings.copyAllOutputs;
     const size_t blockSize     = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
-    const int    atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(!c_useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
+    GMX_ASSERT(!c_recalculateSplines, "Recalculating splines not supported in OpenCL");
+#endif
+    const int    atomsPerBlock = c_useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+        blockSize / c_pmeSpreadGatherThreadsPerAtom;
+
     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
 
     const int          blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto               dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
-
-    const int order = pmeGpu->common->pme_order;
+    const int          order = pmeGpu->common->pme_order;
     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
 
     KernelLaunchConfig config;
-    config.blockSize[0] = config.blockSize[1] = order;
+    config.blockSize[0] = order;
+    config.blockSize[1] = c_useOrderThreadsPerAtom ? 1 : order;
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0]  = dimGrid.first;
     config.gridSize[1]  = dimGrid.second;
@@ -1260,10 +1464,11 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
     // TODO test different cache configs
 
     int  timingId = gtPME_GATHER;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr( pmeGpu,
+                                                                          c_useOrderThreadsPerAtom,
+                                                                          readGlobal || (!c_recalculateSplines),
+                                                                          forceTreatment);
     // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
-    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
-        pmeGpu->programHandle_->impl_->gatherKernel :
-        pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
 
     pme_gpu_start_timing(pmeGpu, timingId);
     auto       *timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
index 087d85f50e51c523b9f280f36775423f879a6ace..347d69f9102adbd4d9b55db1caa1763c7cc1f80b 100644 (file)
@@ -61,18 +61,30 @@ template <
     const bool computeSplines,
     const bool spreadCharges,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool writeGlobal,
+    const bool orderThreads
     >
 void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
 
 // Add extern declarations to inform that there will be a definition
 // provided in another translation unit.
 extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
 extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
 extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
 
 template<
     GridOrdering gridOrdering,
@@ -95,16 +107,30 @@ template <
     const int order,
     const bool overwriteForces,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool readGlobal,
+    const bool orderThreads
     >
 void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
 
 // Add extern declarations to inform that there will be a definition
 // provided in another translation unit.
 extern template
-void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
 extern template
-void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
 
 PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
 {
@@ -114,15 +140,31 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
     solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock;
     gatherWorkGroupSize   = c_gatherMaxThreadsPerBlock;
 
-    splineAndSpreadKernel       = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>;
-    splineKernel                = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>;
-    spreadKernel                = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>;
-    gatherKernel                = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>;
-    gatherReduceWithInputKernel = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>;
-    solveXYZKernel              = pme_solve_kernel<GridOrdering::XYZ, false>;
-    solveXYZEnergyKernel        = pme_solve_kernel<GridOrdering::XYZ, true>;
-    solveYZXKernel              = pme_solve_kernel<GridOrdering::YZX, false>;
-    solveYZXEnergyKernel        = pme_solve_kernel<GridOrdering::YZX, true>;
+/*!
+ * Not all combinations of the splineAndSpread, spline and spread kernels are required.
+ * If only the spline kernel is run (without the spread), it never makes sense not to write the spline data to global memory.
+ * Similarly, running the spread kernel on its own (without the spline) implies that the spline data is read from global memory.
+ */
+    splineAndSpreadKernel                             = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, false>;
+    splineAndSpreadKernelThPerAtom4                   = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, true>;
+    splineAndSpreadKernelWriteSplines                 = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, false>;
+    splineAndSpreadKernelWriteSplinesThPerAtom4       = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, true>;
+    splineKernel                                      = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, false>;
+    splineKernelThPerAtom4                            = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, true>;
+    spreadKernel                                      = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, false>;
+    spreadKernelThPerAtom4                            = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, true>;
+    gatherKernel                                      = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, false>;
+    gatherKernelThPerAtom4                            = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, true>;
+    gatherKernelReadSplines                           = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, false>;
+    gatherKernelReadSplinesThPerAtom4                 = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, true>;
+    gatherReduceWithInputKernel                       = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, false>;
+    gatherReduceWithInputKernelThPerAtom4             = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, true>;
+    gatherReduceWithInputKernelReadSplines            = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, false>;
+    gatherReduceWithInputKernelReadSplinesThPerAtom4  = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, true>;
+    solveXYZKernel                                    = pme_solve_kernel<GridOrdering::XYZ, false>;
+    solveXYZEnergyKernel                              = pme_solve_kernel<GridOrdering::XYZ, true>;
+    solveYZXKernel                                    = pme_solve_kernel<GridOrdering::YZX, false>;
+    solveYZXEnergyKernel                              = pme_solve_kernel<GridOrdering::YZX, true>;
 }
 
 PmeGpuProgramImpl::~PmeGpuProgramImpl()
index bfe4815af78ff658dbf341a411bd2e197ab99520..e45a3110f5d31d741da8176e2f335ee2126542d4 100644 (file)
@@ -99,24 +99,41 @@ struct PmeGpuProgramImpl
     //@{
     /**
      * Spread/spline kernels are compiled only for order of 4.
+     * There are multiple versions of each kernel, parametrized according to:
+     *   - the number of threads per atom: either order (4) or order*order (16) threads per atom are supported;
+     *   - whether the spline data is written by the spline/spread kernel and loaded in the gather,
+     *     or recalculated in the gather.
      * Spreading kernels also have hardcoded X/Y indices wrapping parameters,
      * as a placeholder for implementing 1/2D decomposition.
      */
     size_t          spreadWorkGroupSize;
 
     PmeKernelHandle splineKernel;
+    PmeKernelHandle splineKernelThPerAtom4;
     PmeKernelHandle spreadKernel;
+    PmeKernelHandle spreadKernelThPerAtom4;
     PmeKernelHandle splineAndSpreadKernel;
+    PmeKernelHandle splineAndSpreadKernelThPerAtom4;
+    PmeKernelHandle splineAndSpreadKernelWriteSplines;
+    PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4;
     //@}
 
     //@{
     /** Same for gather: hardcoded X/Y unwrap parameters, order of 4, plus
      * it can either reduce with previous forces in the host buffer, or ignore them.
+     * As with the spread kernels, we can use either order (4) or order*order (16) threads per atom,
+     * and either recalculate the splines or read the ones written by the spread.
      */
     size_t          gatherWorkGroupSize;
 
     PmeKernelHandle gatherReduceWithInputKernel;
+    PmeKernelHandle gatherReduceWithInputKernelThPerAtom4;
     PmeKernelHandle gatherKernel;
+    PmeKernelHandle gatherKernelThPerAtom4;
+    PmeKernelHandle gatherReduceWithInputKernelReadSplines;
+    PmeKernelHandle gatherReduceWithInputKernelReadSplinesThPerAtom4;
+    PmeKernelHandle gatherKernelReadSplines;
+    PmeKernelHandle gatherKernelReadSplinesThPerAtom4;
     //@}
 
     //@{
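The heuristic that picks between these handles is left open here (the commit message notes the choice is to be decided heuristically, with the existing 16-threads-per-atom, write-splines method used by default), but the intended dispatch can be sketched as follows. The handle type, the pared-down struct, and the helper name below are illustrative stand-ins only and do not appear in the patch.

// Illustrative sketch only: maps the two tuning decisions onto the four
// spline-and-spread handles declared in PmeGpuProgramImpl. The handle type,
// struct, and helper name are simplified stand-ins, not GROMACS API.
using PmeKernelHandle = void (*)();  // stand-in for the real kernel handle type

struct SpreadKernels  // subset of the handles declared above
{
    PmeKernelHandle splineAndSpreadKernel;                        // 16 threads/atom, splines recalculated in gather
    PmeKernelHandle splineAndSpreadKernelThPerAtom4;              // 4 threads/atom,  splines recalculated in gather
    PmeKernelHandle splineAndSpreadKernelWriteSplines;            // 16 threads/atom, splines written to global memory
    PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4;  // 4 threads/atom,  splines written to global memory
};

PmeKernelHandle selectSplineAndSpreadKernel(const SpreadKernels& k, bool useOrderThreads, bool writeSplines)
{
    if (writeSplines)
    {
        return useOrderThreads ? k.splineAndSpreadKernelWriteSplinesThPerAtom4
                               : k.splineAndSpreadKernelWriteSplines;
    }
    return useOrderThreads ? k.splineAndSpreadKernelThPerAtom4 : k.splineAndSpreadKernel;
}

In the real code the handles are the PmeKernelHandle members populated in the constructor above on the CUDA path, or in compileKernels() on the OpenCL path.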
index 492128ed3eeaa9e90d852893ff6cc0dbf50367db..423eaf2c8a0e31ab483bcfc5ced08c699c895837 100644 (file)
@@ -222,26 +222,29 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
         // TODO use a map with string key instead?
         if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernel"))
         {
-            splineKernel = kernel;
+            splineKernel             = kernel;
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
         {
-            splineAndSpreadKernel = kernel;
+            splineAndSpreadKernel             = kernel;
+            splineAndSpreadKernelWriteSplines = kernel;
             checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
         {
-            spreadKernel = kernel;
+            spreadKernel             = kernel;
             checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
         {
-            gatherKernel = kernel;
+            gatherKernel            = kernel;
+            gatherKernelReadSplines = kernel;
             checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
         {
-            gatherReduceWithInputKernel = kernel;
+            gatherReduceWithInputKernel            = kernel;
+            gatherReduceWithInputKernelReadSplines = kernel;
             checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
index 4037a71f7683a8db0d72143f5e42a85c8d6e07aa..3fbd4dca0fc57e49a3bbc2a24b421053c4569ec6 100644 (file)
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
 
 #include "pme.cuh"
+#include "pme_calculate_splines.cuh"
 #include "pme_gpu_utils.h"
 #include "pme_grid.h"
 
-/*
- * This define affects the spline calculation behaviour in the kernel.
- * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing (order) spline values and derivatives).
- * 1: (order) threads do redundant work on this same task, each one stores only a single theta and single dtheta into global arrays.
- * The only efficiency difference is less global store operations, countered by more redundant spline computation.
- *
- * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
- */
-#define PME_GPU_PARALLEL_SPLINE 0
-
-
-/*! \brief
- * General purpose function for loading atom-related data from global to shared memory.
- *
- * \tparam[in] T                 Data type (float/int/...)
- * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
- * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
- * \param[in]  kernelParams      Input PME CUDA data in constant memory.
- * \param[out] sm_destination    Shared memory array for output.
- * \param[in]  gm_source         Global memory array for input.
- */
-template<typename T,
-         const int atomsPerBlock,
-         const int dataCountPerAtom>
-__device__  __forceinline__
-void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams       kernelParams,
-                             T * __restrict__                   sm_destination,
-                             const T * __restrict__             gm_source)
-{
-    static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
-    const int blockIndex       = blockIdx.y * gridDim.x + blockIdx.x;
-    const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
-    const int localIndex       = threadLocalIndex;
-    const int globalIndexBase  = blockIndex * atomsPerBlock * dataCountPerAtom;
-    const int globalIndex      = globalIndexBase + localIndex;
-    const int globalCheck      = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
-    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
-    {
-        assert(isfinite(float(gm_source[globalIndex])));
-        sm_destination[localIndex] = gm_source[globalIndex];
-    }
-}
-
-/*! \brief
- * PME GPU spline parameter and gridline indices calculation.
- * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
- * First stage of the whole kernel.
- *
- * \tparam[in] order                PME interpolation order.
- * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
- *                                  in the sizes of the shared memory arrays.
- * \param[in]  kernelParams         Input PME CUDA data in constant memory.
- * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
- * \param[in]  sm_coordinates       Atom coordinates in the shared memory.
- * \param[in]  sm_coefficients      Atom charges/coefficients in the shared memory.
- * \param[out] sm_theta             Atom spline values in the shared memory.
- * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
- */
-template <const int order,
-          const int atomsPerBlock>
-__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams           kernelParams,
-                                                  const int                              atomIndexOffset,
-                                                  const float3 * __restrict__            sm_coordinates,
-                                                  const float * __restrict__             sm_coefficients,
-                                                  float * __restrict__                   sm_theta,
-                                                  int * __restrict__                     sm_gridlineIndices)
-{
-    /* Global memory pointers for output */
-    float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
-    float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
-    int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
-
-    const int            atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
-
-    /* Fractional coordinates */
-    __shared__ float sm_fractCoords[atomsPerBlock * DIM];
-
-    /* Thread index w.r.t. block */
-    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
-        + (threadIdx.y * blockDim.x) + threadIdx.x;
-    /* Warp index w.r.t. block - could probably be obtained easier? */
-    const int warpIndex = threadLocalId / warp_size;
-    /* Thread index w.r.t. warp */
-    const int threadWarpIndex = threadLocalId % warp_size;
-    /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
-    const int atomWarpIndex = threadWarpIndex % atomsPerWarp;
-    /* Atom index w.r.t. block/shared memory */
-    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
-
-    /* Atom index w.r.t. global memory */
-    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
-    /* Spline contribution index in one dimension */
-    const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
-    /* Dimension index */
-    const int dimIndex = (threadWarpIndex - orderIndex * (atomsPerWarp * DIM)) / atomsPerWarp;
-
-    /* Multi-purpose index of rvec/ivec atom data */
-    const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
-
-    /* Spline parameters need a working buffer.
-     * With PME_GPU_PARALLEL_SPLINE == 0 it is just a local array of (order) values for some of the threads, which is fine;
-     * With PME_GPU_PARALLEL_SPLINE == 1 (order) times more threads are involved, so the shared memory is used to avoid overhead.
-     * The buffer's size, striding and indexing are adjusted accordingly.
-     * The buffer is accessed with SPLINE_DATA_PTR and SPLINE_DATA macros.
-     */
-#if PME_GPU_PARALLEL_SPLINE
-    const int        splineDataStride  = atomsPerBlock * DIM;
-    const int        splineDataIndex   = sharedMemoryIndex;
-    __shared__ float sm_splineData[splineDataStride * order];
-    float           *splineDataPtr = sm_splineData;
-#else
-    const int        splineDataStride = 1;
-    const int        splineDataIndex  = 0;
-    float            splineData[splineDataStride * order];
-    float           *splineDataPtr = splineData;
-#endif
-
-#define SPLINE_DATA_PTR(i) (splineDataPtr + ((i) * splineDataStride + splineDataIndex))
-#define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
-
-    const int localCheck  = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
-
-    if (localCheck && globalCheck)
-    {
-        /* Indices interpolation */
-
-        if (orderIndex == 0)
-        {
-            int           tableIndex, tInt;
-            float         n, t;
-            const float3  x = sm_coordinates[atomIndexLocal];
-            /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
-             * puts them into local memory(!) instead of accessing the constant memory directly.
-             * That's the reason for the switch, to unroll explicitly.
-             * The commented parts correspond to the 0 components of the recipbox.
-             */
-            switch (dimIndex)
-            {
-                case XX:
-                    tableIndex  = kernelParams.grid.tablesOffsets[XX];
-                    n           = kernelParams.grid.realGridSizeFP[XX];
-                    t           = x.x * kernelParams.current.recipBox[dimIndex][XX] + x.y * kernelParams.current.recipBox[dimIndex][YY] + x.z * kernelParams.current.recipBox[dimIndex][ZZ];
-                    break;
-
-                case YY:
-                    tableIndex  = kernelParams.grid.tablesOffsets[YY];
-                    n           = kernelParams.grid.realGridSizeFP[YY];
-                    t           = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + */ x.y * kernelParams.current.recipBox[dimIndex][YY] + x.z * kernelParams.current.recipBox[dimIndex][ZZ];
-                    break;
-
-                case ZZ:
-                    tableIndex  = kernelParams.grid.tablesOffsets[ZZ];
-                    n           = kernelParams.grid.realGridSizeFP[ZZ];
-                    t           = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + x.y * kernelParams.current.recipbox[dimIndex][YY] + */ x.z * kernelParams.current.recipBox[dimIndex][ZZ];
-                    break;
-            }
-            const float shift = c_pmeMaxUnitcellShift;
-            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
-            t    = (t + shift) * n;
-            tInt = (int)t;
-            sm_fractCoords[sharedMemoryIndex] = t - tInt;
-            tableIndex                       += tInt;
-            assert(tInt >= 0);
-            assert(tInt < c_pmeNeighborUnitcellCount * n);
-
-            // TODO have shared table for both parameters to share the fetch, as index is always same?
-            // TODO compare texture/LDG performance
-            sm_fractCoords[sharedMemoryIndex] +=
-                fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
-                                          kernelParams.fractShiftsTableTexture,
-                                          tableIndex);
-            sm_gridlineIndices[sharedMemoryIndex] =
-                fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
-                                          kernelParams.gridlineIndicesTableTexture,
-                                          tableIndex);
-            gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
-        }
-
-        /* B-spline calculation */
-
-        const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
-        if (chargeCheck)
-        {
-            float       div;
-            int         o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
-
-            const float dr = sm_fractCoords[sharedMemoryIndex];
-            assert(isfinite(dr));
-
-            /* dr is relative offset from lower cell limit */
-            *SPLINE_DATA_PTR(order - 1) = 0.0f;
-            *SPLINE_DATA_PTR(1)         = dr;
-            *SPLINE_DATA_PTR(0)         = 1.0f - dr;
-
-#pragma unroll
-            for (int k = 3; k < order; k++)
-            {
-                div                     = 1.0f / (k - 1.0f);
-                *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
-#pragma unroll
-                for (int l = 1; l < (k - 1); l++)
-                {
-                    *SPLINE_DATA_PTR(k - l - 1) = div * ((dr + l) * SPLINE_DATA(k - l - 2) + (k - l - dr) * SPLINE_DATA(k - l - 1));
-                }
-                *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
-            }
-
-            const int thetaIndexBase        = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
-            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
-
-            /* Differentiation and storing the spline derivatives (dtheta) */
-#if !PME_GPU_PARALLEL_SPLINE
-            // With PME_GPU_PARALLEL_SPLINE == 1, o is already set to orderIndex;
-            // With PME_GPU_PARALLEL_SPLINE == 0, we loop o over range(order).
-#pragma unroll
-            for (o = 0; o < order; o++)
-#endif
-            {
-                const int   thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
-                const int   thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
-
-                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
-                assert(isfinite(dtheta));
-                gm_dtheta[thetaGlobalIndex] = dtheta;
-            }
-
-            div  = 1.0f / (order - 1.0f);
-            *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
-#pragma unroll
-            for (int k = 1; k < (order - 1); k++)
-            {
-                *SPLINE_DATA_PTR(order - k - 1) = div * ((dr + k) * SPLINE_DATA(order - k - 2) + (order - k - dr) * SPLINE_DATA(order - k - 1));
-            }
-            *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
-
-            /* Storing the spline values (theta) */
-#if !PME_GPU_PARALLEL_SPLINE
-            // See comment for the loop above
-#pragma unroll
-            for (o = 0; o < order; o++)
-#endif
-            {
-                const int thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
-                const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
-
-                sm_theta[thetaIndex]       = SPLINE_DATA(o);
-                assert(isfinite(sm_theta[thetaIndex]));
-                gm_theta[thetaGlobalIndex] = SPLINE_DATA(o);
-            }
-        }
-    }
-}
-
 /*! \brief
  * Charge spreading onto the grid.
  * This corresponds to the CPU function spread_coefficients_bsplines_thread().
- * Second stage of the whole kernel.
+ * Optional second stage of the spline_and_spread_kernel.
  *
  * \tparam[in] order                PME interpolation order.
  * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
  * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] useOrderThreads      A boolean which tells if order threads per atom should be used (order*order are used if false).
  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
  * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
- * \param[in]  sm_coefficients      Atom coefficents/charges in the shared memory.
+ * \param[in]  atomCharge           Atom charge/coefficient of the atom processed by this thread.
  * \param[in]  sm_gridlineIndices   Atom gridline indices in the shared memory.
  * \param[in]  sm_theta             Atom spline values in the shared memory.
  */
 template <
-    const int order, const bool wrapX, const bool wrapY>
+    const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads>
 __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams           kernelParams,
                                                int                                    atomIndexOffset,
-                                               const float * __restrict__             sm_coefficients,
+                                               const float *                          atomCharge,
                                                const int * __restrict__               sm_gridlineIndices,
                                                const float * __restrict__             sm_theta)
 {
@@ -332,7 +80,7 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
     float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
 
 
-    const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
+    const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom : c_pmeSpreadGatherAtomsPerWarp;
 
     const int nx  = kernelParams.grid.realGridSize[XX];
     const int ny  = kernelParams.grid.realGridSize[YY];
@@ -346,24 +94,19 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 
     const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
-    const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+    const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
     if (chargeCheck & globalCheck)
     {
-        // Spline Y/Z coordinates
-        const int ithy   = threadIdx.y;
+        // Spline Z coordinates
         const int ithz   = threadIdx.x;
+
         const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
-        int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy + ithy;
-        if (wrapY & (iy >= ny))
-        {
-            iy -= ny;
-        }
-        int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
+        const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy;
+        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
         if (iz >= nz)
         {
             iz -= nz;
         }
-
         /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
         const int    atomWarpIndex   = atomIndexLocal % atomsPerWarp;
         /* Warp index w.r.t. block - could probably be obtained easier? */
@@ -372,26 +115,39 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
         const int    splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
         const int    splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
         const float  thetaZ          = sm_theta[splineIndexZ];
-        const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
-        const float  thetaY          = sm_theta[splineIndexY];
-        const float  constVal        = thetaZ * thetaY * sm_coefficients[atomIndexLocal];
-        assert(isfinite(constVal));
-        const int    constOffset     = iy * pnz + iz;
 
-#pragma unroll
-        for (int ithx = 0; (ithx < order); ithx++)
+        /* With order threads per atom, loop over all order values of ithy; with order*order threads, run once with ithy = threadIdx.y */
+        const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
+        const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
+        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
         {
-            int ix = ixBase + ithx;
-            if (wrapX & (ix >= nx))
+            int       iy     = iyBase + ithy;
+            if (wrapY & (iy >= ny))
+            {
+                iy -= ny;
+            }
+
+            const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+            float        thetaY          = sm_theta[splineIndexY];
+            const float  Val             = thetaZ * thetaY * (*atomCharge);
+            assert(isfinite(Val));
+            const int    offset     = iy * pnz + iz;
+
+#pragma unroll
+            for (int ithx = 0; (ithx < order); ithx++)
             {
-                ix -= nx;
+                int ix = ixBase + ithx;
+                if (wrapX & (ix >= nx))
+                {
+                    ix -= nx;
+                }
+                const int   gridIndexGlobal = ix * pny * pnz + offset;
+                const int   splineIndexX    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+                const float thetaX          = sm_theta[splineIndexX];
+                assert(isfinite(thetaX));
+                assert(isfinite(gm_grid[gridIndexGlobal]));
+                atomicAdd(gm_grid + gridIndexGlobal, thetaX * Val);
             }
-            const int   gridIndexGlobal = ix * pny * pnz + constOffset;
-            const int   splineIndexX    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
-            const float thetaX          = sm_theta[splineIndexX];
-            assert(isfinite(thetaX));
-            assert(isfinite(gm_grid[gridIndexGlobal]));
-            atomicAdd(gm_grid + gridIndexGlobal, thetaX * constVal);
         }
     }
 }
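To make the new ithyMin/ithyMax bounds concrete, the standalone sketch below (plain host-side code, not part of the patch; order = 4 assumed) enumerates the (ithy, ithz) grid offsets a single thread contributes to in each mode, mirroring the bounds in spread_charges above where ithz comes from threadIdx.x.

// Illustrative only: reproduces the ithy/ithz work split of spread_charges.
// With order*order (16) threads per atom, each thread owns one (ithy, ithz) pair;
// with order (4) threads per atom, each thread owns one ithz and loops over all ithy.
#include <cstdio>

constexpr int order = 4;  // PME interpolation order assumed by these kernels

void printThreadWork(bool useOrderThreads, int threadIdxX, int threadIdxY)
{
    const int ithz    = threadIdxX;                              // as in the kernel: ithz = threadIdx.x
    const int ithyMin = useOrderThreads ? 0 : threadIdxY;        // same bounds as the diff above
    const int ithyMax = useOrderThreads ? order : threadIdxY + 1;
    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
    {
        std::printf("thread (x=%d,y=%d): contributes to (ithy=%d, ithz=%d) for all order ithx values\n",
                    threadIdxX, threadIdxY, ithy, ithz);
    }
}

int main()
{
    printThreadWork(/*useOrderThreads=*/true, /*x=*/2, /*y=*/0);   // order (4) threads/atom: loops over all 4 ithy values
    printThreadWork(/*useOrderThreads=*/false, /*x=*/2, /*y=*/1);  // order*order (16) threads/atom: single ithy from threadIdx.y
    return 0;
}

With order*order threads per atom each thread updates only the order grid points along x for its single (ithy, ithz) pair; with order threads per atom it also loops over all order values of ithy, updating order*order points per thread in exchange for fewer threads per atom.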
@@ -399,12 +155,18 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
 /*! \brief
  * A spline computation and charge spreading kernel function.
  *
+ * Two tuning parameters can be used for additional performance. For small systems and for debugging,
+ * writeGlobal should be used, which removes the need to recalculate the theta values in the gather kernel.
+ * Similarly, useOrderThreads should be used for large systems, where order threads per atom give higher performance than order*order threads.
+ *
  * \tparam[in] order                PME interpolation order.
  * \tparam[in] computeSplines       A boolean which tells if the spline parameter and
  *                                  gridline indices' computation should be performed.
  * \tparam[in] spreadCharges        A boolean which tells if the charge spreading should be performed.
  * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
  * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridline indices should be written to global memory.
+ * \tparam[in] useOrderThreads      A boolean which tells if order threads per atom should be used (order*order used if false).
  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
  */
 template <
@@ -412,22 +174,43 @@ template <
     const bool computeSplines,
     const bool spreadCharges,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool writeGlobal,
+    const bool useOrderThreads
     >
 __launch_bounds__(c_spreadMaxThreadsPerBlock)
 CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE
 __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
 {
-    const int        atomsPerBlock = c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
+    const int            atomsPerBlock = useOrderThreads ? c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+        c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
     // Gridline indices, ivec
-    __shared__ int   sm_gridlineIndices[atomsPerBlock * DIM];
-    // Charges
-    __shared__ float sm_coefficients[atomsPerBlock];
+    __shared__ int       sm_gridlineIndices[atomsPerBlock * DIM];
     // Spline values
-    __shared__ float sm_theta[atomsPerBlock * DIM * order];
+    __shared__ float     sm_theta[atomsPerBlock * DIM * order];
+    float                dtheta;
+
+    const int            atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom :
+        c_pmeSpreadGatherAtomsPerWarp;
 
-    const int        blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
-    const int        atomIndexOffset = blockIndex * atomsPerBlock;
+    float3               atomX;
+    float                atomCharge;
+
+    const int            blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
+    const int            atomIndexOffset = blockIndex * atomsPerBlock;
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+        + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / warp_size;
+
+    /* Atom index w.r.t. warp */
+    const int atomWarpIndex = threadIdx.z % atomsPerWarp;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+    /* Atom index w.r.t. global memory */
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 
     /* Early return for fully empty blocks at the end
      * (should only happen for billions of input atoms)
@@ -436,19 +219,42 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
     {
         return;
     }
-
-    /* Staging coefficients/charges for both spline and spread */
-    pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+    /* Charges, required for both spline and spread */
+    if (c_useAtomDataPrefetch)
+    {
+        __shared__ float sm_coefficients[atomsPerBlock];
+        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+        __syncthreads();
+        atomCharge = sm_coefficients[atomIndexLocal];
+    }
+    else
+    {
+        atomCharge =  kernelParams.atoms.d_coefficients[atomIndexGlobal];
+    }
 
     if (computeSplines)
     {
-        /* Staging coordinates */
-        __shared__ float sm_coordinates[DIM * atomsPerBlock];
-        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
-
-        __syncthreads();
-        calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
-                                                sm_coefficients, sm_theta, sm_gridlineIndices);
+        if (c_useAtomDataPrefetch)
+        {
+            // Coordinates
+            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+
+            /* Staging coordinates */
+            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+            __syncthreads();
+            atomX.x    = sm_coordinates[atomIndexLocal*DIM+XX];
+            atomX.y    = sm_coordinates[atomIndexLocal*DIM+YY];
+            atomX.z    = sm_coordinates[atomIndexLocal*DIM+ZZ];
+        }
+        else
+        {
+            atomX.x    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + XX];
+            atomX.y    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + YY];
+            atomX.z    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + ZZ];
+        }
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>
+            (kernelParams, atomIndexOffset, atomX, atomCharge,
+            sm_theta, &dtheta, sm_gridlineIndices);
         __syncwarp();
     }
     else
@@ -468,11 +274,20 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
     /* Spreading */
     if (spreadCharges)
     {
-        spread_charges<order, wrapX, wrapY>(kernelParams, atomIndexOffset, sm_coefficients, sm_gridlineIndices, sm_theta);
+        spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, atomIndexOffset, &atomCharge,
+                                                             sm_gridlineIndices, sm_theta);
     }
 }
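The charge and coordinate staging above now branches on c_useAtomDataPrefetch. The standalone CUDA sketch below isolates that pattern; the kernel name, the atomsPerBlock template parameter, and the trivial output are illustrative, not the GROMACS helpers.

// Illustrative sketch of the prefetch-vs-direct-load pattern, with 'prefetch'
// playing the role of c_useAtomDataPrefetch. Not the actual GROMACS kernel.
#include <cuda_runtime.h>

template<bool prefetch, int atomsPerBlock>
__global__ void loadChargesSketch(const float* __restrict__ gm_charges, float* __restrict__ gm_out, int nAtoms)
{
    const int  atomIndexLocal  = threadIdx.x;
    const int  atomIndexGlobal = blockIdx.x * atomsPerBlock + atomIndexLocal;
    const bool haveAtom        = (atomIndexGlobal < nAtoms);
    float      atomCharge      = 0.0f;

    if (prefetch)
    {
        // The block cooperatively stages its charges into shared memory once,
        // then every thread reads its own atom from the shared copy.
        __shared__ float sm_charges[atomsPerBlock];
        if (haveAtom)
        {
            sm_charges[atomIndexLocal] = gm_charges[atomIndexGlobal];
        }
        __syncthreads();
        atomCharge = haveAtom ? sm_charges[atomIndexLocal] : 0.0f;
    }
    else
    {
        // Each thread loads its atom straight from global memory, no shared staging.
        atomCharge = haveAtom ? gm_charges[atomIndexGlobal] : 0.0f;
    }

    if (haveAtom)
    {
        gm_out[atomIndexGlobal] = atomCharge;  // placeholder for the real spline/spread work
    }
}

A launch such as loadChargesSketch<true, 128><<<(nAtoms + 127) / 128, 128>>>(d_charges, d_out, nAtoms) would exercise the shared-memory path; the kernels above make the same choice by branching on c_useAtomDataPrefetch.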
 
 //! Kernel instantiations
-template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, true>(const PmeGpuCudaKernelParams);
+
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, false>(const PmeGpuCudaKernelParams);
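
Finally, the indexing above (ithz = threadIdx.x, ithy either taken from or looped over threadIdx.y, atoms indexed along threadIdx.z) implies the following block geometry for the two modes. The sketch below is host-side arithmetic only; the 256-thread cap and the helper name are assumptions for illustration, not values taken from pme_gpu_constants.h.

// Sketch of the block dimensions implied by the spread kernel's thread indexing
// (order = 4 assumed). The thread cap and helper name are illustrative.
#include <cstdio>

constexpr int order = 4;

struct BlockDims { int x, y, z; };

BlockDims spreadBlockDims(bool useOrderThreads, int maxThreadsPerBlock)
{
    const int threadsPerAtom = useOrderThreads ? order : order * order;  // 4 or 16
    const int atomsPerBlock  = maxThreadsPerBlock / threadsPerAtom;
    // threadIdx.x spans the order z spline values, threadIdx.y spans the order y
    // values (or collapses to 1 when the kernel loops over y), threadIdx.z spans atoms.
    return { order, useOrderThreads ? 1 : order, atomsPerBlock };
}

int main()
{
    const BlockDims d16 = spreadBlockDims(false, 256);  // order*order threads per atom
    const BlockDims d4  = spreadBlockDims(true, 256);   // order threads per atom
    std::printf("16 threads/atom: block = (%d, %d, %d)\n", d16.x, d16.y, d16.z);  // (4, 4, 16)
    std::printf(" 4 threads/atom: block = (%d, %d, %d)\n", d4.x, d4.y, d4.z);     // (4, 1, 64)
    return 0;
}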