Update PME CUDA spread/gather
author    Jonathan Vincent <jvincent@nvidia.com>
Wed, 2 Jan 2019 16:59:18 +0000 (08:59 -0800)
committer Szilárd Páll <pall.szilard@gmail.com>
Wed, 30 Oct 2019 12:21:23 +0000 (13:21 +0100)
Adds additional templated kernels to the CUDA spread and
gather kernels, allowing the use of 4 threads per atom instead of
16 and allowing the spline data to be recalculated in the gather
instead of saved to global memory in the spread and reloaded.

These combinations mean we have 4 different kernels that can be called
depending on which is most appropriate for the problem size and
hardware (to be decided heuristically). By default the existing method is
used (16 threads per atom, saving and reloading of spline data).

Also adds an option to disable the prefetching of charges and
coordinates into shared memory, so that each thread instead loads
the data for the single atom it processes directly from global memory.

Removed the (currently disabled) PME_GPU_PARALLEL_SPLINE=1 code
path.

Refs #2792 #3185 #3186 #3187 #3188

Change-Id: Ia48d8eb63e38d0d23eefd755dcc228ff9b66d3e6

src/gromacs/ewald/pme_calculate_splines.cuh [new file with mode: 0644]
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_gpu_constants.h
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_program_impl.cu
src/gromacs/ewald/pme_gpu_program_impl.h
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
src/gromacs/ewald/pme_spread.cu

diff --git a/src/gromacs/ewald/pme_calculate_splines.cuh b/src/gromacs/ewald/pme_calculate_splines.cuh
new file mode 100644 (file)
index 0000000..7220e1e
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Implements helper routines for PME gather and spline routines.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+
+#include "pme.cuh"
+#include "pme_gpu_utils.h"
+#include "pme_grid.h"
+
+//! Controls if the atom data (coordinates and charges) is prefetched into shared memory or loaded per thread from global memory
+static const bool c_useAtomDataPrefetch = true;
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam[in] T                 Data type (float/int/...)
+ * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
+ * \param[in]  kernelParams      Input PME CUDA data in constant memory.
+ * \param[out] sm_destination    Shared memory array for output.
+ * \param[in]  gm_source         Global memory array for input.
+ */
+template<typename T,
+         const int atomsPerBlock,
+         const int dataCountPerAtom>
+__device__  __forceinline__
+void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams       kernelParams,
+                             T * __restrict__                   sm_destination,
+                             const T * __restrict__             gm_source)
+{
+    static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+    const int blockIndex       = blockIdx.y * gridDim.x + blockIdx.x;
+    const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
+    const int localIndex       = threadLocalIndex;
+    const int globalIndexBase  = blockIndex * atomsPerBlock * dataCountPerAtom;
+    const int globalIndex      = globalIndexBase + localIndex;
+    const int globalCheck      = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
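+    /* Each thread stages at most one value; the block cooperatively copies the atomsPerBlock * dataCountPerAtom values of its atoms. */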
+    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+    {
+        assert(isfinite(float(gm_source[globalIndex])));
+        sm_destination[localIndex] = gm_source[globalIndex];
+    }
+}
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam[in] order                PME interpolation order.
+ * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
+ *                                  in the sizes of the shared memory arrays.
+ * \tparam[in] atomsPerWarp         Number of atoms processed by a warp
+ * \tparam[in] writeSmDtheta        Bool controlling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridline indices should be written to global memory. Enables calculation of dtheta if set.
+ * \param[in]  kernelParams         Input PME CUDA data in constant memory.
+ * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
+ * \param[in]  atomX                Atom coordinate of atom processed by thread.
+ * \param[in]  atomCharge           Atom charge/coefficient of atom processed by thread.
+ * \param[out] sm_theta             Atom spline values in the shared memory.
+ * \param[out] sm_dtheta            Derivative of atom spline values in shared memory.
+ * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
+ */
+
+template <const int order,
+          const int atomsPerBlock,
+          const int atomsPerWarp,
+          const bool writeSmDtheta,
+          const bool writeGlobal>
+__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams            kernelParams,
+                                                  const int                               atomIndexOffset,
+                                                  const float3                            atomX,
+                                                  const float                             atomCharge,
+                                                  float * __restrict__                    sm_theta,
+                                                  float * __restrict__                    sm_dtheta,
+                                                  int * __restrict__                      sm_gridlineIndices)
+{
+    /* Global memory pointers for output */
+    float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
+    float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
+    int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
+
+    /* Fractional coordinates */
+    __shared__ float sm_fractCoords[atomsPerBlock * DIM];
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+        + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / warp_size;
+    /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+    const int atomWarpIndex = threadIdx.z % atomsPerWarp;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+
+    /* Atom index w.r.t. global memory */
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+    /* Spline contribution index in one dimension */
+    const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
+    const int orderIndex      = threadLocalIdXY / DIM;
+    /* Dimension index */
+    const int dimIndex = threadLocalIdXY % DIM;
+
+    /* Multi-purpose index of rvec/ivec atom data */
+    const int        sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
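+    /* Thread-local storage for the B-spline values of one dimension of one atom */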
+    float            splineData[order];
+
+    const int        localCheck  = (dimIndex < DIM) && (orderIndex <  1);
+    const int        globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+
+    /* we have 4 threads per atom, but can only use 3 here for the dimensions */
+    if (localCheck && globalCheck)
+    {
+        /* Indices interpolation */
+
+        if (orderIndex == 0)
+        {
+            int           tableIndex, tInt;
+            float         n, t;
+            assert(atomIndexLocal < DIM*atomsPerBlock);
+            /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
+             * puts them into local memory(!) instead of accessing the constant memory directly.
+             * That's the reason for the switch, to unroll explicitly.
+             * The commented parts correspond to the 0 components of the recipbox.
+             */
+            switch (dimIndex)
+            {
+                case XX:
+                    tableIndex  = kernelParams.grid.tablesOffsets[XX];
+                    n           = kernelParams.grid.realGridSizeFP[XX];
+                    t           = atomX.x * kernelParams.current.recipBox[dimIndex][XX] + atomX.y * kernelParams.current.recipBox[dimIndex][YY] + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+                    break;
+
+                case YY:
+                    tableIndex  = kernelParams.grid.tablesOffsets[YY];
+                    n           = kernelParams.grid.realGridSizeFP[YY];
+                    t           = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + */ atomX.y * kernelParams.current.recipBox[dimIndex][YY] + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+                    break;
+
+                case ZZ:
+                    tableIndex  = kernelParams.grid.tablesOffsets[ZZ];
+                    n           = kernelParams.grid.realGridSizeFP[ZZ];
+                    t           = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + atomX.y * kernelParams.current.recipBox[dimIndex][YY] + */ atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+                    break;
+            }
+            const float shift = c_pmeMaxUnitcellShift;
+            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+            t    = (t + shift) * n;
+            tInt = (int)t;
+            assert(sharedMemoryIndex < atomsPerBlock*DIM);
+            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            tableIndex                       += tInt;
+            assert(tInt >= 0);
+            assert(tInt < c_pmeNeighborUnitcellCount * n);
+
+            // TODO have shared table for both parameters to share the fetch, as index is always same?
+            // TODO compare texture/LDG performance
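+            /* Look up the fractional coordinate shift and the corresponding gridline index for this table entry */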
+            sm_fractCoords[sharedMemoryIndex] +=
+                fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
+                                          kernelParams.fractShiftsTableTexture,
+                                          tableIndex);
+            sm_gridlineIndices[sharedMemoryIndex] =
+                fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
+                                          kernelParams.gridlineIndicesTableTexture,
+                                          tableIndex);
+            if (writeGlobal)
+            {
+                gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
+            }
+        }
+
+        /* B-spline calculation */
+
+        const int chargeCheck = pme_gpu_check_atom_charge(atomCharge);
+        if (chargeCheck)
+        {
+            float       div;
+            int         o = orderIndex; // used below as the loop index over the spline data
+
+            const float dr = sm_fractCoords[sharedMemoryIndex];
+            assert(isfinite(dr));
+
+            /* dr is relative offset from lower cell limit */
+            splineData[order-1] = 0.0f;
+            splineData[1]       = dr;
+            splineData[0]       = 1.0f - dr;
+
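+            /* Standard B-spline order-raising recursion: starting from the order-2 values above,
+             * each iteration raises the spline order by one, up to order-1. The final step to the
+             * full order follows the dtheta computation below, which needs the order-1 values. */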
+#pragma unroll
+            for (int k = 3; k < order; k++)
+            {
+                div                     = 1.0f / (k - 1.0f);
+                splineData[k - 1]       = div * dr * splineData[k - 2];
+#pragma unroll
+                for (int l = 1; l < (k - 1); l++)
+                {
+                    splineData[k - l - 1] = div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1] );
+                }
+                splineData[0] = div * (1.0f - dr) * splineData[0];
+            }
+
+            const int thetaIndexBase        = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+            /* only calculate dtheta if we are saving it to shared or global memory */
+            if (writeSmDtheta || writeGlobal)
+            {
+                /* Differentiation and storing the spline derivatives (dtheta) */
+#pragma unroll
+                for (o = 0; o < order; o++)
+                {
+                    const int   thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+
+                    const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o];
+                    assert(isfinite(dtheta));
+                    assert(thetaIndex < order*DIM*atomsPerBlock);
+                    if (writeSmDtheta)
+                    {
+                        sm_dtheta[thetaIndex] = dtheta;
+                    }
+                    if (writeGlobal)
+                    {
+                        const int   thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+                        gm_dtheta[thetaGlobalIndex] = dtheta;
+                    }
+                }
+            }
+
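+            /* Final order-raising step: brings the B-spline values from order-1 to the full interpolation order */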
+            div                   = 1.0f / (order - 1.0f);
+            splineData[order - 1] = div * dr * splineData[order - 2];
+#pragma unroll
+            for (int k = 1; k < (order - 1); k++)
+            {
+                splineData[order - k - 1] = div * ((dr + k) * splineData[order - k - 2] + (order - k - dr) * splineData[order - k - 1]);
+            }
+            splineData[0] = div * (1.0f - dr) * splineData[0];
+
+            /* Storing the spline values (theta) */
+#pragma unroll
+            for (o = 0; o < order; o++)
+            {
+                const int thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+                assert(thetaIndex < order*DIM*atomsPerBlock);
+                sm_theta[thetaIndex]       = splineData[o];
+                assert(isfinite(sm_theta[thetaIndex]));
+                if (writeGlobal)
+                {
+                    const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+                    gm_theta[thetaGlobalIndex] = splineData[o];
+                }
+            }
+        }
+    }
+}
index be34c3c6cda751526bb88c070d2276f4cbb648bd..ead734c687ed57f99d855c724442cf1d0f4fd762 100644 (file)
 
 #include <cassert>
 
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+
 #include "pme.cuh"
+#include "pme_calculate_splines.cuh"
 #include "pme_gpu_utils.h"
+#include "pme_grid.h"
 
 /*! \brief
  * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
@@ -216,13 +220,17 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
  *                                  False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
  * \tparam[in] wrapX                Tells if the grid is wrapped in the X dimension.
  * \tparam[in] wrapY                Tells if the grid is wrapped in the Y dimension.
+ * \tparam[in] readGlobal           Tells if we should read spline values from global memory
+ * \tparam[in] useOrderThreads      Tells if we should use order threads per atom (order*order used if false)
  * \param[in]  kernelParams         All the PME GPU data.
  */
 template <
     const int order,
     const bool overwriteForces,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool readGlobal,
+    const bool useOrderThreads
     >
 __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP)
 __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
@@ -230,19 +238,27 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
     /* Global memory pointers */
     const float * __restrict__  gm_coefficients     = kernelParams.atoms.d_coefficients;
     const float * __restrict__  gm_grid             = kernelParams.grid.d_realGrid;
+    float * __restrict__        gm_forces           = kernelParams.atoms.d_forces;
+
+    /* Global memory pointers for readGlobal */
     const float * __restrict__  gm_theta            = kernelParams.atoms.d_theta;
     const float * __restrict__  gm_dtheta           = kernelParams.atoms.d_dtheta;
     const int * __restrict__    gm_gridlineIndices  = kernelParams.atoms.d_gridlineIndices;
-    float * __restrict__        gm_forces           = kernelParams.atoms.d_forces;
+
+    float3 atomX;
+    float  atomCharge;
 
     /* Some sizes */
-    const int    atomsPerBlock  = (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
-    const int    atomDataSize   = c_pmeSpreadGatherThreadsPerAtom; /* Number of data components and threads for a single atom */
-    const int    atomsPerWarp   = c_pmeSpreadGatherAtomsPerWarp;
+    const int    atomsPerBlock   = useOrderThreads ? (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom) :
+        (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
+    const int    blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
+
+    /* Number of data components and threads for a single atom */
+    const int    atomDataSize   = useOrderThreads ? c_pmeSpreadGatherThreadsPerAtom4ThPerAtom : c_pmeSpreadGatherThreadsPerAtom;
+    const int    atomsPerWarp   = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom : c_pmeSpreadGatherAtomsPerWarp;
 
     const int    blockSize      = atomsPerBlock * atomDataSize;
     assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
-    const int    blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
 
     /* These are the atom indices - for the shared and global memory */
     const int         atomIndexLocal    = threadIdx.z;
@@ -256,46 +272,89 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
     {
         return;
     }
-
+    /* Shared-memory sizes for the spline parameters and gridline indices */
     const int         splineParamsSize             = atomsPerBlock * DIM * order;
     const int         gridlineIndicesSize          = atomsPerBlock * DIM;
     __shared__ int    sm_gridlineIndices[gridlineIndicesSize];
-    __shared__ float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs  as .x/.y */
+    __shared__ float  sm_theta[splineParamsSize];
+    __shared__ float  sm_dtheta[splineParamsSize];
 
-    /* Spline Y/Z coordinates */
-    const int ithy = threadIdx.y;
+    /* Spline Z coordinates */
     const int ithz = threadIdx.x;
 
     /* These are the spline contribution indices in shared memory */
-    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;                  /* Relative to the current particle , 0..15 for order 4 */
-    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
-
-    int       threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
-        + (threadIdx.y * blockDim.x)
-        + threadIdx.x;
-
-    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
-    const int localGridlineIndicesIndex  = threadLocalId;
-    const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
-    const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
-    if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+    const int       splineIndex =  threadIdx.y * blockDim.x + threadIdx.x;
+    const int       lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
+
+    const int       threadLocalId    = (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x*threadIdx.y   + threadIdx.x;
+    const int       threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
+
+    if (readGlobal)
     {
-        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
-        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+        /* Read splines */
+        const int localGridlineIndicesIndex  = threadLocalId;
+        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
+        const int globalCheckIndices         = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
+        if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+        {
+            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
+            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+        }
+        /* The loop is needed for order threads per atom to make sure we load all data values, as each
+           thread has to load multiple values; with order*order threads per atom each thread only needs
+           to load a single value. */
+
+        const int iMin = 0;
+        const int iMax = useOrderThreads ? 3 : 1;
+
+        for (int i = iMin; i < iMax; i++)
+        {
+            int localSplineParamsIndex  = threadLocalId + i*threadLocalIdMax;   /* i will always be zero for order*order threads per atom */
+            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
+            int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
+            if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+            {
+                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
+                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
+                assert(isfinite(sm_theta[localSplineParamsIndex]));
+                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
+            }
+        }
+        __syncthreads();
     }
-    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
-    const int localSplineParamsIndex  = threadLocalId;
-    const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
-    const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
-    if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+    else
     {
-        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
-        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
-        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
-        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
-    }
-    __syncthreads();
+        /* Recalculate splines */
+        if (c_useAtomDataPrefetch)
+        {
+            //charges
+            __shared__ float sm_coefficients[atomsPerBlock];
+            // Coordinates
+            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+            /* Staging coefficients/charges */
+            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+
+            /* Staging coordinates */
+            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+            __syncthreads();
+            atomX.x    = sm_coordinates[atomIndexLocal*DIM+XX];
+            atomX.y    = sm_coordinates[atomIndexLocal*DIM+YY];
+            atomX.z    = sm_coordinates[atomIndexLocal*DIM+ZZ];
+            atomCharge = sm_coefficients[atomIndexLocal];
 
+        }
+        else
+        {
+            atomCharge =  gm_coefficients[atomIndexGlobal];
+            atomX.x    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + XX];
+            atomX.y    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + YY];
+            atomX.z    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + ZZ];
+        }
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(kernelParams, atomIndexOffset,
+                                                                           atomX, atomCharge,
+                                                                           sm_theta, sm_dtheta,
+                                                                           sm_gridlineIndices);
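+        /* The spline data for an atom is written and read by threads of the same warp, so a warp-level synchronization is sufficient here. */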
+        __syncwarp();
+    }
     float           fx = 0.0f;
     float           fy = 0.0f;
     float           fz = 0.0f;
@@ -315,43 +374,52 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
         const int    warpIndex     = atomIndexLocal / atomsPerWarp;
 
         const int    splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
-        const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
-        const float2 tdy             = sm_splineParams[splineIndexY];
         const int    splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
-        const float2 tdz             = sm_splineParams[splineIndexZ];
+        const float2 tdz             =  make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ] );
 
+        int          iz             = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
         const int    ixBase         = sm_gridlineIndices[atomIndexLocal * DIM + XX];
-        int          iy             = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
-        if (wrapY & (iy >= ny))
-        {
-            iy -= ny;
-        }
-        int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+
         if (iz >= nz)
         {
             iz -= nz;
         }
-        const int constOffset    = iy * pnz + iz;
+        int       constOffset, iy;
 
-#pragma unroll
-        for (int ithx = 0; (ithx < order); ithx++)
+        const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
+        const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
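+        /* With order threads per atom each thread loops over all order values of ithy;
+           with order*order threads per atom each thread handles only the single ithy given by threadIdx.y. */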
+        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
         {
-            int ix = ixBase + ithx;
-            if (wrapX & (ix >= nx))
+            const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+            const float2 tdy             =  make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY] );
+
+            iy             = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+            if (wrapY & (iy >= ny))
             {
-                ix -= nx;
+                iy -= ny;
+            }
+            constOffset    = iy * pnz + iz;
+
+#pragma unroll
+            for (int ithx = 0; (ithx < order); ithx++)
+            {
+                int ix = ixBase + ithx;
+                if (wrapX & (ix >= nx))
+                {
+                    ix -= nx;
+                }
+                const int     gridIndexGlobal  = ix * pny * pnz + constOffset;
+                assert(gridIndexGlobal >= 0);
+                const float   gridValue    = gm_grid[gridIndexGlobal];
+                assert(isfinite(gridValue));
+                const int     splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+                const float2  tdx          = make_float2( sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
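+                /* Force contributions: the derivative (dtheta) factor is used for the dimension being differentiated, theta for the other two */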
+                const float   fxy1         = tdz.x * gridValue;
+                const float   fz1          = tdz.y * gridValue;
+                fx += tdx.y * tdy.x * fxy1;
+                fy += tdx.x * tdy.y * fxy1;
+                fz += tdx.x * tdy.x * fz1;
             }
-            const int     gridIndexGlobal  = ix * pny * pnz + constOffset;
-            assert(gridIndexGlobal >= 0);
-            const float   gridValue    = gm_grid[gridIndexGlobal];
-            assert(isfinite(gridValue));
-            const int     splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
-            const float2  tdx          = sm_splineParams[splineIndexX];
-            const float   fxy1         = tdz.x * gridValue;
-            const float   fz1          = tdz.y * gridValue;
-            fx += tdx.y * tdy.x * fxy1;
-            fy += tdx.x * tdy.y * fxy1;
-            fz += tdx.x * tdy.x * fz1;
         }
     }
 
@@ -410,5 +478,11 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
 }
 
 //! Kernel instantiations
-template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, false, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, false, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, false, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, false, false>(const PmeGpuCudaKernelParams);
index 695a9e220caa252630287dba0349d9ebdb28a61a..cbbab88c7c52d91129936eca6cf00677d5d2aac4 100644 (file)
@@ -125,14 +125,20 @@ constexpr int c_pmeGpuOrder = 4;
  * The assumption is currently that any thread processes only a single atom's contributions.
  * TODO: this assumption leads to minimum execution width of 16. See Redmine #2516
  */
-constexpr int c_pmeSpreadGatherThreadsPerAtom = (c_pmeGpuOrder * c_pmeGpuOrder);
+constexpr int c_pmeSpreadGatherThreadsPerAtom  = c_pmeGpuOrder*c_pmeGpuOrder;
+
+//! Number of threads per atom when order (i.e. 4) threads per atom are used instead of order^2
+constexpr int c_pmeSpreadGatherThreadsPerAtom4ThPerAtom = c_pmeGpuOrder;
 
 /*! \brief Minimum execution width of the PME spread and gather kernels.
  *
  * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
  * should execute without synchronization needed. See c_pmeSpreadGatherThreadsPerAtom
  */
-constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
+constexpr int c_pmeSpreadGatherMinWarpSize  = c_pmeSpreadGatherThreadsPerAtom;
+
+//! Minimum warp size if order threads per atom are used instead of order^2
+constexpr int c_pmeSpreadGatherMinWarpSize4ThPerAtom = c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
 
 /*! \brief
  * Atom data alignment (in terms of number of atoms).
@@ -142,7 +148,7 @@ constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
  * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
  * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
  */
-constexpr int c_pmeAtomDataAlignment = 32;
+constexpr int c_pmeAtomDataAlignment = 64;
 
 /*
  * The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
@@ -176,7 +182,10 @@ constexpr int c_gatherMaxWarpsPerBlock = 4;
  * This macro depends on the templated order parameter (2 atoms per warp for order 4 and warp_size of 32).
  * It is mostly used for spline data layout tweaked for coalesced access.
  */
-constexpr int c_pmeSpreadGatherAtomsPerWarp = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
+constexpr int c_pmeSpreadGatherAtomsPerWarp  = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
+
+//! Number of atoms per warp when order threads per atom are used
+constexpr int c_pmeSpreadGatherAtomsPerWarp4ThPerAtom = (warp_size / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom);
 
 //! Spreading max block size in threads
 constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
index a3a25c408b027815ff9123c7d5d6b7edc38231de..caa7911409b6bcf5c3bc72fe0368d9284da3d560 100644 (file)
 #include "pme_internal.h"
 #include "pme_solve.h"
 
+
+/*! \brief
+ *  CUDA only.
+ *  Controls if we should use order (i.e. 4) threads per atom for the GPU
+ *  or order*order (i.e. 16) threads per atom.
+ */
+constexpr bool c_useOrderThreadsPerAtom = false;
+/*! \brief
+ * CUDA only.
+ * Controls if we should recalculate the splines in the gather or
+ * save the values in the spread and reload in the gather.
+ */
+constexpr bool c_recalculateSplines = false;
+
 /*! \internal \brief
  * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
  *
@@ -117,7 +131,14 @@ int pme_gpu_get_atom_data_alignment(const PmeGpu * /*unused*/)
 
 int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
 {
-    return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
+    if (c_useOrderThreadsPerAtom)
+    {
+        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
+    }
+    else
+    {
+        return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
+    }
 }
 
 void pme_gpu_synchronize(const PmeGpu *pmeGpu)
@@ -1015,6 +1036,114 @@ std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount
     return std::pair<int, int>(colCount, minRowCount);
 }
 
+/*! \brief
+ * Returns a pointer to the appropriate spline and spread kernel based on the input bool values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSplineAndSpreadKernelPtr(const PmeGpu *pmeGpu,
+                                           bool useOrderThreadsPerAtom,
+                                           bool writeSplinesToGlobal)
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+    if (writeSplinesToGlobal)
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
+        }
+    }
+    else
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+        }
+    }
+
+    return kernelPtr;
+}
+
+/*! \brief
+ * Returns a pointer to the appropriate spline kernel based on the input bool values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSplineKernelPtr(const PmeGpu *pmeGpu,
+                                  bool useOrderThreadsPerAtom,
+                                  bool gmx_unused writeSplinesToGlobal)
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+    GMX_ASSERT(writeSplinesToGlobal, "Spline data should always be written to global memory when just calculating splines");
+
+    if (useOrderThreadsPerAtom)
+    {
+        kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
+    }
+    else
+    {
+        kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
+    }
+    return kernelPtr;
+}
+
+/*! \brief
+ * Returns a pointer to the appropriate spread kernel based on the input bool values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSpreadKernelPtr(const PmeGpu *pmeGpu,
+                                  bool useOrderThreadsPerAtom,
+                                  bool writeSplinesToGlobal)
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+    if (writeSplinesToGlobal)
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
+        }
+    }
+    else
+    {
+        /* If we are not saving the spline data, we need to recalculate it
+           using the spline-and-spread kernel */
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+        }
+    }
+    return kernelPtr;
+}
+
 void pme_gpu_spread(const PmeGpu         *pmeGpu,
                     GpuEventSynchronizer *xReadyOnDevice,
                     int gmx_unused        gridIndex,
@@ -1030,13 +1159,20 @@ void pme_gpu_spread(const PmeGpu         *pmeGpu,
 
     const int    order         = pmeGpu->common->pme_order;
     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
-    const int    atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+    const bool   writeGlobal   = pmeGpu->settings.copyAllOutputs;
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(!c_useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
+    GMX_ASSERT(!c_recalculateSplines, "Recalculating splines not supported in OpenCL");
+#endif
+    const int    atomsPerBlock = c_useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+        blockSize / c_pmeSpreadGatherThreadsPerAtom;
+
     // TODO: pick smaller block size in runtime if needed
     // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
     // If doing so, change atomsPerBlock in the kernels as well.
     // TODO: test varying block sizes on modern arch-s as well
     // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
-    //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
+    //(for spline data mostly)
     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
 
     // Ensure that coordinates are ready on the device before launching spread;
@@ -1054,7 +1190,8 @@ void pme_gpu_spread(const PmeGpu         *pmeGpu,
     auto               dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
     KernelLaunchConfig config;
-    config.blockSize[0] = config.blockSize[1] = order;
+    config.blockSize[0] = order;
+    config.blockSize[1] = c_useOrderThreadsPerAtom ? 1 : order;
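+    // The block is (order) x (1 or order) x (atomsPerBlock) threads, i.e. order or order*order threads per atom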
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0]  = dimGrid.first;
     config.gridSize[1]  = dimGrid.second;
@@ -1067,20 +1204,27 @@ void pme_gpu_spread(const PmeGpu         *pmeGpu,
         if (spreadCharges)
         {
             timingId  = gtPME_SPLINEANDSPREAD;
-            kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+            kernelPtr = selectSplineAndSpreadKernelPtr( pmeGpu,
+                                                        c_useOrderThreadsPerAtom,
+                                                        writeGlobal || (!c_recalculateSplines));
         }
         else
         {
             timingId  = gtPME_SPLINE;
-            kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
+            kernelPtr = selectSplineKernelPtr( pmeGpu,
+                                               c_useOrderThreadsPerAtom,
+                                               writeGlobal || (!c_recalculateSplines));
         }
     }
     else
     {
         timingId  = gtPME_SPREAD;
-        kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
+        kernelPtr = selectSpreadKernelPtr( pmeGpu,
+                                           c_useOrderThreadsPerAtom,
+                                           writeGlobal || (!c_recalculateSplines));
     }
 
+
     pme_gpu_start_timing(pmeGpu, timingId);
     auto      *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
 #if c_canEmbedBuffers
@@ -1219,6 +1363,58 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
     }
 }
 
+/*! \brief
+ * Returns a pointer to the appropriate gather kernel based on the input values.
+ *
+ * \param[in]  pmeGpu                   The PME GPU structure.
+ * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
+ * \param[in]  readSplinesFromGlobal    bool controlling if we should read spline data from global memory
+ * \param[in]  forceTreatment           Controls if the forces from the gather should increment or replace the input forces.
+ *
+ * \return Pointer to CUDA kernel
+ */
+inline auto selectGatherKernelPtr(const PmeGpu *pmeGpu,
+                                  bool useOrderThreadsPerAtom,
+                                  bool readSplinesFromGlobal,
+                                  PmeForceOutputHandling forceTreatment)
+
+{
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+
+    if (readSplinesFromGlobal)
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4 :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplinesThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernelReadSplines :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplines;
+        }
+    }
+    else
+    {
+        if (useOrderThreadsPerAtom)
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4 :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelThPerAtom4;
+        }
+        else
+        {
+            kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+                pmeGpu->programHandle_->impl_->gatherKernel :
+                pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
+        }
+    }
+    return kernelPtr;
+}
+
+
 void pme_gpu_gather(PmeGpu                *pmeGpu,
                     PmeForceOutputHandling forceTreatment,
                     const float           *h_grid)
@@ -1239,19 +1435,27 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
         pme_gpu_copy_input_gather_atom_data(pmeGpu);
     }
 
+    /* All outputs are copied (and hence splines read back from global memory) e.g. when running unit tests */
+    const bool   readGlobal    = pmeGpu->settings.copyAllOutputs;
     const size_t blockSize     = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
-    const int    atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(!c_useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
+    GMX_ASSERT(!c_recalculateSplines, "Recalculating splines not supported in OpenCL");
+#endif
+    const int    atomsPerBlock = c_useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+        blockSize / c_pmeSpreadGatherThreadsPerAtom;
+
     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
 
     const int          blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto               dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
-
-    const int order = pmeGpu->common->pme_order;
+    const int          order = pmeGpu->common->pme_order;
     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
 
     KernelLaunchConfig config;
-    config.blockSize[0] = config.blockSize[1] = order;
+    config.blockSize[0] = order;
+    config.blockSize[1] = c_useOrderThreadsPerAtom ? 1 : order;
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0]  = dimGrid.first;
     config.gridSize[1]  = dimGrid.second;
@@ -1260,10 +1464,11 @@ void pme_gpu_gather(PmeGpu                *pmeGpu,
     // TODO test different cache configs
 
     int  timingId = gtPME_GATHER;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr( pmeGpu,
+                                                                          c_useOrderThreadsPerAtom,
+                                                                          readGlobal || (!c_recalculateSplines),
+                                                                          forceTreatment);
     // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
-    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
-        pmeGpu->programHandle_->impl_->gatherKernel :
-        pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
 
     pme_gpu_start_timing(pmeGpu, timingId);
     auto       *timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
index 087d85f50e51c523b9f280f36775423f879a6ace..347d69f9102adbd4d9b55db1caa1763c7cc1f80b 100644 (file)
@@ -61,18 +61,30 @@ template <
     const bool computeSplines,
     const bool spreadCharges,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool writeGlobal,
+    const bool orderThreads
     >
 void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
 
 // Add extern declarations to inform that there will be a definition
 // provided in another translation unit.
 extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
 extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
 extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
 
 template<
     GridOrdering gridOrdering,
@@ -95,16 +107,30 @@ template <
     const int order,
     const bool overwriteForces,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool readGlobal,
+    const bool orderThreads
     >
 void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
 
 // Add extern declarations to inform that there will be a definition
 // provided in another translation unit.
 extern template
-void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
 extern template
-void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
 
 PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
 {
@@ -114,15 +140,31 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
     solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock;
     gatherWorkGroupSize   = c_gatherMaxThreadsPerBlock;
 
-    splineAndSpreadKernel       = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>;
-    splineKernel                = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>;
-    spreadKernel                = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>;
-    gatherKernel                = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>;
-    gatherReduceWithInputKernel = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>;
-    solveXYZKernel              = pme_solve_kernel<GridOrdering::XYZ, false>;
-    solveXYZEnergyKernel        = pme_solve_kernel<GridOrdering::XYZ, true>;
-    solveYZXKernel              = pme_solve_kernel<GridOrdering::YZX, false>;
-    solveYZXEnergyKernel        = pme_solve_kernel<GridOrdering::YZX, true>;
+/*!
+ * Not all combinations of the splineAndSpread, spline and spread kernels are required.
+ * If only the spline kernel is run (without the spread), it never makes sense not to write the spline data to global memory.
+ * Similarly, running the spread kernel on its own (without the spline) implies that the spline data is read from global memory.
+ */
+    splineAndSpreadKernel                             = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, false>;
+    splineAndSpreadKernelThPerAtom4                   = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, true>;
+    splineAndSpreadKernelWriteSplines                 = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, false>;
+    splineAndSpreadKernelWriteSplinesThPerAtom4       = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, true>;
+    splineKernel                                      = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, false>;
+    splineKernelThPerAtom4                            = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, true>;
+    spreadKernel                                      = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, false>;
+    spreadKernelThPerAtom4                            = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, true>;
+    gatherKernel                                      = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, false>;
+    gatherKernelThPerAtom4                            = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, true>;
+    gatherKernelReadSplines                           = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, false>;
+    gatherKernelReadSplinesThPerAtom4                 = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, true>;
+    gatherReduceWithInputKernel                       = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, false>;
+    gatherReduceWithInputKernelThPerAtom4             = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, true>;
+    gatherReduceWithInputKernelReadSplines            = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, false>;
+    gatherReduceWithInputKernelReadSplinesThPerAtom4  = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, true>;
+    solveXYZKernel                                    = pme_solve_kernel<GridOrdering::XYZ, false>;
+    solveXYZEnergyKernel                              = pme_solve_kernel<GridOrdering::XYZ, true>;
+    solveYZXKernel                                    = pme_solve_kernel<GridOrdering::YZX, false>;
+    solveYZXEnergyKernel                              = pme_solve_kernel<GridOrdering::YZX, true>;
 }
 
 PmeGpuProgramImpl::~PmeGpuProgramImpl()
index bfe4815af78ff658dbf341a411bd2e197ab99520..e45a3110f5d31d741da8176e2f335ee2126542d4 100644 (file)
@@ -99,24 +99,41 @@ struct PmeGpuProgramImpl
     //@{
     /**
      * Spread/spline kernels are compiled only for order of 4.
+     * There are multiple versions of each kernel, parametrized according to:
+     *   - the number of threads per atom: either order (4) or order*order (16) threads per atom are supported;
+     *   - whether the spline data is written by the spline/spread kernel and loaded in the gather,
+     *     or recalculated in the gather.
      * Spreading kernels also have hardcoded X/Y indices wrapping parameters,
      * as a placeholder for implementing 1/2D decomposition.
      */
     size_t          spreadWorkGroupSize;
 
     PmeKernelHandle splineKernel;
+    PmeKernelHandle splineKernelThPerAtom4;
     PmeKernelHandle spreadKernel;
+    PmeKernelHandle spreadKernelThPerAtom4;
     PmeKernelHandle splineAndSpreadKernel;
+    PmeKernelHandle splineAndSpreadKernelThPerAtom4;
+    PmeKernelHandle splineAndSpreadKernelWriteSplines;
+    PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4;
     //@}
 
     //@{
     /** Same for gather: hardcoded X/Y unwrap parameters, order of 4, plus
      * it can either reduce with previous forces in the host buffer, or ignore them.
+     * As with the spread kernels, we can use either order (4) or order*order (16) threads per atom,
+     * and either recalculate the splines or read the ones written by the spread.
      */
     size_t          gatherWorkGroupSize;
 
     PmeKernelHandle gatherReduceWithInputKernel;
+    PmeKernelHandle gatherReduceWithInputKernelThPerAtom4;
     PmeKernelHandle gatherKernel;
+    PmeKernelHandle gatherKernelThPerAtom4;
+    PmeKernelHandle gatherReduceWithInputKernelReadSplines;
+    PmeKernelHandle gatherReduceWithInputKernelReadSplinesThPerAtom4;
+    PmeKernelHandle gatherKernelReadSplines;
+    PmeKernelHandle gatherKernelReadSplinesThPerAtom4;
     //@}
 
     //@{
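The heuristic that picks between these handles is left open here (the commit message notes the choice is to be decided heuristically, with the existing 16-threads-per-atom, write-splines method used by default), but the intended dispatch can be sketched as follows. The handle type, the pared-down struct, and the helper name below are illustrative stand-ins only and do not appear in the patch.

// Illustrative sketch only: maps the two tuning decisions onto the four
// spline-and-spread handles declared in PmeGpuProgramImpl. The handle type,
// struct, and helper name are simplified stand-ins, not GROMACS API.
using PmeKernelHandle = void (*)();  // stand-in for the real kernel handle type

struct SpreadKernels  // subset of the handles declared above
{
    PmeKernelHandle splineAndSpreadKernel;                        // 16 threads/atom, splines recalculated in gather
    PmeKernelHandle splineAndSpreadKernelThPerAtom4;              // 4 threads/atom,  splines recalculated in gather
    PmeKernelHandle splineAndSpreadKernelWriteSplines;            // 16 threads/atom, splines written to global memory
    PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4;  // 4 threads/atom,  splines written to global memory
};

PmeKernelHandle selectSplineAndSpreadKernel(const SpreadKernels& k, bool useOrderThreads, bool writeSplines)
{
    if (writeSplines)
    {
        return useOrderThreads ? k.splineAndSpreadKernelWriteSplinesThPerAtom4
                               : k.splineAndSpreadKernelWriteSplines;
    }
    return useOrderThreads ? k.splineAndSpreadKernelThPerAtom4 : k.splineAndSpreadKernel;
}

In the real code the handles are the PmeKernelHandle members populated in the constructor above on the CUDA path, or in compileKernels() on the OpenCL path.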
index 492128ed3eeaa9e90d852893ff6cc0dbf50367db..423eaf2c8a0e31ab483bcfc5ced08c699c895837 100644 (file)
@@ -222,26 +222,29 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
         // TODO use a map with string key instead?
         if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernel"))
         {
-            splineKernel = kernel;
+            splineKernel             = kernel;
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
         {
-            splineAndSpreadKernel = kernel;
+            splineAndSpreadKernel             = kernel;
+            splineAndSpreadKernelWriteSplines = kernel;
             checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
         {
-            spreadKernel = kernel;
+            spreadKernel             = kernel;
             checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
         {
-            gatherKernel = kernel;
+            gatherKernel            = kernel;
+            gatherKernelReadSplines = kernel;
             checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
         {
-            gatherReduceWithInputKernel = kernel;
+            gatherReduceWithInputKernel            = kernel;
+            gatherReduceWithInputKernelReadSplines = kernel;
             checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
index 4037a71f7683a8db0d72143f5e42a85c8d6e07aa..3fbd4dca0fc57e49a3bbc2a24b421053c4569ec6 100644 (file)
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
 
 #include "pme.cuh"
+#include "pme_calculate_splines.cuh"
 #include "pme_gpu_utils.h"
 #include "pme_grid.h"
 
-/*
- * This define affects the spline calculation behaviour in the kernel.
- * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing (order) spline values and derivatives).
- * 1: (order) threads do redundant work on this same task, each one stores only a single theta and single dtheta into global arrays.
- * The only efficiency difference is less global store operations, countered by more redundant spline computation.
- *
- * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
- */
-#define PME_GPU_PARALLEL_SPLINE 0
-
-
-/*! \brief
- * General purpose function for loading atom-related data from global to shared memory.
- *
- * \tparam[in] T                 Data type (float/int/...)
- * \tparam[in] atomsPerBlock     Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
- * \tparam[in] dataCountPerAtom  Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
- * \param[in]  kernelParams      Input PME CUDA data in constant memory.
- * \param[out] sm_destination    Shared memory array for output.
- * \param[in]  gm_source         Global memory array for input.
- */
-template<typename T,
-         const int atomsPerBlock,
-         const int dataCountPerAtom>
-__device__  __forceinline__
-void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams       kernelParams,
-                             T * __restrict__                   sm_destination,
-                             const T * __restrict__             gm_source)
-{
-    static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
-    const int blockIndex       = blockIdx.y * gridDim.x + blockIdx.x;
-    const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
-    const int localIndex       = threadLocalIndex;
-    const int globalIndexBase  = blockIndex * atomsPerBlock * dataCountPerAtom;
-    const int globalIndex      = globalIndexBase + localIndex;
-    const int globalCheck      = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
-    if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
-    {
-        assert(isfinite(float(gm_source[globalIndex])));
-        sm_destination[localIndex] = gm_source[globalIndex];
-    }
-}
-
-/*! \brief
- * PME GPU spline parameter and gridline indices calculation.
- * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
- * First stage of the whole kernel.
- *
- * \tparam[in] order                PME interpolation order.
- * \tparam[in] atomsPerBlock        Number of atoms processed by a block - should be accounted for
- *                                  in the sizes of the shared memory arrays.
- * \param[in]  kernelParams         Input PME CUDA data in constant memory.
- * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
- * \param[in]  sm_coordinates       Atom coordinates in the shared memory.
- * \param[in]  sm_coefficients      Atom charges/coefficients in the shared memory.
- * \param[out] sm_theta             Atom spline values in the shared memory.
- * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
- */
-template <const int order,
-          const int atomsPerBlock>
-__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams           kernelParams,
-                                                  const int                              atomIndexOffset,
-                                                  const float3 * __restrict__            sm_coordinates,
-                                                  const float * __restrict__             sm_coefficients,
-                                                  float * __restrict__                   sm_theta,
-                                                  int * __restrict__                     sm_gridlineIndices)
-{
-    /* Global memory pointers for output */
-    float * __restrict__ gm_theta           = kernelParams.atoms.d_theta;
-    float * __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
-    int * __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
-
-    const int            atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
-
-    /* Fractional coordinates */
-    __shared__ float sm_fractCoords[atomsPerBlock * DIM];
-
-    /* Thread index w.r.t. block */
-    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
-        + (threadIdx.y * blockDim.x) + threadIdx.x;
-    /* Warp index w.r.t. block - could probably be obtained easier? */
-    const int warpIndex = threadLocalId / warp_size;
-    /* Thread index w.r.t. warp */
-    const int threadWarpIndex = threadLocalId % warp_size;
-    /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
-    const int atomWarpIndex = threadWarpIndex % atomsPerWarp;
-    /* Atom index w.r.t. block/shared memory */
-    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
-
-    /* Atom index w.r.t. global memory */
-    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
-    /* Spline contribution index in one dimension */
-    const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
-    /* Dimension index */
-    const int dimIndex = (threadWarpIndex - orderIndex * (atomsPerWarp * DIM)) / atomsPerWarp;
-
-    /* Multi-purpose index of rvec/ivec atom data */
-    const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
-
-    /* Spline parameters need a working buffer.
-     * With PME_GPU_PARALLEL_SPLINE == 0 it is just a local array of (order) values for some of the threads, which is fine;
-     * With PME_GPU_PARALLEL_SPLINE == 1 (order) times more threads are involved, so the shared memory is used to avoid overhead.
-     * The buffer's size, striding and indexing are adjusted accordingly.
-     * The buffer is accessed with SPLINE_DATA_PTR and SPLINE_DATA macros.
-     */
-#if PME_GPU_PARALLEL_SPLINE
-    const int        splineDataStride  = atomsPerBlock * DIM;
-    const int        splineDataIndex   = sharedMemoryIndex;
-    __shared__ float sm_splineData[splineDataStride * order];
-    float           *splineDataPtr = sm_splineData;
-#else
-    const int        splineDataStride = 1;
-    const int        splineDataIndex  = 0;
-    float            splineData[splineDataStride * order];
-    float           *splineDataPtr = splineData;
-#endif
-
-#define SPLINE_DATA_PTR(i) (splineDataPtr + ((i) * splineDataStride + splineDataIndex))
-#define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
-
-    const int localCheck  = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
-    const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
-
-    if (localCheck && globalCheck)
-    {
-        /* Indices interpolation */
-
-        if (orderIndex == 0)
-        {
-            int           tableIndex, tInt;
-            float         n, t;
-            const float3  x = sm_coordinates[atomIndexLocal];
-            /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
-             * puts them into local memory(!) instead of accessing the constant memory directly.
-             * That's the reason for the switch, to unroll explicitly.
-             * The commented parts correspond to the 0 components of the recipbox.
-             */
-            switch (dimIndex)
-            {
-                case XX:
-                    tableIndex  = kernelParams.grid.tablesOffsets[XX];
-                    n           = kernelParams.grid.realGridSizeFP[XX];
-                    t           = x.x * kernelParams.current.recipBox[dimIndex][XX] + x.y * kernelParams.current.recipBox[dimIndex][YY] + x.z * kernelParams.current.recipBox[dimIndex][ZZ];
-                    break;
-
-                case YY:
-                    tableIndex  = kernelParams.grid.tablesOffsets[YY];
-                    n           = kernelParams.grid.realGridSizeFP[YY];
-                    t           = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + */ x.y * kernelParams.current.recipBox[dimIndex][YY] + x.z * kernelParams.current.recipBox[dimIndex][ZZ];
-                    break;
-
-                case ZZ:
-                    tableIndex  = kernelParams.grid.tablesOffsets[ZZ];
-                    n           = kernelParams.grid.realGridSizeFP[ZZ];
-                    t           = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + x.y * kernelParams.current.recipbox[dimIndex][YY] + */ x.z * kernelParams.current.recipBox[dimIndex][ZZ];
-                    break;
-            }
-            const float shift = c_pmeMaxUnitcellShift;
-            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
-            t    = (t + shift) * n;
-            tInt = (int)t;
-            sm_fractCoords[sharedMemoryIndex] = t - tInt;
-            tableIndex                       += tInt;
-            assert(tInt >= 0);
-            assert(tInt < c_pmeNeighborUnitcellCount * n);
-
-            // TODO have shared table for both parameters to share the fetch, as index is always same?
-            // TODO compare texture/LDG performance
-            sm_fractCoords[sharedMemoryIndex] +=
-                fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
-                                          kernelParams.fractShiftsTableTexture,
-                                          tableIndex);
-            sm_gridlineIndices[sharedMemoryIndex] =
-                fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
-                                          kernelParams.gridlineIndicesTableTexture,
-                                          tableIndex);
-            gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
-        }
-
-        /* B-spline calculation */
-
-        const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
-        if (chargeCheck)
-        {
-            float       div;
-            int         o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
-
-            const float dr = sm_fractCoords[sharedMemoryIndex];
-            assert(isfinite(dr));
-
-            /* dr is relative offset from lower cell limit */
-            *SPLINE_DATA_PTR(order - 1) = 0.0f;
-            *SPLINE_DATA_PTR(1)         = dr;
-            *SPLINE_DATA_PTR(0)         = 1.0f - dr;
-
-#pragma unroll
-            for (int k = 3; k < order; k++)
-            {
-                div                     = 1.0f / (k - 1.0f);
-                *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
-#pragma unroll
-                for (int l = 1; l < (k - 1); l++)
-                {
-                    *SPLINE_DATA_PTR(k - l - 1) = div * ((dr + l) * SPLINE_DATA(k - l - 2) + (k - l - dr) * SPLINE_DATA(k - l - 1));
-                }
-                *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
-            }
-
-            const int thetaIndexBase        = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
-            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
-
-            /* Differentiation and storing the spline derivatives (dtheta) */
-#if !PME_GPU_PARALLEL_SPLINE
-            // With PME_GPU_PARALLEL_SPLINE == 1, o is already set to orderIndex;
-            // With PME_GPU_PARALLEL_SPLINE == 0, we loop o over range(order).
-#pragma unroll
-            for (o = 0; o < order; o++)
-#endif
-            {
-                const int   thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
-                const int   thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
-
-                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
-                assert(isfinite(dtheta));
-                gm_dtheta[thetaGlobalIndex] = dtheta;
-            }
-
-            div  = 1.0f / (order - 1.0f);
-            *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
-#pragma unroll
-            for (int k = 1; k < (order - 1); k++)
-            {
-                *SPLINE_DATA_PTR(order - k - 1) = div * ((dr + k) * SPLINE_DATA(order - k - 2) + (order - k - dr) * SPLINE_DATA(order - k - 1));
-            }
-            *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
-
-            /* Storing the spline values (theta) */
-#if !PME_GPU_PARALLEL_SPLINE
-            // See comment for the loop above
-#pragma unroll
-            for (o = 0; o < order; o++)
-#endif
-            {
-                const int thetaIndex       = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
-                const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
-
-                sm_theta[thetaIndex]       = SPLINE_DATA(o);
-                assert(isfinite(sm_theta[thetaIndex]));
-                gm_theta[thetaGlobalIndex] = SPLINE_DATA(o);
-            }
-        }
-    }
-}
-
 /*! \brief
  * Charge spreading onto the grid.
  * This corresponds to the CPU function spread_coefficients_bsplines_thread().
- * Second stage of the whole kernel.
+ * Optional second stage of the spline_and_spread_kernel.
  *
  * \tparam[in] order                PME interpolation order.
  * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
  * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] useOrderThreads      A boolean which tells if order threads per atom should be used (order*order are used if false).
  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
  * \param[in]  atomIndexOffset      Starting atom index for the execution block w.r.t. global memory.
- * \param[in]  sm_coefficients      Atom coefficents/charges in the shared memory.
+ * \param[in]  atomCharge           Atom charge/coefficient of the atom processed by this thread.
  * \param[in]  sm_gridlineIndices   Atom gridline indices in the shared memory.
  * \param[in]  sm_theta             Atom spline values in the shared memory.
  */
 template <
-    const int order, const bool wrapX, const bool wrapY>
+    const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads>
 __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams           kernelParams,
                                                int                                    atomIndexOffset,
-                                               const float * __restrict__             sm_coefficients,
+                                               const float *                          atomCharge,
                                                const int * __restrict__               sm_gridlineIndices,
                                                const float * __restrict__             sm_theta)
 {
@@ -332,7 +80,7 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
     float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
 
 
-    const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
+    const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom : c_pmeSpreadGatherAtomsPerWarp;
 
     const int nx  = kernelParams.grid.realGridSize[XX];
     const int ny  = kernelParams.grid.realGridSize[YY];
@@ -346,24 +94,19 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 
     const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
-    const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+    const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
     if (chargeCheck & globalCheck)
     {
-        // Spline Y/Z coordinates
-        const int ithy   = threadIdx.y;
+        // Spline Z coordinates
         const int ithz   = threadIdx.x;
+
         const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
-        int       iy     = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy + ithy;
-        if (wrapY & (iy >= ny))
-        {
-            iy -= ny;
-        }
-        int iz  = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
+        const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy;
+        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
         if (iz >= nz)
         {
             iz -= nz;
         }
-
         /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
         const int    atomWarpIndex   = atomIndexLocal % atomsPerWarp;
         /* Warp index w.r.t. block - could probably be obtained easier? */
@@ -372,26 +115,39 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
         const int    splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
         const int    splineIndexZ    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
         const float  thetaZ          = sm_theta[splineIndexZ];
-        const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
-        const float  thetaY          = sm_theta[splineIndexY];
-        const float  constVal        = thetaZ * thetaY * sm_coefficients[atomIndexLocal];
-        assert(isfinite(constVal));
-        const int    constOffset     = iy * pnz + iz;
 
-#pragma unroll
-        for (int ithx = 0; (ithx < order); ithx++)
+        /* With order threads per atom, loop over all order values of ithy; with order*order threads, run once with ithy = threadIdx.y */
+        const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
+        const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
+        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
         {
-            int ix = ixBase + ithx;
-            if (wrapX & (ix >= nx))
+            int       iy     = iyBase + ithy;
+            if (wrapY & (iy >= ny))
+            {
+                iy -= ny;
+            }
+
+            const int    splineIndexY    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+            float        thetaY          = sm_theta[splineIndexY];
+            const float  Val             = thetaZ * thetaY * (*atomCharge);
+            assert(isfinite(Val));
+            const int    offset     = iy * pnz + iz;
+
+#pragma unroll
+            for (int ithx = 0; (ithx < order); ithx++)
             {
-                ix -= nx;
+                int ix = ixBase + ithx;
+                if (wrapX & (ix >= nx))
+                {
+                    ix -= nx;
+                }
+                const int   gridIndexGlobal = ix * pny * pnz + offset;
+                const int   splineIndexX    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+                const float thetaX          = sm_theta[splineIndexX];
+                assert(isfinite(thetaX));
+                assert(isfinite(gm_grid[gridIndexGlobal]));
+                atomicAdd(gm_grid + gridIndexGlobal, thetaX * Val);
             }
-            const int   gridIndexGlobal = ix * pny * pnz + constOffset;
-            const int   splineIndexX    = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
-            const float thetaX          = sm_theta[splineIndexX];
-            assert(isfinite(thetaX));
-            assert(isfinite(gm_grid[gridIndexGlobal]));
-            atomicAdd(gm_grid + gridIndexGlobal, thetaX * constVal);
         }
     }
 }
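To make the new ithyMin/ithyMax bounds concrete, the standalone sketch below (plain host-side code, not part of the patch; order = 4 assumed) enumerates the (ithy, ithz) grid offsets a single thread contributes to in each mode, mirroring the bounds in spread_charges above where ithz comes from threadIdx.x.

// Illustrative only: reproduces the ithy/ithz work split of spread_charges.
// With order*order (16) threads per atom, each thread owns one (ithy, ithz) pair;
// with order (4) threads per atom, each thread owns one ithz and loops over all ithy.
#include <cstdio>

constexpr int order = 4;  // PME interpolation order assumed by these kernels

void printThreadWork(bool useOrderThreads, int threadIdxX, int threadIdxY)
{
    const int ithz    = threadIdxX;                              // as in the kernel: ithz = threadIdx.x
    const int ithyMin = useOrderThreads ? 0 : threadIdxY;        // same bounds as the diff above
    const int ithyMax = useOrderThreads ? order : threadIdxY + 1;
    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
    {
        std::printf("thread (x=%d,y=%d): contributes to (ithy=%d, ithz=%d) for all order ithx values\n",
                    threadIdxX, threadIdxY, ithy, ithz);
    }
}

int main()
{
    printThreadWork(/*useOrderThreads=*/true, /*x=*/2, /*y=*/0);   // order (4) threads/atom: loops over all 4 ithy values
    printThreadWork(/*useOrderThreads=*/false, /*x=*/2, /*y=*/1);  // order*order (16) threads/atom: single ithy from threadIdx.y
    return 0;
}

With order*order threads per atom each thread updates only the order grid points along x for its single (ithy, ithz) pair; with order threads per atom it also loops over all order values of ithy, updating order*order points per thread in exchange for fewer threads per atom.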
@@ -399,12 +155,18 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams
 /*! \brief
  * A spline computation and charge spreading kernel function.
  *
+ * Two tuning parameters can be used for additional performance. For small systems and for debugging,
+ * writeGlobal should be used, which removes the need to recalculate the theta values in the gather kernel.
+ * Similarly, useOrderThreads should be used for large systems, where order threads per atom give higher performance than order*order threads.
+ *
  * \tparam[in] order                PME interpolation order.
  * \tparam[in] computeSplines       A boolean which tells if the spline parameter and
  *                                  gridline indices' computation should be performed.
  * \tparam[in] spreadCharges        A boolean which tells if the charge spreading should be performed.
  * \tparam[in] wrapX                A boolean which tells if the grid overlap in dimension X should be wrapped.
  * \tparam[in] wrapY                A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] writeGlobal          A boolean which tells if the theta values and gridline indices should be written to global memory.
+ * \tparam[in] useOrderThreads      A boolean which tells if order threads per atom should be used (order*order used if false).
  * \param[in]  kernelParams         Input PME CUDA data in constant memory.
  */
 template <
@@ -412,22 +174,43 @@ template <
     const bool computeSplines,
     const bool spreadCharges,
     const bool wrapX,
-    const bool wrapY
+    const bool wrapY,
+    const bool writeGlobal,
+    const bool useOrderThreads
     >
 __launch_bounds__(c_spreadMaxThreadsPerBlock)
 CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE
 __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
 {
-    const int        atomsPerBlock = c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
+    const int            atomsPerBlock = useOrderThreads ? c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+        c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
     // Gridline indices, ivec
-    __shared__ int   sm_gridlineIndices[atomsPerBlock * DIM];
-    // Charges
-    __shared__ float sm_coefficients[atomsPerBlock];
+    __shared__ int       sm_gridlineIndices[atomsPerBlock * DIM];
     // Spline values
-    __shared__ float sm_theta[atomsPerBlock * DIM * order];
+    __shared__ float     sm_theta[atomsPerBlock * DIM * order];
+    float                dtheta;
+
+    const int            atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom :
+        c_pmeSpreadGatherAtomsPerWarp;
 
-    const int        blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
-    const int        atomIndexOffset = blockIndex * atomsPerBlock;
+    float3               atomX;
+    float                atomCharge;
+
+    const int            blockIndex      = blockIdx.y * gridDim.x + blockIdx.x;
+    const int            atomIndexOffset = blockIndex * atomsPerBlock;
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+        + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / warp_size;
+
+    /* Atom index w.r.t. warp */
+    const int atomWarpIndex = threadIdx.z % atomsPerWarp;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+    /* Atom index w.r.t. global memory */
+    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 
     /* Early return for fully empty blocks at the end
      * (should only happen for billions of input atoms)
@@ -436,19 +219,42 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
     {
         return;
     }
-
-    /* Staging coefficients/charges for both spline and spread */
-    pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+    /* Charges, required for both spline and spread */
+    if (c_useAtomDataPrefetch)
+    {
+        __shared__ float sm_coefficients[atomsPerBlock];
+        pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+        __syncthreads();
+        atomCharge = sm_coefficients[atomIndexLocal];
+    }
+    else
+    {
+        atomCharge =  kernelParams.atoms.d_coefficients[atomIndexGlobal];
+    }
 
     if (computeSplines)
     {
-        /* Staging coordinates */
-        __shared__ float sm_coordinates[DIM * atomsPerBlock];
-        pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
-
-        __syncthreads();
-        calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
-                                                sm_coefficients, sm_theta, sm_gridlineIndices);
+        if (c_useAtomDataPrefetch)
+        {
+            // Coordinates
+            __shared__ float sm_coordinates[DIM * atomsPerBlock];
+
+            /* Staging coordinates */
+            pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+            __syncthreads();
+            atomX.x    = sm_coordinates[atomIndexLocal*DIM+XX];
+            atomX.y    = sm_coordinates[atomIndexLocal*DIM+YY];
+            atomX.z    = sm_coordinates[atomIndexLocal*DIM+ZZ];
+        }
+        else
+        {
+            atomX.x    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + XX];
+            atomX.y    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + YY];
+            atomX.z    =  kernelParams.atoms.d_coordinates[ atomIndexGlobal*DIM + ZZ];
+        }
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>
+            (kernelParams, atomIndexOffset, atomX, atomCharge,
+            sm_theta, &dtheta, sm_gridlineIndices);
         __syncwarp();
     }
     else
@@ -468,11 +274,20 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
     /* Spreading */
     if (spreadCharges)
     {
-        spread_charges<order, wrapX, wrapY>(kernelParams, atomIndexOffset, sm_coefficients, sm_gridlineIndices, sm_theta);
+        spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, atomIndexOffset, &atomCharge,
+                                                             sm_gridlineIndices, sm_theta);
     }
 }
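The charge and coordinate staging above now branches on c_useAtomDataPrefetch. The standalone CUDA sketch below isolates that pattern; the kernel name, the atomsPerBlock template parameter, and the trivial output are illustrative, not the GROMACS helpers.

// Illustrative sketch of the prefetch-vs-direct-load pattern, with 'prefetch'
// playing the role of c_useAtomDataPrefetch. Not the actual GROMACS kernel.
#include <cuda_runtime.h>

template<bool prefetch, int atomsPerBlock>
__global__ void loadChargesSketch(const float* __restrict__ gm_charges, float* __restrict__ gm_out, int nAtoms)
{
    const int  atomIndexLocal  = threadIdx.x;
    const int  atomIndexGlobal = blockIdx.x * atomsPerBlock + atomIndexLocal;
    const bool haveAtom        = (atomIndexGlobal < nAtoms);
    float      atomCharge      = 0.0f;

    if (prefetch)
    {
        // The block cooperatively stages its charges into shared memory once,
        // then every thread reads its own atom from the shared copy.
        __shared__ float sm_charges[atomsPerBlock];
        if (haveAtom)
        {
            sm_charges[atomIndexLocal] = gm_charges[atomIndexGlobal];
        }
        __syncthreads();
        atomCharge = haveAtom ? sm_charges[atomIndexLocal] : 0.0f;
    }
    else
    {
        // Each thread loads its atom straight from global memory, no shared staging.
        atomCharge = haveAtom ? gm_charges[atomIndexGlobal] : 0.0f;
    }

    if (haveAtom)
    {
        gm_out[atomIndexGlobal] = atomCharge;  // placeholder for the real spline/spread work
    }
}

A launch such as loadChargesSketch<true, 128><<<(nAtoms + 127) / 128, 128>>>(d_charges, d_out, nAtoms) would exercise the shared-memory path; the kernels above make the same choice by branching on c_useAtomDataPrefetch.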
 
 //! Kernel instantiations
-template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, true>(const PmeGpuCudaKernelParams);
+
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, false>(const PmeGpuCudaKernelParams);
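
Finally, the indexing above (ithz = threadIdx.x, ithy either taken from or looped over threadIdx.y, atoms indexed along threadIdx.z) implies the following block geometry for the two modes. The sketch below is host-side arithmetic only; the 256-thread cap and the helper name are assumptions for illustration, not values taken from pme_gpu_constants.h.

// Sketch of the block dimensions implied by the spread kernel's thread indexing
// (order = 4 assumed). The thread cap and helper name are illustrative.
#include <cstdio>

constexpr int order = 4;

struct BlockDims { int x, y, z; };

BlockDims spreadBlockDims(bool useOrderThreads, int maxThreadsPerBlock)
{
    const int threadsPerAtom = useOrderThreads ? order : order * order;  // 4 or 16
    const int atomsPerBlock  = maxThreadsPerBlock / threadsPerAtom;
    // threadIdx.x spans the order z spline values, threadIdx.y spans the order y
    // values (or collapses to 1 when the kernel loops over y), threadIdx.z spans atoms.
    return { order, useOrderThreads ? 1 : order, atomsPerBlock };
}

int main()
{
    const BlockDims d16 = spreadBlockDims(false, 256);  // order*order threads per atom
    const BlockDims d4  = spreadBlockDims(true, 256);   // order threads per atom
    std::printf("16 threads/atom: block = (%d, %d, %d)\n", d16.x, d16.y, d16.z);  // (4, 4, 16)
    std::printf(" 4 threads/atom: block = (%d, %d, %d)\n", d4.x, d4.y, d4.z);     // (4, 1, 64)
    return 0;
}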