* \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for
* in the sizes of the shared memory arrays.
* \tparam[in] atomsPerWarp Number of atoms processed by a warp
- * \tparam[in] writeSmDtheta Bool controlling if the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
- * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeSmDtheta Bool controlling if the theta derivative should be written to
+ * shared memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should
+ * be written to global memory. Enables calculation of dtheta if
+ * set.
+ * \tparam[in] numGrids The number of grids using the splines.
* \param[in] kernelParams Input PME CUDA data in constant memory.
* \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
* \param[in] atomX Atom coordinate of atom processed by thread.
* \param[out] sm_gridlineIndices Atom gridline indices in the shared memory.
*/
-template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal>
+template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids>
__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams kernelParams,
const int atomIndexOffset,
const float3 atomX,
float* __restrict__ sm_dtheta,
int* __restrict__ sm_gridlineIndices)
{
+ assert(numGrids == 1 || numGrids == 2);
+ assert(numGrids == 1 || c_skipNeutralAtoms == false);
+
/* Global memory pointers for output */
float* __restrict__ gm_theta = kernelParams.atoms.d_theta;
float* __restrict__ gm_dtheta = kernelParams.atoms.d_dtheta;
/* B-spline calculation */
const int chargeCheck = pme_gpu_check_atom_charge(atomCharge);
- if (chargeCheck)
+ /* With FEP (numGrids == 2), we might have 0 charge in state A, but !=0 in state B, so we always calculate splines */
+ if (numGrids == 2 || chargeCheck)
{
float div;
int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1