* This corresponds to the CPU function spread_coefficients_bsplines_thread().
* Optional second stage of the spline_and_spread_kernel.
*
- * \tparam[in] order PME interpolation order.
- * \tparam[in] wrapX Whether the grid overlap in dimension X should be wrapped.
- * \tparam[in] wrapY Whether the grid overlap in dimension Y should be wrapped.
- * \tparam[in] gridIndex The index of the grid to use in the kernel.
- * \tparam[in] threadsPerAtom How many threads work on each atom
+ * \tparam order PME interpolation order.
+ * \tparam wrapX Whether the grid overlap in dimension X should be wrapped.
+ * \tparam wrapY Whether the grid overlap in dimension Y should be wrapped.
+ * \tparam gridIndex The index of the grid to use in the kernel.
+ * \tparam threadsPerAtom How many threads work on each atom
*
* \param[in] kernelParams Input PME CUDA data in constant memory.
* \param[in] atomCharge Atom charge/coefficient of atom processed by thread.
 * writeGlobal should be used, removing the need to recalculate the theta values in the gather kernel.
 * Similarly for useOrderThreads: for large systems, order threads per atom give higher performance than order*order threads.
*
- * \tparam[in] order PME interpolation order.
- * \tparam[in] computeSplines A boolean which tells if the spline parameter and
+ * \tparam order PME interpolation order.
+ * \tparam computeSplines A boolean which tells if the spline parameter and
* gridline indices' computation should be performed.
- * \tparam[in] spreadCharges A boolean which tells if the charge spreading should be performed.
- * \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped.
- * \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped.
- * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2.
- * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory.
- * \tparam[in] threadsPerAtom How many threads work on each atom
+ * \tparam spreadCharges A boolean which tells if the charge spreading should be performed.
+ * \tparam wrapX A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam numGrids The number of grids to use in the kernel. Can be 1 or 2.
+ * \tparam writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory.
+ * \tparam threadsPerAtom How many threads work on each atom
* \param[in] kernelParams Input PME CUDA data in constant memory.
*/
template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
float atomCharge;
const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
- const int atomIndexOffset = blockIndex * atomsPerBlock;
+ const int atomIndexOffset = blockIndex * atomsPerBlock + kernelParams.pipelineAtomStart;
/* Thread index w.r.t. block */
const int threadLocalId =
/* Charges, required for both spline and spread */
if (c_useAtomDataPrefetch)
{
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
- kernelParams.atoms.d_coefficients[0]);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(
+ sm_coefficients, &kernelParams.atoms.d_coefficients[0][kernelParams.pipelineAtomStart]);
__syncthreads();
atomCharge = sm_coefficients[atomIndexLocal];
}
if (computeSplines)
{
- const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
+ const float3* __restrict__ gm_coordinates =
+ asFloat3(&kernelParams.atoms.d_coordinates[kernelParams.pipelineAtomStart]);
if (c_useAtomDataPrefetch)
{
// Coordinates
/* Spreading */
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(kernelParams, &atomCharge,
- sm_gridlineIndices, sm_theta);
+
+ if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
+ {
+ spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(
+ kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+ }
}
if (numGrids == 2)
{
}
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(kernelParams, &atomCharge,
- sm_gridlineIndices, sm_theta);
+ if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
+ {
+ spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(
+ kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+ }
}
}
}