int pulse,
gmx_wallcycle* wcycle) :
dd_(dd),
- dimIndex_(dimIndex),
sendRankX_(dd->neighbor[dimIndex][1]),
recvRankX_(dd->neighbor[dimIndex][0]),
sendRankF_(dd->neighbor[dimIndex][0]),
deviceContext_(deviceContext),
localStream_(localStream),
nonLocalStream_(nonLocalStream),
+ dimIndex_(dimIndex),
pulse_(pulse),
wcycle_(wcycle)
{
int dimIndex_ = 0;
//! The pulse corresponding to this halo exchange instance
int pulse_ = 0;
- //! Number of zones. Always 1 for 1-D case.
- const int nzone_ = 1;
//! The wallclock counter
gmx_wallcycle* wcycle_ = nullptr;
//! The atom offset for receive (x) or send (f) for dimension index and pulse corresponding to this halo exchange instance
}
}
}
-
-/*! \brief CUDA kernel to sum up the force components
- *
- * \tparam accumulateForce If the initial forces in \p gm_fTotal should be saved.
- * \tparam addPmeForce Whether the PME force should be added to the total.
- *
- * \param[in] gm_fNB Non-bonded forces in nbnxm format.
- * \param[in] gm_fPme PME forces.
- * \param[in,out] gm_fTotal Force buffer to be reduced into.
- * \param[in] cell Cell index mapping.
- * \param[in] atomStart Start atom index.
- * \param[in] numAtoms Number of atoms.
- */
-template<bool accumulateForce, bool addPmeForce>
-static __global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3* __restrict__ gm_fNB,
- const float3* __restrict__ gm_fPme,
- float3* gm_fTotal,
- const int* __restrict__ gm_cell,
- const int atomStart,
- const int numAtoms)
-{
-
- /* map particle-level parallelism to 1D CUDA thread and block index */
- const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
-
- /* perform addition for each particle*/
- if (threadIndex < numAtoms)
- {
-
- const int i = gm_cell[atomStart + threadIndex];
- float3* gm_fDest = &gm_fTotal[atomStart + threadIndex];
- float3 temp;
-
- if (accumulateForce)
- {
- temp = *gm_fDest;
- temp += gm_fNB[i];
- }
- else
- {
- temp = gm_fNB[i];
- }
- if (addPmeForce)
- {
- temp += gm_fPme[atomStart + threadIndex];
- }
- *gm_fDest = temp;
- }
- return;
-}