/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
- /* don't launch the kernel if there is no work to do */
- if (plist->nsci == 0)
+ /* Don't launch the non-local kernel if there is no work to do.
+ Doing the same for the local kernel is more complicated, since the
+ local part of the force array also depends on the non-local kernel.
+ So to avoid complicating the code and to reduce the risk of bugs,
+ we always call the local kernel, the local x+q copy and later (not in
+ this function) the stream wait, local f copyback and the f buffer
+ clearing. All these operations, except for the local interaction kernel,
+ are needed for the non-local interactions. */
+ if (iloc == eintNonlocal && plist->nsci == 0)
{
return;
}
bool bCalcEner = flags & GMX_FORCE_VIRIAL;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- /* don't launch copy-back if there was no work to do */
- if (cu_nb->plist[iloc]->nsci == 0)
+ /* don't launch non-local copy-back if there was no non-local work to do */
+ if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
{
return;
}
}
/* With DD the local D2H transfer can only start after the non-local
- has been launched. */
+ kernel has finished. */
if (iloc == eintLocal && cu_nb->bUseTwoStreams)
{
stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
/* After the non-local D2H is launched the nonlocal_done event can be
recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we first need the non-local
- data back first. */
+ placed after the non-local kernel because we want the non-local data
+ back first. */
if (iloc == eintNonlocal)
{
stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
- bCalcEner = (bCalcEner || always_ener) && !never_ener;
+ bCalcEner = (bCalcEner || always_ener) && !never_ener;
- /* don't launch wait/update timers & counters if there was no work to do
+ /* Launch wait/update timers & counters, unless doing the non-local phase
+ when there is not actually work to do. This is consistent with
+ nbnxn_cuda_launch_kernel.
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
- if (cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
{
return;
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
/* skip the reduction if there was no non-local work to do */
- if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
{
nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
nbv->grp[eintNonlocal].nbat, f);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
- if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
- {
- /* skip the reduction if there was no non-local work to do */
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
- nbv->grp[eintLocal].nbat, f);
- }
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->grp[eintLocal].nbat, f);
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}