With domain decomposition and GPUs the copy of the non-local part of
the host memory force buffer to the force array was conditional on
the local instead of the non-local list size. This meant that with
an empty non-local list and non-empty local list outdated non-local
forces would be copied. Conversely, with an empty local list none of the
non-local forces would be added. Both issues can occur only in systems
with partially empty boxes, and even then only rarely.
Having the local kernel, D2H copyback and F reduction called
conditionally is not useful in practice, so they are now unconditional
to avoid complicating the code.
Fixes #1721.
Change-Id: I06731b0055a4fb5a16168e7180964e0b87443b0f
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
- /* don't launch the kernel if there is no work to do */
- if (plist->nsci == 0)
+ /* Don't launch the non-local kernel if there is no work to do.
+ Doing the same for the local kernel is more complicated, since the
+ local part of the force array also depends on the non-local kernel.
+ So to avoid complicating the code and to reduce the risk of bugs,
+ we always call the local kernel, the local x+q copy and later (not in
+ this function) the stream wait, local f copyback and the f buffer
+ clearing. All these operations, except for the local interaction kernel,
+ are needed for the non-local interactions. */
+ if (iloc == eintNonlocal && plist->nsci == 0)
bool bCalcEner = flags & GMX_FORCE_VIRIAL;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
bool bCalcEner = flags & GMX_FORCE_VIRIAL;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- /* don't launch copy-back if there was no work to do */
- if (cu_nb->plist[iloc]->nsci == 0)
+ /* don't launch non-local copy-back if there was no non-local work to do */
+ if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
}
/* With DD the local D2H transfer can only start after the non-local
}
/* With DD the local D2H transfer can only start after the non-local
+ kernel has finished. */
if (iloc == eintLocal && cu_nb->bUseTwoStreams)
{
stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
if (iloc == eintLocal && cu_nb->bUseTwoStreams)
{
stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
/* After the non-local D2H is launched the nonlocal_done event can be
recorded which signals that the local D2H can proceed. This event is not
/* After the non-local D2H is launched the nonlocal_done event can be
recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we first need the non-local
- data back first. */
+ placed after the non-local kernel because we want the non-local data
+ back first. */
if (iloc == eintNonlocal)
{
stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
if (iloc == eintNonlocal)
{
stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
- bCalcEner = (bCalcEner || always_ener) && !never_ener;
+ bCalcEner = (bCalcEner || always_ener) && !never_ener;
- /* don't launch wait/update timers & counters if there was no work to do
+ /* Launch wait/update timers & counters, except during the non-local phase
+ when there is actually no work to do. This is consistent with
+ nbnxn_cuda_launch_kernel.
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
- if (cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
/* skip the reduction if there was no non-local work to do */
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
/* skip the reduction if there was no non-local work to do */
- if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
{
nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
nbv->grp[eintNonlocal].nbat, f);
{
nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
nbv->grp[eintNonlocal].nbat, f);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
- if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
- {
- /* skip the reduction if there was no non-local work to do */
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
- nbv->grp[eintLocal].nbat, f);
- }
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->grp[eintLocal].nbat, f);
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}