From: Berk Hess Date: Thu, 30 Apr 2015 19:32:44 +0000 (+0200) Subject: Fixed GPU non-local F copy local conditional X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=fc8a5624e96769cbe95da6159398f56bd297b38e;p=alexxy%2Fgromacs.git Fixed GPU non-local F copy local conditional With domain decomposition and GPUs the copy of the non-local part of the host memory force buffer to the force array was conditional on the local instead of the non-local list size. This meant that with an empty non-local list and non-empty local list outdated non-local forces would be copied. Conversely, with an empty local list all non-local forces would not be added. Both things can only happen in systems with partially empty boxes and then only rarely. Having the local kernel, D2H copyback and F reduction called conditionally is not useful in practice, so they are now unconditional to avoid complicating the code. Fixes #1721. Change-Id: I06731b0055a4fb5a16168e7180964e0b87443b0f --- diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu index f82fd3a990..11a1f43291 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -285,8 +285,15 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb, /* turn energy calculation always on/off (for debugging/testing only) */ bCalcEner = (bCalcEner || always_ener) && !never_ener; - /* don't launch the kernel if there is no work to do */ - if (plist->nsci == 0) + /* Don't launch the non-local kernel if there is no work to do. + Doing the same for the local kernel is more complicated, since the + local part of the force array also depends on the non-local kernel. + So to avoid complicating the code and to reduce the risk of bugs, + we always call the local kernel, the local x+q copy and later (not in + this function) the stream wait, local f copyback and the f buffer + clearing. All these operations, except for the local interaction kernel, + are needed for the non-local interactions. */ + if (iloc == eintNonlocal && plist->nsci == 0) { return; } @@ -406,8 +413,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb, bool bCalcEner = flags & GMX_FORCE_VIRIAL; bool bCalcFshift = flags & GMX_FORCE_VIRIAL; - /* don't launch copy-back if there was no work to do */ - if (cu_nb->plist[iloc]->nsci == 0) + /* don't launch non-local copy-back if there was no non-local work to do */ + if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0) { return; } @@ -462,7 +469,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb, } /* With DD the local D2H transfer can only start after the non-local - has been launched. */ + kernel has finished. */ if (iloc == eintLocal && cu_nb->bUseTwoStreams) { stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0); @@ -475,8 +482,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb, /* After the non-local D2H is launched the nonlocal_done event can be recorded which signals that the local D2H can proceed. This event is not - placed after the non-local kernel because we first need the non-local - data back first. */ + placed after the non-local kernel because we want the non-local data + back first. */ if (iloc == eintNonlocal) { stat = cudaEventRecord(cu_nb->nonlocal_done, stream); @@ -563,14 +570,16 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb, bool bCalcFshift = flags & GMX_FORCE_VIRIAL; /* turn energy calculation always on/off (for debugging/testing only) */ - bCalcEner = (bCalcEner || always_ener) && !never_ener; + bCalcEner = (bCalcEner || always_ener) && !never_ener; - /* don't launch wait/update timers & counters if there was no work to do + /* Launch wait/update timers & counters, unless doing the non-local phase + when there is not actually work to do. This is consistent with + nbnxn_cuda_launch_kernel. NOTE: if timing with multiple GPUs (streams) becomes possible, the counters could end up being inconsistent due to not being incremented on some of the nodes! */ - if (cu_nb->plist[iloc]->nsci == 0) + if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0) { return; } diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c index 88f6aa6621..61f255ecf4 100644 --- a/src/mdlib/sim_util.c +++ b/src/mdlib/sim_util.c @@ -1316,7 +1316,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); /* skip the reduction if there was no non-local work to do */ - if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0) + if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0) { nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal, nbv->grp[eintNonlocal].nbat, f); @@ -1388,12 +1388,8 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, } wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); - if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0) - { - /* skip the reduction if there was no non-local work to do */ - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal, - nbv->grp[eintLocal].nbat, f); - } + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal, + nbv->grp[eintLocal].nbat, f); wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); }