From: Berk Hess <hess@kth.se>
Date: Thu, 30 Apr 2015 19:32:44 +0000 (+0200)
Subject: Fixed GPU non-local F copy local conditional
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=fc8a5624e96769cbe95da6159398f56bd297b38e;p=alexxy%2Fgromacs.git

Fixed GPU non-local F copy local conditional

With domain decomposition and GPUs the copy of the non-local part of
the host memory force buffer to the force array was conditional on
the local instead of the non-local list size. This meant that with
an empty non-local list and non-empty local list outdated non-local
forces would be copied. Conversely, with an empty local list all
non-local forces would not be added. Both things can only happen
in systems with partially empty boxes and then only rarely.
Having the local kernel, D2H copyback and F reduction called
conditionally is not useful in practice, so they are now unconditional
to avoid complicating the code.
Fixes #1721.

Change-Id: I06731b0055a4fb5a16168e7180964e0b87443b0f
---

diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index f82fd3a990..11a1f43291 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -285,8 +285,15 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
     /* turn energy calculation always on/off (for debugging/testing only) */
     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 
-    /* don't launch the kernel if there is no work to do */
-    if (plist->nsci == 0)
+    /* Don't launch the non-local kernel if there is no work to do.
+       Doing the same for the local kernel is more complicated, since the
+       local part of the force array also depends on the non-local kernel.
+       So to avoid complicating the code and to reduce the risk of bugs,
+       we always call the local kernel, the local x+q copy and later (not in
+       this function) the stream wait, local f copyback and the f buffer
+       clearing. All these operations, except for the local interaction kernel,
+       are needed for the non-local interactions. */
+    if (iloc == eintNonlocal && plist->nsci == 0)
     {
         return;
     }
@@ -406,8 +413,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
-    /* don't launch copy-back if there was no work to do */
-    if (cu_nb->plist[iloc]->nsci == 0)
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
     {
         return;
     }
@@ -462,7 +469,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
     }
 
     /* With DD the local D2H transfer can only start after the non-local 
-       has been launched. */
+       kernel has finished. */
     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
     {
         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
@@ -475,8 +482,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 
     /* After the non-local D2H is launched the nonlocal_done event can be
        recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we first need the non-local
-       data back first. */
+       placed after the non-local kernel because we want the non-local data
+       back first. */
     if (iloc == eintNonlocal)
     {
         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
@@ -563,14 +570,16 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
     /* turn energy calculation always on/off (for debugging/testing only) */
-    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
 
-    /* don't launch wait/update timers & counters if there was no work to do
+    /* Launch wait/update timers & counters, unless doing the non-local phase
+       when there is not actually work to do. This is consistent with
+       nbnxn_cuda_launch_kernel.
 
        NOTE: if timing with multiple GPUs (streams) becomes possible, the
        counters could end up being inconsistent due to not being incremented
        on some of the nodes! */
-    if (cu_nb->plist[iloc]->nsci == 0)
+    if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
     {
         return;
     }
diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c
index 88f6aa6621..61f255ecf4 100644
--- a/src/mdlib/sim_util.c
+++ b/src/mdlib/sim_util.c
@@ -1316,7 +1316,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
             wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
             /* skip the reduction if there was no non-local work to do */
-            if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
             {
                 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
                                                nbv->grp[eintNonlocal].nbat, f);
@@ -1388,12 +1388,8 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         }
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
         wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
-        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
-        {
-            /* skip the reduction if there was no non-local work to do */
-            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
-                                           nbv->grp[eintLocal].nbat, f);
-        }
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+                                       nbv->grp[eintLocal].nbat, f);
         wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
         wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
     }