With domain decomposition and GPUs the copy of the non-local part of
the host memory force buffer to the force array was conditional on
the local instead of the non-local list size. This meant that with
an empty non-local list and non-empty local list outdated non-local
forces would be copied. Conversely, with an empty local list none of the
non-local forces would be added. Both issues can occur only in systems
with partially empty boxes, and even then only rarely.
Having the local kernel, D2H copyback and F reduction called
conditionally is not useful in practice, so they are now unconditional
to avoid complicating the code.
Fixes #1721.
Change-Id: I06731b0055a4fb5a16168e7180964e0b87443b0f
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
- /* don't launch the kernel if there is no work to do */
- if (plist->nsci == 0)
+ /* Don't launch the non-local kernel if there is no work to do.
+ Doing the same for the local kernel is more complicated, since the
+ local part of the force array also depends on the non-local kernel.
+ So to avoid complicating the code and to reduce the risk of bugs,
+ we always call the local kernel, the local x+q copy and later (not in
+ this function) the stream wait, local f copyback and the f buffer
+ clearing. All these operations, except for the local interaction kernel,
+ are needed for the non-local interactions. */
+ if (iloc == eintNonlocal && plist->nsci == 0)
bool bCalcEner = flags & GMX_FORCE_VIRIAL;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
bool bCalcEner = flags & GMX_FORCE_VIRIAL;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- /* don't launch copy-back if there was no work to do */
- if (cu_nb->plist[iloc]->nsci == 0)
+ /* don't launch non-local copy-back if there was no non-local work to do */
+ if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
}
/* With DD the local D2H transfer can only start after the non-local
}
/* With DD the local D2H transfer can only start after the non-local
+ kernel has finished. */
if (iloc == eintLocal && cu_nb->bUseTwoStreams)
{
stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
if (iloc == eintLocal && cu_nb->bUseTwoStreams)
{
stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
/* After the non-local D2H is launched the nonlocal_done event can be
recorded which signals that the local D2H can proceed. This event is not
/* After the non-local D2H is launched the nonlocal_done event can be
recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we first need the non-local
- data back first. */
+ placed after the non-local kernel because we want the non-local data
+ back first. */
if (iloc == eintNonlocal)
{
stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
if (iloc == eintNonlocal)
{
stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
- bCalcEner = (bCalcEner || always_ener) && !never_ener;
+ bCalcEner = (bCalcEner || always_ener) && !never_ener;
- /* don't launch wait/update timers & counters if there was no work to do
+ /* Launch wait/update timers & counters, except during the non-local phase
+ when there is actually no work to do. This is consistent with
+ nbnxn_cuda_launch_kernel.
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
- if (cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
/* skip the reduction if there was no non-local work to do */
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
/* skip the reduction if there was no non-local work to do */
- if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
{
nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
nbv->grp[eintNonlocal].nbat, f);
{
nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
nbv->grp[eintNonlocal].nbat, f);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
- if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
- {
- /* skip the reduction if there was no non-local work to do */
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
- nbv->grp[eintLocal].nbat, f);
- }
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->grp[eintLocal].nbat, f);
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
}