Fixed GPU non-local F copy local conditional

[alexxy/gromacs.git] / src / mdlib / nbnxn_cuda / nbnxn_cuda.cu
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index f82fd3a9905b31a4ab91f1b72c3dfff404420794..11a1f43291f4142b98a0d8e898ae48781a363816 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -285,8 +285,15 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
      /* turn energy calculation always on/off (for debugging/testing only) */
      bCalcEner = (bCalcEner || always_ener) && !never_ener;
  
-    /* don't launch the kernel if there is no work to do */
-    if (plist->nsci == 0)
+    /* Don't launch the non-local kernel if there is no work to do.
+       Doing the same for the local kernel is more complicated, since the
+       local part of the force array also depends on the non-local kernel.
+       So to avoid complicating the code and to reduce the risk of bugs,
+       we always call the local kernel, the local x+q copy and later (not in
+       this function) the stream wait, local f copyback and the f buffer
+       clearing. All these operations, except for the local interaction kernel,
+       are needed for the non-local interactions. */
+    if (iloc == eintNonlocal && plist->nsci == 0)
      {
          return;
      }
@@ -406,8 +413,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
      bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
      bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
-    /* don't launch copy-back if there was no work to do */
-    if (cu_nb->plist[iloc]->nsci == 0)
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
      {
          return;
      }
@@ -462,7 +469,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
      }
  
      /* With DD the local D2H transfer can only start after the non-local 
-       has been launched. */
+       kernel has finished. */
      if (iloc == eintLocal && cu_nb->bUseTwoStreams)
      {
          stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
@@ -475,8 +482,8 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
  
      /* After the non-local D2H is launched the nonlocal_done event can be
         recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we first need the non-local
-       data back first. */
+       placed after the non-local kernel because we want the non-local data
+       back first. */
      if (iloc == eintNonlocal)
      {
          stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
@@ -563,14 +570,16 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
      bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* turn energy calculation always on/off (for debugging/testing only) */
-    bCalcEner = (bCalcEner || always_ener) && !never_ener; 
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
  
-    /* don't launch wait/update timers & counters if there was no work to do
+    /* Launch wait/update timers & counters, unless doing the non-local phase
+       when there is not actually work to do. This is consistent with
+       nbnxn_cuda_launch_kernel.
  
         NOTE: if timing with multiple GPUs (streams) becomes possible, the
         counters could end up being inconsistent due to not being incremented
         on some of the nodes! */
-    if (cu_nb->plist[iloc]->nsci == 0)
+    if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
      {
          return;
      }