Refactor GPU buf ops-related launches in do_force()
authorSzilárd Páll <pall.szilard@gmail.com>
Wed, 11 Sep 2019 17:07:00 +0000 (19:07 +0200)
committerSzilárd Páll <pall.szilard@gmail.com>
Mon, 16 Sep 2019 14:27:39 +0000 (16:27 +0200)
- Consolidate branchy-ness into better organized code;
- Rename and clarify conditionals related to CPU force contributions:
  for non-local forces only the presence of bonded interactions should
  be a dependency, while for locals, the presence of any local force
  contribtion from the CPU.

Change-Id: I91921d5d7425206b5980ec2b96032790a5d54408

src/gromacs/mdlib/sim_util.cpp

index e2fffacd83c6c0366c9d9e6752417305940f9203..aba304222fe64f7749165a16c445a1ce9960ece5 100644 (file)
@@ -1458,18 +1458,12 @@ void do_force(FILE                                     *fplog,
                          forceFlags, &forceOut.forceWithVirial(), enerd,
                          ed, forceFlags.doNeighborSearch);
 
-    const bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
-    // TODO Force flags should include haveFreeEnergyWork for this domain
-    const bool haveCpuForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork ||
-                                      useCpuFPmeReduction || (fr->efep != efepNO));
-
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
     float cycles_wait_gpu = 0;
     if (bUseOrEmulGPU)
     {
         auto &forceWithShiftForces = forceOut.forceWithShiftForces();
-        rvec *f                    = as_rvec_array(forceWithShiftForces.force().data());
 
         /* wait for non-local forces (or calculate in emulation mode) */
         if (havePPDomainDecomposition(cr))
@@ -1491,21 +1485,23 @@ void do_force(FILE                                     *fplog,
                 wallcycle_stop(wcycle, ewcFORCE);
             }
 
-            if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
-            {
-                nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::NonLocal);
-            }
-
-            // flag to specify if forces should be accumulated in force buffer
-            // ops. For non-local part, this just depends on whether CPU forces are present.
-            bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces;
             if (useGpuFBufOps == BufferOpsUseGpu::True)
             {
+                // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
+                // The bonded and free energy CPU tasks can have non-local force contributions
+                // which are a dependency for the GPU force reduction.
+                bool  haveNonLocalForceContribInCpuBuffer = forceWork.haveCpuBondedWork || (fr->efep != efepNO);
+
+                rvec *f = as_rvec_array(forceWithShiftForces.force().data());
+                if (haveNonLocalForceContribInCpuBuffer)
+                {
+                    nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::NonLocal);
+                }
                 nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
                                                   nbv->getDeviceForces(),
                                                   pme_gpu_get_device_f(fr->pmedata),
                                                   pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                                  useGpuPmeFReduction, accumulateForce);
+                                                  useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer);
                 nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal);
             }
             else
@@ -1606,39 +1602,43 @@ void do_force(FILE                                     *fplog,
      * on the non-alternating path. */
     if (bUseOrEmulGPU && !alternateGpuWait)
     {
-        gmx::ArrayRef<gmx::RVec>  force = forceOut.forceWithShiftForces().force();
-        rvec                     *f     = as_rvec_array(force.data());
+        gmx::ArrayRef<gmx::RVec>  forceWithShift = forceOut.forceWithShiftForces().force();
 
-        // TODO: move these steps as early as possible:
-        // - CPU f H2D should be as soon as all CPU-side forces are done
-        // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
-        //   before the next CPU task that consumes the forces: vsite spread or update)
-        //
-        if (useGpuFBufOps == BufferOpsUseGpu::True && (haveCpuForces || DOMAINDECOMP(cr)))
-        {
-            nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
-        }
-        // flag to specify if forces should be accumulated in force
-        // buffer ops. For local part, this depends on whether CPU
-        // forces are present, or if DD is active (in which case the
-        // halo exchange has resulted in contributions from the
-        // non-local part).
-        bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
-            (haveCpuForces || DOMAINDECOMP(cr));
+
+        const bool useCpuPmeFReduction    = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+        // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
+        const bool haveCpuLocalForces     = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
+                                             (fr->efep != efepNO));
 
         if (useGpuFBufOps == BufferOpsUseGpu::True)
         {
+            // Flag to specify whether the CPU force buffer has contributions to
+            // local atoms. This depends on whether there are CPU-based force tasks
+            // or when DD is active the halo exchange has resulted in contributions
+            // from the non-local part.
+            const bool haveLocalForceContribInCpuBuffer = (haveCpuLocalForces || havePPDomainDecomposition(cr));
+
+            // TODO: move these steps as early as possible:
+            // - CPU f H2D should be as soon as all CPU-side forces are done
+            // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
+            //   before the next CPU task that consumes the forces: vsite spread or update)
+            //
+            rvec *f = as_rvec_array(forceWithShift.data());
+            if (haveLocalForceContribInCpuBuffer)
+            {
+                nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+            }
             nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
                                               nbv->getDeviceForces(),
                                               pme_gpu_get_device_f(fr->pmedata),
                                               pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                              useGpuPmeFReduction, accumulateForce);
+                                              useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
             nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
         }
         else
         {
-            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, force);
+            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, forceWithShift);
         }
 
     }