From e7ebefd8ac43786eb0c1d310f7bc652c4a179a5f Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Thu, 23 Jan 2020 18:30:00 +0100
Subject: [PATCH] Remove BufferOpsUseGpu enum

Its limited role has been fully replaced by the equivalent workload
flags, and the verbosity its use requires is not warranted for the
current use cases.

Change-Id: I67ad1e9eca16e095ea9da8173e07d8f7683a862c
---
 src/gromacs/mdlib/sim_util.cpp | 41 +++++++++++++---------------------
 src/gromacs/nbnxm/nbnxm.h      |  7 ------
 2 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index e5864fae56..984df2b145 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -948,14 +948,6 @@ void do_force(FILE* fplog,
     const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
     const int pmeFlags = makePmeFlags(stepWork);
 
-    // Switches on whether to use GPU for position and force buffer operations
-    // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
-    const BufferOpsUseGpu useGpuXBufOps =
-            stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
-    // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
-    const BufferOpsUseGpu useGpuFBufOps =
-            stepWork.useGpuFBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
-
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
      * decomposition (and not during the previous step as usual).
@@ -1012,7 +1004,7 @@ void do_force(FILE* fplog,
         // The local coordinates can be copied right away.
         // NOTE: Consider moving this copy to right after they are updated and constrained,
         //       if the latter is not offloaded.
-        if (useGpuPmeOnThisRank || useGpuXBufOps == BufferOpsUseGpu::True)
+        if (useGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
         {
             if (stepWork.doNeighborSearch)
             {
@@ -1041,10 +1033,9 @@ void do_force(FILE* fplog,
     gmx::GpuHaloExchange* gpuHaloExchange =
             (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
     const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
                "Must use coordinate buffer ops with GPU halo exchange");
-    const bool useGpuForcesHaloExchange =
-            ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
 
     // Copy coordinate from the GPU if update is on the GPU and there
     // are forces to be computed on the CPU, or for the computation of
@@ -1184,12 +1175,12 @@ void do_force(FILE* fplog,
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
         wallcycle_stop(wcycle, ewcNS);
 
-        if (useGpuXBufOps == BufferOpsUseGpu::True)
+        if (stepWork.useGpuXBufferOps)
         {
             nbv->atomdata_init_copy_x_to_nbat_x_gpu();
         }
         // For force buffer ops, we use the below condition rather than
-        // useGpuFBufOps to ensure that init is performed even if this
+        // useGpuFBufferOps to ensure that init is performed even if this
         // NS step is also a virial step (on which f buf ops are deactivated).
         if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
         {
@@ -1199,7 +1190,7 @@ void do_force(FILE* fplog,
         }
         else if (!EI_TPI(inputrec->eI))
         {
-            if (useGpuXBufOps == BufferOpsUseGpu::True)
+            if (stepWork.useGpuXBufferOps)
             {
                 GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
                 nbv->convertCoordinatesGpu(AtomLocality::Local, false, stateGpu->getCoordinates(),
@@ -1228,7 +1219,7 @@ void do_force(FILE* fplog,
         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
 
-        if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+        if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
         {
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
         }
@@ -1303,7 +1294,7 @@ void do_force(FILE* fplog,
             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
         }
 
-        if (useGpuXBufOps == BufferOpsUseGpu::True)
+        if (stepWork.useGpuXBufferOps)
         {
             if (!useGpuPmeOnThisRank && !ddUsesGpuDirectCommunication)
             {
@@ -1323,7 +1314,7 @@ void do_force(FILE* fplog,
        {
            wallcycle_start(wcycle, ewcLAUNCH_GPU);
 
-           if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+           if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
            {
                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
@@ -1547,7 +1538,7 @@ void do_force(FILE* fplog,
            wallcycle_stop(wcycle, ewcFORCE);
        }
 
-       if (useGpuFBufOps == BufferOpsUseGpu::True)
+       if (stepWork.useGpuFBufferOps)
        {
            gmx::FixedCapacityVector dependencyList;
 
@@ -1562,7 +1553,7 @@ void do_force(FILE* fplog,
 
            stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::NonLocal);
            dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
-                   AtomLocality::NonLocal, useGpuFBufOps == BufferOpsUseGpu::True));
+                   AtomLocality::NonLocal, stepWork.useGpuFBufferOps));
        }
 
        nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(),
@@ -1610,7 +1601,7 @@ void do_force(FILE* fplog,
        }
        else
        {
-           if (useGpuFBufOps == BufferOpsUseGpu::True)
+           if (stepWork.useGpuFBufferOps)
            {
                stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
            }
@@ -1622,7 +1613,7 @@ void do_force(FILE* fplog,
    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
    // an alternating wait/reduction scheme.
    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPmeOnThisRank && simulationWork.useGpuNonbonded
-                            && !DOMAINDECOMP(cr) && (useGpuFBufOps == BufferOpsUseGpu::False));
+                            && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
    if (alternateGpuWait)
    {
        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, pmeFlags, wcycle);
@@ -1716,7 +1707,7 @@ void do_force(FILE* fplog,
 
        gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
 
-       if (useGpuFBufOps == BufferOpsUseGpu::True)
+       if (stepWork.useGpuFBufferOps)
        {
            // Flag to specify whether the CPU force buffer has contributions to
            // local atoms. This depends on whether there are CPU-based force tasks
@@ -1742,8 +1733,8 @@ void do_force(FILE* fplog,
                auto locality = havePPDomainDecomposition(cr) ?
                        AtomLocality::Local : AtomLocality::All;
                stateGpu->copyForcesToGpu(forceWithShift, locality);
-               dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
-                       locality, useGpuFBufOps == BufferOpsUseGpu::True));
+               dependencyList.push_back(
+                       stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps));
            }
            if (useGpuForcesHaloExchange)
            {
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 4fc235ca4f..3cd2a38ee4 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -139,13 +139,6 @@ struct t_nrnb;
 struct t_forcerec;
 struct t_inputrec;
 
-/*! \brief Switch for whether to use GPU for buffer ops*/
-enum class BufferOpsUseGpu
-{
-    True,
-    False
-};
-
 class GpuEventSynchronizer;
 
 namespace gmx
-- 
2.22.0
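
For readers skimming the patch, the shape of the change is worth a standalone
illustration. Below is a minimal, self-contained C++ sketch of the same
refactoring: a per-step workload struct of plain bools replaces a redundant
two-state enum wrapper. StepWorkload is modeled loosely on gmx::StepWorkload,
while doForceStep and the printf bodies are hypothetical stand-ins for the
GROMACS code paths, not the real API.

#include <cstdio>

// Per-step work flags: plain bools computed once per step, queried directly.
struct StepWorkload
{
    bool useGpuXBufferOps = false; // offload coordinate buffer conversion
    bool useGpuFBufferOps = false; // offload force buffer reduction
};

// Before the patch, each flag was re-wrapped at the top of do_force():
//
//     enum class BufferOpsUseGpu { True, False };
//     const BufferOpsUseGpu useGpuXBufOps =
//             stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
//     ...
//     if (useGpuXBufOps == BufferOpsUseGpu::True) { ... }
//
// The enum carries no information beyond the bool it mirrors, so call sites
// can simply test the workload flag:
void doForceStep(const StepWorkload& stepWork)
{
    if (stepWork.useGpuXBufferOps)
    {
        std::printf("launch GPU coordinate buffer ops\n");
    }
    if (stepWork.useGpuFBufferOps)
    {
        std::printf("launch GPU force buffer ops\n");
    }
}

int main()
{
    StepWorkload stepWork;
    stepWork.useGpuXBufferOps = true; // as decided by the simulation setup
    doForceStep(stepWork);
    return 0;
}

A True/False enum can still be the right tool when the two states carry domain
meaning beyond a boolean, or to keep multiple bool parameters from being mixed
up at a call site; here, however, every consumer immediately converted back to
a boolean test, so the indirection only added verbosity.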