Bug fix in call to deviceStreamManager constructor
authorAlan Gray <alangray3@gmail.com>
Tue, 8 Sep 2020 16:34:25 +0000 (16:34 +0000)
committerPaul Bauer <paul.bauer.q@gmail.com>
Tue, 8 Sep 2020 16:34:25 +0000 (16:34 +0000)
DeviceStreamManager construction expects
(deviceInfo,useGpuForPme,havePpDomainDecomposition,
doGpuPmePpTransfer,useGpuForUpdate,useTiming) but was being passed
(deviceInfo,useGpuForPme,useGpuForNonbonded,
havePPDomainDecomposition,useGpuForUpdate,useTiming) Now passing
correct arguments through simulationWork. Also ensures update stream
is correctly set up when gpu buffer ops are enabled.

Fixes #3639 and #3641

src/gromacs/gpu_utils/device_stream_manager.cpp
src/gromacs/gpu_utils/device_stream_manager.h
src/gromacs/gpu_utils/tests/device_stream_manager.cpp
src/gromacs/mdrun/runner.cpp

index 96d3119b7c3875d532cc653fa948b16c7d95b64a..8b09713695ae9c3e24465a4d262164dfc43d28ed 100644 (file)
@@ -47,6 +47,7 @@
 
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/gmxassert.h"
@@ -71,10 +72,8 @@ public:
      * \throws InternalError  If any of the required resources could not be initialized.
      */
     Impl(const DeviceInformation& deviceInfo,
-         bool                     useGpuForPme,
          bool                     havePpDomainDecomposition,
-         bool                     doGpuPmePpTransfer,
-         bool                     useGpuForUpdate,
+         SimulationWorkload       simulationWork,
          bool                     useTiming);
     ~Impl();
 
@@ -86,10 +85,8 @@ public:
 
 // DeviceStreamManager::Impl
 DeviceStreamManager::Impl::Impl(const DeviceInformation& deviceInfo,
-                                const bool               useGpuForPme,
                                 const bool               havePpDomainDecomposition,
-                                const bool               doGpuPmePpTransfer,
-                                const bool               useGpuForUpdate,
+                                const SimulationWorkload simulationWork,
                                 const bool               useTiming) :
     context_(deviceInfo)
 {
@@ -97,7 +94,7 @@ DeviceStreamManager::Impl::Impl(const DeviceInformation& deviceInfo,
     {
         streams_[DeviceStreamType::NonBondedLocal].init(context_, DeviceStreamPriority::Normal, useTiming);
 
-        if (useGpuForPme)
+        if (simulationWork.useGpuPme)
         {
             /* Creating a PME GPU stream:
              * - default high priority with CUDA
@@ -112,12 +109,12 @@ DeviceStreamManager::Impl::Impl(const DeviceInformation& deviceInfo,
                                                                useTiming);
         }
         // Update stream is used both for coordinates transfers and for GPU update/constraints
-        if (useGpuForPme || useGpuForUpdate)
+        if (simulationWork.useGpuPme || simulationWork.useGpuUpdate || simulationWork.useGpuBufferOps)
         {
             streams_[DeviceStreamType::UpdateAndConstraints].init(
                     context_, DeviceStreamPriority::Normal, useTiming);
         }
-        if (doGpuPmePpTransfer)
+        if (simulationWork.useGpuPmePpCommunication)
         {
             streams_[DeviceStreamType::PmePpTransfer].init(context_, DeviceStreamPriority::Normal, useTiming);
         }
@@ -129,12 +126,10 @@ DeviceStreamManager::Impl::~Impl() = default;
 
 // DeviceStreamManager
 DeviceStreamManager::DeviceStreamManager(const DeviceInformation& deviceInfo,
-                                         const bool               useGpuForPme,
                                          const bool               havePpDomainDecomposition,
-                                         const bool               doGpuPmePpTransfer,
-                                         const bool               useGpuForUpdate,
+                                         const SimulationWorkload simulationWork,
                                          const bool               useTiming) :
-    impl_(new Impl(deviceInfo, useGpuForPme, havePpDomainDecomposition, doGpuPmePpTransfer, useGpuForUpdate, useTiming))
+    impl_(new Impl(deviceInfo, havePpDomainDecomposition, simulationWork, useTiming))
 {
 }
 
index 4565d1ac0c2044633420536cd4aecfffea5e07d6..1f8366e743fc7336855973af53fc8f4caa732362 100644 (file)
@@ -57,6 +57,8 @@ class DeviceStream;
 namespace gmx
 {
 
+class SimulationWorkload;
+
 /*! \brief Class enum to describe the different logical streams used
  * for GPU work.
  *
@@ -98,10 +100,8 @@ public:
      * \throws InternalError  If any of the required resources could not be initialized.
      */
     DeviceStreamManager(const DeviceInformation& deviceInfo,
-                        bool                     useGpuForPme,
                         bool                     havePpDomainDecomposition,
-                        bool                     doGpuPmePpTransfer,
-                        bool                     useGpuForUpdate,
+                        SimulationWorkload       simulationWork,
                         bool                     useTiming);
     ~DeviceStreamManager();
 
index e3db2cb19e45365b58053853df359baaf420ee22..12221ef118bf7b7e44852b5ffe9e31f6940a4971 100644 (file)
@@ -51,6 +51,7 @@
 
 #include <gtest/gtest.h>
 
+#include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/utility/enumerationhelpers.h"
 
 #include "gputest.h"
@@ -116,12 +117,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("No DD, no PME rank, no GPU update");
-            bool                useGpuForPme              = false;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = false;
+            simulationWork.useGpuPmePpCommunication       = false;
+            simulationWork.useGpuUpdate                   = false;
             bool                havePpDomainDecomposition = false;
-            bool                doGpuPmePpTransfer        = false;
-            bool                useGpuForUpdate           = false;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::NonBondedLocal });
             expectInvalidStreams(&manager, { DeviceStreamType::NonBondedNonLocal,
@@ -131,12 +132,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("With DD, no PME rank, no GPU update");
-            bool                useGpuForPme              = false;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = false;
+            simulationWork.useGpuPmePpCommunication       = false;
+            simulationWork.useGpuUpdate                   = false;
             bool                havePpDomainDecomposition = true;
-            bool                doGpuPmePpTransfer        = false;
-            bool                useGpuForUpdate           = false;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::NonBondedNonLocal });
@@ -146,12 +147,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("No DD, with PME rank, no GPU update");
-            bool                useGpuForPme              = true;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = true;
+            simulationWork.useGpuPmePpCommunication       = true;
+            simulationWork.useGpuUpdate                   = false;
             bool                havePpDomainDecomposition = false;
-            bool                doGpuPmePpTransfer        = true;
-            bool                useGpuForUpdate           = false;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::PmePpTransfer,
@@ -161,12 +162,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("With DD, with PME rank, no GPU update");
-            bool                useGpuForPme              = true;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = true;
+            simulationWork.useGpuPmePpCommunication       = true;
+            simulationWork.useGpuUpdate                   = false;
             bool                havePpDomainDecomposition = true;
-            bool                doGpuPmePpTransfer        = true;
-            bool                useGpuForUpdate           = false;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::NonBondedNonLocal, DeviceStreamType::PmePpTransfer,
@@ -175,12 +176,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("No DD, no PME rank, with GPU update");
-            bool                useGpuForPme              = false;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = false;
+            simulationWork.useGpuPmePpCommunication       = false;
+            simulationWork.useGpuUpdate                   = true;
             bool                havePpDomainDecomposition = false;
-            bool                doGpuPmePpTransfer        = false;
-            bool                useGpuForUpdate           = true;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::UpdateAndConstraints });
@@ -190,12 +191,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("With DD, no PME rank, with GPU update");
-            bool                useGpuForPme              = false;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = false;
+            simulationWork.useGpuPmePpCommunication       = false;
+            simulationWork.useGpuUpdate                   = true;
             bool                havePpDomainDecomposition = true;
-            bool                doGpuPmePpTransfer        = false;
-            bool                useGpuForUpdate           = true;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::NonBondedLocal, DeviceStreamType::NonBondedNonLocal,
                                            DeviceStreamType::UpdateAndConstraints });
@@ -204,12 +205,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("No DD, with PME rank, with GPU update");
-            bool                useGpuForPme              = true;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = true;
+            simulationWork.useGpuPmePpCommunication       = true;
+            simulationWork.useGpuUpdate                   = true;
             bool                havePpDomainDecomposition = false;
-            bool                doGpuPmePpTransfer        = true;
-            bool                useGpuForUpdate           = true;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::PmePpTransfer,
@@ -219,12 +220,12 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
 
         {
             SCOPED_TRACE("With DD, with PME rank, with GPU update");
-            bool                useGpuForPme              = true;
+            SimulationWorkload simulationWork;
+            simulationWork.useGpuPme                      = true;
+            simulationWork.useGpuPmePpCommunication       = true;
+            simulationWork.useGpuUpdate                   = true;
             bool                havePpDomainDecomposition = true;
-            bool                doGpuPmePpTransfer        = true;
-            bool                useGpuForUpdate           = true;
-            DeviceStreamManager manager(*deviceInfo, useGpuForPme, havePpDomainDecomposition,
-                                        doGpuPmePpTransfer, useGpuForUpdate, useTiming);
+            DeviceStreamManager manager(*deviceInfo, havePpDomainDecomposition, simulationWork, useTiming);
 
             expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::NonBondedNonLocal, DeviceStreamType::PmePpTransfer,
index cc76ce80d7e2adb580ab19b2f3dbbce0e8019d25..65384951cebdb2610715d8a4dda5cdbb1f025f89 100644 (file)
@@ -1250,6 +1250,12 @@ int Mdrunner::mdrunner()
     const bool printHostName = (cr->nnodes > 1);
     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
 
+    MdrunScheduleWorkload runScheduleWork;
+    // Also populates the simulation constant workload description.
+    runScheduleWork.simulationWork = createSimulationWorkload(
+            *inputrec, useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate,
+            devFlags.enableGpuBufferOps, devFlags.enableGpuHaloExchange, devFlags.enableGpuPmePPComm);
+
     std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
 
     if (deviceInfo != nullptr)
@@ -1259,8 +1265,7 @@ int Mdrunner::mdrunner()
             dd_setup_dlb_resource_sharing(cr, deviceId);
         }
         deviceStreamManager = std::make_unique<DeviceStreamManager>(
-                *deviceInfo, useGpuForPme, useGpuForNonbonded, havePPDomainDecomposition(cr),
-                useGpuForUpdate, useTiming);
+                *deviceInfo, havePPDomainDecomposition(cr), runScheduleWork.simulationWork, useTiming);
     }
 
     // If the user chose a task assignment, give them some hints
@@ -1346,7 +1351,8 @@ int Mdrunner::mdrunner()
     // Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
     // any of the GPU communication features are active.
     if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
-        && (devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm))
+        && (runScheduleWork.simulationWork.useGpuHaloExchange
+            || runScheduleWork.simulationWork.useGpuPmePpCommunication))
     {
         setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
     }
@@ -1418,7 +1424,7 @@ int Mdrunner::mdrunner()
         // TODO: Forcerec is not a correct place to store it.
         fr->deviceStreamManager = deviceStreamManager.get();
 
-        if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
+        if (runScheduleWork.simulationWork.useGpuPmePpCommunication && !thisRankHasDuty(cr, DUTY_PME))
         {
             GMX_RELEASE_ASSERT(
                     deviceStreamManager != nullptr,
@@ -1433,10 +1439,11 @@ int Mdrunner::mdrunner()
                     deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
         }
 
-        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec.get(), fr, cr, *hwinfo, useGpuForNonbonded,
+        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec.get(), fr, cr, *hwinfo,
+                                        runScheduleWork.simulationWork.useGpuNonbonded,
                                         deviceStreamManager.get(), &mtop, box, wcycle);
         // TODO: Move the logic below to a GPU bonded builder
-        if (useGpuForBonded)
+        if (runScheduleWork.simulationWork.useGpuBonded)
         {
             GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
                                "GPU device stream manager should be valid in order to use GPU "
@@ -1551,17 +1558,22 @@ int Mdrunner::mdrunner()
             try
             {
                 // TODO: This should be in the builder.
-                GMX_RELEASE_ASSERT(!useGpuForPme || (deviceStreamManager != nullptr),
+                GMX_RELEASE_ASSERT(!runScheduleWork.simulationWork.useGpuPme
+                                           || (deviceStreamManager != nullptr),
                                    "Device stream manager should be valid in order to use GPU "
                                    "version of PME.");
                 GMX_RELEASE_ASSERT(
-                        !useGpuForPme || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
+                        !runScheduleWork.simulationWork.useGpuPme
+                                || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
                         "GPU PME stream should be valid in order to use GPU version of PME.");
 
-                const DeviceContext* deviceContext =
-                        useGpuForPme ? &deviceStreamManager->context() : nullptr;
+                const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
+                                                             ? &deviceStreamManager->context()
+                                                             : nullptr;
                 const DeviceStream* pmeStream =
-                        useGpuForPme ? &deviceStreamManager->stream(DeviceStreamType::Pme) : nullptr;
+                        runScheduleWork.simulationWork.useGpuPme
+                                ? &deviceStreamManager->stream(DeviceStreamType::Pme)
+                                : nullptr;
 
                 pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec.get(),
                                        nChargePerturbed != 0, nTypePerturbed != 0,
@@ -1652,19 +1664,9 @@ int Mdrunner::mdrunner()
                             domdecOptions.checkBondedInteractions, fr->cginfo_mb);
         }
 
-        // TODO This is not the right place to manage the lifetime of
-        // this data structure, but currently it's the easiest way to
-        // make it work.
-        MdrunScheduleWorkload runScheduleWork;
-        // Also populates the simulation constant workload description.
-        runScheduleWork.simulationWork =
-                createSimulationWorkload(*inputrec, useGpuForNonbonded, pmeRunMode, useGpuForBonded,
-                                         useGpuForUpdate, devFlags.enableGpuBufferOps,
-                                         devFlags.enableGpuHaloExchange, devFlags.enableGpuPmePPComm);
-
         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
         if (gpusWereDetected
-            && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
+            && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
                 || runScheduleWork.simulationWork.useGpuBufferOps))
         {
             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)