Fix gpuupdate matrix
authorArtem Zhmurov <zhmurov@gmail.com>
Mon, 30 Dec 2019 11:17:20 +0000 (12:17 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Tue, 31 Dec 2019 11:49:09 +0000 (12:49 +0100)
1. Set GPU direct communications environment variables in update
   matrix to allow for multi-rank testing.
2. Fall back to the CPU update if PME is not fully offloaded
   in the DD case or with a PME-only rank (full offload is required
   for GPU direct communications).

Change-Id: I66283a446b0cc58fd0165226b01bf58ce9ec90f1

admin/builds/gromacs.py
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidegpuusage.h

index e429c623bc650b256c17992060950b8a5cd426dc..a734f2cd8b9bec82dda142602e4d65350ae38ab1 100644 (file)
@@ -186,6 +186,8 @@ def do_build(context):
     # GPU update flag enables GPU update+constraints as well as buffer ops (dependency)
     if context.opts.gpuupdate:
         context.env.set_env_var('GMX_FORCE_UPDATE_DEFAULT_GPU', "1")
+        context.env.set_env_var('GMX_GPU_DD_COMMS', "1")
+        context.env.set_env_var('GMX_GPU_PME_PP_COMMS', "1")
 
     regressiontests_path = context.workspace.get_project_dir(Project.REGRESSIONTESTS)
 
index c8d3cbb713cf504460c7342a1e563f59e24ac7fa..91761324e020f4d30aa973bb9e72c52d9f1f1d92 100644 (file)
@@ -1301,7 +1301,7 @@ void do_force(FILE*                               fplog,
                 // Note: GPU update + DD without direct communication is not supported,
                 // a waitCoordinatesReadyOnHost() should be issued if it will be.
                 GMX_ASSERT(!simulationWork.useGpuUpdate,
-                           "GPU update is not supported with halo exchange");
+                           "GPU update is not supported with CPU halo exchange");
                 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
             }
 
index f5b9cb1b7ab66d47a762e7f311f315a601023ae5..4eea98711058b16766a2c4c6bd31ec050ac7b389 100644 (file)
@@ -1191,9 +1191,10 @@ int Mdrunner::mdrunner()
         const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
 
         useGpuForUpdate = decideWhetherToUseGpuForUpdate(
-                devFlags.forceGpuUpdateDefault, useDomainDecomposition, useUpdateGroups, useGpuForPme,
-                useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop, doEssentialDynamics,
-                gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, replExParams.exchangeInterval > 0, doRerun);
+                devFlags.forceGpuUpdateDefault, useDomainDecomposition, useUpdateGroups, pmeRunMode,
+                domdecOptions.numPmeRanks > 0, useGpuForNonbonded, updateTarget, gpusWereDetected,
+                *inputrec, mtop, doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+                replExParams.exchangeInterval > 0, doRerun);
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 
index 9b069fd890a44c139a700fe8bc9a00001855848a..3bb192d44610df031148376461244273694b29b3 100644 (file)
@@ -492,7 +492,8 @@ bool decideWhetherToUseGpusForBonded(const bool       useGpuForNonbonded,
 bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefault,
                                     const bool        isDomainDecomposition,
                                     const bool        useUpdateGroups,
-                                    const bool        useGpuForPme,
+                                    const PmeRunMode  pmeRunMode,
+                                    const bool        havePmeOnlyRank,
                                     const bool        useGpuForNonbonded,
                                     const TaskTarget  updateTarget,
                                     const bool        gpusWereDetected,
@@ -536,11 +537,16 @@ bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefault,
     // Using the GPU-version of update if:
     // 1. PME is on the GPU (there should be a copy of coordinates on GPU for PME spread), or
     // 2. Non-bonded interactions are on the GPU.
-    if (!(useGpuForPme || useGpuForNonbonded))
+    if (pmeRunMode == PmeRunMode::CPU && !useGpuForNonbonded)
     {
         errorMessage +=
                 "Either PME or short-ranged non-bonded interaction tasks must run on the GPU.\n";
     }
+    // Since only direct GPU communications are supported with GPU update, PME should be fully offloaded in DD and PME only cases.
+    if (pmeRunMode != PmeRunMode::GPU && (isDomainDecomposition || havePmeOnlyRank))
+    {
+        errorMessage += "PME should run on GPU.\n";
+    }
     if (!gpusWereDetected)
     {
         errorMessage += "Compatible GPUs must have been found.\n";
index 45ba2d333f43312593fdec38641d882d5532319e..94db669b4f10a675d1ac40a994a2c25f07689e83 100644 (file)
@@ -48,6 +48,7 @@
 struct gmx_hw_info_t;
 struct gmx_mtop_t;
 struct t_inputrec;
+enum class PmeRunMode;
 
 namespace gmx
 {
@@ -235,7 +236,8 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
  * \param[in]  forceGpuUpdateDefault        If update should run on GPU by default.
  * \param[in]  isDomainDecomposition        Whether there more than one domain.
  * \param[in]  useUpdateGroups              If the constraints can be split across domains.
- * \param[in]  useGpuForPme                 Whether GPUs will be used for PME interactions.
+ * \param[in]  pmeRunMode                   PME running mode: CPU, GPU or mixed.
+ * \param[in]  havePmeOnlyRank              If there is a PME-only rank in the simulation.
  * \param[in]  useGpuForNonbonded           Whether GPUs will be used for nonbonded interactions.
  * \param[in]  updateTarget                 User choice for running simulation on GPU.
  * \param[in]  gpusWereDetected             Whether compatible GPUs were detected on any node.
@@ -253,7 +255,8 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
 bool decideWhetherToUseGpuForUpdate(bool              forceGpuUpdateDefault,
                                     bool              isDomainDecomposition,
                                     bool              useUpdateGroups,
-                                    bool              useGpuForPme,
+                                    PmeRunMode        pmeRunMode,
+                                    bool              havePmeOnlyRank,
                                     bool              useGpuForNonbonded,
                                     TaskTarget        updateTarget,
                                     bool              gpusWereDetected,